diff options
Diffstat (limited to 'net')
496 files changed, 22095 insertions, 13952 deletions
diff --git a/net/6lowpan/core.c b/net/6lowpan/core.c index faf65baed..34e44c0c0 100644 --- a/net/6lowpan/core.c +++ b/net/6lowpan/core.c @@ -20,7 +20,7 @@ int lowpan_register_netdevice(struct net_device *dev, enum lowpan_lltypes lltype) { - int ret; + int i, ret; dev->addr_len = EUI64_ADDR_LEN; dev->type = ARPHRD_6LOWPAN; @@ -29,6 +29,10 @@ int lowpan_register_netdevice(struct net_device *dev, lowpan_priv(dev)->lltype = lltype; + spin_lock_init(&lowpan_priv(dev)->ctx.lock); + for (i = 0; i < LOWPAN_IPHC_CTX_TABLE_SIZE; i++) + lowpan_priv(dev)->ctx.table[i].id = i; + ret = register_netdevice(dev); if (ret < 0) return ret; @@ -68,6 +72,32 @@ void lowpan_unregister_netdev(struct net_device *dev) } EXPORT_SYMBOL(lowpan_unregister_netdev); +static int lowpan_event(struct notifier_block *unused, + unsigned long event, void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + int i; + + if (dev->type != ARPHRD_6LOWPAN) + return NOTIFY_DONE; + + switch (event) { + case NETDEV_DOWN: + for (i = 0; i < LOWPAN_IPHC_CTX_TABLE_SIZE; i++) + clear_bit(LOWPAN_IPHC_CTX_FLAG_ACTIVE, + &lowpan_priv(dev)->ctx.table[i].flags); + break; + default: + return NOTIFY_DONE; + } + + return NOTIFY_OK; +} + +static struct notifier_block lowpan_notifier = { + .notifier_call = lowpan_event, +}; + static int __init lowpan_module_init(void) { int ret; @@ -76,6 +106,12 @@ static int __init lowpan_module_init(void) if (ret < 0) return ret; + ret = register_netdevice_notifier(&lowpan_notifier); + if (ret < 0) { + lowpan_debugfs_exit(); + return ret; + } + request_module_nowait("ipv6"); request_module_nowait("nhc_dest"); @@ -92,6 +128,7 @@ static int __init lowpan_module_init(void) static void __exit lowpan_module_exit(void) { lowpan_debugfs_exit(); + unregister_netdevice_notifier(&lowpan_notifier); } module_init(lowpan_module_init); diff --git a/net/6lowpan/debugfs.c b/net/6lowpan/debugfs.c index 88eef84df..0793a8157 100644 --- a/net/6lowpan/debugfs.c +++ b/net/6lowpan/debugfs.c @@ -16,19 +16,266 @@ #include "6lowpan_i.h" +#define LOWPAN_DEBUGFS_CTX_PFX_NUM_ARGS 8 + static struct dentry *lowpan_debugfs; +static int lowpan_ctx_flag_active_set(void *data, u64 val) +{ + struct lowpan_iphc_ctx *ctx = data; + + if (val != 0 && val != 1) + return -EINVAL; + + if (val) + set_bit(LOWPAN_IPHC_CTX_FLAG_ACTIVE, &ctx->flags); + else + clear_bit(LOWPAN_IPHC_CTX_FLAG_ACTIVE, &ctx->flags); + + return 0; +} + +static int lowpan_ctx_flag_active_get(void *data, u64 *val) +{ + *val = lowpan_iphc_ctx_is_active(data); + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(lowpan_ctx_flag_active_fops, + lowpan_ctx_flag_active_get, + lowpan_ctx_flag_active_set, "%llu\n"); + +static int lowpan_ctx_flag_c_set(void *data, u64 val) +{ + struct lowpan_iphc_ctx *ctx = data; + + if (val != 0 && val != 1) + return -EINVAL; + + if (val) + set_bit(LOWPAN_IPHC_CTX_FLAG_COMPRESSION, &ctx->flags); + else + clear_bit(LOWPAN_IPHC_CTX_FLAG_COMPRESSION, &ctx->flags); + + return 0; +} + +static int lowpan_ctx_flag_c_get(void *data, u64 *val) +{ + *val = lowpan_iphc_ctx_is_compression(data); + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(lowpan_ctx_flag_c_fops, lowpan_ctx_flag_c_get, + lowpan_ctx_flag_c_set, "%llu\n"); + +static int lowpan_ctx_plen_set(void *data, u64 val) +{ + struct lowpan_iphc_ctx *ctx = data; + struct lowpan_iphc_ctx_table *t = + container_of(ctx, struct lowpan_iphc_ctx_table, table[ctx->id]); + + if (val > 128) + return -EINVAL; + + spin_lock_bh(&t->lock); + ctx->plen = val; + spin_unlock_bh(&t->lock); + + return 0; +} + +static int lowpan_ctx_plen_get(void *data, u64 *val) +{ + struct lowpan_iphc_ctx *ctx = data; + struct lowpan_iphc_ctx_table *t = + container_of(ctx, struct lowpan_iphc_ctx_table, table[ctx->id]); + + spin_lock_bh(&t->lock); + *val = ctx->plen; + spin_unlock_bh(&t->lock); + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(lowpan_ctx_plen_fops, lowpan_ctx_plen_get, + lowpan_ctx_plen_set, "%llu\n"); + +static int lowpan_ctx_pfx_show(struct seq_file *file, void *offset) +{ + struct lowpan_iphc_ctx *ctx = file->private; + struct lowpan_iphc_ctx_table *t = + container_of(ctx, struct lowpan_iphc_ctx_table, table[ctx->id]); + + spin_lock_bh(&t->lock); + seq_printf(file, "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x\n", + be16_to_cpu(ctx->pfx.s6_addr16[0]), + be16_to_cpu(ctx->pfx.s6_addr16[1]), + be16_to_cpu(ctx->pfx.s6_addr16[2]), + be16_to_cpu(ctx->pfx.s6_addr16[3]), + be16_to_cpu(ctx->pfx.s6_addr16[4]), + be16_to_cpu(ctx->pfx.s6_addr16[5]), + be16_to_cpu(ctx->pfx.s6_addr16[6]), + be16_to_cpu(ctx->pfx.s6_addr16[7])); + spin_unlock_bh(&t->lock); + + return 0; +} + +static int lowpan_ctx_pfx_open(struct inode *inode, struct file *file) +{ + return single_open(file, lowpan_ctx_pfx_show, inode->i_private); +} + +static ssize_t lowpan_ctx_pfx_write(struct file *fp, + const char __user *user_buf, size_t count, + loff_t *ppos) +{ + char buf[128] = {}; + struct seq_file *file = fp->private_data; + struct lowpan_iphc_ctx *ctx = file->private; + struct lowpan_iphc_ctx_table *t = + container_of(ctx, struct lowpan_iphc_ctx_table, table[ctx->id]); + int status = count, n, i; + unsigned int addr[8]; + + if (copy_from_user(&buf, user_buf, min_t(size_t, sizeof(buf) - 1, + count))) { + status = -EFAULT; + goto out; + } + + n = sscanf(buf, "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x", + &addr[0], &addr[1], &addr[2], &addr[3], &addr[4], + &addr[5], &addr[6], &addr[7]); + if (n != LOWPAN_DEBUGFS_CTX_PFX_NUM_ARGS) { + status = -EINVAL; + goto out; + } + + spin_lock_bh(&t->lock); + for (i = 0; i < 8; i++) + ctx->pfx.s6_addr16[i] = cpu_to_be16(addr[i] & 0xffff); + spin_unlock_bh(&t->lock); + +out: + return status; +} + +static const struct file_operations lowpan_ctx_pfx_fops = { + .open = lowpan_ctx_pfx_open, + .read = seq_read, + .write = lowpan_ctx_pfx_write, + .llseek = seq_lseek, + .release = single_release, +}; + +static int lowpan_dev_debugfs_ctx_init(struct net_device *dev, + struct dentry *ctx, u8 id) +{ + struct lowpan_priv *lpriv = lowpan_priv(dev); + struct dentry *dentry, *root; + char buf[32]; + + WARN_ON_ONCE(id > LOWPAN_IPHC_CTX_TABLE_SIZE); + + sprintf(buf, "%d", id); + + root = debugfs_create_dir(buf, ctx); + if (!root) + return -EINVAL; + + dentry = debugfs_create_file("active", 0644, root, + &lpriv->ctx.table[id], + &lowpan_ctx_flag_active_fops); + if (!dentry) + return -EINVAL; + + dentry = debugfs_create_file("compression", 0644, root, + &lpriv->ctx.table[id], + &lowpan_ctx_flag_c_fops); + if (!dentry) + return -EINVAL; + + dentry = debugfs_create_file("prefix", 0644, root, + &lpriv->ctx.table[id], + &lowpan_ctx_pfx_fops); + if (!dentry) + return -EINVAL; + + dentry = debugfs_create_file("prefix_len", 0644, root, + &lpriv->ctx.table[id], + &lowpan_ctx_plen_fops); + if (!dentry) + return -EINVAL; + + return 0; +} + +static int lowpan_context_show(struct seq_file *file, void *offset) +{ + struct lowpan_iphc_ctx_table *t = file->private; + int i; + + seq_printf(file, "%3s|%-43s|%c\n", "cid", "prefix", 'C'); + seq_puts(file, "-------------------------------------------------\n"); + + spin_lock_bh(&t->lock); + for (i = 0; i < LOWPAN_IPHC_CTX_TABLE_SIZE; i++) { + if (!lowpan_iphc_ctx_is_active(&t->table[i])) + continue; + + seq_printf(file, "%3d|%39pI6c/%-3d|%d\n", t->table[i].id, + &t->table[i].pfx, t->table[i].plen, + lowpan_iphc_ctx_is_compression(&t->table[i])); + } + spin_unlock_bh(&t->lock); + + return 0; +} + +static int lowpan_context_open(struct inode *inode, struct file *file) +{ + return single_open(file, lowpan_context_show, inode->i_private); +} + +static const struct file_operations lowpan_context_fops = { + .open = lowpan_context_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + int lowpan_dev_debugfs_init(struct net_device *dev) { struct lowpan_priv *lpriv = lowpan_priv(dev); + struct dentry *contexts, *dentry; + int ret, i; /* creating the root */ lpriv->iface_debugfs = debugfs_create_dir(dev->name, lowpan_debugfs); if (!lpriv->iface_debugfs) goto fail; + contexts = debugfs_create_dir("contexts", lpriv->iface_debugfs); + if (!contexts) + goto remove_root; + + dentry = debugfs_create_file("show", 0644, contexts, + &lowpan_priv(dev)->ctx, + &lowpan_context_fops); + if (!dentry) + goto remove_root; + + for (i = 0; i < LOWPAN_IPHC_CTX_TABLE_SIZE; i++) { + ret = lowpan_dev_debugfs_ctx_init(dev, contexts, i); + if (ret < 0) + goto remove_root; + } + return 0; +remove_root: + lowpan_dev_debugfs_exit(dev); fail: return -EINVAL; } diff --git a/net/6lowpan/iphc.c b/net/6lowpan/iphc.c index 346b5c1a9..99bb22aea 100644 --- a/net/6lowpan/iphc.c +++ b/net/6lowpan/iphc.c @@ -56,6 +56,7 @@ /* special link-layer handling */ #include <net/mac802154.h> +#include "6lowpan_i.h" #include "nhc.h" /* Values of fields within the IPHC encoding first byte */ @@ -147,6 +148,9 @@ (((a)->s6_addr16[6]) == 0) && \ (((a)->s6_addr[14]) == 0)) +#define LOWPAN_IPHC_CID_DCI(cid) (cid & 0x0f) +#define LOWPAN_IPHC_CID_SCI(cid) ((cid & 0xf0) >> 4) + static inline void iphc_uncompress_eui64_lladdr(struct in6_addr *ipaddr, const void *lladdr) { @@ -195,6 +199,98 @@ static inline void iphc_uncompress_802154_lladdr(struct in6_addr *ipaddr, } } +static struct lowpan_iphc_ctx * +lowpan_iphc_ctx_get_by_id(const struct net_device *dev, u8 id) +{ + struct lowpan_iphc_ctx *ret = &lowpan_priv(dev)->ctx.table[id]; + + if (!lowpan_iphc_ctx_is_active(ret)) + return NULL; + + return ret; +} + +static struct lowpan_iphc_ctx * +lowpan_iphc_ctx_get_by_addr(const struct net_device *dev, + const struct in6_addr *addr) +{ + struct lowpan_iphc_ctx *table = lowpan_priv(dev)->ctx.table; + struct lowpan_iphc_ctx *ret = NULL; + struct in6_addr addr_pfx; + u8 addr_plen; + int i; + + for (i = 0; i < LOWPAN_IPHC_CTX_TABLE_SIZE; i++) { + /* Check if context is valid. A context that is not valid + * MUST NOT be used for compression. + */ + if (!lowpan_iphc_ctx_is_active(&table[i]) || + !lowpan_iphc_ctx_is_compression(&table[i])) + continue; + + ipv6_addr_prefix(&addr_pfx, addr, table[i].plen); + + /* if prefix len < 64, the remaining bits until 64th bit is + * zero. Otherwise we use table[i]->plen. + */ + if (table[i].plen < 64) + addr_plen = 64; + else + addr_plen = table[i].plen; + + if (ipv6_prefix_equal(&addr_pfx, &table[i].pfx, addr_plen)) { + /* remember first match */ + if (!ret) { + ret = &table[i]; + continue; + } + + /* get the context with longest prefix len */ + if (table[i].plen > ret->plen) + ret = &table[i]; + } + } + + return ret; +} + +static struct lowpan_iphc_ctx * +lowpan_iphc_ctx_get_by_mcast_addr(const struct net_device *dev, + const struct in6_addr *addr) +{ + struct lowpan_iphc_ctx *table = lowpan_priv(dev)->ctx.table; + struct lowpan_iphc_ctx *ret = NULL; + struct in6_addr addr_mcast, network_pfx = {}; + int i; + + /* init mcast address with */ + memcpy(&addr_mcast, addr, sizeof(*addr)); + + for (i = 0; i < LOWPAN_IPHC_CTX_TABLE_SIZE; i++) { + /* Check if context is valid. A context that is not valid + * MUST NOT be used for compression. + */ + if (!lowpan_iphc_ctx_is_active(&table[i]) || + !lowpan_iphc_ctx_is_compression(&table[i])) + continue; + + /* setting plen */ + addr_mcast.s6_addr[3] = table[i].plen; + /* get network prefix to copy into multicast address */ + ipv6_addr_prefix(&network_pfx, &table[i].pfx, + table[i].plen); + /* setting network prefix */ + memcpy(&addr_mcast.s6_addr[4], &network_pfx, 8); + + if (ipv6_addr_equal(addr, &addr_mcast)) { + ret = &table[i]; + break; + } + } + + return ret; +} + /* Uncompress address function for source and * destination address(non-multicast). * @@ -259,30 +355,59 @@ static int uncompress_addr(struct sk_buff *skb, const struct net_device *dev, /* Uncompress address function for source context * based address(non-multicast). */ -static int uncompress_context_based_src_addr(struct sk_buff *skb, - struct in6_addr *ipaddr, - u8 address_mode) +static int uncompress_ctx_addr(struct sk_buff *skb, + const struct net_device *dev, + const struct lowpan_iphc_ctx *ctx, + struct in6_addr *ipaddr, u8 address_mode, + const void *lladdr) { + bool fail; + switch (address_mode) { - case LOWPAN_IPHC_SAM_00: - /* unspec address :: + /* SAM and DAM are the same here */ + case LOWPAN_IPHC_DAM_00: + fail = false; + /* SAM_00 -> unspec address :: * Do nothing, address is already :: + * + * DAM 00 -> reserved should never occur. */ break; case LOWPAN_IPHC_SAM_01: - /* TODO */ + case LOWPAN_IPHC_DAM_01: + fail = lowpan_fetch_skb(skb, &ipaddr->s6_addr[8], 8); + ipv6_addr_prefix_copy(ipaddr, &ctx->pfx, ctx->plen); + break; case LOWPAN_IPHC_SAM_10: - /* TODO */ + case LOWPAN_IPHC_DAM_10: + ipaddr->s6_addr[11] = 0xFF; + ipaddr->s6_addr[12] = 0xFE; + fail = lowpan_fetch_skb(skb, &ipaddr->s6_addr[14], 2); + ipv6_addr_prefix_copy(ipaddr, &ctx->pfx, ctx->plen); + break; case LOWPAN_IPHC_SAM_11: - /* TODO */ - netdev_warn(skb->dev, "SAM value 0x%x not supported\n", - address_mode); - return -EINVAL; + case LOWPAN_IPHC_DAM_11: + fail = false; + switch (lowpan_priv(dev)->lltype) { + case LOWPAN_LLTYPE_IEEE802154: + iphc_uncompress_802154_lladdr(ipaddr, lladdr); + break; + default: + iphc_uncompress_eui64_lladdr(ipaddr, lladdr); + break; + } + ipv6_addr_prefix_copy(ipaddr, &ctx->pfx, ctx->plen); + break; default: pr_debug("Invalid sam value: 0x%x\n", address_mode); return -EINVAL; } + if (fail) { + pr_debug("Failed to fetch skb data\n"); + return -EIO; + } + raw_dump_inline(NULL, "Reconstructed context based ipv6 src addr is", ipaddr->s6_addr, 16); @@ -346,6 +471,30 @@ static int lowpan_uncompress_multicast_daddr(struct sk_buff *skb, return 0; } +static int lowpan_uncompress_multicast_ctx_daddr(struct sk_buff *skb, + struct lowpan_iphc_ctx *ctx, + struct in6_addr *ipaddr, + u8 address_mode) +{ + struct in6_addr network_pfx = {}; + bool fail; + + ipaddr->s6_addr[0] = 0xFF; + fail = lowpan_fetch_skb(skb, &ipaddr->s6_addr[1], 2); + fail |= lowpan_fetch_skb(skb, &ipaddr->s6_addr[12], 4); + if (fail) + return -EIO; + + /* take prefix_len and network prefix from the context */ + ipaddr->s6_addr[3] = ctx->plen; + /* get network prefix to copy into multicast address */ + ipv6_addr_prefix(&network_pfx, &ctx->pfx, ctx->plen); + /* setting network prefix */ + memcpy(&ipaddr->s6_addr[4], &network_pfx, 8); + + return 0; +} + /* get the ecn values from iphc tf format and set it to ipv6hdr */ static inline void lowpan_iphc_tf_set_ecn(struct ipv6hdr *hdr, const u8 *tf) { @@ -459,7 +608,8 @@ int lowpan_header_decompress(struct sk_buff *skb, const struct net_device *dev, const void *daddr, const void *saddr) { struct ipv6hdr hdr = {}; - u8 iphc0, iphc1; + struct lowpan_iphc_ctx *ci; + u8 iphc0, iphc1, cid = 0; int err; raw_dump_table(__func__, "raw skb data dump uncompressed", @@ -469,12 +619,14 @@ int lowpan_header_decompress(struct sk_buff *skb, const struct net_device *dev, lowpan_fetch_skb(skb, &iphc1, sizeof(iphc1))) return -EINVAL; - /* another if the CID flag is set */ - if (iphc1 & LOWPAN_IPHC_CID) - return -ENOTSUPP; - hdr.version = 6; + /* default CID = 0, another if the CID flag is set */ + if (iphc1 & LOWPAN_IPHC_CID) { + if (lowpan_fetch_skb(skb, &cid, sizeof(cid))) + return -EINVAL; + } + err = lowpan_iphc_tf_decompress(skb, &hdr, iphc0 & LOWPAN_IPHC_TF_MASK); if (err < 0) @@ -500,10 +652,17 @@ int lowpan_header_decompress(struct sk_buff *skb, const struct net_device *dev, } if (iphc1 & LOWPAN_IPHC_SAC) { - /* Source address context based uncompression */ + spin_lock_bh(&lowpan_priv(dev)->ctx.lock); + ci = lowpan_iphc_ctx_get_by_id(dev, LOWPAN_IPHC_CID_SCI(cid)); + if (!ci) { + spin_unlock_bh(&lowpan_priv(dev)->ctx.lock); + return -EINVAL; + } + pr_debug("SAC bit is set. Handle context based source address.\n"); - err = uncompress_context_based_src_addr(skb, &hdr.saddr, - iphc1 & LOWPAN_IPHC_SAM_MASK); + err = uncompress_ctx_addr(skb, dev, ci, &hdr.saddr, + iphc1 & LOWPAN_IPHC_SAM_MASK, saddr); + spin_unlock_bh(&lowpan_priv(dev)->ctx.lock); } else { /* Source address uncompression */ pr_debug("source address stateless compression\n"); @@ -515,27 +674,52 @@ int lowpan_header_decompress(struct sk_buff *skb, const struct net_device *dev, if (err) return -EINVAL; - /* check for Multicast Compression */ - if (iphc1 & LOWPAN_IPHC_M) { - if (iphc1 & LOWPAN_IPHC_DAC) { - pr_debug("dest: context-based mcast compression\n"); - /* TODO: implement this */ - } else { - err = lowpan_uncompress_multicast_daddr(skb, &hdr.daddr, - iphc1 & LOWPAN_IPHC_DAM_MASK); + switch (iphc1 & (LOWPAN_IPHC_M | LOWPAN_IPHC_DAC)) { + case LOWPAN_IPHC_M | LOWPAN_IPHC_DAC: + spin_lock_bh(&lowpan_priv(dev)->ctx.lock); + ci = lowpan_iphc_ctx_get_by_id(dev, LOWPAN_IPHC_CID_DCI(cid)); + if (!ci) { + spin_unlock_bh(&lowpan_priv(dev)->ctx.lock); + return -EINVAL; + } - if (err) - return -EINVAL; + /* multicast with context */ + pr_debug("dest: context-based mcast compression\n"); + err = lowpan_uncompress_multicast_ctx_daddr(skb, ci, + &hdr.daddr, + iphc1 & LOWPAN_IPHC_DAM_MASK); + spin_unlock_bh(&lowpan_priv(dev)->ctx.lock); + break; + case LOWPAN_IPHC_M: + /* multicast */ + err = lowpan_uncompress_multicast_daddr(skb, &hdr.daddr, + iphc1 & LOWPAN_IPHC_DAM_MASK); + break; + case LOWPAN_IPHC_DAC: + spin_lock_bh(&lowpan_priv(dev)->ctx.lock); + ci = lowpan_iphc_ctx_get_by_id(dev, LOWPAN_IPHC_CID_DCI(cid)); + if (!ci) { + spin_unlock_bh(&lowpan_priv(dev)->ctx.lock); + return -EINVAL; } - } else { + + /* Destination address context based uncompression */ + pr_debug("DAC bit is set. Handle context based destination address.\n"); + err = uncompress_ctx_addr(skb, dev, ci, &hdr.daddr, + iphc1 & LOWPAN_IPHC_DAM_MASK, daddr); + spin_unlock_bh(&lowpan_priv(dev)->ctx.lock); + break; + default: err = uncompress_addr(skb, dev, &hdr.daddr, iphc1 & LOWPAN_IPHC_DAM_MASK, daddr); pr_debug("dest: stateless compression mode %d dest %pI6c\n", iphc1 & LOWPAN_IPHC_DAM_MASK, &hdr.daddr); - if (err) - return -EINVAL; + break; } + if (err) + return -EINVAL; + /* Next header data uncompression */ if (iphc0 & LOWPAN_IPHC_NH) { err = lowpan_nhc_do_uncompression(skb, dev, &hdr); @@ -585,6 +769,58 @@ static const u8 lowpan_iphc_dam_to_sam_value[] = { [LOWPAN_IPHC_DAM_11] = LOWPAN_IPHC_SAM_11, }; +static u8 lowpan_compress_ctx_addr(u8 **hc_ptr, const struct in6_addr *ipaddr, + const struct lowpan_iphc_ctx *ctx, + const unsigned char *lladdr, bool sam) +{ + struct in6_addr tmp = {}; + u8 dam; + + /* check for SAM/DAM = 11 */ + memcpy(&tmp.s6_addr[8], lladdr, 8); + /* second bit-flip (Universe/Local) is done according RFC2464 */ + tmp.s6_addr[8] ^= 0x02; + /* context information are always used */ + ipv6_addr_prefix_copy(&tmp, &ctx->pfx, ctx->plen); + if (ipv6_addr_equal(&tmp, ipaddr)) { + dam = LOWPAN_IPHC_DAM_11; + goto out; + } + + memset(&tmp, 0, sizeof(tmp)); + /* check for SAM/DAM = 10 */ + tmp.s6_addr[11] = 0xFF; + tmp.s6_addr[12] = 0xFE; + memcpy(&tmp.s6_addr[14], &ipaddr->s6_addr[14], 2); + /* context information are always used */ + ipv6_addr_prefix_copy(&tmp, &ctx->pfx, ctx->plen); + if (ipv6_addr_equal(&tmp, ipaddr)) { + lowpan_push_hc_data(hc_ptr, &ipaddr->s6_addr[14], 2); + dam = LOWPAN_IPHC_DAM_10; + goto out; + } + + memset(&tmp, 0, sizeof(tmp)); + /* check for SAM/DAM = 01, should always match */ + memcpy(&tmp.s6_addr[8], &ipaddr->s6_addr[8], 8); + /* context information are always used */ + ipv6_addr_prefix_copy(&tmp, &ctx->pfx, ctx->plen); + if (ipv6_addr_equal(&tmp, ipaddr)) { + lowpan_push_hc_data(hc_ptr, &ipaddr->s6_addr[8], 8); + dam = LOWPAN_IPHC_DAM_01; + goto out; + } + + WARN_ONCE(1, "context found but no address mode matched\n"); + return LOWPAN_IPHC_DAM_00; +out: + + if (sam) + return lowpan_iphc_dam_to_sam_value[dam]; + else + return dam; +} + static u8 lowpan_compress_addr_64(u8 **hc_ptr, const struct in6_addr *ipaddr, const unsigned char *lladdr, bool sam) { @@ -708,6 +944,21 @@ static u8 lowpan_iphc_tf_compress(u8 **hc_ptr, const struct ipv6hdr *hdr) return val; } +static u8 lowpan_iphc_mcast_ctx_addr_compress(u8 **hc_ptr, + const struct lowpan_iphc_ctx *ctx, + const struct in6_addr *ipaddr) +{ + u8 data[6]; + + /* flags/scope, reserved (RIID) */ + memcpy(data, &ipaddr->s6_addr[1], 2); + /* group ID */ + memcpy(&data[1], &ipaddr->s6_addr[11], 4); + lowpan_push_hc_data(hc_ptr, data, 6); + + return LOWPAN_IPHC_DAM_00; +} + static u8 lowpan_iphc_mcast_addr_compress(u8 **hc_ptr, const struct in6_addr *ipaddr) { @@ -742,10 +993,11 @@ static u8 lowpan_iphc_mcast_addr_compress(u8 **hc_ptr, int lowpan_header_compress(struct sk_buff *skb, const struct net_device *dev, const void *daddr, const void *saddr) { - u8 iphc0, iphc1, *hc_ptr; + u8 iphc0, iphc1, *hc_ptr, cid = 0; struct ipv6hdr *hdr; u8 head[LOWPAN_IPHC_MAX_HC_BUF_LEN] = {}; - int ret, addr_type; + struct lowpan_iphc_ctx *dci, *sci, dci_entry, sci_entry; + int ret, ipv6_daddr_type, ipv6_saddr_type; if (skb->protocol != htons(ETH_P_IPV6)) return -EINVAL; @@ -769,14 +1021,38 @@ int lowpan_header_compress(struct sk_buff *skb, const struct net_device *dev, iphc0 = LOWPAN_DISPATCH_IPHC; iphc1 = 0; - /* TODO: context lookup */ - raw_dump_inline(__func__, "saddr", saddr, EUI64_ADDR_LEN); raw_dump_inline(__func__, "daddr", daddr, EUI64_ADDR_LEN); raw_dump_table(__func__, "sending raw skb network uncompressed packet", skb->data, skb->len); + ipv6_daddr_type = ipv6_addr_type(&hdr->daddr); + spin_lock_bh(&lowpan_priv(dev)->ctx.lock); + if (ipv6_daddr_type & IPV6_ADDR_MULTICAST) + dci = lowpan_iphc_ctx_get_by_mcast_addr(dev, &hdr->daddr); + else + dci = lowpan_iphc_ctx_get_by_addr(dev, &hdr->daddr); + if (dci) { + memcpy(&dci_entry, dci, sizeof(*dci)); + cid |= dci->id; + } + spin_unlock_bh(&lowpan_priv(dev)->ctx.lock); + + spin_lock_bh(&lowpan_priv(dev)->ctx.lock); + sci = lowpan_iphc_ctx_get_by_addr(dev, &hdr->saddr); + if (sci) { + memcpy(&sci_entry, sci, sizeof(*sci)); + cid |= (sci->id << 4); + } + spin_unlock_bh(&lowpan_priv(dev)->ctx.lock); + + /* if cid is zero it will be compressed */ + if (cid) { + iphc1 |= LOWPAN_IPHC_CID; + lowpan_push_hc_data(&hc_ptr, &cid, sizeof(cid)); + } + /* Traffic Class, Flow Label compression */ iphc0 |= lowpan_iphc_tf_compress(&hc_ptr, hdr); @@ -813,39 +1089,64 @@ int lowpan_header_compress(struct sk_buff *skb, const struct net_device *dev, sizeof(hdr->hop_limit)); } - addr_type = ipv6_addr_type(&hdr->saddr); + ipv6_saddr_type = ipv6_addr_type(&hdr->saddr); /* source address compression */ - if (addr_type == IPV6_ADDR_ANY) { + if (ipv6_saddr_type == IPV6_ADDR_ANY) { pr_debug("source address is unspecified, setting SAC\n"); iphc1 |= LOWPAN_IPHC_SAC; } else { - if (addr_type & IPV6_ADDR_LINKLOCAL) { - iphc1 |= lowpan_compress_addr_64(&hc_ptr, &hdr->saddr, - saddr, true); - pr_debug("source address unicast link-local %pI6c iphc1 0x%02x\n", - &hdr->saddr, iphc1); + if (sci) { + iphc1 |= lowpan_compress_ctx_addr(&hc_ptr, &hdr->saddr, + &sci_entry, saddr, + true); + iphc1 |= LOWPAN_IPHC_SAC; } else { - pr_debug("send the full source address\n"); - lowpan_push_hc_data(&hc_ptr, hdr->saddr.s6_addr, 16); + if (ipv6_saddr_type & IPV6_ADDR_LINKLOCAL) { + iphc1 |= lowpan_compress_addr_64(&hc_ptr, + &hdr->saddr, + saddr, true); + pr_debug("source address unicast link-local %pI6c iphc1 0x%02x\n", + &hdr->saddr, iphc1); + } else { + pr_debug("send the full source address\n"); + lowpan_push_hc_data(&hc_ptr, + hdr->saddr.s6_addr, 16); + } } } - addr_type = ipv6_addr_type(&hdr->daddr); /* destination address compression */ - if (addr_type & IPV6_ADDR_MULTICAST) { + if (ipv6_daddr_type & IPV6_ADDR_MULTICAST) { pr_debug("destination address is multicast: "); iphc1 |= LOWPAN_IPHC_M; - iphc1 |= lowpan_iphc_mcast_addr_compress(&hc_ptr, &hdr->daddr); + if (dci) { + iphc1 |= lowpan_iphc_mcast_ctx_addr_compress(&hc_ptr, + &dci_entry, + &hdr->daddr); + iphc1 |= LOWPAN_IPHC_DAC; + } else { + iphc1 |= lowpan_iphc_mcast_addr_compress(&hc_ptr, + &hdr->daddr); + } } else { - if (addr_type & IPV6_ADDR_LINKLOCAL) { - /* TODO: context lookup */ - iphc1 |= lowpan_compress_addr_64(&hc_ptr, &hdr->daddr, - daddr, false); - pr_debug("dest address unicast link-local %pI6c " - "iphc1 0x%02x\n", &hdr->daddr, iphc1); + if (dci) { + iphc1 |= lowpan_compress_ctx_addr(&hc_ptr, &hdr->daddr, + &dci_entry, daddr, + false); + iphc1 |= LOWPAN_IPHC_DAC; } else { - pr_debug("dest address unicast %pI6c\n", &hdr->daddr); - lowpan_push_hc_data(&hc_ptr, hdr->daddr.s6_addr, 16); + if (ipv6_daddr_type & IPV6_ADDR_LINKLOCAL) { + iphc1 |= lowpan_compress_addr_64(&hc_ptr, + &hdr->daddr, + daddr, false); + pr_debug("dest address unicast link-local %pI6c iphc1 0x%02x\n", + &hdr->daddr, iphc1); + } else { + pr_debug("dest address unicast %pI6c\n", + &hdr->daddr); + lowpan_push_hc_data(&hc_ptr, + hdr->daddr.s6_addr, 16); + } } } diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index d2cd9de4b..a1e273af6 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -261,7 +261,6 @@ static int register_vlan_device(struct net_device *real_dev, u16 vlan_id) * hope the underlying device can handle it. */ new_dev->mtu = real_dev->mtu; - new_dev->priv_flags |= (real_dev->priv_flags & IFF_UNICAST_FLT); vlan = vlan_dev_priv(new_dev); vlan->vlan_proto = htons(ETH_P_8021Q); @@ -312,6 +311,7 @@ static void vlan_transfer_features(struct net_device *dev, struct vlan_dev_priv *vlan = vlan_dev_priv(vlandev); vlandev->gso_max_size = dev->gso_max_size; + vlandev->gso_max_segs = dev->gso_max_segs; if (vlan_hw_offload_capable(dev->features, vlan->vlan_proto)) vlandev->hard_header_len = dev->hard_header_len; diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index ad5e2fd10..e7e62570b 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -551,6 +551,7 @@ static int vlan_dev_init(struct net_device *dev) dev->features |= real_dev->vlan_features | NETIF_F_LLTX | NETIF_F_GSO_SOFTWARE; dev->gso_max_size = real_dev->gso_max_size; + dev->gso_max_segs = real_dev->gso_max_segs; if (dev->features & NETIF_F_VLAN_FEATURES) netdev_warn(real_dev, "VLAN features are set incorrectly. Q-in-Q configurations may not work correctly.\n"); @@ -621,12 +622,12 @@ static netdev_features_t vlan_dev_fix_features(struct net_device *dev, return features; } -static int vlan_ethtool_get_settings(struct net_device *dev, - struct ethtool_cmd *cmd) +static int vlan_ethtool_get_link_ksettings(struct net_device *dev, + struct ethtool_link_ksettings *cmd) { const struct vlan_dev_priv *vlan = vlan_dev_priv(dev); - return __ethtool_get_settings(vlan->real_dev, cmd); + return __ethtool_get_link_ksettings(vlan->real_dev, cmd); } static void vlan_ethtool_get_drvinfo(struct net_device *dev, @@ -741,7 +742,7 @@ static int vlan_dev_get_iflink(const struct net_device *dev) } static const struct ethtool_ops vlan_ethtool_ops = { - .get_settings = vlan_ethtool_get_settings, + .get_link_ksettings = vlan_ethtool_get_link_ksettings, .get_drvinfo = vlan_ethtool_get_drvinfo, .get_link = ethtool_op_get_link, .get_ts_info = vlan_ethtool_get_ts_info, @@ -799,6 +800,7 @@ void vlan_setup(struct net_device *dev) ether_setup(dev); dev->priv_flags |= IFF_802_1Q_VLAN | IFF_NO_QUEUE; + dev->priv_flags |= IFF_UNICAST_FLT; dev->priv_flags &= ~IFF_TX_SKB_SHARING; netif_keep_dst(dev); diff --git a/net/8021q/vlanproc.c b/net/8021q/vlanproc.c index ae63cf72a..5f1446c9f 100644 --- a/net/8021q/vlanproc.c +++ b/net/8021q/vlanproc.c @@ -184,12 +184,11 @@ int vlan_proc_add_dev(struct net_device *vlandev) /* * Delete directory entry for VLAN device. */ -int vlan_proc_rem_dev(struct net_device *vlandev) +void vlan_proc_rem_dev(struct net_device *vlandev) { /** NOTE: This will consume the memory pointed to by dent, it seems. */ proc_remove(vlan_dev_priv(vlandev)->dent); vlan_dev_priv(vlandev)->dent = NULL; - return 0; } /****** Proc filesystem entry points ****************************************/ diff --git a/net/8021q/vlanproc.h b/net/8021q/vlanproc.h index 063f60a3d..8838a2e92 100644 --- a/net/8021q/vlanproc.h +++ b/net/8021q/vlanproc.h @@ -5,7 +5,7 @@ struct net; int vlan_proc_init(struct net *net); -int vlan_proc_rem_dev(struct net_device *vlandev); +void vlan_proc_rem_dev(struct net_device *vlandev); int vlan_proc_add_dev(struct net_device *vlandev); void vlan_proc_cleanup(struct net *net); @@ -14,7 +14,7 @@ void vlan_proc_cleanup(struct net *net); #define vlan_proc_init(net) (0) #define vlan_proc_cleanup(net) do {} while (0) #define vlan_proc_add_dev(dev) ({(void)(dev), 0; }) -#define vlan_proc_rem_dev(dev) ({(void)(dev), 0; }) +#define vlan_proc_rem_dev(dev) do {} while (0) #endif #endif /* !(__BEN_VLAN_PROC_INC__) */ diff --git a/net/9p/trans_rdma.c b/net/9p/trans_rdma.c index 52b4a2f99..1852e383a 100644 --- a/net/9p/trans_rdma.c +++ b/net/9p/trans_rdma.c @@ -109,14 +109,13 @@ struct p9_trans_rdma { /** * p9_rdma_context - Keeps track of in-process WR * - * @wc_op: The original WR op for when the CQE completes in error. * @busa: Bus address to unmap when the WR completes * @req: Keeps track of requests (send) * @rc: Keepts track of replies (receive) */ struct p9_rdma_req; struct p9_rdma_context { - enum ib_wc_opcode wc_op; + struct ib_cqe cqe; dma_addr_t busa; union { struct p9_req_t *req; @@ -284,9 +283,12 @@ p9_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event) } static void -handle_recv(struct p9_client *client, struct p9_trans_rdma *rdma, - struct p9_rdma_context *c, enum ib_wc_status status, u32 byte_len) +recv_done(struct ib_cq *cq, struct ib_wc *wc) { + struct p9_client *client = cq->cq_context; + struct p9_trans_rdma *rdma = client->trans; + struct p9_rdma_context *c = + container_of(wc->wr_cqe, struct p9_rdma_context, cqe); struct p9_req_t *req; int err = 0; int16_t tag; @@ -295,7 +297,7 @@ handle_recv(struct p9_client *client, struct p9_trans_rdma *rdma, ib_dma_unmap_single(rdma->cm_id->device, c->busa, client->msize, DMA_FROM_DEVICE); - if (status != IB_WC_SUCCESS) + if (wc->status != IB_WC_SUCCESS) goto err_out; err = p9_parse_header(c->rc, NULL, NULL, &tag, 1); @@ -316,21 +318,32 @@ handle_recv(struct p9_client *client, struct p9_trans_rdma *rdma, req->rc = c->rc; p9_client_cb(client, req, REQ_STATUS_RCVD); + out: + up(&rdma->rq_sem); + kfree(c); return; err_out: - p9_debug(P9_DEBUG_ERROR, "req %p err %d status %d\n", req, err, status); + p9_debug(P9_DEBUG_ERROR, "req %p err %d status %d\n", + req, err, wc->status); rdma->state = P9_RDMA_FLUSHING; client->status = Disconnected; + goto out; } static void -handle_send(struct p9_client *client, struct p9_trans_rdma *rdma, - struct p9_rdma_context *c, enum ib_wc_status status, u32 byte_len) +send_done(struct ib_cq *cq, struct ib_wc *wc) { + struct p9_client *client = cq->cq_context; + struct p9_trans_rdma *rdma = client->trans; + struct p9_rdma_context *c = + container_of(wc->wr_cqe, struct p9_rdma_context, cqe); + ib_dma_unmap_single(rdma->cm_id->device, c->busa, c->req->tc->size, DMA_TO_DEVICE); + up(&rdma->sq_sem); + kfree(c); } static void qp_event_handler(struct ib_event *event, void *context) @@ -339,42 +352,6 @@ static void qp_event_handler(struct ib_event *event, void *context) event->event, context); } -static void cq_comp_handler(struct ib_cq *cq, void *cq_context) -{ - struct p9_client *client = cq_context; - struct p9_trans_rdma *rdma = client->trans; - int ret; - struct ib_wc wc; - - ib_req_notify_cq(rdma->cq, IB_CQ_NEXT_COMP); - while ((ret = ib_poll_cq(cq, 1, &wc)) > 0) { - struct p9_rdma_context *c = (void *) (unsigned long) wc.wr_id; - - switch (c->wc_op) { - case IB_WC_RECV: - handle_recv(client, rdma, c, wc.status, wc.byte_len); - up(&rdma->rq_sem); - break; - - case IB_WC_SEND: - handle_send(client, rdma, c, wc.status, wc.byte_len); - up(&rdma->sq_sem); - break; - - default: - pr_err("unexpected completion type, c->wc_op=%d, wc.opcode=%d, status=%d\n", - c->wc_op, wc.opcode, wc.status); - break; - } - kfree(c); - } -} - -static void cq_event_handler(struct ib_event *e, void *v) -{ - p9_debug(P9_DEBUG_ERROR, "CQ event %d context %p\n", e->event, v); -} - static void rdma_destroy_trans(struct p9_trans_rdma *rdma) { if (!rdma) @@ -387,7 +364,7 @@ static void rdma_destroy_trans(struct p9_trans_rdma *rdma) ib_dealloc_pd(rdma->pd); if (rdma->cq && !IS_ERR(rdma->cq)) - ib_destroy_cq(rdma->cq); + ib_free_cq(rdma->cq); if (rdma->cm_id && !IS_ERR(rdma->cm_id)) rdma_destroy_id(rdma->cm_id); @@ -408,13 +385,14 @@ post_recv(struct p9_client *client, struct p9_rdma_context *c) if (ib_dma_mapping_error(rdma->cm_id->device, c->busa)) goto error; + c->cqe.done = recv_done; + sge.addr = c->busa; sge.length = client->msize; sge.lkey = rdma->pd->local_dma_lkey; wr.next = NULL; - c->wc_op = IB_WC_RECV; - wr.wr_id = (unsigned long) c; + wr.wr_cqe = &c->cqe; wr.sg_list = &sge; wr.num_sge = 1; return ib_post_recv(rdma->qp, &wr, &bad_wr); @@ -499,13 +477,14 @@ dont_need_post_recv: goto send_error; } + c->cqe.done = send_done; + sge.addr = c->busa; sge.length = c->req->tc->size; sge.lkey = rdma->pd->local_dma_lkey; wr.next = NULL; - c->wc_op = IB_WC_SEND; - wr.wr_id = (unsigned long) c; + wr.wr_cqe = &c->cqe; wr.opcode = IB_WR_SEND; wr.send_flags = IB_SEND_SIGNALED; wr.sg_list = &sge; @@ -642,7 +621,6 @@ rdma_create_trans(struct p9_client *client, const char *addr, char *args) struct p9_trans_rdma *rdma; struct rdma_conn_param conn_param; struct ib_qp_init_attr qp_attr; - struct ib_cq_init_attr cq_attr = {}; /* Parse the transport specific mount options */ err = parse_opts(args, &opts); @@ -695,13 +673,11 @@ rdma_create_trans(struct p9_client *client, const char *addr, char *args) goto error; /* Create the Completion Queue */ - cq_attr.cqe = opts.sq_depth + opts.rq_depth + 1; - rdma->cq = ib_create_cq(rdma->cm_id->device, cq_comp_handler, - cq_event_handler, client, - &cq_attr); + rdma->cq = ib_alloc_cq(rdma->cm_id->device, client, + opts.sq_depth + opts.rq_depth + 1, + 0, IB_POLL_SOFTIRQ); if (IS_ERR(rdma->cq)) goto error; - ib_req_notify_cq(rdma->cq, IB_CQ_NEXT_COMP); /* Create the Protection Domain */ rdma->pd = ib_alloc_pd(rdma->cm_id->device); diff --git a/net/Kconfig b/net/Kconfig index 174354618..a8934d8c8 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -253,6 +253,9 @@ config XPS depends on SMP default y +config HWBM + bool + config SOCK_CGROUP_DATA bool default n @@ -360,6 +363,7 @@ source "net/can/Kconfig" source "net/irda/Kconfig" source "net/bluetooth/Kconfig" source "net/rxrpc/Kconfig" +source "net/kcm/Kconfig" config FIB_RULES bool @@ -392,6 +396,26 @@ config LWTUNNEL weight tunnel endpoint. Tunnel encapsulation parameters are stored with light weight tunnel state associated with fib routes. +config DST_CACHE + bool + default n + +config NET_DEVLINK + tristate "Network physical/parent device Netlink interface" + help + Network physical/parent device Netlink interface provides + infrastructure to support access to physical chip-wide config and + monitoring. + +config MAY_USE_DEVLINK + tristate + default m if NET_DEVLINK=m + default y if NET_DEVLINK=y || NET_DEVLINK=n + help + Drivers using the devlink infrastructure should have a dependency + on MAY_USE_DEVLINK to ensure they do not cause link errors when + devlink is a loadable module and the driver using it is built-in. + endif # if NET # Used by archs to tell that they support BPF_JIT diff --git a/net/Makefile b/net/Makefile index a5d04098d..81d14119e 100644 --- a/net/Makefile +++ b/net/Makefile @@ -34,6 +34,7 @@ obj-$(CONFIG_IRDA) += irda/ obj-$(CONFIG_BT) += bluetooth/ obj-$(CONFIG_SUNRPC) += sunrpc/ obj-$(CONFIG_AF_RXRPC) += rxrpc/ +obj-$(CONFIG_AF_KCM) += kcm/ obj-$(CONFIG_ATM) += atm/ obj-$(CONFIG_L2TP) += l2tp/ obj-$(CONFIG_DECNET) += decnet/ diff --git a/net/batman-adv/Kconfig b/net/batman-adv/Kconfig index c6fc8f756..f66930ee3 100644 --- a/net/batman-adv/Kconfig +++ b/net/batman-adv/Kconfig @@ -12,9 +12,23 @@ config BATMAN_ADV B.A.T.M.A.N. (better approach to mobile ad-hoc networking) is a routing protocol for multi-hop ad-hoc mesh networks. The networks may be wired or wireless. See - http://www.open-mesh.org/ for more information and user space + https://www.open-mesh.org/ for more information and user space tools. +config BATMAN_ADV_BATMAN_V + bool "B.A.T.M.A.N. V protocol (experimental)" + depends on BATMAN_ADV && CFG80211=y || (CFG80211=m && BATMAN_ADV=m) + default n + help + This option enables the B.A.T.M.A.N. V protocol, the successor + of the currently used B.A.T.M.A.N. IV protocol. The main + changes include splitting of the OGM protocol into a neighbor + discovery protocol (Echo Location Protocol, ELP) and a new OGM + Protocol OGMv2 for flooding protocol information through the + network, as well as a throughput based metric. + B.A.T.M.A.N. V is currently considered experimental and not + compatible to B.A.T.M.A.N. IV networks. + config BATMAN_ADV_BLA bool "Bridge Loop Avoidance" depends on BATMAN_ADV && INET diff --git a/net/batman-adv/Makefile b/net/batman-adv/Makefile index 21434ab79..797cf2fc8 100644 --- a/net/batman-adv/Makefile +++ b/net/batman-adv/Makefile @@ -1,5 +1,5 @@ # -# Copyright (C) 2007-2015 B.A.T.M.A.N. contributors: +# Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: # # Marek Lindner, Simon Wunderlich # @@ -18,6 +18,9 @@ obj-$(CONFIG_BATMAN_ADV) += batman-adv.o batman-adv-y += bat_iv_ogm.o +batman-adv-$(CONFIG_BATMAN_ADV_BATMAN_V) += bat_v.o +batman-adv-$(CONFIG_BATMAN_ADV_BATMAN_V) += bat_v_elp.o +batman-adv-$(CONFIG_BATMAN_ADV_BATMAN_V) += bat_v_ogm.o batman-adv-y += bitarray.o batman-adv-$(CONFIG_BATMAN_ADV_BLA) += bridge_loop_avoidance.o batman-adv-$(CONFIG_DEBUG_FS) += debugfs.o diff --git a/net/batman-adv/bat_algo.h b/net/batman-adv/bat_algo.h index 4e59cf3eb..03dafd33d 100644 --- a/net/batman-adv/bat_algo.h +++ b/net/batman-adv/bat_algo.h @@ -1,6 +1,6 @@ -/* Copyright (C) 2011-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2011-2016 B.A.T.M.A.N. contributors: * - * Marek Lindner + * Marek Lindner, Linus Lüssing * * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public @@ -18,6 +18,32 @@ #ifndef _NET_BATMAN_ADV_BAT_ALGO_H_ #define _NET_BATMAN_ADV_BAT_ALGO_H_ +struct batadv_priv; + int batadv_iv_init(void); +#ifdef CONFIG_BATMAN_ADV_BATMAN_V + +int batadv_v_init(void); +int batadv_v_mesh_init(struct batadv_priv *bat_priv); +void batadv_v_mesh_free(struct batadv_priv *bat_priv); + +#else + +static inline int batadv_v_init(void) +{ + return 0; +} + +static inline int batadv_v_mesh_init(struct batadv_priv *bat_priv) +{ + return 0; +} + +static inline void batadv_v_mesh_free(struct batadv_priv *bat_priv) +{ +} + +#endif /* CONFIG_BATMAN_ADV_BATMAN_V */ + #endif /* _NET_BATMAN_ADV_BAT_ALGO_H_ */ diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index df625de55..cb2d1b9b0 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * @@ -31,6 +31,7 @@ #include <linux/init.h> #include <linux/jiffies.h> #include <linux/list.h> +#include <linux/kref.h> #include <linux/netdevice.h> #include <linux/pkt_sched.h> #include <linux/printk.h> @@ -88,7 +89,7 @@ static void batadv_ring_buffer_set(u8 lq_recv[], u8 *lq_index, u8 value) * in the given ring buffer * @lq_recv: pointer to the ring buffer * - * Returns computed average value. + * Return: computed average value. */ static u8 batadv_ring_buffer_avg(const u8 lq_recv[]) { @@ -132,7 +133,7 @@ static void batadv_iv_ogm_orig_free(struct batadv_orig_node *orig_node) * @orig_node: the orig_node that has to be changed * @max_if_num: the current amount of interfaces * - * Returns 0 on success, a negative error code otherwise. + * Return: 0 on success, a negative error code otherwise. */ static int batadv_iv_ogm_orig_add_if(struct batadv_orig_node *orig_node, int max_if_num) @@ -180,7 +181,7 @@ unlock: * @max_if_num: the current amount of interfaces * @del_if_num: the index of the interface being removed * - * Returns 0 on success, a negative error code otherwise. + * Return: 0 on success, a negative error code otherwise. */ static int batadv_iv_ogm_orig_del_if(struct batadv_orig_node *orig_node, int max_if_num, int del_if_num) @@ -246,7 +247,7 @@ unlock: * @bat_priv: the bat priv with all the soft interface information * @addr: mac address of the originator * - * Returns the originator object corresponding to the passed mac address or NULL + * Return: the originator object corresponding to the passed mac address or NULL * on failure. * If the object does not exists it is created an initialised. */ @@ -286,8 +287,8 @@ batadv_iv_ogm_orig_get(struct batadv_priv *bat_priv, const u8 *addr) free_orig_node: /* free twice, as batadv_orig_node_new sets refcount to 2 */ - batadv_orig_node_free_ref(orig_node); - batadv_orig_node_free_ref(orig_node); + batadv_orig_node_put(orig_node); + batadv_orig_node_put(orig_node); return NULL; } @@ -396,7 +397,14 @@ static u8 batadv_hop_penalty(u8 tq, const struct batadv_priv *bat_priv) return new_tq; } -/* is there another aggregated packet here? */ +/** + * batadv_iv_ogm_aggr_packet - checks if there is another OGM attached + * @buff_pos: current position in the skb + * @packet_len: total length of the skb + * @tvlv_len: tvlv length of the previously considered OGM + * + * Return: true if there is enough space for another OGM, false otherwise. + */ static bool batadv_iv_ogm_aggr_packet(int buff_pos, int packet_len, __be16 tvlv_len) { @@ -470,7 +478,7 @@ static void batadv_iv_ogm_send_to_if(struct batadv_forw_packet *forw_packet, batadv_inc_counter(bat_priv, BATADV_CNT_MGMT_TX); batadv_add_counter(bat_priv, BATADV_CNT_MGMT_TX_BYTES, skb->len + ETH_HLEN); - batadv_send_skb_packet(skb, hard_iface, batadv_broadcast_addr); + batadv_send_broadcast_skb(skb, hard_iface); } } @@ -507,7 +515,7 @@ static void batadv_iv_ogm_emit(struct batadv_forw_packet *forw_packet) out: if (primary_if) - batadv_hardif_free_ref(primary_if); + batadv_hardif_put(primary_if); } /** @@ -522,7 +530,7 @@ out: * @if_outgoing: interface for which the retransmission should be considered * @forw_packet: the forwarded packet which should be checked * - * Returns true if new_packet can be aggregated with forw_packet + * Return: true if new_packet can be aggregated with forw_packet */ static bool batadv_iv_ogm_can_aggregate(const struct batadv_ogm_packet *new_bat_ogm_packet, @@ -609,7 +617,7 @@ batadv_iv_ogm_can_aggregate(const struct batadv_ogm_packet *new_bat_ogm_packet, out: if (primary_if) - batadv_hardif_free_ref(primary_if); + batadv_hardif_put(primary_if); return res; } @@ -636,10 +644,10 @@ static void batadv_iv_ogm_aggregate_new(const unsigned char *packet_buff, unsigned char *skb_buff; unsigned int skb_size; - if (!atomic_inc_not_zero(&if_incoming->refcount)) + if (!kref_get_unless_zero(&if_incoming->refcount)) return; - if (!atomic_inc_not_zero(&if_outgoing->refcount)) + if (!kref_get_unless_zero(&if_outgoing->refcount)) goto out_free_incoming; /* own packet should always be scheduled */ @@ -703,9 +711,9 @@ out_nomem: if (!own_packet) atomic_inc(&bat_priv->batman_queue_left); out_free_outgoing: - batadv_hardif_free_ref(if_outgoing); + batadv_hardif_put(if_outgoing); out_free_incoming: - batadv_hardif_free_ref(if_incoming); + batadv_hardif_put(if_incoming); } /* aggregate a new packet into the existing ogm packet */ @@ -950,7 +958,7 @@ static void batadv_iv_ogm_schedule(struct batadv_hard_iface *hard_iface) out: if (primary_if) - batadv_hardif_free_ref(primary_if); + batadv_hardif_put(primary_if); } /** @@ -995,9 +1003,9 @@ batadv_iv_ogm_orig_update(struct batadv_priv *bat_priv, neigh_addr = tmp_neigh_node->addr; if (batadv_compare_eth(neigh_addr, ethhdr->h_source) && tmp_neigh_node->if_incoming == if_incoming && - atomic_inc_not_zero(&tmp_neigh_node->refcount)) { + kref_get_unless_zero(&tmp_neigh_node->refcount)) { if (WARN(neigh_node, "too many matching neigh_nodes")) - batadv_neigh_node_free_ref(neigh_node); + batadv_neigh_node_put(neigh_node); neigh_node = tmp_neigh_node; continue; } @@ -1018,7 +1026,7 @@ batadv_iv_ogm_orig_update(struct batadv_priv *bat_priv, neigh_ifinfo->bat_iv.tq_avg = tq_avg; spin_unlock_bh(&tmp_neigh_node->ifinfo_lock); - batadv_neigh_ifinfo_free_ref(neigh_ifinfo); + batadv_neigh_ifinfo_put(neigh_ifinfo); neigh_ifinfo = NULL; } @@ -1033,7 +1041,7 @@ batadv_iv_ogm_orig_update(struct batadv_priv *bat_priv, ethhdr->h_source, orig_node, orig_tmp); - batadv_orig_node_free_ref(orig_tmp); + batadv_orig_node_put(orig_tmp); if (!neigh_node) goto unlock; } else { @@ -1108,13 +1116,13 @@ unlock: rcu_read_unlock(); out: if (neigh_node) - batadv_neigh_node_free_ref(neigh_node); + batadv_neigh_node_put(neigh_node); if (router) - batadv_neigh_node_free_ref(router); + batadv_neigh_node_put(router); if (neigh_ifinfo) - batadv_neigh_ifinfo_free_ref(neigh_ifinfo); + batadv_neigh_ifinfo_put(neigh_ifinfo); if (router_ifinfo) - batadv_neigh_ifinfo_free_ref(router_ifinfo); + batadv_neigh_ifinfo_put(router_ifinfo); } /** @@ -1125,7 +1133,7 @@ out: * @if_incoming: interface where the packet was received * @if_outgoing: interface for which the retransmission should be considered * - * Returns 1 if the link can be considered bidirectional, 0 otherwise + * Return: 1 if the link can be considered bidirectional, 0 otherwise */ static int batadv_iv_ogm_calc_tq(struct batadv_orig_node *orig_node, struct batadv_orig_node *orig_neigh_node, @@ -1154,7 +1162,7 @@ static int batadv_iv_ogm_calc_tq(struct batadv_orig_node *orig_node, if (tmp_neigh_node->if_incoming != if_incoming) continue; - if (!atomic_inc_not_zero(&tmp_neigh_node->refcount)) + if (!kref_get_unless_zero(&tmp_neigh_node->refcount)) continue; neigh_node = tmp_neigh_node; @@ -1184,7 +1192,7 @@ static int batadv_iv_ogm_calc_tq(struct batadv_orig_node *orig_node, neigh_ifinfo = batadv_neigh_ifinfo_new(neigh_node, if_outgoing); if (neigh_ifinfo) { neigh_rq_count = neigh_ifinfo->bat_iv.real_packet_count; - batadv_neigh_ifinfo_free_ref(neigh_ifinfo); + batadv_neigh_ifinfo_put(neigh_ifinfo); } else { neigh_rq_count = 0; } @@ -1257,7 +1265,7 @@ static int batadv_iv_ogm_calc_tq(struct batadv_orig_node *orig_node, out: if (neigh_node) - batadv_neigh_node_free_ref(neigh_node); + batadv_neigh_node_put(neigh_node); return ret; } @@ -1269,7 +1277,7 @@ out: * @if_incoming: interface on which the OGM packet was received * @if_outgoing: interface for which the retransmission should be considered * - * Returns duplicate status as enum batadv_dup_status + * Return: duplicate status as enum batadv_dup_status */ static enum batadv_dup_status batadv_iv_ogm_update_seqnos(const struct ethhdr *ethhdr, @@ -1298,7 +1306,7 @@ batadv_iv_ogm_update_seqnos(const struct ethhdr *ethhdr, orig_ifinfo = batadv_orig_ifinfo_new(orig_node, if_outgoing); if (WARN_ON(!orig_ifinfo)) { - batadv_orig_node_free_ref(orig_node); + batadv_orig_node_put(orig_node); return 0; } @@ -1308,7 +1316,8 @@ batadv_iv_ogm_update_seqnos(const struct ethhdr *ethhdr, /* signalize caller that the packet is to be dropped. */ if (!hlist_empty(&orig_node->neigh_list) && batadv_window_protected(bat_priv, seq_diff, - &orig_ifinfo->batman_seqno_reset)) { + BATADV_TQ_LOCAL_WINDOW_SIZE, + &orig_ifinfo->batman_seqno_reset, NULL)) { ret = BATADV_PROTECTED; goto out; } @@ -1344,7 +1353,7 @@ batadv_iv_ogm_update_seqnos(const struct ethhdr *ethhdr, packet_count = bitmap_weight(bitmap, BATADV_TQ_LOCAL_WINDOW_SIZE); neigh_ifinfo->bat_iv.real_packet_count = packet_count; - batadv_neigh_ifinfo_free_ref(neigh_ifinfo); + batadv_neigh_ifinfo_put(neigh_ifinfo); } rcu_read_unlock(); @@ -1358,8 +1367,8 @@ batadv_iv_ogm_update_seqnos(const struct ethhdr *ethhdr, out: spin_unlock_bh(&orig_node->bat_iv.ogm_cnt_lock); - batadv_orig_node_free_ref(orig_node); - batadv_orig_ifinfo_free_ref(orig_ifinfo); + batadv_orig_node_put(orig_node); + batadv_orig_ifinfo_put(orig_ifinfo); return ret; } @@ -1505,7 +1514,7 @@ batadv_iv_ogm_process_per_outif(const struct sk_buff *skb, int ogm_offset, ogm_packet, if_incoming, if_outgoing, dup_status); } - batadv_orig_ifinfo_free_ref(orig_ifinfo); + batadv_orig_ifinfo_put(orig_ifinfo); /* only forward for specific interface, not for the default one. */ if (if_outgoing == BATADV_IF_DEFAULT) @@ -1554,18 +1563,18 @@ batadv_iv_ogm_process_per_outif(const struct sk_buff *skb, int ogm_offset, out_neigh: if ((orig_neigh_node) && (!is_single_hop_neigh)) - batadv_orig_node_free_ref(orig_neigh_node); + batadv_orig_node_put(orig_neigh_node); out: if (router_ifinfo) - batadv_neigh_ifinfo_free_ref(router_ifinfo); + batadv_neigh_ifinfo_put(router_ifinfo); if (router) - batadv_neigh_node_free_ref(router); + batadv_neigh_node_put(router); if (router_router) - batadv_neigh_node_free_ref(router_router); + batadv_neigh_node_put(router_router); if (orig_neigh_router) - batadv_neigh_node_free_ref(orig_neigh_router); + batadv_neigh_node_put(orig_neigh_router); if (hardif_neigh) - batadv_hardif_neigh_free_ref(hardif_neigh); + batadv_hardif_neigh_put(hardif_neigh); kfree_skb(skb_priv); } @@ -1688,7 +1697,7 @@ static void batadv_iv_ogm_process(const struct sk_buff *skb, int ogm_offset, batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Drop packet: originator packet from myself (via neighbor)\n"); - batadv_orig_node_free_ref(orig_neigh_node); + batadv_orig_node_put(orig_neigh_node); return; } @@ -1726,7 +1735,7 @@ static void batadv_iv_ogm_process(const struct sk_buff *skb, int ogm_offset, } rcu_read_unlock(); - batadv_orig_node_free_ref(orig_node); + batadv_orig_node_put(orig_node); } static int batadv_iv_ogm_receive(struct sk_buff *skb, @@ -1796,7 +1805,7 @@ batadv_iv_ogm_orig_print_neigh(struct batadv_orig_node *orig_node, neigh_node->addr, n_ifinfo->bat_iv.tq_avg); - batadv_neigh_ifinfo_free_ref(n_ifinfo); + batadv_neigh_ifinfo_put(n_ifinfo); } } @@ -1859,9 +1868,9 @@ static void batadv_iv_ogm_orig_print(struct batadv_priv *bat_priv, batman_count++; next: - batadv_neigh_node_free_ref(neigh_node); + batadv_neigh_node_put(neigh_node); if (n_ifinfo) - batadv_neigh_ifinfo_free_ref(n_ifinfo); + batadv_neigh_ifinfo_put(n_ifinfo); } rcu_read_unlock(); } @@ -1929,7 +1938,7 @@ static void batadv_iv_neigh_print(struct batadv_priv *bat_priv, * @neigh2: the second neighbor object of the comparison * @if_outgoing2: outgoing interface for the second neighbor * - * Returns a value less, equal to or greater than 0 if the metric via neigh1 is + * Return: a value less, equal to or greater than 0 if the metric via neigh1 is * lower, the same as or higher than the metric via neigh2 */ static int batadv_iv_ogm_neigh_cmp(struct batadv_neigh_node *neigh1, @@ -1955,9 +1964,9 @@ static int batadv_iv_ogm_neigh_cmp(struct batadv_neigh_node *neigh1, out: if (neigh1_ifinfo) - batadv_neigh_ifinfo_free_ref(neigh1_ifinfo); + batadv_neigh_ifinfo_put(neigh1_ifinfo); if (neigh2_ifinfo) - batadv_neigh_ifinfo_free_ref(neigh2_ifinfo); + batadv_neigh_ifinfo_put(neigh2_ifinfo); return diff; } @@ -1970,7 +1979,7 @@ out: * @neigh2: the second neighbor object of the comparison * @if_outgoing2: outgoing interface for the second neighbor * - * Returns true if the metric via neigh1 is equally good or better than + * Return: true if the metric via neigh1 is equally good or better than * the metric via neigh2, false otherwise. */ static bool @@ -1998,9 +2007,9 @@ batadv_iv_ogm_neigh_is_sob(struct batadv_neigh_node *neigh1, out: if (neigh1_ifinfo) - batadv_neigh_ifinfo_free_ref(neigh1_ifinfo); + batadv_neigh_ifinfo_put(neigh1_ifinfo); if (neigh2_ifinfo) - batadv_neigh_ifinfo_free_ref(neigh2_ifinfo); + batadv_neigh_ifinfo_put(neigh2_ifinfo); return ret; } diff --git a/net/batman-adv/bat_v.c b/net/batman-adv/bat_v.c new file mode 100644 index 000000000..4026f198a --- /dev/null +++ b/net/batman-adv/bat_v.c @@ -0,0 +1,359 @@ +/* Copyright (C) 2013-2016 B.A.T.M.A.N. contributors: + * + * Linus Lüssing, Marek Lindner + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see <http://www.gnu.org/licenses/>. + */ + +#include "bat_algo.h" +#include "main.h" + +#include <linux/atomic.h> +#include <linux/bug.h> +#include <linux/cache.h> +#include <linux/init.h> +#include <linux/jiffies.h> +#include <linux/netdevice.h> +#include <linux/rculist.h> +#include <linux/rcupdate.h> +#include <linux/seq_file.h> +#include <linux/types.h> +#include <linux/workqueue.h> + +#include "bat_v_elp.h" +#include "bat_v_ogm.h" +#include "hard-interface.h" +#include "hash.h" +#include "originator.h" +#include "packet.h" + +static void batadv_v_iface_activate(struct batadv_hard_iface *hard_iface) +{ + /* B.A.T.M.A.N. V does not use any queuing mechanism, therefore it can + * set the interface as ACTIVE right away, without any risk of race + * condition + */ + if (hard_iface->if_status == BATADV_IF_TO_BE_ACTIVATED) + hard_iface->if_status = BATADV_IF_ACTIVE; +} + +static int batadv_v_iface_enable(struct batadv_hard_iface *hard_iface) +{ + int ret; + + ret = batadv_v_elp_iface_enable(hard_iface); + if (ret < 0) + return ret; + + ret = batadv_v_ogm_iface_enable(hard_iface); + if (ret < 0) + batadv_v_elp_iface_disable(hard_iface); + + /* enable link throughput auto-detection by setting the throughput + * override to zero + */ + atomic_set(&hard_iface->bat_v.throughput_override, 0); + + return ret; +} + +static void batadv_v_iface_disable(struct batadv_hard_iface *hard_iface) +{ + batadv_v_elp_iface_disable(hard_iface); +} + +static void batadv_v_iface_update_mac(struct batadv_hard_iface *hard_iface) +{ +} + +static void batadv_v_primary_iface_set(struct batadv_hard_iface *hard_iface) +{ + batadv_v_elp_primary_iface_set(hard_iface); + batadv_v_ogm_primary_iface_set(hard_iface); +} + +static void +batadv_v_hardif_neigh_init(struct batadv_hardif_neigh_node *hardif_neigh) +{ + ewma_throughput_init(&hardif_neigh->bat_v.throughput); + INIT_WORK(&hardif_neigh->bat_v.metric_work, + batadv_v_elp_throughput_metric_update); +} + +static void batadv_v_ogm_schedule(struct batadv_hard_iface *hard_iface) +{ +} + +static void batadv_v_ogm_emit(struct batadv_forw_packet *forw_packet) +{ +} + +/** + * batadv_v_orig_print_neigh - print neighbors for the originator table + * @orig_node: the orig_node for which the neighbors are printed + * @if_outgoing: outgoing interface for these entries + * @seq: debugfs table seq_file struct + * + * Must be called while holding an rcu lock. + */ +static void +batadv_v_orig_print_neigh(struct batadv_orig_node *orig_node, + struct batadv_hard_iface *if_outgoing, + struct seq_file *seq) +{ + struct batadv_neigh_node *neigh_node; + struct batadv_neigh_ifinfo *n_ifinfo; + + hlist_for_each_entry_rcu(neigh_node, &orig_node->neigh_list, list) { + n_ifinfo = batadv_neigh_ifinfo_get(neigh_node, if_outgoing); + if (!n_ifinfo) + continue; + + seq_printf(seq, " %pM (%9u.%1u)", + neigh_node->addr, + n_ifinfo->bat_v.throughput / 10, + n_ifinfo->bat_v.throughput % 10); + + batadv_neigh_ifinfo_put(n_ifinfo); + } +} + +/** + * batadv_v_hardif_neigh_print - print a single ELP neighbour node + * @seq: neighbour table seq_file struct + * @hardif_neigh: hardif neighbour information + */ +static void +batadv_v_hardif_neigh_print(struct seq_file *seq, + struct batadv_hardif_neigh_node *hardif_neigh) +{ + int last_secs, last_msecs; + u32 throughput; + + last_secs = jiffies_to_msecs(jiffies - hardif_neigh->last_seen) / 1000; + last_msecs = jiffies_to_msecs(jiffies - hardif_neigh->last_seen) % 1000; + throughput = ewma_throughput_read(&hardif_neigh->bat_v.throughput); + + seq_printf(seq, "%pM %4i.%03is (%9u.%1u) [%10s]\n", + hardif_neigh->addr, last_secs, last_msecs, throughput / 10, + throughput % 10, hardif_neigh->if_incoming->net_dev->name); +} + +/** + * batadv_v_neigh_print - print the single hop neighbour list + * @bat_priv: the bat priv with all the soft interface information + * @seq: neighbour table seq_file struct + */ +static void batadv_v_neigh_print(struct batadv_priv *bat_priv, + struct seq_file *seq) +{ + struct net_device *net_dev = (struct net_device *)seq->private; + struct batadv_hardif_neigh_node *hardif_neigh; + struct batadv_hard_iface *hard_iface; + int batman_count = 0; + + seq_printf(seq, " %-15s %s (%11s) [%10s]\n", "Neighbor", + "last-seen", "throughput", "IF"); + + rcu_read_lock(); + list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) { + if (hard_iface->soft_iface != net_dev) + continue; + + hlist_for_each_entry_rcu(hardif_neigh, + &hard_iface->neigh_list, list) { + batadv_v_hardif_neigh_print(seq, hardif_neigh); + batman_count++; + } + } + rcu_read_unlock(); + + if (batman_count == 0) + seq_puts(seq, "No batman nodes in range ...\n"); +} + +/** + * batadv_v_orig_print - print the originator table + * @bat_priv: the bat priv with all the soft interface information + * @seq: debugfs table seq_file struct + * @if_outgoing: the outgoing interface for which this should be printed + */ +static void batadv_v_orig_print(struct batadv_priv *bat_priv, + struct seq_file *seq, + struct batadv_hard_iface *if_outgoing) +{ + struct batadv_neigh_node *neigh_node; + struct batadv_hashtable *hash = bat_priv->orig_hash; + int last_seen_msecs, last_seen_secs; + struct batadv_orig_node *orig_node; + struct batadv_neigh_ifinfo *n_ifinfo; + unsigned long last_seen_jiffies; + struct hlist_head *head; + int batman_count = 0; + u32 i; + + seq_printf(seq, " %-15s %s (%11s) %17s [%10s]: %20s ...\n", + "Originator", "last-seen", "throughput", "Nexthop", + "outgoingIF", "Potential nexthops"); + + for (i = 0; i < hash->size; i++) { + head = &hash->table[i]; + + rcu_read_lock(); + hlist_for_each_entry_rcu(orig_node, head, hash_entry) { + neigh_node = batadv_orig_router_get(orig_node, + if_outgoing); + if (!neigh_node) + continue; + + n_ifinfo = batadv_neigh_ifinfo_get(neigh_node, + if_outgoing); + if (!n_ifinfo) + goto next; + + last_seen_jiffies = jiffies - orig_node->last_seen; + last_seen_msecs = jiffies_to_msecs(last_seen_jiffies); + last_seen_secs = last_seen_msecs / 1000; + last_seen_msecs = last_seen_msecs % 1000; + + seq_printf(seq, "%pM %4i.%03is (%9u.%1u) %pM [%10s]:", + orig_node->orig, last_seen_secs, + last_seen_msecs, + n_ifinfo->bat_v.throughput / 10, + n_ifinfo->bat_v.throughput % 10, + neigh_node->addr, + neigh_node->if_incoming->net_dev->name); + + batadv_v_orig_print_neigh(orig_node, if_outgoing, seq); + seq_puts(seq, "\n"); + batman_count++; + +next: + batadv_neigh_node_put(neigh_node); + if (n_ifinfo) + batadv_neigh_ifinfo_put(n_ifinfo); + } + rcu_read_unlock(); + } + + if (batman_count == 0) + seq_puts(seq, "No batman nodes in range ...\n"); +} + +static int batadv_v_neigh_cmp(struct batadv_neigh_node *neigh1, + struct batadv_hard_iface *if_outgoing1, + struct batadv_neigh_node *neigh2, + struct batadv_hard_iface *if_outgoing2) +{ + struct batadv_neigh_ifinfo *ifinfo1, *ifinfo2; + + ifinfo1 = batadv_neigh_ifinfo_get(neigh1, if_outgoing1); + ifinfo2 = batadv_neigh_ifinfo_get(neigh2, if_outgoing2); + + if (WARN_ON(!ifinfo1 || !ifinfo2)) + return 0; + + return ifinfo1->bat_v.throughput - ifinfo2->bat_v.throughput; +} + +static bool batadv_v_neigh_is_sob(struct batadv_neigh_node *neigh1, + struct batadv_hard_iface *if_outgoing1, + struct batadv_neigh_node *neigh2, + struct batadv_hard_iface *if_outgoing2) +{ + struct batadv_neigh_ifinfo *ifinfo1, *ifinfo2; + u32 threshold; + + ifinfo1 = batadv_neigh_ifinfo_get(neigh1, if_outgoing1); + ifinfo2 = batadv_neigh_ifinfo_get(neigh2, if_outgoing2); + + threshold = ifinfo1->bat_v.throughput / 4; + threshold = ifinfo1->bat_v.throughput - threshold; + + return ifinfo2->bat_v.throughput > threshold; +} + +static struct batadv_algo_ops batadv_batman_v __read_mostly = { + .name = "BATMAN_V", + .bat_iface_activate = batadv_v_iface_activate, + .bat_iface_enable = batadv_v_iface_enable, + .bat_iface_disable = batadv_v_iface_disable, + .bat_iface_update_mac = batadv_v_iface_update_mac, + .bat_primary_iface_set = batadv_v_primary_iface_set, + .bat_hardif_neigh_init = batadv_v_hardif_neigh_init, + .bat_ogm_emit = batadv_v_ogm_emit, + .bat_ogm_schedule = batadv_v_ogm_schedule, + .bat_orig_print = batadv_v_orig_print, + .bat_neigh_cmp = batadv_v_neigh_cmp, + .bat_neigh_is_similar_or_better = batadv_v_neigh_is_sob, + .bat_neigh_print = batadv_v_neigh_print, +}; + +/** + * batadv_v_mesh_init - initialize the B.A.T.M.A.N. V private resources for a + * mesh + * @bat_priv: the object representing the mesh interface to initialise + * + * Return: 0 on success or a negative error code otherwise + */ +int batadv_v_mesh_init(struct batadv_priv *bat_priv) +{ + return batadv_v_ogm_init(bat_priv); +} + +/** + * batadv_v_mesh_free - free the B.A.T.M.A.N. V private resources for a mesh + * @bat_priv: the object representing the mesh interface to free + */ +void batadv_v_mesh_free(struct batadv_priv *bat_priv) +{ + batadv_v_ogm_free(bat_priv); +} + +/** + * batadv_v_init - B.A.T.M.A.N. V initialization function + * + * Description: Takes care of initializing all the subcomponents. + * It is invoked upon module load only. + * + * Return: 0 on success or a negative error code otherwise + */ +int __init batadv_v_init(void) +{ + int ret; + + /* B.A.T.M.A.N. V echo location protocol packet */ + ret = batadv_recv_handler_register(BATADV_ELP, + batadv_v_elp_packet_recv); + if (ret < 0) + return ret; + + ret = batadv_recv_handler_register(BATADV_OGM2, + batadv_v_ogm_packet_recv); + if (ret < 0) + goto elp_unregister; + + ret = batadv_algo_register(&batadv_batman_v); + if (ret < 0) + goto ogm_unregister; + + return ret; + +ogm_unregister: + batadv_recv_handler_unregister(BATADV_OGM2); + +elp_unregister: + batadv_recv_handler_unregister(BATADV_ELP); + + return ret; +} diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c new file mode 100644 index 000000000..3844e7efd --- /dev/null +++ b/net/batman-adv/bat_v_elp.c @@ -0,0 +1,515 @@ +/* Copyright (C) 2011-2016 B.A.T.M.A.N. contributors: + * + * Linus Lüssing, Marek Lindner + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see <http://www.gnu.org/licenses/>. + */ + +#include "bat_v_elp.h" +#include "main.h" + +#include <linux/atomic.h> +#include <linux/byteorder/generic.h> +#include <linux/errno.h> +#include <linux/etherdevice.h> +#include <linux/ethtool.h> +#include <linux/fs.h> +#include <linux/if_ether.h> +#include <linux/jiffies.h> +#include <linux/kernel.h> +#include <linux/kref.h> +#include <linux/netdevice.h> +#include <linux/random.h> +#include <linux/rculist.h> +#include <linux/rcupdate.h> +#include <linux/rtnetlink.h> +#include <linux/skbuff.h> +#include <linux/stddef.h> +#include <linux/string.h> +#include <linux/types.h> +#include <linux/workqueue.h> +#include <net/cfg80211.h> + +#include "bat_algo.h" +#include "bat_v_ogm.h" +#include "hard-interface.h" +#include "originator.h" +#include "packet.h" +#include "routing.h" +#include "send.h" + +/** + * batadv_v_elp_start_timer - restart timer for ELP periodic work + * @hard_iface: the interface for which the timer has to be reset + */ +static void batadv_v_elp_start_timer(struct batadv_hard_iface *hard_iface) +{ + unsigned int msecs; + + msecs = atomic_read(&hard_iface->bat_v.elp_interval) - BATADV_JITTER; + msecs += prandom_u32() % (2 * BATADV_JITTER); + + queue_delayed_work(batadv_event_workqueue, &hard_iface->bat_v.elp_wq, + msecs_to_jiffies(msecs)); +} + +/** + * batadv_v_elp_get_throughput - get the throughput towards a neighbour + * @neigh: the neighbour for which the throughput has to be obtained + * + * Return: The throughput towards the given neighbour in multiples of 100kpbs + * (a value of '1' equals to 0.1Mbps, '10' equals 1Mbps, etc). + */ +static u32 batadv_v_elp_get_throughput(struct batadv_hardif_neigh_node *neigh) +{ + struct batadv_hard_iface *hard_iface = neigh->if_incoming; + struct ethtool_link_ksettings link_settings; + struct station_info sinfo; + u32 throughput; + int ret; + + /* if the user specified a customised value for this interface, then + * return it directly + */ + throughput = atomic_read(&hard_iface->bat_v.throughput_override); + if (throughput != 0) + return throughput; + + /* if this is a wireless device, then ask its throughput through + * cfg80211 API + */ + if (batadv_is_wifi_netdev(hard_iface->net_dev)) { + if (hard_iface->net_dev->ieee80211_ptr) { + ret = cfg80211_get_station(hard_iface->net_dev, + neigh->addr, &sinfo); + if (ret == -ENOENT) { + /* Node is not associated anymore! It would be + * possible to delete this neighbor. For now set + * the throughput metric to 0. + */ + return 0; + } + if (!ret) + return sinfo.expected_throughput / 100; + } + + /* unsupported WiFi driver version */ + goto default_throughput; + } + + /* if not a wifi interface, check if this device provides data via + * ethtool (e.g. an Ethernet adapter) + */ + memset(&link_settings, 0, sizeof(link_settings)); + rtnl_lock(); + ret = __ethtool_get_link_ksettings(hard_iface->net_dev, &link_settings); + rtnl_unlock(); + if (ret == 0) { + /* link characteristics might change over time */ + if (link_settings.base.duplex == DUPLEX_FULL) + hard_iface->bat_v.flags |= BATADV_FULL_DUPLEX; + else + hard_iface->bat_v.flags &= ~BATADV_FULL_DUPLEX; + + throughput = link_settings.base.speed; + if (throughput && (throughput != SPEED_UNKNOWN)) + return throughput * 10; + } + +default_throughput: + if (!(hard_iface->bat_v.flags & BATADV_WARNING_DEFAULT)) { + batadv_info(hard_iface->soft_iface, + "WiFi driver or ethtool info does not provide information about link speeds on interface %s, therefore defaulting to hardcoded throughput values of %u.%1u Mbps. Consider overriding the throughput manually or checking your driver.\n", + hard_iface->net_dev->name, + BATADV_THROUGHPUT_DEFAULT_VALUE / 10, + BATADV_THROUGHPUT_DEFAULT_VALUE % 10); + hard_iface->bat_v.flags |= BATADV_WARNING_DEFAULT; + } + + /* if none of the above cases apply, return the base_throughput */ + return BATADV_THROUGHPUT_DEFAULT_VALUE; +} + +/** + * batadv_v_elp_throughput_metric_update - worker updating the throughput metric + * of a single hop neighbour + * @work: the work queue item + */ +void batadv_v_elp_throughput_metric_update(struct work_struct *work) +{ + struct batadv_hardif_neigh_node_bat_v *neigh_bat_v; + struct batadv_hardif_neigh_node *neigh; + + neigh_bat_v = container_of(work, struct batadv_hardif_neigh_node_bat_v, + metric_work); + neigh = container_of(neigh_bat_v, struct batadv_hardif_neigh_node, + bat_v); + + ewma_throughput_add(&neigh->bat_v.throughput, + batadv_v_elp_get_throughput(neigh)); + + /* decrement refcounter to balance increment performed before scheduling + * this task + */ + batadv_hardif_neigh_put(neigh); +} + +/** + * batadv_v_elp_wifi_neigh_probe - send link probing packets to a neighbour + * @neigh: the neighbour to probe + * + * Sends a predefined number of unicast wifi packets to a given neighbour in + * order to trigger the throughput estimation on this link by the RC algorithm. + * Packets are sent only if there there is not enough payload unicast traffic + * towards this neighbour.. + * + * Return: True on success and false in case of error during skb preparation. + */ +static bool +batadv_v_elp_wifi_neigh_probe(struct batadv_hardif_neigh_node *neigh) +{ + struct batadv_hard_iface *hard_iface = neigh->if_incoming; + struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface); + unsigned long last_tx_diff; + struct sk_buff *skb; + int probe_len, i; + int elp_skb_len; + + /* this probing routine is for Wifi neighbours only */ + if (!batadv_is_wifi_netdev(hard_iface->net_dev)) + return true; + + /* probe the neighbor only if no unicast packets have been sent + * to it in the last 100 milliseconds: this is the rate control + * algorithm sampling interval (minstrel). In this way, if not + * enough traffic has been sent to the neighbor, batman-adv can + * generate 2 probe packets and push the RC algorithm to perform + * the sampling + */ + last_tx_diff = jiffies_to_msecs(jiffies - neigh->bat_v.last_unicast_tx); + if (last_tx_diff <= BATADV_ELP_PROBE_MAX_TX_DIFF) + return true; + + probe_len = max_t(int, sizeof(struct batadv_elp_packet), + BATADV_ELP_MIN_PROBE_SIZE); + + for (i = 0; i < BATADV_ELP_PROBES_PER_NODE; i++) { + elp_skb_len = hard_iface->bat_v.elp_skb->len; + skb = skb_copy_expand(hard_iface->bat_v.elp_skb, 0, + probe_len - elp_skb_len, + GFP_ATOMIC); + if (!skb) + return false; + + /* Tell the skb to get as big as the allocated space (we want + * the packet to be exactly of that size to make the link + * throughput estimation effective. + */ + skb_put(skb, probe_len - hard_iface->bat_v.elp_skb->len); + + batadv_dbg(BATADV_DBG_BATMAN, bat_priv, + "Sending unicast (probe) ELP packet on interface %s to %pM\n", + hard_iface->net_dev->name, neigh->addr); + + batadv_send_skb_packet(skb, hard_iface, neigh->addr); + } + + return true; +} + +/** + * batadv_v_elp_periodic_work - ELP periodic task per interface + * @work: work queue item + * + * Emits broadcast ELP message in regular intervals. + */ +static void batadv_v_elp_periodic_work(struct work_struct *work) +{ + struct batadv_hardif_neigh_node *hardif_neigh; + struct batadv_hard_iface *hard_iface; + struct batadv_hard_iface_bat_v *bat_v; + struct batadv_elp_packet *elp_packet; + struct batadv_priv *bat_priv; + struct sk_buff *skb; + u32 elp_interval; + + bat_v = container_of(work, struct batadv_hard_iface_bat_v, elp_wq.work); + hard_iface = container_of(bat_v, struct batadv_hard_iface, bat_v); + bat_priv = netdev_priv(hard_iface->soft_iface); + + if (atomic_read(&bat_priv->mesh_state) == BATADV_MESH_DEACTIVATING) + goto out; + + /* we are in the process of shutting this interface down */ + if ((hard_iface->if_status == BATADV_IF_NOT_IN_USE) || + (hard_iface->if_status == BATADV_IF_TO_BE_REMOVED)) + goto out; + + /* the interface was enabled but may not be ready yet */ + if (hard_iface->if_status != BATADV_IF_ACTIVE) + goto restart_timer; + + skb = skb_copy(hard_iface->bat_v.elp_skb, GFP_ATOMIC); + if (!skb) + goto restart_timer; + + elp_packet = (struct batadv_elp_packet *)skb->data; + elp_packet->seqno = htonl(atomic_read(&hard_iface->bat_v.elp_seqno)); + elp_interval = atomic_read(&hard_iface->bat_v.elp_interval); + elp_packet->elp_interval = htonl(elp_interval); + + batadv_dbg(BATADV_DBG_BATMAN, bat_priv, + "Sending broadcast ELP packet on interface %s, seqno %u\n", + hard_iface->net_dev->name, + atomic_read(&hard_iface->bat_v.elp_seqno)); + + batadv_send_broadcast_skb(skb, hard_iface); + + atomic_inc(&hard_iface->bat_v.elp_seqno); + + /* The throughput metric is updated on each sent packet. This way, if a + * node is dead and no longer sends packets, batman-adv is still able to + * react timely to its death. + * + * The throughput metric is updated by following these steps: + * 1) if the hard_iface is wifi => send a number of unicast ELPs for + * probing/sampling to each neighbor + * 2) update the throughput metric value of each neighbor (note that the + * value retrieved in this step might be 100ms old because the + * probing packets at point 1) could still be in the HW queue) + */ + rcu_read_lock(); + hlist_for_each_entry_rcu(hardif_neigh, &hard_iface->neigh_list, list) { + if (!batadv_v_elp_wifi_neigh_probe(hardif_neigh)) + /* if something goes wrong while probing, better to stop + * sending packets immediately and reschedule the task + */ + break; + + if (!kref_get_unless_zero(&hardif_neigh->refcount)) + continue; + + /* Reading the estimated throughput from cfg80211 is a task that + * may sleep and that is not allowed in an rcu protected + * context. Therefore schedule a task for that. + */ + queue_work(batadv_event_workqueue, + &hardif_neigh->bat_v.metric_work); + } + rcu_read_unlock(); + +restart_timer: + batadv_v_elp_start_timer(hard_iface); +out: + return; +} + +/** + * batadv_v_elp_iface_enable - setup the ELP interface private resources + * @hard_iface: interface for which the data has to be prepared + * + * Return: 0 on success or a -ENOMEM in case of failure. + */ +int batadv_v_elp_iface_enable(struct batadv_hard_iface *hard_iface) +{ + struct batadv_elp_packet *elp_packet; + unsigned char *elp_buff; + u32 random_seqno; + size_t size; + int res = -ENOMEM; + + size = ETH_HLEN + NET_IP_ALIGN + BATADV_ELP_HLEN; + hard_iface->bat_v.elp_skb = dev_alloc_skb(size); + if (!hard_iface->bat_v.elp_skb) + goto out; + + skb_reserve(hard_iface->bat_v.elp_skb, ETH_HLEN + NET_IP_ALIGN); + elp_buff = skb_push(hard_iface->bat_v.elp_skb, BATADV_ELP_HLEN); + elp_packet = (struct batadv_elp_packet *)elp_buff; + memset(elp_packet, 0, BATADV_ELP_HLEN); + + elp_packet->packet_type = BATADV_ELP; + elp_packet->version = BATADV_COMPAT_VERSION; + + /* randomize initial seqno to avoid collision */ + get_random_bytes(&random_seqno, sizeof(random_seqno)); + atomic_set(&hard_iface->bat_v.elp_seqno, random_seqno); + atomic_set(&hard_iface->bat_v.elp_interval, 500); + + /* assume full-duplex by default */ + hard_iface->bat_v.flags |= BATADV_FULL_DUPLEX; + + /* warn the user (again) if there is no throughput data is available */ + hard_iface->bat_v.flags &= ~BATADV_WARNING_DEFAULT; + + if (batadv_is_wifi_netdev(hard_iface->net_dev)) + hard_iface->bat_v.flags &= ~BATADV_FULL_DUPLEX; + + INIT_DELAYED_WORK(&hard_iface->bat_v.elp_wq, + batadv_v_elp_periodic_work); + batadv_v_elp_start_timer(hard_iface); + res = 0; + +out: + return res; +} + +/** + * batadv_v_elp_iface_disable - release ELP interface private resources + * @hard_iface: interface for which the resources have to be released + */ +void batadv_v_elp_iface_disable(struct batadv_hard_iface *hard_iface) +{ + cancel_delayed_work_sync(&hard_iface->bat_v.elp_wq); + + dev_kfree_skb(hard_iface->bat_v.elp_skb); + hard_iface->bat_v.elp_skb = NULL; +} + +/** + * batadv_v_elp_primary_iface_set - change internal data to reflect the new + * primary interface + * @primary_iface: the new primary interface + */ +void batadv_v_elp_primary_iface_set(struct batadv_hard_iface *primary_iface) +{ + struct batadv_hard_iface *hard_iface; + struct batadv_elp_packet *elp_packet; + struct sk_buff *skb; + + /* update orig field of every elp iface belonging to this mesh */ + rcu_read_lock(); + list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) { + if (primary_iface->soft_iface != hard_iface->soft_iface) + continue; + + if (!hard_iface->bat_v.elp_skb) + continue; + + skb = hard_iface->bat_v.elp_skb; + elp_packet = (struct batadv_elp_packet *)skb->data; + ether_addr_copy(elp_packet->orig, + primary_iface->net_dev->dev_addr); + } + rcu_read_unlock(); +} + +/** + * batadv_v_elp_neigh_update - update an ELP neighbour node + * @bat_priv: the bat priv with all the soft interface information + * @neigh_addr: the neighbour interface address + * @if_incoming: the interface the packet was received through + * @elp_packet: the received ELP packet + * + * Updates the ELP neighbour node state with the data received within the new + * ELP packet. + */ +static void batadv_v_elp_neigh_update(struct batadv_priv *bat_priv, + u8 *neigh_addr, + struct batadv_hard_iface *if_incoming, + struct batadv_elp_packet *elp_packet) + +{ + struct batadv_neigh_node *neigh; + struct batadv_orig_node *orig_neigh; + struct batadv_hardif_neigh_node *hardif_neigh; + s32 seqno_diff; + s32 elp_latest_seqno; + + orig_neigh = batadv_v_ogm_orig_get(bat_priv, elp_packet->orig); + if (!orig_neigh) + return; + + neigh = batadv_neigh_node_new(orig_neigh, if_incoming, neigh_addr); + if (!neigh) + goto orig_free; + + hardif_neigh = batadv_hardif_neigh_get(if_incoming, neigh_addr); + if (!hardif_neigh) + goto neigh_free; + + elp_latest_seqno = hardif_neigh->bat_v.elp_latest_seqno; + seqno_diff = ntohl(elp_packet->seqno) - elp_latest_seqno; + + /* known or older sequence numbers are ignored. However always adopt + * if the router seems to have been restarted. + */ + if (seqno_diff < 1 && seqno_diff > -BATADV_ELP_MAX_AGE) + goto hardif_free; + + neigh->last_seen = jiffies; + hardif_neigh->last_seen = jiffies; + hardif_neigh->bat_v.elp_latest_seqno = ntohl(elp_packet->seqno); + hardif_neigh->bat_v.elp_interval = ntohl(elp_packet->elp_interval); + +hardif_free: + if (hardif_neigh) + batadv_hardif_neigh_put(hardif_neigh); +neigh_free: + if (neigh) + batadv_neigh_node_put(neigh); +orig_free: + if (orig_neigh) + batadv_orig_node_put(orig_neigh); +} + +/** + * batadv_v_elp_packet_recv - main ELP packet handler + * @skb: the received packet + * @if_incoming: the interface this packet was received through + * + * Return: NET_RX_SUCCESS and consumes the skb if the packet was peoperly + * processed or NET_RX_DROP in case of failure. + */ +int batadv_v_elp_packet_recv(struct sk_buff *skb, + struct batadv_hard_iface *if_incoming) +{ + struct batadv_priv *bat_priv = netdev_priv(if_incoming->soft_iface); + struct batadv_elp_packet *elp_packet; + struct batadv_hard_iface *primary_if; + struct ethhdr *ethhdr = (struct ethhdr *)skb_mac_header(skb); + bool ret; + + ret = batadv_check_management_packet(skb, if_incoming, BATADV_ELP_HLEN); + if (!ret) + return NET_RX_DROP; + + if (batadv_is_my_mac(bat_priv, ethhdr->h_source)) + return NET_RX_DROP; + + /* did we receive a B.A.T.M.A.N. V ELP packet on an interface + * that does not have B.A.T.M.A.N. V ELP enabled ? + */ + if (strcmp(bat_priv->bat_algo_ops->name, "BATMAN_V") != 0) + return NET_RX_DROP; + + elp_packet = (struct batadv_elp_packet *)skb->data; + + batadv_dbg(BATADV_DBG_BATMAN, bat_priv, + "Received ELP packet from %pM seqno %u ORIG: %pM\n", + ethhdr->h_source, ntohl(elp_packet->seqno), + elp_packet->orig); + + primary_if = batadv_primary_if_get_selected(bat_priv); + if (!primary_if) + goto out; + + batadv_v_elp_neigh_update(bat_priv, ethhdr->h_source, if_incoming, + elp_packet); + +out: + if (primary_if) + batadv_hardif_put(primary_if); + consume_skb(skb); + return NET_RX_SUCCESS; +} diff --git a/net/batman-adv/bat_v_elp.h b/net/batman-adv/bat_v_elp.h new file mode 100644 index 000000000..e95f1bca0 --- /dev/null +++ b/net/batman-adv/bat_v_elp.h @@ -0,0 +1,33 @@ +/* Copyright (C) 2013-2016 B.A.T.M.A.N. contributors: + * + * Linus Lüssing, Marek Lindner + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see <http://www.gnu.org/licenses/>. + */ + +#include "main.h" + +#ifndef _NET_BATMAN_ADV_BAT_V_ELP_H_ +#define _NET_BATMAN_ADV_BAT_V_ELP_H_ + +struct sk_buff; +struct work_struct; + +int batadv_v_elp_iface_enable(struct batadv_hard_iface *hard_iface); +void batadv_v_elp_iface_disable(struct batadv_hard_iface *hard_iface); +void batadv_v_elp_primary_iface_set(struct batadv_hard_iface *primary_iface); +int batadv_v_elp_packet_recv(struct sk_buff *skb, + struct batadv_hard_iface *if_incoming); +void batadv_v_elp_throughput_metric_update(struct work_struct *work); + +#endif /* _NET_BATMAN_ADV_BAT_V_ELP_H_ */ diff --git a/net/batman-adv/bat_v_ogm.c b/net/batman-adv/bat_v_ogm.c new file mode 100644 index 000000000..91df28a10 --- /dev/null +++ b/net/batman-adv/bat_v_ogm.c @@ -0,0 +1,835 @@ +/* Copyright (C) 2013-2016 B.A.T.M.A.N. contributors: + * + * Antonio Quartulli + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see <http://www.gnu.org/licenses/>. + */ + +#include "bat_v_ogm.h" +#include "main.h" + +#include <linux/atomic.h> +#include <linux/byteorder/generic.h> +#include <linux/errno.h> +#include <linux/etherdevice.h> +#include <linux/fs.h> +#include <linux/if_ether.h> +#include <linux/jiffies.h> +#include <linux/kernel.h> +#include <linux/list.h> +#include <linux/netdevice.h> +#include <linux/random.h> +#include <linux/rculist.h> +#include <linux/rcupdate.h> +#include <linux/skbuff.h> +#include <linux/slab.h> +#include <linux/stddef.h> +#include <linux/string.h> +#include <linux/types.h> +#include <linux/workqueue.h> + +#include "hard-interface.h" +#include "hash.h" +#include "originator.h" +#include "packet.h" +#include "routing.h" +#include "send.h" +#include "translation-table.h" + +/** + * batadv_v_ogm_orig_get - retrieve and possibly create an originator node + * @bat_priv: the bat priv with all the soft interface information + * @addr: the address of the originator + * + * Return: the orig_node corresponding to the specified address. If such object + * does not exist it is allocated here. In case of allocation failure returns + * NULL. + */ +struct batadv_orig_node *batadv_v_ogm_orig_get(struct batadv_priv *bat_priv, + const u8 *addr) +{ + struct batadv_orig_node *orig_node; + int hash_added; + + orig_node = batadv_orig_hash_find(bat_priv, addr); + if (orig_node) + return orig_node; + + orig_node = batadv_orig_node_new(bat_priv, addr); + if (!orig_node) + return NULL; + + hash_added = batadv_hash_add(bat_priv->orig_hash, batadv_compare_orig, + batadv_choose_orig, orig_node, + &orig_node->hash_entry); + if (hash_added != 0) { + /* orig_node->refcounter is initialised to 2 by + * batadv_orig_node_new() + */ + batadv_orig_node_put(orig_node); + batadv_orig_node_put(orig_node); + orig_node = NULL; + } + + return orig_node; +} + +/** + * batadv_v_ogm_start_timer - restart the OGM sending timer + * @bat_priv: the bat priv with all the soft interface information + */ +static void batadv_v_ogm_start_timer(struct batadv_priv *bat_priv) +{ + unsigned long msecs; + /* this function may be invoked in different contexts (ogm rescheduling + * or hard_iface activation), but the work timer should not be reset + */ + if (delayed_work_pending(&bat_priv->bat_v.ogm_wq)) + return; + + msecs = atomic_read(&bat_priv->orig_interval) - BATADV_JITTER; + msecs += prandom_u32() % (2 * BATADV_JITTER); + queue_delayed_work(batadv_event_workqueue, &bat_priv->bat_v.ogm_wq, + msecs_to_jiffies(msecs)); +} + +/** + * batadv_v_ogm_send_to_if - send a batman ogm using a given interface + * @skb: the OGM to send + * @hard_iface: the interface to use to send the OGM + */ +static void batadv_v_ogm_send_to_if(struct sk_buff *skb, + struct batadv_hard_iface *hard_iface) +{ + struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface); + + if (hard_iface->if_status != BATADV_IF_ACTIVE) + return; + + batadv_inc_counter(bat_priv, BATADV_CNT_MGMT_TX); + batadv_add_counter(bat_priv, BATADV_CNT_MGMT_TX_BYTES, + skb->len + ETH_HLEN); + + batadv_send_broadcast_skb(skb, hard_iface); +} + +/** + * batadv_v_ogm_send - periodic worker broadcasting the own OGM + * @work: work queue item + */ +static void batadv_v_ogm_send(struct work_struct *work) +{ + struct batadv_hard_iface *hard_iface; + struct batadv_priv_bat_v *bat_v; + struct batadv_priv *bat_priv; + struct batadv_ogm2_packet *ogm_packet; + struct sk_buff *skb, *skb_tmp; + unsigned char *ogm_buff, *pkt_buff; + int ogm_buff_len; + u16 tvlv_len = 0; + + bat_v = container_of(work, struct batadv_priv_bat_v, ogm_wq.work); + bat_priv = container_of(bat_v, struct batadv_priv, bat_v); + + if (atomic_read(&bat_priv->mesh_state) == BATADV_MESH_DEACTIVATING) + goto out; + + ogm_buff = bat_priv->bat_v.ogm_buff; + ogm_buff_len = bat_priv->bat_v.ogm_buff_len; + /* tt changes have to be committed before the tvlv data is + * appended as it may alter the tt tvlv container + */ + batadv_tt_local_commit_changes(bat_priv); + tvlv_len = batadv_tvlv_container_ogm_append(bat_priv, &ogm_buff, + &ogm_buff_len, + BATADV_OGM2_HLEN); + + bat_priv->bat_v.ogm_buff = ogm_buff; + bat_priv->bat_v.ogm_buff_len = ogm_buff_len; + + skb = netdev_alloc_skb_ip_align(NULL, ETH_HLEN + ogm_buff_len); + if (!skb) + goto reschedule; + + skb_reserve(skb, ETH_HLEN); + pkt_buff = skb_put(skb, ogm_buff_len); + memcpy(pkt_buff, ogm_buff, ogm_buff_len); + + ogm_packet = (struct batadv_ogm2_packet *)skb->data; + ogm_packet->seqno = htonl(atomic_read(&bat_priv->bat_v.ogm_seqno)); + atomic_inc(&bat_priv->bat_v.ogm_seqno); + ogm_packet->tvlv_len = htons(tvlv_len); + + /* broadcast on every interface */ + rcu_read_lock(); + list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) { + if (hard_iface->soft_iface != bat_priv->soft_iface) + continue; + + batadv_dbg(BATADV_DBG_BATMAN, bat_priv, + "Sending own OGM2 packet (originator %pM, seqno %u, throughput %u, TTL %d) on interface %s [%pM]\n", + ogm_packet->orig, ntohl(ogm_packet->seqno), + ntohl(ogm_packet->throughput), ogm_packet->ttl, + hard_iface->net_dev->name, + hard_iface->net_dev->dev_addr); + + /* this skb gets consumed by batadv_v_ogm_send_to_if() */ + skb_tmp = skb_clone(skb, GFP_ATOMIC); + if (!skb_tmp) + break; + + batadv_v_ogm_send_to_if(skb_tmp, hard_iface); + } + rcu_read_unlock(); + + consume_skb(skb); + +reschedule: + batadv_v_ogm_start_timer(bat_priv); +out: + return; +} + +/** + * batadv_v_ogm_iface_enable - prepare an interface for B.A.T.M.A.N. V + * @hard_iface: the interface to prepare + * + * Takes care of scheduling own OGM sending routine for this interface. + * + * Return: 0 on success or a negative error code otherwise + */ +int batadv_v_ogm_iface_enable(struct batadv_hard_iface *hard_iface) +{ + struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface); + + batadv_v_ogm_start_timer(bat_priv); + + return 0; +} + +/** + * batadv_v_ogm_primary_iface_set - set a new primary interface + * @primary_iface: the new primary interface + */ +void batadv_v_ogm_primary_iface_set(struct batadv_hard_iface *primary_iface) +{ + struct batadv_priv *bat_priv = netdev_priv(primary_iface->soft_iface); + struct batadv_ogm2_packet *ogm_packet; + + if (!bat_priv->bat_v.ogm_buff) + return; + + ogm_packet = (struct batadv_ogm2_packet *)bat_priv->bat_v.ogm_buff; + ether_addr_copy(ogm_packet->orig, primary_iface->net_dev->dev_addr); +} + +/** + * batadv_v_ogm_orig_update - update the originator status based on the received + * OGM + * @bat_priv: the bat priv with all the soft interface information + * @orig_node: the originator to update + * @neigh_node: the neighbour the OGM has been received from (to update) + * @ogm2: the received OGM + * @if_outgoing: the interface where this OGM is going to be forwarded through + */ +static void +batadv_v_ogm_orig_update(struct batadv_priv *bat_priv, + struct batadv_orig_node *orig_node, + struct batadv_neigh_node *neigh_node, + const struct batadv_ogm2_packet *ogm2, + struct batadv_hard_iface *if_outgoing) +{ + struct batadv_neigh_ifinfo *router_ifinfo = NULL, *neigh_ifinfo = NULL; + struct batadv_neigh_node *router = NULL; + s32 neigh_seq_diff; + u32 neigh_last_seqno; + u32 router_last_seqno; + u32 router_throughput, neigh_throughput; + + batadv_dbg(BATADV_DBG_BATMAN, bat_priv, + "Searching and updating originator entry of received packet\n"); + + /* if this neighbor already is our next hop there is nothing + * to change + */ + router = batadv_orig_router_get(orig_node, if_outgoing); + if (router == neigh_node) + goto out; + + /* don't consider neighbours with worse throughput. + * also switch route if this seqno is BATADV_V_MAX_ORIGDIFF newer than + * the last received seqno from our best next hop. + */ + if (router) { + router_ifinfo = batadv_neigh_ifinfo_get(router, if_outgoing); + neigh_ifinfo = batadv_neigh_ifinfo_get(neigh_node, if_outgoing); + + /* if these are not allocated, something is wrong. */ + if (!router_ifinfo || !neigh_ifinfo) + goto out; + + neigh_last_seqno = neigh_ifinfo->bat_v.last_seqno; + router_last_seqno = router_ifinfo->bat_v.last_seqno; + neigh_seq_diff = neigh_last_seqno - router_last_seqno; + router_throughput = router_ifinfo->bat_v.throughput; + neigh_throughput = neigh_ifinfo->bat_v.throughput; + + if ((neigh_seq_diff < BATADV_OGM_MAX_ORIGDIFF) && + (router_throughput >= neigh_throughput)) + goto out; + } + + batadv_update_route(bat_priv, orig_node, if_outgoing, neigh_node); + +out: + if (router_ifinfo) + batadv_neigh_ifinfo_put(router_ifinfo); + if (neigh_ifinfo) + batadv_neigh_ifinfo_put(neigh_ifinfo); + if (router) + batadv_neigh_node_put(router); +} + +/** + * batadv_v_forward_penalty - apply a penalty to the throughput metric forwarded + * with B.A.T.M.A.N. V OGMs + * @bat_priv: the bat priv with all the soft interface information + * @if_incoming: the interface where the OGM has been received + * @if_outgoing: the interface where the OGM has to be forwarded to + * @throughput: the current throughput + * + * Apply a penalty on the current throughput metric value based on the + * characteristic of the interface where the OGM has been received. The return + * value is computed as follows: + * - throughput * 50% if the incoming and outgoing interface are the + * same WiFi interface and the throughput is above + * 1MBit/s + * - throughput if the outgoing interface is the default + * interface (i.e. this OGM is processed for the + * internal table and not forwarded) + * - throughput * hop penalty otherwise + * + * Return: the penalised throughput metric. + */ +static u32 batadv_v_forward_penalty(struct batadv_priv *bat_priv, + struct batadv_hard_iface *if_incoming, + struct batadv_hard_iface *if_outgoing, + u32 throughput) +{ + int hop_penalty = atomic_read(&bat_priv->hop_penalty); + int hop_penalty_max = BATADV_TQ_MAX_VALUE; + + /* Don't apply hop penalty in default originator table. */ + if (if_outgoing == BATADV_IF_DEFAULT) + return throughput; + + /* Forwarding on the same WiFi interface cuts the throughput in half + * due to the store & forward characteristics of WIFI. + * Very low throughput values are the exception. + */ + if ((throughput > 10) && + (if_incoming == if_outgoing) && + !(if_incoming->bat_v.flags & BATADV_FULL_DUPLEX)) + return throughput / 2; + + /* hop penalty of 255 equals 100% */ + return throughput * (hop_penalty_max - hop_penalty) / hop_penalty_max; +} + +/** + * batadv_v_ogm_forward - forward an OGM to the given outgoing interface + * @bat_priv: the bat priv with all the soft interface information + * @ogm_received: previously received OGM to be forwarded + * @throughput: throughput to announce, may vary per outgoing interface + * @if_incoming: the interface on which this OGM was received on + * @if_outgoing: the interface to which the OGM has to be forwarded to + * + * Forward an OGM to an interface after having altered the throughput metric and + * the TTL value contained in it. The original OGM isn't modified. + */ +static void batadv_v_ogm_forward(struct batadv_priv *bat_priv, + const struct batadv_ogm2_packet *ogm_received, + u32 throughput, + struct batadv_hard_iface *if_incoming, + struct batadv_hard_iface *if_outgoing) +{ + struct batadv_ogm2_packet *ogm_forward; + unsigned char *skb_buff; + struct sk_buff *skb; + size_t packet_len; + u16 tvlv_len; + + if (ogm_received->ttl <= 1) { + batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "ttl exceeded\n"); + return; + } + + tvlv_len = ntohs(ogm_received->tvlv_len); + + packet_len = BATADV_OGM2_HLEN + tvlv_len; + skb = netdev_alloc_skb_ip_align(if_outgoing->net_dev, + ETH_HLEN + packet_len); + if (!skb) + return; + + skb_reserve(skb, ETH_HLEN); + skb_buff = skb_put(skb, packet_len); + memcpy(skb_buff, ogm_received, packet_len); + + /* apply forward penalty */ + ogm_forward = (struct batadv_ogm2_packet *)skb_buff; + ogm_forward->throughput = htonl(throughput); + ogm_forward->ttl--; + + batadv_dbg(BATADV_DBG_BATMAN, bat_priv, + "Forwarding OGM2 packet on %s: throughput %u, ttl %u, received via %s\n", + if_outgoing->net_dev->name, throughput, ogm_forward->ttl, + if_incoming->net_dev->name); + + batadv_v_ogm_send_to_if(skb, if_outgoing); +} + +/** + * batadv_v_ogm_metric_update - update route metric based on OGM + * @bat_priv: the bat priv with all the soft interface information + * @ogm2: OGM2 structure + * @orig_node: Originator structure for which the OGM has been received + * @neigh_node: the neigh_node through with the OGM has been received + * @if_incoming: the interface where this packet was received + * @if_outgoing: the interface for which the packet should be considered + * + * Return: + * 1 if the OGM is new, + * 0 if it is not new but valid, + * <0 on error (e.g. old OGM) + */ +static int batadv_v_ogm_metric_update(struct batadv_priv *bat_priv, + const struct batadv_ogm2_packet *ogm2, + struct batadv_orig_node *orig_node, + struct batadv_neigh_node *neigh_node, + struct batadv_hard_iface *if_incoming, + struct batadv_hard_iface *if_outgoing) +{ + struct batadv_orig_ifinfo *orig_ifinfo = NULL; + struct batadv_neigh_ifinfo *neigh_ifinfo = NULL; + bool protection_started = false; + int ret = -EINVAL; + u32 path_throughput; + s32 seq_diff; + + orig_ifinfo = batadv_orig_ifinfo_new(orig_node, if_outgoing); + if (!orig_ifinfo) + goto out; + + seq_diff = ntohl(ogm2->seqno) - orig_ifinfo->last_real_seqno; + + if (!hlist_empty(&orig_node->neigh_list) && + batadv_window_protected(bat_priv, seq_diff, + BATADV_OGM_MAX_AGE, + &orig_ifinfo->batman_seqno_reset, + &protection_started)) { + batadv_dbg(BATADV_DBG_BATMAN, bat_priv, + "Drop packet: packet within window protection time from %pM\n", + ogm2->orig); + batadv_dbg(BATADV_DBG_BATMAN, bat_priv, + "Last reset: %ld, %ld\n", + orig_ifinfo->batman_seqno_reset, jiffies); + goto out; + } + + /* drop packets with old seqnos, however accept the first packet after + * a host has been rebooted. + */ + if ((seq_diff < 0) && !protection_started) + goto out; + + neigh_node->last_seen = jiffies; + + orig_node->last_seen = jiffies; + + orig_ifinfo->last_real_seqno = ntohl(ogm2->seqno); + orig_ifinfo->last_ttl = ogm2->ttl; + + neigh_ifinfo = batadv_neigh_ifinfo_new(neigh_node, if_outgoing); + if (!neigh_ifinfo) + goto out; + + path_throughput = batadv_v_forward_penalty(bat_priv, if_incoming, + if_outgoing, + ntohl(ogm2->throughput)); + neigh_ifinfo->bat_v.throughput = path_throughput; + neigh_ifinfo->bat_v.last_seqno = ntohl(ogm2->seqno); + neigh_ifinfo->last_ttl = ogm2->ttl; + + if (seq_diff > 0 || protection_started) + ret = 1; + else + ret = 0; +out: + if (orig_ifinfo) + batadv_orig_ifinfo_put(orig_ifinfo); + if (neigh_ifinfo) + batadv_neigh_ifinfo_put(neigh_ifinfo); + + return ret; +} + +/** + * batadv_v_ogm_route_update - update routes based on OGM + * @bat_priv: the bat priv with all the soft interface information + * @ethhdr: the Ethernet header of the OGM2 + * @ogm2: OGM2 structure + * @orig_node: Originator structure for which the OGM has been received + * @neigh_node: the neigh_node through with the OGM has been received + * @if_incoming: the interface where this packet was received + * @if_outgoing: the interface for which the packet should be considered + */ +static void batadv_v_ogm_route_update(struct batadv_priv *bat_priv, + const struct ethhdr *ethhdr, + const struct batadv_ogm2_packet *ogm2, + struct batadv_orig_node *orig_node, + struct batadv_neigh_node *neigh_node, + struct batadv_hard_iface *if_incoming, + struct batadv_hard_iface *if_outgoing) +{ + struct batadv_neigh_node *router = NULL; + struct batadv_neigh_ifinfo *neigh_ifinfo = NULL; + struct batadv_orig_node *orig_neigh_node = NULL; + struct batadv_orig_ifinfo *orig_ifinfo = NULL; + struct batadv_neigh_node *orig_neigh_router = NULL; + + neigh_ifinfo = batadv_neigh_ifinfo_get(neigh_node, if_outgoing); + if (!neigh_ifinfo) + goto out; + + orig_neigh_node = batadv_v_ogm_orig_get(bat_priv, ethhdr->h_source); + if (!orig_neigh_node) + goto out; + + orig_neigh_router = batadv_orig_router_get(orig_neigh_node, + if_outgoing); + + /* drop packet if sender is not a direct neighbor and if we + * don't route towards it + */ + router = batadv_orig_router_get(orig_node, if_outgoing); + if (router && router->orig_node != orig_node && !orig_neigh_router) { + batadv_dbg(BATADV_DBG_BATMAN, bat_priv, + "Drop packet: OGM via unknown neighbor!\n"); + goto out; + } + + if (router) { + batadv_neigh_node_put(router); + router = NULL; + } + + /* Update routes, and check if the OGM is from the best next hop */ + batadv_v_ogm_orig_update(bat_priv, orig_node, neigh_node, ogm2, + if_outgoing); + + orig_ifinfo = batadv_orig_ifinfo_new(orig_node, if_outgoing); + if (!orig_ifinfo) + goto out; + + /* don't forward the same seqno twice on one interface */ + if (orig_ifinfo->last_seqno_forwarded == ntohl(ogm2->seqno)) + goto out; + + /* acquire possibly updated router */ + router = batadv_orig_router_get(orig_node, if_outgoing); + + /* strict rule: forward packets coming from the best next hop only */ + if (neigh_node != router) + goto out; + + /* only forward for specific interface, not for the default one. */ + if (if_outgoing != BATADV_IF_DEFAULT) { + orig_ifinfo->last_seqno_forwarded = ntohl(ogm2->seqno); + batadv_v_ogm_forward(bat_priv, ogm2, + neigh_ifinfo->bat_v.throughput, + if_incoming, if_outgoing); + } + +out: + if (orig_ifinfo) + batadv_orig_ifinfo_put(orig_ifinfo); + if (router) + batadv_neigh_node_put(router); + if (orig_neigh_router) + batadv_neigh_node_put(orig_neigh_router); + if (orig_neigh_node) + batadv_orig_node_put(orig_neigh_node); + if (neigh_ifinfo) + batadv_neigh_ifinfo_put(neigh_ifinfo); +} + +/** + * batadv_v_ogm_process_per_outif - process a batman v OGM for an outgoing if + * @bat_priv: the bat priv with all the soft interface information + * @ethhdr: the Ethernet header of the OGM2 + * @ogm2: OGM2 structure + * @orig_node: Originator structure for which the OGM has been received + * @neigh_node: the neigh_node through with the OGM has been received + * @if_incoming: the interface where this packet was received + * @if_outgoing: the interface for which the packet should be considered + */ +static void +batadv_v_ogm_process_per_outif(struct batadv_priv *bat_priv, + const struct ethhdr *ethhdr, + const struct batadv_ogm2_packet *ogm2, + struct batadv_orig_node *orig_node, + struct batadv_neigh_node *neigh_node, + struct batadv_hard_iface *if_incoming, + struct batadv_hard_iface *if_outgoing) +{ + int seqno_age; + + /* first, update the metric with according sanity checks */ + seqno_age = batadv_v_ogm_metric_update(bat_priv, ogm2, orig_node, + neigh_node, if_incoming, + if_outgoing); + + /* outdated sequence numbers are to be discarded */ + if (seqno_age < 0) + return; + + /* only unknown & newer OGMs contain TVLVs we are interested in */ + if ((seqno_age > 0) && (if_outgoing == BATADV_IF_DEFAULT)) + batadv_tvlv_containers_process(bat_priv, true, orig_node, + NULL, NULL, + (unsigned char *)(ogm2 + 1), + ntohs(ogm2->tvlv_len)); + + /* if the metric update went through, update routes if needed */ + batadv_v_ogm_route_update(bat_priv, ethhdr, ogm2, orig_node, + neigh_node, if_incoming, if_outgoing); +} + +/** + * batadv_v_ogm_aggr_packet - checks if there is another OGM aggregated + * @buff_pos: current position in the skb + * @packet_len: total length of the skb + * @tvlv_len: tvlv length of the previously considered OGM + * + * Return: true if there is enough space for another OGM, false otherwise. + */ +static bool batadv_v_ogm_aggr_packet(int buff_pos, int packet_len, + __be16 tvlv_len) +{ + int next_buff_pos = 0; + + next_buff_pos += buff_pos + BATADV_OGM2_HLEN; + next_buff_pos += ntohs(tvlv_len); + + return (next_buff_pos <= packet_len) && + (next_buff_pos <= BATADV_MAX_AGGREGATION_BYTES); +} + +/** + * batadv_v_ogm_process - process an incoming batman v OGM + * @skb: the skb containing the OGM + * @ogm_offset: offset to the OGM which should be processed (for aggregates) + * @if_incoming: the interface where this packet was receved + */ +static void batadv_v_ogm_process(const struct sk_buff *skb, int ogm_offset, + struct batadv_hard_iface *if_incoming) +{ + struct batadv_priv *bat_priv = netdev_priv(if_incoming->soft_iface); + struct ethhdr *ethhdr; + struct batadv_orig_node *orig_node = NULL; + struct batadv_hardif_neigh_node *hardif_neigh = NULL; + struct batadv_neigh_node *neigh_node = NULL; + struct batadv_hard_iface *hard_iface; + struct batadv_ogm2_packet *ogm_packet; + u32 ogm_throughput, link_throughput, path_throughput; + + ethhdr = eth_hdr(skb); + ogm_packet = (struct batadv_ogm2_packet *)(skb->data + ogm_offset); + + ogm_throughput = ntohl(ogm_packet->throughput); + + batadv_dbg(BATADV_DBG_BATMAN, bat_priv, + "Received OGM2 packet via NB: %pM, IF: %s [%pM] (from OG: %pM, seqno %u, troughput %u, TTL %u, V %u, tvlv_len %u)\n", + ethhdr->h_source, if_incoming->net_dev->name, + if_incoming->net_dev->dev_addr, ogm_packet->orig, + ntohl(ogm_packet->seqno), ogm_throughput, ogm_packet->ttl, + ogm_packet->version, ntohs(ogm_packet->tvlv_len)); + + /* If the troughput metric is 0, immediately drop the packet. No need to + * create orig_node / neigh_node for an unusable route. + */ + if (ogm_throughput == 0) { + batadv_dbg(BATADV_DBG_BATMAN, bat_priv, + "Drop packet: originator packet with troughput metric of 0\n"); + return; + } + + /* require ELP packets be to received from this neighbor first */ + hardif_neigh = batadv_hardif_neigh_get(if_incoming, ethhdr->h_source); + if (!hardif_neigh) { + batadv_dbg(BATADV_DBG_BATMAN, bat_priv, + "Drop packet: OGM via unknown neighbor!\n"); + goto out; + } + + orig_node = batadv_v_ogm_orig_get(bat_priv, ogm_packet->orig); + if (!orig_node) + return; + + neigh_node = batadv_neigh_node_new(orig_node, if_incoming, + ethhdr->h_source); + if (!neigh_node) + goto out; + + /* Update the received throughput metric to match the link + * characteristic: + * - If this OGM traveled one hop so far (emitted by single hop + * neighbor) the path throughput metric equals the link throughput. + * - For OGMs traversing more than hop the path throughput metric is + * the smaller of the path throughput and the link throughput. + */ + link_throughput = ewma_throughput_read(&hardif_neigh->bat_v.throughput); + path_throughput = min_t(u32, link_throughput, ogm_throughput); + ogm_packet->throughput = htonl(path_throughput); + + batadv_v_ogm_process_per_outif(bat_priv, ethhdr, ogm_packet, orig_node, + neigh_node, if_incoming, + BATADV_IF_DEFAULT); + + rcu_read_lock(); + list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) { + if (hard_iface->if_status != BATADV_IF_ACTIVE) + continue; + + if (hard_iface->soft_iface != bat_priv->soft_iface) + continue; + + batadv_v_ogm_process_per_outif(bat_priv, ethhdr, ogm_packet, + orig_node, neigh_node, + if_incoming, hard_iface); + } + rcu_read_unlock(); +out: + if (orig_node) + batadv_orig_node_put(orig_node); + if (neigh_node) + batadv_neigh_node_put(neigh_node); + if (hardif_neigh) + batadv_hardif_neigh_put(hardif_neigh); +} + +/** + * batadv_v_ogm_packet_recv - OGM2 receiving handler + * @skb: the received OGM + * @if_incoming: the interface where this OGM has been received + * + * Return: NET_RX_SUCCESS and consume the skb on success or returns NET_RX_DROP + * (without freeing the skb) on failure + */ +int batadv_v_ogm_packet_recv(struct sk_buff *skb, + struct batadv_hard_iface *if_incoming) +{ + struct batadv_priv *bat_priv = netdev_priv(if_incoming->soft_iface); + struct batadv_ogm2_packet *ogm_packet; + struct ethhdr *ethhdr = eth_hdr(skb); + int ogm_offset; + u8 *packet_pos; + int ret = NET_RX_DROP; + + /* did we receive a OGM2 packet on an interface that does not have + * B.A.T.M.A.N. V enabled ? + */ + if (strcmp(bat_priv->bat_algo_ops->name, "BATMAN_V") != 0) + return NET_RX_DROP; + + if (!batadv_check_management_packet(skb, if_incoming, BATADV_OGM2_HLEN)) + return NET_RX_DROP; + + if (batadv_is_my_mac(bat_priv, ethhdr->h_source)) + return NET_RX_DROP; + + ogm_packet = (struct batadv_ogm2_packet *)skb->data; + + if (batadv_is_my_mac(bat_priv, ogm_packet->orig)) + return NET_RX_DROP; + + batadv_inc_counter(bat_priv, BATADV_CNT_MGMT_RX); + batadv_add_counter(bat_priv, BATADV_CNT_MGMT_RX_BYTES, + skb->len + ETH_HLEN); + + ogm_offset = 0; + ogm_packet = (struct batadv_ogm2_packet *)skb->data; + + while (batadv_v_ogm_aggr_packet(ogm_offset, skb_headlen(skb), + ogm_packet->tvlv_len)) { + batadv_v_ogm_process(skb, ogm_offset, if_incoming); + + ogm_offset += BATADV_OGM2_HLEN; + ogm_offset += ntohs(ogm_packet->tvlv_len); + + packet_pos = skb->data + ogm_offset; + ogm_packet = (struct batadv_ogm2_packet *)packet_pos; + } + + ret = NET_RX_SUCCESS; + consume_skb(skb); + + return ret; +} + +/** + * batadv_v_ogm_init - initialise the OGM2 engine + * @bat_priv: the bat priv with all the soft interface information + * + * Return: 0 on success or a negative error code in case of failure + */ +int batadv_v_ogm_init(struct batadv_priv *bat_priv) +{ + struct batadv_ogm2_packet *ogm_packet; + unsigned char *ogm_buff; + u32 random_seqno; + + bat_priv->bat_v.ogm_buff_len = BATADV_OGM2_HLEN; + ogm_buff = kzalloc(bat_priv->bat_v.ogm_buff_len, GFP_ATOMIC); + if (!ogm_buff) + return -ENOMEM; + + bat_priv->bat_v.ogm_buff = ogm_buff; + ogm_packet = (struct batadv_ogm2_packet *)ogm_buff; + ogm_packet->packet_type = BATADV_OGM2; + ogm_packet->version = BATADV_COMPAT_VERSION; + ogm_packet->ttl = BATADV_TTL; + ogm_packet->flags = BATADV_NO_FLAGS; + ogm_packet->throughput = htonl(BATADV_THROUGHPUT_MAX_VALUE); + + /* randomize initial seqno to avoid collision */ + get_random_bytes(&random_seqno, sizeof(random_seqno)); + atomic_set(&bat_priv->bat_v.ogm_seqno, random_seqno); + INIT_DELAYED_WORK(&bat_priv->bat_v.ogm_wq, batadv_v_ogm_send); + + return 0; +} + +/** + * batadv_v_ogm_free - free OGM private resources + * @bat_priv: the bat priv with all the soft interface information + */ +void batadv_v_ogm_free(struct batadv_priv *bat_priv) +{ + cancel_delayed_work_sync(&bat_priv->bat_v.ogm_wq); + + kfree(bat_priv->bat_v.ogm_buff); + bat_priv->bat_v.ogm_buff = NULL; + bat_priv->bat_v.ogm_buff_len = 0; +} diff --git a/net/batman-adv/bat_v_ogm.h b/net/batman-adv/bat_v_ogm.h new file mode 100644 index 000000000..d849c75ad --- /dev/null +++ b/net/batman-adv/bat_v_ogm.h @@ -0,0 +1,36 @@ +/* Copyright (C) 2013-2016 B.A.T.M.A.N. contributors: + * + * Antonio Quartulli + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef _BATMAN_ADV_BATADV_V_OGM_H_ +#define _BATMAN_ADV_BATADV_V_OGM_H_ + +#include <linux/types.h> + +struct batadv_hard_iface; +struct batadv_priv; +struct sk_buff; + +int batadv_v_ogm_init(struct batadv_priv *bat_priv); +void batadv_v_ogm_free(struct batadv_priv *bat_priv); +int batadv_v_ogm_iface_enable(struct batadv_hard_iface *hard_iface); +struct batadv_orig_node *batadv_v_ogm_orig_get(struct batadv_priv *bat_priv, + const u8 *addr); +void batadv_v_ogm_primary_iface_set(struct batadv_hard_iface *primary_iface); +int batadv_v_ogm_packet_recv(struct sk_buff *skb, + struct batadv_hard_iface *if_incoming); + +#endif /* _BATMAN_ADV_BATADV_V_OGM_H_ */ diff --git a/net/batman-adv/bitarray.c b/net/batman-adv/bitarray.c index 25cbc36e9..b56bb000a 100644 --- a/net/batman-adv/bitarray.c +++ b/net/batman-adv/bitarray.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2006-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2006-2016 B.A.T.M.A.N. contributors: * * Simon Wunderlich, Marek Lindner * @@ -29,10 +29,16 @@ static void batadv_bitmap_shift_left(unsigned long *seq_bits, s32 n) bitmap_shift_left(seq_bits, seq_bits, n, BATADV_TQ_LOCAL_WINDOW_SIZE); } -/* receive and process one packet within the sequence number window. +/** + * batadv_bit_get_packet - receive and process one packet within the sequence + * number window + * @priv: the bat priv with all the soft interface information + * @seq_bits: pointer to the sequence number receive packet + * @seq_num_diff: difference between the current/received sequence number and + * the last sequence number + * @set_mark: whether this packet should be marked in seq_bits * - * returns: - * 1 if the window was moved (either new or very old) + * Return: 1 if the window was moved (either new or very old), * 0 if the window was not moved/shifted. */ int batadv_bit_get_packet(void *priv, unsigned long *seq_bits, s32 seq_num_diff, diff --git a/net/batman-adv/bitarray.h b/net/batman-adv/bitarray.h index 0226b220f..3e41bb80e 100644 --- a/net/batman-adv/bitarray.h +++ b/net/batman-adv/bitarray.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2006-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2006-2016 B.A.T.M.A.N. contributors: * * Simon Wunderlich, Marek Lindner * @@ -24,7 +24,14 @@ #include <linux/compiler.h> #include <linux/types.h> -/* Returns 1 if the corresponding bit in the given seq_bits indicates true +/** + * batadv_test_bit - check if bit is set in the current window + * + * @seq_bits: pointer to the sequence number receive packet + * @last_seqno: latest sequence number in seq_bits + * @curr_seqno: sequence number to test for + * + * Return: 1 if the corresponding bit in the given seq_bits indicates true * and curr_seqno is within range of last_seqno. Otherwise returns 0. */ static inline int batadv_test_bit(const unsigned long *seq_bits, @@ -48,9 +55,6 @@ static inline void batadv_set_bit(unsigned long *seq_bits, s32 n) set_bit(n, seq_bits); /* turn the position on */ } -/* receive and process one packet, returns 1 if received seq_num is considered - * new, 0 if old - */ int batadv_bit_get_packet(void *priv, unsigned long *seq_bits, s32 seq_num_diff, int set_mark); diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c index c24c481b6..0a6c8b824 100644 --- a/net/batman-adv/bridge_loop_avoidance.c +++ b/net/batman-adv/bridge_loop_avoidance.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2011-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2011-2016 B.A.T.M.A.N. contributors: * * Simon Wunderlich * @@ -31,6 +31,7 @@ #include <linux/jhash.h> #include <linux/jiffies.h> #include <linux/kernel.h> +#include <linux/kref.h> #include <linux/list.h> #include <linux/lockdep.h> #include <linux/netdevice.h> @@ -58,7 +59,13 @@ static void batadv_bla_send_announce(struct batadv_priv *bat_priv, struct batadv_bla_backbone_gw *backbone_gw); -/* return the index of the claim */ +/** + * batadv_choose_claim - choose the right bucket for a claim. + * @data: data to hash + * @size: size of the hash table + * + * Return: the hash index of the claim + */ static inline u32 batadv_choose_claim(const void *data, u32 size) { struct batadv_bla_claim *claim = (struct batadv_bla_claim *)data; @@ -70,7 +77,13 @@ static inline u32 batadv_choose_claim(const void *data, u32 size) return hash % size; } -/* return the index of the backbone gateway */ +/** + * batadv_choose_backbone_gw - choose the right bucket for a backbone gateway. + * @data: data to hash + * @size: size of the hash table + * + * Return: the hash index of the backbone gateway + */ static inline u32 batadv_choose_backbone_gw(const void *data, u32 size) { const struct batadv_bla_claim *claim = (struct batadv_bla_claim *)data; @@ -82,7 +95,13 @@ static inline u32 batadv_choose_backbone_gw(const void *data, u32 size) return hash % size; } -/* compares address and vid of two backbone gws */ +/** + * batadv_compare_backbone_gw - compare address and vid of two backbone gws + * @node: list node of the first entry to compare + * @data2: pointer to the second backbone gateway + * + * Return: 1 if the backbones have the same data, 0 otherwise + */ static int batadv_compare_backbone_gw(const struct hlist_node *node, const void *data2) { @@ -100,7 +119,13 @@ static int batadv_compare_backbone_gw(const struct hlist_node *node, return 1; } -/* compares address and vid of two claims */ +/** + * batadv_compare_backbone_gw - compare address and vid of two claims + * @node: list node of the first entry to compare + * @data2: pointer to the second claims + * + * Return: 1 if the claim have the same data, 0 otherwise + */ static int batadv_compare_claim(const struct hlist_node *node, const void *data2) { @@ -118,35 +143,62 @@ static int batadv_compare_claim(const struct hlist_node *node, return 1; } -/* free a backbone gw */ -static void -batadv_backbone_gw_free_ref(struct batadv_bla_backbone_gw *backbone_gw) +/** + * batadv_backbone_gw_release - release backbone gw from lists and queue for + * free after rcu grace period + * @ref: kref pointer of the backbone gw + */ +static void batadv_backbone_gw_release(struct kref *ref) { - if (atomic_dec_and_test(&backbone_gw->refcount)) - kfree_rcu(backbone_gw, rcu); + struct batadv_bla_backbone_gw *backbone_gw; + + backbone_gw = container_of(ref, struct batadv_bla_backbone_gw, + refcount); + + kfree_rcu(backbone_gw, rcu); } -/* finally deinitialize the claim */ -static void batadv_claim_release(struct batadv_bla_claim *claim) +/** + * batadv_backbone_gw_put - decrement the backbone gw refcounter and possibly + * release it + * @backbone_gw: backbone gateway to be free'd + */ +static void batadv_backbone_gw_put(struct batadv_bla_backbone_gw *backbone_gw) { - batadv_backbone_gw_free_ref(claim->backbone_gw); + kref_put(&backbone_gw->refcount, batadv_backbone_gw_release); +} + +/** + * batadv_claim_release - release claim from lists and queue for free after rcu + * grace period + * @ref: kref pointer of the claim + */ +static void batadv_claim_release(struct kref *ref) +{ + struct batadv_bla_claim *claim; + + claim = container_of(ref, struct batadv_bla_claim, refcount); + + batadv_backbone_gw_put(claim->backbone_gw); kfree_rcu(claim, rcu); } -/* free a claim, call claim_free_rcu if its the last reference */ -static void batadv_claim_free_ref(struct batadv_bla_claim *claim) +/** + * batadv_claim_put - decrement the claim refcounter and possibly + * release it + * @claim: claim to be free'd + */ +static void batadv_claim_put(struct batadv_bla_claim *claim) { - if (atomic_dec_and_test(&claim->refcount)) - batadv_claim_release(claim); + kref_put(&claim->refcount, batadv_claim_release); } /** - * batadv_claim_hash_find + * batadv_claim_hash_find - looks for a claim in the claim hash * @bat_priv: the bat priv with all the soft interface information * @data: search data (may be local/static data) * - * looks for a claim in the hash, and returns it if found - * or NULL otherwise. + * Return: claim if found or NULL otherwise. */ static struct batadv_bla_claim *batadv_claim_hash_find(struct batadv_priv *bat_priv, @@ -169,7 +221,7 @@ static struct batadv_bla_claim if (!batadv_compare_claim(&claim->hash_entry, data)) continue; - if (!atomic_inc_not_zero(&claim->refcount)) + if (!kref_get_unless_zero(&claim->refcount)) continue; claim_tmp = claim; @@ -181,12 +233,12 @@ static struct batadv_bla_claim } /** - * batadv_backbone_hash_find - looks for a claim in the hash + * batadv_backbone_hash_find - looks for a backbone gateway in the hash * @bat_priv: the bat priv with all the soft interface information * @addr: the address of the originator * @vid: the VLAN ID * - * Returns claim if found or NULL otherwise. + * Return: backbone gateway if found or NULL otherwise */ static struct batadv_bla_backbone_gw * batadv_backbone_hash_find(struct batadv_priv *bat_priv, u8 *addr, @@ -213,7 +265,7 @@ batadv_backbone_hash_find(struct batadv_priv *bat_priv, u8 *addr, &search_entry)) continue; - if (!atomic_inc_not_zero(&backbone_gw->refcount)) + if (!kref_get_unless_zero(&backbone_gw->refcount)) continue; backbone_gw_tmp = backbone_gw; @@ -224,7 +276,10 @@ batadv_backbone_hash_find(struct batadv_priv *bat_priv, u8 *addr, return backbone_gw_tmp; } -/* delete all claims for a backbone */ +/** + * batadv_bla_del_backbone_claims - delete all claims for a backbone + * @backbone_gw: backbone gateway where the claims should be removed + */ static void batadv_bla_del_backbone_claims(struct batadv_bla_backbone_gw *backbone_gw) { @@ -249,7 +304,7 @@ batadv_bla_del_backbone_claims(struct batadv_bla_backbone_gw *backbone_gw) if (claim->backbone_gw != backbone_gw) continue; - batadv_claim_free_ref(claim); + batadv_claim_put(claim); hlist_del_rcu(&claim->hash_entry); } spin_unlock_bh(list_lock); @@ -368,18 +423,17 @@ static void batadv_bla_send_claim(struct batadv_priv *bat_priv, u8 *mac, netif_rx(skb); out: if (primary_if) - batadv_hardif_free_ref(primary_if); + batadv_hardif_put(primary_if); } /** - * batadv_bla_get_backbone_gw + * batadv_bla_get_backbone_gw - finds or creates a backbone gateway * @bat_priv: the bat priv with all the soft interface information * @orig: the mac address of the originator * @vid: the VLAN ID * @own_backbone: set if the requested backbone is local * - * searches for the backbone gw or creates a new one if it could not - * be found. + * Return: the (possibly created) backbone gateway or NULL on error */ static struct batadv_bla_backbone_gw * batadv_bla_get_backbone_gw(struct batadv_priv *bat_priv, u8 *orig, @@ -412,7 +466,8 @@ batadv_bla_get_backbone_gw(struct batadv_priv *bat_priv, u8 *orig, ether_addr_copy(entry->orig, orig); /* one for the hash, one for returning */ - atomic_set(&entry->refcount, 2); + kref_init(&entry->refcount); + kref_get(&entry->refcount); hash_added = batadv_hash_add(bat_priv->bla.backbone_hash, batadv_compare_backbone_gw, @@ -430,7 +485,7 @@ batadv_bla_get_backbone_gw(struct batadv_priv *bat_priv, u8 *orig, if (orig_node) { batadv_tt_global_del_orig(bat_priv, orig_node, vid, "became a backbone gateway"); - batadv_orig_node_free_ref(orig_node); + batadv_orig_node_put(orig_node); } if (own_backbone) { @@ -445,7 +500,13 @@ batadv_bla_get_backbone_gw(struct batadv_priv *bat_priv, u8 *orig, return entry; } -/* update or add the own backbone gw to make sure we announce +/** + * batadv_bla_update_own_backbone_gw - updates the own backbone gw for a VLAN + * @bat_priv: the bat priv with all the soft interface information + * @primary_if: the selected primary interface + * @vid: VLAN identifier + * + * update or add the own backbone gw to make sure we announce * where we receive other backbone gws */ static void @@ -462,7 +523,7 @@ batadv_bla_update_own_backbone_gw(struct batadv_priv *bat_priv, return; backbone_gw->lasttime = jiffies; - batadv_backbone_gw_free_ref(backbone_gw); + batadv_backbone_gw_put(backbone_gw); } /** @@ -511,7 +572,7 @@ static void batadv_bla_answer_request(struct batadv_priv *bat_priv, /* finally, send an announcement frame */ batadv_bla_send_announce(bat_priv, backbone_gw); - batadv_backbone_gw_free_ref(backbone_gw); + batadv_backbone_gw_put(backbone_gw); } /** @@ -542,12 +603,9 @@ static void batadv_bla_send_request(struct batadv_bla_backbone_gw *backbone_gw) } /** - * batadv_bla_send_announce + * batadv_bla_send_announce - Send an announcement frame * @bat_priv: the bat priv with all the soft interface information * @backbone_gw: our backbone gateway which should be announced - * - * This function sends an announcement. It is called from multiple - * places. */ static void batadv_bla_send_announce(struct batadv_priv *bat_priv, struct batadv_bla_backbone_gw *backbone_gw) @@ -595,7 +653,8 @@ static void batadv_bla_add_claim(struct batadv_priv *bat_priv, claim->lasttime = jiffies; claim->backbone_gw = backbone_gw; - atomic_set(&claim->refcount, 2); + kref_init(&claim->refcount); + kref_get(&claim->refcount); batadv_dbg(BATADV_DBG_BLA, bat_priv, "bla_add_claim(): adding new entry %pM, vid %d to hash ...\n", mac, BATADV_PRINT_VID(vid)); @@ -622,10 +681,10 @@ static void batadv_bla_add_claim(struct batadv_priv *bat_priv, spin_lock_bh(&claim->backbone_gw->crc_lock); claim->backbone_gw->crc ^= crc16(0, claim->addr, ETH_ALEN); spin_unlock_bh(&claim->backbone_gw->crc_lock); - batadv_backbone_gw_free_ref(claim->backbone_gw); + batadv_backbone_gw_put(claim->backbone_gw); } /* set (new) backbone gw */ - atomic_inc(&backbone_gw->refcount); + kref_get(&backbone_gw->refcount); claim->backbone_gw = backbone_gw; spin_lock_bh(&backbone_gw->crc_lock); @@ -634,11 +693,14 @@ static void batadv_bla_add_claim(struct batadv_priv *bat_priv, backbone_gw->lasttime = jiffies; claim_free_ref: - batadv_claim_free_ref(claim); + batadv_claim_put(claim); } -/* Delete a claim from the claim hash which has the - * given mac address and vid. +/** + * batadv_bla_del_claim - delete a claim from the claim hash + * @bat_priv: the bat priv with all the soft interface information + * @mac: mac address of the claim to be removed + * @vid: VLAN id for the claim to be removed */ static void batadv_bla_del_claim(struct batadv_priv *bat_priv, const u8 *mac, const unsigned short vid) @@ -656,17 +718,25 @@ static void batadv_bla_del_claim(struct batadv_priv *bat_priv, batadv_hash_remove(bat_priv->bla.claim_hash, batadv_compare_claim, batadv_choose_claim, claim); - batadv_claim_free_ref(claim); /* reference from the hash is gone */ + batadv_claim_put(claim); /* reference from the hash is gone */ spin_lock_bh(&claim->backbone_gw->crc_lock); claim->backbone_gw->crc ^= crc16(0, claim->addr, ETH_ALEN); spin_unlock_bh(&claim->backbone_gw->crc_lock); /* don't need the reference from hash_find() anymore */ - batadv_claim_free_ref(claim); + batadv_claim_put(claim); } -/* check for ANNOUNCE frame, return 1 if handled */ +/** + * batadv_handle_announce - check for ANNOUNCE frame + * @bat_priv: the bat priv with all the soft interface information + * @an_addr: announcement mac address (ARP Sender HW address) + * @backbone_addr: originator address of the sender (Ethernet source MAC) + * @vid: the VLAN ID of the frame + * + * Return: 1 if handled + */ static int batadv_handle_announce(struct batadv_priv *bat_priv, u8 *an_addr, u8 *backbone_addr, unsigned short vid) { @@ -712,11 +782,20 @@ static int batadv_handle_announce(struct batadv_priv *bat_priv, u8 *an_addr, } } - batadv_backbone_gw_free_ref(backbone_gw); + batadv_backbone_gw_put(backbone_gw); return 1; } -/* check for REQUEST frame, return 1 if handled */ +/** + * batadv_handle_request - check for REQUEST frame + * @bat_priv: the bat priv with all the soft interface information + * @primary_if: the primary hard interface of this batman soft interface + * @backbone_addr: backbone address to be requested (ARP sender HW MAC) + * @ethhdr: ethernet header of a packet + * @vid: the VLAN ID of the frame + * + * Return: 1 if handled + */ static int batadv_handle_request(struct batadv_priv *bat_priv, struct batadv_hard_iface *primary_if, u8 *backbone_addr, struct ethhdr *ethhdr, @@ -740,7 +819,16 @@ static int batadv_handle_request(struct batadv_priv *bat_priv, return 1; } -/* check for UNCLAIM frame, return 1 if handled */ +/** + * batadv_handle_unclaim - check for UNCLAIM frame + * @bat_priv: the bat priv with all the soft interface information + * @primary_if: the primary hard interface of this batman soft interface + * @backbone_addr: originator address of the backbone (Ethernet source) + * @claim_addr: Client to be unclaimed (ARP sender HW MAC) + * @vid: the VLAN ID of the frame + * + * Return: 1 if handled + */ static int batadv_handle_unclaim(struct batadv_priv *bat_priv, struct batadv_hard_iface *primary_if, u8 *backbone_addr, u8 *claim_addr, @@ -765,11 +853,20 @@ static int batadv_handle_unclaim(struct batadv_priv *bat_priv, claim_addr, BATADV_PRINT_VID(vid), backbone_gw->orig); batadv_bla_del_claim(bat_priv, claim_addr, vid); - batadv_backbone_gw_free_ref(backbone_gw); + batadv_backbone_gw_put(backbone_gw); return 1; } -/* check for CLAIM frame, return 1 if handled */ +/** + * batadv_handle_claim - check for CLAIM frame + * @bat_priv: the bat priv with all the soft interface information + * @primary_if: the primary hard interface of this batman soft interface + * @backbone_addr: originator address of the backbone (Ethernet Source) + * @claim_addr: client mac address to be claimed (ARP sender HW MAC) + * @vid: the VLAN ID of the frame + * + * Return: 1 if handled + */ static int batadv_handle_claim(struct batadv_priv *bat_priv, struct batadv_hard_iface *primary_if, u8 *backbone_addr, u8 *claim_addr, @@ -793,12 +890,12 @@ static int batadv_handle_claim(struct batadv_priv *bat_priv, /* TODO: we could call something like tt_local_del() here. */ - batadv_backbone_gw_free_ref(backbone_gw); + batadv_backbone_gw_put(backbone_gw); return 1; } /** - * batadv_check_claim_group + * batadv_check_claim_group - check for claim group membership * @bat_priv: the bat priv with all the soft interface information * @primary_if: the primary interface of this batman interface * @hw_src: the Hardware source in the ARP Header @@ -809,7 +906,7 @@ static int batadv_handle_claim(struct batadv_priv *bat_priv, * This function also applies the group ID of the sender * if it is in the same mesh. * - * returns: + * Return: * 2 - if it is a claim packet and on the same group * 1 - if is a claim packet from another group * 0 - if it is not a claim packet @@ -867,20 +964,18 @@ static int batadv_check_claim_group(struct batadv_priv *bat_priv, bla_dst_own->group = bla_dst->group; } - batadv_orig_node_free_ref(orig_node); + batadv_orig_node_put(orig_node); return 2; } /** - * batadv_bla_process_claim + * batadv_bla_process_claim - Check if this is a claim frame, and process it * @bat_priv: the bat priv with all the soft interface information * @primary_if: the primary hard interface of this batman soft interface * @skb: the frame to be checked * - * Check if this is a claim frame, and process it accordingly. - * - * returns 1 if it was a claim frame, otherwise return 0 to + * Return: 1 if it was a claim frame, otherwise return 0 to * tell the callee that it can use the frame on its own. */ static int batadv_bla_process_claim(struct batadv_priv *bat_priv, @@ -1011,7 +1106,13 @@ static int batadv_bla_process_claim(struct batadv_priv *bat_priv, return 1; } -/* Check when we last heard from other nodes, and remove them in case of +/** + * batadv_bla_purge_backbone_gw - Remove backbone gateways after a timeout or + * immediately + * @bat_priv: the bat priv with all the soft interface information + * @now: whether the whole hash shall be wiped now + * + * Check when we last heard from other nodes, and remove them in case of * a time out, or clean all backbone gws if now is set. */ static void batadv_bla_purge_backbone_gw(struct batadv_priv *bat_priv, int now) @@ -1052,14 +1153,14 @@ purge_now: batadv_bla_del_backbone_claims(backbone_gw); hlist_del_rcu(&backbone_gw->hash_entry); - batadv_backbone_gw_free_ref(backbone_gw); + batadv_backbone_gw_put(backbone_gw); } spin_unlock_bh(list_lock); } } /** - * batadv_bla_purge_claims + * batadv_bla_purge_claims - Remove claims after a timeout or immediately * @bat_priv: the bat priv with all the soft interface information * @primary_if: the selected primary interface, may be NULL if now is set * @now: whether the whole hash shall be wiped now @@ -1108,12 +1209,11 @@ purge_now: } /** - * batadv_bla_update_orig_address + * batadv_bla_update_orig_address - Update the backbone gateways when the own + * originator address changes * @bat_priv: the bat priv with all the soft interface information * @primary_if: the new selected primary_if * @oldif: the old primary interface, may be NULL - * - * Update the backbone gateways when the own orig address changes. */ void batadv_bla_update_orig_address(struct batadv_priv *bat_priv, struct batadv_hard_iface *primary_if, @@ -1181,10 +1281,14 @@ void batadv_bla_status_update(struct net_device *net_dev) * so just call that one. */ batadv_bla_update_orig_address(bat_priv, primary_if, primary_if); - batadv_hardif_free_ref(primary_if); + batadv_hardif_put(primary_if); } -/* periodic work to do: +/** + * batadv_bla_periodic_work - performs periodic bla work + * @work: kernel work struct + * + * periodic work to do: * * purge structures when they are too old * * send announcements */ @@ -1251,7 +1355,7 @@ static void batadv_bla_periodic_work(struct work_struct *work) } out: if (primary_if) - batadv_hardif_free_ref(primary_if); + batadv_hardif_put(primary_if); queue_delayed_work(batadv_event_workqueue, &bat_priv->bla.work, msecs_to_jiffies(BATADV_BLA_PERIOD_LENGTH)); @@ -1265,7 +1369,12 @@ out: static struct lock_class_key batadv_claim_hash_lock_class_key; static struct lock_class_key batadv_backbone_hash_lock_class_key; -/* initialize all bla structures */ +/** + * batadv_bla_init - initialize all bla structures + * @bat_priv: the bat priv with all the soft interface information + * + * Return: 0 on success, < 0 on error. + */ int batadv_bla_init(struct batadv_priv *bat_priv) { int i; @@ -1285,7 +1394,7 @@ int batadv_bla_init(struct batadv_priv *bat_priv) if (primary_if) { crc = crc16(0, primary_if->net_dev->dev_addr, ETH_ALEN); bat_priv->bla.claim_dest.group = htons(crc); - batadv_hardif_free_ref(primary_if); + batadv_hardif_put(primary_if); } else { bat_priv->bla.claim_dest.group = 0; /* will be set later */ } @@ -1320,7 +1429,7 @@ int batadv_bla_init(struct batadv_priv *bat_priv) } /** - * batadv_bla_check_bcast_duplist + * batadv_bla_check_bcast_duplist - Check if a frame is in the broadcast dup. * @bat_priv: the bat priv with all the soft interface information * @skb: contains the bcast_packet to be checked * @@ -1332,6 +1441,8 @@ int batadv_bla_init(struct batadv_priv *bat_priv) * with a good chance that it is the same packet. If it is furthermore * sent by another host, drop it. We allow equal packets from * the same host however as this might be intended. + * + * Return: 1 if a packet is in the duplicate list, 0 otherwise. */ int batadv_bla_check_bcast_duplist(struct batadv_priv *bat_priv, struct sk_buff *skb) @@ -1390,14 +1501,13 @@ out: } /** - * batadv_bla_is_backbone_gw_orig + * batadv_bla_is_backbone_gw_orig - Check if the originator is a gateway for + * the VLAN identified by vid. * @bat_priv: the bat priv with all the soft interface information * @orig: originator mac address * @vid: VLAN identifier * - * Check if the originator is a gateway for the VLAN identified by vid. - * - * Returns true if orig is a backbone for this vid, false otherwise. + * Return: true if orig is a backbone for this vid, false otherwise. */ bool batadv_bla_is_backbone_gw_orig(struct batadv_priv *bat_priv, u8 *orig, unsigned short vid) @@ -1431,14 +1541,13 @@ bool batadv_bla_is_backbone_gw_orig(struct batadv_priv *bat_priv, u8 *orig, } /** - * batadv_bla_is_backbone_gw + * batadv_bla_is_backbone_gw - check if originator is a backbone gw for a VLAN. * @skb: the frame to be checked * @orig_node: the orig_node of the frame * @hdr_size: maximum length of the frame * - * bla_is_backbone_gw inspects the skb for the VLAN ID and returns 1 - * if the orig_node is also a gateway on the soft interface, otherwise it - * returns 0. + * Return: 1 if the orig_node is also a gateway on the soft interface, otherwise + * it returns 0. */ int batadv_bla_is_backbone_gw(struct sk_buff *skb, struct batadv_orig_node *orig_node, int hdr_size) @@ -1461,11 +1570,16 @@ int batadv_bla_is_backbone_gw(struct sk_buff *skb, if (!backbone_gw) return 0; - batadv_backbone_gw_free_ref(backbone_gw); + batadv_backbone_gw_put(backbone_gw); return 1; } -/* free all bla structures (for softinterface free or module unload) */ +/** + * batadv_bla_init - free all bla structures + * @bat_priv: the bat priv with all the soft interface information + * + * for softinterface free or module unload + */ void batadv_bla_free(struct batadv_priv *bat_priv) { struct batadv_hard_iface *primary_if; @@ -1484,22 +1598,23 @@ void batadv_bla_free(struct batadv_priv *bat_priv) bat_priv->bla.backbone_hash = NULL; } if (primary_if) - batadv_hardif_free_ref(primary_if); + batadv_hardif_put(primary_if); } /** - * batadv_bla_rx + * batadv_bla_rx - check packets coming from the mesh. * @bat_priv: the bat priv with all the soft interface information * @skb: the frame to be checked * @vid: the VLAN ID of the frame * @is_bcast: the packet came in a broadcast packet type. * - * bla_rx avoidance checks if: + * batadv_bla_rx avoidance checks if: * * we have to race for a claim * * if the frame is allowed on the LAN * - * in these cases, the skb is further handled by this function and - * returns 1, otherwise it returns 0 and the caller shall further + * in these cases, the skb is further handled by this function + * + * Return: 1 if handled, otherwise it returns 0 and the caller shall further * process the skb. */ int batadv_bla_rx(struct batadv_priv *bat_priv, struct sk_buff *skb, @@ -1576,27 +1691,28 @@ handled: out: if (primary_if) - batadv_hardif_free_ref(primary_if); + batadv_hardif_put(primary_if); if (claim) - batadv_claim_free_ref(claim); + batadv_claim_put(claim); return ret; } /** - * batadv_bla_tx + * batadv_bla_tx - check packets going into the mesh * @bat_priv: the bat priv with all the soft interface information * @skb: the frame to be checked * @vid: the VLAN ID of the frame * - * bla_tx checks if: + * batadv_bla_tx checks if: * * a claim was received which has to be processed * * the frame is allowed on the mesh * - * in these cases, the skb is further handled by this function and - * returns 1, otherwise it returns 0 and the caller shall further - * process the skb. + * in these cases, the skb is further handled by this function. * * This call might reallocate skb data. + * + * Return: 1 if handled, otherwise it returns 0 and the caller shall further + * process the skb. */ int batadv_bla_tx(struct batadv_priv *bat_priv, struct sk_buff *skb, unsigned short vid) @@ -1664,12 +1780,19 @@ handled: ret = 1; out: if (primary_if) - batadv_hardif_free_ref(primary_if); + batadv_hardif_put(primary_if); if (claim) - batadv_claim_free_ref(claim); + batadv_claim_put(claim); return ret; } +/** + * batadv_bla_claim_table_seq_print_text - print the claim table in a seq file + * @seq: seq file to print on + * @offset: not used + * + * Return: always 0 + */ int batadv_bla_claim_table_seq_print_text(struct seq_file *seq, void *offset) { struct net_device *net_dev = (struct net_device *)seq->private; @@ -1715,10 +1838,18 @@ int batadv_bla_claim_table_seq_print_text(struct seq_file *seq, void *offset) } out: if (primary_if) - batadv_hardif_free_ref(primary_if); + batadv_hardif_put(primary_if); return 0; } +/** + * batadv_bla_backbone_table_seq_print_text - print the backbone table in a seq + * file + * @seq: seq file to print on + * @offset: not used + * + * Return: always 0 + */ int batadv_bla_backbone_table_seq_print_text(struct seq_file *seq, void *offset) { struct net_device *net_dev = (struct net_device *)seq->private; @@ -1772,6 +1903,6 @@ int batadv_bla_backbone_table_seq_print_text(struct seq_file *seq, void *offset) } out: if (primary_if) - batadv_hardif_free_ref(primary_if); + batadv_hardif_put(primary_if); return 0; } diff --git a/net/batman-adv/bridge_loop_avoidance.h b/net/batman-adv/bridge_loop_avoidance.h index 7ea199b8b..579f0fa6f 100644 --- a/net/batman-adv/bridge_loop_avoidance.h +++ b/net/batman-adv/bridge_loop_avoidance.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2011-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2011-2016 B.A.T.M.A.N. contributors: * * Simon Wunderlich * diff --git a/net/batman-adv/debugfs.c b/net/batman-adv/debugfs.c index 037ad0a5f..48253cf83 100644 --- a/net/batman-adv/debugfs.c +++ b/net/batman-adv/debugfs.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2010-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2010-2016 B.A.T.M.A.N. contributors: * * Marek Lindner * @@ -281,6 +281,8 @@ static int batadv_originators_open(struct inode *inode, struct file *file) * originator table of an hard interface * @inode: inode pointer to debugfs file * @file: pointer to the seq_file + * + * Return: 0 on success or negative error number in case of failure */ static int batadv_originators_hardif_open(struct inode *inode, struct file *file) @@ -329,6 +331,8 @@ static int batadv_bla_backbone_table_open(struct inode *inode, * batadv_dat_cache_open - Prepare file handler for reads from dat_chache * @inode: inode which was opened * @file: file handle to be initialized + * + * Return: 0 on success or negative error number in case of failure */ static int batadv_dat_cache_open(struct inode *inode, struct file *file) { @@ -483,6 +487,8 @@ void batadv_debugfs_destroy(void) * batadv_debugfs_add_hardif - creates the base directory for a hard interface * in debugfs. * @hard_iface: hard interface which should be added. + * + * Return: 0 on success or negative error number in case of failure */ int batadv_debugfs_add_hardif(struct batadv_hard_iface *hard_iface) { diff --git a/net/batman-adv/debugfs.h b/net/batman-adv/debugfs.h index 80ab8d6f0..1ab4e2e63 100644 --- a/net/batman-adv/debugfs.h +++ b/net/batman-adv/debugfs.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2010-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2010-2016 B.A.T.M.A.N. contributors: * * Marek Lindner * diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c index 5f19133c5..3e6b2624f 100644 --- a/net/batman-adv/distributed-arp-table.c +++ b/net/batman-adv/distributed-arp-table.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2011-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2011-2016 B.A.T.M.A.N. contributors: * * Antonio Quartulli * @@ -30,6 +30,7 @@ #include <linux/in.h> #include <linux/jiffies.h> #include <linux/kernel.h> +#include <linux/kref.h> #include <linux/list.h> #include <linux/rculist.h> #include <linux/rcupdate.h> @@ -62,21 +63,34 @@ static void batadv_dat_start_timer(struct batadv_priv *bat_priv) } /** - * batadv_dat_entry_free_ref - decrement the dat_entry refcounter and possibly - * free it - * @dat_entry: the entry to free + * batadv_dat_entry_release - release dat_entry from lists and queue for free + * after rcu grace period + * @ref: kref pointer of the dat_entry */ -static void batadv_dat_entry_free_ref(struct batadv_dat_entry *dat_entry) +static void batadv_dat_entry_release(struct kref *ref) { - if (atomic_dec_and_test(&dat_entry->refcount)) - kfree_rcu(dat_entry, rcu); + struct batadv_dat_entry *dat_entry; + + dat_entry = container_of(ref, struct batadv_dat_entry, refcount); + + kfree_rcu(dat_entry, rcu); +} + +/** + * batadv_dat_entry_put - decrement the dat_entry refcounter and possibly + * release it + * @dat_entry: dat_entry to be free'd + */ +static void batadv_dat_entry_put(struct batadv_dat_entry *dat_entry) +{ + kref_put(&dat_entry->refcount, batadv_dat_entry_release); } /** * batadv_dat_to_purge - check whether a dat_entry has to be purged or not * @dat_entry: the entry to check * - * Returns true if the entry has to be purged now, false otherwise. + * Return: true if the entry has to be purged now, false otherwise. */ static bool batadv_dat_to_purge(struct batadv_dat_entry *dat_entry) { @@ -121,7 +135,7 @@ static void __batadv_dat_purge(struct batadv_priv *bat_priv, continue; hlist_del_rcu(&dat_entry->hash_entry); - batadv_dat_entry_free_ref(dat_entry); + batadv_dat_entry_put(dat_entry); } spin_unlock_bh(list_lock); } @@ -151,7 +165,7 @@ static void batadv_dat_purge(struct work_struct *work) * @node: node in the local table * @data2: second object to compare the node to * - * Returns 1 if the two entries are the same, 0 otherwise. + * Return: 1 if the two entries are the same, 0 otherwise. */ static int batadv_compare_dat(const struct hlist_node *node, const void *data2) { @@ -166,7 +180,7 @@ static int batadv_compare_dat(const struct hlist_node *node, const void *data2) * @skb: ARP packet * @hdr_size: size of the possible header before the ARP packet * - * Returns the value of the hw_src field in the ARP packet. + * Return: the value of the hw_src field in the ARP packet. */ static u8 *batadv_arp_hw_src(struct sk_buff *skb, int hdr_size) { @@ -183,7 +197,7 @@ static u8 *batadv_arp_hw_src(struct sk_buff *skb, int hdr_size) * @skb: ARP packet * @hdr_size: size of the possible header before the ARP packet * - * Returns the value of the ip_src field in the ARP packet. + * Return: the value of the ip_src field in the ARP packet. */ static __be32 batadv_arp_ip_src(struct sk_buff *skb, int hdr_size) { @@ -195,7 +209,7 @@ static __be32 batadv_arp_ip_src(struct sk_buff *skb, int hdr_size) * @skb: ARP packet * @hdr_size: size of the possible header before the ARP packet * - * Returns the value of the hw_dst field in the ARP packet. + * Return: the value of the hw_dst field in the ARP packet. */ static u8 *batadv_arp_hw_dst(struct sk_buff *skb, int hdr_size) { @@ -207,7 +221,7 @@ static u8 *batadv_arp_hw_dst(struct sk_buff *skb, int hdr_size) * @skb: ARP packet * @hdr_size: size of the possible header before the ARP packet * - * Returns the value of the ip_dst field in the ARP packet. + * Return: the value of the ip_dst field in the ARP packet. */ static __be32 batadv_arp_ip_dst(struct sk_buff *skb, int hdr_size) { @@ -219,7 +233,7 @@ static __be32 batadv_arp_ip_dst(struct sk_buff *skb, int hdr_size) * @data: data to hash * @size: size of the hash table * - * Returns the selected index in the hash table for the given data. + * Return: the selected index in the hash table for the given data. */ static u32 batadv_hash_dat(const void *data, u32 size) { @@ -256,7 +270,7 @@ static u32 batadv_hash_dat(const void *data, u32 size) * @ip: search key * @vid: VLAN identifier * - * Returns the dat_entry if found, NULL otherwise. + * Return: the dat_entry if found, NULL otherwise. */ static struct batadv_dat_entry * batadv_dat_entry_hash_find(struct batadv_priv *bat_priv, __be32 ip, @@ -281,7 +295,7 @@ batadv_dat_entry_hash_find(struct batadv_priv *bat_priv, __be32 ip, if (dat_entry->ip != ip) continue; - if (!atomic_inc_not_zero(&dat_entry->refcount)) + if (!kref_get_unless_zero(&dat_entry->refcount)) continue; dat_entry_tmp = dat_entry; @@ -326,7 +340,8 @@ static void batadv_dat_entry_add(struct batadv_priv *bat_priv, __be32 ip, dat_entry->vid = vid; ether_addr_copy(dat_entry->mac_addr, mac_addr); dat_entry->last_update = jiffies; - atomic_set(&dat_entry->refcount, 2); + kref_init(&dat_entry->refcount); + kref_get(&dat_entry->refcount); hash_added = batadv_hash_add(bat_priv->dat.hash, batadv_compare_dat, batadv_hash_dat, dat_entry, @@ -334,7 +349,7 @@ static void batadv_dat_entry_add(struct batadv_priv *bat_priv, __be32 ip, if (unlikely(hash_added != 0)) { /* remove the reference for the hash */ - batadv_dat_entry_free_ref(dat_entry); + batadv_dat_entry_put(dat_entry); goto out; } @@ -343,7 +358,7 @@ static void batadv_dat_entry_add(struct batadv_priv *bat_priv, __be32 ip, out: if (dat_entry) - batadv_dat_entry_free_ref(dat_entry); + batadv_dat_entry_put(dat_entry); } #ifdef CONFIG_BATMAN_ADV_DEBUG @@ -440,7 +455,7 @@ static void batadv_dbg_arp(struct batadv_priv *bat_priv, struct sk_buff *skb, * @candidate: orig_node under evaluation * @max_orig_node: last selected candidate * - * Returns true if the node has been elected as next candidate or false + * Return: true if the node has been elected as next candidate or false * otherwise. */ static bool batadv_is_orig_node_eligible(struct batadv_dat_candidate *res, @@ -527,12 +542,12 @@ static void batadv_choose_next_candidate(struct batadv_priv *bat_priv, max_orig_node)) continue; - if (!atomic_inc_not_zero(&orig_node->refcount)) + if (!kref_get_unless_zero(&orig_node->refcount)) continue; max = tmp_max; if (max_orig_node) - batadv_orig_node_free_ref(max_orig_node); + batadv_orig_node_put(max_orig_node); max_orig_node = orig_node; } rcu_read_unlock(); @@ -559,7 +574,7 @@ static void batadv_choose_next_candidate(struct batadv_priv *bat_priv, * closest values (from the LEFT, with wrap around if needed) then the hash * value of the key. ip_dst is the key. * - * Returns the candidate array of size BATADV_DAT_CANDIDATE_NUM. + * Return: the candidate array of size BATADV_DAT_CANDIDATE_NUM. */ static struct batadv_dat_candidate * batadv_dat_select_candidates(struct batadv_priv *bat_priv, __be32 ip_dst, @@ -605,7 +620,7 @@ batadv_dat_select_candidates(struct batadv_priv *bat_priv, __be32 ip_dst, * This function copies the skb with pskb_copy() and is sent as unicast packet * to each of the selected candidates. * - * Returns true if the packet is sent to at least one candidate, false + * Return: true if the packet is sent to at least one candidate, false * otherwise. */ static bool batadv_dat_send_data(struct batadv_priv *bat_priv, @@ -642,9 +657,7 @@ static bool batadv_dat_send_data(struct batadv_priv *bat_priv, goto free_neigh; } - send_status = batadv_send_skb_packet(tmp_skb, - neigh_node->if_incoming, - neigh_node->addr); + send_status = batadv_send_unicast_skb(tmp_skb, neigh_node); if (send_status == NET_XMIT_SUCCESS) { /* count the sent packet */ switch (packet_subtype) { @@ -662,9 +675,9 @@ static bool batadv_dat_send_data(struct batadv_priv *bat_priv, ret = true; } free_neigh: - batadv_neigh_node_free_ref(neigh_node); + batadv_neigh_node_put(neigh_node); free_orig: - batadv_orig_node_free_ref(cand[i].orig_node); + batadv_orig_node_put(cand[i].orig_node); } out: @@ -744,6 +757,8 @@ static void batadv_dat_hash_free(struct batadv_priv *bat_priv) /** * batadv_dat_init - initialise the DAT internals * @bat_priv: the bat priv with all the soft interface information + * + * Return: 0 in case of success, a negative error code otherwise */ int batadv_dat_init(struct batadv_priv *bat_priv) { @@ -782,6 +797,8 @@ void batadv_dat_free(struct batadv_priv *bat_priv) * batadv_dat_cache_seq_print_text - print the local DAT hash table * @seq: seq file to print on * @offset: not used + * + * Return: always 0 */ int batadv_dat_cache_seq_print_text(struct seq_file *seq, void *offset) { @@ -824,7 +841,7 @@ int batadv_dat_cache_seq_print_text(struct seq_file *seq, void *offset) out: if (primary_if) - batadv_hardif_free_ref(primary_if); + batadv_hardif_put(primary_if); return 0; } @@ -834,7 +851,7 @@ out: * @skb: packet to analyse * @hdr_size: size of the possible header before the ARP packet in the skb * - * Returns the ARP type if the skb contains a valid ARP packet, 0 otherwise. + * Return: the ARP type if the skb contains a valid ARP packet, 0 otherwise. */ static u16 batadv_arp_get_type(struct batadv_priv *bat_priv, struct sk_buff *skb, int hdr_size) @@ -907,8 +924,9 @@ out: * @skb: the buffer containing the packet to extract the VID from * @hdr_size: the size of the batman-adv header encapsulating the packet * - * If the packet embedded in the skb is vlan tagged this function returns the - * VID with the BATADV_VLAN_HAS_TAG flag. Otherwise BATADV_NO_FLAGS is returned. + * Return: If the packet embedded in the skb is vlan tagged this function + * returns the VID with the BATADV_VLAN_HAS_TAG flag. Otherwise BATADV_NO_FLAGS + * is returned. */ static unsigned short batadv_dat_get_vid(struct sk_buff *skb, int *hdr_size) { @@ -933,7 +951,7 @@ static unsigned short batadv_dat_get_vid(struct sk_buff *skb, int *hdr_size) * @bat_priv: the bat priv with all the soft interface information * @skb: packet to check * - * Returns true if the message has been sent to the dht candidates, false + * Return: true if the message has been sent to the dht candidates, false * otherwise. In case of a positive return value the message has to be enqueued * to permit the fallback. */ @@ -1012,7 +1030,7 @@ bool batadv_dat_snoop_outgoing_arp_request(struct batadv_priv *bat_priv, } out: if (dat_entry) - batadv_dat_entry_free_ref(dat_entry); + batadv_dat_entry_put(dat_entry); return ret; } @@ -1023,7 +1041,7 @@ out: * @skb: packet to check * @hdr_size: size of the encapsulation header * - * Returns true if the request has been answered, false otherwise. + * Return: true if the request has been answered, false otherwise. */ bool batadv_dat_snoop_incoming_arp_request(struct batadv_priv *bat_priv, struct sk_buff *skb, int hdr_size) @@ -1092,7 +1110,7 @@ bool batadv_dat_snoop_incoming_arp_request(struct batadv_priv *bat_priv, } out: if (dat_entry) - batadv_dat_entry_free_ref(dat_entry); + batadv_dat_entry_put(dat_entry); if (ret) kfree_skb(skb); return ret; @@ -1146,7 +1164,7 @@ void batadv_dat_snoop_outgoing_arp_reply(struct batadv_priv *bat_priv, * @skb: packet to check * @hdr_size: size of the encapsulation header * - * Returns true if the packet was snooped and consumed by DAT. False if the + * Return: true if the packet was snooped and consumed by DAT. False if the * packet has to be delivered to the interface */ bool batadv_dat_snoop_incoming_arp_reply(struct batadv_priv *bat_priv, @@ -1203,7 +1221,7 @@ out: * @bat_priv: the bat priv with all the soft interface information * @forw_packet: the broadcast packet * - * Returns true if the node can drop the packet, false otherwise. + * Return: true if the node can drop the packet, false otherwise. */ bool batadv_dat_drop_broadcast_packet(struct batadv_priv *bat_priv, struct batadv_forw_packet *forw_packet) @@ -1245,6 +1263,6 @@ bool batadv_dat_drop_broadcast_packet(struct batadv_priv *bat_priv, out: if (dat_entry) - batadv_dat_entry_free_ref(dat_entry); + batadv_dat_entry_put(dat_entry); return ret; } diff --git a/net/batman-adv/distributed-arp-table.h b/net/batman-adv/distributed-arp-table.h index 26d4a525a..813ecea96 100644 --- a/net/batman-adv/distributed-arp-table.h +++ b/net/batman-adv/distributed-arp-table.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2011-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2011-2016 B.A.T.M.A.N. contributors: * * Antonio Quartulli * diff --git a/net/batman-adv/fragmentation.c b/net/batman-adv/fragmentation.c index 20d9282f8..e6956d074 100644 --- a/net/batman-adv/fragmentation.c +++ b/net/batman-adv/fragmentation.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2013-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2013-2016 B.A.T.M.A.N. contributors: * * Martin Hundebøll <martin@hundeboll.net> * @@ -85,7 +85,7 @@ void batadv_frag_purge_orig(struct batadv_orig_node *orig_node, /** * batadv_frag_size_limit - maximum possible size of packet to be fragmented * - * Returns the maximum size of payload that can be fragmented. + * Return: the maximum size of payload that can be fragmented. */ static int batadv_frag_size_limit(void) { @@ -107,7 +107,7 @@ static int batadv_frag_size_limit(void) * * Caller must hold chain->lock. * - * Returns true if chain is empty and caller can just insert the new fragment + * Return: true if chain is empty and caller can just insert the new fragment * without searching for the right position. */ static bool batadv_frag_init_chain(struct batadv_frag_table_entry *chain, @@ -136,7 +136,7 @@ static bool batadv_frag_init_chain(struct batadv_frag_table_entry *chain, * Insert a new fragment into the reverse ordered chain in the right table * entry. The hash table entry is cleared if "old" fragments exist in it. * - * Returns true if skb is buffered, false on error. If the chain has all the + * Return: true if skb is buffered, false on error. If the chain has all the * fragments needed to merge the packet, the chain is moved to the passed head * to avoid locking the chain in the table. */ @@ -242,12 +242,11 @@ err: /** * batadv_frag_merge_packets - merge a chain of fragments * @chain: head of chain with fragments - * @skb: packet with total size of skb after merging * * Expand the first skb in the chain and copy the content of the remaining * skb's into the expanded one. After doing so, clear the chain. * - * Returns the merged skb or NULL on error. + * Return: the merged skb or NULL on error. */ static struct sk_buff * batadv_frag_merge_packets(struct hlist_head *chain) @@ -307,6 +306,9 @@ free: * There are three possible outcomes: 1) Packet is merged: Return true and * set *skb to merged packet; 2) Packet is buffered: Return true and set *skb * to NULL; 3) Error: Return false and leave skb as is. + * + * Return: true when packet is merged or buffered, false when skb is not not + * used. */ bool batadv_frag_skb_buffer(struct sk_buff **skb, struct batadv_orig_node *orig_node_src) @@ -344,7 +346,7 @@ out_err: * will exceed the MTU towards the next-hop. If so, the fragment is forwarded * without merging it. * - * Returns true if the fragment is consumed/forwarded, false otherwise. + * Return: true if the fragment is consumed/forwarded, false otherwise. */ bool batadv_frag_skb_fwd(struct sk_buff *skb, struct batadv_hard_iface *recv_if, @@ -376,16 +378,15 @@ bool batadv_frag_skb_fwd(struct sk_buff *skb, skb->len + ETH_HLEN); packet->ttl--; - batadv_send_skb_packet(skb, neigh_node->if_incoming, - neigh_node->addr); + batadv_send_unicast_skb(skb, neigh_node); ret = true; } out: if (orig_node_dst) - batadv_orig_node_free_ref(orig_node_dst); + batadv_orig_node_put(orig_node_dst); if (neigh_node) - batadv_neigh_node_free_ref(neigh_node); + batadv_neigh_node_put(neigh_node); return ret; } @@ -399,7 +400,7 @@ out: * passed mtu and the old one with the rest. The new skb contains data from the * tail of the old skb. * - * Returns the new fragment, NULL on error. + * Return: the new fragment, NULL on error. */ static struct sk_buff *batadv_frag_create(struct sk_buff *skb, struct batadv_frag_packet *frag_head, @@ -433,7 +434,7 @@ err: * @orig_node: final destination of the created fragments * @neigh_node: next-hop of the created fragments * - * Returns true on success, false otherwise. + * Return: true on success, false otherwise. */ bool batadv_frag_send_packet(struct sk_buff *skb, struct batadv_orig_node *orig_node, @@ -484,8 +485,7 @@ bool batadv_frag_send_packet(struct sk_buff *skb, batadv_inc_counter(bat_priv, BATADV_CNT_FRAG_TX); batadv_add_counter(bat_priv, BATADV_CNT_FRAG_TX_BYTES, skb_fragment->len + ETH_HLEN); - batadv_send_skb_packet(skb_fragment, neigh_node->if_incoming, - neigh_node->addr); + batadv_send_unicast_skb(skb_fragment, neigh_node); frag_header.no++; /* The initial check in this function should cover this case */ @@ -504,13 +504,13 @@ bool batadv_frag_send_packet(struct sk_buff *skb, batadv_inc_counter(bat_priv, BATADV_CNT_FRAG_TX); batadv_add_counter(bat_priv, BATADV_CNT_FRAG_TX_BYTES, skb->len + ETH_HLEN); - batadv_send_skb_packet(skb, neigh_node->if_incoming, neigh_node->addr); + batadv_send_unicast_skb(skb, neigh_node); ret = true; out_err: if (primary_if) - batadv_hardif_free_ref(primary_if); + batadv_hardif_put(primary_if); return ret; } diff --git a/net/batman-adv/fragmentation.h b/net/batman-adv/fragmentation.h index 8b9877e70..9ff77c7ef 100644 --- a/net/batman-adv/fragmentation.h +++ b/net/batman-adv/fragmentation.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2013-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2013-2016 B.A.T.M.A.N. contributors: * * Martin Hundebøll <martin@hundeboll.net> * @@ -42,7 +42,7 @@ bool batadv_frag_send_packet(struct sk_buff *skb, * batadv_frag_check_entry - check if a list of fragments has timed out * @frags_entry: table entry to check * - * Returns true if the frags entry has timed out, false otherwise. + * Return: true if the frags entry has timed out, false otherwise. */ static inline bool batadv_frag_check_entry(struct batadv_frag_table_entry *frags_entry) diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c index ccf70bed0..c59aff5cc 100644 --- a/net/batman-adv/gateway_client.c +++ b/net/batman-adv/gateway_client.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2009-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2009-2016 B.A.T.M.A.N. contributors: * * Marek Lindner * @@ -28,6 +28,7 @@ #include <linux/ip.h> #include <linux/ipv6.h> #include <linux/kernel.h> +#include <linux/kref.h> #include <linux/list.h> #include <linux/netdevice.h> #include <linux/rculist.h> @@ -59,12 +60,28 @@ */ #define BATADV_DHCP_CHADDR_OFFSET 28 -static void batadv_gw_node_free_ref(struct batadv_gw_node *gw_node) +/** + * batadv_gw_node_release - release gw_node from lists and queue for free after + * rcu grace period + * @ref: kref pointer of the gw_node + */ +static void batadv_gw_node_release(struct kref *ref) { - if (atomic_dec_and_test(&gw_node->refcount)) { - batadv_orig_node_free_ref(gw_node->orig_node); - kfree_rcu(gw_node, rcu); - } + struct batadv_gw_node *gw_node; + + gw_node = container_of(ref, struct batadv_gw_node, refcount); + + batadv_orig_node_put(gw_node->orig_node); + kfree_rcu(gw_node, rcu); +} + +/** + * batadv_gw_node_put - decrement the gw_node refcounter and possibly release it + * @gw_node: gateway node to free + */ +static void batadv_gw_node_put(struct batadv_gw_node *gw_node) +{ + kref_put(&gw_node->refcount, batadv_gw_node_release); } static struct batadv_gw_node * @@ -77,7 +94,7 @@ batadv_gw_get_selected_gw_node(struct batadv_priv *bat_priv) if (!gw_node) goto out; - if (!atomic_inc_not_zero(&gw_node->refcount)) + if (!kref_get_unless_zero(&gw_node->refcount)) gw_node = NULL; out: @@ -100,14 +117,14 @@ batadv_gw_get_selected_orig(struct batadv_priv *bat_priv) if (!orig_node) goto unlock; - if (!atomic_inc_not_zero(&orig_node->refcount)) + if (!kref_get_unless_zero(&orig_node->refcount)) orig_node = NULL; unlock: rcu_read_unlock(); out: if (gw_node) - batadv_gw_node_free_ref(gw_node); + batadv_gw_node_put(gw_node); return orig_node; } @@ -118,14 +135,14 @@ static void batadv_gw_select(struct batadv_priv *bat_priv, spin_lock_bh(&bat_priv->gw.list_lock); - if (new_gw_node && !atomic_inc_not_zero(&new_gw_node->refcount)) + if (new_gw_node && !kref_get_unless_zero(&new_gw_node->refcount)) new_gw_node = NULL; curr_gw_node = rcu_dereference_protected(bat_priv->gw.curr_gw, 1); rcu_assign_pointer(bat_priv->gw.curr_gw, new_gw_node); if (curr_gw_node) - batadv_gw_node_free_ref(curr_gw_node); + batadv_gw_node_put(curr_gw_node); spin_unlock_bh(&bat_priv->gw.list_lock); } @@ -170,7 +187,7 @@ batadv_gw_get_best_gw_node(struct batadv_priv *bat_priv) if (!router_ifinfo) goto next; - if (!atomic_inc_not_zero(&gw_node->refcount)) + if (!kref_get_unless_zero(&gw_node->refcount)) goto next; tq_avg = router_ifinfo->bat_iv.tq_avg; @@ -186,9 +203,9 @@ batadv_gw_get_best_gw_node(struct batadv_priv *bat_priv) ((tmp_gw_factor == max_gw_factor) && (tq_avg > max_tq))) { if (curr_gw) - batadv_gw_node_free_ref(curr_gw); + batadv_gw_node_put(curr_gw); curr_gw = gw_node; - atomic_inc(&curr_gw->refcount); + kref_get(&curr_gw->refcount); } break; @@ -201,9 +218,9 @@ batadv_gw_get_best_gw_node(struct batadv_priv *bat_priv) */ if (tq_avg > max_tq) { if (curr_gw) - batadv_gw_node_free_ref(curr_gw); + batadv_gw_node_put(curr_gw); curr_gw = gw_node; - atomic_inc(&curr_gw->refcount); + kref_get(&curr_gw->refcount); } break; } @@ -214,12 +231,12 @@ batadv_gw_get_best_gw_node(struct batadv_priv *bat_priv) if (tmp_gw_factor > max_gw_factor) max_gw_factor = tmp_gw_factor; - batadv_gw_node_free_ref(gw_node); + batadv_gw_node_put(gw_node); next: - batadv_neigh_node_free_ref(router); + batadv_neigh_node_put(router); if (router_ifinfo) - batadv_neigh_ifinfo_free_ref(router_ifinfo); + batadv_neigh_ifinfo_put(router_ifinfo); } rcu_read_unlock(); @@ -255,7 +272,7 @@ void batadv_gw_check_client_stop(struct batadv_priv *bat_priv) */ batadv_throw_uevent(bat_priv, BATADV_UEV_GW, BATADV_UEV_DEL, NULL); - batadv_gw_node_free_ref(curr_gw); + batadv_gw_node_put(curr_gw); } void batadv_gw_election(struct batadv_priv *bat_priv) @@ -330,13 +347,13 @@ void batadv_gw_election(struct batadv_priv *bat_priv) out: if (curr_gw) - batadv_gw_node_free_ref(curr_gw); + batadv_gw_node_put(curr_gw); if (next_gw) - batadv_gw_node_free_ref(next_gw); + batadv_gw_node_put(next_gw); if (router) - batadv_neigh_node_free_ref(router); + batadv_neigh_node_put(router); if (router_ifinfo) - batadv_neigh_ifinfo_free_ref(router_ifinfo); + batadv_neigh_ifinfo_put(router_ifinfo); } void batadv_gw_check_election(struct batadv_priv *bat_priv, @@ -397,15 +414,15 @@ reselect: batadv_gw_reselect(bat_priv); out: if (curr_gw_orig) - batadv_orig_node_free_ref(curr_gw_orig); + batadv_orig_node_put(curr_gw_orig); if (router_gw) - batadv_neigh_node_free_ref(router_gw); + batadv_neigh_node_put(router_gw); if (router_orig) - batadv_neigh_node_free_ref(router_orig); + batadv_neigh_node_put(router_orig); if (router_gw_tq) - batadv_neigh_ifinfo_free_ref(router_gw_tq); + batadv_neigh_ifinfo_put(router_gw_tq); if (router_orig_tq) - batadv_neigh_ifinfo_free_ref(router_orig_tq); + batadv_neigh_ifinfo_put(router_orig_tq); } /** @@ -423,12 +440,12 @@ static void batadv_gw_node_add(struct batadv_priv *bat_priv, if (gateway->bandwidth_down == 0) return; - if (!atomic_inc_not_zero(&orig_node->refcount)) + if (!kref_get_unless_zero(&orig_node->refcount)) return; gw_node = kzalloc(sizeof(*gw_node), GFP_ATOMIC); if (!gw_node) { - batadv_orig_node_free_ref(orig_node); + batadv_orig_node_put(orig_node); return; } @@ -436,7 +453,7 @@ static void batadv_gw_node_add(struct batadv_priv *bat_priv, gw_node->orig_node = orig_node; gw_node->bandwidth_down = ntohl(gateway->bandwidth_down); gw_node->bandwidth_up = ntohl(gateway->bandwidth_up); - atomic_set(&gw_node->refcount, 1); + kref_init(&gw_node->refcount); spin_lock_bh(&bat_priv->gw.list_lock); hlist_add_head_rcu(&gw_node->list, &bat_priv->gw.list); @@ -456,7 +473,7 @@ static void batadv_gw_node_add(struct batadv_priv *bat_priv, * @bat_priv: the bat priv with all the soft interface information * @orig_node: originator announcing gateway capabilities * - * Returns gateway node if found or NULL otherwise. + * Return: gateway node if found or NULL otherwise. */ static struct batadv_gw_node * batadv_gw_node_get(struct batadv_priv *bat_priv, @@ -469,7 +486,7 @@ batadv_gw_node_get(struct batadv_priv *bat_priv, if (gw_node_tmp->orig_node != orig_node) continue; - if (!atomic_inc_not_zero(&gw_node_tmp->refcount)) + if (!kref_get_unless_zero(&gw_node_tmp->refcount)) continue; gw_node = gw_node_tmp; @@ -529,7 +546,7 @@ void batadv_gw_node_update(struct batadv_priv *bat_priv, spin_lock_bh(&bat_priv->gw.list_lock); if (!hlist_unhashed(&gw_node->list)) { hlist_del_init_rcu(&gw_node->list); - batadv_gw_node_free_ref(gw_node); + batadv_gw_node_put(gw_node); } spin_unlock_bh(&bat_priv->gw.list_lock); @@ -538,12 +555,12 @@ void batadv_gw_node_update(struct batadv_priv *bat_priv, batadv_gw_reselect(bat_priv); if (curr_gw) - batadv_gw_node_free_ref(curr_gw); + batadv_gw_node_put(curr_gw); } out: if (gw_node) - batadv_gw_node_free_ref(gw_node); + batadv_gw_node_put(gw_node); } void batadv_gw_node_delete(struct batadv_priv *bat_priv, @@ -566,7 +583,7 @@ void batadv_gw_node_free(struct batadv_priv *bat_priv) hlist_for_each_entry_safe(gw_node, node_tmp, &bat_priv->gw.list, list) { hlist_del_init_rcu(&gw_node->list); - batadv_gw_node_free_ref(gw_node); + batadv_gw_node_put(gw_node); } spin_unlock_bh(&bat_priv->gw.list_lock); } @@ -603,12 +620,12 @@ static int batadv_write_buffer_text(struct batadv_priv *bat_priv, ret = seq_has_overflowed(seq) ? -1 : 0; if (curr_gw) - batadv_gw_node_free_ref(curr_gw); + batadv_gw_node_put(curr_gw); out: if (router_ifinfo) - batadv_neigh_ifinfo_free_ref(router_ifinfo); + batadv_neigh_ifinfo_put(router_ifinfo); if (router) - batadv_neigh_node_free_ref(router); + batadv_neigh_node_put(router); return ret; } @@ -645,7 +662,7 @@ int batadv_gw_client_seq_print_text(struct seq_file *seq, void *offset) out: if (primary_if) - batadv_hardif_free_ref(primary_if); + batadv_hardif_put(primary_if); return 0; } @@ -656,13 +673,13 @@ out: * @chaddr: buffer where the client address will be stored. Valid * only if the function returns BATADV_DHCP_TO_CLIENT * - * Returns: + * This function may re-allocate the data buffer of the skb passed as argument. + * + * Return: * - BATADV_DHCP_NO if the packet is not a dhcp message or if there was an error * while parsing it * - BATADV_DHCP_TO_SERVER if this is a message going to the DHCP server * - BATADV_DHCP_TO_CLIENT if this is a message going to a DHCP client - * - * This function may re-allocate the data buffer of the skb passed as argument. */ enum batadv_dhcp_recipient batadv_gw_dhcp_recipient_get(struct sk_buff *skb, unsigned int *header_len, @@ -777,11 +794,11 @@ batadv_gw_dhcp_recipient_get(struct sk_buff *skb, unsigned int *header_len, * server. Due to topology changes it may be the case that the GW server * previously selected is not the best one anymore. * - * Returns true if the packet destination is unicast and it is not the best gw, - * false otherwise. - * * This call might reallocate skb data. * Must be invoked only when the DHCP packet is going TO a DHCP SERVER. + * + * Return: true if the packet destination is unicast and it is not the best gw, + * false otherwise. */ bool batadv_gw_out_of_range(struct batadv_priv *bat_priv, struct sk_buff *skb) @@ -839,7 +856,7 @@ bool batadv_gw_out_of_range(struct batadv_priv *bat_priv, goto out; curr_tq_avg = curr_ifinfo->bat_iv.tq_avg; - batadv_neigh_ifinfo_free_ref(curr_ifinfo); + batadv_neigh_ifinfo_put(curr_ifinfo); break; case BATADV_GW_MODE_OFF: @@ -857,18 +874,18 @@ bool batadv_gw_out_of_range(struct batadv_priv *bat_priv, if ((curr_tq_avg - old_ifinfo->bat_iv.tq_avg) > BATADV_GW_THRESHOLD) out_of_range = true; - batadv_neigh_ifinfo_free_ref(old_ifinfo); + batadv_neigh_ifinfo_put(old_ifinfo); out: if (orig_dst_node) - batadv_orig_node_free_ref(orig_dst_node); + batadv_orig_node_put(orig_dst_node); if (curr_gw) - batadv_gw_node_free_ref(curr_gw); + batadv_gw_node_put(curr_gw); if (gw_node) - batadv_gw_node_free_ref(gw_node); + batadv_gw_node_put(gw_node); if (neigh_old) - batadv_neigh_node_free_ref(neigh_old); + batadv_neigh_node_put(neigh_old); if (neigh_curr) - batadv_neigh_node_free_ref(neigh_curr); + batadv_neigh_node_put(neigh_curr); return out_of_range; } diff --git a/net/batman-adv/gateway_client.h b/net/batman-adv/gateway_client.h index fa9527785..582dd8c41 100644 --- a/net/batman-adv/gateway_client.h +++ b/net/batman-adv/gateway_client.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2009-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2009-2016 B.A.T.M.A.N. contributors: * * Marek Lindner * diff --git a/net/batman-adv/gateway_common.c b/net/batman-adv/gateway_common.c index b51bface8..442304788 100644 --- a/net/batman-adv/gateway_common.c +++ b/net/batman-adv/gateway_common.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2009-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2009-2016 B.A.T.M.A.N. contributors: * * Marek Lindner * @@ -38,10 +38,10 @@ * @description: text shown when throughput string cannot be parsed * @throughput: pointer holding the returned throughput information * - * Returns false on parse error and true otherwise. + * Return: false on parse error and true otherwise. */ -static bool batadv_parse_throughput(struct net_device *net_dev, char *buff, - const char *description, u32 *throughput) +bool batadv_parse_throughput(struct net_device *net_dev, char *buff, + const char *description, u32 *throughput) { enum batadv_bandwidth_units bw_unit_type = BATADV_BW_UNIT_KBIT; u64 lthroughput; diff --git a/net/batman-adv/gateway_common.h b/net/batman-adv/gateway_common.h index ab893e318..8a5e1ddf1 100644 --- a/net/batman-adv/gateway_common.h +++ b/net/batman-adv/gateway_common.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2009-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2009-2016 B.A.T.M.A.N. contributors: * * Marek Lindner * @@ -49,5 +49,7 @@ ssize_t batadv_gw_bandwidth_set(struct net_device *net_dev, char *buff, void batadv_gw_tvlv_container_update(struct batadv_priv *bat_priv); void batadv_gw_init(struct batadv_priv *bat_priv); void batadv_gw_free(struct batadv_priv *bat_priv); +bool batadv_parse_throughput(struct net_device *net_dev, char *buff, + const char *description, u32 *throughput); #endif /* _NET_BATMAN_ADV_GATEWAY_COMMON_H_ */ diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index 57f710716..0a7deaf26 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * @@ -18,6 +18,7 @@ #include "hard-interface.h" #include "main.h" +#include <linux/atomic.h> #include <linux/bug.h> #include <linux/byteorder/generic.h> #include <linux/errno.h> @@ -26,6 +27,7 @@ #include <linux/if_ether.h> #include <linux/if.h> #include <linux/kernel.h> +#include <linux/kref.h> #include <linux/list.h> #include <linux/netdevice.h> #include <linux/printk.h> @@ -47,13 +49,19 @@ #include "sysfs.h" #include "translation-table.h" -void batadv_hardif_free_rcu(struct rcu_head *rcu) +/** + * batadv_hardif_release - release hard interface from lists and queue for + * free after rcu grace period + * @ref: kref pointer of the hard interface + */ +void batadv_hardif_release(struct kref *ref) { struct batadv_hard_iface *hard_iface; - hard_iface = container_of(rcu, struct batadv_hard_iface, rcu); + hard_iface = container_of(ref, struct batadv_hard_iface, refcount); dev_put(hard_iface->net_dev); - kfree(hard_iface); + + kfree_rcu(hard_iface, rcu); } struct batadv_hard_iface * @@ -64,7 +72,7 @@ batadv_hardif_get_by_netdev(const struct net_device *net_dev) rcu_read_lock(); list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) { if (hard_iface->net_dev == net_dev && - atomic_inc_not_zero(&hard_iface->refcount)) + kref_get_unless_zero(&hard_iface->refcount)) goto out; } @@ -107,7 +115,7 @@ static bool batadv_mutual_parents(const struct net_device *dev1, * This function recursively checks all the fathers of the device passed as * argument looking for a batman-adv soft interface. * - * Returns true if the device is descendant of a batman-adv mesh interface (or + * Return: true if the device is descendant of a batman-adv mesh interface (or * if it is a batman-adv interface itself), false otherwise */ static bool batadv_is_on_batman_iface(const struct net_device *net_dev) @@ -161,7 +169,7 @@ static int batadv_is_valid_iface(const struct net_device *net_dev) * interface * @net_device: the device to check * - * Returns true if the net device is a 802.11 wireless device, false otherwise. + * Return: true if the net device is a 802.11 wireless device, false otherwise. */ bool batadv_is_wifi_netdev(struct net_device *net_device) { @@ -194,7 +202,7 @@ batadv_hardif_get_active(const struct net_device *soft_iface) continue; if (hard_iface->if_status == BATADV_IF_ACTIVE && - atomic_inc_not_zero(&hard_iface->refcount)) + kref_get_unless_zero(&hard_iface->refcount)) goto out; } @@ -218,7 +226,7 @@ static void batadv_primary_if_update_addr(struct batadv_priv *bat_priv, batadv_bla_update_orig_address(bat_priv, primary_if, oldif); out: if (primary_if) - batadv_hardif_free_ref(primary_if); + batadv_hardif_put(primary_if); } static void batadv_primary_if_select(struct batadv_priv *bat_priv, @@ -228,7 +236,7 @@ static void batadv_primary_if_select(struct batadv_priv *bat_priv, ASSERT_RTNL(); - if (new_hard_iface && !atomic_inc_not_zero(&new_hard_iface->refcount)) + if (new_hard_iface && !kref_get_unless_zero(&new_hard_iface->refcount)) new_hard_iface = NULL; curr_hard_iface = rcu_dereference_protected(bat_priv->primary_if, 1); @@ -242,7 +250,7 @@ static void batadv_primary_if_select(struct batadv_priv *bat_priv, out: if (curr_hard_iface) - batadv_hardif_free_ref(curr_hard_iface); + batadv_hardif_put(curr_hard_iface); } static bool @@ -399,9 +407,12 @@ batadv_hardif_activate_interface(struct batadv_hard_iface *hard_iface) batadv_update_min_mtu(hard_iface->soft_iface); + if (bat_priv->bat_algo_ops->bat_iface_activate) + bat_priv->bat_algo_ops->bat_iface_activate(hard_iface); + out: if (primary_if) - batadv_hardif_free_ref(primary_if); + batadv_hardif_put(primary_if); } static void @@ -426,7 +437,8 @@ batadv_hardif_deactivate_interface(struct batadv_hard_iface *hard_iface) * * Invoke ndo_del_slave on master passing slave as argument. In this way slave * is free'd and master can correctly change its internal state. - * Return 0 on success, a negative value representing the error otherwise + * + * Return: 0 on success, a negative value representing the error otherwise */ static int batadv_master_del_slave(struct batadv_hard_iface *slave, struct net_device *master) @@ -455,7 +467,7 @@ int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface, if (hard_iface->if_status != BATADV_IF_NOT_IN_USE) goto out; - if (!atomic_inc_not_zero(&hard_iface->refcount)) + if (!kref_get_unless_zero(&hard_iface->refcount)) goto out; soft_iface = dev_get_by_name(&init_net, iface_name); @@ -553,7 +565,7 @@ err_dev: hard_iface->soft_iface = NULL; dev_put(soft_iface); err: - batadv_hardif_free_ref(hard_iface); + batadv_hardif_put(hard_iface); return ret; } @@ -563,8 +575,7 @@ void batadv_hardif_disable_interface(struct batadv_hard_iface *hard_iface, struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface); struct batadv_hard_iface *primary_if = NULL; - if (hard_iface->if_status == BATADV_IF_ACTIVE) - batadv_hardif_deactivate_interface(hard_iface); + batadv_hardif_deactivate_interface(hard_iface); if (hard_iface->if_status != BATADV_IF_INACTIVE) goto out; @@ -584,7 +595,7 @@ void batadv_hardif_disable_interface(struct batadv_hard_iface *hard_iface, batadv_primary_if_select(bat_priv, new_if); if (new_if) - batadv_hardif_free_ref(new_if); + batadv_hardif_put(new_if); } bat_priv->bat_algo_ops->bat_iface_disable(hard_iface); @@ -607,11 +618,11 @@ void batadv_hardif_disable_interface(struct batadv_hard_iface *hard_iface, } hard_iface->soft_iface = NULL; - batadv_hardif_free_ref(hard_iface); + batadv_hardif_put(hard_iface); out: if (primary_if) - batadv_hardif_free_ref(primary_if); + batadv_hardif_put(primary_if); } /** @@ -630,7 +641,7 @@ static void batadv_hardif_remove_interface_finish(struct work_struct *work) batadv_debugfs_del_hardif(hard_iface); batadv_sysfs_del_hardif(&hard_iface->hardif_obj); - batadv_hardif_free_ref(hard_iface); + batadv_hardif_put(hard_iface); } static struct batadv_hard_iface * @@ -676,7 +687,8 @@ batadv_hardif_add_interface(struct net_device *net_dev) hard_iface->num_bcasts = BATADV_NUM_BCASTS_WIRELESS; /* extra reference for return */ - atomic_set(&hard_iface->refcount, 2); + kref_init(&hard_iface->refcount); + kref_get(&hard_iface->refcount); batadv_check_known_mac_addr(hard_iface->net_dev); list_add_tail_rcu(&hard_iface->list, &batadv_hardif_list); @@ -784,10 +796,10 @@ static int batadv_hard_if_event(struct notifier_block *this, } hardif_put: - batadv_hardif_free_ref(hard_iface); + batadv_hardif_put(hard_iface); out: if (primary_if) - batadv_hardif_free_ref(primary_if); + batadv_hardif_put(primary_if); return NOTIFY_DONE; } diff --git a/net/batman-adv/hard-interface.h b/net/batman-adv/hard-interface.h index 7b12ea8ea..d74f1983f 100644 --- a/net/batman-adv/hard-interface.h +++ b/net/batman-adv/hard-interface.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * @@ -20,8 +20,8 @@ #include "main.h" -#include <linux/atomic.h> #include <linux/compiler.h> +#include <linux/kref.h> #include <linux/notifier.h> #include <linux/rcupdate.h> #include <linux/stddef.h> @@ -61,18 +61,16 @@ void batadv_hardif_disable_interface(struct batadv_hard_iface *hard_iface, void batadv_hardif_remove_interfaces(void); int batadv_hardif_min_mtu(struct net_device *soft_iface); void batadv_update_min_mtu(struct net_device *soft_iface); -void batadv_hardif_free_rcu(struct rcu_head *rcu); +void batadv_hardif_release(struct kref *ref); /** - * batadv_hardif_free_ref - decrement the hard interface refcounter and - * possibly free it + * batadv_hardif_put - decrement the hard interface refcounter and possibly + * release it * @hard_iface: the hard interface to free */ -static inline void -batadv_hardif_free_ref(struct batadv_hard_iface *hard_iface) +static inline void batadv_hardif_put(struct batadv_hard_iface *hard_iface) { - if (atomic_dec_and_test(&hard_iface->refcount)) - call_rcu(&hard_iface->rcu, batadv_hardif_free_rcu); + kref_put(&hard_iface->refcount, batadv_hardif_release); } static inline struct batadv_hard_iface * @@ -85,7 +83,7 @@ batadv_primary_if_get_selected(struct batadv_priv *bat_priv) if (!hard_iface) goto out; - if (!atomic_inc_not_zero(&hard_iface->refcount)) + if (!kref_get_unless_zero(&hard_iface->refcount)) hard_iface = NULL; out: diff --git a/net/batman-adv/hash.c b/net/batman-adv/hash.c index 2ea6a18d7..a0a0fdb85 100644 --- a/net/batman-adv/hash.c +++ b/net/batman-adv/hash.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2006-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2006-2016 B.A.T.M.A.N. contributors: * * Simon Wunderlich, Marek Lindner * diff --git a/net/batman-adv/hash.h b/net/batman-adv/hash.h index 377626250..9bb57b874 100644 --- a/net/batman-adv/hash.h +++ b/net/batman-adv/hash.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2006-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2006-2016 B.A.T.M.A.N. contributors: * * Simon Wunderlich, Marek Lindner * @@ -30,14 +30,17 @@ struct lock_class_key; /* callback to a compare function. should compare 2 element datas for their - * keys, return 0 if same and not 0 if not same + * keys + * + * Return: 0 if same and not 0 if not same */ typedef int (*batadv_hashdata_compare_cb)(const struct hlist_node *, const void *); -/* the hashfunction, should return an index - * based on the key in the data of the first - * argument and the size the second +/* the hashfunction + * + * Return: an index based on the key in the data of the first argument and the + * size the second */ typedef u32 (*batadv_hashdata_choose_cb)(const void *, u32); typedef void (*batadv_hashdata_free_cb)(struct hlist_node *, void *); @@ -96,7 +99,7 @@ static inline void batadv_hash_delete(struct batadv_hashtable *hash, * @data: data passed to the aforementioned callbacks as argument * @data_node: to be added element * - * Returns 0 on success, 1 if the element already is in the hash + * Return: 0 on success, 1 if the element already is in the hash * and -1 on error. */ static inline int batadv_hash_add(struct batadv_hashtable *hash, @@ -139,10 +142,11 @@ out: return ret; } -/* removes data from hash, if found. returns pointer do data on success, so you - * can remove the used structure yourself, or NULL on error . data could be the - * structure you use with just the key filled, we just need the key for - * comparing. +/* removes data from hash, if found. data could be the structure you use with + * just the key filled, we just need the key for comparing. + * + * Return: returns pointer do data on success, so you can remove the used + * structure yourself, or NULL on error */ static inline void *batadv_hash_remove(struct batadv_hashtable *hash, batadv_hashdata_compare_cb compare, diff --git a/net/batman-adv/icmp_socket.c b/net/batman-adv/icmp_socket.c index bcabb5e3f..14d0013b3 100644 --- a/net/batman-adv/icmp_socket.c +++ b/net/batman-adv/icmp_socket.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: * * Marek Lindner * @@ -278,7 +278,7 @@ static ssize_t batadv_socket_write(struct file *file, const char __user *buff, ether_addr_copy(icmp_header->orig, primary_if->net_dev->dev_addr); - batadv_send_skb_packet(skb, neigh_node->if_incoming, neigh_node->addr); + batadv_send_unicast_skb(skb, neigh_node); goto out; dst_unreach: @@ -288,11 +288,11 @@ free_skb: kfree_skb(skb); out: if (primary_if) - batadv_hardif_free_ref(primary_if); + batadv_hardif_put(primary_if); if (neigh_node) - batadv_neigh_node_free_ref(neigh_node); + batadv_neigh_node_put(neigh_node); if (orig_node) - batadv_orig_node_free_ref(orig_node); + batadv_orig_node_put(orig_node); return len; } diff --git a/net/batman-adv/icmp_socket.h b/net/batman-adv/icmp_socket.h index e937143f0..618d5de06 100644 --- a/net/batman-adv/icmp_socket.h +++ b/net/batman-adv/icmp_socket.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: * * Marek Lindner * diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c index 4b5d61fba..d64ddb961 100644 --- a/net/batman-adv/main.c +++ b/net/batman-adv/main.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * @@ -29,6 +29,7 @@ #include <linux/ip.h> #include <linux/ipv6.h> #include <linux/kernel.h> +#include <linux/kref.h> #include <linux/list.h> #include <linux/lockdep.h> #include <linux/module.h> @@ -86,6 +87,7 @@ static int __init batadv_init(void) batadv_recv_handler_init(); + batadv_v_init(); batadv_iv_init(); batadv_nc_init(); @@ -158,6 +160,10 @@ int batadv_mesh_init(struct net_device *soft_iface) INIT_HLIST_HEAD(&bat_priv->tvlv.handler_list); INIT_HLIST_HEAD(&bat_priv->softif_vlan_list); + ret = batadv_v_mesh_init(bat_priv); + if (ret < 0) + goto err; + ret = batadv_originator_init(bat_priv); if (ret < 0) goto err; @@ -200,6 +206,8 @@ void batadv_mesh_free(struct net_device *soft_iface) batadv_purge_outstanding_packets(bat_priv, NULL); batadv_gw_node_free(bat_priv); + + batadv_v_mesh_free(bat_priv); batadv_nc_mesh_free(bat_priv); batadv_dat_free(bat_priv); batadv_bla_free(bat_priv); @@ -233,7 +241,7 @@ void batadv_mesh_free(struct net_device *soft_iface) * @bat_priv: the bat priv with all the soft interface information * @addr: the address to check * - * Returns 'true' if the mac address was found, false otherwise. + * Return: 'true' if the mac address was found, false otherwise. */ bool batadv_is_my_mac(struct batadv_priv *bat_priv, const u8 *addr) { @@ -262,7 +270,7 @@ bool batadv_is_my_mac(struct batadv_priv *bat_priv, const u8 *addr) * function that requires the primary interface * @seq: debugfs table seq_file struct * - * Returns primary interface if found or NULL otherwise. + * Return: primary interface if found or NULL otherwise. */ struct batadv_hard_iface * batadv_seq_print_text_primary_if_get(struct seq_file *seq) @@ -286,7 +294,7 @@ batadv_seq_print_text_primary_if_get(struct seq_file *seq) seq_printf(seq, "BATMAN mesh %s disabled - primary interface not active\n", net_dev->name); - batadv_hardif_free_ref(primary_if); + batadv_hardif_put(primary_if); primary_if = NULL; out: @@ -297,7 +305,7 @@ out: * batadv_max_header_len - calculate maximum encapsulation overhead for a * payload packet * - * Return the maximum encapsulation overhead in bytes. + * Return: the maximum encapsulation overhead in bytes. */ int batadv_max_header_len(void) { @@ -599,6 +607,8 @@ int batadv_algo_seq_print_text(struct seq_file *seq, void *offset) * * payload_ptr must always point to an address in the skb head buffer and not to * a fragment. + * + * Return: big endian crc32c of the checksummed data */ __be32 batadv_skb_crc32(struct sk_buff *skb, u8 *payload_ptr) { @@ -622,15 +632,26 @@ __be32 batadv_skb_crc32(struct sk_buff *skb, u8 *payload_ptr) } /** - * batadv_tvlv_handler_free_ref - decrement the tvlv handler refcounter and - * possibly free it + * batadv_tvlv_handler_release - release tvlv handler from lists and queue for + * free after rcu grace period + * @ref: kref pointer of the tvlv + */ +static void batadv_tvlv_handler_release(struct kref *ref) +{ + struct batadv_tvlv_handler *tvlv_handler; + + tvlv_handler = container_of(ref, struct batadv_tvlv_handler, refcount); + kfree_rcu(tvlv_handler, rcu); +} + +/** + * batadv_tvlv_handler_put - decrement the tvlv container refcounter and + * possibly release it * @tvlv_handler: the tvlv handler to free */ -static void -batadv_tvlv_handler_free_ref(struct batadv_tvlv_handler *tvlv_handler) +static void batadv_tvlv_handler_put(struct batadv_tvlv_handler *tvlv_handler) { - if (atomic_dec_and_test(&tvlv_handler->refcount)) - kfree_rcu(tvlv_handler, rcu); + kref_put(&tvlv_handler->refcount, batadv_tvlv_handler_release); } /** @@ -640,7 +661,7 @@ batadv_tvlv_handler_free_ref(struct batadv_tvlv_handler *tvlv_handler) * @type: tvlv handler type to look for * @version: tvlv handler version to look for * - * Returns tvlv handler if found or NULL otherwise. + * Return: tvlv handler if found or NULL otherwise. */ static struct batadv_tvlv_handler *batadv_tvlv_handler_get(struct batadv_priv *bat_priv, u8 type, u8 version) @@ -656,7 +677,7 @@ static struct batadv_tvlv_handler if (tvlv_handler_tmp->version != version) continue; - if (!atomic_inc_not_zero(&tvlv_handler_tmp->refcount)) + if (!kref_get_unless_zero(&tvlv_handler_tmp->refcount)) continue; tvlv_handler = tvlv_handler_tmp; @@ -668,14 +689,25 @@ static struct batadv_tvlv_handler } /** - * batadv_tvlv_container_free_ref - decrement the tvlv container refcounter and - * possibly free it + * batadv_tvlv_container_release - release tvlv from lists and free + * @ref: kref pointer of the tvlv + */ +static void batadv_tvlv_container_release(struct kref *ref) +{ + struct batadv_tvlv_container *tvlv; + + tvlv = container_of(ref, struct batadv_tvlv_container, refcount); + kfree(tvlv); +} + +/** + * batadv_tvlv_container_put - decrement the tvlv container refcounter and + * possibly release it * @tvlv: the tvlv container to free */ -static void batadv_tvlv_container_free_ref(struct batadv_tvlv_container *tvlv) +static void batadv_tvlv_container_put(struct batadv_tvlv_container *tvlv) { - if (atomic_dec_and_test(&tvlv->refcount)) - kfree(tvlv); + kref_put(&tvlv->refcount, batadv_tvlv_container_release); } /** @@ -688,13 +720,15 @@ static void batadv_tvlv_container_free_ref(struct batadv_tvlv_container *tvlv) * Has to be called with the appropriate locks being acquired * (tvlv.container_list_lock). * - * Returns tvlv container if found or NULL otherwise. + * Return: tvlv container if found or NULL otherwise. */ static struct batadv_tvlv_container *batadv_tvlv_container_get(struct batadv_priv *bat_priv, u8 type, u8 version) { struct batadv_tvlv_container *tvlv_tmp, *tvlv = NULL; + lockdep_assert_held(&bat_priv->tvlv.container_list_lock); + hlist_for_each_entry(tvlv_tmp, &bat_priv->tvlv.container_list, list) { if (tvlv_tmp->tvlv_hdr.type != type) continue; @@ -702,7 +736,7 @@ static struct batadv_tvlv_container if (tvlv_tmp->tvlv_hdr.version != version) continue; - if (!atomic_inc_not_zero(&tvlv_tmp->refcount)) + if (!kref_get_unless_zero(&tvlv_tmp->refcount)) continue; tvlv = tvlv_tmp; @@ -720,13 +754,15 @@ static struct batadv_tvlv_container * Has to be called with the appropriate locks being acquired * (tvlv.container_list_lock). * - * Returns size of all currently registered tvlv containers in bytes. + * Return: size of all currently registered tvlv containers in bytes. */ static u16 batadv_tvlv_container_list_size(struct batadv_priv *bat_priv) { struct batadv_tvlv_container *tvlv; u16 tvlv_len = 0; + lockdep_assert_held(&bat_priv->tvlv.container_list_lock); + hlist_for_each_entry(tvlv, &bat_priv->tvlv.container_list, list) { tvlv_len += sizeof(struct batadv_tvlv_hdr); tvlv_len += ntohs(tvlv->tvlv_hdr.len); @@ -755,8 +791,8 @@ static void batadv_tvlv_container_remove(struct batadv_priv *bat_priv, hlist_del(&tvlv->list); /* first call to decrement the counter, second call to free */ - batadv_tvlv_container_free_ref(tvlv); - batadv_tvlv_container_free_ref(tvlv); + batadv_tvlv_container_put(tvlv); + batadv_tvlv_container_put(tvlv); } /** @@ -808,7 +844,7 @@ void batadv_tvlv_container_register(struct batadv_priv *bat_priv, memcpy(tvlv_new + 1, tvlv_value, ntohs(tvlv_new->tvlv_hdr.len)); INIT_HLIST_NODE(&tvlv_new->list); - atomic_set(&tvlv_new->refcount, 1); + kref_init(&tvlv_new->refcount); spin_lock_bh(&bat_priv->tvlv.container_list_lock); tvlv_old = batadv_tvlv_container_get(bat_priv, type, version); @@ -826,7 +862,7 @@ void batadv_tvlv_container_register(struct batadv_priv *bat_priv, * @additional_packet_len: requested additional packet size on top of minimum * size * - * Returns true of the packet buffer could be changed to the requested size, + * Return: true of the packet buffer could be changed to the requested size, * false otherwise. */ static bool batadv_tvlv_realloc_packet_buff(unsigned char **packet_buff, @@ -862,7 +898,7 @@ static bool batadv_tvlv_realloc_packet_buff(unsigned char **packet_buff, * The ogm packet might be enlarged or shrunk depending on the current size * and the size of the to-be-appended tvlv containers. * - * Returns size of all appended tvlv containers in bytes. + * Return: size of all appended tvlv containers in bytes. */ u16 batadv_tvlv_container_ogm_append(struct batadv_priv *bat_priv, unsigned char **packet_buff, @@ -915,7 +951,7 @@ end: * @tvlv_value: tvlv content * @tvlv_value_len: tvlv content length * - * Returns success if handler was not found or the return value of the handler + * Return: success if handler was not found or the return value of the handler * callback. */ static int batadv_tvlv_call_handler(struct batadv_priv *bat_priv, @@ -968,7 +1004,7 @@ static int batadv_tvlv_call_handler(struct batadv_priv *bat_priv, * @tvlv_value: tvlv content * @tvlv_value_len: tvlv content length * - * Returns success when processing an OGM or the return value of all called + * Return: success when processing an OGM or the return value of all called * handler callbacks. */ int batadv_tvlv_containers_process(struct batadv_priv *bat_priv, @@ -1001,7 +1037,7 @@ int batadv_tvlv_containers_process(struct batadv_priv *bat_priv, src, dst, tvlv_value, tvlv_value_cont_len); if (tvlv_handler) - batadv_tvlv_handler_free_ref(tvlv_handler); + batadv_tvlv_handler_put(tvlv_handler); tvlv_value = (u8 *)tvlv_value + tvlv_value_cont_len; tvlv_value_len -= tvlv_value_cont_len; } @@ -1081,7 +1117,7 @@ void batadv_tvlv_handler_register(struct batadv_priv *bat_priv, tvlv_handler = batadv_tvlv_handler_get(bat_priv, type, version); if (tvlv_handler) { - batadv_tvlv_handler_free_ref(tvlv_handler); + batadv_tvlv_handler_put(tvlv_handler); return; } @@ -1094,7 +1130,7 @@ void batadv_tvlv_handler_register(struct batadv_priv *bat_priv, tvlv_handler->type = type; tvlv_handler->version = version; tvlv_handler->flags = flags; - atomic_set(&tvlv_handler->refcount, 1); + kref_init(&tvlv_handler->refcount); INIT_HLIST_NODE(&tvlv_handler->list); spin_lock_bh(&bat_priv->tvlv.handler_list_lock); @@ -1118,11 +1154,11 @@ void batadv_tvlv_handler_unregister(struct batadv_priv *bat_priv, if (!tvlv_handler) return; - batadv_tvlv_handler_free_ref(tvlv_handler); + batadv_tvlv_handler_put(tvlv_handler); spin_lock_bh(&bat_priv->tvlv.handler_list_lock); hlist_del_rcu(&tvlv_handler->list); spin_unlock_bh(&bat_priv->tvlv.handler_list_lock); - batadv_tvlv_handler_free_ref(tvlv_handler); + batadv_tvlv_handler_put(tvlv_handler); } /** @@ -1182,7 +1218,7 @@ void batadv_tvlv_unicast_send(struct batadv_priv *bat_priv, u8 *src, if (batadv_send_skb_to_orig(skb, orig_node, NULL) == NET_XMIT_DROP) kfree_skb(skb); out: - batadv_orig_node_free_ref(orig_node); + batadv_orig_node_put(orig_node); } /** @@ -1190,8 +1226,8 @@ out: * @skb: the buffer containing the packet * @header_len: length of the batman header preceding the ethernet header * - * If the packet embedded in the skb is vlan tagged this function returns the - * VID with the BATADV_VLAN_HAS_TAG flag. Otherwise BATADV_NO_FLAGS is returned. + * Return: VID with the BATADV_VLAN_HAS_TAG flag when the packet embedded in the + * skb is vlan tagged. Otherwise BATADV_NO_FLAGS. */ unsigned short batadv_get_vid(struct sk_buff *skb, size_t header_len) { @@ -1218,7 +1254,7 @@ unsigned short batadv_get_vid(struct sk_buff *skb, size_t header_len) * @vid: the VLAN identifier for which the AP isolation attributed as to be * looked up * - * Returns true if AP isolation is on for the VLAN idenfied by vid, false + * Return: true if AP isolation is on for the VLAN idenfied by vid, false * otherwise */ bool batadv_vlan_ap_isola_get(struct batadv_priv *bat_priv, unsigned short vid) @@ -1232,7 +1268,7 @@ bool batadv_vlan_ap_isola_get(struct batadv_priv *bat_priv, unsigned short vid) vlan = batadv_softif_vlan_get(bat_priv, vid); if (vlan) { ap_isolation_enabled = atomic_read(&vlan->ap_isolation); - batadv_softif_vlan_free_ref(vlan); + batadv_softif_vlan_put(vlan); } return ap_isolation_enabled; diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h index 9dbd9107e..db4533631 100644 --- a/net/batman-adv/main.h +++ b/net/batman-adv/main.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * @@ -24,17 +24,21 @@ #define BATADV_DRIVER_DEVICE "batman-adv" #ifndef BATADV_SOURCE_VERSION -#define BATADV_SOURCE_VERSION "2016.0" +#define BATADV_SOURCE_VERSION "2016.1" #endif /* B.A.T.M.A.N. parameters */ #define BATADV_TQ_MAX_VALUE 255 +#define BATADV_THROUGHPUT_MAX_VALUE 0xFFFFFFFF #define BATADV_JITTER 20 /* Time To Live of broadcast messages */ #define BATADV_TTL 50 +/* maximum sequence number age of broadcast messages */ +#define BATADV_BCAST_MAX_AGE 64 + /* purge originators after time in seconds if no valid packet comes in * -> TODO: check influence on BATADV_TQ_LOCAL_WINDOW_SIZE */ @@ -57,6 +61,15 @@ #define BATADV_TQ_LOCAL_BIDRECT_RECV_MINIMUM 1 #define BATADV_TQ_TOTAL_BIDRECT_LIMIT 1 +/* B.A.T.M.A.N. V */ +#define BATADV_THROUGHPUT_DEFAULT_VALUE 10 /* 1 Mbps */ +#define BATADV_ELP_PROBES_PER_NODE 2 +#define BATADV_ELP_MIN_PROBE_SIZE 200 /* bytes */ +#define BATADV_ELP_PROBE_MAX_TX_DIFF 100 /* milliseconds */ +#define BATADV_ELP_MAX_AGE 64 +#define BATADV_OGM_MAX_ORIGDIFF 5 +#define BATADV_OGM_MAX_AGE 64 + /* number of OGMs sent with the last tt diff */ #define BATADV_TT_OGM_APPEND_MAX 3 @@ -97,11 +110,6 @@ */ #define BATADV_TQ_SIMILARITY_THRESHOLD 50 -/* how much worse secondary interfaces may be to be considered as bonding - * candidates - */ -#define BATADV_BONDING_TQ_THRESHOLD 50 - /* should not be bigger than 512 bytes or change the size of * forw_packet->direct_link_flags */ @@ -273,9 +281,14 @@ static inline void _batadv_dbg(int type __always_unused, pr_err("%s: " fmt, _netdev->name, ## arg); \ } while (0) -/* returns 1 if they are the same ethernet addr +/** + * batadv_compare_eth - Compare two not u16 aligned Ethernet addresses + * @data1: Pointer to a six-byte array containing the Ethernet address + * @data2: Pointer other six-byte array containing the Ethernet address * * note: can't use ether_addr_equal() as it requires aligned memory + * + * Return: 1 if they are the same ethernet addr */ static inline bool batadv_compare_eth(const void *data1, const void *data2) { @@ -287,7 +300,7 @@ static inline bool batadv_compare_eth(const void *data1, const void *data2) * @timestamp: base value to compare with (in jiffies) * @timeout: added to base value before comparing (in milliseconds) * - * Returns true if current time is after timestamp + timeout + * Return: true if current time is after timestamp + timeout */ static inline bool batadv_has_timed_out(unsigned long timestamp, unsigned int timeout) @@ -326,7 +339,13 @@ static inline void batadv_add_counter(struct batadv_priv *bat_priv, size_t idx, #define batadv_inc_counter(b, i) batadv_add_counter(b, i, 1) -/* Sum and return the cpu-local counters for index 'idx' */ +/** + * batadv_sum_counter - Sum the cpu-local counters for index 'idx' + * @bat_priv: the bat priv with all the soft interface information + * @idx: index of counter to sum up + * + * Return: sum of all cpu-local counters + */ static inline u64 batadv_sum_counter(struct batadv_priv *bat_priv, size_t idx) { u64 *counters, sum = 0; diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c index 75fa5013a..8caa2c72e 100644 --- a/net/batman-adv/multicast.c +++ b/net/batman-adv/multicast.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2014-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2014-2016 B.A.T.M.A.N. contributors: * * Linus Lüssing * @@ -30,6 +30,7 @@ #include <linux/in.h> #include <linux/ip.h> #include <linux/ipv6.h> +#include <linux/kref.h> #include <linux/list.h> #include <linux/lockdep.h> #include <linux/netdevice.h> @@ -55,7 +56,7 @@ * Collect multicast addresses of the local multicast listeners * on the given soft interface, dev, in the given mcast_list. * - * Returns -ENOMEM on memory allocation error or the number of + * Return: -ENOMEM on memory allocation error or the number of * items added to the mcast_list otherwise. */ static int batadv_mcast_mla_softif_get(struct net_device *dev, @@ -87,7 +88,7 @@ static int batadv_mcast_mla_softif_get(struct net_device *dev, * @mcast_addr: the multicast address to check * @mcast_list: the list with multicast addresses to search in * - * Returns true if the given address is already in the given list. + * Return: true if the given address is already in the given list. * Otherwise returns false. */ static bool batadv_mcast_mla_is_duplicate(u8 *mcast_addr, @@ -195,8 +196,9 @@ static void batadv_mcast_mla_tt_add(struct batadv_priv *bat_priv, * batadv_mcast_has_bridge - check whether the soft-iface is bridged * @bat_priv: the bat priv with all the soft interface information * - * Checks whether there is a bridge on top of our soft interface. Returns - * true if so, false otherwise. + * Checks whether there is a bridge on top of our soft interface. + * + * Return: true if there is a bridge, false otherwise. */ static bool batadv_mcast_has_bridge(struct batadv_priv *bat_priv) { @@ -218,7 +220,7 @@ static bool batadv_mcast_has_bridge(struct batadv_priv *bat_priv) * Updates the own multicast tvlv with our current multicast related settings, * capabilities and inabilities. * - * Returns true if the tvlv container is registered afterwards. Otherwise + * Return: true if the tvlv container is registered afterwards. Otherwise * returns false. */ static bool batadv_mcast_mla_tvlv_update(struct batadv_priv *bat_priv) @@ -289,8 +291,8 @@ out: * Checks whether the given IPv4 packet has the potential to be forwarded with a * mode more optimal than classic flooding. * - * If so then returns 0. Otherwise -EINVAL is returned or -ENOMEM in case of - * memory allocation failure. + * Return: If so then 0. Otherwise -EINVAL or -ENOMEM in case of memory + * allocation failure. */ static int batadv_mcast_forw_mode_check_ipv4(struct batadv_priv *bat_priv, struct sk_buff *skb, @@ -327,8 +329,7 @@ static int batadv_mcast_forw_mode_check_ipv4(struct batadv_priv *bat_priv, * Checks whether the given IPv6 packet has the potential to be forwarded with a * mode more optimal than classic flooding. * - * If so then returns 0. Otherwise -EINVAL is returned or -ENOMEM if we are out - * of memory. + * Return: If so then 0. Otherwise -EINVAL is or -ENOMEM if we are out of memory */ static int batadv_mcast_forw_mode_check_ipv6(struct batadv_priv *bat_priv, struct sk_buff *skb, @@ -366,8 +367,7 @@ static int batadv_mcast_forw_mode_check_ipv6(struct batadv_priv *bat_priv, * Checks whether the given multicast ethernet frame has the potential to be * forwarded with a mode more optimal than classic flooding. * - * If so then returns 0. Otherwise -EINVAL is returned or -ENOMEM if we are out - * of memory. + * Return: If so then 0. Otherwise -EINVAL is or -ENOMEM if we are out of memory */ static int batadv_mcast_forw_mode_check(struct batadv_priv *bat_priv, struct sk_buff *skb, @@ -398,7 +398,7 @@ static int batadv_mcast_forw_mode_check(struct batadv_priv *bat_priv, * @bat_priv: the bat priv with all the soft interface information * @ethhdr: ethernet header of a packet * - * Returns the number of nodes which want all IPv4 multicast traffic if the + * Return: the number of nodes which want all IPv4 multicast traffic if the * given ethhdr is from an IPv4 packet or the number of nodes which want all * IPv6 traffic if it matches an IPv6 packet. */ @@ -421,7 +421,7 @@ static int batadv_mcast_forw_want_all_ip_count(struct batadv_priv *bat_priv, * @bat_priv: the bat priv with all the soft interface information * @ethhdr: the ether header containing the multicast destination * - * Returns an orig_node matching the multicast address provided by ethhdr + * Return: an orig_node matching the multicast address provided by ethhdr * via a translation table lookup. This increases the returned nodes refcount. */ static struct batadv_orig_node * @@ -436,7 +436,7 @@ batadv_mcast_forw_tt_node_get(struct batadv_priv *bat_priv, * batadv_mcast_want_forw_ipv4_node_get - get a node with an ipv4 flag * @bat_priv: the bat priv with all the soft interface information * - * Returns an orig_node which has the BATADV_MCAST_WANT_ALL_IPV4 flag set and + * Return: an orig_node which has the BATADV_MCAST_WANT_ALL_IPV4 flag set and * increases its refcount. */ static struct batadv_orig_node * @@ -448,7 +448,7 @@ batadv_mcast_forw_ipv4_node_get(struct batadv_priv *bat_priv) hlist_for_each_entry_rcu(tmp_orig_node, &bat_priv->mcast.want_all_ipv4_list, mcast_want_all_ipv4_node) { - if (!atomic_inc_not_zero(&tmp_orig_node->refcount)) + if (!kref_get_unless_zero(&tmp_orig_node->refcount)) continue; orig_node = tmp_orig_node; @@ -463,7 +463,7 @@ batadv_mcast_forw_ipv4_node_get(struct batadv_priv *bat_priv) * batadv_mcast_want_forw_ipv6_node_get - get a node with an ipv6 flag * @bat_priv: the bat priv with all the soft interface information * - * Returns an orig_node which has the BATADV_MCAST_WANT_ALL_IPV6 flag set + * Return: an orig_node which has the BATADV_MCAST_WANT_ALL_IPV6 flag set * and increases its refcount. */ static struct batadv_orig_node * @@ -475,7 +475,7 @@ batadv_mcast_forw_ipv6_node_get(struct batadv_priv *bat_priv) hlist_for_each_entry_rcu(tmp_orig_node, &bat_priv->mcast.want_all_ipv6_list, mcast_want_all_ipv6_node) { - if (!atomic_inc_not_zero(&tmp_orig_node->refcount)) + if (!kref_get_unless_zero(&tmp_orig_node->refcount)) continue; orig_node = tmp_orig_node; @@ -491,7 +491,7 @@ batadv_mcast_forw_ipv6_node_get(struct batadv_priv *bat_priv) * @bat_priv: the bat priv with all the soft interface information * @ethhdr: an ethernet header to determine the protocol family from * - * Returns an orig_node which has the BATADV_MCAST_WANT_ALL_IPV4 or + * Return: an orig_node which has the BATADV_MCAST_WANT_ALL_IPV4 or * BATADV_MCAST_WANT_ALL_IPV6 flag, depending on the provided ethhdr, set and * increases its refcount. */ @@ -514,7 +514,7 @@ batadv_mcast_forw_ip_node_get(struct batadv_priv *bat_priv, * batadv_mcast_want_forw_unsnoop_node_get - get a node with an unsnoopable flag * @bat_priv: the bat priv with all the soft interface information * - * Returns an orig_node which has the BATADV_MCAST_WANT_ALL_UNSNOOPABLES flag + * Return: an orig_node which has the BATADV_MCAST_WANT_ALL_UNSNOOPABLES flag * set and increases its refcount. */ static struct batadv_orig_node * @@ -526,7 +526,7 @@ batadv_mcast_forw_unsnoop_node_get(struct batadv_priv *bat_priv) hlist_for_each_entry_rcu(tmp_orig_node, &bat_priv->mcast.want_all_unsnoopables_list, mcast_want_all_unsnoopables_node) { - if (!atomic_inc_not_zero(&tmp_orig_node->refcount)) + if (!kref_get_unless_zero(&tmp_orig_node->refcount)) continue; orig_node = tmp_orig_node; @@ -543,7 +543,7 @@ batadv_mcast_forw_unsnoop_node_get(struct batadv_priv *bat_priv) * @skb: The multicast packet to check * @orig: an originator to be set to forward the skb to * - * Returns the forwarding mode as enum batadv_forw_mode and in case of + * Return: the forwarding mode as enum batadv_forw_mode and in case of * BATADV_FORW_SINGLE set the orig to the single originator the skb * should be forwarded to. */ diff --git a/net/batman-adv/multicast.h b/net/batman-adv/multicast.h index 8f3cb04b9..80bceec55 100644 --- a/net/batman-adv/multicast.h +++ b/net/batman-adv/multicast.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2014-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2014-2016 B.A.T.M.A.N. contributors: * * Linus Lüssing * @@ -23,7 +23,7 @@ struct sk_buff; /** - * batadv_forw_mode - the way a packet should be forwarded as + * enum batadv_forw_mode - the way a packet should be forwarded as * @BATADV_FORW_ALL: forward the packet to all nodes (currently via classic * flooding) * @BATADV_FORW_SINGLE: forward the packet to a single node (currently via the diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c index cc63b44f0..b41719b64 100644 --- a/net/batman-adv/network-coding.c +++ b/net/batman-adv/network-coding.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2012-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2012-2016 B.A.T.M.A.N. contributors: * * Martin Hundebøll, Jeppe Ledet-Pedersen * @@ -32,6 +32,7 @@ #include <linux/jhash.h> #include <linux/jiffies.h> #include <linux/kernel.h> +#include <linux/kref.h> #include <linux/list.h> #include <linux/lockdep.h> #include <linux/netdevice.h> @@ -64,6 +65,8 @@ static int batadv_nc_recv_coded_packet(struct sk_buff *skb, /** * batadv_nc_init - one-time initialization for network coding + * + * Return: 0 on success or negative error number in case of failure */ int __init batadv_nc_init(void) { @@ -142,6 +145,8 @@ static void batadv_nc_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv, /** * batadv_nc_mesh_init - initialise coding hash table and start house keeping * @bat_priv: the bat priv with all the soft interface information + * + * Return: 0 on success or negative error number in case of failure */ int batadv_nc_mesh_init(struct batadv_priv *bat_priv) { @@ -205,34 +210,50 @@ void batadv_nc_init_orig(struct batadv_orig_node *orig_node) /** * batadv_nc_node_release - release nc_node from lists and queue for free after * rcu grace period - * @nc_node: the nc node to free + * @ref: kref pointer of the nc_node */ -static void batadv_nc_node_release(struct batadv_nc_node *nc_node) +static void batadv_nc_node_release(struct kref *ref) { - batadv_orig_node_free_ref(nc_node->orig_node); + struct batadv_nc_node *nc_node; + + nc_node = container_of(ref, struct batadv_nc_node, refcount); + + batadv_orig_node_put(nc_node->orig_node); kfree_rcu(nc_node, rcu); } /** - * batadv_nc_node_free_ref - decrement the nc node refcounter and possibly + * batadv_nc_node_put - decrement the nc_node refcounter and possibly * release it - * @nc_node: the nc node to free + * @nc_node: nc_node to be free'd */ -static void batadv_nc_node_free_ref(struct batadv_nc_node *nc_node) +static void batadv_nc_node_put(struct batadv_nc_node *nc_node) { - if (atomic_dec_and_test(&nc_node->refcount)) - batadv_nc_node_release(nc_node); + kref_put(&nc_node->refcount, batadv_nc_node_release); } /** - * batadv_nc_path_free_ref - decrements the nc path refcounter and possibly - * frees it - * @nc_path: the nc node to free + * batadv_nc_path_release - release nc_path from lists and queue for free after + * rcu grace period + * @ref: kref pointer of the nc_path */ -static void batadv_nc_path_free_ref(struct batadv_nc_path *nc_path) +static void batadv_nc_path_release(struct kref *ref) { - if (atomic_dec_and_test(&nc_path->refcount)) - kfree_rcu(nc_path, rcu); + struct batadv_nc_path *nc_path; + + nc_path = container_of(ref, struct batadv_nc_path, refcount); + + kfree_rcu(nc_path, rcu); +} + +/** + * batadv_nc_path_put - decrement the nc_path refcounter and possibly + * release it + * @nc_path: nc_path to be free'd + */ +static void batadv_nc_path_put(struct batadv_nc_path *nc_path) +{ + kref_put(&nc_path->refcount, batadv_nc_path_release); } /** @@ -242,7 +263,7 @@ static void batadv_nc_path_free_ref(struct batadv_nc_path *nc_path) static void batadv_nc_packet_free(struct batadv_nc_packet *nc_packet) { kfree_skb(nc_packet->skb); - batadv_nc_path_free_ref(nc_packet->nc_path); + batadv_nc_path_put(nc_packet->nc_path); kfree(nc_packet); } @@ -251,7 +272,7 @@ static void batadv_nc_packet_free(struct batadv_nc_packet *nc_packet) * @bat_priv: the bat priv with all the soft interface information * @nc_node: the nc node to check * - * Returns true if the entry has to be purged now, false otherwise + * Return: true if the entry has to be purged now, false otherwise */ static bool batadv_nc_to_purge_nc_node(struct batadv_priv *bat_priv, struct batadv_nc_node *nc_node) @@ -267,7 +288,7 @@ static bool batadv_nc_to_purge_nc_node(struct batadv_priv *bat_priv, * @bat_priv: the bat priv with all the soft interface information * @nc_path: the nc path to check * - * Returns true if the entry has to be purged now, false otherwise + * Return: true if the entry has to be purged now, false otherwise */ static bool batadv_nc_to_purge_nc_path_coding(struct batadv_priv *bat_priv, struct batadv_nc_path *nc_path) @@ -287,7 +308,7 @@ static bool batadv_nc_to_purge_nc_path_coding(struct batadv_priv *bat_priv, * @bat_priv: the bat priv with all the soft interface information * @nc_path: the nc path to check * - * Returns true if the entry has to be purged now, false otherwise + * Return: true if the entry has to be purged now, false otherwise */ static bool batadv_nc_to_purge_nc_path_decoding(struct batadv_priv *bat_priv, struct batadv_nc_path *nc_path) @@ -335,7 +356,7 @@ batadv_nc_purge_orig_nc_nodes(struct batadv_priv *bat_priv, "Removing nc_node %pM -> %pM\n", nc_node->addr, nc_node->orig_node->orig); list_del_rcu(&nc_node->list); - batadv_nc_node_free_ref(nc_node); + batadv_nc_node_put(nc_node); } spin_unlock_bh(lock); } @@ -446,7 +467,7 @@ static void batadv_nc_purge_paths(struct batadv_priv *bat_priv, "Remove nc_path %pM -> %pM\n", nc_path->prev_hop, nc_path->next_hop); hlist_del_rcu(&nc_path->hash_entry); - batadv_nc_path_free_ref(nc_path); + batadv_nc_path_put(nc_path); } spin_unlock_bh(lock); } @@ -470,7 +491,7 @@ static void batadv_nc_hash_key_gen(struct batadv_nc_path *key, const char *src, * @data: data to hash * @size: size of the hash table * - * Returns the selected index in the hash table for the given data. + * Return: the selected index in the hash table for the given data. */ static u32 batadv_nc_hash_choose(const void *data, u32 size) { @@ -489,7 +510,7 @@ static u32 batadv_nc_hash_choose(const void *data, u32 size) * @node: node in the local table * @data2: second object to compare the node to * - * Returns 1 if the two entry are the same, 0 otherwise + * Return: 1 if the two entry are the same, 0 otherwise */ static int batadv_nc_hash_compare(const struct hlist_node *node, const void *data2) @@ -516,7 +537,7 @@ static int batadv_nc_hash_compare(const struct hlist_node *node, * @hash: hash table containing the nc path * @data: search key * - * Returns the nc_path if found, NULL otherwise. + * Return: the nc_path if found, NULL otherwise. */ static struct batadv_nc_path * batadv_nc_hash_find(struct batadv_hashtable *hash, @@ -537,7 +558,7 @@ batadv_nc_hash_find(struct batadv_hashtable *hash, if (!batadv_nc_hash_compare(&nc_path->hash_entry, data)) continue; - if (!atomic_inc_not_zero(&nc_path->refcount)) + if (!kref_get_unless_zero(&nc_path->refcount)) continue; nc_path_tmp = nc_path; @@ -554,9 +575,7 @@ batadv_nc_hash_find(struct batadv_hashtable *hash, */ static void batadv_nc_send_packet(struct batadv_nc_packet *nc_packet) { - batadv_send_skb_packet(nc_packet->skb, - nc_packet->neigh_node->if_incoming, - nc_packet->nc_path->next_hop); + batadv_send_unicast_skb(nc_packet->skb, nc_packet->neigh_node); nc_packet->skb = NULL; batadv_nc_packet_free(nc_packet); } @@ -571,7 +590,7 @@ static void batadv_nc_send_packet(struct batadv_nc_packet *nc_packet) * timeout. If so, the packet is no longer kept and the entry deleted from the * queue. Has to be called with the appropriate locks. * - * Returns false as soon as the entry in the fifo queue has not been timed out + * Return: false as soon as the entry in the fifo queue has not been timed out * yet and true otherwise. */ static bool batadv_nc_sniffed_purge(struct batadv_priv *bat_priv, @@ -610,7 +629,7 @@ out: * packet is no longer delayed, immediately sent and the entry deleted from the * queue. Has to be called with the appropriate locks. * - * Returns false as soon as the entry in the fifo queue has not been timed out + * Return: false as soon as the entry in the fifo queue has not been timed out * yet and true otherwise. */ static bool batadv_nc_fwd_flush(struct batadv_priv *bat_priv, @@ -731,7 +750,7 @@ static void batadv_nc_worker(struct work_struct *work) * @orig_node: neighboring orig node which may be used as nc candidate * @ogm_packet: incoming ogm packet also used for the checks * - * Returns true if: + * Return: true if: * 1) The OGM must have the most recent sequence number. * 2) The TTL must be decremented by one and only one. * 3) The OGM must be received from the first hop from orig_node. @@ -751,7 +770,7 @@ static bool batadv_can_nc_with_orig(struct batadv_priv *bat_priv, last_ttl = orig_ifinfo->last_ttl; last_real_seqno = orig_ifinfo->last_real_seqno; - batadv_orig_ifinfo_free_ref(orig_ifinfo); + batadv_orig_ifinfo_put(orig_ifinfo); if (last_real_seqno != ntohl(ogm_packet->seqno)) return false; @@ -772,7 +791,7 @@ static bool batadv_can_nc_with_orig(struct batadv_priv *bat_priv, * (can be equal to orig_node) * @in_coding: traverse incoming or outgoing network coding list * - * Returns the nc_node if found, NULL otherwise. + * Return: the nc_node if found, NULL otherwise. */ static struct batadv_nc_node *batadv_nc_find_nc_node(struct batadv_orig_node *orig_node, @@ -793,7 +812,7 @@ static struct batadv_nc_node if (!batadv_compare_eth(nc_node->addr, orig_node->orig)) continue; - if (!atomic_inc_not_zero(&nc_node->refcount)) + if (!kref_get_unless_zero(&nc_node->refcount)) continue; /* Found a match */ @@ -814,7 +833,7 @@ static struct batadv_nc_node * (can be equal to orig_node) * @in_coding: traverse incoming or outgoing network coding list * - * Returns the nc_node if found or created, NULL in case of an error. + * Return: the nc_node if found or created, NULL in case of an error. */ static struct batadv_nc_node *batadv_nc_get_nc_node(struct batadv_priv *bat_priv, @@ -837,14 +856,15 @@ static struct batadv_nc_node if (!nc_node) return NULL; - if (!atomic_inc_not_zero(&orig_neigh_node->refcount)) + if (!kref_get_unless_zero(&orig_neigh_node->refcount)) goto free; /* Initialize nc_node */ INIT_LIST_HEAD(&nc_node->list); ether_addr_copy(nc_node->addr, orig_node->orig); nc_node->orig_node = orig_neigh_node; - atomic_set(&nc_node->refcount, 2); + kref_init(&nc_node->refcount); + kref_get(&nc_node->refcount); /* Select ingoing or outgoing coding node */ if (in_coding) { @@ -920,9 +940,9 @@ void batadv_nc_update_nc_node(struct batadv_priv *bat_priv, out: if (in_nc_node) - batadv_nc_node_free_ref(in_nc_node); + batadv_nc_node_put(in_nc_node); if (out_nc_node) - batadv_nc_node_free_ref(out_nc_node); + batadv_nc_node_put(out_nc_node); } /** @@ -932,7 +952,7 @@ out: * @src: ethernet source address - first half of the nc path search key * @dst: ethernet destination address - second half of the nc path search key * - * Returns pointer to nc_path if the path was found or created, returns NULL + * Return: pointer to nc_path if the path was found or created, returns NULL * on error. */ static struct batadv_nc_path *batadv_nc_get_path(struct batadv_priv *bat_priv, @@ -963,7 +983,8 @@ static struct batadv_nc_path *batadv_nc_get_path(struct batadv_priv *bat_priv, /* Initialize nc_path */ INIT_LIST_HEAD(&nc_path->packet_list); spin_lock_init(&nc_path->packet_list_lock); - atomic_set(&nc_path->refcount, 2); + kref_init(&nc_path->refcount); + kref_get(&nc_path->refcount); nc_path->last_valid = jiffies; ether_addr_copy(nc_path->next_hop, dst); ether_addr_copy(nc_path->prev_hop, src); @@ -989,6 +1010,8 @@ static struct batadv_nc_path *batadv_nc_get_path(struct batadv_priv *bat_priv, * batadv_nc_random_weight_tq - scale the receivers TQ-value to avoid unfair * selection of a receiver with slightly lower TQ than the other * @tq: to be weighted tq value + * + * Return: scaled tq value */ static u8 batadv_nc_random_weight_tq(u8 tq) { @@ -1029,7 +1052,7 @@ static void batadv_nc_memxor(char *dst, const char *src, unsigned int len) * @nc_packet: structure containing the packet to the skb can be coded with * @neigh_node: next hop to forward packet to * - * Returns true if both packets are consumed, false otherwise. + * Return: true if both packets are consumed, false otherwise. */ static bool batadv_nc_code_packets(struct batadv_priv *bat_priv, struct sk_buff *skb, @@ -1042,11 +1065,11 @@ static bool batadv_nc_code_packets(struct batadv_priv *bat_priv, struct batadv_unicast_packet *packet1; struct batadv_unicast_packet *packet2; struct batadv_coded_packet *coded_packet; - struct batadv_neigh_node *neigh_tmp, *router_neigh; - struct batadv_neigh_node *router_coding = NULL; + struct batadv_neigh_node *neigh_tmp, *router_neigh, *first_dest; + struct batadv_neigh_node *router_coding = NULL, *second_dest; struct batadv_neigh_ifinfo *router_neigh_ifinfo = NULL; struct batadv_neigh_ifinfo *router_coding_ifinfo = NULL; - u8 *first_source, *first_dest, *second_source, *second_dest; + u8 *first_source, *second_source; __be32 packet_id1, packet_id2; size_t count; bool res = false; @@ -1089,9 +1112,9 @@ static bool batadv_nc_code_packets(struct batadv_priv *bat_priv, */ if (tq_weighted_neigh >= tq_weighted_coding) { /* Destination from nc_packet is selected for MAC-header */ - first_dest = nc_packet->nc_path->next_hop; + first_dest = nc_packet->neigh_node; first_source = nc_packet->nc_path->prev_hop; - second_dest = neigh_node->addr; + second_dest = neigh_node; second_source = ethhdr->h_source; packet1 = (struct batadv_unicast_packet *)nc_packet->skb->data; packet2 = (struct batadv_unicast_packet *)skb->data; @@ -1100,9 +1123,9 @@ static bool batadv_nc_code_packets(struct batadv_priv *bat_priv, skb->data + sizeof(*packet2)); } else { /* Destination for skb is selected for MAC-header */ - first_dest = neigh_node->addr; + first_dest = neigh_node; first_source = ethhdr->h_source; - second_dest = nc_packet->nc_path->next_hop; + second_dest = nc_packet->neigh_node; second_source = nc_packet->nc_path->prev_hop; packet1 = (struct batadv_unicast_packet *)skb->data; packet2 = (struct batadv_unicast_packet *)nc_packet->skb->data; @@ -1144,7 +1167,7 @@ static bool batadv_nc_code_packets(struct batadv_priv *bat_priv, coded_packet->first_ttvn = packet1->ttvn; /* Info about second unicast packet */ - ether_addr_copy(coded_packet->second_dest, second_dest); + ether_addr_copy(coded_packet->second_dest, second_dest->addr); ether_addr_copy(coded_packet->second_source, second_source); ether_addr_copy(coded_packet->second_orig_dest, packet2->dest); coded_packet->second_crc = packet_id2; @@ -1199,17 +1222,17 @@ static bool batadv_nc_code_packets(struct batadv_priv *bat_priv, batadv_nc_packet_free(nc_packet); /* Send the coded packet and return true */ - batadv_send_skb_packet(skb_dest, neigh_node->if_incoming, first_dest); + batadv_send_unicast_skb(skb_dest, first_dest); res = true; out: if (router_neigh) - batadv_neigh_node_free_ref(router_neigh); + batadv_neigh_node_put(router_neigh); if (router_coding) - batadv_neigh_node_free_ref(router_coding); + batadv_neigh_node_put(router_coding); if (router_neigh_ifinfo) - batadv_neigh_ifinfo_free_ref(router_neigh_ifinfo); + batadv_neigh_ifinfo_put(router_neigh_ifinfo); if (router_coding_ifinfo) - batadv_neigh_ifinfo_free_ref(router_coding_ifinfo); + batadv_neigh_ifinfo_put(router_coding_ifinfo); return res; } @@ -1228,7 +1251,7 @@ out: * Since the source encoded the packet we can be certain it has all necessary * decode information. * - * Returns true if coding of a decoded packet is allowed. + * Return: true if coding of a decoded packet is allowed. */ static bool batadv_nc_skb_coding_possible(struct sk_buff *skb, u8 *dst, u8 *src) { @@ -1246,7 +1269,7 @@ static bool batadv_nc_skb_coding_possible(struct sk_buff *skb, u8 *dst, u8 *src) * @skb: data skb to forward * @eth_dst: next hop mac address of skb * - * Returns true if coding of a decoded skb is allowed. + * Return: true if coding of a decoded skb is allowed. */ static struct batadv_nc_packet * batadv_nc_path_search(struct batadv_priv *bat_priv, @@ -1314,7 +1337,7 @@ batadv_nc_path_search(struct batadv_priv *bat_priv, * @eth_src: source mac address of skb * @in_nc_node: pointer to skb next hop's neighbor nc node * - * Returns an nc packet if a suitable coding packet was found, NULL otherwise. + * Return: an nc packet if a suitable coding packet was found, NULL otherwise. */ static struct batadv_nc_packet * batadv_nc_skb_src_search(struct batadv_priv *bat_priv, @@ -1347,7 +1370,7 @@ batadv_nc_skb_src_search(struct batadv_priv *bat_priv, } rcu_read_unlock(); - batadv_orig_node_free_ref(orig_node); + batadv_orig_node_put(orig_node); return nc_packet; } @@ -1397,7 +1420,7 @@ static void batadv_nc_skb_store_before_coding(struct batadv_priv *bat_priv, * next hop that potentially sent a packet which our next hop also received * (overheard) and has stored for later decoding. * - * Returns true if the skb was consumed (encoded packet sent) or false otherwise + * Return: true if the skb was consumed (encoded packet sent) or false otherwise */ static bool batadv_nc_skb_dst_search(struct sk_buff *skb, struct batadv_neigh_node *neigh_node, @@ -1451,7 +1474,7 @@ static bool batadv_nc_skb_dst_search(struct sk_buff *skb, * @neigh_node: next hop to forward packet to * @packet_id: checksum to identify packet * - * Returns true if the packet was buffered or false in case of an error. + * Return: true if the packet was buffered or false in case of an error. */ static bool batadv_nc_skb_add_to_path(struct sk_buff *skb, struct batadv_nc_path *nc_path, @@ -1485,7 +1508,7 @@ static bool batadv_nc_skb_add_to_path(struct sk_buff *skb, * @skb: data skb to forward * @neigh_node: next hop to forward packet to * - * Returns true if the skb was consumed (encoded packet sent) or false otherwise + * Return: true if the skb was consumed (encoded packet sent) or false otherwise */ bool batadv_nc_skb_forward(struct sk_buff *skb, struct batadv_neigh_node *neigh_node) @@ -1530,7 +1553,7 @@ bool batadv_nc_skb_forward(struct sk_buff *skb, return true; free_nc_path: - batadv_nc_path_free_ref(nc_path); + batadv_nc_path_put(nc_path); out: /* Packet is not consumed */ return false; @@ -1592,7 +1615,7 @@ void batadv_nc_skb_store_for_decoding(struct batadv_priv *bat_priv, free_skb: kfree_skb(skb); free_nc_path: - batadv_nc_path_free_ref(nc_path); + batadv_nc_path_put(nc_path); out: return; } @@ -1624,7 +1647,7 @@ void batadv_nc_skb_store_sniffed_unicast(struct batadv_priv *bat_priv, * @skb: unicast skb to decode * @nc_packet: decode data needed to decode the skb * - * Returns pointer to decoded unicast packet if the packet was decoded or NULL + * Return: pointer to decoded unicast packet if the packet was decoded or NULL * in case of an error. */ static struct batadv_unicast_packet * @@ -1718,7 +1741,7 @@ batadv_nc_skb_decode_packet(struct batadv_priv *bat_priv, struct sk_buff *skb, * @ethhdr: pointer to the ethernet header inside the coded packet * @coded: coded packet we try to find decode data for * - * Returns pointer to nc packet if the needed data was found or NULL otherwise. + * Return: pointer to nc packet if the needed data was found or NULL otherwise. */ static struct batadv_nc_packet * batadv_nc_find_decoding_packet(struct batadv_priv *bat_priv, @@ -1781,6 +1804,9 @@ batadv_nc_find_decoding_packet(struct batadv_priv *bat_priv, * resulting unicast packet * @skb: incoming coded packet * @recv_if: pointer to interface this packet was received on + * + * Return: NET_RX_SUCCESS if the packet has been consumed or NET_RX_DROP + * otherwise. */ static int batadv_nc_recv_coded_packet(struct sk_buff *skb, struct batadv_hard_iface *recv_if) @@ -1865,6 +1891,8 @@ void batadv_nc_mesh_free(struct batadv_priv *bat_priv) * batadv_nc_nodes_seq_print_text - print the nc node information * @seq: seq file to print on * @offset: not used + * + * Return: always 0 */ int batadv_nc_nodes_seq_print_text(struct seq_file *seq, void *offset) { @@ -1920,13 +1948,15 @@ int batadv_nc_nodes_seq_print_text(struct seq_file *seq, void *offset) out: if (primary_if) - batadv_hardif_free_ref(primary_if); + batadv_hardif_put(primary_if); return 0; } /** * batadv_nc_init_debugfs - create nc folder and related files in debugfs * @bat_priv: the bat priv with all the soft interface information + * + * Return: 0 on success or negative error number in case of failure */ int batadv_nc_init_debugfs(struct batadv_priv *bat_priv) { diff --git a/net/batman-adv/network-coding.h b/net/batman-adv/network-coding.h index 8f6d4ad87..d6d7fb4ec 100644 --- a/net/batman-adv/network-coding.h +++ b/net/batman-adv/network-coding.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2012-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2012-2016 B.A.T.M.A.N. contributors: * * Martin Hundebøll, Jeppe Ledet-Pedersen * diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c index fe578f75c..c355a8247 100644 --- a/net/batman-adv/originator.c +++ b/net/batman-adv/originator.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2009-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2009-2016 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * @@ -18,11 +18,13 @@ #include "originator.h" #include "main.h" +#include <linux/atomic.h> #include <linux/errno.h> #include <linux/etherdevice.h> #include <linux/fs.h> #include <linux/jiffies.h> #include <linux/kernel.h> +#include <linux/kref.h> #include <linux/list.h> #include <linux/lockdep.h> #include <linux/netdevice.h> @@ -47,7 +49,13 @@ static struct lock_class_key batadv_orig_hash_lock_class_key; static void batadv_purge_orig(struct work_struct *work); -/* returns 1 if they are the same originator */ +/** + * batadv_compare_orig - comparing function used in the originator hash table + * @node: node in the local table + * @data2: second object to compare the node to + * + * Return: 1 if they are the same originator + */ int batadv_compare_orig(const struct hlist_node *node, const void *data2) { const void *data1 = container_of(node, struct batadv_orig_node, @@ -61,7 +69,7 @@ int batadv_compare_orig(const struct hlist_node *node, const void *data2) * @orig_node: the originator serving the VLAN * @vid: the VLAN identifier * - * Returns the vlan object identified by vid and belonging to orig_node or NULL + * Return: the vlan object identified by vid and belonging to orig_node or NULL * if it does not exist. */ struct batadv_orig_node_vlan * @@ -75,7 +83,7 @@ batadv_orig_node_vlan_get(struct batadv_orig_node *orig_node, if (tmp->vid != vid) continue; - if (!atomic_inc_not_zero(&tmp->refcount)) + if (!kref_get_unless_zero(&tmp->refcount)) continue; vlan = tmp; @@ -93,7 +101,7 @@ batadv_orig_node_vlan_get(struct batadv_orig_node *orig_node, * @orig_node: the originator serving the VLAN * @vid: the VLAN identifier * - * Returns NULL in case of failure or the vlan object identified by vid and + * Return: NULL in case of failure or the vlan object identified by vid and * belonging to orig_node otherwise. The object is created and added to the list * if it does not exist. * @@ -116,7 +124,8 @@ batadv_orig_node_vlan_new(struct batadv_orig_node *orig_node, if (!vlan) goto out; - atomic_set(&vlan->refcount, 2); + kref_init(&vlan->refcount); + kref_get(&vlan->refcount); vlan->vid = vid; hlist_add_head_rcu(&vlan->list, &orig_node->vlan_list); @@ -128,14 +137,27 @@ out: } /** - * batadv_orig_node_vlan_free_ref - decrement the refcounter and possibly free + * batadv_orig_node_vlan_release - release originator-vlan object from lists + * and queue for free after rcu grace period + * @ref: kref pointer of the originator-vlan object + */ +static void batadv_orig_node_vlan_release(struct kref *ref) +{ + struct batadv_orig_node_vlan *orig_vlan; + + orig_vlan = container_of(ref, struct batadv_orig_node_vlan, refcount); + + kfree_rcu(orig_vlan, rcu); +} + +/** + * batadv_orig_node_vlan_put - decrement the refcounter and possibly release * the originator-vlan object * @orig_vlan: the originator-vlan object to release */ -void batadv_orig_node_vlan_free_ref(struct batadv_orig_node_vlan *orig_vlan) +void batadv_orig_node_vlan_put(struct batadv_orig_node_vlan *orig_vlan) { - if (atomic_dec_and_test(&orig_vlan->refcount)) - kfree_rcu(orig_vlan, rcu); + kref_put(&orig_vlan->refcount, batadv_orig_node_vlan_release); } int batadv_originator_init(struct batadv_priv *bat_priv) @@ -165,99 +187,98 @@ err: /** * batadv_neigh_ifinfo_release - release neigh_ifinfo from lists and queue for * free after rcu grace period - * @neigh_ifinfo: the neigh_ifinfo object to release + * @ref: kref pointer of the neigh_ifinfo */ -static void -batadv_neigh_ifinfo_release(struct batadv_neigh_ifinfo *neigh_ifinfo) +static void batadv_neigh_ifinfo_release(struct kref *ref) { + struct batadv_neigh_ifinfo *neigh_ifinfo; + + neigh_ifinfo = container_of(ref, struct batadv_neigh_ifinfo, refcount); + if (neigh_ifinfo->if_outgoing != BATADV_IF_DEFAULT) - batadv_hardif_free_ref(neigh_ifinfo->if_outgoing); + batadv_hardif_put(neigh_ifinfo->if_outgoing); kfree_rcu(neigh_ifinfo, rcu); } /** - * batadv_neigh_ifinfo_free_ref - decrement the refcounter and possibly release + * batadv_neigh_ifinfo_put - decrement the refcounter and possibly release * the neigh_ifinfo * @neigh_ifinfo: the neigh_ifinfo object to release */ -void batadv_neigh_ifinfo_free_ref(struct batadv_neigh_ifinfo *neigh_ifinfo) +void batadv_neigh_ifinfo_put(struct batadv_neigh_ifinfo *neigh_ifinfo) { - if (atomic_dec_and_test(&neigh_ifinfo->refcount)) - batadv_neigh_ifinfo_release(neigh_ifinfo); + kref_put(&neigh_ifinfo->refcount, batadv_neigh_ifinfo_release); } /** * batadv_hardif_neigh_release - release hardif neigh node from lists and * queue for free after rcu grace period - * @hardif_neigh: hardif neigh neighbor to free + * @ref: kref pointer of the neigh_node */ -static void -batadv_hardif_neigh_release(struct batadv_hardif_neigh_node *hardif_neigh) +static void batadv_hardif_neigh_release(struct kref *ref) { + struct batadv_hardif_neigh_node *hardif_neigh; + + hardif_neigh = container_of(ref, struct batadv_hardif_neigh_node, + refcount); + spin_lock_bh(&hardif_neigh->if_incoming->neigh_list_lock); hlist_del_init_rcu(&hardif_neigh->list); spin_unlock_bh(&hardif_neigh->if_incoming->neigh_list_lock); - batadv_hardif_free_ref(hardif_neigh->if_incoming); + batadv_hardif_put(hardif_neigh->if_incoming); kfree_rcu(hardif_neigh, rcu); } /** - * batadv_hardif_neigh_free_ref - decrement the hardif neighbors refcounter + * batadv_hardif_neigh_put - decrement the hardif neighbors refcounter * and possibly release it * @hardif_neigh: hardif neigh neighbor to free */ -void batadv_hardif_neigh_free_ref(struct batadv_hardif_neigh_node *hardif_neigh) +void batadv_hardif_neigh_put(struct batadv_hardif_neigh_node *hardif_neigh) { - if (atomic_dec_and_test(&hardif_neigh->refcount)) - batadv_hardif_neigh_release(hardif_neigh); + kref_put(&hardif_neigh->refcount, batadv_hardif_neigh_release); } /** * batadv_neigh_node_release - release neigh_node from lists and queue for * free after rcu grace period - * @neigh_node: neigh neighbor to free + * @ref: kref pointer of the neigh_node */ -static void batadv_neigh_node_release(struct batadv_neigh_node *neigh_node) +static void batadv_neigh_node_release(struct kref *ref) { struct hlist_node *node_tmp; - struct batadv_hardif_neigh_node *hardif_neigh; + struct batadv_neigh_node *neigh_node; struct batadv_neigh_ifinfo *neigh_ifinfo; struct batadv_algo_ops *bao; + neigh_node = container_of(ref, struct batadv_neigh_node, refcount); bao = neigh_node->orig_node->bat_priv->bat_algo_ops; hlist_for_each_entry_safe(neigh_ifinfo, node_tmp, &neigh_node->ifinfo_list, list) { - batadv_neigh_ifinfo_free_ref(neigh_ifinfo); + batadv_neigh_ifinfo_put(neigh_ifinfo); } - hardif_neigh = batadv_hardif_neigh_get(neigh_node->if_incoming, - neigh_node->addr); - if (hardif_neigh) { - /* batadv_hardif_neigh_get() increases refcount too */ - batadv_hardif_neigh_free_ref(hardif_neigh); - batadv_hardif_neigh_free_ref(hardif_neigh); - } + batadv_hardif_neigh_put(neigh_node->hardif_neigh); if (bao->bat_neigh_free) bao->bat_neigh_free(neigh_node); - batadv_hardif_free_ref(neigh_node->if_incoming); + batadv_hardif_put(neigh_node->if_incoming); kfree_rcu(neigh_node, rcu); } /** - * batadv_neigh_node_free_ref - decrement the neighbors refcounter - * and possibly release it + * batadv_neigh_node_put - decrement the neighbors refcounter and possibly + * release it * @neigh_node: neigh neighbor to free */ -void batadv_neigh_node_free_ref(struct batadv_neigh_node *neigh_node) +void batadv_neigh_node_put(struct batadv_neigh_node *neigh_node) { - if (atomic_dec_and_test(&neigh_node->refcount)) - batadv_neigh_node_release(neigh_node); + kref_put(&neigh_node->refcount, batadv_neigh_node_release); } /** @@ -266,7 +287,7 @@ void batadv_neigh_node_free_ref(struct batadv_neigh_node *neigh_node) * @if_outgoing: the interface where the payload packet has been received or * the OGM should be sent to * - * Returns the neighbor which should be router for this orig_node/iface. + * Return: the neighbor which should be router for this orig_node/iface. * * The object is returned with refcounter increased by 1. */ @@ -286,7 +307,7 @@ batadv_orig_router_get(struct batadv_orig_node *orig_node, break; } - if (router && !atomic_inc_not_zero(&router->refcount)) + if (router && !kref_get_unless_zero(&router->refcount)) router = NULL; rcu_read_unlock(); @@ -298,7 +319,7 @@ batadv_orig_router_get(struct batadv_orig_node *orig_node, * @orig_node: the orig node to be queried * @if_outgoing: the interface for which the ifinfo should be acquired * - * Returns the requested orig_ifinfo or NULL if not found. + * Return: the requested orig_ifinfo or NULL if not found. * * The object is returned with refcounter increased by 1. */ @@ -314,7 +335,7 @@ batadv_orig_ifinfo_get(struct batadv_orig_node *orig_node, if (tmp->if_outgoing != if_outgoing) continue; - if (!atomic_inc_not_zero(&tmp->refcount)) + if (!kref_get_unless_zero(&tmp->refcount)) continue; orig_ifinfo = tmp; @@ -330,7 +351,7 @@ batadv_orig_ifinfo_get(struct batadv_orig_node *orig_node, * @orig_node: the orig node to be queried * @if_outgoing: the interface for which the ifinfo should be acquired * - * Returns NULL in case of failure or the orig_ifinfo object for the if_outgoing + * Return: NULL in case of failure or the orig_ifinfo object for the if_outgoing * interface otherwise. The object is created and added to the list * if it does not exist. * @@ -354,7 +375,7 @@ batadv_orig_ifinfo_new(struct batadv_orig_node *orig_node, goto out; if (if_outgoing != BATADV_IF_DEFAULT && - !atomic_inc_not_zero(&if_outgoing->refcount)) { + !kref_get_unless_zero(&if_outgoing->refcount)) { kfree(orig_ifinfo); orig_ifinfo = NULL; goto out; @@ -365,7 +386,8 @@ batadv_orig_ifinfo_new(struct batadv_orig_node *orig_node, orig_ifinfo->batman_seqno_reset = reset_time; orig_ifinfo->if_outgoing = if_outgoing; INIT_HLIST_NODE(&orig_ifinfo->list); - atomic_set(&orig_ifinfo->refcount, 2); + kref_init(&orig_ifinfo->refcount); + kref_get(&orig_ifinfo->refcount); hlist_add_head_rcu(&orig_ifinfo->list, &orig_node->ifinfo_list); out: @@ -375,12 +397,12 @@ out: /** * batadv_neigh_ifinfo_get - find the ifinfo from an neigh_node - * @neigh_node: the neigh node to be queried + * @neigh: the neigh node to be queried * @if_outgoing: the interface for which the ifinfo should be acquired * * The object is returned with refcounter increased by 1. * - * Returns the requested neigh_ifinfo or NULL if not found + * Return: the requested neigh_ifinfo or NULL if not found */ struct batadv_neigh_ifinfo * batadv_neigh_ifinfo_get(struct batadv_neigh_node *neigh, @@ -395,7 +417,7 @@ batadv_neigh_ifinfo_get(struct batadv_neigh_node *neigh, if (tmp_neigh_ifinfo->if_outgoing != if_outgoing) continue; - if (!atomic_inc_not_zero(&tmp_neigh_ifinfo->refcount)) + if (!kref_get_unless_zero(&tmp_neigh_ifinfo->refcount)) continue; neigh_ifinfo = tmp_neigh_ifinfo; @@ -408,10 +430,10 @@ batadv_neigh_ifinfo_get(struct batadv_neigh_node *neigh, /** * batadv_neigh_ifinfo_new - search and possibly create an neigh_ifinfo object - * @neigh_node: the neigh node to be queried + * @neigh: the neigh node to be queried * @if_outgoing: the interface for which the ifinfo should be acquired * - * Returns NULL in case of failure or the neigh_ifinfo object for the + * Return: NULL in case of failure or the neigh_ifinfo object for the * if_outgoing interface otherwise. The object is created and added to the list * if it does not exist. * @@ -433,14 +455,15 @@ batadv_neigh_ifinfo_new(struct batadv_neigh_node *neigh, if (!neigh_ifinfo) goto out; - if (if_outgoing && !atomic_inc_not_zero(&if_outgoing->refcount)) { + if (if_outgoing && !kref_get_unless_zero(&if_outgoing->refcount)) { kfree(neigh_ifinfo); neigh_ifinfo = NULL; goto out; } INIT_HLIST_NODE(&neigh_ifinfo->list); - atomic_set(&neigh_ifinfo->refcount, 2); + kref_init(&neigh_ifinfo->refcount); + kref_get(&neigh_ifinfo->refcount); neigh_ifinfo->if_outgoing = if_outgoing; hlist_add_head_rcu(&neigh_ifinfo->list, &neigh->ifinfo_list); @@ -459,7 +482,8 @@ out: * * Looks for and possibly returns a neighbour belonging to this originator list * which is connected through the provided hard interface. - * Returns NULL if the neighbour is not found. + * + * Return: neighbor when found. Othwerwise NULL */ static struct batadv_neigh_node * batadv_neigh_node_get(const struct batadv_orig_node *orig_node, @@ -476,7 +500,7 @@ batadv_neigh_node_get(const struct batadv_orig_node *orig_node, if (tmp_neigh_node->if_incoming != hard_iface) continue; - if (!atomic_inc_not_zero(&tmp_neigh_node->refcount)) + if (!kref_get_unless_zero(&tmp_neigh_node->refcount)) continue; res = tmp_neigh_node; @@ -492,7 +516,7 @@ batadv_neigh_node_get(const struct batadv_orig_node *orig_node, * @hard_iface: the interface this neighbour is connected to * @neigh_addr: the interface address of the neighbour to retrieve * - * Returns the hardif neighbour node if found or created or NULL otherwise. + * Return: the hardif neighbour node if found or created or NULL otherwise. */ static struct batadv_hardif_neigh_node * batadv_hardif_neigh_create(struct batadv_hard_iface *hard_iface, @@ -508,12 +532,12 @@ batadv_hardif_neigh_create(struct batadv_hard_iface *hard_iface, if (hardif_neigh) goto out; - if (!atomic_inc_not_zero(&hard_iface->refcount)) + if (!kref_get_unless_zero(&hard_iface->refcount)) goto out; hardif_neigh = kzalloc(sizeof(*hardif_neigh), GFP_ATOMIC); if (!hardif_neigh) { - batadv_hardif_free_ref(hard_iface); + batadv_hardif_put(hard_iface); goto out; } @@ -522,7 +546,7 @@ batadv_hardif_neigh_create(struct batadv_hard_iface *hard_iface, hardif_neigh->if_incoming = hard_iface; hardif_neigh->last_seen = jiffies; - atomic_set(&hardif_neigh->refcount, 1); + kref_init(&hardif_neigh->refcount); if (bat_priv->bat_algo_ops->bat_hardif_neigh_init) bat_priv->bat_algo_ops->bat_hardif_neigh_init(hardif_neigh); @@ -540,7 +564,7 @@ out: * @hard_iface: the interface this neighbour is connected to * @neigh_addr: the interface address of the neighbour to retrieve * - * Returns the hardif neighbour node if found or created or NULL otherwise. + * Return: the hardif neighbour node if found or created or NULL otherwise. */ static struct batadv_hardif_neigh_node * batadv_hardif_neigh_get_or_create(struct batadv_hard_iface *hard_iface, @@ -562,7 +586,8 @@ batadv_hardif_neigh_get_or_create(struct batadv_hard_iface *hard_iface, * @neigh_addr: the address of the neighbour * * Looks for and possibly returns a neighbour belonging to this hard interface. - * Returns NULL if the neighbour is not found. + * + * Return: neighbor when found. Othwerwise NULL */ struct batadv_hardif_neigh_node * batadv_hardif_neigh_get(const struct batadv_hard_iface *hard_iface, @@ -576,7 +601,7 @@ batadv_hardif_neigh_get(const struct batadv_hard_iface *hard_iface, if (!batadv_compare_eth(tmp_hardif_neigh->addr, neigh_addr)) continue; - if (!atomic_inc_not_zero(&tmp_hardif_neigh->refcount)) + if (!kref_get_unless_zero(&tmp_hardif_neigh->refcount)) continue; hardif_neigh = tmp_hardif_neigh; @@ -594,7 +619,8 @@ batadv_hardif_neigh_get(const struct batadv_hard_iface *hard_iface, * @neigh_addr: the mac address of the neighbour interface * * Allocates a new neigh_node object and initialises all the generic fields. - * Returns the new object or NULL on failure. + * + * Return: neighbor when found. Othwerwise NULL */ struct batadv_neigh_node * batadv_neigh_node_new(struct batadv_orig_node *orig_node, @@ -617,7 +643,7 @@ batadv_neigh_node_new(struct batadv_orig_node *orig_node, if (!neigh_node) goto out; - if (!atomic_inc_not_zero(&hard_iface->refcount)) { + if (!kref_get_unless_zero(&hard_iface->refcount)) { kfree(neigh_node); neigh_node = NULL; goto out; @@ -630,24 +656,27 @@ batadv_neigh_node_new(struct batadv_orig_node *orig_node, ether_addr_copy(neigh_node->addr, neigh_addr); neigh_node->if_incoming = hard_iface; neigh_node->orig_node = orig_node; + neigh_node->last_seen = jiffies; + + /* increment unique neighbor refcount */ + kref_get(&hardif_neigh->refcount); + neigh_node->hardif_neigh = hardif_neigh; /* extra reference for return */ - atomic_set(&neigh_node->refcount, 2); + kref_init(&neigh_node->refcount); + kref_get(&neigh_node->refcount); spin_lock_bh(&orig_node->neigh_list_lock); hlist_add_head_rcu(&neigh_node->list, &orig_node->neigh_list); spin_unlock_bh(&orig_node->neigh_list_lock); - /* increment unique neighbor refcount */ - atomic_inc(&hardif_neigh->refcount); - batadv_dbg(BATADV_DBG_BATMAN, orig_node->bat_priv, "Creating new neighbor %pM for orig_node %pM on interface %s\n", neigh_addr, orig_node->orig, hard_iface->net_dev->name); out: if (hardif_neigh) - batadv_hardif_neigh_free_ref(hardif_neigh); + batadv_hardif_neigh_put(hardif_neigh); return neigh_node; } @@ -656,7 +685,7 @@ out: * @seq: neighbour table seq_file struct * @offset: not used * - * Always returns 0. + * Return: always 0 */ int batadv_hardif_neigh_seq_print_text(struct seq_file *seq, void *offset) { @@ -673,7 +702,7 @@ int batadv_hardif_neigh_seq_print_text(struct seq_file *seq, void *offset) primary_if->net_dev->dev_addr, net_dev->name, bat_priv->bat_algo_ops->name); - batadv_hardif_free_ref(primary_if); + batadv_hardif_put(primary_if); if (!bat_priv->bat_algo_ops->bat_neigh_print) { seq_puts(seq, @@ -688,32 +717,34 @@ int batadv_hardif_neigh_seq_print_text(struct seq_file *seq, void *offset) /** * batadv_orig_ifinfo_release - release orig_ifinfo from lists and queue for * free after rcu grace period - * @orig_ifinfo: the orig_ifinfo object to release + * @ref: kref pointer of the orig_ifinfo */ -static void batadv_orig_ifinfo_release(struct batadv_orig_ifinfo *orig_ifinfo) +static void batadv_orig_ifinfo_release(struct kref *ref) { + struct batadv_orig_ifinfo *orig_ifinfo; struct batadv_neigh_node *router; + orig_ifinfo = container_of(ref, struct batadv_orig_ifinfo, refcount); + if (orig_ifinfo->if_outgoing != BATADV_IF_DEFAULT) - batadv_hardif_free_ref(orig_ifinfo->if_outgoing); + batadv_hardif_put(orig_ifinfo->if_outgoing); /* this is the last reference to this object */ router = rcu_dereference_protected(orig_ifinfo->router, true); if (router) - batadv_neigh_node_free_ref(router); + batadv_neigh_node_put(router); kfree_rcu(orig_ifinfo, rcu); } /** - * batadv_orig_ifinfo_free_ref - decrement the refcounter and possibly release + * batadv_orig_ifinfo_put - decrement the refcounter and possibly release * the orig_ifinfo * @orig_ifinfo: the orig_ifinfo object to release */ -void batadv_orig_ifinfo_free_ref(struct batadv_orig_ifinfo *orig_ifinfo) +void batadv_orig_ifinfo_put(struct batadv_orig_ifinfo *orig_ifinfo) { - if (atomic_dec_and_test(&orig_ifinfo->refcount)) - batadv_orig_ifinfo_release(orig_ifinfo); + kref_put(&orig_ifinfo->refcount, batadv_orig_ifinfo_release); } /** @@ -740,27 +771,30 @@ static void batadv_orig_node_free_rcu(struct rcu_head *rcu) /** * batadv_orig_node_release - release orig_node from lists and queue for * free after rcu grace period - * @orig_node: the orig node to free + * @ref: kref pointer of the orig_node */ -static void batadv_orig_node_release(struct batadv_orig_node *orig_node) +static void batadv_orig_node_release(struct kref *ref) { struct hlist_node *node_tmp; struct batadv_neigh_node *neigh_node; + struct batadv_orig_node *orig_node; struct batadv_orig_ifinfo *orig_ifinfo; + orig_node = container_of(ref, struct batadv_orig_node, refcount); + spin_lock_bh(&orig_node->neigh_list_lock); /* for all neighbors towards this originator ... */ hlist_for_each_entry_safe(neigh_node, node_tmp, &orig_node->neigh_list, list) { hlist_del_rcu(&neigh_node->list); - batadv_neigh_node_free_ref(neigh_node); + batadv_neigh_node_put(neigh_node); } hlist_for_each_entry_safe(orig_ifinfo, node_tmp, &orig_node->ifinfo_list, list) { hlist_del_rcu(&orig_ifinfo->list); - batadv_orig_ifinfo_free_ref(orig_ifinfo); + batadv_orig_ifinfo_put(orig_ifinfo); } spin_unlock_bh(&orig_node->neigh_list_lock); @@ -771,14 +805,13 @@ static void batadv_orig_node_release(struct batadv_orig_node *orig_node) } /** - * batadv_orig_node_free_ref - decrement the orig node refcounter and possibly + * batadv_orig_node_put - decrement the orig node refcounter and possibly * release it * @orig_node: the orig node to free */ -void batadv_orig_node_free_ref(struct batadv_orig_node *orig_node) +void batadv_orig_node_put(struct batadv_orig_node *orig_node) { - if (atomic_dec_and_test(&orig_node->refcount)) - batadv_orig_node_release(orig_node); + kref_put(&orig_node->refcount, batadv_orig_node_release); } void batadv_originator_free(struct batadv_priv *bat_priv) @@ -805,7 +838,7 @@ void batadv_originator_free(struct batadv_priv *bat_priv) hlist_for_each_entry_safe(orig_node, node_tmp, head, hash_entry) { hlist_del_rcu(&orig_node->hash_entry); - batadv_orig_node_free_ref(orig_node); + batadv_orig_node_put(orig_node); } spin_unlock_bh(list_lock); } @@ -820,7 +853,8 @@ void batadv_originator_free(struct batadv_priv *bat_priv) * * Creates a new originator object and initialise all the generic fields. * The new object is not added to the originator list. - * Returns the newly created object or NULL on failure. + * + * Return: the newly created object or NULL on failure. */ struct batadv_orig_node *batadv_orig_node_new(struct batadv_priv *bat_priv, const u8 *addr) @@ -849,7 +883,8 @@ struct batadv_orig_node *batadv_orig_node_new(struct batadv_priv *bat_priv, batadv_nc_init_orig(orig_node); /* extra reference for return */ - atomic_set(&orig_node->refcount, 2); + kref_init(&orig_node->refcount); + kref_get(&orig_node->refcount); orig_node->bat_priv = bat_priv; ether_addr_copy(orig_node->orig, addr); @@ -877,7 +912,7 @@ struct batadv_orig_node *batadv_orig_node_new(struct batadv_priv *bat_priv, * Immediately release vlan since it is not needed anymore in this * context */ - batadv_orig_node_vlan_free_ref(vlan); + batadv_orig_node_vlan_put(vlan); for (i = 0; i < BATADV_FRAG_BUFFER_COUNT; i++) { INIT_HLIST_HEAD(&orig_node->fragments[i].head); @@ -926,7 +961,7 @@ batadv_purge_neigh_ifinfo(struct batadv_priv *bat_priv, neigh->addr, if_outgoing->net_dev->name); hlist_del_rcu(&neigh_ifinfo->list); - batadv_neigh_ifinfo_free_ref(neigh_ifinfo); + batadv_neigh_ifinfo_put(neigh_ifinfo); } spin_unlock_bh(&neigh->ifinfo_lock); @@ -937,7 +972,7 @@ batadv_purge_neigh_ifinfo(struct batadv_priv *bat_priv, * @bat_priv: the bat priv with all the soft interface information * @orig_node: orig node which is to be checked * - * Returns true if any ifinfo entry was purged, false otherwise. + * Return: true if any ifinfo entry was purged, false otherwise. */ static bool batadv_purge_orig_ifinfo(struct batadv_priv *bat_priv, @@ -972,10 +1007,10 @@ batadv_purge_orig_ifinfo(struct batadv_priv *bat_priv, ifinfo_purged = true; hlist_del_rcu(&orig_ifinfo->list); - batadv_orig_ifinfo_free_ref(orig_ifinfo); + batadv_orig_ifinfo_put(orig_ifinfo); if (orig_node->last_bonding_candidate == orig_ifinfo) { orig_node->last_bonding_candidate = NULL; - batadv_orig_ifinfo_free_ref(orig_ifinfo); + batadv_orig_ifinfo_put(orig_ifinfo); } } @@ -989,7 +1024,7 @@ batadv_purge_orig_ifinfo(struct batadv_priv *bat_priv, * @bat_priv: the bat priv with all the soft interface information * @orig_node: orig node which is to be checked * - * Returns true if any neighbor was purged, false otherwise + * Return: true if any neighbor was purged, false otherwise */ static bool batadv_purge_orig_neighbors(struct batadv_priv *bat_priv, @@ -1029,7 +1064,7 @@ batadv_purge_orig_neighbors(struct batadv_priv *bat_priv, neigh_purged = true; hlist_del_rcu(&neigh_node->list); - batadv_neigh_node_free_ref(neigh_node); + batadv_neigh_node_put(neigh_node); } else { /* only necessary if not the whole neighbor is to be * deleted, but some interface has been removed. @@ -1048,7 +1083,7 @@ batadv_purge_orig_neighbors(struct batadv_priv *bat_priv, * @orig_node: orig node which is to be checked * @if_outgoing: the interface for which the metric should be compared * - * Returns the current best neighbor, with refcount increased. + * Return: the current best neighbor, with refcount increased. */ static struct batadv_neigh_node * batadv_find_best_neighbor(struct batadv_priv *bat_priv, @@ -1064,11 +1099,11 @@ batadv_find_best_neighbor(struct batadv_priv *bat_priv, best, if_outgoing) <= 0)) continue; - if (!atomic_inc_not_zero(&neigh->refcount)) + if (!kref_get_unless_zero(&neigh->refcount)) continue; if (best) - batadv_neigh_node_free_ref(best); + batadv_neigh_node_put(best); best = neigh; } @@ -1085,7 +1120,7 @@ batadv_find_best_neighbor(struct batadv_priv *bat_priv, * This function checks if the orig_node or substructures of it have become * obsolete, and purges this information if that's the case. * - * Returns true if the orig_node is to be removed, false otherwise. + * Return: true if the orig_node is to be removed, false otherwise. */ static bool batadv_purge_orig_node(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node) @@ -1114,7 +1149,7 @@ static bool batadv_purge_orig_node(struct batadv_priv *bat_priv, batadv_update_route(bat_priv, orig_node, BATADV_IF_DEFAULT, best_neigh_node); if (best_neigh_node) - batadv_neigh_node_free_ref(best_neigh_node); + batadv_neigh_node_put(best_neigh_node); /* ... then for all other interfaces. */ rcu_read_lock(); @@ -1131,7 +1166,7 @@ static bool batadv_purge_orig_node(struct batadv_priv *bat_priv, batadv_update_route(bat_priv, orig_node, hard_iface, best_neigh_node); if (best_neigh_node) - batadv_neigh_node_free_ref(best_neigh_node); + batadv_neigh_node_put(best_neigh_node); } rcu_read_unlock(); @@ -1164,7 +1199,7 @@ static void _batadv_purge_orig(struct batadv_priv *bat_priv) batadv_tt_global_del_orig(orig_node->bat_priv, orig_node, -1, "originator timed out"); - batadv_orig_node_free_ref(orig_node); + batadv_orig_node_put(orig_node); continue; } @@ -1210,7 +1245,7 @@ int batadv_orig_seq_print_text(struct seq_file *seq, void *offset) primary_if->net_dev->dev_addr, net_dev->name, bat_priv->bat_algo_ops->name); - batadv_hardif_free_ref(primary_if); + batadv_hardif_put(primary_if); if (!bat_priv->bat_algo_ops->bat_orig_print) { seq_puts(seq, @@ -1230,7 +1265,7 @@ int batadv_orig_seq_print_text(struct seq_file *seq, void *offset) * @seq: debugfs table seq_file struct * @offset: not used * - * Returns 0 + * Return: 0 */ int batadv_orig_hardif_seq_print_text(struct seq_file *seq, void *offset) { @@ -1266,7 +1301,7 @@ int batadv_orig_hardif_seq_print_text(struct seq_file *seq, void *offset) out: if (hard_iface) - batadv_hardif_free_ref(hard_iface); + batadv_hardif_put(hard_iface); return 0; } diff --git a/net/batman-adv/originator.h b/net/batman-adv/originator.h index cf0730414..4e8b67f11 100644 --- a/net/batman-adv/originator.h +++ b/net/batman-adv/originator.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * @@ -20,10 +20,10 @@ #include "main.h" -#include <linux/atomic.h> #include <linux/compiler.h> #include <linux/if_ether.h> #include <linux/jhash.h> +#include <linux/kref.h> #include <linux/rculist.h> #include <linux/rcupdate.h> #include <linux/stddef.h> @@ -37,19 +37,19 @@ int batadv_compare_orig(const struct hlist_node *node, const void *data2); int batadv_originator_init(struct batadv_priv *bat_priv); void batadv_originator_free(struct batadv_priv *bat_priv); void batadv_purge_orig_ref(struct batadv_priv *bat_priv); -void batadv_orig_node_free_ref(struct batadv_orig_node *orig_node); +void batadv_orig_node_put(struct batadv_orig_node *orig_node); struct batadv_orig_node *batadv_orig_node_new(struct batadv_priv *bat_priv, const u8 *addr); struct batadv_hardif_neigh_node * batadv_hardif_neigh_get(const struct batadv_hard_iface *hard_iface, const u8 *neigh_addr); void -batadv_hardif_neigh_free_ref(struct batadv_hardif_neigh_node *hardif_neigh); +batadv_hardif_neigh_put(struct batadv_hardif_neigh_node *hardif_neigh); struct batadv_neigh_node * batadv_neigh_node_new(struct batadv_orig_node *orig_node, struct batadv_hard_iface *hard_iface, const u8 *neigh_addr); -void batadv_neigh_node_free_ref(struct batadv_neigh_node *neigh_node); +void batadv_neigh_node_put(struct batadv_neigh_node *neigh_node); struct batadv_neigh_node * batadv_orig_router_get(struct batadv_orig_node *orig_node, const struct batadv_hard_iface *if_outgoing); @@ -59,7 +59,7 @@ batadv_neigh_ifinfo_new(struct batadv_neigh_node *neigh, struct batadv_neigh_ifinfo * batadv_neigh_ifinfo_get(struct batadv_neigh_node *neigh, struct batadv_hard_iface *if_outgoing); -void batadv_neigh_ifinfo_free_ref(struct batadv_neigh_ifinfo *neigh_ifinfo); +void batadv_neigh_ifinfo_put(struct batadv_neigh_ifinfo *neigh_ifinfo); int batadv_hardif_neigh_seq_print_text(struct seq_file *seq, void *offset); @@ -69,7 +69,7 @@ batadv_orig_ifinfo_get(struct batadv_orig_node *orig_node, struct batadv_orig_ifinfo * batadv_orig_ifinfo_new(struct batadv_orig_node *orig_node, struct batadv_hard_iface *if_outgoing); -void batadv_orig_ifinfo_free_ref(struct batadv_orig_ifinfo *orig_ifinfo); +void batadv_orig_ifinfo_put(struct batadv_orig_ifinfo *orig_ifinfo); int batadv_orig_seq_print_text(struct seq_file *seq, void *offset); int batadv_orig_hardif_seq_print_text(struct seq_file *seq, void *offset); @@ -83,7 +83,7 @@ batadv_orig_node_vlan_new(struct batadv_orig_node *orig_node, struct batadv_orig_node_vlan * batadv_orig_node_vlan_get(struct batadv_orig_node *orig_node, unsigned short vid); -void batadv_orig_node_vlan_free_ref(struct batadv_orig_node_vlan *orig_vlan); +void batadv_orig_node_vlan_put(struct batadv_orig_node_vlan *orig_vlan); /* hashfunction to choose an entry in a hash table of given size * hash algorithm from http://en.wikipedia.org/wiki/Hash_table @@ -115,7 +115,7 @@ batadv_orig_hash_find(struct batadv_priv *bat_priv, const void *data) if (!batadv_compare_eth(orig_node, data)) continue; - if (!atomic_inc_not_zero(&orig_node->refcount)) + if (!kref_get_unless_zero(&orig_node->refcount)) continue; orig_node_tmp = orig_node; diff --git a/net/batman-adv/packet.h b/net/batman-adv/packet.h index 0558e3237..8a8d7ca1a 100644 --- a/net/batman-adv/packet.h +++ b/net/batman-adv/packet.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * @@ -26,6 +26,8 @@ * @BATADV_IV_OGM: originator messages for B.A.T.M.A.N. IV * @BATADV_BCAST: broadcast packets carrying broadcast payload * @BATADV_CODED: network coded packets + * @BATADV_ELP: echo location packets for B.A.T.M.A.N. V + * @BATADV_OGM2: originator messages for B.A.T.M.A.N. V * * @BATADV_UNICAST: unicast packets carrying unicast payload traffic * @BATADV_UNICAST_FRAG: unicast packets carrying a fragment of the original @@ -40,6 +42,8 @@ enum batadv_packettype { BATADV_IV_OGM = 0x00, BATADV_BCAST = 0x01, BATADV_CODED = 0x02, + BATADV_ELP = 0x03, + BATADV_OGM2 = 0x04, /* 0x40 - 0x7f: unicast */ #define BATADV_UNICAST_MIN 0x40 BATADV_UNICAST = 0x40, @@ -158,7 +162,7 @@ enum batadv_tt_client_flags { }; /** - * batadv_vlan_flags - flags for the four MSB of any vlan ID field + * enum batadv_vlan_flags - flags for the four MSB of any vlan ID field * @BATADV_VLAN_HAS_TAG: whether the field contains a valid vlan tag or not */ enum batadv_vlan_flags { @@ -209,6 +213,11 @@ struct batadv_bla_claim_dst { * @version: batman-adv protocol version, part of the genereal header * @ttl: time to live for this packet, part of the genereal header * @flags: contains routing relevant flags - see enum batadv_iv_flags + * @seqno: sequence identification + * @orig: address of the source node + * @prev_sender: address of the previous sender + * @reserved: reserved byte for alignment + * @tq: transmission quality * @tvlv_len: length of tvlv data following the ogm header */ struct batadv_ogm_packet { @@ -230,7 +239,52 @@ struct batadv_ogm_packet { #define BATADV_OGM_HLEN sizeof(struct batadv_ogm_packet) /** - * batadv_icmp_header - common members among all the ICMP packets + * struct batadv_ogm2_packet - ogm2 (routing protocol) packet + * @packet_type: batman-adv packet type, part of the general header + * @version: batman-adv protocol version, part of the general header + * @ttl: time to live for this packet, part of the general header + * @flags: reseved for routing relevant flags - currently always 0 + * @seqno: sequence number + * @orig: originator mac address + * @tvlv_len: length of the appended tvlv buffer (in bytes) + * @throughput: the currently flooded path throughput + */ +struct batadv_ogm2_packet { + u8 packet_type; + u8 version; + u8 ttl; + u8 flags; + __be32 seqno; + u8 orig[ETH_ALEN]; + __be16 tvlv_len; + __be32 throughput; + /* __packed is not needed as the struct size is divisible by 4, + * and the largest data type in this struct has a size of 4. + */ +}; + +#define BATADV_OGM2_HLEN sizeof(struct batadv_ogm2_packet) + +/** + * struct batadv_elp_packet - elp (neighbor discovery) packet + * @packet_type: batman-adv packet type, part of the general header + * @version: batman-adv protocol version, part of the genereal header + * @orig: originator mac address + * @seqno: sequence number + * @elp_interval: currently used ELP sending interval in ms + */ +struct batadv_elp_packet { + u8 packet_type; + u8 version; + u8 orig[ETH_ALEN]; + __be32 seqno; + __be32 elp_interval; +}; + +#define BATADV_ELP_HLEN sizeof(struct batadv_elp_packet) + +/** + * struct batadv_icmp_header - common members among all the ICMP packets * @packet_type: batman-adv packet type, part of the general header * @version: batman-adv protocol version, part of the genereal header * @ttl: time to live for this packet, part of the genereal header @@ -256,7 +310,7 @@ struct batadv_icmp_header { }; /** - * batadv_icmp_packet - ICMP packet + * struct batadv_icmp_packet - ICMP packet * @packet_type: batman-adv packet type, part of the general header * @version: batman-adv protocol version, part of the genereal header * @ttl: time to live for this packet, part of the genereal header @@ -282,7 +336,7 @@ struct batadv_icmp_packet { #define BATADV_RR_LEN 16 /** - * batadv_icmp_packet_rr - ICMP RouteRecord packet + * struct batadv_icmp_packet_rr - ICMP RouteRecord packet * @packet_type: batman-adv packet type, part of the general header * @version: batman-adv protocol version, part of the genereal header * @ttl: time to live for this packet, part of the genereal header @@ -345,6 +399,7 @@ struct batadv_unicast_packet { * @u: common unicast packet header * @src: address of the source * @subtype: packet subtype + * @reserved: reserved byte for alignment */ struct batadv_unicast_4addr_packet { struct batadv_unicast_packet u; @@ -413,7 +468,6 @@ struct batadv_bcast_packet { * @packet_type: batman-adv packet type, part of the general header * @version: batman-adv protocol version, part of the genereal header * @ttl: time to live for this packet, part of the genereal header - * @reserved: Align following fields to 2-byte boundaries * @first_source: original source of first included packet * @first_orig_dest: original destinal of first included packet * @first_crc: checksum of first included packet @@ -495,7 +549,7 @@ struct batadv_tvlv_gateway_data { * struct batadv_tvlv_tt_data - tt data propagated through the tt tvlv container * @flags: translation table flags (see batadv_tt_data_flags) * @ttvn: translation table version number - * @vlan_num: number of announced VLANs. In the TVLV this struct is followed by + * @num_vlan: number of announced VLANs. In the TVLV this struct is followed by * one batadv_tvlv_tt_vlan_data object per announced vlan */ struct batadv_tvlv_tt_data { diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c index 43d15d6c4..b781bf753 100644 --- a/net/batman-adv/routing.c +++ b/net/batman-adv/routing.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * @@ -25,6 +25,7 @@ #include <linux/etherdevice.h> #include <linux/if_ether.h> #include <linux/jiffies.h> +#include <linux/kref.h> #include <linux/netdevice.h> #include <linux/printk.h> #include <linux/rculist.h> @@ -72,7 +73,7 @@ static void _batadv_update_route(struct batadv_priv *bat_priv, rcu_read_lock(); curr_router = rcu_dereference(orig_ifinfo->router); - if (curr_router && !atomic_inc_not_zero(&curr_router->refcount)) + if (curr_router && !kref_get_unless_zero(&curr_router->refcount)) curr_router = NULL; rcu_read_unlock(); @@ -97,10 +98,10 @@ static void _batadv_update_route(struct batadv_priv *bat_priv, } if (curr_router) - batadv_neigh_node_free_ref(curr_router); + batadv_neigh_node_put(curr_router); /* increase refcount of new best neighbor */ - if (neigh_node && !atomic_inc_not_zero(&neigh_node->refcount)) + if (neigh_node && !kref_get_unless_zero(&neigh_node->refcount)) neigh_node = NULL; spin_lock_bh(&orig_node->neigh_list_lock); @@ -115,11 +116,11 @@ static void _batadv_update_route(struct batadv_priv *bat_priv, rcu_assign_pointer(orig_ifinfo->router, neigh_node); spin_unlock_bh(&orig_node->neigh_list_lock); - batadv_orig_ifinfo_free_ref(orig_ifinfo); + batadv_orig_ifinfo_put(orig_ifinfo); /* decrease refcount of previous best neighbor */ if (curr_router) - batadv_neigh_node_free_ref(curr_router); + batadv_neigh_node_put(curr_router); } /** @@ -146,24 +147,38 @@ void batadv_update_route(struct batadv_priv *bat_priv, out: if (router) - batadv_neigh_node_free_ref(router); + batadv_neigh_node_put(router); } -/* checks whether the host restarted and is in the protection time. - * returns: - * 0 if the packet is to be accepted +/** + * batadv_window_protected - checks whether the host restarted and is in the + * protection time. + * @bat_priv: the bat priv with all the soft interface information + * @seq_num_diff: difference between the current/received sequence number and + * the last sequence number + * @seq_old_max_diff: maximum age of sequence number not considered as restart + * @last_reset: jiffies timestamp of the last reset, will be updated when reset + * is detected + * @protection_started: is set to true if the protection window was started, + * doesn't change otherwise. + * + * Return: + * 0 if the packet is to be accepted. * 1 if the packet is to be ignored. */ int batadv_window_protected(struct batadv_priv *bat_priv, s32 seq_num_diff, - unsigned long *last_reset) + s32 seq_old_max_diff, unsigned long *last_reset, + bool *protection_started) { - if (seq_num_diff <= -BATADV_TQ_LOCAL_WINDOW_SIZE || + if (seq_num_diff <= -seq_old_max_diff || seq_num_diff >= BATADV_EXPECTED_SEQNO_RANGE) { if (!batadv_has_timed_out(*last_reset, BATADV_RESET_PROTECTION_MS)) return 1; *last_reset = jiffies; + if (protection_started) + *protection_started = true; batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "old packet received, start protection\n"); } @@ -207,7 +222,7 @@ bool batadv_check_management_packet(struct sk_buff *skb, * @bat_priv: the bat priv with all the soft interface information * @skb: icmp packet to process * - * Returns NET_RX_SUCCESS if the packet has been consumed or NET_RX_DROP + * Return: NET_RX_SUCCESS if the packet has been consumed or NET_RX_DROP * otherwise. */ static int batadv_recv_my_icmp_packet(struct batadv_priv *bat_priv, @@ -263,9 +278,9 @@ static int batadv_recv_my_icmp_packet(struct batadv_priv *bat_priv, } out: if (primary_if) - batadv_hardif_free_ref(primary_if); + batadv_hardif_put(primary_if); if (orig_node) - batadv_orig_node_free_ref(orig_node); + batadv_orig_node_put(orig_node); return ret; } @@ -311,9 +326,9 @@ static int batadv_recv_icmp_ttl_exceeded(struct batadv_priv *bat_priv, out: if (primary_if) - batadv_hardif_free_ref(primary_if); + batadv_hardif_put(primary_if); if (orig_node) - batadv_orig_node_free_ref(orig_node); + batadv_orig_node_put(orig_node); return ret; } @@ -397,7 +412,7 @@ int batadv_recv_icmp_packet(struct sk_buff *skb, out: if (orig_node) - batadv_orig_node_free_ref(orig_node); + batadv_orig_node_put(orig_node); return ret; } @@ -407,10 +422,11 @@ out: * @skb: packet to check * @hdr_size: size of header to pull * - * Check for short header and bad addresses in given packet. Returns negative - * value when check fails and 0 otherwise. The negative value depends on the - * reason: -ENODATA for bad header, -EBADR for broadcast destination or source, - * and -EREMOTE for non-local (other host) destination. + * Check for short header and bad addresses in given packet. + * + * Return: negative value when check fails and 0 otherwise. The negative value + * depends on the reason: -ENODATA for bad header, -EBADR for broadcast + * destination or source, and -EREMOTE for non-local (other host) destination. */ static int batadv_check_unicast_packet(struct batadv_priv *bat_priv, struct sk_buff *skb, int hdr_size) @@ -444,7 +460,7 @@ static int batadv_check_unicast_packet(struct batadv_priv *bat_priv, * @orig_node: the destination node * @recv_if: pointer to interface this packet was received on * - * Returns the router which should be used for this orig_node on + * Return: the router which should be used for this orig_node on * this interface, or NULL if not available. */ struct batadv_neigh_node * @@ -491,14 +507,14 @@ batadv_find_router(struct batadv_priv *bat_priv, hlist_for_each_entry_rcu(cand, &orig_node->ifinfo_list, list) { /* acquire some structures and references ... */ - if (!atomic_inc_not_zero(&cand->refcount)) + if (!kref_get_unless_zero(&cand->refcount)) continue; cand_router = rcu_dereference(cand->router); if (!cand_router) goto next; - if (!atomic_inc_not_zero(&cand_router->refcount)) { + if (!kref_get_unless_zero(&cand_router->refcount)) { cand_router = NULL; goto next; } @@ -517,8 +533,8 @@ batadv_find_router(struct batadv_priv *bat_priv, /* mark the first possible candidate */ if (!first_candidate) { - atomic_inc(&cand_router->refcount); - atomic_inc(&cand->refcount); + kref_get(&cand_router->refcount); + kref_get(&cand->refcount); first_candidate = cand; first_candidate_router = cand_router; } @@ -538,16 +554,16 @@ batadv_find_router(struct batadv_priv *bat_priv, next: /* free references */ if (cand_router) { - batadv_neigh_node_free_ref(cand_router); + batadv_neigh_node_put(cand_router); cand_router = NULL; } - batadv_orig_ifinfo_free_ref(cand); + batadv_orig_ifinfo_put(cand); } rcu_read_unlock(); /* last_bonding_candidate is reset below, remove the old reference. */ if (orig_node->last_bonding_candidate) - batadv_orig_ifinfo_free_ref(orig_node->last_bonding_candidate); + batadv_orig_ifinfo_put(orig_node->last_bonding_candidate); /* After finding candidates, handle the three cases: * 1) there is a next candidate, use that @@ -555,17 +571,17 @@ next: * 3) there is no candidate at all, return the default router */ if (next_candidate) { - batadv_neigh_node_free_ref(router); + batadv_neigh_node_put(router); /* remove references to first candidate, we don't need it. */ if (first_candidate) { - batadv_neigh_node_free_ref(first_candidate_router); - batadv_orig_ifinfo_free_ref(first_candidate); + batadv_neigh_node_put(first_candidate_router); + batadv_orig_ifinfo_put(first_candidate); } router = next_candidate_router; orig_node->last_bonding_candidate = next_candidate; } else if (first_candidate) { - batadv_neigh_node_free_ref(router); + batadv_neigh_node_put(router); /* refcounting has already been done in the loop above. */ router = first_candidate_router; @@ -642,7 +658,7 @@ static int batadv_route_unicast_packet(struct sk_buff *skb, out: if (orig_node) - batadv_orig_node_free_ref(orig_node); + batadv_orig_node_put(orig_node); return ret; } @@ -657,7 +673,7 @@ out: * the new corresponding information (originator address where the destination * client currently is and its known TTVN) * - * Returns true if the packet header has been updated, false otherwise + * Return: true if the packet header has been updated, false otherwise */ static bool batadv_reroute_unicast_packet(struct batadv_priv *bat_priv, @@ -695,9 +711,9 @@ batadv_reroute_unicast_packet(struct batadv_priv *bat_priv, ret = true; out: if (primary_if) - batadv_hardif_free_ref(primary_if); + batadv_hardif_put(primary_if); if (orig_node) - batadv_orig_node_free_ref(orig_node); + batadv_orig_node_put(orig_node); return ret; } @@ -761,7 +777,7 @@ static int batadv_check_unicast_ttvn(struct batadv_priv *bat_priv, return 0; curr_ttvn = (u8)atomic_read(&orig_node->last_ttvn); - batadv_orig_node_free_ref(orig_node); + batadv_orig_node_put(orig_node); } /* check if the TTVN contained in the packet is fresher than what the @@ -801,7 +817,7 @@ static int batadv_check_unicast_ttvn(struct batadv_priv *bat_priv, ether_addr_copy(unicast_packet->dest, primary_if->net_dev->dev_addr); - batadv_hardif_free_ref(primary_if); + batadv_hardif_put(primary_if); unicast_packet->ttvn = curr_ttvn; @@ -814,7 +830,7 @@ static int batadv_check_unicast_ttvn(struct batadv_priv *bat_priv, * @skb: unicast tvlv packet to process * @recv_if: pointer to interface this packet was received on * - * Returns NET_RX_SUCCESS if the packet has been consumed or NET_RX_DROP + * Return: NET_RX_SUCCESS if the packet has been consumed or NET_RX_DROP * otherwise. */ int batadv_recv_unhandled_unicast_packet(struct sk_buff *skb, @@ -901,7 +917,7 @@ int batadv_recv_unicast_packet(struct sk_buff *skb, rx_success: if (orig_node) - batadv_orig_node_free_ref(orig_node); + batadv_orig_node_put(orig_node); return NET_RX_SUCCESS; } @@ -913,9 +929,8 @@ rx_success: * batadv_recv_unicast_tvlv - receive and process unicast tvlv packets * @skb: unicast tvlv packet to process * @recv_if: pointer to interface this packet was received on - * @dst_addr: the payload destination * - * Returns NET_RX_SUCCESS if the packet has been consumed or NET_RX_DROP + * Return: NET_RX_SUCCESS if the packet has been consumed or NET_RX_DROP * otherwise. */ int batadv_recv_unicast_tvlv(struct sk_buff *skb, @@ -969,7 +984,7 @@ int batadv_recv_unicast_tvlv(struct sk_buff *skb, * the assembled packet will exceed our MTU; 2) Buffer fragment, if we till * lack further fragments; 3) Merge fragments, if we have all needed parts. * - * Return NET_RX_DROP if the skb is not consumed, NET_RX_SUCCESS otherwise. + * Return: NET_RX_DROP if the skb is not consumed, NET_RX_SUCCESS otherwise. */ int batadv_recv_frag_packet(struct sk_buff *skb, struct batadv_hard_iface *recv_if) @@ -1013,7 +1028,7 @@ int batadv_recv_frag_packet(struct sk_buff *skb, out: if (orig_node_src) - batadv_orig_node_free_ref(orig_node_src); + batadv_orig_node_put(orig_node_src); return ret; } @@ -1074,7 +1089,8 @@ int batadv_recv_bcast_packet(struct sk_buff *skb, /* check whether the packet is old and the host just restarted. */ if (batadv_window_protected(bat_priv, seq_diff, - &orig_node->bcast_seqno_reset)) + BATADV_BCAST_MAX_AGE, + &orig_node->bcast_seqno_reset, NULL)) goto spin_unlock; /* mark broadcast in flood history, update window position @@ -1117,6 +1133,6 @@ spin_unlock: spin_unlock_bh(&orig_node->bcast_seqno_lock); out: if (orig_node) - batadv_orig_node_free_ref(orig_node); + batadv_orig_node_put(orig_node); return ret; } diff --git a/net/batman-adv/routing.h b/net/batman-adv/routing.h index 204bbe495..02a5caa84 100644 --- a/net/batman-adv/routing.h +++ b/net/batman-adv/routing.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * @@ -52,6 +52,7 @@ batadv_find_router(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, struct batadv_hard_iface *recv_if); int batadv_window_protected(struct batadv_priv *bat_priv, s32 seq_num_diff, - unsigned long *last_reset); + s32 seq_old_max_diff, unsigned long *last_reset, + bool *protection_started); #endif /* _NET_BATMAN_ADV_ROUTING_H_ */ diff --git a/net/batman-adv/send.c b/net/batman-adv/send.c index 45bfdefa1..76417850d 100644 --- a/net/batman-adv/send.c +++ b/net/batman-adv/send.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * @@ -49,16 +49,30 @@ static void batadv_send_outstanding_bcast_packet(struct work_struct *work); -/* send out an already prepared packet to the given address via the - * specified batman interface +/** + * batadv_send_skb_packet - send an already prepared packet + * @skb: the packet to send + * @hard_iface: the interface to use to send the broadcast packet + * @dst_addr: the payload destination + * + * Send out an already prepared packet to the given neighbor or broadcast it + * using the specified interface. Either hard_iface or neigh_node must be not + * NULL. + * If neigh_node is NULL, then the packet is broadcasted using hard_iface, + * otherwise it is sent as unicast to the given neighbor. + * + * Return: NET_TX_DROP in case of error or the result of dev_queue_xmit(skb) + * otherwise */ int batadv_send_skb_packet(struct sk_buff *skb, struct batadv_hard_iface *hard_iface, const u8 *dst_addr) { - struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface); + struct batadv_priv *bat_priv; struct ethhdr *ethhdr; + bat_priv = netdev_priv(hard_iface->soft_iface); + if (hard_iface->if_status != BATADV_IF_ACTIVE) goto send_skb_err; @@ -100,6 +114,35 @@ send_skb_err: return NET_XMIT_DROP; } +int batadv_send_broadcast_skb(struct sk_buff *skb, + struct batadv_hard_iface *hard_iface) +{ + return batadv_send_skb_packet(skb, hard_iface, batadv_broadcast_addr); +} + +int batadv_send_unicast_skb(struct sk_buff *skb, + struct batadv_neigh_node *neigh) +{ +#ifdef CONFIG_BATMAN_ADV_BATMAN_V + struct batadv_hardif_neigh_node *hardif_neigh; +#endif + int ret; + + ret = batadv_send_skb_packet(skb, neigh->if_incoming, neigh->addr); + +#ifdef CONFIG_BATMAN_ADV_BATMAN_V + hardif_neigh = batadv_hardif_neigh_get(neigh->if_incoming, neigh->addr); + + if ((hardif_neigh) && (ret != NET_XMIT_DROP)) + hardif_neigh->bat_v.last_unicast_tx = jiffies; + + if (hardif_neigh) + batadv_hardif_neigh_put(hardif_neigh); +#endif + + return ret; +} + /** * batadv_send_skb_to_orig - Lookup next-hop and transmit skb. * @skb: Packet to be transmitted. @@ -111,7 +154,7 @@ send_skb_err: * host, NULL can be passed as recv_if and no interface alternating is * attempted. * - * Returns NET_XMIT_SUCCESS on success, NET_XMIT_DROP on failure, or + * Return: NET_XMIT_SUCCESS on success, NET_XMIT_DROP on failure, or * NET_XMIT_POLICED if the skb is buffered for later transmit. */ int batadv_send_skb_to_orig(struct sk_buff *skb, @@ -146,14 +189,13 @@ int batadv_send_skb_to_orig(struct sk_buff *skb, if (recv_if && batadv_nc_skb_forward(skb, neigh_node)) { ret = NET_XMIT_POLICED; } else { - batadv_send_skb_packet(skb, neigh_node->if_incoming, - neigh_node->addr); + batadv_send_unicast_skb(skb, neigh_node); ret = NET_XMIT_SUCCESS; } out: if (neigh_node) - batadv_neigh_node_free_ref(neigh_node); + batadv_neigh_node_put(neigh_node); return ret; } @@ -165,7 +207,7 @@ out: * @hdr_size: amount of bytes to push at the beginning of the skb * @orig_node: the destination node * - * Returns false if the buffer extension was not possible or true otherwise. + * Return: false if the buffer extension was not possible or true otherwise. */ static bool batadv_send_skb_push_fill_unicast(struct sk_buff *skb, int hdr_size, @@ -196,7 +238,7 @@ batadv_send_skb_push_fill_unicast(struct sk_buff *skb, int hdr_size, * @skb: the skb containing the payload to encapsulate * @orig_node: the destination node * - * Returns false if the payload could not be encapsulated or true otherwise. + * Return: false if the payload could not be encapsulated or true otherwise. */ static bool batadv_send_skb_prepare_unicast(struct sk_buff *skb, struct batadv_orig_node *orig_node) @@ -211,10 +253,10 @@ static bool batadv_send_skb_prepare_unicast(struct sk_buff *skb, * unicast 4addr header * @bat_priv: the bat priv with all the soft interface information * @skb: the skb containing the payload to encapsulate - * @orig_node: the destination node + * @orig: the destination node * @packet_subtype: the unicast 4addr packet subtype to use * - * Returns false if the payload could not be encapsulated or true otherwise. + * Return: false if the payload could not be encapsulated or true otherwise. */ bool batadv_send_skb_prepare_unicast_4addr(struct batadv_priv *bat_priv, struct sk_buff *skb, @@ -246,7 +288,7 @@ bool batadv_send_skb_prepare_unicast_4addr(struct batadv_priv *bat_priv, ret = true; out: if (primary_if) - batadv_hardif_free_ref(primary_if); + batadv_hardif_put(primary_if); return ret; } @@ -265,7 +307,7 @@ out: * as packet_type. Then send this frame to the given orig_node and release a * reference to this orig_node. * - * Returns NET_XMIT_DROP in case of error or NET_XMIT_SUCCESS otherwise. + * Return: NET_XMIT_DROP in case of error or NET_XMIT_SUCCESS otherwise. */ int batadv_send_skb_unicast(struct batadv_priv *bat_priv, struct sk_buff *skb, int packet_type, @@ -317,7 +359,7 @@ int batadv_send_skb_unicast(struct batadv_priv *bat_priv, out: if (orig_node) - batadv_orig_node_free_ref(orig_node); + batadv_orig_node_put(orig_node); if (ret == NET_XMIT_DROP) kfree_skb(skb); return ret; @@ -339,7 +381,7 @@ out: * BATADV_UNICAST_4ADDR was supplied as packet_type. Then send this frame * to the according destination node. * - * Returns NET_XMIT_DROP in case of error or NET_XMIT_SUCCESS otherwise. + * Return: NET_XMIT_DROP in case of error or NET_XMIT_SUCCESS otherwise. */ int batadv_send_skb_via_tt_generic(struct batadv_priv *bat_priv, struct sk_buff *skb, int packet_type, @@ -373,7 +415,7 @@ int batadv_send_skb_via_tt_generic(struct batadv_priv *bat_priv, * Look up the currently selected gateway. Wrap the given skb into a batman-adv * unicast header and send this frame to this gateway node. * - * Returns NET_XMIT_DROP in case of error or NET_XMIT_SUCCESS otherwise. + * Return: NET_XMIT_DROP in case of error or NET_XMIT_SUCCESS otherwise. */ int batadv_send_skb_via_gw(struct batadv_priv *bat_priv, struct sk_buff *skb, unsigned short vid) @@ -409,9 +451,9 @@ static void batadv_forw_packet_free(struct batadv_forw_packet *forw_packet) { kfree_skb(forw_packet->skb); if (forw_packet->if_incoming) - batadv_hardif_free_ref(forw_packet->if_incoming); + batadv_hardif_put(forw_packet->if_incoming); if (forw_packet->if_outgoing) - batadv_hardif_free_ref(forw_packet->if_outgoing); + batadv_hardif_put(forw_packet->if_outgoing); kfree(forw_packet); } @@ -430,14 +472,19 @@ _batadv_add_bcast_packet_to_list(struct batadv_priv *bat_priv, send_time); } -/* add a broadcast packet to the queue and setup timers. broadcast packets - * are sent multiple times to increase probability for being received. +/** + * batadv_add_bcast_packet_to_list - queue broadcast packet for multiple sends + * @bat_priv: the bat priv with all the soft interface information + * @skb: broadcast packet to add + * @delay: number of jiffies to wait before sending * - * This function returns NETDEV_TX_OK on success and NETDEV_TX_BUSY on - * errors. + * add a broadcast packet to the queue and setup timers. broadcast packets + * are sent multiple times to increase probability for being received. * * The skb is not consumed, so the caller should make sure that the * skb is freed. + * + * Return: NETDEV_TX_OK on success and NETDEV_TX_BUSY on errors. */ int batadv_add_bcast_packet_to_list(struct batadv_priv *bat_priv, const struct sk_buff *skb, @@ -492,7 +539,7 @@ out_and_inc: atomic_inc(&bat_priv->bcast_queue_left); out: if (primary_if) - batadv_hardif_free_ref(primary_if); + batadv_hardif_put(primary_if); return NETDEV_TX_BUSY; } @@ -533,8 +580,7 @@ static void batadv_send_outstanding_bcast_packet(struct work_struct *work) /* send a copy of the saved skb */ skb1 = skb_clone(forw_packet->skb, GFP_ATOMIC); if (skb1) - batadv_send_skb_packet(skb1, hard_iface, - batadv_broadcast_addr); + batadv_send_broadcast_skb(skb1, hard_iface); } rcu_read_unlock(); diff --git a/net/batman-adv/send.h b/net/batman-adv/send.h index 82059f259..6fd7270d8 100644 --- a/net/batman-adv/send.h +++ b/net/batman-adv/send.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * @@ -28,12 +28,16 @@ struct sk_buff; struct work_struct; -int batadv_send_skb_packet(struct sk_buff *skb, - struct batadv_hard_iface *hard_iface, - const u8 *dst_addr); int batadv_send_skb_to_orig(struct sk_buff *skb, struct batadv_orig_node *orig_node, struct batadv_hard_iface *recv_if); +int batadv_send_skb_packet(struct sk_buff *skb, + struct batadv_hard_iface *hard_iface, + const u8 *dst_addr); +int batadv_send_broadcast_skb(struct sk_buff *skb, + struct batadv_hard_iface *hard_iface); +int batadv_send_unicast_skb(struct sk_buff *skb, + struct batadv_neigh_node *neigh_node); void batadv_schedule_bat_ogm(struct batadv_hard_iface *hard_iface); int batadv_add_bcast_packet_to_list(struct batadv_priv *bat_priv, const struct sk_buff *skb, @@ -69,7 +73,7 @@ int batadv_send_skb_via_gw(struct batadv_priv *bat_priv, struct sk_buff *skb, * header via the translation table. Wrap the given skb into a batman-adv * unicast header. Then send this frame to the according destination node. * - * Returns NET_XMIT_DROP in case of error or NET_XMIT_SUCCESS otherwise. + * Return: NET_XMIT_DROP in case of error or NET_XMIT_SUCCESS otherwise. */ static inline int batadv_send_skb_via_tt(struct batadv_priv *bat_priv, struct sk_buff *skb, u8 *dst_hint, @@ -92,7 +96,7 @@ static inline int batadv_send_skb_via_tt(struct batadv_priv *bat_priv, * unicast-4addr header. Then send this frame to the according destination * node. * - * Returns NET_XMIT_DROP in case of error or NET_XMIT_SUCCESS otherwise. + * Return: NET_XMIT_DROP in case of error or NET_XMIT_SUCCESS otherwise. */ static inline int batadv_send_skb_via_tt_4addr(struct batadv_priv *bat_priv, struct sk_buff *skb, diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index 720f1a5b8..8a136b6a1 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * @@ -30,6 +30,7 @@ #include <linux/if_vlan.h> #include <linux/jiffies.h> #include <linux/kernel.h> +#include <linux/kref.h> #include <linux/list.h> #include <linux/lockdep.h> #include <linux/netdevice.h> @@ -376,7 +377,7 @@ dropped_freed: batadv_inc_counter(bat_priv, BATADV_CNT_TX_DROPPED); end: if (primary_if) - batadv_hardif_free_ref(primary_if); + batadv_hardif_put(primary_if); return NETDEV_TX_OK; } @@ -482,22 +483,34 @@ out: } /** - * batadv_softif_vlan_free_ref - decrease the vlan object refcounter and - * possibly free it - * @softif_vlan: the vlan object to release + * batadv_softif_vlan_release - release vlan from lists and queue for free after + * rcu grace period + * @ref: kref pointer of the vlan object */ -void batadv_softif_vlan_free_ref(struct batadv_softif_vlan *vlan) +static void batadv_softif_vlan_release(struct kref *ref) +{ + struct batadv_softif_vlan *vlan; + + vlan = container_of(ref, struct batadv_softif_vlan, refcount); + + spin_lock_bh(&vlan->bat_priv->softif_vlan_list_lock); + hlist_del_rcu(&vlan->list); + spin_unlock_bh(&vlan->bat_priv->softif_vlan_list_lock); + + kfree_rcu(vlan, rcu); +} + +/** + * batadv_softif_vlan_put - decrease the vlan object refcounter and + * possibly release it + * @vlan: the vlan object to release + */ +void batadv_softif_vlan_put(struct batadv_softif_vlan *vlan) { if (!vlan) return; - if (atomic_dec_and_test(&vlan->refcount)) { - spin_lock_bh(&vlan->bat_priv->softif_vlan_list_lock); - hlist_del_rcu(&vlan->list); - spin_unlock_bh(&vlan->bat_priv->softif_vlan_list_lock); - - kfree_rcu(vlan, rcu); - } + kref_put(&vlan->refcount, batadv_softif_vlan_release); } /** @@ -505,7 +518,7 @@ void batadv_softif_vlan_free_ref(struct batadv_softif_vlan *vlan) * @bat_priv: the bat priv with all the soft interface information * @vid: the identifier of the vlan object to retrieve * - * Returns the private data of the vlan matching the vid passed as argument or + * Return: the private data of the vlan matching the vid passed as argument or * NULL otherwise. The refcounter of the returned object is incremented by 1. */ struct batadv_softif_vlan *batadv_softif_vlan_get(struct batadv_priv *bat_priv, @@ -518,7 +531,7 @@ struct batadv_softif_vlan *batadv_softif_vlan_get(struct batadv_priv *bat_priv, if (vlan_tmp->vid != vid) continue; - if (!atomic_inc_not_zero(&vlan_tmp->refcount)) + if (!kref_get_unless_zero(&vlan_tmp->refcount)) continue; vlan = vlan_tmp; @@ -534,7 +547,7 @@ struct batadv_softif_vlan *batadv_softif_vlan_get(struct batadv_priv *bat_priv, * @bat_priv: the bat priv with all the soft interface information * @vid: the VLAN identifier * - * Returns 0 on success, a negative error otherwise. + * Return: 0 on success, a negative error otherwise. */ int batadv_softif_create_vlan(struct batadv_priv *bat_priv, unsigned short vid) { @@ -543,7 +556,7 @@ int batadv_softif_create_vlan(struct batadv_priv *bat_priv, unsigned short vid) vlan = batadv_softif_vlan_get(bat_priv, vid); if (vlan) { - batadv_softif_vlan_free_ref(vlan); + batadv_softif_vlan_put(vlan); return -EEXIST; } @@ -553,7 +566,7 @@ int batadv_softif_create_vlan(struct batadv_priv *bat_priv, unsigned short vid) vlan->bat_priv = bat_priv; vlan->vid = vid; - atomic_set(&vlan->refcount, 1); + kref_init(&vlan->refcount); atomic_set(&vlan->ap_isolation, 0); @@ -592,18 +605,19 @@ static void batadv_softif_destroy_vlan(struct batadv_priv *bat_priv, vlan->vid, "vlan interface destroyed", false); batadv_sysfs_del_vlan(bat_priv, vlan); - batadv_softif_vlan_free_ref(vlan); + batadv_softif_vlan_put(vlan); } /** * batadv_interface_add_vid - ndo_add_vid API implementation * @dev: the netdev of the mesh interface + * @proto: protocol of the the vlan id * @vid: identifier of the new vlan * * Set up all the internal structures for handling the new vlan on top of the * mesh interface * - * Returns 0 on success or a negative error code in case of failure. + * Return: 0 on success or a negative error code in case of failure. */ static int batadv_interface_add_vid(struct net_device *dev, __be16 proto, unsigned short vid) @@ -636,7 +650,7 @@ static int batadv_interface_add_vid(struct net_device *dev, __be16 proto, if (!vlan->kobj) { ret = batadv_sysfs_add_vlan(bat_priv->soft_iface, vlan); if (ret) { - batadv_softif_vlan_free_ref(vlan); + batadv_softif_vlan_put(vlan); return ret; } } @@ -655,12 +669,13 @@ static int batadv_interface_add_vid(struct net_device *dev, __be16 proto, /** * batadv_interface_kill_vid - ndo_kill_vid API implementation * @dev: the netdev of the mesh interface + * @proto: protocol of the the vlan id * @vid: identifier of the deleted vlan * * Destroy all the internal structures used to handle the vlan identified by vid * on top of the mesh interface * - * Returns 0 on success, -EINVAL if the specified prototype is not ETH_P_8021Q + * Return: 0 on success, -EINVAL if the specified prototype is not ETH_P_8021Q * or -ENOENT if the specified vlan id wasn't registered. */ static int batadv_interface_kill_vid(struct net_device *dev, __be16 proto, @@ -682,7 +697,7 @@ static int batadv_interface_kill_vid(struct net_device *dev, __be16 proto, batadv_softif_destroy_vlan(bat_priv, vlan); /* finally free the vlan object */ - batadv_softif_vlan_free_ref(vlan); + batadv_softif_vlan_put(vlan); return 0; } @@ -738,7 +753,7 @@ static void batadv_softif_destroy_finish(struct work_struct *work) vlan = batadv_softif_vlan_get(bat_priv, BATADV_NO_FLAGS); if (vlan) { batadv_softif_destroy_vlan(bat_priv, vlan); - batadv_softif_vlan_free_ref(vlan); + batadv_softif_vlan_put(vlan); } batadv_sysfs_del_meshif(soft_iface); @@ -749,7 +764,7 @@ static void batadv_softif_destroy_finish(struct work_struct *work) * batadv_softif_init_late - late stage initialization of soft interface * @dev: registered network device to modify * - * Returns error code on failures + * Return: error code on failures */ static int batadv_softif_init_late(struct net_device *dev) { @@ -851,7 +866,7 @@ free_bat_counters: * @dev: batadv_soft_interface used as master interface * @slave_dev: net_device which should become the slave interface * - * Return 0 if successful or error otherwise. + * Return: 0 if successful or error otherwise. */ static int batadv_softif_slave_add(struct net_device *dev, struct net_device *slave_dev) @@ -867,7 +882,7 @@ static int batadv_softif_slave_add(struct net_device *dev, out: if (hard_iface) - batadv_hardif_free_ref(hard_iface); + batadv_hardif_put(hard_iface); return ret; } @@ -876,7 +891,7 @@ out: * @dev: batadv_soft_interface used as master interface * @slave_dev: net_device which should be removed from the master interface * - * Return 0 if successful or error otherwise. + * Return: 0 if successful or error otherwise. */ static int batadv_softif_slave_del(struct net_device *dev, struct net_device *slave_dev) @@ -894,7 +909,7 @@ static int batadv_softif_slave_del(struct net_device *dev, out: if (hard_iface) - batadv_hardif_free_ref(hard_iface); + batadv_hardif_put(hard_iface); return ret; } diff --git a/net/batman-adv/soft-interface.h b/net/batman-adv/soft-interface.h index 8e82176f4..9ae265703 100644 --- a/net/batman-adv/soft-interface.h +++ b/net/batman-adv/soft-interface.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: * * Marek Lindner * @@ -34,7 +34,7 @@ void batadv_softif_destroy_sysfs(struct net_device *soft_iface); int batadv_softif_is_valid(const struct net_device *net_dev); extern struct rtnl_link_ops batadv_link_ops; int batadv_softif_create_vlan(struct batadv_priv *bat_priv, unsigned short vid); -void batadv_softif_vlan_free_ref(struct batadv_softif_vlan *softif_vlan); +void batadv_softif_vlan_put(struct batadv_softif_vlan *softif_vlan); struct batadv_softif_vlan *batadv_softif_vlan_get(struct batadv_priv *bat_priv, unsigned short vid); diff --git a/net/batman-adv/sysfs.c b/net/batman-adv/sysfs.c index fe87777fd..e7cf51333 100644 --- a/net/batman-adv/sysfs.c +++ b/net/batman-adv/sysfs.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2010-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2010-2016 B.A.T.M.A.N. contributors: * * Marek Lindner * @@ -25,6 +25,7 @@ #include <linux/fs.h> #include <linux/if.h> #include <linux/if_vlan.h> +#include <linux/kref.h> #include <linux/kernel.h> #include <linux/netdevice.h> #include <linux/printk.h> @@ -64,7 +65,7 @@ static struct batadv_priv *batadv_kobj_to_batpriv(struct kobject *obj) * batadv_vlan_kobj_to_batpriv - convert a vlan kobj in the associated batpriv * @obj: kobject to covert * - * Returns the associated batadv_priv struct. + * Return: the associated batadv_priv struct. */ static struct batadv_priv *batadv_vlan_kobj_to_batpriv(struct kobject *obj) { @@ -82,9 +83,10 @@ static struct batadv_priv *batadv_vlan_kobj_to_batpriv(struct kobject *obj) /** * batadv_kobj_to_vlan - convert a kobj in the associated softif_vlan struct + * @bat_priv: the bat priv with all the soft interface information * @obj: kobject to covert * - * Returns the associated softif_vlan struct if found, NULL otherwise. + * Return: the associated softif_vlan struct if found, NULL otherwise. */ static struct batadv_softif_vlan * batadv_kobj_to_vlan(struct batadv_priv *bat_priv, struct kobject *obj) @@ -96,7 +98,7 @@ batadv_kobj_to_vlan(struct batadv_priv *bat_priv, struct kobject *obj) if (vlan_tmp->kobj != obj) continue; - if (!atomic_inc_not_zero(&vlan_tmp->refcount)) + if (!kref_get_unless_zero(&vlan_tmp->refcount)) continue; vlan = vlan_tmp; @@ -214,7 +216,7 @@ ssize_t batadv_store_vlan_##_name(struct kobject *kobj, \ attr, &vlan->_name, \ bat_priv->soft_iface); \ \ - batadv_softif_vlan_free_ref(vlan); \ + batadv_softif_vlan_put(vlan); \ return res; \ } @@ -229,7 +231,7 @@ ssize_t batadv_show_vlan_##_name(struct kobject *kobj, \ atomic_read(&vlan->_name) == 0 ? \ "disabled" : "enabled"); \ \ - batadv_softif_vlan_free_ref(vlan); \ + batadv_softif_vlan_put(vlan); \ return res; \ } @@ -240,6 +242,55 @@ ssize_t batadv_show_vlan_##_name(struct kobject *kobj, \ static BATADV_ATTR_VLAN(_name, _mode, batadv_show_vlan_##_name, \ batadv_store_vlan_##_name) +#define BATADV_ATTR_HIF_STORE_UINT(_name, _var, _min, _max, _post_func) \ +ssize_t batadv_store_##_name(struct kobject *kobj, \ + struct attribute *attr, char *buff, \ + size_t count) \ +{ \ + struct net_device *net_dev = batadv_kobj_to_netdev(kobj); \ + struct batadv_hard_iface *hard_iface; \ + ssize_t length; \ + \ + hard_iface = batadv_hardif_get_by_netdev(net_dev); \ + if (!hard_iface) \ + return 0; \ + \ + length = __batadv_store_uint_attr(buff, count, _min, _max, \ + _post_func, attr, \ + &hard_iface->_var, net_dev); \ + \ + batadv_hardif_put(hard_iface); \ + return length; \ +} + +#define BATADV_ATTR_HIF_SHOW_UINT(_name, _var) \ +ssize_t batadv_show_##_name(struct kobject *kobj, \ + struct attribute *attr, char *buff) \ +{ \ + struct net_device *net_dev = batadv_kobj_to_netdev(kobj); \ + struct batadv_hard_iface *hard_iface; \ + ssize_t length; \ + \ + hard_iface = batadv_hardif_get_by_netdev(net_dev); \ + if (!hard_iface) \ + return 0; \ + \ + length = sprintf(buff, "%i\n", atomic_read(&hard_iface->_var)); \ + \ + batadv_hardif_put(hard_iface); \ + return length; \ +} + +/* Use this, if you are going to set [name] in hard_iface to an + * unsigned integer value + */ +#define BATADV_ATTR_HIF_UINT(_name, _var, _mode, _min, _max, _post_func)\ + static BATADV_ATTR_HIF_STORE_UINT(_name, _var, _min, \ + _max, _post_func) \ + static BATADV_ATTR_HIF_SHOW_UINT(_name, _var) \ + static BATADV_ATTR(_name, _mode, batadv_show_##_name, \ + batadv_store_##_name) + static int batadv_store_bool_attr(char *buff, size_t count, struct net_device *net_dev, const char *attr_name, atomic_t *attr, @@ -491,7 +542,7 @@ static ssize_t batadv_store_gw_bwidth(struct kobject *kobj, * @attr: the batman-adv attribute the user is interacting with * @buff: the buffer that will contain the data to send back to the user * - * Returns the number of bytes written into 'buff' on success or a negative + * Return: the number of bytes written into 'buff' on success or a negative * error code in case of failure */ static ssize_t batadv_show_isolation_mark(struct kobject *kobj, @@ -511,7 +562,7 @@ static ssize_t batadv_show_isolation_mark(struct kobject *kobj, * @buff: the buffer containing the user data * @count: number of bytes in the buffer * - * Returns 'count' on success or a negative error code in case of failure + * Return: 'count' on success or a negative error code in case of failure */ static ssize_t batadv_store_isolation_mark(struct kobject *kobj, struct attribute *attr, char *buff, @@ -620,9 +671,7 @@ static struct batadv_attribute *batadv_mesh_attrs[] = { BATADV_ATTR_VLAN_BOOL(ap_isolation, S_IRUGO | S_IWUSR, NULL); -/** - * batadv_vlan_attrs - array of vlan specific sysfs attributes - */ +/* array of vlan specific sysfs attributes */ static struct batadv_attribute *batadv_vlan_attrs[] = { &batadv_attr_vlan_ap_isolation, NULL, @@ -683,7 +732,7 @@ void batadv_sysfs_del_meshif(struct net_device *dev) * @dev: netdev of the mesh interface * @vlan: private data of the newly added VLAN interface * - * Returns 0 on success and -ENOMEM if any of the structure allocations fails. + * Return: 0 on success and -ENOMEM if any of the structure allocations fails. */ int batadv_sysfs_add_vlan(struct net_device *dev, struct batadv_softif_vlan *vlan) @@ -771,7 +820,7 @@ static ssize_t batadv_show_mesh_iface(struct kobject *kobj, length = sprintf(buff, "%s\n", ifname); - batadv_hardif_free_ref(hard_iface); + batadv_hardif_put(hard_iface); return length; } @@ -795,7 +844,7 @@ static ssize_t batadv_store_mesh_iface(struct kobject *kobj, if (strlen(buff) >= IFNAMSIZ) { pr_err("Invalid parameter for 'mesh_iface' setting received: interface name too long '%s'\n", buff); - batadv_hardif_free_ref(hard_iface); + batadv_hardif_put(hard_iface); return -EINVAL; } @@ -829,7 +878,7 @@ static ssize_t batadv_store_mesh_iface(struct kobject *kobj, unlock: rtnl_unlock(); out: - batadv_hardif_free_ref(hard_iface); + batadv_hardif_put(hard_iface); return ret; } @@ -863,18 +912,99 @@ static ssize_t batadv_show_iface_status(struct kobject *kobj, break; } - batadv_hardif_free_ref(hard_iface); + batadv_hardif_put(hard_iface); return length; } +#ifdef CONFIG_BATMAN_ADV_BATMAN_V + +/** + * batadv_store_throughput_override - parse and store throughput override + * entered by the user + * @kobj: kobject representing the private mesh sysfs directory + * @attr: the batman-adv attribute the user is interacting with + * @buff: the buffer containing the user data + * @count: number of bytes in the buffer + * + * Return: 'count' on success or a negative error code in case of failure + */ +static ssize_t batadv_store_throughput_override(struct kobject *kobj, + struct attribute *attr, + char *buff, size_t count) +{ + struct net_device *net_dev = batadv_kobj_to_netdev(kobj); + struct batadv_hard_iface *hard_iface; + u32 tp_override; + u32 old_tp_override; + bool ret; + + hard_iface = batadv_hardif_get_by_netdev(net_dev); + if (!hard_iface) + return -EINVAL; + + if (buff[count - 1] == '\n') + buff[count - 1] = '\0'; + + ret = batadv_parse_throughput(net_dev, buff, "throughput_override", + &tp_override); + if (!ret) + return count; + + old_tp_override = atomic_read(&hard_iface->bat_v.throughput_override); + if (old_tp_override == tp_override) + goto out; + + batadv_info(net_dev, "%s: Changing from: %u.%u MBit to: %u.%u MBit\n", + "throughput_override", + old_tp_override / 10, old_tp_override % 10, + tp_override / 10, tp_override % 10); + + atomic_set(&hard_iface->bat_v.throughput_override, tp_override); + +out: + batadv_hardif_put(hard_iface); + return count; +} + +static ssize_t batadv_show_throughput_override(struct kobject *kobj, + struct attribute *attr, + char *buff) +{ + struct net_device *net_dev = batadv_kobj_to_netdev(kobj); + struct batadv_hard_iface *hard_iface; + u32 tp_override; + + hard_iface = batadv_hardif_get_by_netdev(net_dev); + if (!hard_iface) + return -EINVAL; + + tp_override = atomic_read(&hard_iface->bat_v.throughput_override); + + return sprintf(buff, "%u.%u MBit\n", tp_override / 10, + tp_override % 10); +} + +#endif + static BATADV_ATTR(mesh_iface, S_IRUGO | S_IWUSR, batadv_show_mesh_iface, batadv_store_mesh_iface); static BATADV_ATTR(iface_status, S_IRUGO, batadv_show_iface_status, NULL); +#ifdef CONFIG_BATMAN_ADV_BATMAN_V +BATADV_ATTR_HIF_UINT(elp_interval, bat_v.elp_interval, S_IRUGO | S_IWUSR, + 2 * BATADV_JITTER, INT_MAX, NULL); +static BATADV_ATTR(throughput_override, S_IRUGO | S_IWUSR, + batadv_show_throughput_override, + batadv_store_throughput_override); +#endif static struct batadv_attribute *batadv_batman_attrs[] = { &batadv_attr_mesh_iface, &batadv_attr_iface_status, +#ifdef CONFIG_BATMAN_ADV_BATMAN_V + &batadv_attr_elp_interval, + &batadv_attr_throughput_override, +#endif NULL, }; diff --git a/net/batman-adv/sysfs.h b/net/batman-adv/sysfs.h index 61974428a..c76021b4e 100644 --- a/net/batman-adv/sysfs.h +++ b/net/batman-adv/sysfs.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2010-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2010-2016 B.A.T.M.A.N. contributors: * * Marek Lindner * diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index 0e80fd146..9b4551a86 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich, Antonio Quartulli * @@ -31,6 +31,7 @@ #include <linux/jhash.h> #include <linux/jiffies.h> #include <linux/kernel.h> +#include <linux/kref.h> #include <linux/list.h> #include <linux/lockdep.h> #include <linux/netdevice.h> @@ -68,7 +69,15 @@ static void batadv_tt_global_del(struct batadv_priv *bat_priv, unsigned short vid, const char *message, bool roaming); -/* returns 1 if they are the same mac addr and vid */ +/** + * batadv_compare_tt - check if two TT entries are the same + * @node: the list element pointer of the first TT entry + * @data2: pointer to the tt_common_entry of the second TT entry + * + * Compare the MAC address and the VLAN ID of the two TT entries and check if + * they are the same TT client. + * Return: 1 if the two TT clients are the same, 0 otherwise + */ static int batadv_compare_tt(const struct hlist_node *node, const void *data2) { const void *data1 = container_of(node, struct batadv_tt_common_entry, @@ -84,7 +93,7 @@ static int batadv_compare_tt(const struct hlist_node *node, const void *data2) * @data: pointer to the tt_common_entry object to map * @size: the size of the hash table * - * Returns the hash index where the object represented by 'data' should be + * Return: the hash index where the object represented by 'data' should be * stored at. */ static inline u32 batadv_choose_tt(const void *data, u32 size) @@ -105,7 +114,7 @@ static inline u32 batadv_choose_tt(const void *data, u32 size) * @addr: the mac address of the client to look for * @vid: VLAN identifier * - * Returns a pointer to the tt_common struct belonging to the searched client if + * Return: a pointer to the tt_common struct belonging to the searched client if * found, NULL otherwise. */ static struct batadv_tt_common_entry * @@ -133,7 +142,7 @@ batadv_tt_hash_find(struct batadv_hashtable *hash, const u8 *addr, if (tt->vid != vid) continue; - if (!atomic_inc_not_zero(&tt->refcount)) + if (!kref_get_unless_zero(&tt->refcount)) continue; tt_tmp = tt; @@ -150,7 +159,7 @@ batadv_tt_hash_find(struct batadv_hashtable *hash, const u8 *addr, * @addr: the mac address of the client to look for * @vid: VLAN identifier * - * Returns a pointer to the corresponding tt_local_entry struct if the client is + * Return: a pointer to the corresponding tt_local_entry struct if the client is * found, NULL otherwise. */ static struct batadv_tt_local_entry * @@ -175,7 +184,7 @@ batadv_tt_local_hash_find(struct batadv_priv *bat_priv, const u8 *addr, * @addr: the mac address of the client to look for * @vid: VLAN identifier * - * Returns a pointer to the corresponding tt_global_entry struct if the client + * Return: a pointer to the corresponding tt_global_entry struct if the client * is found, NULL otherwise. */ static struct batadv_tt_global_entry * @@ -194,34 +203,70 @@ batadv_tt_global_hash_find(struct batadv_priv *bat_priv, const u8 *addr, return tt_global_entry; } +/** + * batadv_tt_local_entry_release - release tt_local_entry from lists and queue + * for free after rcu grace period + * @ref: kref pointer of the nc_node + */ +static void batadv_tt_local_entry_release(struct kref *ref) +{ + struct batadv_tt_local_entry *tt_local_entry; + + tt_local_entry = container_of(ref, struct batadv_tt_local_entry, + common.refcount); + + batadv_softif_vlan_put(tt_local_entry->vlan); + + kfree_rcu(tt_local_entry, common.rcu); +} + +/** + * batadv_tt_local_entry_put - decrement the tt_local_entry refcounter and + * possibly release it + * @tt_local_entry: tt_local_entry to be free'd + */ static void -batadv_tt_local_entry_free_ref(struct batadv_tt_local_entry *tt_local_entry) +batadv_tt_local_entry_put(struct batadv_tt_local_entry *tt_local_entry) +{ + kref_put(&tt_local_entry->common.refcount, + batadv_tt_local_entry_release); +} + +/** + * batadv_tt_global_entry_release - release tt_global_entry from lists and queue + * for free after rcu grace period + * @ref: kref pointer of the nc_node + */ +static void batadv_tt_global_entry_release(struct kref *ref) { - if (atomic_dec_and_test(&tt_local_entry->common.refcount)) - kfree_rcu(tt_local_entry, common.rcu); + struct batadv_tt_global_entry *tt_global_entry; + + tt_global_entry = container_of(ref, struct batadv_tt_global_entry, + common.refcount); + + batadv_tt_global_del_orig_list(tt_global_entry); + kfree_rcu(tt_global_entry, common.rcu); } /** - * batadv_tt_global_entry_free_ref - decrement the refcounter for a - * tt_global_entry and possibly free it - * @tt_global_entry: the object to free + * batadv_tt_global_entry_put - decrement the tt_global_entry refcounter and + * possibly release it + * @tt_global_entry: tt_global_entry to be free'd */ static void -batadv_tt_global_entry_free_ref(struct batadv_tt_global_entry *tt_global_entry) +batadv_tt_global_entry_put(struct batadv_tt_global_entry *tt_global_entry) { - if (atomic_dec_and_test(&tt_global_entry->common.refcount)) { - batadv_tt_global_del_orig_list(tt_global_entry); - kfree_rcu(tt_global_entry, common.rcu); - } + kref_put(&tt_global_entry->common.refcount, + batadv_tt_global_entry_release); } /** * batadv_tt_global_hash_count - count the number of orig entries - * @hash: hash table containing the tt entries + * @bat_priv: the bat priv with all the soft interface information * @addr: the mac address of the client to count entries for * @vid: VLAN identifier * - * Return the number of originators advertising the given address/data + * Return: the number of originators advertising the given address/data * (excluding ourself). */ int batadv_tt_global_hash_count(struct batadv_priv *bat_priv, @@ -235,7 +280,7 @@ int batadv_tt_global_hash_count(struct batadv_priv *bat_priv, return 0; count = atomic_read(&tt_global_entry->orig_list_count); - batadv_tt_global_entry_free_ref(tt_global_entry); + batadv_tt_global_entry_put(tt_global_entry); return count; } @@ -258,7 +303,7 @@ static void batadv_tt_local_size_mod(struct batadv_priv *bat_priv, atomic_add(v, &vlan->tt.num_entries); - batadv_softif_vlan_free_ref(vlan); + batadv_softif_vlan_put(vlan); } /** @@ -286,9 +331,9 @@ static void batadv_tt_local_size_dec(struct batadv_priv *bat_priv, } /** - * batadv_tt_global_size_mod - change the size by v of the local table - * identified by vid - * @bat_priv: the bat priv with all the soft interface information + * batadv_tt_global_size_mod - change the size by v of the global table + * for orig_node identified by vid + * @orig_node: the originator for which the table has to be modified * @vid: the VLAN identifier * @v: the amount to sum to the global table size */ @@ -305,12 +350,12 @@ static void batadv_tt_global_size_mod(struct batadv_orig_node *orig_node, spin_lock_bh(&orig_node->vlan_list_lock); if (!hlist_unhashed(&vlan->list)) { hlist_del_init_rcu(&vlan->list); - batadv_orig_node_vlan_free_ref(vlan); + batadv_orig_node_vlan_put(vlan); } spin_unlock_bh(&orig_node->vlan_list_lock); } - batadv_orig_node_vlan_free_ref(vlan); + batadv_orig_node_vlan_put(vlan); } /** @@ -340,22 +385,28 @@ static void batadv_tt_global_size_dec(struct batadv_orig_node *orig_node, /** * batadv_tt_orig_list_entry_release - release tt orig entry from lists and * queue for free after rcu grace period - * @orig_entry: tt orig entry to be free'd + * @ref: kref pointer of the tt orig entry */ -static void -batadv_tt_orig_list_entry_release(struct batadv_tt_orig_list_entry *orig_entry) +static void batadv_tt_orig_list_entry_release(struct kref *ref) { - batadv_orig_node_free_ref(orig_entry->orig_node); + struct batadv_tt_orig_list_entry *orig_entry; + + orig_entry = container_of(ref, struct batadv_tt_orig_list_entry, + refcount); + + batadv_orig_node_put(orig_entry->orig_node); kfree_rcu(orig_entry, rcu); } +/** + * batadv_tt_orig_list_entry_put - decrement the tt orig entry refcounter and + * possibly release it + * @orig_entry: tt orig entry to be free'd + */ static void -batadv_tt_orig_list_entry_free_ref(struct batadv_tt_orig_list_entry *orig_entry) +batadv_tt_orig_list_entry_put(struct batadv_tt_orig_list_entry *orig_entry) { - if (!atomic_dec_and_test(&orig_entry->refcount)) - return; - - batadv_tt_orig_list_entry_release(orig_entry); + kref_put(&orig_entry->refcount, batadv_tt_orig_list_entry_release); } /** @@ -437,7 +488,7 @@ unlock: * batadv_tt_len - compute length in bytes of given number of tt changes * @changes_num: number of tt changes * - * Returns computed length in bytes. + * Return: computed length in bytes. */ static int batadv_tt_len(int changes_num) { @@ -448,7 +499,7 @@ static int batadv_tt_len(int changes_num) * batadv_tt_entries - compute the number of entries fitting in tt_len bytes * @tt_len: available space * - * Returns the number of entries. + * Return: the number of entries. */ static u16 batadv_tt_entries(u16 tt_len) { @@ -460,7 +511,7 @@ static u16 batadv_tt_entries(u16 tt_len) * size when transmitted over the air * @bat_priv: the bat priv with all the soft interface information * - * Returns local translation table size in bytes. + * Return: local translation table size in bytes. */ static int batadv_tt_local_table_transmit_size(struct batadv_priv *bat_priv) { @@ -512,7 +563,7 @@ static void batadv_tt_global_free(struct batadv_priv *bat_priv, batadv_hash_remove(bat_priv->tt.global_hash, batadv_compare_tt, batadv_choose_tt, &tt_global->common); - batadv_tt_global_entry_free_ref(tt_global); + batadv_tt_global_entry_put(tt_global); } /** @@ -526,7 +577,7 @@ static void batadv_tt_global_free(struct batadv_priv *bat_priv, * @mark: the value contained in the skb->mark field of the received packet (if * any) * - * Returns true if the client was successfully added, false otherwise. + * Return: true if the client was successfully added, false otherwise. */ bool batadv_tt_local_add(struct net_device *soft_iface, const u8 *addr, unsigned short vid, int ifindex, u32 mark) @@ -620,9 +671,11 @@ bool batadv_tt_local_add(struct net_device *soft_iface, const u8 *addr, tt_local->common.vid = vid; if (batadv_is_wifi_netdev(in_dev)) tt_local->common.flags |= BATADV_TT_CLIENT_WIFI; - atomic_set(&tt_local->common.refcount, 2); + kref_init(&tt_local->common.refcount); + kref_get(&tt_local->common.refcount); tt_local->last_seen = jiffies; tt_local->common.added_at = tt_local->last_seen; + tt_local->vlan = vlan; /* the batman interface mac and multicast addresses should never be * purged @@ -637,8 +690,8 @@ bool batadv_tt_local_add(struct net_device *soft_iface, const u8 *addr, if (unlikely(hash_added != 0)) { /* remove the reference for the hash */ - batadv_tt_local_entry_free_ref(tt_local); - batadv_softif_vlan_free_ref(vlan); + batadv_tt_local_entry_put(tt_local); + batadv_softif_vlan_put(vlan); goto out; } @@ -704,9 +757,9 @@ out: if (in_dev) dev_put(in_dev); if (tt_local) - batadv_tt_local_entry_free_ref(tt_local); + batadv_tt_local_entry_put(tt_local); if (tt_global) - batadv_tt_global_entry_free_ref(tt_global); + batadv_tt_global_entry_put(tt_global); return ret; } @@ -721,12 +774,11 @@ out: * function reserves the amount of space needed to send the entire global TT * table. In case of success the value is updated with the real amount of * reserved bytes - * Allocate the needed amount of memory for the entire TT TVLV and write its * header made up by one tvlv_tt_data object and a series of tvlv_tt_vlan_data * objects, one per active VLAN served by the originator node. * - * Return the size of the allocated buffer or 0 in case of failure. + * Return: the size of the allocated buffer or 0 in case of failure. */ static u16 batadv_tt_prepare_tvlv_global_data(struct batadv_orig_node *orig_node, @@ -800,7 +852,7 @@ out: * header made up by one tvlv_tt_data object and a series of tvlv_tt_vlan_data * objects, one per active VLAN. * - * Return the size of the allocated buffer or 0 in case of failure. + * Return: the size of the allocated buffer or 0 in case of failure. */ static u16 batadv_tt_prepare_tvlv_local_data(struct batadv_priv *bat_priv, @@ -942,7 +994,6 @@ int batadv_tt_local_seq_print_text(struct seq_file *seq, void *offset) struct batadv_tt_common_entry *tt_common_entry; struct batadv_tt_local_entry *tt_local; struct batadv_hard_iface *primary_if; - struct batadv_softif_vlan *vlan; struct hlist_head *head; unsigned short vid; u32 i; @@ -978,14 +1029,6 @@ int batadv_tt_local_seq_print_text(struct seq_file *seq, void *offset) last_seen_msecs = last_seen_msecs % 1000; no_purge = tt_common_entry->flags & np_flag; - - vlan = batadv_softif_vlan_get(bat_priv, vid); - if (!vlan) { - seq_printf(seq, "Cannot retrieve VLAN %d\n", - BATADV_PRINT_VID(vid)); - continue; - } - seq_printf(seq, " * %pM %4i [%c%c%c%c%c%c] %3u.%03u (%#.8x)\n", tt_common_entry->addr, @@ -1003,15 +1046,13 @@ int batadv_tt_local_seq_print_text(struct seq_file *seq, void *offset) BATADV_TT_CLIENT_ISOLA) ? 'I' : '.'), no_purge ? 0 : last_seen_secs, no_purge ? 0 : last_seen_msecs, - vlan->tt.crc); - - batadv_softif_vlan_free_ref(vlan); + tt_local->vlan->tt.crc); } rcu_read_unlock(); } out: if (primary_if) - batadv_hardif_free_ref(primary_if); + batadv_hardif_put(primary_if); return 0; } @@ -1042,7 +1083,7 @@ batadv_tt_local_set_pending(struct batadv_priv *bat_priv, * @message: message to append to the log on deletion * @roaming: true if the deletion is due to a roaming event * - * Returns the flags assigned to the local entry before being deleted + * Return: the flags assigned to the local entry before being deleted */ u16 batadv_tt_local_remove(struct batadv_priv *bat_priv, const u8 *addr, unsigned short vid, const char *message, @@ -1050,7 +1091,6 @@ u16 batadv_tt_local_remove(struct batadv_priv *bat_priv, const u8 *addr, { struct batadv_tt_local_entry *tt_local_entry; u16 flags, curr_flags = BATADV_NO_FLAGS; - struct batadv_softif_vlan *vlan; void *tt_entry_exists; tt_local_entry = batadv_tt_local_hash_find(bat_priv, addr, vid); @@ -1088,19 +1128,11 @@ u16 batadv_tt_local_remove(struct batadv_priv *bat_priv, const u8 *addr, goto out; /* extra call to free the local tt entry */ - batadv_tt_local_entry_free_ref(tt_local_entry); - - /* decrease the reference held for this vlan */ - vlan = batadv_softif_vlan_get(bat_priv, vid); - if (!vlan) - goto out; - - batadv_softif_vlan_free_ref(vlan); - batadv_softif_vlan_free_ref(vlan); + batadv_tt_local_entry_put(tt_local_entry); out: if (tt_local_entry) - batadv_tt_local_entry_free_ref(tt_local_entry); + batadv_tt_local_entry_put(tt_local_entry); return curr_flags; } @@ -1170,7 +1202,6 @@ static void batadv_tt_local_table_free(struct batadv_priv *bat_priv) spinlock_t *list_lock; /* protects write access to the hash lists */ struct batadv_tt_common_entry *tt_common_entry; struct batadv_tt_local_entry *tt_local; - struct batadv_softif_vlan *vlan; struct hlist_node *node_tmp; struct hlist_head *head; u32 i; @@ -1192,15 +1223,7 @@ static void batadv_tt_local_table_free(struct batadv_priv *bat_priv) struct batadv_tt_local_entry, common); - /* decrease the reference held for this vlan */ - vlan = batadv_softif_vlan_get(bat_priv, - tt_common_entry->vid); - if (vlan) { - batadv_softif_vlan_free_ref(vlan); - batadv_softif_vlan_free_ref(vlan); - } - - batadv_tt_local_entry_free_ref(tt_local); + batadv_tt_local_entry_put(tt_local); } spin_unlock_bh(list_lock); } @@ -1242,10 +1265,16 @@ static void batadv_tt_changes_list_free(struct batadv_priv *bat_priv) spin_unlock_bh(&bat_priv->tt.changes_list_lock); } -/* retrieves the orig_tt_list_entry belonging to orig_node from the +/** + * batadv_tt_global_orig_entry_find - find a TT orig_list_entry + * @entry: the TT global entry where the orig_list_entry has to be + * extracted from + * @orig_node: the originator for which the orig_list_entry has to be found + * + * retrieve the orig_tt_list_entry belonging to orig_node from the * batadv_tt_global_entry list * - * returns it with an increased refcounter, NULL if not found + * Return: it with an increased refcounter, NULL if not found */ static struct batadv_tt_orig_list_entry * batadv_tt_global_orig_entry_find(const struct batadv_tt_global_entry *entry, @@ -1259,7 +1288,7 @@ batadv_tt_global_orig_entry_find(const struct batadv_tt_global_entry *entry, hlist_for_each_entry_rcu(tmp_orig_entry, head, list) { if (tmp_orig_entry->orig_node != orig_node) continue; - if (!atomic_inc_not_zero(&tmp_orig_entry->refcount)) + if (!kref_get_unless_zero(&tmp_orig_entry->refcount)) continue; orig_entry = tmp_orig_entry; @@ -1270,8 +1299,15 @@ batadv_tt_global_orig_entry_find(const struct batadv_tt_global_entry *entry, return orig_entry; } -/* find out if an orig_node is already in the list of a tt_global_entry. - * returns true if found, false otherwise +/** + * batadv_tt_global_entry_has_orig - check if a TT global entry is also handled + * by a given originator + * @entry: the TT global entry to check + * @orig_node: the originator to search in the list + * + * find out if an orig_node is already in the list of a tt_global_entry. + * + * Return: true if found, false otherwise */ static bool batadv_tt_global_entry_has_orig(const struct batadv_tt_global_entry *entry, @@ -1283,7 +1319,7 @@ batadv_tt_global_entry_has_orig(const struct batadv_tt_global_entry *entry, orig_entry = batadv_tt_global_orig_entry_find(entry, orig_node); if (orig_entry) { found = true; - batadv_tt_orig_list_entry_free_ref(orig_entry); + batadv_tt_orig_list_entry_put(orig_entry); } return found; @@ -1309,11 +1345,12 @@ batadv_tt_global_orig_entry_add(struct batadv_tt_global_entry *tt_global, goto out; INIT_HLIST_NODE(&orig_entry->list); - atomic_inc(&orig_node->refcount); + kref_get(&orig_node->refcount); batadv_tt_global_size_inc(orig_node, tt_global->common.vid); orig_entry->orig_node = orig_node; orig_entry->ttvn = ttvn; - atomic_set(&orig_entry->refcount, 2); + kref_init(&orig_entry->refcount); + kref_get(&orig_entry->refcount); spin_lock_bh(&tt_global->list_lock); hlist_add_head_rcu(&orig_entry->list, @@ -1323,7 +1360,7 @@ batadv_tt_global_orig_entry_add(struct batadv_tt_global_entry *tt_global, out: if (orig_entry) - batadv_tt_orig_list_entry_free_ref(orig_entry); + batadv_tt_orig_list_entry_put(orig_entry); } /** @@ -1343,7 +1380,7 @@ out: * * The caller must hold orig_node refcount. * - * Return true if the new entry has been added, false otherwise + * Return: true if the new entry has been added, false otherwise */ static bool batadv_tt_global_add(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, @@ -1389,7 +1426,8 @@ static bool batadv_tt_global_add(struct batadv_priv *bat_priv, */ if (flags & BATADV_TT_CLIENT_ROAM) tt_global_entry->roam_at = jiffies; - atomic_set(&common->refcount, 2); + kref_init(&common->refcount); + kref_get(&common->refcount); common->added_at = jiffies; INIT_HLIST_HEAD(&tt_global_entry->orig_list); @@ -1403,7 +1441,7 @@ static bool batadv_tt_global_add(struct batadv_priv *bat_priv, if (unlikely(hash_added != 0)) { /* remove the reference for the hash */ - batadv_tt_global_entry_free_ref(tt_global_entry); + batadv_tt_global_entry_put(tt_global_entry); goto out_remove; } } else { @@ -1489,9 +1527,9 @@ out_remove: out: if (tt_global_entry) - batadv_tt_global_entry_free_ref(tt_global_entry); + batadv_tt_global_entry_put(tt_global_entry); if (tt_local_entry) - batadv_tt_local_entry_free_ref(tt_local_entry); + batadv_tt_local_entry_put(tt_local_entry); return ret; } @@ -1501,7 +1539,7 @@ out: * @tt_global_entry: global translation table entry to be analyzed * * This functon assumes the caller holds rcu_read_lock(). - * Returns best originator list entry or NULL on errors. + * Return: best originator list entry or NULL on errors. */ static struct batadv_tt_orig_list_entry * batadv_transtable_best_orig(struct batadv_priv *bat_priv, @@ -1522,20 +1560,20 @@ batadv_transtable_best_orig(struct batadv_priv *bat_priv, if (best_router && bao->bat_neigh_cmp(router, BATADV_IF_DEFAULT, best_router, BATADV_IF_DEFAULT) <= 0) { - batadv_neigh_node_free_ref(router); + batadv_neigh_node_put(router); continue; } /* release the refcount for the "old" best */ if (best_router) - batadv_neigh_node_free_ref(best_router); + batadv_neigh_node_put(best_router); best_entry = orig_entry; best_router = router; } if (best_router) - batadv_neigh_node_free_ref(best_router); + batadv_neigh_node_put(best_router); return best_entry; } @@ -1588,7 +1626,7 @@ batadv_tt_global_print_entry(struct batadv_priv *bat_priv, ((flags & BATADV_TT_CLIENT_ISOLA) ? 'I' : '.'), ((flags & BATADV_TT_CLIENT_TEMP) ? 'T' : '.')); - batadv_orig_node_vlan_free_ref(vlan); + batadv_orig_node_vlan_put(vlan); } print_list: @@ -1620,7 +1658,7 @@ print_list: ((flags & BATADV_TT_CLIENT_ISOLA) ? 'I' : '.'), ((flags & BATADV_TT_CLIENT_TEMP) ? 'T' : '.')); - batadv_orig_node_vlan_free_ref(vlan); + batadv_orig_node_vlan_put(vlan); } } @@ -1661,7 +1699,7 @@ int batadv_tt_global_seq_print_text(struct seq_file *seq, void *offset) } out: if (primary_if) - batadv_hardif_free_ref(primary_if); + batadv_hardif_put(primary_if); return 0; } @@ -1689,7 +1727,7 @@ _batadv_tt_global_del_orig_entry(struct batadv_tt_global_entry *tt_global_entry, * being part of a list */ hlist_del_rcu(&orig_entry->list); - batadv_tt_orig_list_entry_free_ref(orig_entry); + batadv_tt_orig_list_entry_put(orig_entry); } /* deletes the orig list of a tt_global_entry */ @@ -1845,9 +1883,9 @@ static void batadv_tt_global_del(struct batadv_priv *bat_priv, out: if (tt_global_entry) - batadv_tt_global_entry_free_ref(tt_global_entry); + batadv_tt_global_entry_put(tt_global_entry); if (local_entry) - batadv_tt_local_entry_free_ref(local_entry); + batadv_tt_local_entry_put(local_entry); } /** @@ -1901,7 +1939,7 @@ void batadv_tt_global_del_orig(struct batadv_priv *bat_priv, tt_global->common.addr, BATADV_PRINT_VID(vid), message); hlist_del_rcu(&tt_common_entry->hash_entry); - batadv_tt_global_entry_free_ref(tt_global); + batadv_tt_global_entry_put(tt_global); } } spin_unlock_bh(list_lock); @@ -1964,7 +2002,7 @@ static void batadv_tt_global_purge(struct batadv_priv *bat_priv) hlist_del_rcu(&tt_common->hash_entry); - batadv_tt_global_entry_free_ref(tt_global); + batadv_tt_global_entry_put(tt_global); } spin_unlock_bh(list_lock); } @@ -1996,7 +2034,7 @@ static void batadv_tt_global_table_free(struct batadv_priv *bat_priv) tt_global = container_of(tt_common_entry, struct batadv_tt_global_entry, common); - batadv_tt_global_entry_free_ref(tt_global); + batadv_tt_global_entry_put(tt_global); } spin_unlock_bh(list_lock); } @@ -2031,7 +2069,7 @@ _batadv_is_ap_isolated(struct batadv_tt_local_entry *tt_local_entry, * @addr: mac address of the destination client * @vid: VLAN identifier * - * Returns a pointer to the originator that was selected as destination in the + * Return: a pointer to the originator that was selected as destination in the * mesh for contacting the client 'addr', NULL otherwise. * In case of multiple originators serving the same client, the function returns * the best one (best in terms of metric towards the destination node). @@ -2071,15 +2109,15 @@ struct batadv_orig_node *batadv_transtable_search(struct batadv_priv *bat_priv, /* found anything? */ if (best_entry) orig_node = best_entry->orig_node; - if (orig_node && !atomic_inc_not_zero(&orig_node->refcount)) + if (orig_node && !kref_get_unless_zero(&orig_node->refcount)) orig_node = NULL; rcu_read_unlock(); out: if (tt_global_entry) - batadv_tt_global_entry_free_ref(tt_global_entry); + batadv_tt_global_entry_put(tt_global_entry); if (tt_local_entry) - batadv_tt_local_entry_free_ref(tt_local_entry); + batadv_tt_local_entry_put(tt_local_entry); return orig_node; } @@ -2106,7 +2144,7 @@ out: * because the XOR operation can combine them all while trying to reduce the * noise as much as possible. * - * Returns the checksum of the global table of a given originator. + * Return: the checksum of the global table of a given originator. */ static u32 batadv_tt_global_crc(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, @@ -2183,7 +2221,7 @@ static u32 batadv_tt_global_crc(struct batadv_priv *bat_priv, * For details about the computation, please refer to the documentation for * batadv_tt_global_crc(). * - * Returns the checksum of the local table + * Return: the checksum of the local table */ static u32 batadv_tt_local_crc(struct batadv_priv *bat_priv, unsigned short vid) @@ -2289,7 +2327,7 @@ static void batadv_tt_req_purge(struct batadv_priv *bat_priv) * @bat_priv: the bat priv with all the soft interface information * @orig_node: orig node this request is being issued for * - * Returns the pointer to the new tt_req_node struct if no request + * Return: the pointer to the new tt_req_node struct if no request * has already been issued for this orig_node, NULL otherwise. */ static struct batadv_tt_req_node * @@ -2324,7 +2362,7 @@ unlock: * @entry_ptr: to be checked local tt entry * @data_ptr: not used but definition required to satisfy the callback prototype * - * Returns 1 if the entry is a valid, 0 otherwise. + * Return: 1 if the entry is a valid, 0 otherwise. */ static int batadv_tt_local_valid(const void *entry_ptr, const void *data_ptr) { @@ -2408,9 +2446,8 @@ static void batadv_tt_tvlv_generate(struct batadv_priv *bat_priv, * @orig_node: originator for which the CRCs have to be checked * @tt_vlan: pointer to the first tvlv VLAN entry * @num_vlan: number of tvlv VLAN entries - * @create: if true, create VLAN objects if not found * - * Return true if all the received CRCs match the locally stored ones, false + * Return: true if all the received CRCs match the locally stored ones, false * otherwise */ static bool batadv_tt_global_check_crc(struct batadv_orig_node *orig_node, @@ -2440,7 +2477,7 @@ static bool batadv_tt_global_check_crc(struct batadv_orig_node *orig_node, return false; crc = vlan->tt.crc; - batadv_orig_node_vlan_free_ref(vlan); + batadv_orig_node_vlan_put(vlan); if (crc != ntohl(tt_vlan_tmp->crc)) return false; @@ -2513,6 +2550,8 @@ static void batadv_tt_global_update_crc(struct batadv_priv *bat_priv, * @num_vlan: number of tvlv VLAN entries * @full_table: ask for the entire translation table if true, while only for the * last TT diff otherwise + * + * Return: true if the TT Request was sent, false otherwise */ static int batadv_send_tt_request(struct batadv_priv *bat_priv, struct batadv_orig_node *dst_orig_node, @@ -2573,7 +2612,7 @@ static int batadv_send_tt_request(struct batadv_priv *bat_priv, out: if (primary_if) - batadv_hardif_free_ref(primary_if); + batadv_hardif_put(primary_if); if (ret && tt_req_node) { spin_lock_bh(&bat_priv->tt.req_list_lock); /* hlist_del_init() verifies tt_req_node still is in the list */ @@ -2593,7 +2632,7 @@ out: * @req_src: mac address of tt request sender * @req_dst: mac address of tt request recipient * - * Returns true if tt request reply was sent, false otherwise. + * Return: true if tt request reply was sent, false otherwise. */ static bool batadv_send_other_tt_response(struct batadv_priv *bat_priv, struct batadv_tvlv_tt_data *tt_data, @@ -2711,9 +2750,9 @@ unlock: out: if (res_dst_orig_node) - batadv_orig_node_free_ref(res_dst_orig_node); + batadv_orig_node_put(res_dst_orig_node); if (req_dst_orig_node) - batadv_orig_node_free_ref(req_dst_orig_node); + batadv_orig_node_put(req_dst_orig_node); kfree(tvlv_tt_data); return ret; } @@ -2725,7 +2764,7 @@ out: * @tt_data: tt data containing the tt request information * @req_src: mac address of tt request sender * - * Returns true if tt request reply was sent, false otherwise. + * Return: true if tt request reply was sent, false otherwise. */ static bool batadv_send_my_tt_response(struct batadv_priv *bat_priv, struct batadv_tvlv_tt_data *tt_data, @@ -2828,9 +2867,9 @@ unlock: out: spin_unlock_bh(&bat_priv->tt.commit_lock); if (orig_node) - batadv_orig_node_free_ref(orig_node); + batadv_orig_node_put(orig_node); if (primary_if) - batadv_hardif_free_ref(primary_if); + batadv_hardif_put(primary_if); kfree(tvlv_tt_data); /* The packet was for this host, so it doesn't need to be re-routed */ return true; @@ -2843,7 +2882,7 @@ out: * @req_src: mac address of tt request sender * @req_dst: mac address of tt request recipient * - * Returns true if tt request reply was sent, false otherwise. + * Return: true if tt request reply was sent, false otherwise. */ static bool batadv_send_tt_response(struct batadv_priv *bat_priv, struct batadv_tvlv_tt_data *tt_data, @@ -2916,7 +2955,7 @@ static void batadv_tt_fill_gtable(struct batadv_priv *bat_priv, out: if (orig_node) - batadv_orig_node_free_ref(orig_node); + batadv_orig_node_put(orig_node); } static void batadv_tt_update_changes(struct batadv_priv *bat_priv, @@ -2938,7 +2977,7 @@ static void batadv_tt_update_changes(struct batadv_priv *bat_priv, * @addr: the mac address of the client to check * @vid: VLAN identifier * - * Returns true if the client is served by this node, false otherwise. + * Return: true if the client is served by this node, false otherwise. */ bool batadv_is_my_client(struct batadv_priv *bat_priv, const u8 *addr, unsigned short vid) @@ -2958,7 +2997,7 @@ bool batadv_is_my_client(struct batadv_priv *bat_priv, const u8 *addr, ret = true; out: if (tt_local_entry) - batadv_tt_local_entry_free_ref(tt_local_entry); + batadv_tt_local_entry_put(tt_local_entry); return ret; } @@ -3022,7 +3061,7 @@ static void batadv_handle_tt_response(struct batadv_priv *bat_priv, spin_unlock_bh(&bat_priv->tt.req_list_lock); out: if (orig_node) - batadv_orig_node_free_ref(orig_node); + batadv_orig_node_put(orig_node); } static void batadv_tt_roam_list_free(struct batadv_priv *bat_priv) @@ -3055,11 +3094,16 @@ static void batadv_tt_roam_purge(struct batadv_priv *bat_priv) spin_unlock_bh(&bat_priv->tt.roam_list_lock); } -/* This function checks whether the client already reached the +/** + * batadv_tt_check_roam_count - check if a client has roamed too frequently + * @bat_priv: the bat priv with all the soft interface information + * @client: mac address of the roaming client + * + * This function checks whether the client already reached the * maximum number of possible roaming phases. In this case the ROAMING_ADV * will not be sent. * - * returns true if the ROAMING_ADV can be sent, false otherwise + * Return: true if the ROAMING_ADV can be sent, false otherwise */ static bool batadv_tt_check_roam_count(struct batadv_priv *bat_priv, u8 *client) { @@ -3148,7 +3192,7 @@ static void batadv_send_roam_adv(struct batadv_priv *bat_priv, u8 *client, out: if (primary_if) - batadv_hardif_free_ref(primary_if); + batadv_hardif_put(primary_if); } static void batadv_tt_purge(struct work_struct *work) @@ -3239,7 +3283,6 @@ static void batadv_tt_local_purge_pending_clients(struct batadv_priv *bat_priv) struct batadv_hashtable *hash = bat_priv->tt.local_hash; struct batadv_tt_common_entry *tt_common; struct batadv_tt_local_entry *tt_local; - struct batadv_softif_vlan *vlan; struct hlist_node *node_tmp; struct hlist_head *head; spinlock_t *list_lock; /* protects write access to the hash lists */ @@ -3269,14 +3312,7 @@ static void batadv_tt_local_purge_pending_clients(struct batadv_priv *bat_priv) struct batadv_tt_local_entry, common); - /* decrease the reference held for this vlan */ - vlan = batadv_softif_vlan_get(bat_priv, tt_common->vid); - if (vlan) { - batadv_softif_vlan_free_ref(vlan); - batadv_softif_vlan_free_ref(vlan); - } - - batadv_tt_local_entry_free_ref(tt_local); + batadv_tt_local_entry_put(tt_local); } spin_unlock_bh(list_lock); } @@ -3359,11 +3395,11 @@ bool batadv_is_ap_isolated(struct batadv_priv *bat_priv, u8 *src, u8 *dst, ret = true; out: - batadv_softif_vlan_free_ref(vlan); + batadv_softif_vlan_put(vlan); if (tt_global_entry) - batadv_tt_global_entry_free_ref(tt_global_entry); + batadv_tt_global_entry_put(tt_global_entry); if (tt_local_entry) - batadv_tt_local_entry_free_ref(tt_local_entry); + batadv_tt_local_entry_put(tt_local_entry); return ret; } @@ -3371,13 +3407,12 @@ out: * batadv_tt_update_orig - update global translation table with new tt * information received via ogms * @bat_priv: the bat priv with all the soft interface information - * @orig: the orig_node of the ogm - * @tt_vlan: pointer to the first tvlv VLAN entry + * @orig_node: the orig_node of the ogm + * @tt_buff: pointer to the first tvlv VLAN entry * @tt_num_vlan: number of tvlv VLAN entries * @tt_change: pointer to the first entry in the TT buffer * @tt_num_changes: number of tt changes inside the tt buffer * @ttvn: translation table version number of this changeset - * @tt_crc: crc32 checksum of orig node's translation table */ static void batadv_tt_update_orig(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, @@ -3459,7 +3494,7 @@ request_table: * @addr: the mac address of the client to check * @vid: VLAN identifier * - * Returns true if we know that the client has moved from its old originator + * Return: true if we know that the client has moved from its old originator * to another one. This entry is still kept for consistency purposes and will be * deleted later by a DEL or because of timeout */ @@ -3474,7 +3509,7 @@ bool batadv_tt_global_client_is_roaming(struct batadv_priv *bat_priv, goto out; ret = tt_global_entry->common.flags & BATADV_TT_CLIENT_ROAM; - batadv_tt_global_entry_free_ref(tt_global_entry); + batadv_tt_global_entry_put(tt_global_entry); out: return ret; } @@ -3485,7 +3520,7 @@ out: * @addr: the mac address of the local client to query * @vid: VLAN identifier * - * Returns true if the local client is known to be roaming (it is not served by + * Return: true if the local client is known to be roaming (it is not served by * this node anymore) or not. If yes, the client is still present in the table * to keep the latter consistent with the node TTVN */ @@ -3500,7 +3535,7 @@ bool batadv_tt_local_client_is_roaming(struct batadv_priv *bat_priv, goto out; ret = tt_local_entry->common.flags & BATADV_TT_CLIENT_ROAM; - batadv_tt_local_entry_free_ref(tt_local_entry); + batadv_tt_local_entry_put(tt_local_entry); out: return ret; } @@ -3614,7 +3649,7 @@ static void batadv_tt_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv, * @tvlv_value: tvlv buffer containing the tt data * @tvlv_value_len: tvlv buffer length * - * Returns NET_RX_DROP if the tt tvlv is to be re-routed, NET_RX_SUCCESS + * Return: NET_RX_DROP if the tt tvlv is to be re-routed, NET_RX_SUCCESS * otherwise. */ static int batadv_tt_tvlv_unicast_handler_v1(struct batadv_priv *bat_priv, @@ -3695,7 +3730,7 @@ static int batadv_tt_tvlv_unicast_handler_v1(struct batadv_priv *bat_priv, * @tvlv_value: tvlv buffer containing the tt data * @tvlv_value_len: tvlv buffer length * - * Returns NET_RX_DROP if the tt roam tvlv is to be re-routed, NET_RX_SUCCESS + * Return: NET_RX_DROP if the tt roam tvlv is to be re-routed, NET_RX_SUCCESS * otherwise. */ static int batadv_roam_tvlv_unicast_handler_v1(struct batadv_priv *bat_priv, @@ -3733,7 +3768,7 @@ static int batadv_roam_tvlv_unicast_handler_v1(struct batadv_priv *bat_priv, out: if (orig_node) - batadv_orig_node_free_ref(orig_node); + batadv_orig_node_put(orig_node); return NET_RX_SUCCESS; } @@ -3741,7 +3776,7 @@ out: * batadv_tt_init - initialise the translation table internals * @bat_priv: the bat priv with all the soft interface information * - * Return 0 on success or negative error number in case of failure. + * Return: 0 on success or negative error number in case of failure. */ int batadv_tt_init(struct batadv_priv *bat_priv) { @@ -3779,7 +3814,7 @@ int batadv_tt_init(struct batadv_priv *bat_priv) * @addr: the mac address of the client * @vid: the identifier of the VLAN where this client is connected * - * Returns true if the client is marked with the TT_CLIENT_ISOLA flag, false + * Return: true if the client is marked with the TT_CLIENT_ISOLA flag, false * otherwise */ bool batadv_tt_global_is_isolated(struct batadv_priv *bat_priv, @@ -3794,7 +3829,7 @@ bool batadv_tt_global_is_isolated(struct batadv_priv *bat_priv, ret = tt->common.flags & BATADV_TT_CLIENT_ISOLA; - batadv_tt_global_entry_free_ref(tt); + batadv_tt_global_entry_put(tt); return ret; } diff --git a/net/batman-adv/translation-table.h b/net/batman-adv/translation-table.h index abd8e116e..7c7e2c006 100644 --- a/net/batman-adv/translation-table.h +++ b/net/batman-adv/translation-table.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich, Antonio Quartulli * diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index 3437b667a..1e47fbe8b 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * @@ -22,9 +22,11 @@ #error only "main.h" can be included directly #endif +#include <linux/average.h> #include <linux/bitops.h> #include <linux/compiler.h> #include <linux/if_ether.h> +#include <linux/kref.h> #include <linux/netdevice.h> #include <linux/sched.h> /* for linux/wait.h */ #include <linux/spinlock.h> @@ -73,7 +75,7 @@ enum batadv_dhcp_recipient { #define BATADV_TT_SYNC_MASK 0x00F0 /** - * struct batadv_hard_iface_bat_iv - per hard interface B.A.T.M.A.N. IV data + * struct batadv_hard_iface_bat_iv - per hard-interface B.A.T.M.A.N. IV data * @ogm_buff: buffer holding the OGM packet * @ogm_buff_len: length of the OGM packet buffer * @ogm_seqno: OGM sequence number - used to identify each OGM @@ -85,6 +87,36 @@ struct batadv_hard_iface_bat_iv { }; /** + * enum batadv_v_hard_iface_flags - interface flags useful to B.A.T.M.A.N. V + * @BATADV_FULL_DUPLEX: tells if the connection over this link is full-duplex + * @BATADV_WARNING_DEFAULT: tells whether we have warned the user that no + * throughput data is available for this interface and that default values are + * assumed. + */ +enum batadv_v_hard_iface_flags { + BATADV_FULL_DUPLEX = BIT(0), + BATADV_WARNING_DEFAULT = BIT(1), +}; + +/** + * struct batadv_hard_iface_bat_v - per hard-interface B.A.T.M.A.N. V data + * @elp_interval: time interval between two ELP transmissions + * @elp_seqno: current ELP sequence number + * @elp_skb: base skb containing the ELP message to send + * @elp_wq: workqueue used to schedule ELP transmissions + * @throughput_override: throughput override to disable link auto-detection + * @flags: interface specific flags + */ +struct batadv_hard_iface_bat_v { + atomic_t elp_interval; + atomic_t elp_seqno; + struct sk_buff *elp_skb; + struct delayed_work elp_wq; + atomic_t throughput_override; + u8 flags; +}; + +/** * struct batadv_hard_iface - network device known to batman-adv * @list: list node for batadv_hardif_list * @if_num: identificator of the interface @@ -97,8 +129,9 @@ struct batadv_hard_iface_bat_iv { * batman-adv for this interface * @soft_iface: the batman-adv interface which uses this network interface * @rcu: struct used for freeing in an RCU-safe manner - * @bat_iv: BATMAN IV specific per hard interface data - * @cleanup_work: work queue callback item for hard interface deinit + * @bat_iv: per hard-interface B.A.T.M.A.N. IV data + * @bat_v: per hard-interface B.A.T.M.A.N. V data + * @cleanup_work: work queue callback item for hard-interface deinit * @debug_dir: dentry for nc subdir in batman-adv directory in debugfs * @neigh_list: list of unique single hop neighbors via this interface * @neigh_list_lock: lock protecting neigh_list @@ -110,11 +143,14 @@ struct batadv_hard_iface { struct net_device *net_dev; u8 num_bcasts; struct kobject *hardif_obj; - atomic_t refcount; + struct kref refcount; struct packet_type batman_adv_ptype; struct net_device *soft_iface; struct rcu_head rcu; struct batadv_hard_iface_bat_iv bat_iv; +#ifdef CONFIG_BATMAN_ADV_BATMAN_V + struct batadv_hard_iface_bat_v bat_v; +#endif struct work_struct cleanup_work; struct dentry *debug_dir; struct hlist_head neigh_list; @@ -125,10 +161,11 @@ struct batadv_hard_iface { /** * struct batadv_orig_ifinfo - originator info per outgoing interface * @list: list node for orig_node::ifinfo_list - * @if_outgoing: pointer to outgoing hard interface + * @if_outgoing: pointer to outgoing hard-interface * @router: router that should be used to reach this originator * @last_real_seqno: last and best known sequence number * @last_ttl: ttl of last received packet + * @last_seqno_forwarded: seqno of the OGM which was forwarded last * @batman_seqno_reset: time when the batman seqno window was reset * @refcount: number of contexts the object is used * @rcu: struct used for freeing in an RCU-safe manner @@ -139,8 +176,9 @@ struct batadv_orig_ifinfo { struct batadv_neigh_node __rcu *router; /* rcu protected pointer */ u32 last_real_seqno; u8 last_ttl; + u32 last_seqno_forwarded; unsigned long batman_seqno_reset; - atomic_t refcount; + struct kref refcount; struct rcu_head rcu; }; @@ -196,13 +234,13 @@ struct batadv_orig_node_vlan { unsigned short vid; struct batadv_vlan_tt tt; struct hlist_node list; - atomic_t refcount; + struct kref refcount; struct rcu_head rcu; }; /** * struct batadv_orig_bat_iv - B.A.T.M.A.N. IV private orig_node members - * @bcast_own: set of bitfields (one per hard interface) where each one counts + * @bcast_own: set of bitfields (one per hard-interface) where each one counts * the number of our OGMs this orig_node rebroadcasted "back" to us (relative * to last_real_seqno). Every bitfield is BATADV_TQ_LOCAL_WINDOW_SIZE bits long. * @bcast_own_sum: sum of bcast_own @@ -298,7 +336,7 @@ struct batadv_orig_node { struct batadv_priv *bat_priv; /* bcast_seqno_lock protects: bcast_bits & last_bcast_seqno */ spinlock_t bcast_seqno_lock; - atomic_t refcount; + struct kref refcount; struct rcu_head rcu; #ifdef CONFIG_BATMAN_ADV_NC struct list_head in_coding_list; @@ -341,15 +379,36 @@ struct batadv_gw_node { struct batadv_orig_node *orig_node; u32 bandwidth_down; u32 bandwidth_up; - atomic_t refcount; + struct kref refcount; struct rcu_head rcu; }; +DECLARE_EWMA(throughput, 1024, 8) + +/** + * struct batadv_hardif_neigh_node_bat_v - B.A.T.M.A.N. V private neighbor + * information + * @throughput: ewma link throughput towards this neighbor + * @elp_interval: time interval between two ELP transmissions + * @elp_latest_seqno: latest and best known ELP sequence number + * @last_unicast_tx: when the last unicast packet has been sent to this neighbor + * @metric_work: work queue callback item for metric update + */ +struct batadv_hardif_neigh_node_bat_v { + struct ewma_throughput throughput; + u32 elp_interval; + u32 elp_latest_seqno; + unsigned long last_unicast_tx; + struct work_struct metric_work; +}; + /** - * batadv_hardif_neigh_node - unique neighbor per hard interface + * struct batadv_hardif_neigh_node - unique neighbor per hard-interface * @list: list node for batadv_hard_iface::neigh_list * @addr: the MAC address of the neighboring interface - * @if_incoming: pointer to incoming hard interface + * @if_incoming: pointer to incoming hard-interface + * @last_seen: when last packet via this neighbor was received + * @bat_v: B.A.T.M.A.N. V private data * @refcount: number of contexts the object is used * @rcu: struct used for freeing in a RCU-safe manner */ @@ -358,7 +417,10 @@ struct batadv_hardif_neigh_node { u8 addr[ETH_ALEN]; struct batadv_hard_iface *if_incoming; unsigned long last_seen; - atomic_t refcount; +#ifdef CONFIG_BATMAN_ADV_BATMAN_V + struct batadv_hardif_neigh_node_bat_v bat_v; +#endif + struct kref refcount; struct rcu_head rcu; }; @@ -369,8 +431,9 @@ struct batadv_hardif_neigh_node { * @addr: the MAC address of the neighboring interface * @ifinfo_list: list for routing metrics per outgoing interface * @ifinfo_lock: lock protecting private ifinfo members and list - * @if_incoming: pointer to incoming hard interface + * @if_incoming: pointer to incoming hard-interface * @last_seen: when last packet via this neighbor was received + * @hardif_neigh: hardif_neigh of this neighbor * @refcount: number of contexts the object is used * @rcu: struct used for freeing in an RCU-safe manner */ @@ -382,13 +445,14 @@ struct batadv_neigh_node { spinlock_t ifinfo_lock; /* protects ifinfo_list and its members */ struct batadv_hard_iface *if_incoming; unsigned long last_seen; - atomic_t refcount; + struct batadv_hardif_neigh_node *hardif_neigh; + struct kref refcount; struct rcu_head rcu; }; /** * struct batadv_neigh_ifinfo_bat_iv - neighbor information per outgoing - * interface for BATMAN IV + * interface for B.A.T.M.A.N. IV * @tq_recv: ring buffer of received TQ values from this neigh node * @tq_index: ring buffer index * @tq_avg: averaged tq of all tq values in the ring buffer (tq_recv) @@ -405,10 +469,22 @@ struct batadv_neigh_ifinfo_bat_iv { }; /** + * struct batadv_neigh_ifinfo_bat_v - neighbor information per outgoing + * interface for B.A.T.M.A.N. V + * @throughput: last throughput metric received from originator via this neigh + * @last_seqno: last sequence number known for this neighbor + */ +struct batadv_neigh_ifinfo_bat_v { + u32 throughput; + u32 last_seqno; +}; + +/** * struct batadv_neigh_ifinfo - neighbor information per outgoing interface * @list: list node for batadv_neigh_node::ifinfo_list - * @if_outgoing: pointer to outgoing hard interface + * @if_outgoing: pointer to outgoing hard-interface * @bat_iv: B.A.T.M.A.N. IV private structure + * @bat_v: B.A.T.M.A.N. V private data * @last_ttl: last received ttl from this neigh node * @refcount: number of contexts the object is used * @rcu: struct used for freeing in a RCU-safe manner @@ -417,8 +493,11 @@ struct batadv_neigh_ifinfo { struct hlist_node list; struct batadv_hard_iface *if_outgoing; struct batadv_neigh_ifinfo_bat_iv bat_iv; +#ifdef CONFIG_BATMAN_ADV_BATMAN_V + struct batadv_neigh_ifinfo_bat_v bat_v; +#endif u8 last_ttl; - atomic_t refcount; + struct kref refcount; struct rcu_head rcu; }; @@ -744,11 +823,25 @@ struct batadv_softif_vlan { atomic_t ap_isolation; /* boolean */ struct batadv_vlan_tt tt; struct hlist_node list; - atomic_t refcount; + struct kref refcount; struct rcu_head rcu; }; /** + * struct batadv_priv_bat_v - B.A.T.M.A.N. V per soft-interface private data + * @ogm_buff: buffer holding the OGM packet + * @ogm_buff_len: length of the OGM packet buffer + * @ogm_seqno: OGM sequence number - used to identify each OGM + * @ogm_wq: workqueue used to schedule OGM transmissions + */ +struct batadv_priv_bat_v { + unsigned char *ogm_buff; + int ogm_buff_len; + atomic_t ogm_seqno; + struct delayed_work ogm_wq; +}; + +/** * struct batadv_priv - per mesh interface data * @mesh_state: current status of the mesh (inactive/active/deactivating) * @soft_iface: net device which holds this struct as private data @@ -771,6 +864,9 @@ struct batadv_softif_vlan { * @orig_interval: OGM broadcast interval in milliseconds * @hop_penalty: penalty which will be applied to an OGM's tq-field on every hop * @log_level: configured log level (see batadv_dbg_level) + * @isolation_mark: the skb->mark value used to match packets for AP isolation + * @isolation_mark_mask: bitmask identifying the bits in skb->mark to be used + * for the isolation mark * @bcast_seqno: last sent broadcast packet sequence number * @bcast_queue_left: number of remaining buffered broadcast packet slots * @batman_queue_left: number of remaining OGM packet slots @@ -783,8 +879,8 @@ struct batadv_softif_vlan { * @forw_bat_list_lock: lock protecting forw_bat_list * @forw_bcast_list_lock: lock protecting forw_bcast_list * @orig_work: work queue callback item for orig node purging - * @cleanup_work: work queue callback item for soft interface deinit - * @primary_if: one of the hard interfaces assigned to this mesh interface + * @cleanup_work: work queue callback item for soft-interface deinit + * @primary_if: one of the hard-interfaces assigned to this mesh interface * becomes the primary interface * @bat_algo_ops: routing algorithm used by this mesh interface * @softif_vlan_list: a list of softif_vlan structs, one per VLAN created on top @@ -799,6 +895,7 @@ struct batadv_softif_vlan { * @mcast: multicast data * @network_coding: bool indicating whether network coding is enabled * @nc: network coding data + * @bat_v: B.A.T.M.A.N. V per soft-interface private data */ struct batadv_priv { atomic_t mesh_state; @@ -864,6 +961,9 @@ struct batadv_priv { atomic_t network_coding; struct batadv_priv_nc nc; #endif /* CONFIG_BATMAN_ADV_NC */ +#ifdef CONFIG_BATMAN_ADV_BATMAN_V + struct batadv_priv_bat_v bat_v; +#endif }; /** @@ -925,7 +1025,7 @@ struct batadv_bla_backbone_gw { atomic_t request_sent; u16 crc; spinlock_t crc_lock; /* protects crc */ - atomic_t refcount; + struct kref refcount; struct rcu_head rcu; }; @@ -946,7 +1046,7 @@ struct batadv_bla_claim { unsigned long lasttime; struct hlist_node hash_entry; struct rcu_head rcu; - atomic_t refcount; + struct kref refcount; }; #endif @@ -967,7 +1067,7 @@ struct batadv_tt_common_entry { struct hlist_node hash_entry; u16 flags; unsigned long added_at; - atomic_t refcount; + struct kref refcount; struct rcu_head rcu; }; @@ -975,10 +1075,12 @@ struct batadv_tt_common_entry { * struct batadv_tt_local_entry - translation table local entry data * @common: general translation table data * @last_seen: timestamp used for purging stale tt local entries + * @vlan: soft-interface vlan of the entry */ struct batadv_tt_local_entry { struct batadv_tt_common_entry common; unsigned long last_seen; + struct batadv_softif_vlan *vlan; }; /** @@ -1009,7 +1111,7 @@ struct batadv_tt_orig_list_entry { struct batadv_orig_node *orig_node; u8 ttvn; struct hlist_node list; - atomic_t refcount; + struct kref refcount; struct rcu_head rcu; }; @@ -1062,7 +1164,7 @@ struct batadv_tt_roam_node { struct batadv_nc_node { struct list_head list; u8 addr[ETH_ALEN]; - atomic_t refcount; + struct kref refcount; struct rcu_head rcu; struct batadv_orig_node *orig_node; unsigned long last_seen; @@ -1082,7 +1184,7 @@ struct batadv_nc_node { struct batadv_nc_path { struct hlist_node hash_entry; struct rcu_head rcu; - atomic_t refcount; + struct kref refcount; struct list_head packet_list; spinlock_t packet_list_lock; /* Protects packet_list */ u8 next_hop[ETH_ALEN]; @@ -1152,6 +1254,8 @@ struct batadv_forw_packet { * struct batadv_algo_ops - mesh algorithm callbacks * @list: list node for the batadv_algo_list * @name: name of the algorithm + * @bat_iface_activate: start routing mechanisms when hard-interface is brought + * up * @bat_iface_enable: init routing info when hard-interface is enabled * @bat_iface_disable: de-init routing info when hard-interface is disabled * @bat_iface_update_mac: (re-)init mac addresses of the protocol information @@ -1179,6 +1283,7 @@ struct batadv_forw_packet { struct batadv_algo_ops { struct hlist_node list; char *name; + void (*bat_iface_activate)(struct batadv_hard_iface *hard_iface); int (*bat_iface_enable)(struct batadv_hard_iface *hard_iface); void (*bat_iface_disable)(struct batadv_hard_iface *hard_iface); void (*bat_iface_update_mac)(struct batadv_hard_iface *hard_iface); @@ -1225,7 +1330,7 @@ struct batadv_dat_entry { unsigned short vid; unsigned long last_update; struct hlist_node hash_entry; - atomic_t refcount; + struct kref refcount; struct rcu_head rcu; }; @@ -1261,7 +1366,7 @@ struct batadv_dat_candidate { struct batadv_tvlv_container { struct hlist_node list; struct batadv_tvlv_hdr tvlv_hdr; - atomic_t refcount; + struct kref refcount; }; /** @@ -1288,7 +1393,7 @@ struct batadv_tvlv_handler { u8 type; u8 version; u8 flags; - atomic_t refcount; + struct kref refcount; struct rcu_head rcu; }; diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c index 8a4cc2f7f..e45cb9155 100644 --- a/net/bluetooth/6lowpan.c +++ b/net/bluetooth/6lowpan.c @@ -431,15 +431,18 @@ static int setup_header(struct sk_buff *skb, struct net_device *netdev, bdaddr_t *peer_addr, u8 *peer_addr_type) { struct in6_addr ipv6_daddr; + struct ipv6hdr *hdr; struct lowpan_dev *dev; struct lowpan_peer *peer; bdaddr_t addr, *any = BDADDR_ANY; u8 *daddr = any->b; int err, status = 0; + hdr = ipv6_hdr(skb); + dev = lowpan_dev(netdev); - memcpy(&ipv6_daddr, &lowpan_cb(skb)->addr, sizeof(ipv6_daddr)); + memcpy(&ipv6_daddr, &hdr->daddr, sizeof(ipv6_daddr)); if (ipv6_addr_is_multicast(&ipv6_daddr)) { lowpan_cb(skb)->chan = NULL; @@ -489,15 +492,9 @@ static int header_create(struct sk_buff *skb, struct net_device *netdev, unsigned short type, const void *_daddr, const void *_saddr, unsigned int len) { - struct ipv6hdr *hdr; - if (type != ETH_P_IPV6) return -EINVAL; - hdr = ipv6_hdr(skb); - - memcpy(&lowpan_cb(skb)->addr, &hdr->daddr, sizeof(struct in6_addr)); - return 0; } diff --git a/net/bluetooth/Kconfig b/net/bluetooth/Kconfig index 95d1a66ba..06c31b9a6 100644 --- a/net/bluetooth/Kconfig +++ b/net/bluetooth/Kconfig @@ -69,6 +69,15 @@ config BT_6LOWPAN help IPv6 compression over Bluetooth Low Energy. +config BT_LEDS + bool "Enable LED triggers" + depends on BT + depends on LEDS_CLASS + select LEDS_TRIGGERS + help + This option selects a few LED triggers for different + Bluetooth events. + config BT_SELFTEST bool "Bluetooth self testing support" depends on BT && DEBUG_KERNEL diff --git a/net/bluetooth/Makefile b/net/bluetooth/Makefile index 2b15ae8c1..b3ff12eb9 100644 --- a/net/bluetooth/Makefile +++ b/net/bluetooth/Makefile @@ -17,6 +17,7 @@ bluetooth-y := af_bluetooth.o hci_core.o hci_conn.o hci_event.o mgmt.o \ bluetooth-$(CONFIG_BT_BREDR) += sco.o bluetooth-$(CONFIG_BT_HS) += a2mp.o amp.o +bluetooth-$(CONFIG_BT_LEDS) += leds.o bluetooth-$(CONFIG_BT_DEBUGFS) += hci_debugfs.o bluetooth-$(CONFIG_BT_SELFTEST) += selftest.o diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 32575b49f..bf9f8a801 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -719,6 +719,13 @@ done: hci_dev_unlock(hdev); } +static bool conn_use_rpa(struct hci_conn *conn) +{ + struct hci_dev *hdev = conn->hdev; + + return hci_dev_test_flag(hdev, HCI_PRIVACY); +} + static void hci_req_add_le_create_conn(struct hci_request *req, struct hci_conn *conn) { @@ -726,14 +733,15 @@ static void hci_req_add_le_create_conn(struct hci_request *req, struct hci_dev *hdev = conn->hdev; u8 own_addr_type; - memset(&cp, 0, sizeof(cp)); - /* Update random address, but set require_privacy to false so * that we never connect with an non-resolvable address. */ - if (hci_update_random_address(req, false, &own_addr_type)) + if (hci_update_random_address(req, false, conn_use_rpa(conn), + &own_addr_type)) return; + memset(&cp, 0, sizeof(cp)); + /* Set window to be the same value as the interval to enable * continuous scanning. */ @@ -774,7 +782,8 @@ static void hci_req_directed_advertising(struct hci_request *req, /* Set require_privacy to false so that the remote device has a * chance of identifying us. */ - if (hci_update_random_address(req, false, &own_addr_type) < 0) + if (hci_update_random_address(req, false, conn_use_rpa(conn), + &own_addr_type) < 0) return; memset(&cp, 0, sizeof(cp)); diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 883c821a9..2713fc86e 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -40,6 +40,7 @@ #include "hci_request.h" #include "hci_debugfs.h" #include "smp.h" +#include "leds.h" static void hci_rx_work(struct work_struct *work); static void hci_cmd_work(struct work_struct *work); @@ -1395,6 +1396,7 @@ static int hci_dev_do_open(struct hci_dev *hdev) hci_dev_set_flag(hdev, HCI_RPA_EXPIRED); set_bit(HCI_UP, &hdev->flags); hci_sock_dev_event(hdev, HCI_DEV_UP); + hci_leds_update_powered(hdev, true); if (!hci_dev_test_flag(hdev, HCI_SETUP) && !hci_dev_test_flag(hdev, HCI_CONFIG) && !hci_dev_test_flag(hdev, HCI_UNCONFIGURED) && @@ -1532,6 +1534,8 @@ int hci_dev_do_close(struct hci_dev *hdev) return 0; } + hci_leds_update_powered(hdev, false); + /* Flush RX and TX works */ flush_work(&hdev->tx_work); flush_work(&hdev->rx_work); @@ -2017,6 +2021,7 @@ static void hci_power_on(struct work_struct *work) if (test_bit(HCI_UP, &hdev->flags) && hci_dev_test_flag(hdev, HCI_MGMT) && hci_dev_test_and_clear_flag(hdev, HCI_AUTO_OFF)) { + cancel_delayed_work(&hdev->power_off); hci_req_sync_lock(hdev); err = __hci_req_hci_power_on(hdev); hci_req_sync_unlock(hdev); @@ -3067,6 +3072,8 @@ int hci_register_dev(struct hci_dev *hdev) if (error < 0) goto err_wqueue; + hci_leds_init(hdev); + hdev->rfkill = rfkill_alloc(hdev->name, &hdev->dev, RFKILL_TYPE_BLUETOOTH, &hci_rfkill_ops, hdev); diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c index c78ee2dc9..6e125d76d 100644 --- a/net/bluetooth/hci_request.c +++ b/net/bluetooth/hci_request.c @@ -771,6 +771,11 @@ static u8 update_white_list(struct hci_request *req) return 0x01; } +static bool scan_use_rpa(struct hci_dev *hdev) +{ + return hci_dev_test_flag(hdev, HCI_PRIVACY); +} + void hci_req_add_le_passive_scan(struct hci_request *req) { struct hci_cp_le_set_scan_param param_cp; @@ -785,7 +790,8 @@ void hci_req_add_le_passive_scan(struct hci_request *req) * advertising with our address will be correctly reported * by the controller. */ - if (hci_update_random_address(req, false, &own_addr_type)) + if (hci_update_random_address(req, false, scan_use_rpa(hdev), + &own_addr_type)) return; /* Adding or removing entries from the white list must @@ -866,6 +872,11 @@ static u32 get_adv_instance_flags(struct hci_dev *hdev, u8 instance) if (hci_dev_test_flag(hdev, HCI_ADVERTISING_CONNECTABLE)) flags |= MGMT_ADV_FLAG_CONNECTABLE; + if (hci_dev_test_flag(hdev, HCI_LIMITED_DISCOVERABLE)) + flags |= MGMT_ADV_FLAG_LIMITED_DISCOV; + else if (hci_dev_test_flag(hdev, HCI_DISCOVERABLE)) + flags |= MGMT_ADV_FLAG_DISCOV; + return flags; } @@ -878,6 +889,29 @@ static u32 get_adv_instance_flags(struct hci_dev *hdev, u8 instance) return adv_instance->flags; } +static bool adv_use_rpa(struct hci_dev *hdev, uint32_t flags) +{ + /* If privacy is not enabled don't use RPA */ + if (!hci_dev_test_flag(hdev, HCI_PRIVACY)) + return false; + + /* If basic privacy mode is enabled use RPA */ + if (!hci_dev_test_flag(hdev, HCI_LIMITED_PRIVACY)) + return true; + + /* If limited privacy mode is enabled don't use RPA if we're + * both discoverable and bondable. + */ + if ((flags & MGMT_ADV_FLAG_DISCOV) && + hci_dev_test_flag(hdev, HCI_BONDABLE)) + return false; + + /* We're neither bondable nor discoverable in the limited + * privacy mode, therefore use RPA. + */ + return true; +} + void __hci_req_enable_advertising(struct hci_request *req) { struct hci_dev *hdev = req->hdev; @@ -911,7 +945,9 @@ void __hci_req_enable_advertising(struct hci_request *req) * advertising is used. In that case it is fine to use a * non-resolvable private address. */ - if (hci_update_random_address(req, !connectable, &own_addr_type) < 0) + if (hci_update_random_address(req, !connectable, + adv_use_rpa(hdev, flags), + &own_addr_type) < 0) return; memset(&cp, 0, sizeof(cp)); @@ -1325,7 +1361,7 @@ static void set_random_addr(struct hci_request *req, bdaddr_t *rpa) } int hci_update_random_address(struct hci_request *req, bool require_privacy, - u8 *own_addr_type) + bool use_rpa, u8 *own_addr_type) { struct hci_dev *hdev = req->hdev; int err; @@ -1334,7 +1370,7 @@ int hci_update_random_address(struct hci_request *req, bool require_privacy, * current RPA has expired or there is something else than * the current RPA in use, then generate a new one. */ - if (hci_dev_test_flag(hdev, HCI_PRIVACY)) { + if (use_rpa) { int to; *own_addr_type = ADDR_LE_DEV_RANDOM; @@ -1596,9 +1632,16 @@ static int discoverable_update(struct hci_request *req, unsigned long opt) /* Advertising instances don't use the global discoverable setting, so * only update AD if advertising was enabled using Set Advertising. */ - if (hci_dev_test_flag(hdev, HCI_ADVERTISING)) + if (hci_dev_test_flag(hdev, HCI_ADVERTISING)) { __hci_req_update_adv_data(req, 0x00); + /* Discoverable mode affects the local advertising + * address in limited privacy mode. + */ + if (hci_dev_test_flag(hdev, HCI_LIMITED_PRIVACY)) + __hci_req_enable_advertising(req); + } + hci_dev_unlock(hdev); return 0; @@ -1941,7 +1984,8 @@ static int active_scan(struct hci_request *req, unsigned long opt) * address (when privacy feature has been enabled) or non-resolvable * private address. */ - err = hci_update_random_address(req, true, &own_addr_type); + err = hci_update_random_address(req, true, scan_use_rpa(hdev), + &own_addr_type); if (err < 0) own_addr_type = ADDR_LE_DEV_PUBLIC; diff --git a/net/bluetooth/hci_request.h b/net/bluetooth/hci_request.h index 64ff8c040..b2d044bdc 100644 --- a/net/bluetooth/hci_request.h +++ b/net/bluetooth/hci_request.h @@ -89,7 +89,7 @@ static inline void hci_req_update_scan(struct hci_dev *hdev) void __hci_req_update_scan(struct hci_request *req); int hci_update_random_address(struct hci_request *req, bool require_privacy, - u8 *own_addr_type); + bool use_rpa, u8 *own_addr_type); int hci_abort_conn(struct hci_conn *conn, u8 reason); void __hci_abort_conn(struct hci_request *req, struct hci_conn *conn, diff --git a/net/bluetooth/leds.c b/net/bluetooth/leds.c new file mode 100644 index 000000000..8319c8440 --- /dev/null +++ b/net/bluetooth/leds.c @@ -0,0 +1,74 @@ +/* + * Copyright 2015, Heiner Kallweit <hkallweit1@gmail.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <net/bluetooth/bluetooth.h> +#include <net/bluetooth/hci_core.h> + +#include "leds.h" + +struct hci_basic_led_trigger { + struct led_trigger led_trigger; + struct hci_dev *hdev; +}; + +#define to_hci_basic_led_trigger(arg) container_of(arg, \ + struct hci_basic_led_trigger, led_trigger) + +void hci_leds_update_powered(struct hci_dev *hdev, bool enabled) +{ + if (hdev->power_led) + led_trigger_event(hdev->power_led, + enabled ? LED_FULL : LED_OFF); +} + +static void power_activate(struct led_classdev *led_cdev) +{ + struct hci_basic_led_trigger *htrig; + bool powered; + + htrig = to_hci_basic_led_trigger(led_cdev->trigger); + powered = test_bit(HCI_UP, &htrig->hdev->flags); + + led_trigger_event(led_cdev->trigger, powered ? LED_FULL : LED_OFF); +} + +static struct led_trigger *led_allocate_basic(struct hci_dev *hdev, + void (*activate)(struct led_classdev *led_cdev), + const char *name) +{ + struct hci_basic_led_trigger *htrig; + + htrig = devm_kzalloc(&hdev->dev, sizeof(*htrig), GFP_KERNEL); + if (!htrig) + return NULL; + + htrig->hdev = hdev; + htrig->led_trigger.activate = activate; + htrig->led_trigger.name = devm_kasprintf(&hdev->dev, GFP_KERNEL, + "%s-%s", hdev->name, + name); + if (!htrig->led_trigger.name) + goto err_alloc; + + if (devm_led_trigger_register(&hdev->dev, &htrig->led_trigger)) + goto err_register; + + return &htrig->led_trigger; + +err_register: + devm_kfree(&hdev->dev, (void *)htrig->led_trigger.name); +err_alloc: + devm_kfree(&hdev->dev, htrig); + return NULL; +} + +void hci_leds_init(struct hci_dev *hdev) +{ + /* initialize power_led */ + hdev->power_led = led_allocate_basic(hdev, power_activate, "power"); +} diff --git a/net/bluetooth/leds.h b/net/bluetooth/leds.h new file mode 100644 index 000000000..a9c4d6ea0 --- /dev/null +++ b/net/bluetooth/leds.h @@ -0,0 +1,16 @@ +/* + * Copyright 2015, Heiner Kallweit <hkallweit1@gmail.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#if IS_ENABLED(CONFIG_BT_LEDS) +void hci_leds_update_powered(struct hci_dev *hdev, bool enabled); +void hci_leds_init(struct hci_dev *hdev); +#else +static inline void hci_leds_update_powered(struct hci_dev *hdev, + bool enabled) {} +static inline void hci_leds_init(struct hci_dev *hdev) {} +#endif diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 1363b8ffd..9e4b93158 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -38,7 +38,7 @@ #include "mgmt_util.h" #define MGMT_VERSION 1 -#define MGMT_REVISION 11 +#define MGMT_REVISION 12 static const u16 mgmt_commands[] = { MGMT_OP_READ_INDEX_LIST, @@ -1382,8 +1382,19 @@ static int set_bondable(struct sock *sk, struct hci_dev *hdev, void *data, if (err < 0) goto unlock; - if (changed) + if (changed) { + /* In limited privacy mode the change of bondable mode + * may affect the local advertising address. + */ + if (hdev_is_powered(hdev) && + hci_dev_test_flag(hdev, HCI_ADVERTISING) && + hci_dev_test_flag(hdev, HCI_DISCOVERABLE) && + hci_dev_test_flag(hdev, HCI_LIMITED_PRIVACY)) + queue_work(hdev->req_workqueue, + &hdev->discoverable_update); + err = new_settings(hdev, sk); + } unlock: hci_dev_unlock(hdev); @@ -4423,7 +4434,7 @@ static int set_privacy(struct sock *sk, struct hci_dev *hdev, void *cp_data, return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_PRIVACY, MGMT_STATUS_NOT_SUPPORTED); - if (cp->privacy != 0x00 && cp->privacy != 0x01) + if (cp->privacy != 0x00 && cp->privacy != 0x01 && cp->privacy != 0x02) return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_PRIVACY, MGMT_STATUS_INVALID_PARAMS); @@ -4442,10 +4453,15 @@ static int set_privacy(struct sock *sk, struct hci_dev *hdev, void *cp_data, changed = !hci_dev_test_and_set_flag(hdev, HCI_PRIVACY); memcpy(hdev->irk, cp->irk, sizeof(hdev->irk)); hci_dev_set_flag(hdev, HCI_RPA_EXPIRED); + if (cp->privacy == 0x02) + hci_dev_set_flag(hdev, HCI_LIMITED_PRIVACY); + else + hci_dev_clear_flag(hdev, HCI_LIMITED_PRIVACY); } else { changed = hci_dev_test_and_clear_flag(hdev, HCI_PRIVACY); memset(hdev->irk, 0, sizeof(hdev->irk)); hci_dev_clear_flag(hdev, HCI_RPA_EXPIRED); + hci_dev_clear_flag(hdev, HCI_LIMITED_PRIVACY); } err = send_settings_rsp(sk, MGMT_OP_SET_PRIVACY, hdev); diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index 4b175df35..50976a648 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -21,9 +21,10 @@ */ #include <linux/debugfs.h> -#include <linux/crypto.h> #include <linux/scatterlist.h> #include <crypto/b128ops.h> +#include <crypto/hash.h> +#include <crypto/skcipher.h> #include <net/bluetooth/bluetooth.h> #include <net/bluetooth/hci_core.h> @@ -87,8 +88,8 @@ struct smp_dev { u8 min_key_size; u8 max_key_size; - struct crypto_blkcipher *tfm_aes; - struct crypto_hash *tfm_cmac; + struct crypto_skcipher *tfm_aes; + struct crypto_shash *tfm_cmac; }; struct smp_chan { @@ -126,8 +127,8 @@ struct smp_chan { u8 dhkey[32]; u8 mackey[16]; - struct crypto_blkcipher *tfm_aes; - struct crypto_hash *tfm_cmac; + struct crypto_skcipher *tfm_aes; + struct crypto_shash *tfm_cmac; }; /* These debug key values are defined in the SMP section of the core @@ -165,12 +166,11 @@ static inline void swap_buf(const u8 *src, u8 *dst, size_t len) * AES-CMAC, f4, f5, f6, g2 and h6. */ -static int aes_cmac(struct crypto_hash *tfm, const u8 k[16], const u8 *m, +static int aes_cmac(struct crypto_shash *tfm, const u8 k[16], const u8 *m, size_t len, u8 mac[16]) { uint8_t tmp[16], mac_msb[16], msg_msb[CMAC_MSG_MAX]; - struct hash_desc desc; - struct scatterlist sg; + SHASH_DESC_ON_STACK(desc, tfm); int err; if (len > CMAC_MSG_MAX) @@ -181,10 +181,8 @@ static int aes_cmac(struct crypto_hash *tfm, const u8 k[16], const u8 *m, return -EINVAL; } - desc.tfm = tfm; - desc.flags = 0; - - crypto_hash_init(&desc); + desc->tfm = tfm; + desc->flags = 0; /* Swap key and message from LSB to MSB */ swap_buf(k, tmp, 16); @@ -193,23 +191,16 @@ static int aes_cmac(struct crypto_hash *tfm, const u8 k[16], const u8 *m, SMP_DBG("msg (len %zu) %*phN", len, (int) len, m); SMP_DBG("key %16phN", k); - err = crypto_hash_setkey(tfm, tmp, 16); + err = crypto_shash_setkey(tfm, tmp, 16); if (err) { BT_ERR("cipher setkey failed: %d", err); return err; } - sg_init_one(&sg, msg_msb, len); - - err = crypto_hash_update(&desc, &sg, len); + err = crypto_shash_digest(desc, msg_msb, len, mac_msb); + shash_desc_zero(desc); if (err) { - BT_ERR("Hash update error %d", err); - return err; - } - - err = crypto_hash_final(&desc, mac_msb); - if (err) { - BT_ERR("Hash final error %d", err); + BT_ERR("Hash computation error %d", err); return err; } @@ -220,8 +211,8 @@ static int aes_cmac(struct crypto_hash *tfm, const u8 k[16], const u8 *m, return 0; } -static int smp_f4(struct crypto_hash *tfm_cmac, const u8 u[32], const u8 v[32], - const u8 x[16], u8 z, u8 res[16]) +static int smp_f4(struct crypto_shash *tfm_cmac, const u8 u[32], + const u8 v[32], const u8 x[16], u8 z, u8 res[16]) { u8 m[65]; int err; @@ -243,7 +234,7 @@ static int smp_f4(struct crypto_hash *tfm_cmac, const u8 u[32], const u8 v[32], return err; } -static int smp_f5(struct crypto_hash *tfm_cmac, const u8 w[32], +static int smp_f5(struct crypto_shash *tfm_cmac, const u8 w[32], const u8 n1[16], const u8 n2[16], const u8 a1[7], const u8 a2[7], u8 mackey[16], u8 ltk[16]) { @@ -296,7 +287,7 @@ static int smp_f5(struct crypto_hash *tfm_cmac, const u8 w[32], return 0; } -static int smp_f6(struct crypto_hash *tfm_cmac, const u8 w[16], +static int smp_f6(struct crypto_shash *tfm_cmac, const u8 w[16], const u8 n1[16], const u8 n2[16], const u8 r[16], const u8 io_cap[3], const u8 a1[7], const u8 a2[7], u8 res[16]) @@ -324,7 +315,7 @@ static int smp_f6(struct crypto_hash *tfm_cmac, const u8 w[16], return err; } -static int smp_g2(struct crypto_hash *tfm_cmac, const u8 u[32], const u8 v[32], +static int smp_g2(struct crypto_shash *tfm_cmac, const u8 u[32], const u8 v[32], const u8 x[16], const u8 y[16], u32 *val) { u8 m[80], tmp[16]; @@ -350,7 +341,7 @@ static int smp_g2(struct crypto_hash *tfm_cmac, const u8 u[32], const u8 v[32], return 0; } -static int smp_h6(struct crypto_hash *tfm_cmac, const u8 w[16], +static int smp_h6(struct crypto_shash *tfm_cmac, const u8 w[16], const u8 key_id[4], u8 res[16]) { int err; @@ -370,9 +361,9 @@ static int smp_h6(struct crypto_hash *tfm_cmac, const u8 w[16], * s1 and ah. */ -static int smp_e(struct crypto_blkcipher *tfm, const u8 *k, u8 *r) +static int smp_e(struct crypto_skcipher *tfm, const u8 *k, u8 *r) { - struct blkcipher_desc desc; + SKCIPHER_REQUEST_ON_STACK(req, tfm); struct scatterlist sg; uint8_t tmp[16], data[16]; int err; @@ -384,13 +375,10 @@ static int smp_e(struct crypto_blkcipher *tfm, const u8 *k, u8 *r) return -EINVAL; } - desc.tfm = tfm; - desc.flags = 0; - /* The most significant octet of key corresponds to k[0] */ swap_buf(k, tmp, 16); - err = crypto_blkcipher_setkey(tfm, tmp, 16); + err = crypto_skcipher_setkey(tfm, tmp, 16); if (err) { BT_ERR("cipher setkey failed: %d", err); return err; @@ -401,7 +389,12 @@ static int smp_e(struct crypto_blkcipher *tfm, const u8 *k, u8 *r) sg_init_one(&sg, data, 16); - err = crypto_blkcipher_encrypt(&desc, &sg, &sg, 16); + skcipher_request_set_tfm(req, tfm); + skcipher_request_set_callback(req, 0, NULL, NULL); + skcipher_request_set_crypt(req, &sg, &sg, 16, NULL); + + err = crypto_skcipher_encrypt(req); + skcipher_request_zero(req); if (err) BT_ERR("Encrypt data error %d", err); @@ -413,7 +406,7 @@ static int smp_e(struct crypto_blkcipher *tfm, const u8 *k, u8 *r) return err; } -static int smp_c1(struct crypto_blkcipher *tfm_aes, const u8 k[16], +static int smp_c1(struct crypto_skcipher *tfm_aes, const u8 k[16], const u8 r[16], const u8 preq[7], const u8 pres[7], u8 _iat, const bdaddr_t *ia, u8 _rat, const bdaddr_t *ra, u8 res[16]) { @@ -462,7 +455,7 @@ static int smp_c1(struct crypto_blkcipher *tfm_aes, const u8 k[16], return err; } -static int smp_s1(struct crypto_blkcipher *tfm_aes, const u8 k[16], +static int smp_s1(struct crypto_skcipher *tfm_aes, const u8 k[16], const u8 r1[16], const u8 r2[16], u8 _r[16]) { int err; @@ -478,7 +471,7 @@ static int smp_s1(struct crypto_blkcipher *tfm_aes, const u8 k[16], return err; } -static int smp_ah(struct crypto_blkcipher *tfm, const u8 irk[16], +static int smp_ah(struct crypto_skcipher *tfm, const u8 irk[16], const u8 r[3], u8 res[3]) { u8 _res[16]; @@ -766,8 +759,8 @@ static void smp_chan_destroy(struct l2cap_conn *conn) kzfree(smp->slave_csrk); kzfree(smp->link_key); - crypto_free_blkcipher(smp->tfm_aes); - crypto_free_hash(smp->tfm_cmac); + crypto_free_skcipher(smp->tfm_aes); + crypto_free_shash(smp->tfm_cmac); /* Ensure that we don't leave any debug key around if debug key * support hasn't been explicitly enabled. @@ -1366,17 +1359,17 @@ static struct smp_chan *smp_chan_create(struct l2cap_conn *conn) if (!smp) return NULL; - smp->tfm_aes = crypto_alloc_blkcipher("ecb(aes)", 0, CRYPTO_ALG_ASYNC); + smp->tfm_aes = crypto_alloc_skcipher("ecb(aes)", 0, CRYPTO_ALG_ASYNC); if (IS_ERR(smp->tfm_aes)) { BT_ERR("Unable to create ECB crypto context"); kzfree(smp); return NULL; } - smp->tfm_cmac = crypto_alloc_hash("cmac(aes)", 0, CRYPTO_ALG_ASYNC); + smp->tfm_cmac = crypto_alloc_shash("cmac(aes)", 0, 0); if (IS_ERR(smp->tfm_cmac)) { BT_ERR("Unable to create CMAC crypto context"); - crypto_free_blkcipher(smp->tfm_aes); + crypto_free_skcipher(smp->tfm_aes); kzfree(smp); return NULL; } @@ -3127,8 +3120,8 @@ static struct l2cap_chan *smp_add_cid(struct hci_dev *hdev, u16 cid) { struct l2cap_chan *chan; struct smp_dev *smp; - struct crypto_blkcipher *tfm_aes; - struct crypto_hash *tfm_cmac; + struct crypto_skcipher *tfm_aes; + struct crypto_shash *tfm_cmac; if (cid == L2CAP_CID_SMP_BREDR) { smp = NULL; @@ -3139,17 +3132,17 @@ static struct l2cap_chan *smp_add_cid(struct hci_dev *hdev, u16 cid) if (!smp) return ERR_PTR(-ENOMEM); - tfm_aes = crypto_alloc_blkcipher("ecb(aes)", 0, CRYPTO_ALG_ASYNC); + tfm_aes = crypto_alloc_skcipher("ecb(aes)", 0, CRYPTO_ALG_ASYNC); if (IS_ERR(tfm_aes)) { BT_ERR("Unable to create ECB crypto context"); kzfree(smp); return ERR_CAST(tfm_aes); } - tfm_cmac = crypto_alloc_hash("cmac(aes)", 0, CRYPTO_ALG_ASYNC); + tfm_cmac = crypto_alloc_shash("cmac(aes)", 0, 0); if (IS_ERR(tfm_cmac)) { BT_ERR("Unable to create CMAC crypto context"); - crypto_free_blkcipher(tfm_aes); + crypto_free_skcipher(tfm_aes); kzfree(smp); return ERR_CAST(tfm_cmac); } @@ -3163,8 +3156,8 @@ create_chan: chan = l2cap_chan_create(); if (!chan) { if (smp) { - crypto_free_blkcipher(smp->tfm_aes); - crypto_free_hash(smp->tfm_cmac); + crypto_free_skcipher(smp->tfm_aes); + crypto_free_shash(smp->tfm_cmac); kzfree(smp); } return ERR_PTR(-ENOMEM); @@ -3210,10 +3203,8 @@ static void smp_del_chan(struct l2cap_chan *chan) smp = chan->data; if (smp) { chan->data = NULL; - if (smp->tfm_aes) - crypto_free_blkcipher(smp->tfm_aes); - if (smp->tfm_cmac) - crypto_free_hash(smp->tfm_cmac); + crypto_free_skcipher(smp->tfm_aes); + crypto_free_shash(smp->tfm_cmac); kzfree(smp); } @@ -3449,7 +3440,7 @@ void smp_unregister(struct hci_dev *hdev) #if IS_ENABLED(CONFIG_BT_SELFTEST_SMP) -static int __init test_ah(struct crypto_blkcipher *tfm_aes) +static int __init test_ah(struct crypto_skcipher *tfm_aes) { const u8 irk[16] = { 0x9b, 0x7d, 0x39, 0x0a, 0xa6, 0x10, 0x10, 0x34, @@ -3469,7 +3460,7 @@ static int __init test_ah(struct crypto_blkcipher *tfm_aes) return 0; } -static int __init test_c1(struct crypto_blkcipher *tfm_aes) +static int __init test_c1(struct crypto_skcipher *tfm_aes) { const u8 k[16] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, @@ -3499,7 +3490,7 @@ static int __init test_c1(struct crypto_blkcipher *tfm_aes) return 0; } -static int __init test_s1(struct crypto_blkcipher *tfm_aes) +static int __init test_s1(struct crypto_skcipher *tfm_aes) { const u8 k[16] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, @@ -3524,7 +3515,7 @@ static int __init test_s1(struct crypto_blkcipher *tfm_aes) return 0; } -static int __init test_f4(struct crypto_hash *tfm_cmac) +static int __init test_f4(struct crypto_shash *tfm_cmac) { const u8 u[32] = { 0xe6, 0x9d, 0x35, 0x0e, 0x48, 0x01, 0x03, 0xcc, @@ -3556,7 +3547,7 @@ static int __init test_f4(struct crypto_hash *tfm_cmac) return 0; } -static int __init test_f5(struct crypto_hash *tfm_cmac) +static int __init test_f5(struct crypto_shash *tfm_cmac) { const u8 w[32] = { 0x98, 0xa6, 0xbf, 0x73, 0xf3, 0x34, 0x8d, 0x86, @@ -3593,7 +3584,7 @@ static int __init test_f5(struct crypto_hash *tfm_cmac) return 0; } -static int __init test_f6(struct crypto_hash *tfm_cmac) +static int __init test_f6(struct crypto_shash *tfm_cmac) { const u8 w[16] = { 0x20, 0x6e, 0x63, 0xce, 0x20, 0x6a, 0x3f, 0xfd, @@ -3626,7 +3617,7 @@ static int __init test_f6(struct crypto_hash *tfm_cmac) return 0; } -static int __init test_g2(struct crypto_hash *tfm_cmac) +static int __init test_g2(struct crypto_shash *tfm_cmac) { const u8 u[32] = { 0xe6, 0x9d, 0x35, 0x0e, 0x48, 0x01, 0x03, 0xcc, @@ -3658,7 +3649,7 @@ static int __init test_g2(struct crypto_hash *tfm_cmac) return 0; } -static int __init test_h6(struct crypto_hash *tfm_cmac) +static int __init test_h6(struct crypto_shash *tfm_cmac) { const u8 w[16] = { 0x9b, 0x7d, 0x39, 0x0a, 0xa6, 0x10, 0x10, 0x34, @@ -3695,8 +3686,8 @@ static const struct file_operations test_smp_fops = { .llseek = default_llseek, }; -static int __init run_selftests(struct crypto_blkcipher *tfm_aes, - struct crypto_hash *tfm_cmac) +static int __init run_selftests(struct crypto_skcipher *tfm_aes, + struct crypto_shash *tfm_cmac) { ktime_t calltime, delta, rettime; unsigned long long duration; @@ -3773,27 +3764,27 @@ done: int __init bt_selftest_smp(void) { - struct crypto_blkcipher *tfm_aes; - struct crypto_hash *tfm_cmac; + struct crypto_skcipher *tfm_aes; + struct crypto_shash *tfm_cmac; int err; - tfm_aes = crypto_alloc_blkcipher("ecb(aes)", 0, CRYPTO_ALG_ASYNC); + tfm_aes = crypto_alloc_skcipher("ecb(aes)", 0, CRYPTO_ALG_ASYNC); if (IS_ERR(tfm_aes)) { BT_ERR("Unable to create ECB crypto context"); return PTR_ERR(tfm_aes); } - tfm_cmac = crypto_alloc_hash("cmac(aes)", 0, CRYPTO_ALG_ASYNC); + tfm_cmac = crypto_alloc_shash("cmac(aes)", 0, CRYPTO_ALG_ASYNC); if (IS_ERR(tfm_cmac)) { BT_ERR("Unable to create CMAC crypto context"); - crypto_free_blkcipher(tfm_aes); + crypto_free_skcipher(tfm_aes); return PTR_ERR(tfm_cmac); } err = run_selftests(tfm_aes, tfm_cmac); - crypto_free_hash(tfm_cmac); - crypto_free_blkcipher(tfm_aes); + crypto_free_shash(tfm_cmac); + crypto_free_skcipher(tfm_aes); return err; } diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c index fcdb86dd5..f47759f05 100644 --- a/net/bridge/br_forward.c +++ b/net/bridge/br_forward.c @@ -44,7 +44,6 @@ int br_dev_queue_push_xmit(struct net *net, struct sock *sk, struct sk_buff *skb skb_push(skb, ETH_HLEN); br_drop_fake_rtable(skb); - skb_sender_cpu_clear(skb); if (skb->ip_summed == CHECKSUM_PARTIAL && (skb->protocol == htons(ETH_P_8021Q) || diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index c367b3e1b..8217aecf0 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -36,10 +36,10 @@ */ static int port_cost(struct net_device *dev) { - struct ethtool_cmd ecmd; + struct ethtool_link_ksettings ecmd; - if (!__ethtool_get_settings(dev, &ecmd)) { - switch (ethtool_cmd_speed(&ecmd)) { + if (!__ethtool_get_link_ksettings(dev, &ecmd)) { + switch (ecmd.base.speed) { case SPEED_10000: return 2; case SPEED_1000: @@ -223,6 +223,31 @@ static void destroy_nbp_rcu(struct rcu_head *head) destroy_nbp(p); } +static unsigned get_max_headroom(struct net_bridge *br) +{ + unsigned max_headroom = 0; + struct net_bridge_port *p; + + list_for_each_entry(p, &br->port_list, list) { + unsigned dev_headroom = netdev_get_fwd_headroom(p->dev); + + if (dev_headroom > max_headroom) + max_headroom = dev_headroom; + } + + return max_headroom; +} + +static void update_headroom(struct net_bridge *br, int new_hr) +{ + struct net_bridge_port *p; + + list_for_each_entry(p, &br->port_list, list) + netdev_set_rx_headroom(p->dev, new_hr); + + br->dev->needed_headroom = new_hr; +} + /* Delete port(interface) from bridge is done in two steps. * via RCU. First step, marks device as down. That deletes * all the timers and stops new packets from flowing through. @@ -248,6 +273,9 @@ static void del_nbp(struct net_bridge_port *p) br_ifinfo_notify(RTM_DELLINK, p); list_del_rcu(&p->list); + if (netdev_get_fwd_headroom(dev) == br->dev->needed_headroom) + update_headroom(br, get_max_headroom(br)); + netdev_reset_rx_headroom(dev); nbp_vlan_flush(p); br_fdb_delete_by_port(br, p, 0, 1); @@ -409,6 +437,20 @@ int br_min_mtu(const struct net_bridge *br) return mtu; } +static void br_set_gso_limits(struct net_bridge *br) +{ + unsigned int gso_max_size = GSO_MAX_SIZE; + u16 gso_max_segs = GSO_MAX_SEGS; + const struct net_bridge_port *p; + + list_for_each_entry(p, &br->port_list, list) { + gso_max_size = min(gso_max_size, p->dev->gso_max_size); + gso_max_segs = min(gso_max_segs, p->dev->gso_max_segs); + } + br->dev->gso_max_size = gso_max_size; + br->dev->gso_max_segs = gso_max_segs; +} + /* * Recomputes features using slave's features */ @@ -438,6 +480,7 @@ int br_add_if(struct net_bridge *br, struct net_device *dev) { struct net_bridge_port *p; int err = 0; + unsigned br_hr, dev_hr; bool changed_addr; /* Don't allow bridging non-ethernet like devices, or DSA-enabled @@ -505,8 +548,12 @@ int br_add_if(struct net_bridge *br, struct net_device *dev) netdev_update_features(br->dev); - if (br->dev->needed_headroom < dev->needed_headroom) - br->dev->needed_headroom = dev->needed_headroom; + br_hr = br->dev->needed_headroom; + dev_hr = netdev_get_fwd_headroom(dev); + if (br_hr < dev_hr) + update_headroom(br, dev_hr); + else + netdev_set_rx_headroom(dev, br_hr); if (br_fdb_insert(br, p, dev->dev_addr, 0)) netdev_err(dev, "failed insert local address bridge forwarding table\n"); @@ -531,6 +578,7 @@ int br_add_if(struct net_bridge *br, struct net_device *dev) call_netdevice_notifiers(NETDEV_CHANGEADDR, br->dev); dev_set_mtu(br->dev, br_min_mtu(br)); + br_set_gso_limits(br); kobject_uevent(&p->kobj, KOBJ_ADD); @@ -577,6 +625,7 @@ int br_del_if(struct net_bridge *br, struct net_device *dev) del_nbp(p); dev_set_mtu(br->dev, br_min_mtu(br)); + br_set_gso_limits(br); spin_lock_bh(&br->lock); changed_addr = br_stp_recalculate_bridge_id(br); diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index f7fba7410..160797722 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -222,7 +222,10 @@ static int br_handle_local_finish(struct net *net, struct sock *sk, struct sk_bu /* check if vlan is allowed, to avoid spoofing */ if (p->flags & BR_LEARNING && br_should_learn(p, skb, &vid)) br_fdb_update(p->br, p, eth_hdr(skb)->h_source, vid, false); - return 0; /* process further */ + + BR_INPUT_SKB_CB(skb)->brdev = p->br->dev; + br_pass_frame_up(skb); + return 0; } /* @@ -284,14 +287,9 @@ rx_handler_result_t br_handle_frame(struct sk_buff **pskb) } /* Deliver packet to local host only */ - if (NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_IN, - dev_net(skb->dev), NULL, skb, skb->dev, NULL, - br_handle_local_finish)) { - return RX_HANDLER_CONSUMED; /* consumed by filter */ - } else { - *pskb = skb; - return RX_HANDLER_PASS; /* continue processing */ - } + NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_IN, dev_net(skb->dev), + NULL, skb, skb->dev, NULL, br_handle_local_finish); + return RX_HANDLER_CONSUMED; } forward: diff --git a/net/bridge/br_ioctl.c b/net/bridge/br_ioctl.c index 263b4de4d..60a3dbfca 100644 --- a/net/bridge/br_ioctl.c +++ b/net/bridge/br_ioctl.c @@ -21,18 +21,19 @@ #include <asm/uaccess.h> #include "br_private.h" -/* called with RTNL */ static int get_bridge_ifindices(struct net *net, int *indices, int num) { struct net_device *dev; int i = 0; - for_each_netdev(net, dev) { + rcu_read_lock(); + for_each_netdev_rcu(net, dev) { if (i >= num) break; if (dev->priv_flags & IFF_EBRIDGE) indices[i++] = dev->ifindex; } + rcu_read_unlock(); return i; } diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c index 74c278e00..7dbc80d01 100644 --- a/net/bridge/br_mdb.c +++ b/net/bridge/br_mdb.c @@ -20,7 +20,7 @@ static int br_rports_fill_info(struct sk_buff *skb, struct netlink_callback *cb, { struct net_bridge *br = netdev_priv(dev); struct net_bridge_port *p; - struct nlattr *nest; + struct nlattr *nest, *port_nest; if (!br->multicast_router || hlist_empty(&br->router_list)) return 0; @@ -30,8 +30,20 @@ static int br_rports_fill_info(struct sk_buff *skb, struct netlink_callback *cb, return -EMSGSIZE; hlist_for_each_entry_rcu(p, &br->router_list, rlist) { - if (p && nla_put_u32(skb, MDBA_ROUTER_PORT, p->dev->ifindex)) + if (!p) + continue; + port_nest = nla_nest_start(skb, MDBA_ROUTER_PORT); + if (!port_nest) + goto fail; + if (nla_put_nohdr(skb, sizeof(u32), &p->dev->ifindex) || + nla_put_u32(skb, MDBA_ROUTER_PATTR_TIMER, + br_timer_value(&p->multicast_router_timer)) || + nla_put_u8(skb, MDBA_ROUTER_PATTR_TYPE, + p->multicast_router)) { + nla_nest_cancel(skb, port_nest); goto fail; + } + nla_nest_end(skb, port_nest); } nla_nest_end(skb, nest); @@ -41,6 +53,27 @@ fail: return -EMSGSIZE; } +static void __mdb_entry_fill_flags(struct br_mdb_entry *e, unsigned char flags) +{ + e->state = flags & MDB_PG_FLAGS_PERMANENT; + e->flags = 0; + if (flags & MDB_PG_FLAGS_OFFLOAD) + e->flags |= MDB_FLAGS_OFFLOAD; +} + +static void __mdb_entry_to_br_ip(struct br_mdb_entry *entry, struct br_ip *ip) +{ + memset(ip, 0, sizeof(struct br_ip)); + ip->vid = entry->vid; + ip->proto = entry->addr.proto; + if (ip->proto == htons(ETH_P_IP)) + ip->u.ip4 = entry->addr.u.ip4; +#if IS_ENABLED(CONFIG_IPV6) + else + ip->u.ip6 = entry->addr.u.ip6; +#endif +} + static int br_mdb_fill_info(struct sk_buff *skb, struct netlink_callback *cb, struct net_device *dev) { @@ -80,26 +113,41 @@ static int br_mdb_fill_info(struct sk_buff *skb, struct netlink_callback *cb, for (pp = &mp->ports; (p = rcu_dereference(*pp)) != NULL; pp = &p->next) { + struct nlattr *nest_ent; + struct br_mdb_entry e; + port = p->port; - if (port) { - struct br_mdb_entry e; - memset(&e, 0, sizeof(e)); - e.ifindex = port->dev->ifindex; - e.state = p->state; - e.vid = p->addr.vid; - if (p->addr.proto == htons(ETH_P_IP)) - e.addr.u.ip4 = p->addr.u.ip4; + if (!port) + continue; + + memset(&e, 0, sizeof(e)); + e.ifindex = port->dev->ifindex; + e.vid = p->addr.vid; + __mdb_entry_fill_flags(&e, p->flags); + if (p->addr.proto == htons(ETH_P_IP)) + e.addr.u.ip4 = p->addr.u.ip4; #if IS_ENABLED(CONFIG_IPV6) - if (p->addr.proto == htons(ETH_P_IPV6)) - e.addr.u.ip6 = p->addr.u.ip6; + if (p->addr.proto == htons(ETH_P_IPV6)) + e.addr.u.ip6 = p->addr.u.ip6; #endif - e.addr.proto = p->addr.proto; - if (nla_put(skb, MDBA_MDB_ENTRY_INFO, sizeof(e), &e)) { - nla_nest_cancel(skb, nest2); - err = -EMSGSIZE; - goto out; - } + e.addr.proto = p->addr.proto; + nest_ent = nla_nest_start(skb, + MDBA_MDB_ENTRY_INFO); + if (!nest_ent) { + nla_nest_cancel(skb, nest2); + err = -EMSGSIZE; + goto out; } + if (nla_put_nohdr(skb, sizeof(e), &e) || + nla_put_u32(skb, + MDBA_MDB_EATTR_TIMER, + br_timer_value(&p->timer))) { + nla_nest_cancel(skb, nest_ent); + nla_nest_cancel(skb, nest2); + err = -EMSGSIZE; + goto out; + } + nla_nest_end(skb, nest_ent); } nla_nest_end(skb, nest2); skip: @@ -208,9 +256,45 @@ static inline size_t rtnl_mdb_nlmsg_size(void) + nla_total_size(sizeof(struct br_mdb_entry)); } -static void __br_mdb_notify(struct net_device *dev, struct br_mdb_entry *entry, - int type) +struct br_mdb_complete_info { + struct net_bridge_port *port; + struct br_ip ip; +}; + +static void br_mdb_complete(struct net_device *dev, int err, void *priv) +{ + struct br_mdb_complete_info *data = priv; + struct net_bridge_port_group __rcu **pp; + struct net_bridge_port_group *p; + struct net_bridge_mdb_htable *mdb; + struct net_bridge_mdb_entry *mp; + struct net_bridge_port *port = data->port; + struct net_bridge *br = port->br; + + if (err) + goto err; + + spin_lock_bh(&br->multicast_lock); + mdb = mlock_dereference(br->mdb, br); + mp = br_mdb_ip_get(mdb, &data->ip); + if (!mp) + goto out; + for (pp = &mp->ports; (p = mlock_dereference(*pp, br)) != NULL; + pp = &p->next) { + if (p->port != port) + continue; + p->flags |= MDB_PG_FLAGS_OFFLOAD; + } +out: + spin_unlock_bh(&br->multicast_lock); +err: + kfree(priv); +} + +static void __br_mdb_notify(struct net_device *dev, struct net_bridge_port *p, + struct br_mdb_entry *entry, int type) { + struct br_mdb_complete_info *complete_info; struct switchdev_obj_port_mdb mdb = { .obj = { .id = SWITCHDEV_OBJ_ID_PORT_MDB, @@ -232,10 +316,18 @@ static void __br_mdb_notify(struct net_device *dev, struct br_mdb_entry *entry, #endif mdb.obj.orig_dev = port_dev; - if (port_dev && type == RTM_NEWMDB) - switchdev_port_obj_add(port_dev, &mdb.obj); - else if (port_dev && type == RTM_DELMDB) + if (port_dev && type == RTM_NEWMDB) { + complete_info = kmalloc(sizeof(*complete_info), GFP_ATOMIC); + if (complete_info) { + complete_info->port = p; + __mdb_entry_to_br_ip(entry, &complete_info->ip); + mdb.obj.complete_priv = complete_info; + mdb.obj.complete = br_mdb_complete; + switchdev_port_obj_add(port_dev, &mdb.obj); + } + } else if (port_dev && type == RTM_DELMDB) { switchdev_port_obj_del(port_dev, &mdb.obj); + } skb = nlmsg_new(rtnl_mdb_nlmsg_size(), GFP_ATOMIC); if (!skb) @@ -254,7 +346,7 @@ errout: } void br_mdb_notify(struct net_device *dev, struct net_bridge_port *port, - struct br_ip *group, int type, u8 state) + struct br_ip *group, int type, u8 flags) { struct br_mdb_entry entry; @@ -265,9 +357,9 @@ void br_mdb_notify(struct net_device *dev, struct net_bridge_port *port, #if IS_ENABLED(CONFIG_IPV6) entry.addr.u.ip6 = group->u.ip6; #endif - entry.state = state; entry.vid = group->vid; - __br_mdb_notify(dev, &entry, type); + __mdb_entry_fill_flags(&entry, flags); + __br_mdb_notify(dev, port, &entry, type); } static int nlmsg_populate_rtr_fill(struct sk_buff *skb, @@ -468,15 +560,7 @@ static int __br_mdb_add(struct net *net, struct net_bridge *br, if (!p || p->br != br || p->state == BR_STATE_DISABLED) return -EINVAL; - memset(&ip, 0, sizeof(ip)); - ip.vid = entry->vid; - ip.proto = entry->addr.proto; - if (ip.proto == htons(ETH_P_IP)) - ip.u.ip4 = entry->addr.u.ip4; -#if IS_ENABLED(CONFIG_IPV6) - else - ip.u.ip6 = entry->addr.u.ip6; -#endif + __mdb_entry_to_br_ip(entry, &ip); spin_lock_bh(&br->multicast_lock); ret = br_mdb_add_group(br, p, &ip, entry->state); @@ -519,12 +603,12 @@ static int br_mdb_add(struct sk_buff *skb, struct nlmsghdr *nlh) err = __br_mdb_add(net, br, entry); if (err) break; - __br_mdb_notify(dev, entry, RTM_NEWMDB); + __br_mdb_notify(dev, p, entry, RTM_NEWMDB); } } else { err = __br_mdb_add(net, br, entry); if (!err) - __br_mdb_notify(dev, entry, RTM_NEWMDB); + __br_mdb_notify(dev, p, entry, RTM_NEWMDB); } return err; @@ -542,15 +626,7 @@ static int __br_mdb_del(struct net_bridge *br, struct br_mdb_entry *entry) if (!netif_running(br->dev) || br->multicast_disabled) return -EINVAL; - memset(&ip, 0, sizeof(ip)); - ip.vid = entry->vid; - ip.proto = entry->addr.proto; - if (ip.proto == htons(ETH_P_IP)) - ip.u.ip4 = entry->addr.u.ip4; -#if IS_ENABLED(CONFIG_IPV6) - else - ip.u.ip6 = entry->addr.u.ip6; -#endif + __mdb_entry_to_br_ip(entry, &ip); spin_lock_bh(&br->multicast_lock); mdb = mlock_dereference(br->mdb, br); @@ -568,7 +644,7 @@ static int __br_mdb_del(struct net_bridge *br, struct br_mdb_entry *entry) if (p->port->state == BR_STATE_DISABLED) goto unlock; - entry->state = p->state; + __mdb_entry_fill_flags(entry, p->flags); rcu_assign_pointer(*pp, p->next); hlist_del_init(&p->mglist); del_timer(&p->timer); @@ -620,12 +696,12 @@ static int br_mdb_del(struct sk_buff *skb, struct nlmsghdr *nlh) entry->vid = v->vid; err = __br_mdb_del(br, entry); if (!err) - __br_mdb_notify(dev, entry, RTM_DELMDB); + __br_mdb_notify(dev, p, entry, RTM_DELMDB); } } else { err = __br_mdb_del(br, entry); if (!err) - __br_mdb_notify(dev, entry, RTM_DELMDB); + __br_mdb_notify(dev, p, entry, RTM_DELMDB); } return err; diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index 03661d974..6852f3c70 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -284,7 +284,7 @@ static void br_multicast_del_pg(struct net_bridge *br, hlist_del_init(&p->mglist); del_timer(&p->timer); br_mdb_notify(br->dev, p->port, &pg->addr, RTM_DELMDB, - p->state); + p->flags); call_rcu_bh(&p->rcu, br_multicast_free_pg); if (!mp->ports && !mp->mglist && @@ -304,7 +304,7 @@ static void br_multicast_port_group_expired(unsigned long data) spin_lock(&br->multicast_lock); if (!netif_running(br->dev) || timer_pending(&pg->timer) || - hlist_unhashed(&pg->mglist) || pg->state & MDB_PERMANENT) + hlist_unhashed(&pg->mglist) || pg->flags & MDB_PG_FLAGS_PERMANENT) goto out; br_multicast_del_pg(br, pg); @@ -649,7 +649,7 @@ struct net_bridge_port_group *br_multicast_new_port_group( struct net_bridge_port *port, struct br_ip *group, struct net_bridge_port_group __rcu *next, - unsigned char state) + unsigned char flags) { struct net_bridge_port_group *p; @@ -659,7 +659,7 @@ struct net_bridge_port_group *br_multicast_new_port_group( p->addr = *group; p->port = port; - p->state = state; + p->flags = flags; rcu_assign_pointer(p->next, next); hlist_add_head(&p->mglist, &port->mglist); setup_timer(&p->timer, br_multicast_port_group_expired, @@ -702,11 +702,11 @@ static int br_multicast_add_group(struct net_bridge *br, break; } - p = br_multicast_new_port_group(port, group, *pp, MDB_TEMPORARY); + p = br_multicast_new_port_group(port, group, *pp, 0); if (unlikely(!p)) goto err; rcu_assign_pointer(*pp, p); - br_mdb_notify(br->dev, port, group, RTM_NEWMDB, MDB_TEMPORARY); + br_mdb_notify(br->dev, port, group, RTM_NEWMDB, 0); found: mod_timer(&p->timer, now + br->multicast_membership_interval); @@ -760,13 +760,17 @@ static void br_multicast_router_expired(unsigned long data) struct net_bridge *br = port->br; spin_lock(&br->multicast_lock); - if (port->multicast_router != 1 || + if (port->multicast_router == MDB_RTR_TYPE_DISABLED || + port->multicast_router == MDB_RTR_TYPE_PERM || timer_pending(&port->multicast_router_timer) || hlist_unhashed(&port->rlist)) goto out; hlist_del_init_rcu(&port->rlist); br_rtr_notify(br->dev, port, RTM_DELMDB); + /* Don't allow timer refresh if the router expired */ + if (port->multicast_router == MDB_RTR_TYPE_TEMP) + port->multicast_router = MDB_RTR_TYPE_TEMP_QUERY; out: spin_unlock(&br->multicast_lock); @@ -913,7 +917,7 @@ static void br_ip6_multicast_port_query_expired(unsigned long data) void br_multicast_add_port(struct net_bridge_port *port) { - port->multicast_router = 1; + port->multicast_router = MDB_RTR_TYPE_TEMP_QUERY; setup_timer(&port->multicast_router_timer, br_multicast_router_expired, (unsigned long)port); @@ -960,7 +964,8 @@ void br_multicast_enable_port(struct net_bridge_port *port) #if IS_ENABLED(CONFIG_IPV6) br_multicast_enable(&port->ip6_own_query); #endif - if (port->multicast_router == 2 && hlist_unhashed(&port->rlist)) + if (port->multicast_router == MDB_RTR_TYPE_PERM && + hlist_unhashed(&port->rlist)) br_multicast_add_router(br, port); out: @@ -975,12 +980,15 @@ void br_multicast_disable_port(struct net_bridge_port *port) spin_lock(&br->multicast_lock); hlist_for_each_entry_safe(pg, n, &port->mglist, mglist) - if (pg->state == MDB_TEMPORARY) + if (!(pg->flags & MDB_PG_FLAGS_PERMANENT)) br_multicast_del_pg(br, pg); if (!hlist_unhashed(&port->rlist)) { hlist_del_init_rcu(&port->rlist); br_rtr_notify(br->dev, port, RTM_DELMDB); + /* Don't allow timer refresh if disabling */ + if (port->multicast_router == MDB_RTR_TYPE_TEMP) + port->multicast_router = MDB_RTR_TYPE_TEMP_QUERY; } del_timer(&port->multicast_router_timer); del_timer(&port->ip4_own_query.timer); @@ -1228,13 +1236,14 @@ static void br_multicast_mark_router(struct net_bridge *br, unsigned long now = jiffies; if (!port) { - if (br->multicast_router == 1) + if (br->multicast_router == MDB_RTR_TYPE_TEMP_QUERY) mod_timer(&br->multicast_router_timer, now + br->multicast_querier_interval); return; } - if (port->multicast_router != 1) + if (port->multicast_router == MDB_RTR_TYPE_DISABLED || + port->multicast_router == MDB_RTR_TYPE_PERM) return; br_multicast_add_router(br, port); @@ -1270,6 +1279,7 @@ static int br_ip4_multicast_query(struct net_bridge *br, struct br_ip saddr; unsigned long max_delay; unsigned long now = jiffies; + unsigned int offset = skb_transport_offset(skb); __be32 group; int err = 0; @@ -1280,14 +1290,14 @@ static int br_ip4_multicast_query(struct net_bridge *br, group = ih->group; - if (skb->len == sizeof(*ih)) { + if (skb->len == offset + sizeof(*ih)) { max_delay = ih->code * (HZ / IGMP_TIMER_SCALE); if (!max_delay) { max_delay = 10 * HZ; group = 0; } - } else if (skb->len >= sizeof(*ih3)) { + } else if (skb->len >= offset + sizeof(*ih3)) { ih3 = igmpv3_query_hdr(skb); if (ih3->nsrcs) goto out; @@ -1348,6 +1358,7 @@ static int br_ip6_multicast_query(struct net_bridge *br, struct br_ip saddr; unsigned long max_delay; unsigned long now = jiffies; + unsigned int offset = skb_transport_offset(skb); const struct in6_addr *group = NULL; bool is_general_query; int err = 0; @@ -1357,8 +1368,8 @@ static int br_ip6_multicast_query(struct net_bridge *br, (port && port->state == BR_STATE_DISABLED)) goto out; - if (skb->len == sizeof(*mld)) { - if (!pskb_may_pull(skb, sizeof(*mld))) { + if (skb->len == offset + sizeof(*mld)) { + if (!pskb_may_pull(skb, offset + sizeof(*mld))) { err = -EINVAL; goto out; } @@ -1367,7 +1378,7 @@ static int br_ip6_multicast_query(struct net_bridge *br, if (max_delay) group = &mld->mld_mca; } else { - if (!pskb_may_pull(skb, sizeof(*mld2q))) { + if (!pskb_may_pull(skb, offset + sizeof(*mld2q))) { err = -EINVAL; goto out; } @@ -1454,7 +1465,7 @@ br_multicast_leave_group(struct net_bridge *br, del_timer(&p->timer); call_rcu_bh(&p->rcu, br_multicast_free_pg); br_mdb_notify(br->dev, port, group, RTM_DELMDB, - p->state); + p->flags); if (!mp->ports && !mp->mglist && netif_running(br->dev)) @@ -1715,7 +1726,7 @@ void br_multicast_init(struct net_bridge *br) br->hash_elasticity = 4; br->hash_max = 512; - br->multicast_router = 1; + br->multicast_router = MDB_RTR_TYPE_TEMP_QUERY; br->multicast_querier = 0; br->multicast_query_use_ifaddr = 0; br->multicast_last_member_count = 2; @@ -1825,11 +1836,11 @@ int br_multicast_set_router(struct net_bridge *br, unsigned long val) spin_lock_bh(&br->multicast_lock); switch (val) { - case 0: - case 2: + case MDB_RTR_TYPE_DISABLED: + case MDB_RTR_TYPE_PERM: del_timer(&br->multicast_router_timer); /* fall through */ - case 1: + case MDB_RTR_TYPE_TEMP_QUERY: br->multicast_router = val; err = 0; break; @@ -1840,37 +1851,53 @@ int br_multicast_set_router(struct net_bridge *br, unsigned long val) return err; } +static void __del_port_router(struct net_bridge_port *p) +{ + if (hlist_unhashed(&p->rlist)) + return; + hlist_del_init_rcu(&p->rlist); + br_rtr_notify(p->br->dev, p, RTM_DELMDB); +} + int br_multicast_set_port_router(struct net_bridge_port *p, unsigned long val) { struct net_bridge *br = p->br; + unsigned long now = jiffies; int err = -EINVAL; spin_lock(&br->multicast_lock); - - switch (val) { - case 0: - case 1: - case 2: - p->multicast_router = val; + if (p->multicast_router == val) { + /* Refresh the temp router port timer */ + if (p->multicast_router == MDB_RTR_TYPE_TEMP) + mod_timer(&p->multicast_router_timer, + now + br->multicast_querier_interval); err = 0; - - if (val < 2 && !hlist_unhashed(&p->rlist)) { - hlist_del_init_rcu(&p->rlist); - br_rtr_notify(br->dev, p, RTM_DELMDB); - } - - if (val == 1) - break; - + goto unlock; + } + switch (val) { + case MDB_RTR_TYPE_DISABLED: + p->multicast_router = MDB_RTR_TYPE_DISABLED; + __del_port_router(p); + del_timer(&p->multicast_router_timer); + break; + case MDB_RTR_TYPE_TEMP_QUERY: + p->multicast_router = MDB_RTR_TYPE_TEMP_QUERY; + __del_port_router(p); + break; + case MDB_RTR_TYPE_PERM: + p->multicast_router = MDB_RTR_TYPE_PERM; del_timer(&p->multicast_router_timer); - - if (val == 0) - break; - br_multicast_add_router(br, p); break; + case MDB_RTR_TYPE_TEMP: + p->multicast_router = MDB_RTR_TYPE_TEMP; + br_multicast_mark_router(br, p); + break; + default: + goto unlock; } - + err = 0; +unlock: spin_unlock(&br->multicast_lock); return err; diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index 7ddbe7ec8..44114a94c 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -37,6 +37,7 @@ #include <net/addrconf.h> #include <net/route.h> #include <net/netfilter/br_netfilter.h> +#include <net/netns/generic.h> #include <asm/uaccess.h> #include "br_private.h" @@ -44,6 +45,12 @@ #include <linux/sysctl.h> #endif +static int brnf_net_id __read_mostly; + +struct brnf_net { + bool enabled; +}; + #ifdef CONFIG_SYSCTL static struct ctl_table_header *brnf_sysctl_header; static int brnf_call_iptables __read_mostly = 1; @@ -938,6 +945,53 @@ static struct nf_hook_ops br_nf_ops[] __read_mostly = { }, }; +static int brnf_device_event(struct notifier_block *unused, unsigned long event, + void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct brnf_net *brnet; + struct net *net; + int ret; + + if (event != NETDEV_REGISTER || !(dev->priv_flags & IFF_EBRIDGE)) + return NOTIFY_DONE; + + ASSERT_RTNL(); + + net = dev_net(dev); + brnet = net_generic(net, brnf_net_id); + if (brnet->enabled) + return NOTIFY_OK; + + ret = nf_register_net_hooks(net, br_nf_ops, ARRAY_SIZE(br_nf_ops)); + if (ret) + return NOTIFY_BAD; + + brnet->enabled = true; + return NOTIFY_OK; +} + +static void __net_exit brnf_exit_net(struct net *net) +{ + struct brnf_net *brnet = net_generic(net, brnf_net_id); + + if (!brnet->enabled) + return; + + nf_unregister_net_hooks(net, br_nf_ops, ARRAY_SIZE(br_nf_ops)); + brnet->enabled = false; +} + +static struct pernet_operations brnf_net_ops __read_mostly = { + .exit = brnf_exit_net, + .id = &brnf_net_id, + .size = sizeof(struct brnf_net), +}; + +static struct notifier_block brnf_notifier __read_mostly = { + .notifier_call = brnf_device_event, +}; + #ifdef CONFIG_SYSCTL static int brnf_sysctl_call_tables(struct ctl_table *ctl, int write, @@ -1003,16 +1057,23 @@ static int __init br_netfilter_init(void) { int ret; - ret = nf_register_hooks(br_nf_ops, ARRAY_SIZE(br_nf_ops)); + ret = register_pernet_subsys(&brnf_net_ops); if (ret < 0) return ret; + ret = register_netdevice_notifier(&brnf_notifier); + if (ret < 0) { + unregister_pernet_subsys(&brnf_net_ops); + return ret; + } + #ifdef CONFIG_SYSCTL brnf_sysctl_header = register_net_sysctl(&init_net, "net/bridge", brnf_table); if (brnf_sysctl_header == NULL) { printk(KERN_WARNING "br_netfilter: can't register to sysctl.\n"); - nf_unregister_hooks(br_nf_ops, ARRAY_SIZE(br_nf_ops)); + unregister_netdevice_notifier(&brnf_notifier); + unregister_pernet_subsys(&brnf_net_ops); return -ENOMEM; } #endif @@ -1024,7 +1085,8 @@ static int __init br_netfilter_init(void) static void __exit br_netfilter_fini(void) { RCU_INIT_POINTER(nf_br_ops, NULL); - nf_unregister_hooks(br_nf_ops, ARRAY_SIZE(br_nf_ops)); + unregister_netdevice_notifier(&brnf_notifier); + unregister_pernet_subsys(&brnf_net_ops); #ifdef CONFIG_SYSCTL unregister_net_sysctl_table(brnf_sysctl_header); #endif diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 40197ff89..e9c635eae 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -598,7 +598,6 @@ static int br_set_port_state(struct net_bridge_port *p, u8 state) return -ENETDOWN; br_set_state(p, state); - br_log_state(p); br_port_state_selection(p->br); return 0; } diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 216018c76..d9da85718 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -150,6 +150,9 @@ struct net_bridge_fdb_entry struct rcu_head rcu; }; +#define MDB_PG_FLAGS_PERMANENT BIT(0) +#define MDB_PG_FLAGS_OFFLOAD BIT(1) + struct net_bridge_port_group { struct net_bridge_port *port; struct net_bridge_port_group __rcu *next; @@ -157,7 +160,7 @@ struct net_bridge_port_group { struct rcu_head rcu; struct timer_list timer; struct br_ip addr; - unsigned char state; + unsigned char flags; }; struct net_bridge_mdb_entry @@ -554,11 +557,11 @@ void br_multicast_free_pg(struct rcu_head *head); struct net_bridge_port_group * br_multicast_new_port_group(struct net_bridge_port *port, struct br_ip *group, struct net_bridge_port_group __rcu *next, - unsigned char state); + unsigned char flags); void br_mdb_init(void); void br_mdb_uninit(void); void br_mdb_notify(struct net_device *dev, struct net_bridge_port *port, - struct br_ip *group, int type, u8 state); + struct br_ip *group, int type, u8 flags); void br_rtr_notify(struct net_device *dev, struct net_bridge_port *port, int type); @@ -897,7 +900,6 @@ static inline void br_nf_core_fini(void) {} #endif /* br_stp.c */ -void br_log_state(const struct net_bridge_port *p); void br_set_state(struct net_bridge_port *p, unsigned int state); struct net_bridge_port *br_get_port(struct net_bridge *br, u16 port_no); void br_init_port(struct net_bridge_port *p); diff --git a/net/bridge/br_stp.c b/net/bridge/br_stp.c index e2670c5ba..9cb7044d0 100644 --- a/net/bridge/br_stp.c +++ b/net/bridge/br_stp.c @@ -30,13 +30,6 @@ static const char *const br_port_state_names[] = { [BR_STATE_BLOCKING] = "blocking", }; -void br_log_state(const struct net_bridge_port *p) -{ - br_info(p->br, "port %u(%s) entered %s state\n", - (unsigned int) p->port_no, p->dev->name, - br_port_state_names[p->state]); -} - void br_set_state(struct net_bridge_port *p, unsigned int state) { struct switchdev_attr attr = { @@ -52,6 +45,10 @@ void br_set_state(struct net_bridge_port *p, unsigned int state) if (err && err != -EOPNOTSUPP) br_warn(p->br, "error setting offload STP state on port %u(%s)\n", (unsigned int) p->port_no, p->dev->name); + else + br_info(p->br, "port %u(%s) entered %s state\n", + (unsigned int) p->port_no, p->dev->name, + br_port_state_names[p->state]); } /* called under bridge lock */ @@ -126,7 +123,6 @@ static void br_root_port_block(const struct net_bridge *br, (unsigned int) p->port_no, p->dev->name); br_set_state(p, BR_STATE_LISTENING); - br_log_state(p); br_ifinfo_notify(RTM_NEWLINK, p); if (br->forward_delay > 0) @@ -407,7 +403,6 @@ static void br_make_blocking(struct net_bridge_port *p) br_topology_change_detection(p->br); br_set_state(p, BR_STATE_BLOCKING); - br_log_state(p); br_ifinfo_notify(RTM_NEWLINK, p); del_timer(&p->forward_delay_timer); @@ -431,7 +426,6 @@ static void br_make_forwarding(struct net_bridge_port *p) else br_set_state(p, BR_STATE_LEARNING); - br_log_state(p); br_ifinfo_notify(RTM_NEWLINK, p); if (br->forward_delay != 0) diff --git a/net/bridge/br_stp_if.c b/net/bridge/br_stp_if.c index a31ac6ad7..984d46263 100644 --- a/net/bridge/br_stp_if.c +++ b/net/bridge/br_stp_if.c @@ -102,7 +102,6 @@ void br_stp_enable_port(struct net_bridge_port *p) { br_init_port(p); br_port_state_selection(p->br); - br_log_state(p); br_ifinfo_notify(RTM_NEWLINK, p); } @@ -118,7 +117,6 @@ void br_stp_disable_port(struct net_bridge_port *p) p->topology_change_ack = 0; p->config_pending = 0; - br_log_state(p); br_ifinfo_notify(RTM_NEWLINK, p); del_timer(&p->message_age_timer); diff --git a/net/bridge/br_stp_timer.c b/net/bridge/br_stp_timer.c index 5f0f5af0e..da058b85a 100644 --- a/net/bridge/br_stp_timer.c +++ b/net/bridge/br_stp_timer.c @@ -98,7 +98,6 @@ static void br_forward_delay_timer_expired(unsigned long arg) br_topology_change_detection(br); netif_carrier_on(br->dev); } - br_log_state(p); rcu_read_lock(); br_ifinfo_notify(RTM_NEWLINK, p); rcu_read_unlock(); diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index 85e43af4a..9309bb4f2 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -955,6 +955,13 @@ err_rhtbl: */ int nbp_vlan_add(struct net_bridge_port *port, u16 vid, u16 flags) { + struct switchdev_obj_port_vlan v = { + .obj.orig_dev = port->dev, + .obj.id = SWITCHDEV_OBJ_ID_PORT_VLAN, + .flags = flags, + .vid_begin = vid, + .vid_end = vid, + }; struct net_bridge_vlan *vlan; int ret; @@ -962,6 +969,10 @@ int nbp_vlan_add(struct net_bridge_port *port, u16 vid, u16 flags) vlan = br_vlan_find(nbp_vlan_group(port), vid); if (vlan) { + /* Pass the flags to the hardware bridge */ + ret = switchdev_port_obj_add(port->dev, &v.obj); + if (ret && ret != -EOPNOTSUPP) + return ret; __vlan_add_flags(vlan, flags); return 0; } diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index 67b2e2799..5a61f3541 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -370,7 +370,11 @@ ebt_check_match(struct ebt_entry_match *m, struct xt_mtchk_param *par, left - sizeof(struct ebt_entry_match) < m->match_size) return -EINVAL; - match = xt_request_find_match(NFPROTO_BRIDGE, m->u.name, 0); + match = xt_find_match(NFPROTO_BRIDGE, m->u.name, 0); + if (IS_ERR(match) || match->family != NFPROTO_BRIDGE) { + request_module("ebt_%s", m->u.name); + match = xt_find_match(NFPROTO_BRIDGE, m->u.name, 0); + } if (IS_ERR(match)) return PTR_ERR(match); m->u.match = match; @@ -1521,6 +1525,8 @@ static int do_ebt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) if (copy_from_user(&tmp, user, sizeof(tmp))) return -EFAULT; + tmp.name[sizeof(tmp.name) - 1] = '\0'; + t = find_table_lock(net, tmp.name, &ret, &ebt_mutex); if (!t) return ret; @@ -2332,6 +2338,8 @@ static int compat_do_ebt_get_ctl(struct sock *sk, int cmd, if (copy_from_user(&tmp, user, sizeof(tmp))) return -EFAULT; + tmp.name[sizeof(tmp.name) - 1] = '\0'; + t = find_table_lock(net, tmp.name, &ret, &ebt_mutex); if (!t) return ret; diff --git a/net/bridge/netfilter/nft_reject_bridge.c b/net/bridge/netfilter/nft_reject_bridge.c index fdba3d9fb..77f7e7a9e 100644 --- a/net/bridge/netfilter/nft_reject_bridge.c +++ b/net/bridge/netfilter/nft_reject_bridge.c @@ -40,7 +40,8 @@ static void nft_reject_br_push_etherhdr(struct sk_buff *oldskb, /* We cannot use oldskb->dev, it can be either bridge device (NF_BRIDGE INPUT) * or the bridge port (NF_BRIDGE PREROUTING). */ -static void nft_reject_br_send_v4_tcp_reset(struct sk_buff *oldskb, +static void nft_reject_br_send_v4_tcp_reset(struct net *net, + struct sk_buff *oldskb, const struct net_device *dev, int hook) { @@ -63,9 +64,9 @@ static void nft_reject_br_send_v4_tcp_reset(struct sk_buff *oldskb, skb_reserve(nskb, LL_MAX_HEADER); niph = nf_reject_iphdr_put(nskb, oldskb, IPPROTO_TCP, - sysctl_ip_default_ttl); + net->ipv4.sysctl_ip_default_ttl); nf_reject_ip_tcphdr_put(nskb, oldskb, oth); - niph->ttl = sysctl_ip_default_ttl; + niph->ttl = net->ipv4.sysctl_ip_default_ttl; niph->tot_len = htons(nskb->len); ip_send_check(niph); @@ -74,7 +75,8 @@ static void nft_reject_br_send_v4_tcp_reset(struct sk_buff *oldskb, br_deliver(br_port_get_rcu(dev), nskb); } -static void nft_reject_br_send_v4_unreach(struct sk_buff *oldskb, +static void nft_reject_br_send_v4_unreach(struct net *net, + struct sk_buff *oldskb, const struct net_device *dev, int hook, u8 code) { @@ -119,7 +121,7 @@ static void nft_reject_br_send_v4_unreach(struct sk_buff *oldskb, skb_reserve(nskb, LL_MAX_HEADER); niph = nf_reject_iphdr_put(nskb, oldskb, IPPROTO_ICMP, - sysctl_ip_default_ttl); + net->ipv4.sysctl_ip_default_ttl); skb_reset_transport_header(nskb); icmph = (struct icmphdr *)skb_put(nskb, sizeof(struct icmphdr)); @@ -271,17 +273,17 @@ static void nft_reject_bridge_eval(const struct nft_expr *expr, case htons(ETH_P_IP): switch (priv->type) { case NFT_REJECT_ICMP_UNREACH: - nft_reject_br_send_v4_unreach(pkt->skb, pkt->in, - pkt->hook, + nft_reject_br_send_v4_unreach(pkt->net, pkt->skb, + pkt->in, pkt->hook, priv->icmp_code); break; case NFT_REJECT_TCP_RST: - nft_reject_br_send_v4_tcp_reset(pkt->skb, pkt->in, - pkt->hook); + nft_reject_br_send_v4_tcp_reset(pkt->net, pkt->skb, + pkt->in, pkt->hook); break; case NFT_REJECT_ICMPX_UNREACH: - nft_reject_br_send_v4_unreach(pkt->skb, pkt->in, - pkt->hook, + nft_reject_br_send_v4_unreach(pkt->net, pkt->skb, + pkt->in, pkt->hook, nft_reject_icmp_code(priv->icmp_code)); break; } diff --git a/net/caif/cfpkt_skbuff.c b/net/caif/cfpkt_skbuff.c index f6c3b2137..59ce1fcc2 100644 --- a/net/caif/cfpkt_skbuff.c +++ b/net/caif/cfpkt_skbuff.c @@ -286,7 +286,7 @@ int cfpkt_setlen(struct cfpkt *pkt, u16 len) else skb_trim(skb, len); - return cfpkt_getlen(pkt); + return cfpkt_getlen(pkt); } /* Need to expand SKB */ diff --git a/net/ceph/auth.c b/net/ceph/auth.c index 6b923bcaa..2bc5965fd 100644 --- a/net/ceph/auth.c +++ b/net/ceph/auth.c @@ -293,13 +293,9 @@ int ceph_auth_create_authorizer(struct ceph_auth_client *ac, } EXPORT_SYMBOL(ceph_auth_create_authorizer); -void ceph_auth_destroy_authorizer(struct ceph_auth_client *ac, - struct ceph_authorizer *a) +void ceph_auth_destroy_authorizer(struct ceph_authorizer *a) { - mutex_lock(&ac->mutex); - if (ac->ops && ac->ops->destroy_authorizer) - ac->ops->destroy_authorizer(ac, a); - mutex_unlock(&ac->mutex); + a->destroy(a); } EXPORT_SYMBOL(ceph_auth_destroy_authorizer); diff --git a/net/ceph/auth_none.c b/net/ceph/auth_none.c index 8c93fa8d8..5f836f02a 100644 --- a/net/ceph/auth_none.c +++ b/net/ceph/auth_none.c @@ -16,7 +16,6 @@ static void reset(struct ceph_auth_client *ac) struct ceph_auth_none_info *xi = ac->private; xi->starting = true; - xi->built_authorizer = false; } static void destroy(struct ceph_auth_client *ac) @@ -39,6 +38,27 @@ static int should_authenticate(struct ceph_auth_client *ac) return xi->starting; } +static int ceph_auth_none_build_authorizer(struct ceph_auth_client *ac, + struct ceph_none_authorizer *au) +{ + void *p = au->buf; + void *const end = p + sizeof(au->buf); + int ret; + + ceph_encode_8_safe(&p, end, 1, e_range); + ret = ceph_entity_name_encode(ac->name, &p, end); + if (ret < 0) + return ret; + + ceph_encode_64_safe(&p, end, ac->global_id, e_range); + au->buf_len = p - (void *)au->buf; + dout("%s built authorizer len %d\n", __func__, au->buf_len); + return 0; + +e_range: + return -ERANGE; +} + static int build_request(struct ceph_auth_client *ac, void *buf, void *end) { return 0; @@ -57,32 +77,32 @@ static int handle_reply(struct ceph_auth_client *ac, int result, return result; } +static void ceph_auth_none_destroy_authorizer(struct ceph_authorizer *a) +{ + kfree(a); +} + /* - * build an 'authorizer' with our entity_name and global_id. we can - * reuse a single static copy since it is identical for all services - * we connect to. + * build an 'authorizer' with our entity_name and global_id. it is + * identical for all services we connect to. */ static int ceph_auth_none_create_authorizer( struct ceph_auth_client *ac, int peer_type, struct ceph_auth_handshake *auth) { - struct ceph_auth_none_info *ai = ac->private; - struct ceph_none_authorizer *au = &ai->au; - void *p, *end; + struct ceph_none_authorizer *au; int ret; - if (!ai->built_authorizer) { - p = au->buf; - end = p + sizeof(au->buf); - ceph_encode_8(&p, 1); - ret = ceph_entity_name_encode(ac->name, &p, end - 8); - if (ret < 0) - goto bad; - ceph_decode_need(&p, end, sizeof(u64), bad2); - ceph_encode_64(&p, ac->global_id); - au->buf_len = p - (void *)au->buf; - ai->built_authorizer = true; - dout("built authorizer len %d\n", au->buf_len); + au = kmalloc(sizeof(*au), GFP_NOFS); + if (!au) + return -ENOMEM; + + au->base.destroy = ceph_auth_none_destroy_authorizer; + + ret = ceph_auth_none_build_authorizer(ac, au); + if (ret) { + kfree(au); + return ret; } auth->authorizer = (struct ceph_authorizer *) au; @@ -92,17 +112,6 @@ static int ceph_auth_none_create_authorizer( auth->authorizer_reply_buf_len = sizeof (au->reply_buf); return 0; - -bad2: - ret = -ERANGE; -bad: - return ret; -} - -static void ceph_auth_none_destroy_authorizer(struct ceph_auth_client *ac, - struct ceph_authorizer *a) -{ - /* nothing to do */ } static const struct ceph_auth_client_ops ceph_auth_none_ops = { @@ -114,7 +123,6 @@ static const struct ceph_auth_client_ops ceph_auth_none_ops = { .build_request = build_request, .handle_reply = handle_reply, .create_authorizer = ceph_auth_none_create_authorizer, - .destroy_authorizer = ceph_auth_none_destroy_authorizer, }; int ceph_auth_none_init(struct ceph_auth_client *ac) @@ -127,7 +135,6 @@ int ceph_auth_none_init(struct ceph_auth_client *ac) return -ENOMEM; xi->starting = true; - xi->built_authorizer = false; ac->protocol = CEPH_AUTH_NONE; ac->private = xi; diff --git a/net/ceph/auth_none.h b/net/ceph/auth_none.h index 059a3ce4b..62021535a 100644 --- a/net/ceph/auth_none.h +++ b/net/ceph/auth_none.h @@ -12,6 +12,7 @@ */ struct ceph_none_authorizer { + struct ceph_authorizer base; char buf[128]; int buf_len; char reply_buf[0]; @@ -19,8 +20,6 @@ struct ceph_none_authorizer { struct ceph_auth_none_info { bool starting; - bool built_authorizer; - struct ceph_none_authorizer au; /* we only need one; it's static */ }; int ceph_auth_none_init(struct ceph_auth_client *ac); diff --git a/net/ceph/auth_x.c b/net/ceph/auth_x.c index 9e43a315e..a0905f04b 100644 --- a/net/ceph/auth_x.c +++ b/net/ceph/auth_x.c @@ -565,6 +565,14 @@ static int ceph_x_handle_reply(struct ceph_auth_client *ac, int result, return -EAGAIN; } +static void ceph_x_destroy_authorizer(struct ceph_authorizer *a) +{ + struct ceph_x_authorizer *au = (void *)a; + + ceph_x_authorizer_cleanup(au); + kfree(au); +} + static int ceph_x_create_authorizer( struct ceph_auth_client *ac, int peer_type, struct ceph_auth_handshake *auth) @@ -581,6 +589,8 @@ static int ceph_x_create_authorizer( if (!au) return -ENOMEM; + au->base.destroy = ceph_x_destroy_authorizer; + ret = ceph_x_build_authorizer(ac, th, au); if (ret) { kfree(au); @@ -643,16 +653,6 @@ static int ceph_x_verify_authorizer_reply(struct ceph_auth_client *ac, return ret; } -static void ceph_x_destroy_authorizer(struct ceph_auth_client *ac, - struct ceph_authorizer *a) -{ - struct ceph_x_authorizer *au = (void *)a; - - ceph_x_authorizer_cleanup(au); - kfree(au); -} - - static void ceph_x_reset(struct ceph_auth_client *ac) { struct ceph_x_info *xi = ac->private; @@ -770,7 +770,6 @@ static const struct ceph_auth_client_ops ceph_x_ops = { .create_authorizer = ceph_x_create_authorizer, .update_authorizer = ceph_x_update_authorizer, .verify_authorizer_reply = ceph_x_verify_authorizer_reply, - .destroy_authorizer = ceph_x_destroy_authorizer, .invalidate_authorizer = ceph_x_invalidate_authorizer, .reset = ceph_x_reset, .destroy = ceph_x_destroy, diff --git a/net/ceph/auth_x.h b/net/ceph/auth_x.h index 40b1a3cf7..21a5af904 100644 --- a/net/ceph/auth_x.h +++ b/net/ceph/auth_x.h @@ -26,6 +26,7 @@ struct ceph_x_ticket_handler { struct ceph_x_authorizer { + struct ceph_authorizer base; struct ceph_crypto_key session_key; struct ceph_buffer *buf; unsigned int service; diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index bcbec33c6..dcc18c6f7 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -361,7 +361,6 @@ ceph_parse_options(char *options, const char *dev_name, opt->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT; opt->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT; opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT; - opt->monc_ping_timeout = CEPH_MONC_PING_TIMEOUT_DEFAULT; /* get mon ip(s) */ /* ip1[:port1][,ip2[:port2]...] */ @@ -686,6 +685,9 @@ int __ceph_open_session(struct ceph_client *client, unsigned long started) return client->auth_err; } + pr_info("client%llu fsid %pU\n", ceph_client_id(client), &client->fsid); + ceph_debugfs_client_init(client); + return 0; } EXPORT_SYMBOL(__ceph_open_session); diff --git a/net/ceph/crypto.c b/net/ceph/crypto.c index 42e8649c6..db2847ac5 100644 --- a/net/ceph/crypto.c +++ b/net/ceph/crypto.c @@ -4,7 +4,8 @@ #include <linux/err.h> #include <linux/scatterlist.h> #include <linux/slab.h> -#include <crypto/hash.h> +#include <crypto/aes.h> +#include <crypto/skcipher.h> #include <linux/key-type.h> #include <keys/ceph-type.h> @@ -79,9 +80,9 @@ int ceph_crypto_key_unarmor(struct ceph_crypto_key *key, const char *inkey) return 0; } -static struct crypto_blkcipher *ceph_crypto_alloc_cipher(void) +static struct crypto_skcipher *ceph_crypto_alloc_cipher(void) { - return crypto_alloc_blkcipher("cbc(aes)", 0, CRYPTO_ALG_ASYNC); + return crypto_alloc_skcipher("cbc(aes)", 0, CRYPTO_ALG_ASYNC); } static const u8 *aes_iv = (u8 *)CEPH_AES_IV; @@ -162,11 +163,10 @@ static int ceph_aes_encrypt(const void *key, int key_len, { struct scatterlist sg_in[2], prealloc_sg; struct sg_table sg_out; - struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher(); - struct blkcipher_desc desc = { .tfm = tfm, .flags = 0 }; + struct crypto_skcipher *tfm = ceph_crypto_alloc_cipher(); + SKCIPHER_REQUEST_ON_STACK(req, tfm); int ret; - void *iv; - int ivsize; + char iv[AES_BLOCK_SIZE]; size_t zero_padding = (0x10 - (src_len & 0x0f)); char pad[16]; @@ -184,10 +184,13 @@ static int ceph_aes_encrypt(const void *key, int key_len, if (ret) goto out_tfm; - crypto_blkcipher_setkey((void *)tfm, key, key_len); - iv = crypto_blkcipher_crt(tfm)->iv; - ivsize = crypto_blkcipher_ivsize(tfm); - memcpy(iv, aes_iv, ivsize); + crypto_skcipher_setkey((void *)tfm, key, key_len); + memcpy(iv, aes_iv, AES_BLOCK_SIZE); + + skcipher_request_set_tfm(req, tfm); + skcipher_request_set_callback(req, 0, NULL, NULL); + skcipher_request_set_crypt(req, sg_in, sg_out.sgl, + src_len + zero_padding, iv); /* print_hex_dump(KERN_ERR, "enc key: ", DUMP_PREFIX_NONE, 16, 1, @@ -197,8 +200,8 @@ static int ceph_aes_encrypt(const void *key, int key_len, print_hex_dump(KERN_ERR, "enc pad: ", DUMP_PREFIX_NONE, 16, 1, pad, zero_padding, 1); */ - ret = crypto_blkcipher_encrypt(&desc, sg_out.sgl, sg_in, - src_len + zero_padding); + ret = crypto_skcipher_encrypt(req); + skcipher_request_zero(req); if (ret < 0) { pr_err("ceph_aes_crypt failed %d\n", ret); goto out_sg; @@ -211,7 +214,7 @@ static int ceph_aes_encrypt(const void *key, int key_len, out_sg: teardown_sgtable(&sg_out); out_tfm: - crypto_free_blkcipher(tfm); + crypto_free_skcipher(tfm); return ret; } @@ -222,11 +225,10 @@ static int ceph_aes_encrypt2(const void *key, int key_len, void *dst, { struct scatterlist sg_in[3], prealloc_sg; struct sg_table sg_out; - struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher(); - struct blkcipher_desc desc = { .tfm = tfm, .flags = 0 }; + struct crypto_skcipher *tfm = ceph_crypto_alloc_cipher(); + SKCIPHER_REQUEST_ON_STACK(req, tfm); int ret; - void *iv; - int ivsize; + char iv[AES_BLOCK_SIZE]; size_t zero_padding = (0x10 - ((src1_len + src2_len) & 0x0f)); char pad[16]; @@ -245,10 +247,13 @@ static int ceph_aes_encrypt2(const void *key, int key_len, void *dst, if (ret) goto out_tfm; - crypto_blkcipher_setkey((void *)tfm, key, key_len); - iv = crypto_blkcipher_crt(tfm)->iv; - ivsize = crypto_blkcipher_ivsize(tfm); - memcpy(iv, aes_iv, ivsize); + crypto_skcipher_setkey((void *)tfm, key, key_len); + memcpy(iv, aes_iv, AES_BLOCK_SIZE); + + skcipher_request_set_tfm(req, tfm); + skcipher_request_set_callback(req, 0, NULL, NULL); + skcipher_request_set_crypt(req, sg_in, sg_out.sgl, + src1_len + src2_len + zero_padding, iv); /* print_hex_dump(KERN_ERR, "enc key: ", DUMP_PREFIX_NONE, 16, 1, @@ -260,8 +265,8 @@ static int ceph_aes_encrypt2(const void *key, int key_len, void *dst, print_hex_dump(KERN_ERR, "enc pad: ", DUMP_PREFIX_NONE, 16, 1, pad, zero_padding, 1); */ - ret = crypto_blkcipher_encrypt(&desc, sg_out.sgl, sg_in, - src1_len + src2_len + zero_padding); + ret = crypto_skcipher_encrypt(req); + skcipher_request_zero(req); if (ret < 0) { pr_err("ceph_aes_crypt2 failed %d\n", ret); goto out_sg; @@ -274,7 +279,7 @@ static int ceph_aes_encrypt2(const void *key, int key_len, void *dst, out_sg: teardown_sgtable(&sg_out); out_tfm: - crypto_free_blkcipher(tfm); + crypto_free_skcipher(tfm); return ret; } @@ -284,11 +289,10 @@ static int ceph_aes_decrypt(const void *key, int key_len, { struct sg_table sg_in; struct scatterlist sg_out[2], prealloc_sg; - struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher(); - struct blkcipher_desc desc = { .tfm = tfm }; + struct crypto_skcipher *tfm = ceph_crypto_alloc_cipher(); + SKCIPHER_REQUEST_ON_STACK(req, tfm); char pad[16]; - void *iv; - int ivsize; + char iv[AES_BLOCK_SIZE]; int ret; int last_byte; @@ -302,10 +306,13 @@ static int ceph_aes_decrypt(const void *key, int key_len, if (ret) goto out_tfm; - crypto_blkcipher_setkey((void *)tfm, key, key_len); - iv = crypto_blkcipher_crt(tfm)->iv; - ivsize = crypto_blkcipher_ivsize(tfm); - memcpy(iv, aes_iv, ivsize); + crypto_skcipher_setkey((void *)tfm, key, key_len); + memcpy(iv, aes_iv, AES_BLOCK_SIZE); + + skcipher_request_set_tfm(req, tfm); + skcipher_request_set_callback(req, 0, NULL, NULL); + skcipher_request_set_crypt(req, sg_in.sgl, sg_out, + src_len, iv); /* print_hex_dump(KERN_ERR, "dec key: ", DUMP_PREFIX_NONE, 16, 1, @@ -313,7 +320,8 @@ static int ceph_aes_decrypt(const void *key, int key_len, print_hex_dump(KERN_ERR, "dec in: ", DUMP_PREFIX_NONE, 16, 1, src, src_len, 1); */ - ret = crypto_blkcipher_decrypt(&desc, sg_out, sg_in.sgl, src_len); + ret = crypto_skcipher_decrypt(req); + skcipher_request_zero(req); if (ret < 0) { pr_err("ceph_aes_decrypt failed %d\n", ret); goto out_sg; @@ -338,7 +346,7 @@ static int ceph_aes_decrypt(const void *key, int key_len, out_sg: teardown_sgtable(&sg_in); out_tfm: - crypto_free_blkcipher(tfm); + crypto_free_skcipher(tfm); return ret; } @@ -349,11 +357,10 @@ static int ceph_aes_decrypt2(const void *key, int key_len, { struct sg_table sg_in; struct scatterlist sg_out[3], prealloc_sg; - struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher(); - struct blkcipher_desc desc = { .tfm = tfm }; + struct crypto_skcipher *tfm = ceph_crypto_alloc_cipher(); + SKCIPHER_REQUEST_ON_STACK(req, tfm); char pad[16]; - void *iv; - int ivsize; + char iv[AES_BLOCK_SIZE]; int ret; int last_byte; @@ -368,10 +375,13 @@ static int ceph_aes_decrypt2(const void *key, int key_len, if (ret) goto out_tfm; - crypto_blkcipher_setkey((void *)tfm, key, key_len); - iv = crypto_blkcipher_crt(tfm)->iv; - ivsize = crypto_blkcipher_ivsize(tfm); - memcpy(iv, aes_iv, ivsize); + crypto_skcipher_setkey((void *)tfm, key, key_len); + memcpy(iv, aes_iv, AES_BLOCK_SIZE); + + skcipher_request_set_tfm(req, tfm); + skcipher_request_set_callback(req, 0, NULL, NULL); + skcipher_request_set_crypt(req, sg_in.sgl, sg_out, + src_len, iv); /* print_hex_dump(KERN_ERR, "dec key: ", DUMP_PREFIX_NONE, 16, 1, @@ -379,7 +389,8 @@ static int ceph_aes_decrypt2(const void *key, int key_len, print_hex_dump(KERN_ERR, "dec in: ", DUMP_PREFIX_NONE, 16, 1, src, src_len, 1); */ - ret = crypto_blkcipher_decrypt(&desc, sg_out, sg_in.sgl, src_len); + ret = crypto_skcipher_decrypt(req); + skcipher_request_zero(req); if (ret < 0) { pr_err("ceph_aes_decrypt failed %d\n", ret); goto out_sg; @@ -415,7 +426,7 @@ static int ceph_aes_decrypt2(const void *key, int key_len, out_sg: teardown_sgtable(&sg_in); out_tfm: - crypto_free_blkcipher(tfm); + crypto_free_skcipher(tfm); return ret; } diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c index 593dc2eab..b902fbc78 100644 --- a/net/ceph/debugfs.c +++ b/net/ceph/debugfs.c @@ -112,15 +112,20 @@ static int monc_show(struct seq_file *s, void *p) struct ceph_mon_generic_request *req; struct ceph_mon_client *monc = &client->monc; struct rb_node *rp; + int i; mutex_lock(&monc->mutex); - if (monc->have_mdsmap) - seq_printf(s, "have mdsmap %u\n", (unsigned int)monc->have_mdsmap); - if (monc->have_osdmap) - seq_printf(s, "have osdmap %u\n", (unsigned int)monc->have_osdmap); - if (monc->want_next_osdmap) - seq_printf(s, "want next osdmap\n"); + for (i = 0; i < ARRAY_SIZE(monc->subs); i++) { + seq_printf(s, "have %s %u", ceph_sub_str[i], + monc->subs[i].have); + if (monc->subs[i].want) + seq_printf(s, " want %llu%s", + le64_to_cpu(monc->subs[i].item.start), + (monc->subs[i].item.flags & + CEPH_SUBSCRIBE_ONETIME ? "" : "+")); + seq_putc(s, '\n'); + } for (rp = rb_first(&monc->generic_request_tree); rp; rp = rb_next(rp)) { __u16 op; diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 9382619a4..a5502898e 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -235,18 +235,12 @@ static struct workqueue_struct *ceph_msgr_wq; static int ceph_msgr_slab_init(void) { BUG_ON(ceph_msg_cache); - ceph_msg_cache = kmem_cache_create("ceph_msg", - sizeof (struct ceph_msg), - __alignof__(struct ceph_msg), 0, NULL); - + ceph_msg_cache = KMEM_CACHE(ceph_msg, 0); if (!ceph_msg_cache) return -ENOMEM; BUG_ON(ceph_msg_data_cache); - ceph_msg_data_cache = kmem_cache_create("ceph_msg_data", - sizeof (struct ceph_msg_data), - __alignof__(struct ceph_msg_data), - 0, NULL); + ceph_msg_data_cache = KMEM_CACHE(ceph_msg_data, 0); if (ceph_msg_data_cache) return 0; @@ -275,7 +269,7 @@ static void _ceph_msgr_exit(void) } BUG_ON(zero_page == NULL); - page_cache_release(zero_page); + put_page(zero_page); zero_page = NULL; ceph_msgr_slab_exit(); @@ -288,7 +282,7 @@ int ceph_msgr_init(void) BUG_ON(zero_page != NULL); zero_page = ZERO_PAGE(0); - page_cache_get(zero_page); + get_page(zero_page); /* * The number of active work items is limited by the number of @@ -1221,25 +1215,19 @@ static void prepare_message_data(struct ceph_msg *msg, u32 data_len) static void prepare_write_message_footer(struct ceph_connection *con) { struct ceph_msg *m = con->out_msg; - int v = con->out_kvec_left; m->footer.flags |= CEPH_MSG_FOOTER_COMPLETE; dout("prepare_write_message_footer %p\n", con); - con->out_kvec[v].iov_base = &m->footer; + con_out_kvec_add(con, sizeof_footer(con), &m->footer); if (con->peer_features & CEPH_FEATURE_MSG_AUTH) { if (con->ops->sign_message) con->ops->sign_message(m); else m->footer.sig = 0; - con->out_kvec[v].iov_len = sizeof(m->footer); - con->out_kvec_bytes += sizeof(m->footer); } else { m->old_footer.flags = m->footer.flags; - con->out_kvec[v].iov_len = sizeof(m->old_footer); - con->out_kvec_bytes += sizeof(m->old_footer); } - con->out_kvec_left++; con->out_more = m->more_to_follow; con->out_msg_done = true; } @@ -1614,7 +1602,7 @@ static int write_partial_skip(struct ceph_connection *con) dout("%s %p %d left\n", __func__, con, con->out_skip); while (con->out_skip > 0) { - size_t size = min(con->out_skip, (int) PAGE_CACHE_SIZE); + size_t size = min(con->out_skip, (int) PAGE_SIZE); ret = ceph_tcp_sendpage(con->sock, zero_page, 0, size, true); if (ret <= 0) @@ -2409,11 +2397,7 @@ static int read_partial_message(struct ceph_connection *con) } /* footer */ - if (need_sign) - size = sizeof(m->footer); - else - size = sizeof(m->old_footer); - + size = sizeof_footer(con); end += size; ret = read_partial(con, end, size, &m->footer); if (ret <= 0) @@ -3089,10 +3073,7 @@ void ceph_msg_revoke(struct ceph_msg *msg) con->out_skip += con_out_kvec_skip(con); } else { BUG_ON(!msg->data_length); - if (con->peer_features & CEPH_FEATURE_MSG_AUTH) - con->out_skip += sizeof(msg->footer); - else - con->out_skip += sizeof(msg->old_footer); + con->out_skip += sizeof_footer(con); } /* data, middle, front */ if (msg->data_length) diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index de85dddc3..cf638c009 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -122,51 +122,91 @@ static void __close_session(struct ceph_mon_client *monc) ceph_msg_revoke(monc->m_subscribe); ceph_msg_revoke_incoming(monc->m_subscribe_ack); ceph_con_close(&monc->con); - monc->cur_mon = -1; + monc->pending_auth = 0; ceph_auth_reset(monc->auth); } /* - * Open a session with a (new) monitor. + * Pick a new monitor at random and set cur_mon. If we are repicking + * (i.e. cur_mon is already set), be sure to pick a different one. */ -static int __open_session(struct ceph_mon_client *monc) +static void pick_new_mon(struct ceph_mon_client *monc) { - char r; - int ret; + int old_mon = monc->cur_mon; - if (monc->cur_mon < 0) { - get_random_bytes(&r, 1); - monc->cur_mon = r % monc->monmap->num_mon; - dout("open_session num=%d r=%d -> mon%d\n", - monc->monmap->num_mon, r, monc->cur_mon); - monc->sub_sent = 0; - monc->sub_renew_after = jiffies; /* i.e., expired */ - monc->want_next_osdmap = !!monc->want_next_osdmap; - - dout("open_session mon%d opening\n", monc->cur_mon); - ceph_con_open(&monc->con, - CEPH_ENTITY_TYPE_MON, monc->cur_mon, - &monc->monmap->mon_inst[monc->cur_mon].addr); - - /* send an initial keepalive to ensure our timestamp is - * valid by the time we are in an OPENED state */ - ceph_con_keepalive(&monc->con); - - /* initiatiate authentication handshake */ - ret = ceph_auth_build_hello(monc->auth, - monc->m_auth->front.iov_base, - monc->m_auth->front_alloc_len); - __send_prepared_auth_request(monc, ret); + BUG_ON(monc->monmap->num_mon < 1); + + if (monc->monmap->num_mon == 1) { + monc->cur_mon = 0; } else { - dout("open_session mon%d already open\n", monc->cur_mon); + int max = monc->monmap->num_mon; + int o = -1; + int n; + + if (monc->cur_mon >= 0) { + if (monc->cur_mon < monc->monmap->num_mon) + o = monc->cur_mon; + if (o >= 0) + max--; + } + + n = prandom_u32() % max; + if (o >= 0 && n >= o) + n++; + + monc->cur_mon = n; } - return 0; + + dout("%s mon%d -> mon%d out of %d mons\n", __func__, old_mon, + monc->cur_mon, monc->monmap->num_mon); +} + +/* + * Open a session with a new monitor. + */ +static void __open_session(struct ceph_mon_client *monc) +{ + int ret; + + pick_new_mon(monc); + + monc->hunting = true; + if (monc->had_a_connection) { + monc->hunt_mult *= CEPH_MONC_HUNT_BACKOFF; + if (monc->hunt_mult > CEPH_MONC_HUNT_MAX_MULT) + monc->hunt_mult = CEPH_MONC_HUNT_MAX_MULT; + } + + monc->sub_renew_after = jiffies; /* i.e., expired */ + monc->sub_renew_sent = 0; + + dout("%s opening mon%d\n", __func__, monc->cur_mon); + ceph_con_open(&monc->con, CEPH_ENTITY_TYPE_MON, monc->cur_mon, + &monc->monmap->mon_inst[monc->cur_mon].addr); + + /* + * send an initial keepalive to ensure our timestamp is valid + * by the time we are in an OPENED state + */ + ceph_con_keepalive(&monc->con); + + /* initiate authentication handshake */ + ret = ceph_auth_build_hello(monc->auth, + monc->m_auth->front.iov_base, + monc->m_auth->front_alloc_len); + BUG_ON(ret <= 0); + __send_prepared_auth_request(monc, ret); } -static bool __sub_expired(struct ceph_mon_client *monc) +static void reopen_session(struct ceph_mon_client *monc) { - return time_after_eq(jiffies, monc->sub_renew_after); + if (!monc->hunting) + pr_info("mon%d %s session lost, hunting for new mon\n", + monc->cur_mon, ceph_pr_addr(&monc->con.peer_addr.in_addr)); + + __close_session(monc); + __open_session(monc); } /* @@ -174,74 +214,70 @@ static bool __sub_expired(struct ceph_mon_client *monc) */ static void __schedule_delayed(struct ceph_mon_client *monc) { - struct ceph_options *opt = monc->client->options; unsigned long delay; - if (monc->cur_mon < 0 || __sub_expired(monc)) { - delay = 10 * HZ; - } else { - delay = 20 * HZ; - if (opt->monc_ping_timeout > 0) - delay = min(delay, opt->monc_ping_timeout / 3); - } + if (monc->hunting) + delay = CEPH_MONC_HUNT_INTERVAL * monc->hunt_mult; + else + delay = CEPH_MONC_PING_INTERVAL; + dout("__schedule_delayed after %lu\n", delay); - schedule_delayed_work(&monc->delayed_work, - round_jiffies_relative(delay)); + mod_delayed_work(system_wq, &monc->delayed_work, + round_jiffies_relative(delay)); } +const char *ceph_sub_str[] = { + [CEPH_SUB_MDSMAP] = "mdsmap", + [CEPH_SUB_MONMAP] = "monmap", + [CEPH_SUB_OSDMAP] = "osdmap", +}; + /* - * Send subscribe request for mdsmap and/or osdmap. + * Send subscribe request for one or more maps, according to + * monc->subs. */ static void __send_subscribe(struct ceph_mon_client *monc) { - dout("__send_subscribe sub_sent=%u exp=%u want_osd=%d\n", - (unsigned int)monc->sub_sent, __sub_expired(monc), - monc->want_next_osdmap); - if ((__sub_expired(monc) && !monc->sub_sent) || - monc->want_next_osdmap == 1) { - struct ceph_msg *msg = monc->m_subscribe; - struct ceph_mon_subscribe_item *i; - void *p, *end; - int num; - - p = msg->front.iov_base; - end = p + msg->front_alloc_len; - - num = 1 + !!monc->want_next_osdmap + !!monc->want_mdsmap; - ceph_encode_32(&p, num); - - if (monc->want_next_osdmap) { - dout("__send_subscribe to 'osdmap' %u\n", - (unsigned int)monc->have_osdmap); - ceph_encode_string(&p, end, "osdmap", 6); - i = p; - i->have = cpu_to_le64(monc->have_osdmap); - i->onetime = 1; - p += sizeof(*i); - monc->want_next_osdmap = 2; /* requested */ - } - if (monc->want_mdsmap) { - dout("__send_subscribe to 'mdsmap' %u+\n", - (unsigned int)monc->have_mdsmap); - ceph_encode_string(&p, end, "mdsmap", 6); - i = p; - i->have = cpu_to_le64(monc->have_mdsmap); - i->onetime = 0; - p += sizeof(*i); - } - ceph_encode_string(&p, end, "monmap", 6); - i = p; - i->have = 0; - i->onetime = 0; - p += sizeof(*i); - - msg->front.iov_len = p - msg->front.iov_base; - msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); - ceph_msg_revoke(msg); - ceph_con_send(&monc->con, ceph_msg_get(msg)); - - monc->sub_sent = jiffies | 1; /* never 0 */ + struct ceph_msg *msg = monc->m_subscribe; + void *p = msg->front.iov_base; + void *const end = p + msg->front_alloc_len; + int num = 0; + int i; + + dout("%s sent %lu\n", __func__, monc->sub_renew_sent); + + BUG_ON(monc->cur_mon < 0); + + if (!monc->sub_renew_sent) + monc->sub_renew_sent = jiffies | 1; /* never 0 */ + + msg->hdr.version = cpu_to_le16(2); + + for (i = 0; i < ARRAY_SIZE(monc->subs); i++) { + if (monc->subs[i].want) + num++; } + BUG_ON(num < 1); /* monmap sub is always there */ + ceph_encode_32(&p, num); + for (i = 0; i < ARRAY_SIZE(monc->subs); i++) { + const char *s = ceph_sub_str[i]; + + if (!monc->subs[i].want) + continue; + + dout("%s %s start %llu flags 0x%x\n", __func__, s, + le64_to_cpu(monc->subs[i].item.start), + monc->subs[i].item.flags); + ceph_encode_string(&p, end, s, strlen(s)); + memcpy(p, &monc->subs[i].item, sizeof(monc->subs[i].item)); + p += sizeof(monc->subs[i].item); + } + + BUG_ON(p != (end - 35 - (ARRAY_SIZE(monc->subs) - num) * 19)); + msg->front.iov_len = p - msg->front.iov_base; + msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); + ceph_msg_revoke(msg); + ceph_con_send(&monc->con, ceph_msg_get(msg)); } static void handle_subscribe_ack(struct ceph_mon_client *monc, @@ -255,15 +291,16 @@ static void handle_subscribe_ack(struct ceph_mon_client *monc, seconds = le32_to_cpu(h->duration); mutex_lock(&monc->mutex); - if (monc->hunting) { - pr_info("mon%d %s session established\n", - monc->cur_mon, - ceph_pr_addr(&monc->con.peer_addr.in_addr)); - monc->hunting = false; + if (monc->sub_renew_sent) { + monc->sub_renew_after = monc->sub_renew_sent + + (seconds >> 1) * HZ - 1; + dout("%s sent %lu duration %d renew after %lu\n", __func__, + monc->sub_renew_sent, seconds, monc->sub_renew_after); + monc->sub_renew_sent = 0; + } else { + dout("%s sent %lu renew after %lu, ignoring\n", __func__, + monc->sub_renew_sent, monc->sub_renew_after); } - dout("handle_subscribe_ack after %d seconds\n", seconds); - monc->sub_renew_after = monc->sub_sent + (seconds >> 1)*HZ - 1; - monc->sub_sent = 0; mutex_unlock(&monc->mutex); return; bad: @@ -272,36 +309,82 @@ bad: } /* - * Keep track of which maps we have + * Register interest in a map + * + * @sub: one of CEPH_SUB_* + * @epoch: X for "every map since X", or 0 for "just the latest" */ -int ceph_monc_got_mdsmap(struct ceph_mon_client *monc, u32 got) +static bool __ceph_monc_want_map(struct ceph_mon_client *monc, int sub, + u32 epoch, bool continuous) +{ + __le64 start = cpu_to_le64(epoch); + u8 flags = !continuous ? CEPH_SUBSCRIBE_ONETIME : 0; + + dout("%s %s epoch %u continuous %d\n", __func__, ceph_sub_str[sub], + epoch, continuous); + + if (monc->subs[sub].want && + monc->subs[sub].item.start == start && + monc->subs[sub].item.flags == flags) + return false; + + monc->subs[sub].item.start = start; + monc->subs[sub].item.flags = flags; + monc->subs[sub].want = true; + + return true; +} + +bool ceph_monc_want_map(struct ceph_mon_client *monc, int sub, u32 epoch, + bool continuous) { + bool need_request; + mutex_lock(&monc->mutex); - monc->have_mdsmap = got; + need_request = __ceph_monc_want_map(monc, sub, epoch, continuous); mutex_unlock(&monc->mutex); - return 0; + + return need_request; } -EXPORT_SYMBOL(ceph_monc_got_mdsmap); +EXPORT_SYMBOL(ceph_monc_want_map); -int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 got) +/* + * Keep track of which maps we have + * + * @sub: one of CEPH_SUB_* + */ +static void __ceph_monc_got_map(struct ceph_mon_client *monc, int sub, + u32 epoch) +{ + dout("%s %s epoch %u\n", __func__, ceph_sub_str[sub], epoch); + + if (monc->subs[sub].want) { + if (monc->subs[sub].item.flags & CEPH_SUBSCRIBE_ONETIME) + monc->subs[sub].want = false; + else + monc->subs[sub].item.start = cpu_to_le64(epoch + 1); + } + + monc->subs[sub].have = epoch; +} + +void ceph_monc_got_map(struct ceph_mon_client *monc, int sub, u32 epoch) { mutex_lock(&monc->mutex); - monc->have_osdmap = got; - monc->want_next_osdmap = 0; + __ceph_monc_got_map(monc, sub, epoch); mutex_unlock(&monc->mutex); - return 0; } +EXPORT_SYMBOL(ceph_monc_got_map); /* * Register interest in the next osdmap */ void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc) { - dout("request_next_osdmap have %u\n", monc->have_osdmap); + dout("%s have %u\n", __func__, monc->subs[CEPH_SUB_OSDMAP].have); mutex_lock(&monc->mutex); - if (!monc->want_next_osdmap) - monc->want_next_osdmap = 1; - if (monc->want_next_osdmap < 2) + if (__ceph_monc_want_map(monc, CEPH_SUB_OSDMAP, + monc->subs[CEPH_SUB_OSDMAP].have + 1, false)) __send_subscribe(monc); mutex_unlock(&monc->mutex); } @@ -320,15 +403,15 @@ int ceph_monc_wait_osdmap(struct ceph_mon_client *monc, u32 epoch, long ret; mutex_lock(&monc->mutex); - while (monc->have_osdmap < epoch) { + while (monc->subs[CEPH_SUB_OSDMAP].have < epoch) { mutex_unlock(&monc->mutex); if (timeout && time_after_eq(jiffies, started + timeout)) return -ETIMEDOUT; ret = wait_event_interruptible_timeout(monc->client->auth_wq, - monc->have_osdmap >= epoch, - ceph_timeout_jiffies(timeout)); + monc->subs[CEPH_SUB_OSDMAP].have >= epoch, + ceph_timeout_jiffies(timeout)); if (ret < 0) return ret; @@ -341,11 +424,14 @@ int ceph_monc_wait_osdmap(struct ceph_mon_client *monc, u32 epoch, EXPORT_SYMBOL(ceph_monc_wait_osdmap); /* - * + * Open a session with a random monitor. Request monmap and osdmap, + * which are waited upon in __ceph_open_session(). */ int ceph_monc_open_session(struct ceph_mon_client *monc) { mutex_lock(&monc->mutex); + __ceph_monc_want_map(monc, CEPH_SUB_MONMAP, 0, true); + __ceph_monc_want_map(monc, CEPH_SUB_OSDMAP, 0, false); __open_session(monc); __schedule_delayed(monc); mutex_unlock(&monc->mutex); @@ -353,29 +439,15 @@ int ceph_monc_open_session(struct ceph_mon_client *monc) } EXPORT_SYMBOL(ceph_monc_open_session); -/* - * We require the fsid and global_id in order to initialize our - * debugfs dir. - */ -static bool have_debugfs_info(struct ceph_mon_client *monc) -{ - dout("have_debugfs_info fsid %d globalid %lld\n", - (int)monc->client->have_fsid, monc->auth->global_id); - return monc->client->have_fsid && monc->auth->global_id > 0; -} - static void ceph_monc_handle_map(struct ceph_mon_client *monc, struct ceph_msg *msg) { struct ceph_client *client = monc->client; struct ceph_monmap *monmap = NULL, *old = monc->monmap; void *p, *end; - int had_debugfs_info, init_debugfs = 0; mutex_lock(&monc->mutex); - had_debugfs_info = have_debugfs_info(monc); - dout("handle_monmap\n"); p = msg->front.iov_base; end = p + msg->front.iov_len; @@ -395,29 +467,11 @@ static void ceph_monc_handle_map(struct ceph_mon_client *monc, client->monc.monmap = monmap; kfree(old); - if (!client->have_fsid) { - client->have_fsid = true; - if (!had_debugfs_info && have_debugfs_info(monc)) { - pr_info("client%lld fsid %pU\n", - ceph_client_id(monc->client), - &monc->client->fsid); - init_debugfs = 1; - } - mutex_unlock(&monc->mutex); - - if (init_debugfs) { - /* - * do debugfs initialization without mutex to avoid - * creating a locking dependency - */ - ceph_debugfs_client_init(monc->client); - } + __ceph_monc_got_map(monc, CEPH_SUB_MONMAP, monc->monmap->epoch); + client->have_fsid = true; - goto out_unlocked; - } out: mutex_unlock(&monc->mutex); -out_unlocked: wake_up_all(&client->auth_wq); } @@ -745,18 +799,15 @@ static void delayed_work(struct work_struct *work) dout("monc delayed_work\n"); mutex_lock(&monc->mutex); if (monc->hunting) { - __close_session(monc); - __open_session(monc); /* continue hunting */ + dout("%s continuing hunt\n", __func__); + reopen_session(monc); } else { - struct ceph_options *opt = monc->client->options; int is_auth = ceph_auth_is_authenticated(monc->auth); if (ceph_con_keepalive_expired(&monc->con, - opt->monc_ping_timeout)) { + CEPH_MONC_PING_TIMEOUT)) { dout("monc keepalive timeout\n"); is_auth = 0; - __close_session(monc); - monc->hunting = true; - __open_session(monc); + reopen_session(monc); } if (!monc->hunting) { @@ -764,8 +815,14 @@ static void delayed_work(struct work_struct *work) __validate_auth(monc); } - if (is_auth) - __send_subscribe(monc); + if (is_auth) { + unsigned long now = jiffies; + + dout("%s renew subs? now %lu renew after %lu\n", + __func__, now, monc->sub_renew_after); + if (time_after_eq(now, monc->sub_renew_after)) + __send_subscribe(monc); + } } __schedule_delayed(monc); mutex_unlock(&monc->mutex); @@ -852,18 +909,14 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl) &monc->client->msgr); monc->cur_mon = -1; - monc->hunting = true; - monc->sub_renew_after = jiffies; - monc->sub_sent = 0; + monc->had_a_connection = false; + monc->hunt_mult = 1; INIT_DELAYED_WORK(&monc->delayed_work, delayed_work); monc->generic_request_tree = RB_ROOT; monc->num_generic_requests = 0; monc->last_tid = 0; - monc->have_mdsmap = 0; - monc->have_osdmap = 0; - monc->want_next_osdmap = 1; return 0; out_auth_reply: @@ -888,7 +941,7 @@ void ceph_monc_stop(struct ceph_mon_client *monc) mutex_lock(&monc->mutex); __close_session(monc); - + monc->cur_mon = -1; mutex_unlock(&monc->mutex); /* @@ -910,26 +963,40 @@ void ceph_monc_stop(struct ceph_mon_client *monc) } EXPORT_SYMBOL(ceph_monc_stop); +static void finish_hunting(struct ceph_mon_client *monc) +{ + if (monc->hunting) { + dout("%s found mon%d\n", __func__, monc->cur_mon); + monc->hunting = false; + monc->had_a_connection = true; + monc->hunt_mult /= 2; /* reduce by 50% */ + if (monc->hunt_mult < 1) + monc->hunt_mult = 1; + } +} + static void handle_auth_reply(struct ceph_mon_client *monc, struct ceph_msg *msg) { int ret; int was_auth = 0; - int had_debugfs_info, init_debugfs = 0; mutex_lock(&monc->mutex); - had_debugfs_info = have_debugfs_info(monc); was_auth = ceph_auth_is_authenticated(monc->auth); monc->pending_auth = 0; ret = ceph_handle_auth_reply(monc->auth, msg->front.iov_base, msg->front.iov_len, monc->m_auth->front.iov_base, monc->m_auth->front_alloc_len); + if (ret > 0) { + __send_prepared_auth_request(monc, ret); + goto out; + } + + finish_hunting(monc); + if (ret < 0) { monc->client->auth_err = ret; - wake_up_all(&monc->client->auth_wq); - } else if (ret > 0) { - __send_prepared_auth_request(monc, ret); } else if (!was_auth && ceph_auth_is_authenticated(monc->auth)) { dout("authenticated, starting session\n"); @@ -939,23 +1006,15 @@ static void handle_auth_reply(struct ceph_mon_client *monc, __send_subscribe(monc); __resend_generic_request(monc); - } - if (!had_debugfs_info && have_debugfs_info(monc)) { - pr_info("client%lld fsid %pU\n", - ceph_client_id(monc->client), - &monc->client->fsid); - init_debugfs = 1; + pr_info("mon%d %s session established\n", monc->cur_mon, + ceph_pr_addr(&monc->con.peer_addr.in_addr)); } - mutex_unlock(&monc->mutex); - if (init_debugfs) { - /* - * do debugfs initialization without mutex to avoid - * creating a locking dependency - */ - ceph_debugfs_client_init(monc->client); - } +out: + mutex_unlock(&monc->mutex); + if (monc->client->auth_err < 0) + wake_up_all(&monc->client->auth_wq); } static int __validate_auth(struct ceph_mon_client *monc) @@ -1096,29 +1155,17 @@ static void mon_fault(struct ceph_connection *con) { struct ceph_mon_client *monc = con->private; - if (!monc) - return; - - dout("mon_fault\n"); mutex_lock(&monc->mutex); - if (!con->private) - goto out; - - if (!monc->hunting) - pr_info("mon%d %s session lost, " - "hunting for new mon\n", monc->cur_mon, - ceph_pr_addr(&monc->con.peer_addr.in_addr)); - - __close_session(monc); - if (!monc->hunting) { - /* start hunting */ - monc->hunting = true; - __open_session(monc); - } else { - /* already hunting, let's wait a bit */ - __schedule_delayed(monc); + dout("%s mon%d\n", __func__, monc->cur_mon); + if (monc->cur_mon >= 0) { + if (!monc->hunting) { + dout("%s hunting for new mon\n", __func__); + reopen_session(monc); + __schedule_delayed(monc); + } else { + dout("%s already hunting\n", __func__); + } } -out: mutex_unlock(&monc->mutex); } diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 5bc053778..40a53a70e 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -338,9 +338,10 @@ static void ceph_osdc_release_request(struct kref *kref) ceph_put_snap_context(req->r_snapc); if (req->r_mempool) mempool_free(req, req->r_osdc->req_mempool); - else + else if (req->r_num_ops <= CEPH_OSD_SLAB_OPS) kmem_cache_free(ceph_osd_request_cache, req); - + else + kfree(req); } void ceph_osdc_get_request(struct ceph_osd_request *req) @@ -369,28 +370,22 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, struct ceph_msg *msg; size_t msg_size; - BUILD_BUG_ON(CEPH_OSD_MAX_OP > U16_MAX); - BUG_ON(num_ops > CEPH_OSD_MAX_OP); - - msg_size = 4 + 4 + 8 + 8 + 4+8; - msg_size += 2 + 4 + 8 + 4 + 4; /* oloc */ - msg_size += 1 + 8 + 4 + 4; /* pg_t */ - msg_size += 4 + CEPH_MAX_OID_NAME_LEN; /* oid */ - msg_size += 2 + num_ops*sizeof(struct ceph_osd_op); - msg_size += 8; /* snapid */ - msg_size += 8; /* snap_seq */ - msg_size += 8 * (snapc ? snapc->num_snaps : 0); /* snaps */ - msg_size += 4; - if (use_mempool) { + BUG_ON(num_ops > CEPH_OSD_SLAB_OPS); req = mempool_alloc(osdc->req_mempool, gfp_flags); - memset(req, 0, sizeof(*req)); + } else if (num_ops <= CEPH_OSD_SLAB_OPS) { + req = kmem_cache_alloc(ceph_osd_request_cache, gfp_flags); } else { - req = kmem_cache_zalloc(ceph_osd_request_cache, gfp_flags); + BUG_ON(num_ops > CEPH_OSD_MAX_OPS); + req = kmalloc(sizeof(*req) + num_ops * sizeof(req->r_ops[0]), + gfp_flags); } - if (req == NULL) + if (unlikely(!req)) return NULL; + /* req only, each op is zeroed in _osd_req_op_init() */ + memset(req, 0, sizeof(*req)); + req->r_osdc = osdc; req->r_mempool = use_mempool; req->r_num_ops = num_ops; @@ -408,18 +403,36 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, req->r_base_oloc.pool = -1; req->r_target_oloc.pool = -1; + msg_size = OSD_OPREPLY_FRONT_LEN; + if (num_ops > CEPH_OSD_SLAB_OPS) { + /* ceph_osd_op and rval */ + msg_size += (num_ops - CEPH_OSD_SLAB_OPS) * + (sizeof(struct ceph_osd_op) + 4); + } + /* create reply message */ if (use_mempool) msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0); else - msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, - OSD_OPREPLY_FRONT_LEN, gfp_flags, true); + msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, msg_size, + gfp_flags, true); if (!msg) { ceph_osdc_put_request(req); return NULL; } req->r_reply = msg; + msg_size = 4 + 4 + 4; /* client_inc, osdmap_epoch, flags */ + msg_size += 4 + 4 + 4 + 8; /* mtime, reassert_version */ + msg_size += 2 + 4 + 8 + 4 + 4; /* oloc */ + msg_size += 1 + 8 + 4 + 4; /* pgid */ + msg_size += 4 + CEPH_MAX_OID_NAME_LEN; /* oid */ + msg_size += 2 + num_ops * sizeof(struct ceph_osd_op); + msg_size += 8; /* snapid */ + msg_size += 8; /* snap_seq */ + msg_size += 4 + 8 * (snapc ? snapc->num_snaps : 0); /* snaps */ + msg_size += 4; /* retry_attempt */ + /* create request message; allow space for oid */ if (use_mempool) msg = ceph_msgpool_get(&osdc->msgpool_op, 0); @@ -498,7 +511,7 @@ void osd_req_op_extent_init(struct ceph_osd_request *osd_req, if (opcode == CEPH_OSD_OP_WRITE || opcode == CEPH_OSD_OP_WRITEFULL) payload_len += length; - op->payload_len = payload_len; + op->indata_len = payload_len; } EXPORT_SYMBOL(osd_req_op_extent_init); @@ -517,10 +530,32 @@ void osd_req_op_extent_update(struct ceph_osd_request *osd_req, BUG_ON(length > previous); op->extent.length = length; - op->payload_len -= previous - length; + op->indata_len -= previous - length; } EXPORT_SYMBOL(osd_req_op_extent_update); +void osd_req_op_extent_dup_last(struct ceph_osd_request *osd_req, + unsigned int which, u64 offset_inc) +{ + struct ceph_osd_req_op *op, *prev_op; + + BUG_ON(which + 1 >= osd_req->r_num_ops); + + prev_op = &osd_req->r_ops[which]; + op = _osd_req_op_init(osd_req, which + 1, prev_op->op, prev_op->flags); + /* dup previous one */ + op->indata_len = prev_op->indata_len; + op->outdata_len = prev_op->outdata_len; + op->extent = prev_op->extent; + /* adjust offset */ + op->extent.offset += offset_inc; + op->extent.length -= offset_inc; + + if (op->op == CEPH_OSD_OP_WRITE || op->op == CEPH_OSD_OP_WRITEFULL) + op->indata_len -= offset_inc; +} +EXPORT_SYMBOL(osd_req_op_extent_dup_last); + void osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which, u16 opcode, const char *class, const char *method) { @@ -554,7 +589,7 @@ void osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which, op->cls.argc = 0; /* currently unused */ - op->payload_len = payload_len; + op->indata_len = payload_len; } EXPORT_SYMBOL(osd_req_op_cls_init); @@ -587,7 +622,7 @@ int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which, op->xattr.cmp_mode = cmp_mode; ceph_osd_data_pagelist_init(&op->xattr.osd_data, pagelist); - op->payload_len = payload_len; + op->indata_len = payload_len; return 0; } EXPORT_SYMBOL(osd_req_op_xattr_init); @@ -707,7 +742,7 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req, BUG_ON(osd_data->type == CEPH_OSD_DATA_TYPE_NONE); dst->cls.indata_len = cpu_to_le32(data_length); ceph_osdc_msg_data_add(req->r_request, osd_data); - src->payload_len += data_length; + src->indata_len += data_length; request_data_len += data_length; } osd_data = &src->cls.response_data; @@ -750,7 +785,7 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req, dst->op = cpu_to_le16(src->op); dst->flags = cpu_to_le32(src->flags); - dst->payload_len = cpu_to_le32(src->payload_len); + dst->payload_len = cpu_to_le32(src->indata_len); return request_data_len; } @@ -1052,10 +1087,8 @@ static void put_osd(struct ceph_osd *osd) dout("put_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref), atomic_read(&osd->o_ref) - 1); if (atomic_dec_and_test(&osd->o_ref)) { - struct ceph_auth_client *ac = osd->o_osdc->client->monc.auth; - if (osd->o_auth.authorizer) - ceph_auth_destroy_authorizer(ac, osd->o_auth.authorizer); + ceph_auth_destroy_authorizer(osd->o_auth.authorizer); kfree(osd); } } @@ -1810,7 +1843,7 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg) ceph_decode_need(&p, end, 4, bad_put); numops = ceph_decode_32(&p); - if (numops > CEPH_OSD_MAX_OP) + if (numops > CEPH_OSD_MAX_OPS) goto bad_put; if (numops != req->r_num_ops) goto bad_put; @@ -1821,7 +1854,7 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg) int len; len = le32_to_cpu(op->payload_len); - req->r_reply_op_len[i] = len; + req->r_ops[i].outdata_len = len; dout(" op %d has %d bytes\n", i, len); payload_len += len; p += sizeof(*op); @@ -1836,7 +1869,7 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg) ceph_decode_need(&p, end, 4 + numops * 4, bad_put); retry_attempt = ceph_decode_32(&p); for (i = 0; i < numops; i++) - req->r_reply_op_result[i] = ceph_decode_32(&p); + req->r_ops[i].rval = ceph_decode_32(&p); if (le16_to_cpu(msg->hdr.version) >= 6) { p += 8 + 4; /* skip replay_version */ @@ -2187,7 +2220,8 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg) goto bad; done: downgrade_write(&osdc->map_sem); - ceph_monc_got_osdmap(&osdc->client->monc, osdc->osdmap->epoch); + ceph_monc_got_map(&osdc->client->monc, CEPH_SUB_OSDMAP, + osdc->osdmap->epoch); /* * subscribe to subsequent osdmap updates if full to ensure @@ -2646,8 +2680,8 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client) round_jiffies_relative(osdc->client->options->osd_idle_ttl)); err = -ENOMEM; - osdc->req_mempool = mempool_create_kmalloc_pool(10, - sizeof(struct ceph_osd_request)); + osdc->req_mempool = mempool_create_slab_pool(10, + ceph_osd_request_cache); if (!osdc->req_mempool) goto out; @@ -2782,11 +2816,12 @@ EXPORT_SYMBOL(ceph_osdc_writepages); int ceph_osdc_setup(void) { + size_t size = sizeof(struct ceph_osd_request) + + CEPH_OSD_SLAB_OPS * sizeof(struct ceph_osd_req_op); + BUG_ON(ceph_osd_request_cache); - ceph_osd_request_cache = kmem_cache_create("ceph_osd_request", - sizeof (struct ceph_osd_request), - __alignof__(struct ceph_osd_request), - 0, NULL); + ceph_osd_request_cache = kmem_cache_create("ceph_osd_request", size, + 0, 0, NULL); return ceph_osd_request_cache ? 0 : -ENOMEM; } @@ -2947,7 +2982,7 @@ static struct ceph_auth_handshake *get_authorizer(struct ceph_connection *con, struct ceph_auth_handshake *auth = &o->o_auth; if (force_new && auth->authorizer) { - ceph_auth_destroy_authorizer(ac, auth->authorizer); + ceph_auth_destroy_authorizer(auth->authorizer); auth->authorizer = NULL; } if (!auth->authorizer) { diff --git a/net/ceph/pagelist.c b/net/ceph/pagelist.c index c7c220a73..6864007e6 100644 --- a/net/ceph/pagelist.c +++ b/net/ceph/pagelist.c @@ -56,7 +56,7 @@ int ceph_pagelist_append(struct ceph_pagelist *pl, const void *buf, size_t len) size_t bit = pl->room; int ret; - memcpy(pl->mapped_tail + (pl->length & ~PAGE_CACHE_MASK), + memcpy(pl->mapped_tail + (pl->length & ~PAGE_MASK), buf, bit); pl->length += bit; pl->room -= bit; @@ -67,7 +67,7 @@ int ceph_pagelist_append(struct ceph_pagelist *pl, const void *buf, size_t len) return ret; } - memcpy(pl->mapped_tail + (pl->length & ~PAGE_CACHE_MASK), buf, len); + memcpy(pl->mapped_tail + (pl->length & ~PAGE_MASK), buf, len); pl->length += len; pl->room -= len; return 0; diff --git a/net/ceph/pagevec.c b/net/ceph/pagevec.c index d4f5f220a..00d260140 100644 --- a/net/ceph/pagevec.c +++ b/net/ceph/pagevec.c @@ -24,7 +24,7 @@ struct page **ceph_get_direct_page_vector(const void __user *data, return ERR_PTR(-ENOMEM); while (got < num_pages) { - rc = get_user_pages_unlocked(current, current->mm, + rc = get_user_pages_unlocked( (unsigned long)data + ((unsigned long)got * PAGE_SIZE), num_pages - got, write_page, 0, pages + got); if (rc < 0) @@ -95,19 +95,19 @@ int ceph_copy_user_to_page_vector(struct page **pages, loff_t off, size_t len) { int i = 0; - int po = off & ~PAGE_CACHE_MASK; + int po = off & ~PAGE_MASK; int left = len; int l, bad; while (left > 0) { - l = min_t(int, PAGE_CACHE_SIZE-po, left); + l = min_t(int, PAGE_SIZE-po, left); bad = copy_from_user(page_address(pages[i]) + po, data, l); if (bad == l) return -EFAULT; data += l - bad; left -= l - bad; po += l - bad; - if (po == PAGE_CACHE_SIZE) { + if (po == PAGE_SIZE) { po = 0; i++; } @@ -121,17 +121,17 @@ void ceph_copy_to_page_vector(struct page **pages, loff_t off, size_t len) { int i = 0; - size_t po = off & ~PAGE_CACHE_MASK; + size_t po = off & ~PAGE_MASK; size_t left = len; while (left > 0) { - size_t l = min_t(size_t, PAGE_CACHE_SIZE-po, left); + size_t l = min_t(size_t, PAGE_SIZE-po, left); memcpy(page_address(pages[i]) + po, data, l); data += l; left -= l; po += l; - if (po == PAGE_CACHE_SIZE) { + if (po == PAGE_SIZE) { po = 0; i++; } @@ -144,17 +144,17 @@ void ceph_copy_from_page_vector(struct page **pages, loff_t off, size_t len) { int i = 0; - size_t po = off & ~PAGE_CACHE_MASK; + size_t po = off & ~PAGE_MASK; size_t left = len; while (left > 0) { - size_t l = min_t(size_t, PAGE_CACHE_SIZE-po, left); + size_t l = min_t(size_t, PAGE_SIZE-po, left); memcpy(data, page_address(pages[i]) + po, l); data += l; left -= l; po += l; - if (po == PAGE_CACHE_SIZE) { + if (po == PAGE_SIZE) { po = 0; i++; } @@ -168,25 +168,25 @@ EXPORT_SYMBOL(ceph_copy_from_page_vector); */ void ceph_zero_page_vector_range(int off, int len, struct page **pages) { - int i = off >> PAGE_CACHE_SHIFT; + int i = off >> PAGE_SHIFT; - off &= ~PAGE_CACHE_MASK; + off &= ~PAGE_MASK; dout("zero_page_vector_page %u~%u\n", off, len); /* leading partial page? */ if (off) { - int end = min((int)PAGE_CACHE_SIZE, off + len); + int end = min((int)PAGE_SIZE, off + len); dout("zeroing %d %p head from %d\n", i, pages[i], (int)off); zero_user_segment(pages[i], off, end); len -= (end - off); i++; } - while (len >= PAGE_CACHE_SIZE) { + while (len >= PAGE_SIZE) { dout("zeroing %d %p len=%d\n", i, pages[i], len); - zero_user_segment(pages[i], 0, PAGE_CACHE_SIZE); - len -= PAGE_CACHE_SIZE; + zero_user_segment(pages[i], 0, PAGE_SIZE); + len -= PAGE_SIZE; i++; } /* trailing partial page? */ diff --git a/net/core/Makefile b/net/core/Makefile index 0b835de04..d6508c2dd 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -24,3 +24,6 @@ obj-$(CONFIG_NET_PTP_CLASSIFY) += ptp_classifier.o obj-$(CONFIG_CGROUP_NET_PRIO) += netprio_cgroup.o obj-$(CONFIG_CGROUP_NET_CLASSID) += netclassid_cgroup.o obj-$(CONFIG_LWTUNNEL) += lwtunnel.o +obj-$(CONFIG_DST_CACHE) += dst_cache.o +obj-$(CONFIG_HWBM) += hwbm.o +obj-$(CONFIG_NET_DEVLINK) += devlink.o diff --git a/net/core/dev.c b/net/core/dev.c index 0ef061b2b..5c925ac50 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2802,7 +2802,7 @@ static netdev_features_t harmonize_features(struct sk_buff *skb, if (skb->ip_summed != CHECKSUM_NONE && !can_checksum_protocol(features, type)) { - features &= ~NETIF_F_CSUM_MASK; + features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK); } else if (illegal_highdma(skb->dev, skb)) { features &= ~NETIF_F_SG; } @@ -3829,8 +3829,14 @@ static void net_tx_action(struct softirq_action *h) trace_consume_skb(skb); else trace_kfree_skb(skb, net_tx_action); - __kfree_skb(skb); + + if (skb->fclone != SKB_FCLONE_UNAVAILABLE) + __kfree_skb(skb); + else + __kfree_skb_defer(skb); } + + __kfree_skb_flush(); } if (sd->output_queue) { @@ -4154,7 +4160,10 @@ ncls: ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev); } else { drop: - atomic_long_inc(&skb->dev->rx_dropped); + if (!deliver_exact) + atomic_long_inc(&skb->dev->rx_dropped); + else + atomic_long_inc(&skb->dev->rx_nohandler); kfree_skb(skb); /* Jamal, now you will not able to escape explaining * me how you were going to use this. :-) @@ -4429,7 +4438,8 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff NAPI_GRO_CB(skb)->same_flow = 0; NAPI_GRO_CB(skb)->flush = 0; NAPI_GRO_CB(skb)->free = 0; - NAPI_GRO_CB(skb)->udp_mark = 0; + NAPI_GRO_CB(skb)->encap_mark = 0; + NAPI_GRO_CB(skb)->is_fou = 0; NAPI_GRO_CB(skb)->gro_remcsum_start = 0; /* Setup for GRO checksum validation */ @@ -5152,6 +5162,7 @@ static void net_rx_action(struct softirq_action *h) } } + __kfree_skb_flush(); local_irq_disable(); list_splice_tail_init(&sd->poll_list, &list); @@ -6435,6 +6446,7 @@ EXPORT_SYMBOL(dev_get_phys_port_id); * dev_get_phys_port_name - Get device physical port name * @dev: device * @name: port name + * @len: limit of bytes to copy to name * * Get device physical port name */ @@ -7253,24 +7265,31 @@ void netdev_run_todo(void) } } -/* Convert net_device_stats to rtnl_link_stats64. They have the same - * fields in the same order, with only the type differing. +/* Convert net_device_stats to rtnl_link_stats64. rtnl_link_stats64 has + * all the same fields in the same order as net_device_stats, with only + * the type differing, but rtnl_link_stats64 may have additional fields + * at the end for newer counters. */ void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64, const struct net_device_stats *netdev_stats) { #if BITS_PER_LONG == 64 - BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats)); + BUILD_BUG_ON(sizeof(*stats64) < sizeof(*netdev_stats)); memcpy(stats64, netdev_stats, sizeof(*stats64)); + /* zero out counters that only exist in rtnl_link_stats64 */ + memset((char *)stats64 + sizeof(*netdev_stats), 0, + sizeof(*stats64) - sizeof(*netdev_stats)); #else - size_t i, n = sizeof(*stats64) / sizeof(u64); + size_t i, n = sizeof(*netdev_stats) / sizeof(unsigned long); const unsigned long *src = (const unsigned long *)netdev_stats; u64 *dst = (u64 *)stats64; - BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) != - sizeof(*stats64) / sizeof(u64)); + BUILD_BUG_ON(n > sizeof(*stats64) / sizeof(u64)); for (i = 0; i < n; i++) dst[i] = src[i]; + /* zero out counters that only exist in rtnl_link_stats64 */ + memset((char *)stats64 + n * sizeof(u64), 0, + sizeof(*stats64) - n * sizeof(u64)); #endif } EXPORT_SYMBOL(netdev_stats_to_stats64); @@ -7300,6 +7319,7 @@ struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev, } storage->rx_dropped += atomic_long_read(&dev->rx_dropped); storage->tx_dropped += atomic_long_read(&dev->tx_dropped); + storage->rx_nohandler += atomic_long_read(&dev->rx_nohandler); return storage; } EXPORT_SYMBOL(dev_get_stats); diff --git a/net/core/devlink.c b/net/core/devlink.c new file mode 100644 index 000000000..590fa561c --- /dev/null +++ b/net/core/devlink.c @@ -0,0 +1,738 @@ +/* + * net/core/devlink.c - Network physical/parent device Netlink interface + * + * Heavily inspired by net/wireless/ + * Copyright (c) 2016 Mellanox Technologies. All rights reserved. + * Copyright (c) 2016 Jiri Pirko <jiri@mellanox.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/gfp.h> +#include <linux/device.h> +#include <linux/list.h> +#include <linux/netdevice.h> +#include <rdma/ib_verbs.h> +#include <net/netlink.h> +#include <net/genetlink.h> +#include <net/rtnetlink.h> +#include <net/net_namespace.h> +#include <net/sock.h> +#include <net/devlink.h> + +static LIST_HEAD(devlink_list); + +/* devlink_mutex + * + * An overall lock guarding every operation coming from userspace. + * It also guards devlink devices list and it is taken when + * driver registers/unregisters it. + */ +static DEFINE_MUTEX(devlink_mutex); + +/* devlink_port_mutex + * + * Shared lock to guard lists of ports in all devlink devices. + */ +static DEFINE_MUTEX(devlink_port_mutex); + +static struct net *devlink_net(const struct devlink *devlink) +{ + return read_pnet(&devlink->_net); +} + +static void devlink_net_set(struct devlink *devlink, struct net *net) +{ + write_pnet(&devlink->_net, net); +} + +static struct devlink *devlink_get_from_attrs(struct net *net, + struct nlattr **attrs) +{ + struct devlink *devlink; + char *busname; + char *devname; + + if (!attrs[DEVLINK_ATTR_BUS_NAME] || !attrs[DEVLINK_ATTR_DEV_NAME]) + return ERR_PTR(-EINVAL); + + busname = nla_data(attrs[DEVLINK_ATTR_BUS_NAME]); + devname = nla_data(attrs[DEVLINK_ATTR_DEV_NAME]); + + list_for_each_entry(devlink, &devlink_list, list) { + if (strcmp(devlink->dev->bus->name, busname) == 0 && + strcmp(dev_name(devlink->dev), devname) == 0 && + net_eq(devlink_net(devlink), net)) + return devlink; + } + + return ERR_PTR(-ENODEV); +} + +static struct devlink *devlink_get_from_info(struct genl_info *info) +{ + return devlink_get_from_attrs(genl_info_net(info), info->attrs); +} + +static struct devlink_port *devlink_port_get_by_index(struct devlink *devlink, + int port_index) +{ + struct devlink_port *devlink_port; + + list_for_each_entry(devlink_port, &devlink->port_list, list) { + if (devlink_port->index == port_index) + return devlink_port; + } + return NULL; +} + +static bool devlink_port_index_exists(struct devlink *devlink, int port_index) +{ + return devlink_port_get_by_index(devlink, port_index); +} + +static struct devlink_port *devlink_port_get_from_attrs(struct devlink *devlink, + struct nlattr **attrs) +{ + if (attrs[DEVLINK_ATTR_PORT_INDEX]) { + u32 port_index = nla_get_u32(attrs[DEVLINK_ATTR_PORT_INDEX]); + struct devlink_port *devlink_port; + + devlink_port = devlink_port_get_by_index(devlink, port_index); + if (!devlink_port) + return ERR_PTR(-ENODEV); + return devlink_port; + } + return ERR_PTR(-EINVAL); +} + +static struct devlink_port *devlink_port_get_from_info(struct devlink *devlink, + struct genl_info *info) +{ + return devlink_port_get_from_attrs(devlink, info->attrs); +} + +#define DEVLINK_NL_FLAG_NEED_PORT BIT(0) + +static int devlink_nl_pre_doit(const struct genl_ops *ops, + struct sk_buff *skb, struct genl_info *info) +{ + struct devlink *devlink; + + mutex_lock(&devlink_mutex); + devlink = devlink_get_from_info(info); + if (IS_ERR(devlink)) { + mutex_unlock(&devlink_mutex); + return PTR_ERR(devlink); + } + info->user_ptr[0] = devlink; + if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_PORT) { + struct devlink_port *devlink_port; + + mutex_lock(&devlink_port_mutex); + devlink_port = devlink_port_get_from_info(devlink, info); + if (IS_ERR(devlink_port)) { + mutex_unlock(&devlink_port_mutex); + mutex_unlock(&devlink_mutex); + return PTR_ERR(devlink_port); + } + info->user_ptr[1] = devlink_port; + } + return 0; +} + +static void devlink_nl_post_doit(const struct genl_ops *ops, + struct sk_buff *skb, struct genl_info *info) +{ + if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_PORT) + mutex_unlock(&devlink_port_mutex); + mutex_unlock(&devlink_mutex); +} + +static struct genl_family devlink_nl_family = { + .id = GENL_ID_GENERATE, + .name = DEVLINK_GENL_NAME, + .version = DEVLINK_GENL_VERSION, + .maxattr = DEVLINK_ATTR_MAX, + .netnsok = true, + .pre_doit = devlink_nl_pre_doit, + .post_doit = devlink_nl_post_doit, +}; + +enum devlink_multicast_groups { + DEVLINK_MCGRP_CONFIG, +}; + +static const struct genl_multicast_group devlink_nl_mcgrps[] = { + [DEVLINK_MCGRP_CONFIG] = { .name = DEVLINK_GENL_MCGRP_CONFIG_NAME }, +}; + +static int devlink_nl_put_handle(struct sk_buff *msg, struct devlink *devlink) +{ + if (nla_put_string(msg, DEVLINK_ATTR_BUS_NAME, devlink->dev->bus->name)) + return -EMSGSIZE; + if (nla_put_string(msg, DEVLINK_ATTR_DEV_NAME, dev_name(devlink->dev))) + return -EMSGSIZE; + return 0; +} + +static int devlink_nl_fill(struct sk_buff *msg, struct devlink *devlink, + enum devlink_command cmd, u32 portid, + u32 seq, int flags) +{ + void *hdr; + + hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); + if (!hdr) + return -EMSGSIZE; + + if (devlink_nl_put_handle(msg, devlink)) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + return 0; + +nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +static void devlink_notify(struct devlink *devlink, enum devlink_command cmd) +{ + struct sk_buff *msg; + int err; + + WARN_ON(cmd != DEVLINK_CMD_NEW && cmd != DEVLINK_CMD_DEL); + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return; + + err = devlink_nl_fill(msg, devlink, cmd, 0, 0, 0); + if (err) { + nlmsg_free(msg); + return; + } + + genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink), + msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); +} + +static int devlink_nl_port_fill(struct sk_buff *msg, struct devlink *devlink, + struct devlink_port *devlink_port, + enum devlink_command cmd, u32 portid, + u32 seq, int flags) +{ + void *hdr; + + hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); + if (!hdr) + return -EMSGSIZE; + + if (devlink_nl_put_handle(msg, devlink)) + goto nla_put_failure; + if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, devlink_port->index)) + goto nla_put_failure; + if (nla_put_u16(msg, DEVLINK_ATTR_PORT_TYPE, devlink_port->type)) + goto nla_put_failure; + if (devlink_port->desired_type != DEVLINK_PORT_TYPE_NOTSET && + nla_put_u16(msg, DEVLINK_ATTR_PORT_DESIRED_TYPE, + devlink_port->desired_type)) + goto nla_put_failure; + if (devlink_port->type == DEVLINK_PORT_TYPE_ETH) { + struct net_device *netdev = devlink_port->type_dev; + + if (netdev && + (nla_put_u32(msg, DEVLINK_ATTR_PORT_NETDEV_IFINDEX, + netdev->ifindex) || + nla_put_string(msg, DEVLINK_ATTR_PORT_NETDEV_NAME, + netdev->name))) + goto nla_put_failure; + } + if (devlink_port->type == DEVLINK_PORT_TYPE_IB) { + struct ib_device *ibdev = devlink_port->type_dev; + + if (ibdev && + nla_put_string(msg, DEVLINK_ATTR_PORT_IBDEV_NAME, + ibdev->name)) + goto nla_put_failure; + } + if (devlink_port->split && + nla_put_u32(msg, DEVLINK_ATTR_PORT_SPLIT_GROUP, + devlink_port->split_group)) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + return 0; + +nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +static void devlink_port_notify(struct devlink_port *devlink_port, + enum devlink_command cmd) +{ + struct devlink *devlink = devlink_port->devlink; + struct sk_buff *msg; + int err; + + if (!devlink_port->registered) + return; + + WARN_ON(cmd != DEVLINK_CMD_PORT_NEW && cmd != DEVLINK_CMD_PORT_DEL); + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return; + + err = devlink_nl_port_fill(msg, devlink, devlink_port, cmd, 0, 0, 0); + if (err) { + nlmsg_free(msg); + return; + } + + genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink), + msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); +} + +static int devlink_nl_cmd_get_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + struct sk_buff *msg; + int err; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + err = devlink_nl_fill(msg, devlink, DEVLINK_CMD_NEW, + info->snd_portid, info->snd_seq, 0); + if (err) { + nlmsg_free(msg); + return err; + } + + return genlmsg_reply(msg, info); +} + +static int devlink_nl_cmd_get_dumpit(struct sk_buff *msg, + struct netlink_callback *cb) +{ + struct devlink *devlink; + int start = cb->args[0]; + int idx = 0; + int err; + + mutex_lock(&devlink_mutex); + list_for_each_entry(devlink, &devlink_list, list) { + if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) + continue; + if (idx < start) { + idx++; + continue; + } + err = devlink_nl_fill(msg, devlink, DEVLINK_CMD_NEW, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, NLM_F_MULTI); + if (err) + goto out; + idx++; + } +out: + mutex_unlock(&devlink_mutex); + + cb->args[0] = idx; + return msg->len; +} + +static int devlink_nl_cmd_port_get_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + struct devlink_port *devlink_port = info->user_ptr[1]; + struct sk_buff *msg; + int err; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + err = devlink_nl_port_fill(msg, devlink, devlink_port, + DEVLINK_CMD_PORT_NEW, + info->snd_portid, info->snd_seq, 0); + if (err) { + nlmsg_free(msg); + return err; + } + + return genlmsg_reply(msg, info); +} + +static int devlink_nl_cmd_port_get_dumpit(struct sk_buff *msg, + struct netlink_callback *cb) +{ + struct devlink *devlink; + struct devlink_port *devlink_port; + int start = cb->args[0]; + int idx = 0; + int err; + + mutex_lock(&devlink_mutex); + mutex_lock(&devlink_port_mutex); + list_for_each_entry(devlink, &devlink_list, list) { + if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) + continue; + list_for_each_entry(devlink_port, &devlink->port_list, list) { + if (idx < start) { + idx++; + continue; + } + err = devlink_nl_port_fill(msg, devlink, devlink_port, + DEVLINK_CMD_NEW, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NLM_F_MULTI); + if (err) + goto out; + idx++; + } + } +out: + mutex_unlock(&devlink_port_mutex); + mutex_unlock(&devlink_mutex); + + cb->args[0] = idx; + return msg->len; +} + +static int devlink_port_type_set(struct devlink *devlink, + struct devlink_port *devlink_port, + enum devlink_port_type port_type) + +{ + int err; + + if (devlink->ops && devlink->ops->port_type_set) { + if (port_type == DEVLINK_PORT_TYPE_NOTSET) + return -EINVAL; + err = devlink->ops->port_type_set(devlink_port, port_type); + if (err) + return err; + devlink_port->desired_type = port_type; + devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW); + return 0; + } + return -EOPNOTSUPP; +} + +static int devlink_nl_cmd_port_set_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + struct devlink_port *devlink_port = info->user_ptr[1]; + int err; + + if (info->attrs[DEVLINK_ATTR_PORT_TYPE]) { + enum devlink_port_type port_type; + + port_type = nla_get_u16(info->attrs[DEVLINK_ATTR_PORT_TYPE]); + err = devlink_port_type_set(devlink, devlink_port, port_type); + if (err) + return err; + } + return 0; +} + +static int devlink_port_split(struct devlink *devlink, + u32 port_index, u32 count) + +{ + if (devlink->ops && devlink->ops->port_split) + return devlink->ops->port_split(devlink, port_index, count); + return -EOPNOTSUPP; +} + +static int devlink_nl_cmd_port_split_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + u32 port_index; + u32 count; + + if (!info->attrs[DEVLINK_ATTR_PORT_INDEX] || + !info->attrs[DEVLINK_ATTR_PORT_SPLIT_COUNT]) + return -EINVAL; + + port_index = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]); + count = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_SPLIT_COUNT]); + return devlink_port_split(devlink, port_index, count); +} + +static int devlink_port_unsplit(struct devlink *devlink, u32 port_index) + +{ + if (devlink->ops && devlink->ops->port_unsplit) + return devlink->ops->port_unsplit(devlink, port_index); + return -EOPNOTSUPP; +} + +static int devlink_nl_cmd_port_unsplit_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + u32 port_index; + + if (!info->attrs[DEVLINK_ATTR_PORT_INDEX]) + return -EINVAL; + + port_index = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]); + return devlink_port_unsplit(devlink, port_index); +} + +static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { + [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING }, + [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING }, + [DEVLINK_ATTR_PORT_INDEX] = { .type = NLA_U32 }, + [DEVLINK_ATTR_PORT_TYPE] = { .type = NLA_U16 }, + [DEVLINK_ATTR_PORT_SPLIT_COUNT] = { .type = NLA_U32 }, +}; + +static const struct genl_ops devlink_nl_ops[] = { + { + .cmd = DEVLINK_CMD_GET, + .doit = devlink_nl_cmd_get_doit, + .dumpit = devlink_nl_cmd_get_dumpit, + .policy = devlink_nl_policy, + /* can be retrieved by unprivileged users */ + }, + { + .cmd = DEVLINK_CMD_PORT_GET, + .doit = devlink_nl_cmd_port_get_doit, + .dumpit = devlink_nl_cmd_port_get_dumpit, + .policy = devlink_nl_policy, + .internal_flags = DEVLINK_NL_FLAG_NEED_PORT, + /* can be retrieved by unprivileged users */ + }, + { + .cmd = DEVLINK_CMD_PORT_SET, + .doit = devlink_nl_cmd_port_set_doit, + .policy = devlink_nl_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = DEVLINK_NL_FLAG_NEED_PORT, + }, + { + .cmd = DEVLINK_CMD_PORT_SPLIT, + .doit = devlink_nl_cmd_port_split_doit, + .policy = devlink_nl_policy, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = DEVLINK_CMD_PORT_UNSPLIT, + .doit = devlink_nl_cmd_port_unsplit_doit, + .policy = devlink_nl_policy, + .flags = GENL_ADMIN_PERM, + }, +}; + +/** + * devlink_alloc - Allocate new devlink instance resources + * + * @ops: ops + * @priv_size: size of user private data + * + * Allocate new devlink instance resources, including devlink index + * and name. + */ +struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size) +{ + struct devlink *devlink; + + devlink = kzalloc(sizeof(*devlink) + priv_size, GFP_KERNEL); + if (!devlink) + return NULL; + devlink->ops = ops; + devlink_net_set(devlink, &init_net); + INIT_LIST_HEAD(&devlink->port_list); + return devlink; +} +EXPORT_SYMBOL_GPL(devlink_alloc); + +/** + * devlink_register - Register devlink instance + * + * @devlink: devlink + */ +int devlink_register(struct devlink *devlink, struct device *dev) +{ + mutex_lock(&devlink_mutex); + devlink->dev = dev; + list_add_tail(&devlink->list, &devlink_list); + devlink_notify(devlink, DEVLINK_CMD_NEW); + mutex_unlock(&devlink_mutex); + return 0; +} +EXPORT_SYMBOL_GPL(devlink_register); + +/** + * devlink_unregister - Unregister devlink instance + * + * @devlink: devlink + */ +void devlink_unregister(struct devlink *devlink) +{ + mutex_lock(&devlink_mutex); + devlink_notify(devlink, DEVLINK_CMD_DEL); + list_del(&devlink->list); + mutex_unlock(&devlink_mutex); +} +EXPORT_SYMBOL_GPL(devlink_unregister); + +/** + * devlink_free - Free devlink instance resources + * + * @devlink: devlink + */ +void devlink_free(struct devlink *devlink) +{ + kfree(devlink); +} +EXPORT_SYMBOL_GPL(devlink_free); + +/** + * devlink_port_register - Register devlink port + * + * @devlink: devlink + * @devlink_port: devlink port + * @port_index + * + * Register devlink port with provided port index. User can use + * any indexing, even hw-related one. devlink_port structure + * is convenient to be embedded inside user driver private structure. + * Note that the caller should take care of zeroing the devlink_port + * structure. + */ +int devlink_port_register(struct devlink *devlink, + struct devlink_port *devlink_port, + unsigned int port_index) +{ + mutex_lock(&devlink_port_mutex); + if (devlink_port_index_exists(devlink, port_index)) { + mutex_unlock(&devlink_port_mutex); + return -EEXIST; + } + devlink_port->devlink = devlink; + devlink_port->index = port_index; + devlink_port->type = DEVLINK_PORT_TYPE_NOTSET; + devlink_port->registered = true; + list_add_tail(&devlink_port->list, &devlink->port_list); + mutex_unlock(&devlink_port_mutex); + devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW); + return 0; +} +EXPORT_SYMBOL_GPL(devlink_port_register); + +/** + * devlink_port_unregister - Unregister devlink port + * + * @devlink_port: devlink port + */ +void devlink_port_unregister(struct devlink_port *devlink_port) +{ + devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_DEL); + mutex_lock(&devlink_port_mutex); + list_del(&devlink_port->list); + mutex_unlock(&devlink_port_mutex); +} +EXPORT_SYMBOL_GPL(devlink_port_unregister); + +static void __devlink_port_type_set(struct devlink_port *devlink_port, + enum devlink_port_type type, + void *type_dev) +{ + devlink_port->type = type; + devlink_port->type_dev = type_dev; + devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW); +} + +/** + * devlink_port_type_eth_set - Set port type to Ethernet + * + * @devlink_port: devlink port + * @netdev: related netdevice + */ +void devlink_port_type_eth_set(struct devlink_port *devlink_port, + struct net_device *netdev) +{ + return __devlink_port_type_set(devlink_port, + DEVLINK_PORT_TYPE_ETH, netdev); +} +EXPORT_SYMBOL_GPL(devlink_port_type_eth_set); + +/** + * devlink_port_type_ib_set - Set port type to InfiniBand + * + * @devlink_port: devlink port + * @ibdev: related IB device + */ +void devlink_port_type_ib_set(struct devlink_port *devlink_port, + struct ib_device *ibdev) +{ + return __devlink_port_type_set(devlink_port, + DEVLINK_PORT_TYPE_IB, ibdev); +} +EXPORT_SYMBOL_GPL(devlink_port_type_ib_set); + +/** + * devlink_port_type_clear - Clear port type + * + * @devlink_port: devlink port + */ +void devlink_port_type_clear(struct devlink_port *devlink_port) +{ + return __devlink_port_type_set(devlink_port, + DEVLINK_PORT_TYPE_NOTSET, NULL); +} +EXPORT_SYMBOL_GPL(devlink_port_type_clear); + +/** + * devlink_port_split_set - Set port is split + * + * @devlink_port: devlink port + * @split_group: split group - identifies group split port is part of + */ +void devlink_port_split_set(struct devlink_port *devlink_port, + u32 split_group) +{ + devlink_port->split = true; + devlink_port->split_group = split_group; + devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW); +} +EXPORT_SYMBOL_GPL(devlink_port_split_set); + +static int __init devlink_module_init(void) +{ + return genl_register_family_with_ops_groups(&devlink_nl_family, + devlink_nl_ops, + devlink_nl_mcgrps); +} + +static void __exit devlink_module_exit(void) +{ + genl_unregister_family(&devlink_nl_family); +} + +module_init(devlink_module_init); +module_exit(devlink_module_exit); + +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Jiri Pirko <jiri@mellanox.com>"); +MODULE_DESCRIPTION("Network physical device Netlink interface"); +MODULE_ALIAS_GENL_FAMILY(DEVLINK_GENL_NAME); diff --git a/net/core/dst.c b/net/core/dst.c index a1656e3b8..b5cbbe07f 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -265,7 +265,7 @@ again: lwtstate_put(dst->lwtstate); if (dst->flags & DST_METADATA) - kfree(dst); + metadata_dst_free((struct metadata_dst *)dst); else kmem_cache_free(dst->ops->kmem_cachep, dst); @@ -395,6 +395,14 @@ struct metadata_dst *metadata_dst_alloc(u8 optslen, gfp_t flags) } EXPORT_SYMBOL_GPL(metadata_dst_alloc); +void metadata_dst_free(struct metadata_dst *md_dst) +{ +#ifdef CONFIG_DST_CACHE + dst_cache_destroy(&md_dst->u.tun_info.dst_cache); +#endif + kfree(md_dst); +} + struct metadata_dst __percpu *metadata_dst_alloc_percpu(u8 optslen, gfp_t flags) { int cpu; diff --git a/net/core/dst_cache.c b/net/core/dst_cache.c new file mode 100644 index 000000000..554d36449 --- /dev/null +++ b/net/core/dst_cache.c @@ -0,0 +1,168 @@ +/* + * net/core/dst_cache.c - dst entry cache + * + * Copyright (c) 2016 Paolo Abeni <pabeni@redhat.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include <linux/kernel.h> +#include <linux/percpu.h> +#include <net/dst_cache.h> +#include <net/route.h> +#if IS_ENABLED(CONFIG_IPV6) +#include <net/ip6_fib.h> +#endif +#include <uapi/linux/in.h> + +struct dst_cache_pcpu { + unsigned long refresh_ts; + struct dst_entry *dst; + u32 cookie; + union { + struct in_addr in_saddr; + struct in6_addr in6_saddr; + }; +}; + +static void dst_cache_per_cpu_dst_set(struct dst_cache_pcpu *dst_cache, + struct dst_entry *dst, u32 cookie) +{ + dst_release(dst_cache->dst); + if (dst) + dst_hold(dst); + + dst_cache->cookie = cookie; + dst_cache->dst = dst; +} + +static struct dst_entry *dst_cache_per_cpu_get(struct dst_cache *dst_cache, + struct dst_cache_pcpu *idst) +{ + struct dst_entry *dst; + + dst = idst->dst; + if (!dst) + goto fail; + + /* the cache already hold a dst reference; it can't go away */ + dst_hold(dst); + + if (unlikely(!time_after(idst->refresh_ts, dst_cache->reset_ts) || + (dst->obsolete && !dst->ops->check(dst, idst->cookie)))) { + dst_cache_per_cpu_dst_set(idst, NULL, 0); + dst_release(dst); + goto fail; + } + return dst; + +fail: + idst->refresh_ts = jiffies; + return NULL; +} + +struct dst_entry *dst_cache_get(struct dst_cache *dst_cache) +{ + if (!dst_cache->cache) + return NULL; + + return dst_cache_per_cpu_get(dst_cache, this_cpu_ptr(dst_cache->cache)); +} +EXPORT_SYMBOL_GPL(dst_cache_get); + +struct rtable *dst_cache_get_ip4(struct dst_cache *dst_cache, __be32 *saddr) +{ + struct dst_cache_pcpu *idst; + struct dst_entry *dst; + + if (!dst_cache->cache) + return NULL; + + idst = this_cpu_ptr(dst_cache->cache); + dst = dst_cache_per_cpu_get(dst_cache, idst); + if (!dst) + return NULL; + + *saddr = idst->in_saddr.s_addr; + return container_of(dst, struct rtable, dst); +} +EXPORT_SYMBOL_GPL(dst_cache_get_ip4); + +void dst_cache_set_ip4(struct dst_cache *dst_cache, struct dst_entry *dst, + __be32 saddr) +{ + struct dst_cache_pcpu *idst; + + if (!dst_cache->cache) + return; + + idst = this_cpu_ptr(dst_cache->cache); + dst_cache_per_cpu_dst_set(idst, dst, 0); + idst->in_saddr.s_addr = saddr; +} +EXPORT_SYMBOL_GPL(dst_cache_set_ip4); + +#if IS_ENABLED(CONFIG_IPV6) +void dst_cache_set_ip6(struct dst_cache *dst_cache, struct dst_entry *dst, + const struct in6_addr *addr) +{ + struct dst_cache_pcpu *idst; + + if (!dst_cache->cache) + return; + + idst = this_cpu_ptr(dst_cache->cache); + dst_cache_per_cpu_dst_set(this_cpu_ptr(dst_cache->cache), dst, + rt6_get_cookie((struct rt6_info *)dst)); + idst->in6_saddr = *addr; +} +EXPORT_SYMBOL_GPL(dst_cache_set_ip6); + +struct dst_entry *dst_cache_get_ip6(struct dst_cache *dst_cache, + struct in6_addr *saddr) +{ + struct dst_cache_pcpu *idst; + struct dst_entry *dst; + + if (!dst_cache->cache) + return NULL; + + idst = this_cpu_ptr(dst_cache->cache); + dst = dst_cache_per_cpu_get(dst_cache, idst); + if (!dst) + return NULL; + + *saddr = idst->in6_saddr; + return dst; +} +EXPORT_SYMBOL_GPL(dst_cache_get_ip6); +#endif + +int dst_cache_init(struct dst_cache *dst_cache, gfp_t gfp) +{ + dst_cache->cache = alloc_percpu_gfp(struct dst_cache_pcpu, + gfp | __GFP_ZERO); + if (!dst_cache->cache) + return -ENOMEM; + + dst_cache_reset(dst_cache); + return 0; +} +EXPORT_SYMBOL_GPL(dst_cache_init); + +void dst_cache_destroy(struct dst_cache *dst_cache) +{ + int i; + + if (!dst_cache->cache) + return; + + for_each_possible_cpu(i) + dst_release(per_cpu_ptr(dst_cache->cache, i)->dst); + + free_percpu(dst_cache->cache); +} +EXPORT_SYMBOL_GPL(dst_cache_destroy); diff --git a/net/core/ethtool.c b/net/core/ethtool.c index daf04709d..f426c5ad6 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -98,6 +98,7 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] [NETIF_F_RXALL_BIT] = "rx-all", [NETIF_F_HW_L2FW_DOFFLOAD_BIT] = "l2-fwd-offload", [NETIF_F_BUSY_POLL_BIT] = "busy-poll", + [NETIF_F_HW_TC_BIT] = "hw-tc-offload", }; static const char @@ -386,43 +387,461 @@ static int __ethtool_set_flags(struct net_device *dev, u32 data) return 0; } -int __ethtool_get_settings(struct net_device *dev, struct ethtool_cmd *cmd) +static void convert_legacy_u32_to_link_mode(unsigned long *dst, u32 legacy_u32) { + bitmap_zero(dst, __ETHTOOL_LINK_MODE_MASK_NBITS); + dst[0] = legacy_u32; +} + +/* return false if src had higher bits set. lower bits always updated. */ +static bool convert_link_mode_to_legacy_u32(u32 *legacy_u32, + const unsigned long *src) +{ + bool retval = true; + + /* TODO: following test will soon always be true */ + if (__ETHTOOL_LINK_MODE_MASK_NBITS > 32) { + __ETHTOOL_DECLARE_LINK_MODE_MASK(ext); + + bitmap_zero(ext, __ETHTOOL_LINK_MODE_MASK_NBITS); + bitmap_fill(ext, 32); + bitmap_complement(ext, ext, __ETHTOOL_LINK_MODE_MASK_NBITS); + if (bitmap_intersects(ext, src, + __ETHTOOL_LINK_MODE_MASK_NBITS)) { + /* src mask goes beyond bit 31 */ + retval = false; + } + } + *legacy_u32 = src[0]; + return retval; +} + +/* return false if legacy contained non-0 deprecated fields + * transceiver/maxtxpkt/maxrxpkt. rest of ksettings always updated + */ +static bool +convert_legacy_settings_to_link_ksettings( + struct ethtool_link_ksettings *link_ksettings, + const struct ethtool_cmd *legacy_settings) +{ + bool retval = true; + + memset(link_ksettings, 0, sizeof(*link_ksettings)); + + /* This is used to tell users that driver is still using these + * deprecated legacy fields, and they should not use + * %ETHTOOL_GLINKSETTINGS/%ETHTOOL_SLINKSETTINGS + */ + if (legacy_settings->transceiver || + legacy_settings->maxtxpkt || + legacy_settings->maxrxpkt) + retval = false; + + convert_legacy_u32_to_link_mode( + link_ksettings->link_modes.supported, + legacy_settings->supported); + convert_legacy_u32_to_link_mode( + link_ksettings->link_modes.advertising, + legacy_settings->advertising); + convert_legacy_u32_to_link_mode( + link_ksettings->link_modes.lp_advertising, + legacy_settings->lp_advertising); + link_ksettings->base.speed + = ethtool_cmd_speed(legacy_settings); + link_ksettings->base.duplex + = legacy_settings->duplex; + link_ksettings->base.port + = legacy_settings->port; + link_ksettings->base.phy_address + = legacy_settings->phy_address; + link_ksettings->base.autoneg + = legacy_settings->autoneg; + link_ksettings->base.mdio_support + = legacy_settings->mdio_support; + link_ksettings->base.eth_tp_mdix + = legacy_settings->eth_tp_mdix; + link_ksettings->base.eth_tp_mdix_ctrl + = legacy_settings->eth_tp_mdix_ctrl; + return retval; +} + +/* return false if ksettings link modes had higher bits + * set. legacy_settings always updated (best effort) + */ +static bool +convert_link_ksettings_to_legacy_settings( + struct ethtool_cmd *legacy_settings, + const struct ethtool_link_ksettings *link_ksettings) +{ + bool retval = true; + + memset(legacy_settings, 0, sizeof(*legacy_settings)); + /* this also clears the deprecated fields in legacy structure: + * __u8 transceiver; + * __u32 maxtxpkt; + * __u32 maxrxpkt; + */ + + retval &= convert_link_mode_to_legacy_u32( + &legacy_settings->supported, + link_ksettings->link_modes.supported); + retval &= convert_link_mode_to_legacy_u32( + &legacy_settings->advertising, + link_ksettings->link_modes.advertising); + retval &= convert_link_mode_to_legacy_u32( + &legacy_settings->lp_advertising, + link_ksettings->link_modes.lp_advertising); + ethtool_cmd_speed_set(legacy_settings, link_ksettings->base.speed); + legacy_settings->duplex + = link_ksettings->base.duplex; + legacy_settings->port + = link_ksettings->base.port; + legacy_settings->phy_address + = link_ksettings->base.phy_address; + legacy_settings->autoneg + = link_ksettings->base.autoneg; + legacy_settings->mdio_support + = link_ksettings->base.mdio_support; + legacy_settings->eth_tp_mdix + = link_ksettings->base.eth_tp_mdix; + legacy_settings->eth_tp_mdix_ctrl + = link_ksettings->base.eth_tp_mdix_ctrl; + return retval; +} + +/* number of 32-bit words to store the user's link mode bitmaps */ +#define __ETHTOOL_LINK_MODE_MASK_NU32 \ + DIV_ROUND_UP(__ETHTOOL_LINK_MODE_MASK_NBITS, 32) + +/* layout of the struct passed from/to userland */ +struct ethtool_link_usettings { + struct ethtool_link_settings base; + struct { + __u32 supported[__ETHTOOL_LINK_MODE_MASK_NU32]; + __u32 advertising[__ETHTOOL_LINK_MODE_MASK_NU32]; + __u32 lp_advertising[__ETHTOOL_LINK_MODE_MASK_NU32]; + } link_modes; +}; + +/* Internal kernel helper to query a device ethtool_link_settings. + * + * Backward compatibility note: for compatibility with legacy drivers + * that implement only the ethtool_cmd API, this has to work with both + * drivers implementing get_link_ksettings API and drivers + * implementing get_settings API. When drivers implement get_settings + * and report ethtool_cmd deprecated fields + * (transceiver/maxrxpkt/maxtxpkt), these fields are silently ignored + * because the resulting struct ethtool_link_settings does not report them. + */ +int __ethtool_get_link_ksettings(struct net_device *dev, + struct ethtool_link_ksettings *link_ksettings) +{ + int err; + struct ethtool_cmd cmd; + ASSERT_RTNL(); + if (dev->ethtool_ops->get_link_ksettings) { + memset(link_ksettings, 0, sizeof(*link_ksettings)); + return dev->ethtool_ops->get_link_ksettings(dev, + link_ksettings); + } + + /* driver doesn't support %ethtool_link_ksettings API. revert to + * legacy %ethtool_cmd API, unless it's not supported either. + * TODO: remove when ethtool_ops::get_settings disappears internally + */ if (!dev->ethtool_ops->get_settings) return -EOPNOTSUPP; - memset(cmd, 0, sizeof(struct ethtool_cmd)); - cmd->cmd = ETHTOOL_GSET; - return dev->ethtool_ops->get_settings(dev, cmd); + memset(&cmd, 0, sizeof(cmd)); + cmd.cmd = ETHTOOL_GSET; + err = dev->ethtool_ops->get_settings(dev, &cmd); + if (err < 0) + return err; + + /* we ignore deprecated fields transceiver/maxrxpkt/maxtxpkt + */ + convert_legacy_settings_to_link_ksettings(link_ksettings, &cmd); + return err; } -EXPORT_SYMBOL(__ethtool_get_settings); +EXPORT_SYMBOL(__ethtool_get_link_ksettings); -static int ethtool_get_settings(struct net_device *dev, void __user *useraddr) +/* convert ethtool_link_usettings in user space to a kernel internal + * ethtool_link_ksettings. return 0 on success, errno on error. + */ +static int load_link_ksettings_from_user(struct ethtool_link_ksettings *to, + const void __user *from) { - int err; - struct ethtool_cmd cmd; + struct ethtool_link_usettings link_usettings; + + if (copy_from_user(&link_usettings, from, sizeof(link_usettings))) + return -EFAULT; + + memcpy(&to->base, &link_usettings.base, sizeof(to->base)); + bitmap_from_u32array(to->link_modes.supported, + __ETHTOOL_LINK_MODE_MASK_NBITS, + link_usettings.link_modes.supported, + __ETHTOOL_LINK_MODE_MASK_NU32); + bitmap_from_u32array(to->link_modes.advertising, + __ETHTOOL_LINK_MODE_MASK_NBITS, + link_usettings.link_modes.advertising, + __ETHTOOL_LINK_MODE_MASK_NU32); + bitmap_from_u32array(to->link_modes.lp_advertising, + __ETHTOOL_LINK_MODE_MASK_NBITS, + link_usettings.link_modes.lp_advertising, + __ETHTOOL_LINK_MODE_MASK_NU32); + + return 0; +} + +/* convert a kernel internal ethtool_link_ksettings to + * ethtool_link_usettings in user space. return 0 on success, errno on + * error. + */ +static int +store_link_ksettings_for_user(void __user *to, + const struct ethtool_link_ksettings *from) +{ + struct ethtool_link_usettings link_usettings; + + memcpy(&link_usettings.base, &from->base, sizeof(link_usettings)); + bitmap_to_u32array(link_usettings.link_modes.supported, + __ETHTOOL_LINK_MODE_MASK_NU32, + from->link_modes.supported, + __ETHTOOL_LINK_MODE_MASK_NBITS); + bitmap_to_u32array(link_usettings.link_modes.advertising, + __ETHTOOL_LINK_MODE_MASK_NU32, + from->link_modes.advertising, + __ETHTOOL_LINK_MODE_MASK_NBITS); + bitmap_to_u32array(link_usettings.link_modes.lp_advertising, + __ETHTOOL_LINK_MODE_MASK_NU32, + from->link_modes.lp_advertising, + __ETHTOOL_LINK_MODE_MASK_NBITS); + + if (copy_to_user(to, &link_usettings, sizeof(link_usettings))) + return -EFAULT; + + return 0; +} + +/* Query device for its ethtool_link_settings. + * + * Backward compatibility note: this function must fail when driver + * does not implement ethtool::get_link_ksettings, even if legacy + * ethtool_ops::get_settings is implemented. This tells new versions + * of ethtool that they should use the legacy API %ETHTOOL_GSET for + * this driver, so that they can correctly access the ethtool_cmd + * deprecated fields (transceiver/maxrxpkt/maxtxpkt), until no driver + * implements ethtool_ops::get_settings anymore. + */ +static int ethtool_get_link_ksettings(struct net_device *dev, + void __user *useraddr) +{ + int err = 0; + struct ethtool_link_ksettings link_ksettings; - err = __ethtool_get_settings(dev, &cmd); + ASSERT_RTNL(); + + if (!dev->ethtool_ops->get_link_ksettings) + return -EOPNOTSUPP; + + /* handle bitmap nbits handshake */ + if (copy_from_user(&link_ksettings.base, useraddr, + sizeof(link_ksettings.base))) + return -EFAULT; + + if (__ETHTOOL_LINK_MODE_MASK_NU32 + != link_ksettings.base.link_mode_masks_nwords) { + /* wrong link mode nbits requested */ + memset(&link_ksettings, 0, sizeof(link_ksettings)); + link_ksettings.base.cmd = ETHTOOL_GLINKSETTINGS; + /* send back number of words required as negative val */ + compiletime_assert(__ETHTOOL_LINK_MODE_MASK_NU32 <= S8_MAX, + "need too many bits for link modes!"); + link_ksettings.base.link_mode_masks_nwords + = -((s8)__ETHTOOL_LINK_MODE_MASK_NU32); + + /* copy the base fields back to user, not the link + * mode bitmaps + */ + if (copy_to_user(useraddr, &link_ksettings.base, + sizeof(link_ksettings.base))) + return -EFAULT; + + return 0; + } + + /* handshake successful: user/kernel agree on + * link_mode_masks_nwords + */ + + memset(&link_ksettings, 0, sizeof(link_ksettings)); + err = dev->ethtool_ops->get_link_ksettings(dev, &link_ksettings); if (err < 0) return err; + /* make sure we tell the right values to user */ + link_ksettings.base.cmd = ETHTOOL_GLINKSETTINGS; + link_ksettings.base.link_mode_masks_nwords + = __ETHTOOL_LINK_MODE_MASK_NU32; + + return store_link_ksettings_for_user(useraddr, &link_ksettings); +} + +/* Update device ethtool_link_settings. + * + * Backward compatibility note: this function must fail when driver + * does not implement ethtool::set_link_ksettings, even if legacy + * ethtool_ops::set_settings is implemented. This tells new versions + * of ethtool that they should use the legacy API %ETHTOOL_SSET for + * this driver, so that they can correctly update the ethtool_cmd + * deprecated fields (transceiver/maxrxpkt/maxtxpkt), until no driver + * implements ethtool_ops::get_settings anymore. + */ +static int ethtool_set_link_ksettings(struct net_device *dev, + void __user *useraddr) +{ + int err; + struct ethtool_link_ksettings link_ksettings; + + ASSERT_RTNL(); + + if (!dev->ethtool_ops->set_link_ksettings) + return -EOPNOTSUPP; + + /* make sure nbits field has expected value */ + if (copy_from_user(&link_ksettings.base, useraddr, + sizeof(link_ksettings.base))) + return -EFAULT; + + if (__ETHTOOL_LINK_MODE_MASK_NU32 + != link_ksettings.base.link_mode_masks_nwords) + return -EINVAL; + + /* copy the whole structure, now that we know it has expected + * format + */ + err = load_link_ksettings_from_user(&link_ksettings, useraddr); + if (err) + return err; + + /* re-check nwords field, just in case */ + if (__ETHTOOL_LINK_MODE_MASK_NU32 + != link_ksettings.base.link_mode_masks_nwords) + return -EINVAL; + + return dev->ethtool_ops->set_link_ksettings(dev, &link_ksettings); +} + +static void +warn_incomplete_ethtool_legacy_settings_conversion(const char *details) +{ + char name[sizeof(current->comm)]; + + pr_info_once("warning: `%s' uses legacy ethtool link settings API, %s\n", + get_task_comm(name, current), details); +} + +/* Query device for its ethtool_cmd settings. + * + * Backward compatibility note: for compatibility with legacy ethtool, + * this has to work with both drivers implementing get_link_ksettings + * API and drivers implementing get_settings API. When drivers + * implement get_link_ksettings and report higher link mode bits, a + * kernel warning is logged once (with name of 1st driver/device) to + * recommend user to upgrade ethtool, but the command is successful + * (only the lower link mode bits reported back to user). + */ +static int ethtool_get_settings(struct net_device *dev, void __user *useraddr) +{ + struct ethtool_cmd cmd; + + ASSERT_RTNL(); + + if (dev->ethtool_ops->get_link_ksettings) { + /* First, use link_ksettings API if it is supported */ + int err; + struct ethtool_link_ksettings link_ksettings; + + memset(&link_ksettings, 0, sizeof(link_ksettings)); + err = dev->ethtool_ops->get_link_ksettings(dev, + &link_ksettings); + if (err < 0) + return err; + if (!convert_link_ksettings_to_legacy_settings(&cmd, + &link_ksettings)) + warn_incomplete_ethtool_legacy_settings_conversion( + "link modes are only partially reported"); + + /* send a sensible cmd tag back to user */ + cmd.cmd = ETHTOOL_GSET; + } else { + /* driver doesn't support %ethtool_link_ksettings + * API. revert to legacy %ethtool_cmd API, unless it's + * not supported either. + */ + int err; + + if (!dev->ethtool_ops->get_settings) + return -EOPNOTSUPP; + + memset(&cmd, 0, sizeof(cmd)); + cmd.cmd = ETHTOOL_GSET; + err = dev->ethtool_ops->get_settings(dev, &cmd); + if (err < 0) + return err; + } + if (copy_to_user(useraddr, &cmd, sizeof(cmd))) return -EFAULT; + return 0; } +/* Update device link settings with given ethtool_cmd. + * + * Backward compatibility note: for compatibility with legacy ethtool, + * this has to work with both drivers implementing set_link_ksettings + * API and drivers implementing set_settings API. When drivers + * implement set_link_ksettings and user's request updates deprecated + * ethtool_cmd fields (transceiver/maxrxpkt/maxtxpkt), a kernel + * warning is logged once (with name of 1st driver/device) to + * recommend user to upgrade ethtool, and the request is rejected. + */ static int ethtool_set_settings(struct net_device *dev, void __user *useraddr) { struct ethtool_cmd cmd; - if (!dev->ethtool_ops->set_settings) - return -EOPNOTSUPP; + ASSERT_RTNL(); if (copy_from_user(&cmd, useraddr, sizeof(cmd))) return -EFAULT; + /* first, try new %ethtool_link_ksettings API. */ + if (dev->ethtool_ops->set_link_ksettings) { + struct ethtool_link_ksettings link_ksettings; + + if (!convert_legacy_settings_to_link_ksettings(&link_ksettings, + &cmd)) + return -EINVAL; + + link_ksettings.base.cmd = ETHTOOL_SLINKSETTINGS; + link_ksettings.base.link_mode_masks_nwords + = __ETHTOOL_LINK_MODE_MASK_NU32; + return dev->ethtool_ops->set_link_ksettings(dev, + &link_ksettings); + } + + /* legacy %ethtool_cmd API */ + + /* TODO: return -EOPNOTSUPP when ethtool_ops::get_settings + * disappears internally + */ + + if (!dev->ethtool_ops->set_settings) + return -EOPNOTSUPP; + return dev->ethtool_ops->set_settings(dev, &cmd); } @@ -632,7 +1051,7 @@ static int ethtool_copy_validate_indir(u32 *indir, void __user *useraddr, return 0; } -u8 netdev_rss_key[NETDEV_RSS_KEY_LEN]; +u8 netdev_rss_key[NETDEV_RSS_KEY_LEN] __read_mostly; void netdev_rss_key_fill(void *buffer, size_t len) { @@ -642,6 +1061,37 @@ void netdev_rss_key_fill(void *buffer, size_t len) } EXPORT_SYMBOL(netdev_rss_key_fill); +static int ethtool_get_max_rxfh_channel(struct net_device *dev, u32 *max) +{ + u32 dev_size, current_max = 0; + u32 *indir; + int ret; + + if (!dev->ethtool_ops->get_rxfh_indir_size || + !dev->ethtool_ops->get_rxfh) + return -EOPNOTSUPP; + dev_size = dev->ethtool_ops->get_rxfh_indir_size(dev); + if (dev_size == 0) + return -EOPNOTSUPP; + + indir = kcalloc(dev_size, sizeof(indir[0]), GFP_USER); + if (!indir) + return -ENOMEM; + + ret = dev->ethtool_ops->get_rxfh(dev, indir, NULL, NULL); + if (ret) + goto out; + + while (dev_size--) + current_max = max(current_max, indir[dev_size]); + + *max = current_max; + +out: + kfree(indir); + return ret; +} + static noinline_for_stack int ethtool_get_rxfh_indir(struct net_device *dev, void __user *useraddr) { @@ -738,6 +1188,14 @@ static noinline_for_stack int ethtool_set_rxfh_indir(struct net_device *dev, } ret = ops->set_rxfh(dev, indir, NULL, ETH_RSS_HASH_NO_CHANGE); + if (ret) + goto out; + + /* indicate whether rxfh was set to default */ + if (user_size == 0) + dev->priv_flags &= ~IFF_RXFH_CONFIGURED; + else + dev->priv_flags |= IFF_RXFH_CONFIGURED; out: kfree(indir); @@ -897,6 +1355,14 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev, } ret = ops->set_rxfh(dev, indir, hkey, rxfh.hfunc); + if (ret) + goto out; + + /* indicate whether rxfh was set to default */ + if (rxfh.indir_size == 0) + dev->priv_flags &= ~IFF_RXFH_CONFIGURED; + else if (rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE) + dev->priv_flags |= IFF_RXFH_CONFIGURED; out: kfree(rss_config); @@ -1227,14 +1693,31 @@ static noinline_for_stack int ethtool_get_channels(struct net_device *dev, static noinline_for_stack int ethtool_set_channels(struct net_device *dev, void __user *useraddr) { - struct ethtool_channels channels; + struct ethtool_channels channels, max; + u32 max_rx_in_use = 0; - if (!dev->ethtool_ops->set_channels) + if (!dev->ethtool_ops->set_channels || !dev->ethtool_ops->get_channels) return -EOPNOTSUPP; if (copy_from_user(&channels, useraddr, sizeof(channels))) return -EFAULT; + dev->ethtool_ops->get_channels(dev, &max); + + /* ensure new counts are within the maximums */ + if ((channels.rx_count > max.max_rx) || + (channels.tx_count > max.max_tx) || + (channels.combined_count > max.max_combined) || + (channels.other_count > max.max_other)) + return -EINVAL; + + /* ensure the new Rx count fits within the configured Rx flow + * indirection table settings */ + if (netif_is_rxfh_configured(dev) && + !ethtool_get_max_rxfh_channel(dev, &max_rx_in_use) && + (channels.combined_count + channels.rx_count) <= max_rx_in_use) + return -EINVAL; + return dev->ethtool_ops->set_channels(dev, &channels); } @@ -1823,13 +2306,121 @@ out: return ret; } +static int ethtool_get_per_queue_coalesce(struct net_device *dev, + void __user *useraddr, + struct ethtool_per_queue_op *per_queue_opt) +{ + u32 bit; + int ret; + DECLARE_BITMAP(queue_mask, MAX_NUM_QUEUE); + + if (!dev->ethtool_ops->get_per_queue_coalesce) + return -EOPNOTSUPP; + + useraddr += sizeof(*per_queue_opt); + + bitmap_from_u32array(queue_mask, + MAX_NUM_QUEUE, + per_queue_opt->queue_mask, + DIV_ROUND_UP(MAX_NUM_QUEUE, 32)); + + for_each_set_bit(bit, queue_mask, MAX_NUM_QUEUE) { + struct ethtool_coalesce coalesce = { .cmd = ETHTOOL_GCOALESCE }; + + ret = dev->ethtool_ops->get_per_queue_coalesce(dev, bit, &coalesce); + if (ret != 0) + return ret; + if (copy_to_user(useraddr, &coalesce, sizeof(coalesce))) + return -EFAULT; + useraddr += sizeof(coalesce); + } + + return 0; +} + +static int ethtool_set_per_queue_coalesce(struct net_device *dev, + void __user *useraddr, + struct ethtool_per_queue_op *per_queue_opt) +{ + u32 bit; + int i, ret = 0; + int n_queue; + struct ethtool_coalesce *backup = NULL, *tmp = NULL; + DECLARE_BITMAP(queue_mask, MAX_NUM_QUEUE); + + if ((!dev->ethtool_ops->set_per_queue_coalesce) || + (!dev->ethtool_ops->get_per_queue_coalesce)) + return -EOPNOTSUPP; + + useraddr += sizeof(*per_queue_opt); + + bitmap_from_u32array(queue_mask, + MAX_NUM_QUEUE, + per_queue_opt->queue_mask, + DIV_ROUND_UP(MAX_NUM_QUEUE, 32)); + n_queue = bitmap_weight(queue_mask, MAX_NUM_QUEUE); + tmp = backup = kmalloc_array(n_queue, sizeof(*backup), GFP_KERNEL); + if (!backup) + return -ENOMEM; + + for_each_set_bit(bit, queue_mask, MAX_NUM_QUEUE) { + struct ethtool_coalesce coalesce; + + ret = dev->ethtool_ops->get_per_queue_coalesce(dev, bit, tmp); + if (ret != 0) + goto roll_back; + + tmp++; + + if (copy_from_user(&coalesce, useraddr, sizeof(coalesce))) { + ret = -EFAULT; + goto roll_back; + } + + ret = dev->ethtool_ops->set_per_queue_coalesce(dev, bit, &coalesce); + if (ret != 0) + goto roll_back; + + useraddr += sizeof(coalesce); + } + +roll_back: + if (ret != 0) { + tmp = backup; + for_each_set_bit(i, queue_mask, bit) { + dev->ethtool_ops->set_per_queue_coalesce(dev, i, tmp); + tmp++; + } + } + kfree(backup); + + return ret; +} + +static int ethtool_set_per_queue(struct net_device *dev, void __user *useraddr) +{ + struct ethtool_per_queue_op per_queue_opt; + + if (copy_from_user(&per_queue_opt, useraddr, sizeof(per_queue_opt))) + return -EFAULT; + + switch (per_queue_opt.sub_command) { + case ETHTOOL_GCOALESCE: + return ethtool_get_per_queue_coalesce(dev, useraddr, &per_queue_opt); + case ETHTOOL_SCOALESCE: + return ethtool_set_per_queue_coalesce(dev, useraddr, &per_queue_opt); + default: + return -EOPNOTSUPP; + }; +} + /* The main entry point in this file. Called from net/core/dev_ioctl.c */ int dev_ethtool(struct net *net, struct ifreq *ifr) { struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name); void __user *useraddr = ifr->ifr_data; - u32 ethcmd; + u32 ethcmd, sub_cmd; int rc; netdev_features_t old_features; @@ -1839,8 +2430,14 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) if (copy_from_user(ðcmd, useraddr, sizeof(ethcmd))) return -EFAULT; + if (ethcmd == ETHTOOL_PERQUEUE) { + if (copy_from_user(&sub_cmd, useraddr + sizeof(ethcmd), sizeof(sub_cmd))) + return -EFAULT; + } else { + sub_cmd = ethcmd; + } /* Allow some commands to be done by anyone */ - switch (ethcmd) { + switch (sub_cmd) { case ETHTOOL_GSET: case ETHTOOL_GDRVINFO: case ETHTOOL_GMSGLVL: @@ -2070,6 +2667,15 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) case ETHTOOL_GPHYSTATS: rc = ethtool_get_phy_stats(dev, useraddr); break; + case ETHTOOL_PERQUEUE: + rc = ethtool_set_per_queue(dev, useraddr); + break; + case ETHTOOL_GLINKSETTINGS: + rc = ethtool_get_link_ksettings(dev, useraddr); + break; + case ETHTOOL_SLINKSETTINGS: + rc = ethtool_set_link_ksettings(dev, useraddr); + break; default: rc = -EOPNOTSUPP; } diff --git a/net/core/filter.c b/net/core/filter.c index fb2951c35..ca7f832b2 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -530,12 +530,14 @@ do_pass: *insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_TMP); break; - /* RET_K, RET_A are remaped into 2 insns. */ + /* RET_K is remaped into 2 insns. RET_A case doesn't need an + * extra mov as BPF_REG_0 is already mapped into BPF_REG_A. + */ case BPF_RET | BPF_A: case BPF_RET | BPF_K: - *insn++ = BPF_MOV32_RAW(BPF_RVAL(fp->code) == BPF_K ? - BPF_K : BPF_X, BPF_REG_0, - BPF_REG_A, fp->k); + if (BPF_RVAL(fp->code) == BPF_K) + *insn++ = BPF_MOV32_RAW(BPF_K, BPF_REG_0, + 0, fp->k); *insn = BPF_EXIT_INSN(); break; @@ -1180,7 +1182,7 @@ static int __reuseport_attach_prog(struct bpf_prog *prog, struct sock *sk) if (bpf_prog_size(prog->len) > sysctl_optmem_max) return -ENOMEM; - if (sk_unhashed(sk)) { + if (sk_unhashed(sk) && sk->sk_reuseport) { err = reuseport_alloc(sk); if (err) return err; @@ -1338,18 +1340,25 @@ int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk) return 0; } -#define BPF_LDST_LEN 16U +struct bpf_scratchpad { + union { + __be32 diff[MAX_BPF_STACK / sizeof(__be32)]; + u8 buff[MAX_BPF_STACK]; + }; +}; + +static DEFINE_PER_CPU(struct bpf_scratchpad, bpf_sp); static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags) { + struct bpf_scratchpad *sp = this_cpu_ptr(&bpf_sp); struct sk_buff *skb = (struct sk_buff *) (long) r1; int offset = (int) r2; void *from = (void *) (long) r3; unsigned int len = (unsigned int) r4; - char buf[BPF_LDST_LEN]; void *ptr; - if (unlikely(flags & ~(BPF_F_RECOMPUTE_CSUM))) + if (unlikely(flags & ~(BPF_F_RECOMPUTE_CSUM | BPF_F_INVALIDATE_HASH))) return -EINVAL; /* bpf verifier guarantees that: @@ -1360,14 +1369,12 @@ static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags) * * so check for invalid 'offset' and too large 'len' */ - if (unlikely((u32) offset > 0xffff || len > sizeof(buf))) + if (unlikely((u32) offset > 0xffff || len > sizeof(sp->buff))) return -EFAULT; - - if (unlikely(skb_cloned(skb) && - !skb_clone_writable(skb, offset + len))) + if (unlikely(skb_try_make_writable(skb, offset + len))) return -EFAULT; - ptr = skb_header_pointer(skb, offset, len, buf); + ptr = skb_header_pointer(skb, offset, len, sp->buff); if (unlikely(!ptr)) return -EFAULT; @@ -1376,17 +1383,19 @@ static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags) memcpy(ptr, from, len); - if (ptr == buf) + if (ptr == sp->buff) /* skb_store_bits cannot return -EFAULT here */ skb_store_bits(skb, offset, ptr, len); if (flags & BPF_F_RECOMPUTE_CSUM) skb_postpush_rcsum(skb, ptr, len); + if (flags & BPF_F_INVALIDATE_HASH) + skb_clear_hash(skb); return 0; } -const struct bpf_func_proto bpf_skb_store_bytes_proto = { +static const struct bpf_func_proto bpf_skb_store_bytes_proto = { .func = bpf_skb_store_bytes, .gpl_only = false, .ret_type = RET_INTEGER, @@ -1405,7 +1414,7 @@ static u64 bpf_skb_load_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) unsigned int len = (unsigned int) r4; void *ptr; - if (unlikely((u32) offset > 0xffff || len > BPF_LDST_LEN)) + if (unlikely((u32) offset > 0xffff || len > MAX_BPF_STACK)) return -EFAULT; ptr = skb_header_pointer(skb, offset, len, to); @@ -1417,7 +1426,7 @@ static u64 bpf_skb_load_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) return 0; } -const struct bpf_func_proto bpf_skb_load_bytes_proto = { +static const struct bpf_func_proto bpf_skb_load_bytes_proto = { .func = bpf_skb_load_bytes, .gpl_only = false, .ret_type = RET_INTEGER, @@ -1437,9 +1446,7 @@ static u64 bpf_l3_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags) return -EINVAL; if (unlikely((u32) offset > 0xffff)) return -EFAULT; - - if (unlikely(skb_cloned(skb) && - !skb_clone_writable(skb, offset + sizeof(sum)))) + if (unlikely(skb_try_make_writable(skb, offset + sizeof(sum)))) return -EFAULT; ptr = skb_header_pointer(skb, offset, sizeof(sum), &sum); @@ -1447,6 +1454,12 @@ static u64 bpf_l3_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags) return -EFAULT; switch (flags & BPF_F_HDR_FIELD_MASK) { + case 0: + if (unlikely(from != 0)) + return -EINVAL; + + csum_replace_by_diff(ptr, to); + break; case 2: csum_replace2(ptr, from, to); break; @@ -1464,7 +1477,7 @@ static u64 bpf_l3_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags) return 0; } -const struct bpf_func_proto bpf_l3_csum_replace_proto = { +static const struct bpf_func_proto bpf_l3_csum_replace_proto = { .func = bpf_l3_csum_replace, .gpl_only = false, .ret_type = RET_INTEGER, @@ -1479,23 +1492,31 @@ static u64 bpf_l4_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags) { struct sk_buff *skb = (struct sk_buff *) (long) r1; bool is_pseudo = flags & BPF_F_PSEUDO_HDR; + bool is_mmzero = flags & BPF_F_MARK_MANGLED_0; int offset = (int) r2; __sum16 sum, *ptr; - if (unlikely(flags & ~(BPF_F_PSEUDO_HDR | BPF_F_HDR_FIELD_MASK))) + if (unlikely(flags & ~(BPF_F_MARK_MANGLED_0 | BPF_F_PSEUDO_HDR | + BPF_F_HDR_FIELD_MASK))) return -EINVAL; if (unlikely((u32) offset > 0xffff)) return -EFAULT; - - if (unlikely(skb_cloned(skb) && - !skb_clone_writable(skb, offset + sizeof(sum)))) + if (unlikely(skb_try_make_writable(skb, offset + sizeof(sum)))) return -EFAULT; ptr = skb_header_pointer(skb, offset, sizeof(sum), &sum); if (unlikely(!ptr)) return -EFAULT; + if (is_mmzero && !*ptr) + return 0; switch (flags & BPF_F_HDR_FIELD_MASK) { + case 0: + if (unlikely(from != 0)) + return -EINVAL; + + inet_proto_csum_replace_by_diff(ptr, skb, to, is_pseudo); + break; case 2: inet_proto_csum_replace2(ptr, skb, from, to, is_pseudo); break; @@ -1506,6 +1527,8 @@ static u64 bpf_l4_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags) return -EINVAL; } + if (is_mmzero && !*ptr) + *ptr = CSUM_MANGLED_0; if (ptr == &sum) /* skb_store_bits guaranteed to not return -EFAULT here */ skb_store_bits(skb, offset, ptr, sizeof(sum)); @@ -1513,7 +1536,7 @@ static u64 bpf_l4_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags) return 0; } -const struct bpf_func_proto bpf_l4_csum_replace_proto = { +static const struct bpf_func_proto bpf_l4_csum_replace_proto = { .func = bpf_l4_csum_replace, .gpl_only = false, .ret_type = RET_INTEGER, @@ -1524,6 +1547,45 @@ const struct bpf_func_proto bpf_l4_csum_replace_proto = { .arg5_type = ARG_ANYTHING, }; +static u64 bpf_csum_diff(u64 r1, u64 from_size, u64 r3, u64 to_size, u64 seed) +{ + struct bpf_scratchpad *sp = this_cpu_ptr(&bpf_sp); + u64 diff_size = from_size + to_size; + __be32 *from = (__be32 *) (long) r1; + __be32 *to = (__be32 *) (long) r3; + int i, j = 0; + + /* This is quite flexible, some examples: + * + * from_size == 0, to_size > 0, seed := csum --> pushing data + * from_size > 0, to_size == 0, seed := csum --> pulling data + * from_size > 0, to_size > 0, seed := 0 --> diffing data + * + * Even for diffing, from_size and to_size don't need to be equal. + */ + if (unlikely(((from_size | to_size) & (sizeof(__be32) - 1)) || + diff_size > sizeof(sp->diff))) + return -EINVAL; + + for (i = 0; i < from_size / sizeof(__be32); i++, j++) + sp->diff[j] = ~from[i]; + for (i = 0; i < to_size / sizeof(__be32); i++, j++) + sp->diff[j] = to[i]; + + return csum_partial(sp->diff, diff_size, seed); +} + +static const struct bpf_func_proto bpf_csum_diff_proto = { + .func = bpf_csum_diff, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_STACK, + .arg2_type = ARG_CONST_STACK_SIZE_OR_ZERO, + .arg3_type = ARG_PTR_TO_STACK, + .arg4_type = ARG_CONST_STACK_SIZE_OR_ZERO, + .arg5_type = ARG_ANYTHING, +}; + static u64 bpf_clone_redirect(u64 r1, u64 ifindex, u64 flags, u64 r4, u64 r5) { struct sk_buff *skb = (struct sk_buff *) (long) r1, *skb2; @@ -1548,11 +1610,10 @@ static u64 bpf_clone_redirect(u64 r1, u64 ifindex, u64 flags, u64 r4, u64 r5) } skb2->dev = dev; - skb_sender_cpu_clear(skb2); return dev_queue_xmit(skb2); } -const struct bpf_func_proto bpf_clone_redirect_proto = { +static const struct bpf_func_proto bpf_clone_redirect_proto = { .func = bpf_clone_redirect, .gpl_only = false, .ret_type = RET_INTEGER, @@ -1601,11 +1662,10 @@ int skb_do_redirect(struct sk_buff *skb) } skb->dev = dev; - skb_sender_cpu_clear(skb); return dev_queue_xmit(skb); } -const struct bpf_func_proto bpf_redirect_proto = { +static const struct bpf_func_proto bpf_redirect_proto = { .func = bpf_redirect, .gpl_only = false, .ret_type = RET_INTEGER, @@ -1627,14 +1687,7 @@ static const struct bpf_func_proto bpf_get_cgroup_classid_proto = { static u64 bpf_get_route_realm(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) { -#ifdef CONFIG_IP_ROUTE_CLASSID - const struct dst_entry *dst; - - dst = skb_dst((struct sk_buff *) (unsigned long) r1); - if (dst) - return dst->tclassid; -#endif - return 0; + return dst_tclassid((struct sk_buff *) (unsigned long) r1); } static const struct bpf_func_proto bpf_get_route_realm_proto = { @@ -1687,6 +1740,13 @@ bool bpf_helper_changes_skb_data(void *func) return true; if (func == bpf_skb_vlan_pop) return true; + if (func == bpf_skb_store_bytes) + return true; + if (func == bpf_l3_csum_replace) + return true; + if (func == bpf_l4_csum_replace) + return true; + return false; } @@ -1708,12 +1768,16 @@ static u64 bpf_skb_get_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5) return -EPROTO; if (unlikely(size != sizeof(struct bpf_tunnel_key))) { switch (size) { + case offsetof(struct bpf_tunnel_key, tunnel_label): + case offsetof(struct bpf_tunnel_key, tunnel_ext): + goto set_compat; case offsetof(struct bpf_tunnel_key, remote_ipv6[1]): /* Fixup deprecated structure layouts here, so we have * a common path later on. */ if (ip_tunnel_info_af(info) != AF_INET) return -EINVAL; +set_compat: to = (struct bpf_tunnel_key *)compat; break; default: @@ -1725,11 +1789,13 @@ static u64 bpf_skb_get_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5) to->tunnel_tos = info->key.tos; to->tunnel_ttl = info->key.ttl; - if (flags & BPF_F_TUNINFO_IPV6) + if (flags & BPF_F_TUNINFO_IPV6) { memcpy(to->remote_ipv6, &info->key.u.ipv6.src, sizeof(to->remote_ipv6)); - else + to->tunnel_label = be32_to_cpu(info->key.label); + } else { to->remote_ipv4 = be32_to_cpu(info->key.u.ipv4.src); + } if (unlikely(size != sizeof(struct bpf_tunnel_key))) memcpy((void *)(long) r2, to, size); @@ -1737,7 +1803,7 @@ static u64 bpf_skb_get_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5) return 0; } -const struct bpf_func_proto bpf_skb_get_tunnel_key_proto = { +static const struct bpf_func_proto bpf_skb_get_tunnel_key_proto = { .func = bpf_skb_get_tunnel_key, .gpl_only = false, .ret_type = RET_INTEGER, @@ -1747,6 +1813,32 @@ const struct bpf_func_proto bpf_skb_get_tunnel_key_proto = { .arg4_type = ARG_ANYTHING, }; +static u64 bpf_skb_get_tunnel_opt(u64 r1, u64 r2, u64 size, u64 r4, u64 r5) +{ + struct sk_buff *skb = (struct sk_buff *) (long) r1; + u8 *to = (u8 *) (long) r2; + const struct ip_tunnel_info *info = skb_tunnel_info(skb); + + if (unlikely(!info || + !(info->key.tun_flags & TUNNEL_OPTIONS_PRESENT))) + return -ENOENT; + if (unlikely(size < info->options_len)) + return -ENOMEM; + + ip_tunnel_info_opts_get(to, info); + + return info->options_len; +} + +static const struct bpf_func_proto bpf_skb_get_tunnel_opt_proto = { + .func = bpf_skb_get_tunnel_opt, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_STACK, + .arg3_type = ARG_CONST_STACK_SIZE, +}; + static struct metadata_dst __percpu *md_dst; static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5) @@ -1757,10 +1849,13 @@ static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5) u8 compat[sizeof(struct bpf_tunnel_key)]; struct ip_tunnel_info *info; - if (unlikely(flags & ~(BPF_F_TUNINFO_IPV6 | BPF_F_ZERO_CSUM_TX))) + if (unlikely(flags & ~(BPF_F_TUNINFO_IPV6 | BPF_F_ZERO_CSUM_TX | + BPF_F_DONT_FRAGMENT))) return -EINVAL; if (unlikely(size != sizeof(struct bpf_tunnel_key))) { switch (size) { + case offsetof(struct bpf_tunnel_key, tunnel_label): + case offsetof(struct bpf_tunnel_key, tunnel_ext): case offsetof(struct bpf_tunnel_key, remote_ipv6[1]): /* Fixup deprecated structure layouts here, so we have * a common path later on. @@ -1773,6 +1868,9 @@ static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5) return -EINVAL; } } + if (unlikely((!(flags & BPF_F_TUNINFO_IPV6) && from->tunnel_label) || + from->tunnel_ext)) + return -EINVAL; skb_dst_drop(skb); dst_hold((struct dst_entry *) md); @@ -1781,7 +1879,10 @@ static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5) info = &md->u.tun_info; info->mode = IP_TUNNEL_INFO_TX; - info->key.tun_flags = TUNNEL_KEY | TUNNEL_CSUM; + info->key.tun_flags = TUNNEL_KEY | TUNNEL_CSUM | TUNNEL_NOCACHE; + if (flags & BPF_F_DONT_FRAGMENT) + info->key.tun_flags |= TUNNEL_DONT_FRAGMENT; + info->key.tun_id = cpu_to_be64(from->tunnel_id); info->key.tos = from->tunnel_tos; info->key.ttl = from->tunnel_ttl; @@ -1790,6 +1891,8 @@ static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5) info->mode |= IP_TUNNEL_INFO_IPV6; memcpy(&info->key.u.ipv6.dst, from->remote_ipv6, sizeof(from->remote_ipv6)); + info->key.label = cpu_to_be32(from->tunnel_label) & + IPV6_FLOWLABEL_MASK; } else { info->key.u.ipv4.dst = cpu_to_be32(from->remote_ipv4); if (flags & BPF_F_ZERO_CSUM_TX) @@ -1799,7 +1902,7 @@ static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5) return 0; } -const struct bpf_func_proto bpf_skb_set_tunnel_key_proto = { +static const struct bpf_func_proto bpf_skb_set_tunnel_key_proto = { .func = bpf_skb_set_tunnel_key, .gpl_only = false, .ret_type = RET_INTEGER, @@ -1809,17 +1912,53 @@ const struct bpf_func_proto bpf_skb_set_tunnel_key_proto = { .arg4_type = ARG_ANYTHING, }; -static const struct bpf_func_proto *bpf_get_skb_set_tunnel_key_proto(void) +static u64 bpf_skb_set_tunnel_opt(u64 r1, u64 r2, u64 size, u64 r4, u64 r5) +{ + struct sk_buff *skb = (struct sk_buff *) (long) r1; + u8 *from = (u8 *) (long) r2; + struct ip_tunnel_info *info = skb_tunnel_info(skb); + const struct metadata_dst *md = this_cpu_ptr(md_dst); + + if (unlikely(info != &md->u.tun_info || (size & (sizeof(u32) - 1)))) + return -EINVAL; + if (unlikely(size > IP_TUNNEL_OPTS_MAX)) + return -ENOMEM; + + ip_tunnel_info_opts_set(info, from, size); + + return 0; +} + +static const struct bpf_func_proto bpf_skb_set_tunnel_opt_proto = { + .func = bpf_skb_set_tunnel_opt, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_STACK, + .arg3_type = ARG_CONST_STACK_SIZE, +}; + +static const struct bpf_func_proto * +bpf_get_skb_set_tunnel_proto(enum bpf_func_id which) { if (!md_dst) { - /* race is not possible, since it's called from - * verifier that is holding verifier mutex + /* Race is not possible, since it's called from verifier + * that is holding verifier mutex. */ - md_dst = metadata_dst_alloc_percpu(0, GFP_KERNEL); + md_dst = metadata_dst_alloc_percpu(IP_TUNNEL_OPTS_MAX, + GFP_KERNEL); if (!md_dst) return NULL; } - return &bpf_skb_set_tunnel_key_proto; + + switch (which) { + case BPF_FUNC_skb_set_tunnel_key: + return &bpf_skb_set_tunnel_key_proto; + case BPF_FUNC_skb_set_tunnel_opt: + return &bpf_skb_set_tunnel_opt_proto; + default: + return NULL; + } } static const struct bpf_func_proto * @@ -1856,6 +1995,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id) return &bpf_skb_store_bytes_proto; case BPF_FUNC_skb_load_bytes: return &bpf_skb_load_bytes_proto; + case BPF_FUNC_csum_diff: + return &bpf_csum_diff_proto; case BPF_FUNC_l3_csum_replace: return &bpf_l3_csum_replace_proto; case BPF_FUNC_l4_csum_replace: @@ -1871,7 +2012,11 @@ tc_cls_act_func_proto(enum bpf_func_id func_id) case BPF_FUNC_skb_get_tunnel_key: return &bpf_skb_get_tunnel_key_proto; case BPF_FUNC_skb_set_tunnel_key: - return bpf_get_skb_set_tunnel_key_proto(); + return bpf_get_skb_set_tunnel_proto(func_id); + case BPF_FUNC_skb_get_tunnel_opt: + return &bpf_skb_get_tunnel_opt_proto; + case BPF_FUNC_skb_set_tunnel_opt: + return bpf_get_skb_set_tunnel_proto(func_id); case BPF_FUNC_redirect: return &bpf_redirect_proto; case BPF_FUNC_get_route_realm: @@ -1920,16 +2065,14 @@ static bool sk_filter_is_valid_access(int off, int size, static bool tc_cls_act_is_valid_access(int off, int size, enum bpf_access_type type) { - if (off == offsetof(struct __sk_buff, tc_classid)) - return type == BPF_WRITE ? true : false; - if (type == BPF_WRITE) { switch (off) { case offsetof(struct __sk_buff, mark): case offsetof(struct __sk_buff, tc_index): case offsetof(struct __sk_buff, priority): case offsetof(struct __sk_buff, cb[0]) ... - offsetof(struct __sk_buff, cb[4]): + offsetof(struct __sk_buff, cb[4]): + case offsetof(struct __sk_buff, tc_classid): break; default: return false; @@ -2046,8 +2189,10 @@ static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg, ctx_off -= offsetof(struct __sk_buff, tc_classid); ctx_off += offsetof(struct sk_buff, cb); ctx_off += offsetof(struct qdisc_skb_cb, tc_classid); - WARN_ON(type != BPF_WRITE); - *insn++ = BPF_STX_MEM(BPF_H, dst_reg, src_reg, ctx_off); + if (type == BPF_WRITE) + *insn++ = BPF_STX_MEM(BPF_H, dst_reg, src_reg, ctx_off); + else + *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, ctx_off); break; case offsetof(struct __sk_buff, tc_index): diff --git a/net/core/flow.c b/net/core/flow.c index 1033725be..3937b1b68 100644 --- a/net/core/flow.c +++ b/net/core/flow.c @@ -92,8 +92,11 @@ static void flow_cache_gc_task(struct work_struct *work) list_splice_tail_init(&xfrm->flow_cache_gc_list, &gc_list); spin_unlock_bh(&xfrm->flow_cache_gc_lock); - list_for_each_entry_safe(fce, n, &gc_list, u.gc_list) + list_for_each_entry_safe(fce, n, &gc_list, u.gc_list) { flow_entry_kill(fce, xfrm); + atomic_dec(&xfrm->flow_cache_gc_count); + WARN_ON(atomic_read(&xfrm->flow_cache_gc_count) < 0); + } } static void flow_cache_queue_garbage(struct flow_cache_percpu *fcp, @@ -101,6 +104,7 @@ static void flow_cache_queue_garbage(struct flow_cache_percpu *fcp, struct netns_xfrm *xfrm) { if (deleted) { + atomic_add(deleted, &xfrm->flow_cache_gc_count); fcp->hash_count -= deleted; spin_lock_bh(&xfrm->flow_cache_gc_lock); list_splice_tail(gc_list, &xfrm->flow_cache_gc_list); @@ -232,6 +236,13 @@ flow_cache_lookup(struct net *net, const struct flowi *key, u16 family, u8 dir, if (fcp->hash_count > fc->high_watermark) flow_cache_shrink(fc, fcp); + if (fcp->hash_count > 2 * fc->high_watermark || + atomic_read(&net->xfrm.flow_cache_gc_count) > fc->high_watermark) { + atomic_inc(&net->xfrm.flow_cache_genid); + flo = ERR_PTR(-ENOBUFS); + goto ret_object; + } + fle = kmem_cache_alloc(flow_cachep, GFP_ATOMIC); if (fle) { fle->net = net; @@ -446,6 +457,7 @@ int flow_cache_init(struct net *net) INIT_WORK(&net->xfrm.flow_cache_gc_work, flow_cache_gc_task); INIT_WORK(&net->xfrm.flow_cache_flush_work, flow_cache_flush_task); mutex_init(&net->xfrm.flow_flush_sem); + atomic_set(&net->xfrm.flow_cache_gc_count, 0); fc->hash_shift = 10; fc->low_watermark = 2 * flow_cache_hash_size(fc); diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 12e700332..a669dea14 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -19,25 +19,12 @@ #include <net/flow_dissector.h> #include <scsi/fc/fc_fcoe.h> -static bool dissector_uses_key(const struct flow_dissector *flow_dissector, - enum flow_dissector_key_id key_id) -{ - return flow_dissector->used_keys & (1 << key_id); -} - static void dissector_set_key(struct flow_dissector *flow_dissector, enum flow_dissector_key_id key_id) { flow_dissector->used_keys |= (1 << key_id); } -static void *skb_flow_dissector_target(struct flow_dissector *flow_dissector, - enum flow_dissector_key_id key_id, - void *target_container) -{ - return ((char *) target_container) + flow_dissector->offset[key_id]; -} - void skb_flow_dissector_init(struct flow_dissector *flow_dissector, const struct flow_dissector_key *key, unsigned int key_count) @@ -178,15 +165,16 @@ ip: ip_proto = iph->protocol; - if (!dissector_uses_key(flow_dissector, - FLOW_DISSECTOR_KEY_IPV4_ADDRS)) - break; + if (dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_IPV4_ADDRS)) { + key_addrs = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_IPV4_ADDRS, + target_container); - key_addrs = skb_flow_dissector_target(flow_dissector, - FLOW_DISSECTOR_KEY_IPV4_ADDRS, target_container); - memcpy(&key_addrs->v4addrs, &iph->saddr, - sizeof(key_addrs->v4addrs)); - key_control->addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; + memcpy(&key_addrs->v4addrs, &iph->saddr, + sizeof(key_addrs->v4addrs)); + key_control->addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; + } if (ip_is_fragment(iph)) { key_control->flags |= FLOW_DIS_IS_FRAGMENT; @@ -219,13 +207,12 @@ ipv6: if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IPV6_ADDRS)) { - struct flow_dissector_key_ipv6_addrs *key_ipv6_addrs; - - key_ipv6_addrs = skb_flow_dissector_target(flow_dissector, - FLOW_DISSECTOR_KEY_IPV6_ADDRS, - target_container); + key_addrs = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_IPV6_ADDRS, + target_container); - memcpy(key_ipv6_addrs, &iph->saddr, sizeof(*key_ipv6_addrs)); + memcpy(&key_addrs->v6addrs, &iph->saddr, + sizeof(key_addrs->v6addrs)); key_control->addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; } @@ -339,8 +326,11 @@ mpls: } case htons(ETH_P_FCOE): - key_control->thoff = (u16)(nhoff + FCOE_HEADER_LEN); - /* fall through */ + if ((hlen - nhoff) < FCOE_HEADER_LEN) + goto out_bad; + + nhoff += FCOE_HEADER_LEN; + goto out_good; default: goto out_bad; } @@ -447,13 +437,12 @@ ip_proto_again: key_control->flags |= FLOW_DIS_IS_FRAGMENT; nhoff += sizeof(_fh); + ip_proto = fh->nexthdr; if (!(fh->frag_off & htons(IP6_OFFSET))) { key_control->flags |= FLOW_DIS_FIRST_FRAG; - if (flags & FLOW_DISSECTOR_F_PARSE_1ST_FRAG) { - ip_proto = fh->nexthdr; + if (flags & FLOW_DISSECTOR_F_PARSE_1ST_FRAG) goto ip_proto_again; - } } goto out_good; } @@ -740,6 +729,11 @@ u32 __skb_get_poff(const struct sk_buff *skb, void *data, { u32 poff = keys->control.thoff; + /* skip L4 headers for fragments after the first */ + if ((keys->control.flags & FLOW_DIS_IS_FRAGMENT) && + !(keys->control.flags & FLOW_DIS_FIRST_FRAG)) + return poff; + switch (keys->basic.ip_proto) { case IPPROTO_TCP: { /* access doff as u8 to avoid unaligned access */ diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c index 92d886f4a..4573d8109 100644 --- a/net/core/gen_estimator.c +++ b/net/core/gen_estimator.c @@ -191,6 +191,7 @@ struct gen_estimator *gen_find_node(const struct gnet_stats_basic_packed *bstats /** * gen_new_estimator - create a new rate estimator * @bstats: basic statistics + * @cpu_bstats: bstats per cpu * @rate_est: rate estimator statistics * @stats_lock: statistics lock * @opt: rate estimator configuration TLV @@ -287,6 +288,7 @@ EXPORT_SYMBOL(gen_kill_estimator); /** * gen_replace_estimator - replace rate estimator configuration * @bstats: basic statistics + * @cpu_bstats: bstats per cpu * @rate_est: rate estimator statistics * @stats_lock: statistics lock * @opt: rate estimator configuration TLV diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c index 1e2f46a69..e640462ea 100644 --- a/net/core/gen_stats.c +++ b/net/core/gen_stats.c @@ -140,6 +140,7 @@ EXPORT_SYMBOL(__gnet_stats_copy_basic); /** * gnet_stats_copy_basic - copy basic statistics into statistic TLV * @d: dumping handle + * @cpu: copy statistic per cpu * @b: basic statistics * * Appends the basic statistics to the top level TLV created by diff --git a/net/core/hwbm.c b/net/core/hwbm.c new file mode 100644 index 000000000..941c28486 --- /dev/null +++ b/net/core/hwbm.c @@ -0,0 +1,87 @@ +/* Support for hardware buffer manager. + * + * Copyright (C) 2016 Marvell + * + * Gregory CLEMENT <gregory.clement@free-electrons.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ +#include <linux/kernel.h> +#include <linux/printk.h> +#include <linux/skbuff.h> +#include <net/hwbm.h> + +void hwbm_buf_free(struct hwbm_pool *bm_pool, void *buf) +{ + if (likely(bm_pool->frag_size <= PAGE_SIZE)) + skb_free_frag(buf); + else + kfree(buf); +} +EXPORT_SYMBOL_GPL(hwbm_buf_free); + +/* Refill processing for HW buffer management */ +int hwbm_pool_refill(struct hwbm_pool *bm_pool, gfp_t gfp) +{ + int frag_size = bm_pool->frag_size; + void *buf; + + if (likely(frag_size <= PAGE_SIZE)) + buf = netdev_alloc_frag(frag_size); + else + buf = kmalloc(frag_size, gfp); + + if (!buf) + return -ENOMEM; + + if (bm_pool->construct) + if (bm_pool->construct(bm_pool, buf)) { + hwbm_buf_free(bm_pool, buf); + return -ENOMEM; + } + + return 0; +} +EXPORT_SYMBOL_GPL(hwbm_pool_refill); + +int hwbm_pool_add(struct hwbm_pool *bm_pool, unsigned int buf_num, gfp_t gfp) +{ + int err, i; + unsigned long flags; + + spin_lock_irqsave(&bm_pool->lock, flags); + if (bm_pool->buf_num == bm_pool->size) { + pr_warn("pool already filled\n"); + return bm_pool->buf_num; + } + + if (buf_num + bm_pool->buf_num > bm_pool->size) { + pr_warn("cannot allocate %d buffers for pool\n", + buf_num); + return 0; + } + + if ((buf_num + bm_pool->buf_num) < bm_pool->buf_num) { + pr_warn("Adding %d buffers to the %d current buffers will overflow\n", + buf_num, bm_pool->buf_num); + return 0; + } + + for (i = 0; i < buf_num; i++) { + err = hwbm_pool_refill(bm_pool, gfp); + if (err < 0) + break; + } + + /* Update BM driver with number of buffers added to pool */ + bm_pool->buf_num += i; + + pr_debug("hwpm pool: %d of %d buffers added\n", i, buf_num); + spin_unlock_irqrestore(&bm_pool->lock, flags); + + return i; +} +EXPORT_SYMBOL_GPL(hwbm_pool_add); diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c index 299cfc24d..669ecc9f8 100644 --- a/net/core/lwtunnel.c +++ b/net/core/lwtunnel.c @@ -27,6 +27,31 @@ #include <net/rtnetlink.h> #include <net/ip6_fib.h> +#ifdef CONFIG_MODULES + +static const char *lwtunnel_encap_str(enum lwtunnel_encap_types encap_type) +{ + /* Only lwt encaps implemented without using an interface for + * the encap need to return a string here. + */ + switch (encap_type) { + case LWTUNNEL_ENCAP_MPLS: + return "MPLS"; + case LWTUNNEL_ENCAP_ILA: + return "ILA"; + case LWTUNNEL_ENCAP_IP6: + case LWTUNNEL_ENCAP_IP: + case LWTUNNEL_ENCAP_NONE: + case __LWTUNNEL_ENCAP_MAX: + /* should not have got here */ + WARN_ON(1); + break; + } + return NULL; +} + +#endif /* CONFIG_MODULES */ + struct lwtunnel_state *lwtunnel_state_alloc(int encap_len) { struct lwtunnel_state *lws; @@ -85,6 +110,18 @@ int lwtunnel_build_state(struct net_device *dev, u16 encap_type, ret = -EOPNOTSUPP; rcu_read_lock(); ops = rcu_dereference(lwtun_encaps[encap_type]); +#ifdef CONFIG_MODULES + if (!ops) { + const char *encap_type_str = lwtunnel_encap_str(encap_type); + + if (encap_type_str) { + rcu_read_unlock(); + request_module("rtnl-lwt-%s", encap_type_str); + rcu_read_lock(); + ops = rcu_dereference(lwtun_encaps[encap_type]); + } + } +#endif if (likely(ops && ops->build_state)) ret = ops->build_state(dev, encap, family, cfg, lws); rcu_read_unlock(); diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index b6c8a6629..2b3f76fe6 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -29,7 +29,6 @@ #ifdef CONFIG_SYSFS static const char fmt_hex[] = "%#x\n"; -static const char fmt_long_hex[] = "%#lx\n"; static const char fmt_dec[] = "%d\n"; static const char fmt_ulong[] = "%lu\n"; static const char fmt_u64[] = "%llu\n"; @@ -199,9 +198,10 @@ static ssize_t speed_show(struct device *dev, return restart_syscall(); if (netif_running(netdev)) { - struct ethtool_cmd cmd; - if (!__ethtool_get_settings(netdev, &cmd)) - ret = sprintf(buf, fmt_dec, ethtool_cmd_speed(&cmd)); + struct ethtool_link_ksettings cmd; + + if (!__ethtool_get_link_ksettings(netdev, &cmd)) + ret = sprintf(buf, fmt_dec, cmd.base.speed); } rtnl_unlock(); return ret; @@ -218,10 +218,12 @@ static ssize_t duplex_show(struct device *dev, return restart_syscall(); if (netif_running(netdev)) { - struct ethtool_cmd cmd; - if (!__ethtool_get_settings(netdev, &cmd)) { + struct ethtool_link_ksettings cmd; + + if (!__ethtool_get_link_ksettings(netdev, &cmd)) { const char *duplex; - switch (cmd.duplex) { + + switch (cmd.base.duplex) { case DUPLEX_HALF: duplex = "half"; break; @@ -574,6 +576,7 @@ NETSTAT_ENTRY(tx_heartbeat_errors); NETSTAT_ENTRY(tx_window_errors); NETSTAT_ENTRY(rx_compressed); NETSTAT_ENTRY(tx_compressed); +NETSTAT_ENTRY(rx_nohandler); static struct attribute *netstat_attrs[] = { &dev_attr_rx_packets.attr, @@ -599,6 +602,7 @@ static struct attribute *netstat_attrs[] = { &dev_attr_tx_window_errors.attr, &dev_attr_rx_compressed.attr, &dev_attr_tx_compressed.attr, + &dev_attr_rx_nohandler.attr, NULL }; diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c index 0260c84ed..11fce1727 100644 --- a/net/core/netclassid_cgroup.c +++ b/net/core/netclassid_cgroup.c @@ -9,7 +9,6 @@ * Authors: Thomas Graf <tgraf@suug.ch> */ -#include <linux/module.h> #include <linux/slab.h> #include <linux/cgroup.h> #include <linux/fdtable.h> diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c index f1efbc39e..2ec86fc55 100644 --- a/net/core/netprio_cgroup.c +++ b/net/core/netprio_cgroup.c @@ -11,7 +11,6 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include <linux/module.h> #include <linux/slab.h> #include <linux/types.h> #include <linux/string.h> diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 1474cfd2d..20999aa59 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -2856,7 +2856,7 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev, *vlan_encapsulated_proto = htons(ETH_P_IP); } - skb_set_mac_header(skb, 0); + skb_reset_mac_header(skb); skb_set_network_header(skb, skb->len); iph = (struct iphdr *) skb_put(skb, sizeof(struct iphdr)); @@ -2983,7 +2983,7 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev, *vlan_encapsulated_proto = htons(ETH_P_IPV6); } - skb_set_mac_header(skb, 0); + skb_reset_mac_header(skb); skb_set_network_header(skb, skb->len); iph = (struct ipv6hdr *) skb_put(skb, sizeof(struct ipv6hdr)); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 215e6137f..65763c29f 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -804,6 +804,8 @@ static void copy_rtnl_link_stats(struct rtnl_link_stats *a, a->rx_compressed = b->rx_compressed; a->tx_compressed = b->tx_compressed; + + a->rx_nohandler = b->rx_nohandler; } static void copy_rtnl_link_stats64(void *v, const struct rtnl_link_stats64 *b) @@ -893,6 +895,8 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev, + nla_total_size(4) /* IFLA_PROMISCUITY */ + nla_total_size(4) /* IFLA_NUM_TX_QUEUES */ + nla_total_size(4) /* IFLA_NUM_RX_QUEUES */ + + nla_total_size(4) /* IFLA_MAX_GSO_SEGS */ + + nla_total_size(4) /* IFLA_MAX_GSO_SIZE */ + nla_total_size(1) /* IFLA_OPERSTATE */ + nla_total_size(1) /* IFLA_LINKMODE */ + nla_total_size(4) /* IFLA_CARRIER_CHANGES */ @@ -1176,14 +1180,16 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb, static int rtnl_fill_link_ifmap(struct sk_buff *skb, struct net_device *dev) { - struct rtnl_link_ifmap map = { - .mem_start = dev->mem_start, - .mem_end = dev->mem_end, - .base_addr = dev->base_addr, - .irq = dev->irq, - .dma = dev->dma, - .port = dev->if_port, - }; + struct rtnl_link_ifmap map; + + memset(&map, 0, sizeof(map)); + map.mem_start = dev->mem_start; + map.mem_end = dev->mem_end; + map.base_addr = dev->base_addr; + map.irq = dev->irq; + map.dma = dev->dma; + map.port = dev->if_port; + if (nla_put(skb, IFLA_MAP, sizeof(map), &map)) return -EMSGSIZE; @@ -1222,6 +1228,8 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, nla_put_u32(skb, IFLA_GROUP, dev->group) || nla_put_u32(skb, IFLA_PROMISCUITY, dev->promiscuity) || nla_put_u32(skb, IFLA_NUM_TX_QUEUES, dev->num_tx_queues) || + nla_put_u32(skb, IFLA_GSO_MAX_SEGS, dev->gso_max_segs) || + nla_put_u32(skb, IFLA_GSO_MAX_SIZE, dev->gso_max_size) || #ifdef CONFIG_RPS nla_put_u32(skb, IFLA_NUM_RX_QUEUES, dev->num_rx_queues) || #endif @@ -1388,15 +1396,8 @@ static const struct nla_policy ifla_vf_policy[IFLA_VF_MAX+1] = { [IFLA_VF_RSS_QUERY_EN] = { .len = sizeof(struct ifla_vf_rss_query_en) }, [IFLA_VF_STATS] = { .type = NLA_NESTED }, [IFLA_VF_TRUST] = { .len = sizeof(struct ifla_vf_trust) }, -}; - -static const struct nla_policy ifla_vf_stats_policy[IFLA_VF_STATS_MAX + 1] = { - [IFLA_VF_STATS_RX_PACKETS] = { .type = NLA_U64 }, - [IFLA_VF_STATS_TX_PACKETS] = { .type = NLA_U64 }, - [IFLA_VF_STATS_RX_BYTES] = { .type = NLA_U64 }, - [IFLA_VF_STATS_TX_BYTES] = { .type = NLA_U64 }, - [IFLA_VF_STATS_BROADCAST] = { .type = NLA_U64 }, - [IFLA_VF_STATS_MULTICAST] = { .type = NLA_U64 }, + [IFLA_VF_IB_NODE_GUID] = { .len = sizeof(struct ifla_vf_guid) }, + [IFLA_VF_IB_PORT_GUID] = { .len = sizeof(struct ifla_vf_guid) }, }; static const struct nla_policy ifla_port_policy[IFLA_PORT_MAX+1] = { @@ -1413,6 +1414,58 @@ static const struct nla_policy ifla_port_policy[IFLA_PORT_MAX+1] = { [IFLA_PORT_RESPONSE] = { .type = NLA_U16, }, }; +static const struct rtnl_link_ops *linkinfo_to_kind_ops(const struct nlattr *nla) +{ + const struct rtnl_link_ops *ops = NULL; + struct nlattr *linfo[IFLA_INFO_MAX + 1]; + + if (nla_parse_nested(linfo, IFLA_INFO_MAX, nla, ifla_info_policy) < 0) + return NULL; + + if (linfo[IFLA_INFO_KIND]) { + char kind[MODULE_NAME_LEN]; + + nla_strlcpy(kind, linfo[IFLA_INFO_KIND], sizeof(kind)); + ops = rtnl_link_ops_get(kind); + } + + return ops; +} + +static bool link_master_filtered(struct net_device *dev, int master_idx) +{ + struct net_device *master; + + if (!master_idx) + return false; + + master = netdev_master_upper_dev_get(dev); + if (!master || master->ifindex != master_idx) + return true; + + return false; +} + +static bool link_kind_filtered(const struct net_device *dev, + const struct rtnl_link_ops *kind_ops) +{ + if (kind_ops && dev->rtnl_link_ops != kind_ops) + return true; + + return false; +} + +static bool link_dump_filtered(struct net_device *dev, + int master_idx, + const struct rtnl_link_ops *kind_ops) +{ + if (link_master_filtered(dev, master_idx) || + link_kind_filtered(dev, kind_ops)) + return true; + + return false; +} + static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); @@ -1422,6 +1475,9 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) struct hlist_head *head; struct nlattr *tb[IFLA_MAX+1]; u32 ext_filter_mask = 0; + const struct rtnl_link_ops *kind_ops = NULL; + unsigned int flags = NLM_F_MULTI; + int master_idx = 0; int err; int hdrlen; @@ -1444,18 +1500,29 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) if (tb[IFLA_EXT_MASK]) ext_filter_mask = nla_get_u32(tb[IFLA_EXT_MASK]); + + if (tb[IFLA_MASTER]) + master_idx = nla_get_u32(tb[IFLA_MASTER]); + + if (tb[IFLA_LINKINFO]) + kind_ops = linkinfo_to_kind_ops(tb[IFLA_LINKINFO]); + + if (master_idx || kind_ops) + flags |= NLM_F_DUMP_FILTERED; } for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { idx = 0; head = &net->dev_index_head[h]; hlist_for_each_entry(dev, head, index_hlist) { + if (link_dump_filtered(dev, master_idx, kind_ops)) + continue; if (idx < s_idx) goto cont; err = rtnl_fill_ifinfo(skb, dev, RTM_NEWLINK, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, 0, - NLM_F_MULTI, + flags, ext_filter_mask); /* If we ran out of room on the first message, * we're in trouble @@ -1535,6 +1602,22 @@ static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[]) return 0; } +static int handle_infiniband_guid(struct net_device *dev, struct ifla_vf_guid *ivt, + int guid_type) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + return ops->ndo_set_vf_guid(dev, ivt->vf, ivt->guid, guid_type); +} + +static int handle_vf_guid(struct net_device *dev, struct ifla_vf_guid *ivt, int guid_type) +{ + if (dev->type != ARPHRD_INFINIBAND) + return -EOPNOTSUPP; + + return handle_infiniband_guid(dev, ivt, guid_type); +} + static int do_setvfinfo(struct net_device *dev, struct nlattr **tb) { const struct net_device_ops *ops = dev->netdev_ops; @@ -1637,6 +1720,24 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb) return err; } + if (tb[IFLA_VF_IB_NODE_GUID]) { + struct ifla_vf_guid *ivt = nla_data(tb[IFLA_VF_IB_NODE_GUID]); + + if (!ops->ndo_set_vf_guid) + return -EOPNOTSUPP; + + return handle_vf_guid(dev, ivt, IFLA_VF_IB_NODE_GUID); + } + + if (tb[IFLA_VF_IB_PORT_GUID]) { + struct ifla_vf_guid *ivt = nla_data(tb[IFLA_VF_IB_PORT_GUID]); + + if (!ops->ndo_set_vf_guid) + return -EOPNOTSUPP; + + return handle_vf_guid(dev, ivt, IFLA_VF_IB_PORT_GUID); + } + return err; } diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 8616d1147..e561f9f07 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -349,8 +349,16 @@ struct sk_buff *build_skb(void *data, unsigned int frag_size) } EXPORT_SYMBOL(build_skb); +#define NAPI_SKB_CACHE_SIZE 64 + +struct napi_alloc_cache { + struct page_frag_cache page; + size_t skb_count; + void *skb_cache[NAPI_SKB_CACHE_SIZE]; +}; + static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache); -static DEFINE_PER_CPU(struct page_frag_cache, napi_alloc_cache); +static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache); static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask) { @@ -380,9 +388,9 @@ EXPORT_SYMBOL(netdev_alloc_frag); static void *__napi_alloc_frag(unsigned int fragsz, gfp_t gfp_mask) { - struct page_frag_cache *nc = this_cpu_ptr(&napi_alloc_cache); + struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); - return __alloc_page_frag(nc, fragsz, gfp_mask); + return __alloc_page_frag(&nc->page, fragsz, gfp_mask); } void *napi_alloc_frag(unsigned int fragsz) @@ -476,7 +484,7 @@ EXPORT_SYMBOL(__netdev_alloc_skb); struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len, gfp_t gfp_mask) { - struct page_frag_cache *nc = this_cpu_ptr(&napi_alloc_cache); + struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); struct sk_buff *skb; void *data; @@ -496,7 +504,7 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len, if (sk_memalloc_socks()) gfp_mask |= __GFP_MEMALLOC; - data = __alloc_page_frag(nc, len, gfp_mask); + data = __alloc_page_frag(&nc->page, len, gfp_mask); if (unlikely(!data)) return NULL; @@ -507,7 +515,7 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len, } /* use OR instead of assignment to avoid clearing of bits in mask */ - if (nc->pfmemalloc) + if (nc->page.pfmemalloc) skb->pfmemalloc = 1; skb->head_frag = 1; @@ -749,6 +757,73 @@ void consume_skb(struct sk_buff *skb) } EXPORT_SYMBOL(consume_skb); +void __kfree_skb_flush(void) +{ + struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); + + /* flush skb_cache if containing objects */ + if (nc->skb_count) { + kmem_cache_free_bulk(skbuff_head_cache, nc->skb_count, + nc->skb_cache); + nc->skb_count = 0; + } +} + +static inline void _kfree_skb_defer(struct sk_buff *skb) +{ + struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); + + /* drop skb->head and call any destructors for packet */ + skb_release_all(skb); + + /* record skb to CPU local list */ + nc->skb_cache[nc->skb_count++] = skb; + +#ifdef CONFIG_SLUB + /* SLUB writes into objects when freeing */ + prefetchw(skb); +#endif + + /* flush skb_cache if it is filled */ + if (unlikely(nc->skb_count == NAPI_SKB_CACHE_SIZE)) { + kmem_cache_free_bulk(skbuff_head_cache, NAPI_SKB_CACHE_SIZE, + nc->skb_cache); + nc->skb_count = 0; + } +} +void __kfree_skb_defer(struct sk_buff *skb) +{ + _kfree_skb_defer(skb); +} + +void napi_consume_skb(struct sk_buff *skb, int budget) +{ + if (unlikely(!skb)) + return; + + /* Zero budget indicate non-NAPI context called us, like netpoll */ + if (unlikely(!budget)) { + dev_consume_skb_any(skb); + return; + } + + if (likely(atomic_read(&skb->users) == 1)) + smp_rmb(); + else if (likely(!atomic_dec_and_test(&skb->users))) + return; + /* if reaching here SKB is ready to free */ + trace_consume_skb(skb); + + /* if SKB is a clone, don't handle this case */ + if (skb->fclone != SKB_FCLONE_UNAVAILABLE) { + __kfree_skb(skb); + return; + } + + _kfree_skb_defer(skb); +} +EXPORT_SYMBOL(napi_consume_skb); + /* Make sure a field is enclosed inside headers_start/headers_end section */ #define CHECK_SKB_FIELD(field) \ BUILD_BUG_ON(offsetof(struct sk_buff, field) < \ @@ -1843,6 +1918,7 @@ static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe, struct splice_pipe_desc *spd, struct sock *sk) { int seg; + struct sk_buff *iter; /* map the linear part : * If skb->head_frag is set, this 'linear' part is backed by a @@ -1869,6 +1945,19 @@ static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe, return true; } + skb_walk_frags(skb, iter) { + if (*offset >= iter->len) { + *offset -= iter->len; + continue; + } + /* __skb_splice_bits() only fails if the output has no room + * left, so no point in going over the frag_list for the error + * case. + */ + if (__skb_splice_bits(iter, pipe, offset, len, spd, sk)) + return true; + } + return false; } @@ -1895,9 +1984,7 @@ ssize_t skb_socket_splice(struct sock *sk, /* * Map data from the skb to a pipe. Should handle both the linear part, - * the fragments, and the frag list. It does NOT handle frag lists within - * the frag list, if such a thing exists. We'd probably need to recurse to - * handle that cleanly. + * the fragments, and the frag list. */ int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset, struct pipe_inode_info *pipe, unsigned int tlen, @@ -1916,29 +2003,10 @@ int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset, .ops = &nosteal_pipe_buf_ops, .spd_release = sock_spd_release, }; - struct sk_buff *frag_iter; int ret = 0; - /* - * __skb_splice_bits() only fails if the output has no room left, - * so no point in going over the frag_list for the error case. - */ - if (__skb_splice_bits(skb, pipe, &offset, &tlen, &spd, sk)) - goto done; - else if (!tlen) - goto done; - - /* - * now see if we have a frag_list to map - */ - skb_walk_frags(skb, frag_iter) { - if (!tlen) - break; - if (__skb_splice_bits(frag_iter, pipe, &offset, &tlen, &spd, sk)) - break; - } + __skb_splice_bits(skb, pipe, &offset, &tlen, &spd, sk); -done: if (spd.nr_pages) ret = splice_cb(sk, pipe, &spd); @@ -3024,8 +3092,7 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb, if (unlikely(!proto)) return ERR_PTR(-EINVAL); - csum = !head_skb->encap_hdr_csum && - !!can_checksum_protocol(features, proto); + csum = !!can_checksum_protocol(features, proto); headroom = skb_headroom(head_skb); pos = skb_headlen(head_skb); @@ -3118,13 +3185,15 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb, if (nskb->len == len + doffset) goto perform_csum_check; - if (!sg && !nskb->remcsum_offload) { - nskb->ip_summed = CHECKSUM_NONE; - nskb->csum = skb_copy_and_csum_bits(head_skb, offset, - skb_put(nskb, len), - len, 0); + if (!sg) { + if (!nskb->remcsum_offload) + nskb->ip_summed = CHECKSUM_NONE; + SKB_GSO_CB(nskb)->csum = + skb_copy_and_csum_bits(head_skb, offset, + skb_put(nskb, len), + len, 0); SKB_GSO_CB(nskb)->csum_start = - skb_headroom(nskb) + doffset; + skb_headroom(nskb) + doffset; continue; } @@ -3190,12 +3259,19 @@ skip_fraglist: nskb->truesize += nskb->data_len; perform_csum_check: - if (!csum && !nskb->remcsum_offload) { - nskb->csum = skb_checksum(nskb, doffset, - nskb->len - doffset, 0); - nskb->ip_summed = CHECKSUM_NONE; + if (!csum) { + if (skb_has_shared_frag(nskb)) { + err = __skb_linearize(nskb); + if (err) + goto err; + } + if (!nskb->remcsum_offload) + nskb->ip_summed = CHECKSUM_NONE; + SKB_GSO_CB(nskb)->csum = + skb_checksum(nskb, doffset, + nskb->len - doffset, 0); SKB_GSO_CB(nskb)->csum_start = - skb_headroom(nskb) + doffset; + skb_headroom(nskb) + doffset; } } while ((offset += len) < head_skb->len); @@ -4237,7 +4313,6 @@ void skb_scrub_packet(struct sk_buff *skb, bool xnet) skb->skb_iif = 0; skb->ignore_df = 0; skb_dst_drop(skb); - skb_sender_cpu_clear(skb); secpath_reset(skb); nf_reset(skb); nf_reset_trace(skb); @@ -4427,15 +4502,16 @@ int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci) __skb_push(skb, offset); err = __vlan_insert_tag(skb, skb->vlan_proto, skb_vlan_tag_get(skb)); - if (err) + if (err) { + __skb_pull(skb, offset); return err; + } + skb->protocol = skb->vlan_proto; skb->mac_len += VLAN_HLEN; - __skb_pull(skb, offset); - if (skb->ip_summed == CHECKSUM_COMPLETE) - skb->csum = csum_add(skb->csum, csum_partial(skb->data - + (2 * ETH_ALEN), VLAN_HLEN, 0)); + skb_postpush_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN); + __skb_pull(skb, offset); } __vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci); return 0; diff --git a/net/core/sock.c b/net/core/sock.c index 6c1c8bc93..7e73c26b6 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -221,7 +221,8 @@ static const char *const af_family_key_strings[AF_MAX+1] = { "sk_lock-AF_TIPC" , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV" , "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN" , "sk_lock-AF_PHONET" , "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG" , - "sk_lock-AF_NFC" , "sk_lock-AF_VSOCK" , "sk_lock-AF_MAX" + "sk_lock-AF_NFC" , "sk_lock-AF_VSOCK" , "sk_lock-AF_KCM" , + "sk_lock-AF_MAX" }; static const char *const af_family_slock_key_strings[AF_MAX+1] = { "slock-AF_UNSPEC", "slock-AF_UNIX" , "slock-AF_INET" , @@ -237,7 +238,8 @@ static const char *const af_family_slock_key_strings[AF_MAX+1] = { "slock-AF_TIPC" , "slock-AF_BLUETOOTH", "slock-AF_IUCV" , "slock-AF_RXRPC" , "slock-AF_ISDN" , "slock-AF_PHONET" , "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG" , - "slock-AF_NFC" , "slock-AF_VSOCK" ,"slock-AF_MAX" + "slock-AF_NFC" , "slock-AF_VSOCK" ,"slock-AF_KCM" , + "slock-AF_MAX" }; static const char *const af_family_clock_key_strings[AF_MAX+1] = { "clock-AF_UNSPEC", "clock-AF_UNIX" , "clock-AF_INET" , @@ -253,7 +255,8 @@ static const char *const af_family_clock_key_strings[AF_MAX+1] = { "clock-AF_TIPC" , "clock-AF_BLUETOOTH", "clock-AF_IUCV" , "clock-AF_RXRPC" , "clock-AF_ISDN" , "clock-AF_PHONET" , "clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG" , - "clock-AF_NFC" , "clock-AF_VSOCK" , "clock-AF_MAX" + "clock-AF_NFC" , "clock-AF_VSOCK" , "clock-AF_KCM" , + "clock-AF_MAX" }; /* @@ -987,6 +990,10 @@ set_rcvbuf: sk->sk_incoming_cpu = val; break; + case SO_CNX_ADVICE: + if (val == 1) + dst_negative_advice(sk); + break; default: ret = -ENOPROTOOPT; break; @@ -1531,6 +1538,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) newsk = NULL; goto out; } + RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL); newsk->sk_err = 0; newsk->sk_priority = 0; @@ -1903,7 +1911,7 @@ EXPORT_SYMBOL(sock_cmsg_send); bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp) { if (pfrag->page) { - if (atomic_read(&pfrag->page->_count) == 1) { + if (page_ref_count(pfrag->page) == 1) { pfrag->offset = 0; return true; } diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 8be8f27bf..9c67a961b 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -800,7 +800,7 @@ static int dccp_v4_rcv(struct sk_buff *skb) } lookup: - sk = __inet_lookup_skb(&dccp_hashinfo, skb, + sk = __inet_lookup_skb(&dccp_hashinfo, skb, __dccp_hdr_len(dh), dh->dccph_sport, dh->dccph_dport); if (!sk) { dccp_pr_debug("failed to look up flow ID in table and " diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index b8608b71a..4663a01d5 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -668,7 +668,7 @@ static int dccp_v6_rcv(struct sk_buff *skb) DCCP_SKB_CB(skb)->dccpd_ack_seq = dccp_hdr_ack_seq(skb); lookup: - sk = __inet6_lookup_skb(&dccp_hashinfo, skb, + sk = __inet6_lookup_skb(&dccp_hashinfo, skb, __dccp_hdr_len(dh), dh->dccph_sport, dh->dccph_dport, inet6_iif(skb)); if (!sk) { @@ -993,7 +993,7 @@ static struct proto dccp_v6_prot = { .sendmsg = dccp_sendmsg, .recvmsg = dccp_recvmsg, .backlog_rcv = dccp_v6_do_rcv, - .hash = inet_hash, + .hash = inet6_hash, .unhash = inet_unhash, .accept = inet_csk_accept, .get_port = inet_csk_get_port, diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c index 607a14f20..b1dc096d2 100644 --- a/net/decnet/dn_route.c +++ b/net/decnet/dn_route.c @@ -1034,10 +1034,13 @@ source_ok: if (!fld.daddr) { fld.daddr = fld.saddr; - err = -EADDRNOTAVAIL; if (dev_out) dev_put(dev_out); + err = -EINVAL; dev_out = init_net.loopback_dev; + if (!dev_out->dn_ptr) + goto out; + err = -EADDRNOTAVAIL; dev_hold(dev_out); if (!fld.daddr) { fld.daddr = @@ -1110,6 +1113,8 @@ source_ok: if (dev_out == NULL) goto out; dn_db = rcu_dereference_raw(dev_out->dn_ptr); + if (!dn_db) + goto e_inval; /* Possible improvement - check all devices for local addr */ if (dn_dev_islocal(dev_out, fld.daddr)) { dev_put(dev_out); @@ -1151,6 +1156,8 @@ select_source: dev_put(dev_out); dev_out = init_net.loopback_dev; dev_hold(dev_out); + if (!dev_out->dn_ptr) + goto e_inval; fld.flowidn_oif = dev_out->ifindex; if (res.fi) dn_fib_info_put(res.fi); diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index d8fb47fca..c28c47463 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -430,35 +430,30 @@ static void dsa_switch_destroy(struct dsa_switch *ds) hwmon_device_unregister(ds->hwmon_dev); #endif - /* Disable configuration of the CPU and DSA ports */ + /* Destroy network devices for physical switch ports. */ for (port = 0; port < DSA_MAX_PORTS; port++) { - if (!(dsa_is_cpu_port(ds, port) || dsa_is_dsa_port(ds, port))) + if (!(ds->phys_port_mask & (1 << port))) + continue; + + if (!ds->ports[port]) continue; + dsa_slave_destroy(ds->ports[port]); + } + + /* Remove any fixed link PHYs */ + for (port = 0; port < DSA_MAX_PORTS; port++) { port_dn = cd->port_dn[port]; if (of_phy_is_fixed_link(port_dn)) { phydev = of_phy_find_device(port_dn); if (phydev) { - int addr = phydev->mdio.addr; - phy_device_free(phydev); of_node_put(port_dn); - fixed_phy_del(addr); + fixed_phy_unregister(phydev); } } } - /* Destroy network devices for physical switch ports. */ - for (port = 0; port < DSA_MAX_PORTS; port++) { - if (!(ds->phys_port_mask & (1 << port))) - continue; - - if (!ds->ports[port]) - continue; - - dsa_slave_destroy(ds->ports[port]); - } - mdiobus_unregister(ds->slave_mii_bus); } diff --git a/net/dsa/slave.c b/net/dsa/slave.c index ab24521be..a575f0350 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -201,47 +201,6 @@ out: return 0; } -static int dsa_bridge_check_vlan_range(struct dsa_switch *ds, - const struct net_device *bridge, - u16 vid_begin, u16 vid_end) -{ - struct dsa_slave_priv *p; - struct net_device *dev, *vlan_br; - DECLARE_BITMAP(members, DSA_MAX_PORTS); - DECLARE_BITMAP(untagged, DSA_MAX_PORTS); - u16 vid; - int member, err; - - if (!ds->drv->vlan_getnext || !vid_begin) - return -EOPNOTSUPP; - - vid = vid_begin - 1; - - do { - err = ds->drv->vlan_getnext(ds, &vid, members, untagged); - if (err) - break; - - if (vid > vid_end) - break; - - member = find_first_bit(members, DSA_MAX_PORTS); - if (member == DSA_MAX_PORTS) - continue; - - dev = ds->ports[member]; - p = netdev_priv(dev); - vlan_br = p->bridge_dev; - if (vlan_br == bridge) - continue; - - netdev_dbg(vlan_br, "hardware VLAN %d already in use\n", vid); - return -EOPNOTSUPP; - } while (vid < vid_end); - - return err == -ENOENT ? 0 : err; -} - static int dsa_slave_port_vlan_add(struct net_device *dev, const struct switchdev_obj_port_vlan *vlan, struct switchdev_trans *trans) @@ -254,15 +213,6 @@ static int dsa_slave_port_vlan_add(struct net_device *dev, if (!ds->drv->port_vlan_prepare || !ds->drv->port_vlan_add) return -EOPNOTSUPP; - /* If the requested port doesn't belong to the same bridge as - * the VLAN members, fallback to software VLAN (hopefully). - */ - err = dsa_bridge_check_vlan_range(ds, p->bridge_dev, - vlan->vid_begin, - vlan->vid_end); - if (err) - return err; - err = ds->drv->port_vlan_prepare(ds, p->port, vlan, trans); if (err) return err; @@ -293,41 +243,11 @@ static int dsa_slave_port_vlan_dump(struct net_device *dev, { struct dsa_slave_priv *p = netdev_priv(dev); struct dsa_switch *ds = p->parent; - DECLARE_BITMAP(members, DSA_MAX_PORTS); - DECLARE_BITMAP(untagged, DSA_MAX_PORTS); - u16 pvid, vid = 0; - int err; - if (!ds->drv->vlan_getnext || !ds->drv->port_pvid_get) - return -EOPNOTSUPP; + if (ds->drv->port_vlan_dump) + return ds->drv->port_vlan_dump(ds, p->port, vlan, cb); - err = ds->drv->port_pvid_get(ds, p->port, &pvid); - if (err) - return err; - - for (;;) { - err = ds->drv->vlan_getnext(ds, &vid, members, untagged); - if (err) - break; - - if (!test_bit(p->port, members)) - continue; - - memset(vlan, 0, sizeof(*vlan)); - vlan->vid_begin = vlan->vid_end = vid; - - if (vid == pvid) - vlan->flags |= BRIDGE_VLAN_INFO_PVID; - - if (test_bit(p->port, untagged)) - vlan->flags |= BRIDGE_VLAN_INFO_UNTAGGED; - - err = cb(&vlan->obj); - if (err) - break; - } - - return err == -ENOENT ? 0 : err; + return -EOPNOTSUPP; } static int dsa_slave_port_fdb_add(struct net_device *dev, @@ -385,31 +305,6 @@ static int dsa_slave_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) return -EOPNOTSUPP; } -/* Return a bitmask of all ports being currently bridged within a given bridge - * device. Note that on leave, the mask will still return the bitmask of ports - * currently bridged, prior to port removal, and this is exactly what we want. - */ -static u32 dsa_slave_br_port_mask(struct dsa_switch *ds, - struct net_device *bridge) -{ - struct dsa_slave_priv *p; - unsigned int port; - u32 mask = 0; - - for (port = 0; port < DSA_MAX_PORTS; port++) { - if (!dsa_is_port_initialized(ds, port)) - continue; - - p = netdev_priv(ds->ports[port]); - - if (ds->ports[port]->priv_flags & IFF_BRIDGE_PORT && - p->bridge_dev == bridge) - mask |= 1 << port; - } - - return mask; -} - static int dsa_slave_stp_update(struct net_device *dev, u8 state) { struct dsa_slave_priv *p = netdev_priv(dev); @@ -422,6 +317,24 @@ static int dsa_slave_stp_update(struct net_device *dev, u8 state) return ret; } +static int dsa_slave_vlan_filtering(struct net_device *dev, + const struct switchdev_attr *attr, + struct switchdev_trans *trans) +{ + struct dsa_slave_priv *p = netdev_priv(dev); + struct dsa_switch *ds = p->parent; + + /* bridge skips -EOPNOTSUPP, so skip the prepare phase */ + if (switchdev_trans_ph_prepare(trans)) + return 0; + + if (ds->drv->port_vlan_filtering) + return ds->drv->port_vlan_filtering(ds, p->port, + attr->u.vlan_filtering); + + return 0; +} + static int dsa_slave_port_attr_set(struct net_device *dev, const struct switchdev_attr *attr, struct switchdev_trans *trans) @@ -438,6 +351,9 @@ static int dsa_slave_port_attr_set(struct net_device *dev, ret = ds->drv->port_stp_update(ds, p->port, attr->u.stp_state); break; + case SWITCHDEV_ATTR_ID_BRIDGE_VLAN_FILTERING: + ret = dsa_slave_vlan_filtering(dev, attr, trans); + break; default: ret = -EOPNOTSUPP; break; @@ -532,23 +448,20 @@ static int dsa_slave_bridge_port_join(struct net_device *dev, p->bridge_dev = br; - if (ds->drv->port_join_bridge) - ret = ds->drv->port_join_bridge(ds, p->port, - dsa_slave_br_port_mask(ds, br)); + if (ds->drv->port_bridge_join) + ret = ds->drv->port_bridge_join(ds, p->port, br); - return ret; + return ret == -EOPNOTSUPP ? 0 : ret; } -static int dsa_slave_bridge_port_leave(struct net_device *dev) +static void dsa_slave_bridge_port_leave(struct net_device *dev) { struct dsa_slave_priv *p = netdev_priv(dev); struct dsa_switch *ds = p->parent; - int ret = -EOPNOTSUPP; - if (ds->drv->port_leave_bridge) - ret = ds->drv->port_leave_bridge(ds, p->port, - dsa_slave_br_port_mask(ds, p->bridge_dev)); + if (ds->drv->port_bridge_leave) + ds->drv->port_bridge_leave(ds, p->port); p->bridge_dev = NULL; @@ -556,8 +469,6 @@ static int dsa_slave_bridge_port_leave(struct net_device *dev) * so allow it to be in BR_STATE_FORWARDING to be kept functional */ dsa_slave_stp_update(dev, BR_STATE_FORWARDING); - - return ret; } static int dsa_slave_port_attr_get(struct net_device *dev, @@ -982,11 +893,15 @@ static void dsa_slave_adjust_link(struct net_device *dev) static int dsa_slave_fixed_link_update(struct net_device *dev, struct fixed_phy_status *status) { - struct dsa_slave_priv *p = netdev_priv(dev); - struct dsa_switch *ds = p->parent; + struct dsa_slave_priv *p; + struct dsa_switch *ds; - if (ds->drv->fixed_link_update) - ds->drv->fixed_link_update(ds, p->port, status); + if (dev) { + p = netdev_priv(dev); + ds = p->parent; + if (ds->drv->fixed_link_update) + ds->drv->fixed_link_update(ds, p->port, status); + } return 0; } @@ -1228,40 +1143,46 @@ static bool dsa_slave_dev_check(struct net_device *dev) return dev->netdev_ops == &dsa_slave_netdev_ops; } -static int dsa_slave_master_changed(struct net_device *dev) +static int dsa_slave_port_upper_event(struct net_device *dev, + unsigned long event, void *ptr) { - struct net_device *master = netdev_master_upper_dev_get(dev); - struct dsa_slave_priv *p = netdev_priv(dev); + struct netdev_notifier_changeupper_info *info = ptr; + struct net_device *upper = info->upper_dev; int err = 0; - if (master && master->rtnl_link_ops && - !strcmp(master->rtnl_link_ops->kind, "bridge")) - err = dsa_slave_bridge_port_join(dev, master); - else if (dsa_port_is_bridged(p)) - err = dsa_slave_bridge_port_leave(dev); + switch (event) { + case NETDEV_CHANGEUPPER: + if (netif_is_bridge_master(upper)) { + if (info->linking) + err = dsa_slave_bridge_port_join(dev, upper); + else + dsa_slave_bridge_port_leave(dev); + } - return err; + break; + } + + return notifier_from_errno(err); } -int dsa_slave_netdevice_event(struct notifier_block *unused, - unsigned long event, void *ptr) +static int dsa_slave_port_event(struct net_device *dev, unsigned long event, + void *ptr) { - struct net_device *dev; - int err = 0; - switch (event) { case NETDEV_CHANGEUPPER: - dev = netdev_notifier_info_to_dev(ptr); - if (!dsa_slave_dev_check(dev)) - goto out; + return dsa_slave_port_upper_event(dev, event, ptr); + } - err = dsa_slave_master_changed(dev); - if (err && err != -EOPNOTSUPP) - netdev_warn(dev, "failed to reflect master change\n"); + return NOTIFY_DONE; +} - break; - } +int dsa_slave_netdevice_event(struct notifier_block *unused, + unsigned long event, void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + + if (dsa_slave_dev_check(dev)) + return dsa_slave_port_event(dev, event, ptr); -out: return NOTIFY_DONE; } diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c index 103871784..66dff5e3d 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -125,6 +125,7 @@ EXPORT_SYMBOL(eth_header); */ u32 eth_get_headlen(void *data, unsigned int len) { + const unsigned int flags = FLOW_DISSECTOR_F_PARSE_1ST_FRAG; const struct ethhdr *eth = (const struct ethhdr *)data; struct flow_keys keys; @@ -134,7 +135,7 @@ u32 eth_get_headlen(void *data, unsigned int len) /* parse any remaining L2/L3 headers, check for L4 */ if (!skb_flow_dissect_flow_keys_buf(&keys, data, eth->h_proto, - sizeof(*eth), len, 0)) + sizeof(*eth), len, flags)) return max_t(u32, keys.control.thoff, sizeof(*eth)); /* parse for any L4 headers */ diff --git a/net/ieee802154/6lowpan/core.c b/net/ieee802154/6lowpan/core.c index 737c87a2a..0023c9048 100644 --- a/net/ieee802154/6lowpan/core.c +++ b/net/ieee802154/6lowpan/core.c @@ -207,7 +207,7 @@ static int lowpan_device_event(struct notifier_block *unused, struct net_device *wdev = netdev_notifier_info_to_dev(ptr); if (wdev->type != ARPHRD_IEEE802154) - goto out; + return NOTIFY_DONE; switch (event) { case NETDEV_UNREGISTER: @@ -219,11 +219,10 @@ static int lowpan_device_event(struct notifier_block *unused, lowpan_dellink(wdev->ieee802154_ptr->lowpan_dev, NULL); break; default: - break; + return NOTIFY_DONE; } -out: - return NOTIFY_DONE; + return NOTIFY_OK; } static struct notifier_block lowpan_dev_notifier = { diff --git a/net/ieee802154/socket.c b/net/ieee802154/socket.c index a548be247..e0bd013a1 100644 --- a/net/ieee802154/socket.c +++ b/net/ieee802154/socket.c @@ -182,12 +182,14 @@ static int ieee802154_sock_ioctl(struct socket *sock, unsigned int cmd, static HLIST_HEAD(raw_head); static DEFINE_RWLOCK(raw_lock); -static void raw_hash(struct sock *sk) +static int raw_hash(struct sock *sk) { write_lock_bh(&raw_lock); sk_add_node(sk, &raw_head); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); write_unlock_bh(&raw_lock); + + return 0; } static void raw_unhash(struct sock *sk) @@ -462,12 +464,14 @@ static inline struct dgram_sock *dgram_sk(const struct sock *sk) return container_of(sk, struct dgram_sock, sk); } -static void dgram_hash(struct sock *sk) +static int dgram_hash(struct sock *sk) { write_lock_bh(&dgram_lock); sk_add_node(sk, &dgram_head); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); write_unlock_bh(&dgram_lock); + + return 0; } static void dgram_unhash(struct sock *sk) @@ -1026,8 +1030,13 @@ static int ieee802154_create(struct net *net, struct socket *sock, /* Checksums on by default */ sock_set_flag(sk, SOCK_ZAPPED); - if (sk->sk_prot->hash) - sk->sk_prot->hash(sk); + if (sk->sk_prot->hash) { + rc = sk->sk_prot->hash(sk); + if (rc) { + sk_common_release(sk); + goto out; + } + } if (sk->sk_prot->init) { rc = sk->sk_prot->init(sk); diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 7ccd693db..eb51c43c2 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -186,6 +186,7 @@ config NET_IPGRE_DEMUX config NET_IP_TUNNEL tristate + select DST_CACHE default n config NET_IPGRE @@ -405,14 +406,6 @@ config INET_XFRM_MODE_BEET If unsure, say Y. -config INET_LRO - tristate "Large Receive Offload (ipv4/tcp)" - default y - ---help--- - Support for Large Receive Offload (ipv4/tcp). - - If unsure, say Y. - config INET_DIAG tristate "INET: socket monitoring interface" default y diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 62c049b64..bfa133691 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -32,7 +32,6 @@ obj-$(CONFIG_INET_ESP) += esp4.o obj-$(CONFIG_INET_IPCOMP) += ipcomp.o obj-$(CONFIG_INET_XFRM_TUNNEL) += xfrm4_tunnel.o obj-$(CONFIG_INET_XFRM_MODE_BEET) += xfrm4_mode_beet.o -obj-$(CONFIG_INET_LRO) += inet_lro.o obj-$(CONFIG_INET_TUNNEL) += tunnel4.o obj-$(CONFIG_INET_XFRM_MODE_TRANSPORT) += xfrm4_mode_transport.o obj-$(CONFIG_INET_XFRM_MODE_TUNNEL) += xfrm4_mode_tunnel.o diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 5c5db6636..9e481992d 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -370,7 +370,11 @@ lookup_protocol: */ inet->inet_sport = htons(inet->inet_num); /* Add to protocol hash chains. */ - sk->sk_prot->hash(sk); + err = sk->sk_prot->hash(sk); + if (err) { + sk_common_release(sk); + goto out; + } } if (sk->sk_prot->init) { @@ -1091,12 +1095,6 @@ void inet_unregister_protosw(struct inet_protosw *p) } EXPORT_SYMBOL(inet_unregister_protosw); -/* - * Shall we try to damage output packets if routing dev changes? - */ - -int sysctl_ip_dynaddr __read_mostly; - static int inet_sk_reselect_saddr(struct sock *sk) { struct inet_sock *inet = inet_sk(sk); @@ -1127,7 +1125,7 @@ static int inet_sk_reselect_saddr(struct sock *sk) if (new_saddr == old_saddr) return 0; - if (sysctl_ip_dynaddr > 1) { + if (sock_net(sk)->ipv4.sysctl_ip_dynaddr > 1) { pr_info("%s(): shifting inet->saddr from %pI4 to %pI4\n", __func__, &old_saddr, &new_saddr); } @@ -1142,8 +1140,7 @@ static int inet_sk_reselect_saddr(struct sock *sk) * Besides that, it does not check for connection * uniqueness. Wait for troubles. */ - __sk_prot_rehash(sk); - return 0; + return __sk_prot_rehash(sk); } int inet_sk_rebuild_header(struct sock *sk) @@ -1183,7 +1180,7 @@ int inet_sk_rebuild_header(struct sock *sk) * Other protocols have to map its equivalent state to TCP_SYN_SENT. * DCCP maps its DCCP_REQUESTING state to TCP_SYN_SENT. -acme */ - if (!sysctl_ip_dynaddr || + if (!sock_net(sk)->ipv4.sysctl_ip_dynaddr || sk->sk_state != TCP_SYN_SENT || (sk->sk_userlocks & SOCK_BINDADDR_LOCK) || (err = inet_sk_reselect_saddr(sk)) != 0) @@ -1383,6 +1380,45 @@ out: return pp; } +static struct sk_buff **ipip_gro_receive(struct sk_buff **head, + struct sk_buff *skb) +{ + if (NAPI_GRO_CB(skb)->encap_mark) { + NAPI_GRO_CB(skb)->flush = 1; + return NULL; + } + + NAPI_GRO_CB(skb)->encap_mark = 1; + + return inet_gro_receive(head, skb); +} + +#define SECONDS_PER_DAY 86400 + +/* inet_current_timestamp - Return IP network timestamp + * + * Return milliseconds since midnight in network byte order. + */ +__be32 inet_current_timestamp(void) +{ + u32 secs; + u32 msecs; + struct timespec64 ts; + + ktime_get_real_ts64(&ts); + + /* Get secs since midnight. */ + (void)div_u64_rem(ts.tv_sec, SECONDS_PER_DAY, &secs); + /* Convert to msecs. */ + msecs = secs * MSEC_PER_SEC; + /* Convert nsec to msec. */ + msecs += (u32)ts.tv_nsec / NSEC_PER_MSEC; + + /* Convert to network byte order. */ + return htonl(msecs); +} +EXPORT_SYMBOL(inet_current_timestamp); + int inet_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) { if (sk->sk_family == AF_INET) @@ -1425,6 +1461,13 @@ out_unlock: return err; } +static int ipip_gro_complete(struct sk_buff *skb, int nhoff) +{ + skb->encapsulation = 1; + skb_shinfo(skb)->gso_type |= SKB_GSO_IPIP; + return inet_gro_complete(skb, nhoff); +} + int inet_ctl_sock_create(struct sock **sk, unsigned short family, unsigned short type, unsigned char protocol, struct net *net) @@ -1652,8 +1695,8 @@ static struct packet_offload ip_packet_offload __read_mostly = { static const struct net_offload ipip_offload = { .callbacks = { .gso_segment = inet_gso_segment, - .gro_receive = inet_gro_receive, - .gro_complete = inet_gro_complete, + .gro_receive = ipip_gro_receive, + .gro_complete = ipip_gro_complete, }, }; diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 59b3e0e8f..c34c7544d 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -665,7 +665,7 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb) */ if (!in_dev) - goto out; + goto out_free_skb; arp = arp_hdr(skb); @@ -673,7 +673,7 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb) default: if (arp->ar_pro != htons(ETH_P_IP) || htons(dev_type) != arp->ar_hrd) - goto out; + goto out_free_skb; break; case ARPHRD_ETHER: case ARPHRD_FDDI: @@ -690,17 +690,17 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb) if ((arp->ar_hrd != htons(ARPHRD_ETHER) && arp->ar_hrd != htons(ARPHRD_IEEE802)) || arp->ar_pro != htons(ETH_P_IP)) - goto out; + goto out_free_skb; break; case ARPHRD_AX25: if (arp->ar_pro != htons(AX25_P_IP) || arp->ar_hrd != htons(ARPHRD_AX25)) - goto out; + goto out_free_skb; break; case ARPHRD_NETROM: if (arp->ar_pro != htons(AX25_P_IP) || arp->ar_hrd != htons(ARPHRD_NETROM)) - goto out; + goto out_free_skb; break; } @@ -708,7 +708,7 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb) if (arp->ar_op != htons(ARPOP_REPLY) && arp->ar_op != htons(ARPOP_REQUEST)) - goto out; + goto out_free_skb; /* * Extract fields @@ -733,7 +733,15 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb) */ if (ipv4_is_multicast(tip) || (!IN_DEV_ROUTE_LOCALNET(in_dev) && ipv4_is_loopback(tip))) - goto out; + goto out_free_skb; + + /* + * For some 802.11 wireless deployments (and possibly other networks), + * there will be an ARP proxy and gratuitous ARP frames are attacks + * and thus should not be accepted. + */ + if (sip == tip && IN_DEV_ORCONF(in_dev, DROP_GRATUITOUS_ARP)) + goto out_free_skb; /* * Special case: We must set Frame Relay source Q.922 address @@ -770,7 +778,7 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb) !arp_ignore(in_dev, sip, tip)) arp_send_dst(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip, sha, dev->dev_addr, sha, reply_dst); - goto out; + goto out_consume_skb; } if (arp->ar_op == htons(ARPOP_REQUEST) && @@ -795,7 +803,7 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb) neigh_release(n); } } - goto out; + goto out_consume_skb; } else if (IN_DEV_FORWARD(in_dev)) { if (addr_type == RTN_UNICAST && (arp_fwd_proxy(in_dev, dev, rt) || @@ -818,7 +826,7 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb) in_dev->arp_parms, skb); goto out_free_dst; } - goto out; + goto out_consume_skb; } } } @@ -868,11 +876,16 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb) neigh_release(n); } -out: +out_consume_skb: consume_skb(skb); + out_free_dst: dst_release(reply_dst); - return 0; + return NET_RX_SUCCESS; + +out_free_skb: + kfree_skb(skb); + return NET_RX_DROP; } static void parp_redo(struct sk_buff *skb) @@ -916,11 +929,11 @@ static int arp_rcv(struct sk_buff *skb, struct net_device *dev, consumeskb: consume_skb(skb); - return 0; + return NET_RX_SUCCESS; freeskb: kfree_skb(skb); out_of_mem: - return 0; + return NET_RX_DROP; } /* diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 0212591b0..e333bc86b 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1198,6 +1198,7 @@ __be32 inet_select_addr(const struct net_device *dev, __be32 dst, int scope) __be32 addr = 0; struct in_device *in_dev; struct net *net = dev_net(dev); + int master_idx; rcu_read_lock(); in_dev = __in_dev_get_rcu(dev); @@ -1218,12 +1219,33 @@ __be32 inet_select_addr(const struct net_device *dev, __be32 dst, int scope) if (addr) goto out_unlock; no_in_dev: + master_idx = l3mdev_master_ifindex_rcu(dev); + + /* For VRFs, the VRF device takes the place of the loopback device, + * with addresses on it being preferred. Note in such cases the + * loopback device will be among the devices that fail the master_idx + * equality check in the loop below. + */ + if (master_idx && + (dev = dev_get_by_index_rcu(net, master_idx)) && + (in_dev = __in_dev_get_rcu(dev))) { + for_primary_ifa(in_dev) { + if (ifa->ifa_scope != RT_SCOPE_LINK && + ifa->ifa_scope <= scope) { + addr = ifa->ifa_local; + goto out_unlock; + } + } endfor_ifa(in_dev); + } /* Not loopback addresses on loopback should be preferred in this case. It is important that lo is the first interface in dev_base list. */ for_each_netdev_rcu(net, dev) { + if (l3mdev_master_ifindex_rcu(dev) != master_idx) + continue; + in_dev = __in_dev_get_rcu(dev); if (!in_dev) continue; @@ -1735,17 +1757,20 @@ static int inet_netconf_msgsize_devconf(int type) { int size = NLMSG_ALIGN(sizeof(struct netconfmsg)) + nla_total_size(4); /* NETCONFA_IFINDEX */ + bool all = false; + + if (type == NETCONFA_ALL) + all = true; - /* type -1 is used for ALL */ - if (type == -1 || type == NETCONFA_FORWARDING) + if (all || type == NETCONFA_FORWARDING) size += nla_total_size(4); - if (type == -1 || type == NETCONFA_RP_FILTER) + if (all || type == NETCONFA_RP_FILTER) size += nla_total_size(4); - if (type == -1 || type == NETCONFA_MC_FORWARDING) + if (all || type == NETCONFA_MC_FORWARDING) size += nla_total_size(4); - if (type == -1 || type == NETCONFA_PROXY_NEIGH) + if (all || type == NETCONFA_PROXY_NEIGH) size += nla_total_size(4); - if (type == -1 || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN) + if (all || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN) size += nla_total_size(4); return size; @@ -1758,36 +1783,39 @@ static int inet_netconf_fill_devconf(struct sk_buff *skb, int ifindex, { struct nlmsghdr *nlh; struct netconfmsg *ncm; + bool all = false; nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct netconfmsg), flags); if (!nlh) return -EMSGSIZE; + if (type == NETCONFA_ALL) + all = true; + ncm = nlmsg_data(nlh); ncm->ncm_family = AF_INET; if (nla_put_s32(skb, NETCONFA_IFINDEX, ifindex) < 0) goto nla_put_failure; - /* type -1 is used for ALL */ - if ((type == -1 || type == NETCONFA_FORWARDING) && + if ((all || type == NETCONFA_FORWARDING) && nla_put_s32(skb, NETCONFA_FORWARDING, IPV4_DEVCONF(*devconf, FORWARDING)) < 0) goto nla_put_failure; - if ((type == -1 || type == NETCONFA_RP_FILTER) && + if ((all || type == NETCONFA_RP_FILTER) && nla_put_s32(skb, NETCONFA_RP_FILTER, IPV4_DEVCONF(*devconf, RP_FILTER)) < 0) goto nla_put_failure; - if ((type == -1 || type == NETCONFA_MC_FORWARDING) && + if ((all || type == NETCONFA_MC_FORWARDING) && nla_put_s32(skb, NETCONFA_MC_FORWARDING, IPV4_DEVCONF(*devconf, MC_FORWARDING)) < 0) goto nla_put_failure; - if ((type == -1 || type == NETCONFA_PROXY_NEIGH) && + if ((all || type == NETCONFA_PROXY_NEIGH) && nla_put_s32(skb, NETCONFA_PROXY_NEIGH, IPV4_DEVCONF(*devconf, PROXY_ARP)) < 0) goto nla_put_failure; - if ((type == -1 || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN) && + if ((all || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN) && nla_put_s32(skb, NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN, IPV4_DEVCONF(*devconf, IGNORE_ROUTES_WITH_LINKDOWN)) < 0) goto nla_put_failure; @@ -1875,14 +1903,14 @@ static int inet_netconf_get_devconf(struct sk_buff *in_skb, } err = -ENOBUFS; - skb = nlmsg_new(inet_netconf_msgsize_devconf(-1), GFP_ATOMIC); + skb = nlmsg_new(inet_netconf_msgsize_devconf(NETCONFA_ALL), GFP_ATOMIC); if (!skb) goto errout; err = inet_netconf_fill_devconf(skb, ifindex, devconf, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, RTM_NEWNETCONF, 0, - -1); + NETCONFA_ALL); if (err < 0) { /* -EMSGSIZE implies BUG in inet_netconf_msgsize_devconf() */ WARN_ON(err == -EMSGSIZE); @@ -1926,7 +1954,7 @@ static int inet_netconf_dump_devconf(struct sk_buff *skb, cb->nlh->nlmsg_seq, RTM_NEWNETCONF, NLM_F_MULTI, - -1) < 0) { + NETCONFA_ALL) < 0) { rcu_read_unlock(); goto done; } @@ -1942,7 +1970,7 @@ cont: NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RTM_NEWNETCONF, NLM_F_MULTI, - -1) < 0) + NETCONFA_ALL) < 0) goto done; else h++; @@ -1953,7 +1981,7 @@ cont: NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RTM_NEWNETCONF, NLM_F_MULTI, - -1) < 0) + NETCONFA_ALL) < 0) goto done; else h++; @@ -2189,6 +2217,8 @@ static struct devinet_sysctl_table { "igmpv3_unsolicited_report_interval"), DEVINET_SYSCTL_RW_ENTRY(IGNORE_ROUTES_WITH_LINKDOWN, "ignore_routes_with_linkdown"), + DEVINET_SYSCTL_RW_ENTRY(DROP_GRATUITOUS_ARP, + "drop_gratuitous_arp"), DEVINET_SYSCTL_FLUSHING_ENTRY(NOXFRM, "disable_xfrm"), DEVINET_SYSCTL_FLUSHING_ENTRY(NOPOLICY, "disable_policy"), @@ -2196,6 +2226,8 @@ static struct devinet_sysctl_table { "promote_secondaries"), DEVINET_SYSCTL_FLUSHING_ENTRY(ROUTE_LOCALNET, "route_localnet"), + DEVINET_SYSCTL_FLUSHING_ENTRY(DROP_UNICAST_IN_L2_MULTICAST, + "drop_unicast_in_l2_multicast"), }, }; diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 8a9246dec..63566ec54 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -904,7 +904,11 @@ void fib_del_ifaddr(struct in_ifaddr *ifa, struct in_ifaddr *iprim) if (ifa->ifa_flags & IFA_F_SECONDARY) { prim = inet_ifa_byprefix(in_dev, any, ifa->ifa_mask); if (!prim) { - pr_warn("%s: bug: prim == NULL\n", __func__); + /* if the device has been deleted, we don't perform + * address promotion + */ + if (!in_dev->dead) + pr_warn("%s: bug: prim == NULL\n", __func__); return; } if (iprim && iprim != prim) { diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index d97268e8f..2b68418c7 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -975,6 +975,8 @@ fib_convert_metrics(struct fib_info *fi, const struct fib_config *cfg) val = 65535 - 40; if (type == RTAX_MTU && val > 65535 - 15) val = 65535 - 15; + if (type == RTAX_HOPLIMIT && val > 255) + val = 255; if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK)) return -EINVAL; fi->fib_metrics[type - 1] = val; diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index 976f0dcf6..a6962ccad 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -48,7 +48,7 @@ static inline struct fou *fou_from_sock(struct sock *sk) return sk->sk_user_data; } -static void fou_recv_pull(struct sk_buff *skb, size_t len) +static int fou_recv_pull(struct sk_buff *skb, size_t len) { struct iphdr *iph = ip_hdr(skb); @@ -59,6 +59,7 @@ static void fou_recv_pull(struct sk_buff *skb, size_t len) __skb_pull(skb, len); skb_postpull_rcsum(skb, udp_hdr(skb), len); skb_reset_transport_header(skb); + return iptunnel_pull_offloads(skb); } static int fou_udp_recv(struct sock *sk, struct sk_buff *skb) @@ -68,9 +69,14 @@ static int fou_udp_recv(struct sock *sk, struct sk_buff *skb) if (!fou) return 1; - fou_recv_pull(skb, sizeof(struct udphdr)); + if (fou_recv_pull(skb, sizeof(struct udphdr))) + goto drop; return -fou->protocol; + +drop: + kfree_skb(skb); + return 0; } static struct guehdr *gue_remcsum(struct sk_buff *skb, struct guehdr *guehdr, @@ -170,6 +176,9 @@ static int gue_udp_recv(struct sock *sk, struct sk_buff *skb) __skb_pull(skb, sizeof(struct udphdr) + hdrlen); skb_reset_transport_header(skb); + if (iptunnel_pull_offloads(skb)) + goto drop; + return -guehdr->proto_ctype; drop: @@ -186,6 +195,17 @@ static struct sk_buff **fou_gro_receive(struct sk_buff **head, u8 proto = NAPI_GRO_CB(skb)->proto; const struct net_offload **offloads; + /* We can clear the encap_mark for FOU as we are essentially doing + * one of two possible things. We are either adding an L4 tunnel + * header to the outer L3 tunnel header, or we are are simply + * treating the GRE tunnel header as though it is a UDP protocol + * specific header such as VXLAN or GENEVE. + */ + NAPI_GRO_CB(skb)->encap_mark = 0; + + /* Flag this frame as already having an outer encap header */ + NAPI_GRO_CB(skb)->is_fou = 1; + rcu_read_lock(); offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads; ops = rcu_dereference(offloads[proto]); @@ -208,8 +228,6 @@ static int fou_gro_complete(struct sk_buff *skb, int nhoff, int err = -ENOSYS; const struct net_offload **offloads; - udp_tunnel_gro_complete(skb, nhoff); - rcu_read_lock(); offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads; ops = rcu_dereference(offloads[proto]); @@ -218,6 +236,8 @@ static int fou_gro_complete(struct sk_buff *skb, int nhoff, err = ops->callbacks.gro_complete(skb, nhoff); + skb_set_inner_mac_header(skb, nhoff); + out_unlock: rcu_read_unlock(); @@ -319,8 +339,6 @@ static struct sk_buff **gue_gro_receive(struct sk_buff **head, skb_gro_pull(skb, hdrlen); - flush = 0; - for (p = *head; p; p = p->next) { const struct guehdr *guehdr2; @@ -345,6 +363,17 @@ static struct sk_buff **gue_gro_receive(struct sk_buff **head, } } + /* We can clear the encap_mark for GUE as we are essentially doing + * one of two possible things. We are either adding an L4 tunnel + * header to the outer L3 tunnel header, or we are are simply + * treating the GRE tunnel header as though it is a UDP protocol + * specific header such as VXLAN or GENEVE. + */ + NAPI_GRO_CB(skb)->encap_mark = 0; + + /* Flag this frame as already having an outer encap header */ + NAPI_GRO_CB(skb)->is_fou = 1; + rcu_read_lock(); offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads; ops = rcu_dereference(offloads[guehdr->proto_ctype]); @@ -352,6 +381,7 @@ static struct sk_buff **gue_gro_receive(struct sk_buff **head, goto out_unlock; pp = ops->callbacks.gro_receive(head, skb); + flush = 0; out_unlock: rcu_read_unlock(); @@ -384,6 +414,8 @@ static int gue_gro_complete(struct sk_buff *skb, int nhoff, err = ops->callbacks.gro_complete(skb, nhoff + guehlen); + skb_set_inner_mac_header(skb, nhoff + guehlen); + out_unlock: rcu_read_unlock(); return err; @@ -774,7 +806,6 @@ static void fou_build_udp(struct sk_buff *skb, struct ip_tunnel_encap *e, uh->dest = e->dport; uh->source = sport; uh->len = htons(skb->len); - uh->check = 0; udp_set_csum(!(e->flags & TUNNEL_ENCAP_FLAG_CSUM), skb, fl4->saddr, fl4->daddr, skb->len); @@ -784,11 +815,11 @@ static void fou_build_udp(struct sk_buff *skb, struct ip_tunnel_encap *e, int fou_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, u8 *protocol, struct flowi4 *fl4) { - bool csum = !!(e->flags & TUNNEL_ENCAP_FLAG_CSUM); - int type = csum ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL; + int type = e->flags & TUNNEL_ENCAP_FLAG_CSUM ? SKB_GSO_UDP_TUNNEL_CSUM : + SKB_GSO_UDP_TUNNEL; __be16 sport; - skb = iptunnel_handle_offloads(skb, csum, type); + skb = iptunnel_handle_offloads(skb, type); if (IS_ERR(skb)) return PTR_ERR(skb); @@ -804,8 +835,8 @@ EXPORT_SYMBOL(fou_build_header); int gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, u8 *protocol, struct flowi4 *fl4) { - bool csum = !!(e->flags & TUNNEL_ENCAP_FLAG_CSUM); - int type = csum ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL; + int type = e->flags & TUNNEL_ENCAP_FLAG_CSUM ? SKB_GSO_UDP_TUNNEL_CSUM : + SKB_GSO_UDP_TUNNEL; struct guehdr *guehdr; size_t hdrlen, optlen = 0; __be16 sport; @@ -814,7 +845,6 @@ int gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, if ((e->flags & TUNNEL_ENCAP_FLAG_REMCSUM) && skb->ip_summed == CHECKSUM_PARTIAL) { - csum = false; optlen += GUE_PLEN_REMCSUM; type |= SKB_GSO_TUNNEL_REMCSUM; need_priv = true; @@ -822,7 +852,7 @@ int gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, optlen += need_priv ? GUE_LEN_PRIV : 0; - skb = iptunnel_handle_offloads(skb, csum, type); + skb = iptunnel_handle_offloads(skb, type); if (IS_ERR(skb)) return PTR_ERR(skb); diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c index 5a8ee3282..6a5bd4317 100644 --- a/net/ipv4/gre_offload.c +++ b/net/ipv4/gre_offload.c @@ -18,15 +18,13 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb, netdev_features_t features) { + int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb); struct sk_buff *segs = ERR_PTR(-EINVAL); - netdev_features_t enc_features; - int ghl; - struct gre_base_hdr *greh; u16 mac_offset = skb->mac_header; - int mac_len = skb->mac_len; __be16 protocol = skb->protocol; - int tnl_hlen; - bool csum; + u16 mac_len = skb->mac_len; + int gre_offset, outer_hlen; + bool need_csum, ufo; if (unlikely(skb_shinfo(skb)->gso_type & ~(SKB_GSO_TCPV4 | @@ -43,74 +41,75 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb, if (!skb->encapsulation) goto out; - if (unlikely(!pskb_may_pull(skb, sizeof(*greh)))) + if (unlikely(tnl_hlen < sizeof(struct gre_base_hdr))) goto out; - greh = (struct gre_base_hdr *)skb_transport_header(skb); - - ghl = skb_inner_mac_header(skb) - skb_transport_header(skb); - if (unlikely(ghl < sizeof(*greh))) + if (unlikely(!pskb_may_pull(skb, tnl_hlen))) goto out; - csum = !!(greh->flags & GRE_CSUM); - if (csum) - skb->encap_hdr_csum = 1; - /* setup inner skb. */ - skb->protocol = greh->protocol; skb->encapsulation = 0; - - if (unlikely(!pskb_may_pull(skb, ghl))) - goto out; - - __skb_pull(skb, ghl); + SKB_GSO_CB(skb)->encap_level = 0; + __skb_pull(skb, tnl_hlen); skb_reset_mac_header(skb); skb_set_network_header(skb, skb_inner_network_offset(skb)); skb->mac_len = skb_inner_network_offset(skb); + skb->protocol = skb->inner_protocol; + + need_csum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_GRE_CSUM); + skb->encap_hdr_csum = need_csum; + + ufo = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP); + + features &= skb->dev->hw_enc_features; + + /* The only checksum offload we care about from here on out is the + * outer one so strip the existing checksum feature flags based + * on the fact that we will be computing our checksum in software. + */ + if (ufo) { + features &= ~NETIF_F_CSUM_MASK; + if (!need_csum) + features |= NETIF_F_HW_CSUM; + } /* segment inner packet. */ - enc_features = skb->dev->hw_enc_features & features; - segs = skb_mac_gso_segment(skb, enc_features); + segs = skb_mac_gso_segment(skb, features); if (IS_ERR_OR_NULL(segs)) { - skb_gso_error_unwind(skb, protocol, ghl, mac_offset, mac_len); + skb_gso_error_unwind(skb, protocol, tnl_hlen, mac_offset, + mac_len); goto out; } + outer_hlen = skb_tnl_header_len(skb); + gre_offset = outer_hlen - tnl_hlen; skb = segs; - tnl_hlen = skb_tnl_header_len(skb); do { - __skb_push(skb, ghl); - if (csum) { - __be32 *pcsum; - - if (skb_has_shared_frag(skb)) { - int err; - - err = __skb_linearize(skb); - if (err) { - kfree_skb_list(segs); - segs = ERR_PTR(err); - goto out; - } - } + struct gre_base_hdr *greh; + __be32 *pcsum; - skb_reset_transport_header(skb); - - greh = (struct gre_base_hdr *) - skb_transport_header(skb); - pcsum = (__be32 *)(greh + 1); - *pcsum = 0; - *(__sum16 *)pcsum = gso_make_checksum(skb, 0); + /* Set up inner headers if we are offloading inner checksum */ + if (skb->ip_summed == CHECKSUM_PARTIAL) { + skb_reset_inner_headers(skb); + skb->encapsulation = 1; } - __skb_push(skb, tnl_hlen - ghl); - skb_reset_inner_headers(skb); - skb->encapsulation = 1; + skb->mac_len = mac_len; + skb->protocol = protocol; + __skb_push(skb, outer_hlen); skb_reset_mac_header(skb); skb_set_network_header(skb, mac_len); - skb->mac_len = mac_len; - skb->protocol = protocol; + skb_set_transport_header(skb, gre_offset); + + if (!need_csum) + continue; + + greh = (struct gre_base_hdr *)skb_transport_header(skb); + pcsum = (__be32 *)(greh + 1); + + *pcsum = 0; + *(__sum16 *)pcsum = gso_make_checksum(skb, 0); } while ((skb = skb->next)); out: return segs; @@ -128,6 +127,11 @@ static struct sk_buff **gre_gro_receive(struct sk_buff **head, struct packet_offload *ptype; __be16 type; + if (NAPI_GRO_CB(skb)->encap_mark) + goto out; + + NAPI_GRO_CB(skb)->encap_mark = 1; + off = skb_gro_offset(skb); hlen = off + sizeof(*greh); greh = skb_gro_header_fast(skb, off); @@ -146,6 +150,14 @@ static struct sk_buff **gre_gro_receive(struct sk_buff **head, if ((greh->flags & ~(GRE_KEY|GRE_CSUM)) != 0) goto out; + /* We can only support GRE_CSUM if we can track the location of + * the GRE header. In the case of FOU/GUE we cannot because the + * outer UDP header displaces the GRE header leaving us in a state + * of limbo. + */ + if ((greh->flags & GRE_CSUM) && NAPI_GRO_CB(skb)->is_fou) + goto out; + type = greh->protocol; rcu_read_lock(); @@ -177,8 +189,6 @@ static struct sk_buff **gre_gro_receive(struct sk_buff **head, null_compute_pseudo); } - flush = 0; - for (p = *head; p; p = p->next) { const struct gre_base_hdr *greh2; @@ -215,6 +225,7 @@ static struct sk_buff **gre_gro_receive(struct sk_buff **head, skb_gro_postpull_rcsum(skb, greh, grehlen); pp = ptype->callbacks.gro_receive(head, skb); + flush = 0; out_unlock: rcu_read_unlock(); diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 36e26977c..633348977 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -931,7 +931,6 @@ static bool icmp_echo(struct sk_buff *skb) */ static bool icmp_timestamp(struct sk_buff *skb) { - struct timespec tv; struct icmp_bxm icmp_param; /* * Too short. @@ -942,9 +941,7 @@ static bool icmp_timestamp(struct sk_buff *skb) /* * Fill in the current time as ms since midnight UT: */ - getnstimeofday(&tv); - icmp_param.data.times[1] = htonl((tv.tv_sec % 86400) * MSEC_PER_SEC + - tv.tv_nsec / NSEC_PER_MSEC); + icmp_param.data.times[1] = inet_current_timestamp(); icmp_param.data.times[2] = icmp_param.data.times[1]; if (skb_copy_bits(skb, 0, &icmp_param.data.times[0], 4)) BUG(); diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index b3086cf27..9b4ca87f7 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -107,12 +107,6 @@ #include <linux/seq_file.h> #endif -#define IP_MAX_MEMBERSHIPS 20 -#define IP_MAX_MSF 10 - -/* IGMP reports for link-local multicast groups are enabled by default */ -int sysctl_igmp_llm_reports __read_mostly = 1; - #ifdef CONFIG_IP_MULTICAST /* Parameter names and values are taken from igmp-v2-06 draft */ @@ -432,6 +426,7 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc, int type, int gdeleted, int sdeleted) { struct net_device *dev = pmc->interface->dev; + struct net *net = dev_net(dev); struct igmpv3_report *pih; struct igmpv3_grec *pgr = NULL; struct ip_sf_list *psf, *psf_next, *psf_prev, **psf_list; @@ -439,7 +434,7 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc, if (pmc->multiaddr == IGMP_ALL_HOSTS) return skb; - if (ipv4_is_local_multicast(pmc->multiaddr) && !sysctl_igmp_llm_reports) + if (ipv4_is_local_multicast(pmc->multiaddr) && !net->ipv4.sysctl_igmp_llm_reports) return skb; isquery = type == IGMPV3_MODE_IS_INCLUDE || @@ -542,6 +537,7 @@ empty_source: static int igmpv3_send_report(struct in_device *in_dev, struct ip_mc_list *pmc) { struct sk_buff *skb = NULL; + struct net *net = dev_net(in_dev->dev); int type; if (!pmc) { @@ -550,7 +546,7 @@ static int igmpv3_send_report(struct in_device *in_dev, struct ip_mc_list *pmc) if (pmc->multiaddr == IGMP_ALL_HOSTS) continue; if (ipv4_is_local_multicast(pmc->multiaddr) && - !sysctl_igmp_llm_reports) + !net->ipv4.sysctl_igmp_llm_reports) continue; spin_lock_bh(&pmc->lock); if (pmc->sfcount[MCAST_EXCLUDE]) @@ -686,7 +682,7 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc, if (type == IGMPV3_HOST_MEMBERSHIP_REPORT) return igmpv3_send_report(in_dev, pmc); - if (ipv4_is_local_multicast(group) && !sysctl_igmp_llm_reports) + if (ipv4_is_local_multicast(group) && !net->ipv4.sysctl_igmp_llm_reports) return 0; if (type == IGMP_HOST_LEAVE_MESSAGE) @@ -765,9 +761,10 @@ static void igmp_ifc_timer_expire(unsigned long data) static void igmp_ifc_event(struct in_device *in_dev) { + struct net *net = dev_net(in_dev->dev); if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev)) return; - in_dev->mr_ifc_count = in_dev->mr_qrv ?: sysctl_igmp_qrv; + in_dev->mr_ifc_count = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv; igmp_ifc_start_timer(in_dev, 1); } @@ -857,12 +854,13 @@ static int igmp_marksources(struct ip_mc_list *pmc, int nsrcs, __be32 *srcs) static bool igmp_heard_report(struct in_device *in_dev, __be32 group) { struct ip_mc_list *im; + struct net *net = dev_net(in_dev->dev); /* Timers are only set for non-local groups */ if (group == IGMP_ALL_HOSTS) return false; - if (ipv4_is_local_multicast(group) && !sysctl_igmp_llm_reports) + if (ipv4_is_local_multicast(group) && !net->ipv4.sysctl_igmp_llm_reports) return false; rcu_read_lock(); @@ -886,6 +884,7 @@ static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb, __be32 group = ih->group; int max_delay; int mark = 0; + struct net *net = dev_net(in_dev->dev); if (len == 8) { @@ -971,7 +970,7 @@ static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb, if (im->multiaddr == IGMP_ALL_HOSTS) continue; if (ipv4_is_local_multicast(im->multiaddr) && - !sysctl_igmp_llm_reports) + !net->ipv4.sysctl_igmp_llm_reports) continue; spin_lock_bh(&im->lock); if (im->tm_running) @@ -1087,6 +1086,7 @@ static void ip_mc_filter_del(struct in_device *in_dev, __be32 addr) static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im) { struct ip_mc_list *pmc; + struct net *net = dev_net(in_dev->dev); /* this is an "ip_mc_list" for convenience; only the fields below * are actually used. In particular, the refcnt and users are not @@ -1101,7 +1101,7 @@ static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im) pmc->interface = im->interface; in_dev_hold(in_dev); pmc->multiaddr = im->multiaddr; - pmc->crcount = in_dev->mr_qrv ?: sysctl_igmp_qrv; + pmc->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv; pmc->sfmode = im->sfmode; if (pmc->sfmode == MCAST_INCLUDE) { struct ip_sf_list *psf; @@ -1186,6 +1186,7 @@ static void igmp_group_dropped(struct ip_mc_list *im) { struct in_device *in_dev = im->interface; #ifdef CONFIG_IP_MULTICAST + struct net *net = dev_net(in_dev->dev); int reporter; #endif @@ -1197,7 +1198,7 @@ static void igmp_group_dropped(struct ip_mc_list *im) #ifdef CONFIG_IP_MULTICAST if (im->multiaddr == IGMP_ALL_HOSTS) return; - if (ipv4_is_local_multicast(im->multiaddr) && !sysctl_igmp_llm_reports) + if (ipv4_is_local_multicast(im->multiaddr) && !net->ipv4.sysctl_igmp_llm_reports) return; reporter = im->reporter; @@ -1222,6 +1223,9 @@ static void igmp_group_dropped(struct ip_mc_list *im) static void igmp_group_added(struct ip_mc_list *im) { struct in_device *in_dev = im->interface; +#ifdef CONFIG_IP_MULTICAST + struct net *net = dev_net(in_dev->dev); +#endif if (im->loaded == 0) { im->loaded = 1; @@ -1231,7 +1235,7 @@ static void igmp_group_added(struct ip_mc_list *im) #ifdef CONFIG_IP_MULTICAST if (im->multiaddr == IGMP_ALL_HOSTS) return; - if (ipv4_is_local_multicast(im->multiaddr) && !sysctl_igmp_llm_reports) + if (ipv4_is_local_multicast(im->multiaddr) && !net->ipv4.sysctl_igmp_llm_reports) return; if (in_dev->dead) @@ -1244,7 +1248,7 @@ static void igmp_group_added(struct ip_mc_list *im) } /* else, v3 */ - im->crcount = in_dev->mr_qrv ?: sysctl_igmp_qrv; + im->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv; igmp_ifc_event(in_dev); #endif } @@ -1313,6 +1317,9 @@ static void ip_mc_hash_remove(struct in_device *in_dev, void ip_mc_inc_group(struct in_device *in_dev, __be32 addr) { struct ip_mc_list *im; +#ifdef CONFIG_IP_MULTICAST + struct net *net = dev_net(in_dev->dev); +#endif ASSERT_RTNL(); @@ -1339,7 +1346,7 @@ void ip_mc_inc_group(struct in_device *in_dev, __be32 addr) spin_lock_init(&im->lock); #ifdef CONFIG_IP_MULTICAST setup_timer(&im->timer, igmp_timer_expire, (unsigned long)im); - im->unsolicit_count = sysctl_igmp_qrv; + im->unsolicit_count = net->ipv4.sysctl_igmp_qrv; #endif im->next_rcu = in_dev->mc_list; @@ -1532,6 +1539,7 @@ static void ip_mc_rejoin_groups(struct in_device *in_dev) #ifdef CONFIG_IP_MULTICAST struct ip_mc_list *im; int type; + struct net *net = dev_net(in_dev->dev); ASSERT_RTNL(); @@ -1539,7 +1547,7 @@ static void ip_mc_rejoin_groups(struct in_device *in_dev) if (im->multiaddr == IGMP_ALL_HOSTS) continue; if (ipv4_is_local_multicast(im->multiaddr) && - !sysctl_igmp_llm_reports) + !net->ipv4.sysctl_igmp_llm_reports) continue; /* a failover is happening and switches @@ -1638,6 +1646,9 @@ void ip_mc_down(struct in_device *in_dev) void ip_mc_init_dev(struct in_device *in_dev) { +#ifdef CONFIG_IP_MULTICAST + struct net *net = dev_net(in_dev->dev); +#endif ASSERT_RTNL(); #ifdef CONFIG_IP_MULTICAST @@ -1645,7 +1656,7 @@ void ip_mc_init_dev(struct in_device *in_dev) (unsigned long)in_dev); setup_timer(&in_dev->mr_ifc_timer, igmp_ifc_timer_expire, (unsigned long)in_dev); - in_dev->mr_qrv = sysctl_igmp_qrv; + in_dev->mr_qrv = net->ipv4.sysctl_igmp_qrv; #endif spin_lock_init(&in_dev->mc_tomb_lock); @@ -1656,11 +1667,14 @@ void ip_mc_init_dev(struct in_device *in_dev) void ip_mc_up(struct in_device *in_dev) { struct ip_mc_list *pmc; +#ifdef CONFIG_IP_MULTICAST + struct net *net = dev_net(in_dev->dev); +#endif ASSERT_RTNL(); #ifdef CONFIG_IP_MULTICAST - in_dev->mr_qrv = sysctl_igmp_qrv; + in_dev->mr_qrv = net->ipv4.sysctl_igmp_qrv; #endif ip_mc_inc_group(in_dev, IGMP_ALL_HOSTS); @@ -1726,11 +1740,6 @@ static struct in_device *ip_mc_find_dev(struct net *net, struct ip_mreqn *imr) /* * Join a socket to a group */ -int sysctl_igmp_max_memberships __read_mostly = IP_MAX_MEMBERSHIPS; -int sysctl_igmp_max_msf __read_mostly = IP_MAX_MSF; -#ifdef CONFIG_IP_MULTICAST -int sysctl_igmp_qrv __read_mostly = IGMP_QUERY_ROBUSTNESS_VARIABLE; -#endif static int ip_mc_del1_src(struct ip_mc_list *pmc, int sfmode, __be32 *psfsrc) @@ -1755,6 +1764,7 @@ static int ip_mc_del1_src(struct ip_mc_list *pmc, int sfmode, if (!psf->sf_count[MCAST_INCLUDE] && !psf->sf_count[MCAST_EXCLUDE]) { #ifdef CONFIG_IP_MULTICAST struct in_device *in_dev = pmc->interface; + struct net *net = dev_net(in_dev->dev); #endif /* no more filters for this source */ @@ -1765,7 +1775,7 @@ static int ip_mc_del1_src(struct ip_mc_list *pmc, int sfmode, #ifdef CONFIG_IP_MULTICAST if (psf->sf_oldin && !IGMP_V1_SEEN(in_dev) && !IGMP_V2_SEEN(in_dev)) { - psf->sf_crcount = in_dev->mr_qrv ?: sysctl_igmp_qrv; + psf->sf_crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv; psf->sf_next = pmc->tomb; pmc->tomb = psf; rv = 1; @@ -1823,12 +1833,13 @@ static int ip_mc_del_src(struct in_device *in_dev, __be32 *pmca, int sfmode, pmc->sfcount[MCAST_INCLUDE]) { #ifdef CONFIG_IP_MULTICAST struct ip_sf_list *psf; + struct net *net = dev_net(in_dev->dev); #endif /* filter mode change */ pmc->sfmode = MCAST_INCLUDE; #ifdef CONFIG_IP_MULTICAST - pmc->crcount = in_dev->mr_qrv ?: sysctl_igmp_qrv; + pmc->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv; in_dev->mr_ifc_count = pmc->crcount; for (psf = pmc->sources; psf; psf = psf->sf_next) psf->sf_crcount = 0; @@ -1995,6 +2006,7 @@ static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode, } else if (isexclude != (pmc->sfcount[MCAST_EXCLUDE] != 0)) { #ifdef CONFIG_IP_MULTICAST struct ip_sf_list *psf; + struct net *net = dev_net(pmc->interface->dev); in_dev = pmc->interface; #endif @@ -2006,7 +2018,7 @@ static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode, #ifdef CONFIG_IP_MULTICAST /* else no filters; keep old mode for reports */ - pmc->crcount = in_dev->mr_qrv ?: sysctl_igmp_qrv; + pmc->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv; in_dev->mr_ifc_count = pmc->crcount; for (psf = pmc->sources; psf; psf = psf->sf_next) psf->sf_crcount = 0; @@ -2073,7 +2085,7 @@ int ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr) count++; } err = -ENOBUFS; - if (count >= sysctl_igmp_max_memberships) + if (count >= net->ipv4.sysctl_igmp_max_memberships) goto done; iml = sock_kmalloc(sk, sizeof(*iml), GFP_KERNEL); if (!iml) @@ -2245,7 +2257,7 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct } /* else, add a new source to the filter */ - if (psl && psl->sl_count >= sysctl_igmp_max_msf) { + if (psl && psl->sl_count >= net->ipv4.sysctl_igmp_max_msf) { err = -ENOBUFS; goto done; } @@ -2918,6 +2930,12 @@ static int __net_init igmp_net_init(struct net *net) goto out_sock; } + /* Sysctl initialization */ + net->ipv4.sysctl_igmp_max_memberships = 20; + net->ipv4.sysctl_igmp_max_msf = 10; + /* IGMP reports for link-local multicast groups are enabled by default */ + net->ipv4.sysctl_igmp_llm_reports = 1; + net->ipv4.sysctl_igmp_qrv = 2; return 0; out_sock: diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 641489148..bc5196ea1 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -24,6 +24,7 @@ #include <net/tcp_states.h> #include <net/xfrm.h> #include <net/tcp.h> +#include <net/sock_reuseport.h> #ifdef INET_CSK_DEBUG const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n"; @@ -67,7 +68,8 @@ int inet_csk_bind_conflict(const struct sock *sk, if ((!reuse || !sk2->sk_reuse || sk2->sk_state == TCP_LISTEN) && (!reuseport || !sk2->sk_reuseport || - (sk2->sk_state != TCP_TIME_WAIT && + rcu_access_pointer(sk->sk_reuseport_cb) || + (sk2->sk_state != TCP_TIME_WAIT && !uid_eq(uid, sock_i_uid(sk2))))) { if (!sk2->sk_rcv_saddr || !sk->sk_rcv_saddr || @@ -89,161 +91,154 @@ EXPORT_SYMBOL_GPL(inet_csk_bind_conflict); /* Obtain a reference to a local port for the given sock, * if snum is zero it means select any available local port. + * We try to allocate an odd port (and leave even ports for connect()) */ int inet_csk_get_port(struct sock *sk, unsigned short snum) { - struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; + bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN; + struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo; + int ret = 1, attempts = 5, port = snum; + int smallest_size = -1, smallest_port; struct inet_bind_hashbucket *head; - struct inet_bind_bucket *tb; - int ret, attempts = 5; struct net *net = sock_net(sk); - int smallest_size = -1, smallest_rover; + int i, low, high, attempt_half; + struct inet_bind_bucket *tb; kuid_t uid = sock_i_uid(sk); - int attempt_half = (sk->sk_reuse == SK_CAN_REUSE) ? 1 : 0; + u32 remaining, offset; - local_bh_disable(); - if (!snum) { - int remaining, rover, low, high; + if (port) { +have_port: + head = &hinfo->bhash[inet_bhashfn(net, port, + hinfo->bhash_size)]; + spin_lock_bh(&head->lock); + inet_bind_bucket_for_each(tb, &head->chain) + if (net_eq(ib_net(tb), net) && tb->port == port) + goto tb_found; + goto tb_not_found; + } again: - inet_get_local_port_range(net, &low, &high); - if (attempt_half) { - int half = low + ((high - low) >> 1); - - if (attempt_half == 1) - high = half; - else - low = half; - } - remaining = (high - low) + 1; - smallest_rover = rover = prandom_u32() % remaining + low; - - smallest_size = -1; - do { - if (inet_is_local_reserved_port(net, rover)) - goto next_nolock; - head = &hashinfo->bhash[inet_bhashfn(net, rover, - hashinfo->bhash_size)]; - spin_lock(&head->lock); - inet_bind_bucket_for_each(tb, &head->chain) - if (net_eq(ib_net(tb), net) && tb->port == rover) { - if (((tb->fastreuse > 0 && - sk->sk_reuse && - sk->sk_state != TCP_LISTEN) || - (tb->fastreuseport > 0 && - sk->sk_reuseport && - uid_eq(tb->fastuid, uid))) && - (tb->num_owners < smallest_size || smallest_size == -1)) { - smallest_size = tb->num_owners; - smallest_rover = rover; - } - if (!inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false)) { - snum = rover; - goto tb_found; - } - goto next; + attempt_half = (sk->sk_reuse == SK_CAN_REUSE) ? 1 : 0; +other_half_scan: + inet_get_local_port_range(net, &low, &high); + high++; /* [32768, 60999] -> [32768, 61000[ */ + if (high - low < 4) + attempt_half = 0; + if (attempt_half) { + int half = low + (((high - low) >> 2) << 1); + + if (attempt_half == 1) + high = half; + else + low = half; + } + remaining = high - low; + if (likely(remaining > 1)) + remaining &= ~1U; + + offset = prandom_u32() % remaining; + /* __inet_hash_connect() favors ports having @low parity + * We do the opposite to not pollute connect() users. + */ + offset |= 1U; + smallest_size = -1; + smallest_port = low; /* avoid compiler warning */ + +other_parity_scan: + port = low + offset; + for (i = 0; i < remaining; i += 2, port += 2) { + if (unlikely(port >= high)) + port -= remaining; + if (inet_is_local_reserved_port(net, port)) + continue; + head = &hinfo->bhash[inet_bhashfn(net, port, + hinfo->bhash_size)]; + spin_lock_bh(&head->lock); + inet_bind_bucket_for_each(tb, &head->chain) + if (net_eq(ib_net(tb), net) && tb->port == port) { + if (((tb->fastreuse > 0 && reuse) || + (tb->fastreuseport > 0 && + sk->sk_reuseport && + !rcu_access_pointer(sk->sk_reuseport_cb) && + uid_eq(tb->fastuid, uid))) && + (tb->num_owners < smallest_size || smallest_size == -1)) { + smallest_size = tb->num_owners; + smallest_port = port; } - break; - next: - spin_unlock(&head->lock); - next_nolock: - if (++rover > high) - rover = low; - } while (--remaining > 0); - - /* Exhausted local port range during search? It is not - * possible for us to be holding one of the bind hash - * locks if this test triggers, because if 'remaining' - * drops to zero, we broke out of the do/while loop at - * the top level, not from the 'break;' statement. - */ - ret = 1; - if (remaining <= 0) { - if (smallest_size != -1) { - snum = smallest_rover; - goto have_snum; - } - if (attempt_half == 1) { - /* OK we now try the upper half of the range */ - attempt_half = 2; - goto again; + if (!inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false)) + goto tb_found; + goto next_port; } - goto fail; - } - /* OK, here is the one we will use. HEAD is - * non-NULL and we hold it's mutex. - */ - snum = rover; - } else { -have_snum: - head = &hashinfo->bhash[inet_bhashfn(net, snum, - hashinfo->bhash_size)]; - spin_lock(&head->lock); - inet_bind_bucket_for_each(tb, &head->chain) - if (net_eq(ib_net(tb), net) && tb->port == snum) - goto tb_found; + goto tb_not_found; +next_port: + spin_unlock_bh(&head->lock); + cond_resched(); + } + + if (smallest_size != -1) { + port = smallest_port; + goto have_port; } - tb = NULL; - goto tb_not_found; + offset--; + if (!(offset & 1)) + goto other_parity_scan; + + if (attempt_half == 1) { + /* OK we now try the upper half of the range */ + attempt_half = 2; + goto other_half_scan; + } + return ret; + +tb_not_found: + tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep, + net, head, port); + if (!tb) + goto fail_unlock; tb_found: if (!hlist_empty(&tb->owners)) { if (sk->sk_reuse == SK_FORCE_REUSE) goto success; - if (((tb->fastreuse > 0 && - sk->sk_reuse && sk->sk_state != TCP_LISTEN) || + if (((tb->fastreuse > 0 && reuse) || (tb->fastreuseport > 0 && + !rcu_access_pointer(sk->sk_reuseport_cb) && sk->sk_reuseport && uid_eq(tb->fastuid, uid))) && - smallest_size == -1) { + smallest_size == -1) goto success; - } else { - ret = 1; - if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, true)) { - if (((sk->sk_reuse && sk->sk_state != TCP_LISTEN) || - (tb->fastreuseport > 0 && - sk->sk_reuseport && uid_eq(tb->fastuid, uid))) && - smallest_size != -1 && --attempts >= 0) { - spin_unlock(&head->lock); - goto again; - } - - goto fail_unlock; + if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, true)) { + if ((reuse || + (tb->fastreuseport > 0 && + sk->sk_reuseport && + !rcu_access_pointer(sk->sk_reuseport_cb) && + uid_eq(tb->fastuid, uid))) && + smallest_size != -1 && --attempts >= 0) { + spin_unlock_bh(&head->lock); + goto again; } + goto fail_unlock; } - } -tb_not_found: - ret = 1; - if (!tb && (tb = inet_bind_bucket_create(hashinfo->bind_bucket_cachep, - net, head, snum)) == NULL) - goto fail_unlock; - if (hlist_empty(&tb->owners)) { - if (sk->sk_reuse && sk->sk_state != TCP_LISTEN) - tb->fastreuse = 1; - else + if (!reuse) tb->fastreuse = 0; + if (!sk->sk_reuseport || !uid_eq(tb->fastuid, uid)) + tb->fastreuseport = 0; + } else { + tb->fastreuse = reuse; if (sk->sk_reuseport) { tb->fastreuseport = 1; tb->fastuid = uid; - } else - tb->fastreuseport = 0; - } else { - if (tb->fastreuse && - (!sk->sk_reuse || sk->sk_state == TCP_LISTEN)) - tb->fastreuse = 0; - if (tb->fastreuseport && - (!sk->sk_reuseport || !uid_eq(tb->fastuid, uid))) + } else { tb->fastreuseport = 0; + } } success: if (!inet_csk(sk)->icsk_bind_hash) - inet_bind_hash(sk, tb, snum); + inet_bind_hash(sk, tb, port); WARN_ON(inet_csk(sk)->icsk_bind_hash != tb); ret = 0; fail_unlock: - spin_unlock(&head->lock); -fail: - local_bh_enable(); + spin_unlock_bh(&head->lock); return ret; } EXPORT_SYMBOL_GPL(inet_csk_get_port); @@ -482,10 +477,6 @@ EXPORT_SYMBOL_GPL(inet_csk_route_child_sock); #define AF_INET_FAMILY(fam) true #endif -/* Only thing we need from tcp.h */ -extern int sysctl_tcp_synack_retries; - - /* Decide when to expire the request and when to resend SYN-ACK */ static inline void syn_ack_recalc(struct request_sock *req, const int thresh, const int max_retries, @@ -557,6 +548,7 @@ static void reqsk_timer_handler(unsigned long data) { struct request_sock *req = (struct request_sock *)data; struct sock *sk_listener = req->rsk_listener; + struct net *net = sock_net(sk_listener); struct inet_connection_sock *icsk = inet_csk(sk_listener); struct request_sock_queue *queue = &icsk->icsk_accept_queue; int qlen, expire = 0, resend = 0; @@ -566,7 +558,7 @@ static void reqsk_timer_handler(unsigned long data) if (sk_state_load(sk_listener) != TCP_LISTEN) goto drop; - max_retries = icsk->icsk_syn_retries ? : sysctl_tcp_synack_retries; + max_retries = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_synack_retries; thresh = max_retries; /* Normally all the openreqs are young and become mature * (i.e. converted to established socket) for first timeout. @@ -737,6 +729,7 @@ int inet_csk_listen_start(struct sock *sk, int backlog) { struct inet_connection_sock *icsk = inet_csk(sk); struct inet_sock *inet = inet_sk(sk); + int err = -EADDRINUSE; reqsk_queue_alloc(&icsk->icsk_accept_queue); @@ -754,13 +747,14 @@ int inet_csk_listen_start(struct sock *sk, int backlog) inet->inet_sport = htons(inet->inet_num); sk_dst_reset(sk); - sk->sk_prot->hash(sk); + err = sk->sk_prot->hash(sk); - return 0; + if (likely(!err)) + return 0; } sk->sk_state = TCP_CLOSE; - return -EADDRINUSE; + return err; } EXPORT_SYMBOL_GPL(inet_csk_listen_start); diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 6029157a1..5fdb02f55 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -357,18 +357,18 @@ struct sock *inet_diag_find_one_icsk(struct net *net, struct sock *sk; if (req->sdiag_family == AF_INET) - sk = inet_lookup(net, hashinfo, req->id.idiag_dst[0], + sk = inet_lookup(net, hashinfo, NULL, 0, req->id.idiag_dst[0], req->id.idiag_dport, req->id.idiag_src[0], req->id.idiag_sport, req->id.idiag_if); #if IS_ENABLED(CONFIG_IPV6) else if (req->sdiag_family == AF_INET6) { if (ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_dst) && ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_src)) - sk = inet_lookup(net, hashinfo, req->id.idiag_dst[3], + sk = inet_lookup(net, hashinfo, NULL, 0, req->id.idiag_dst[3], req->id.idiag_dport, req->id.idiag_src[3], req->id.idiag_sport, req->id.idiag_if); else - sk = inet6_lookup(net, hashinfo, + sk = inet6_lookup(net, hashinfo, NULL, 0, (struct in6_addr *)req->id.idiag_dst, req->id.idiag_dport, (struct in6_addr *)req->id.idiag_src, @@ -879,6 +879,7 @@ next_normal: } spin_unlock_bh(lock); + cond_resched(); } done: diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index ccc598079..0d9e9d7bb 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -20,10 +20,12 @@ #include <linux/wait.h> #include <linux/vmalloc.h> +#include <net/addrconf.h> #include <net/inet_connection_sock.h> #include <net/inet_hashtables.h> #include <net/secure_seq.h> #include <net/ip.h> +#include <net/sock_reuseport.h> static u32 inet_ehashfn(const struct net *net, const __be32 laddr, const __u16 lport, const __be32 faddr, @@ -205,6 +207,7 @@ static inline int compute_score(struct sock *sk, struct net *net, struct sock *__inet_lookup_listener(struct net *net, struct inet_hashinfo *hashinfo, + struct sk_buff *skb, int doff, const __be32 saddr, __be16 sport, const __be32 daddr, const unsigned short hnum, const int dif) @@ -214,6 +217,7 @@ struct sock *__inet_lookup_listener(struct net *net, unsigned int hash = inet_lhashfn(net, hnum); struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash]; int score, hiscore, matches = 0, reuseport = 0; + bool select_ok = true; u32 phash = 0; rcu_read_lock(); @@ -229,6 +233,15 @@ begin: if (reuseport) { phash = inet_ehashfn(net, daddr, hnum, saddr, sport); + if (select_ok) { + struct sock *sk2; + sk2 = reuseport_select_sock(sk, phash, + skb, doff); + if (sk2) { + result = sk2; + goto found; + } + } matches = 1; } } else if (score == hiscore && reuseport) { @@ -246,11 +259,13 @@ begin: if (get_nulls_value(node) != hash + LISTENING_NULLS_BASE) goto begin; if (result) { +found: if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt))) result = NULL; else if (unlikely(compute_score(result, net, hnum, daddr, dif) < hiscore)) { sock_put(result); + select_ok = false; goto begin; } } @@ -449,32 +464,76 @@ bool inet_ehash_nolisten(struct sock *sk, struct sock *osk) } EXPORT_SYMBOL_GPL(inet_ehash_nolisten); -void __inet_hash(struct sock *sk, struct sock *osk) +static int inet_reuseport_add_sock(struct sock *sk, + struct inet_listen_hashbucket *ilb, + int (*saddr_same)(const struct sock *sk1, + const struct sock *sk2, + bool match_wildcard)) +{ + struct inet_bind_bucket *tb = inet_csk(sk)->icsk_bind_hash; + struct sock *sk2; + struct hlist_nulls_node *node; + kuid_t uid = sock_i_uid(sk); + + sk_nulls_for_each_rcu(sk2, node, &ilb->head) { + if (sk2 != sk && + sk2->sk_family == sk->sk_family && + ipv6_only_sock(sk2) == ipv6_only_sock(sk) && + sk2->sk_bound_dev_if == sk->sk_bound_dev_if && + inet_csk(sk2)->icsk_bind_hash == tb && + sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)) && + saddr_same(sk, sk2, false)) + return reuseport_add_sock(sk, sk2); + } + + /* Initial allocation may have already happened via setsockopt */ + if (!rcu_access_pointer(sk->sk_reuseport_cb)) + return reuseport_alloc(sk); + return 0; +} + +int __inet_hash(struct sock *sk, struct sock *osk, + int (*saddr_same)(const struct sock *sk1, + const struct sock *sk2, + bool match_wildcard)) { struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; struct inet_listen_hashbucket *ilb; + int err = 0; if (sk->sk_state != TCP_LISTEN) { inet_ehash_nolisten(sk, osk); - return; + return 0; } WARN_ON(!sk_unhashed(sk)); ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)]; spin_lock(&ilb->lock); + if (sk->sk_reuseport) { + err = inet_reuseport_add_sock(sk, ilb, saddr_same); + if (err) + goto unlock; + } __sk_nulls_add_node_rcu(sk, &ilb->head); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); +unlock: spin_unlock(&ilb->lock); + + return err; } EXPORT_SYMBOL(__inet_hash); -void inet_hash(struct sock *sk) +int inet_hash(struct sock *sk) { + int err = 0; + if (sk->sk_state != TCP_CLOSE) { local_bh_disable(); - __inet_hash(sk, NULL); + err = __inet_hash(sk, NULL, ipv4_rcv_saddr_equal); local_bh_enable(); } + + return err; } EXPORT_SYMBOL_GPL(inet_hash); @@ -493,6 +552,8 @@ void inet_unhash(struct sock *sk) lock = inet_ehash_lockp(hashinfo, sk->sk_hash); spin_lock_bh(lock); + if (rcu_access_pointer(sk->sk_reuseport_cb)) + reuseport_detach_sock(sk); done = __sk_nulls_del_node_init_rcu(sk); if (done) sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); @@ -506,106 +567,106 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, struct sock *, __u16, struct inet_timewait_sock **)) { struct inet_hashinfo *hinfo = death_row->hashinfo; - const unsigned short snum = inet_sk(sk)->inet_num; + struct inet_timewait_sock *tw = NULL; struct inet_bind_hashbucket *head; - struct inet_bind_bucket *tb; - int ret; + int port = inet_sk(sk)->inet_num; struct net *net = sock_net(sk); + struct inet_bind_bucket *tb; + u32 remaining, offset; + int ret, i, low, high; + static u32 hint; + + if (port) { + head = &hinfo->bhash[inet_bhashfn(net, port, + hinfo->bhash_size)]; + tb = inet_csk(sk)->icsk_bind_hash; + spin_lock_bh(&head->lock); + if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) { + inet_ehash_nolisten(sk, NULL); + spin_unlock_bh(&head->lock); + return 0; + } + spin_unlock(&head->lock); + /* No definite answer... Walk to established hash table */ + ret = check_established(death_row, sk, port, NULL); + local_bh_enable(); + return ret; + } - if (!snum) { - int i, remaining, low, high, port; - static u32 hint; - u32 offset = hint + port_offset; - struct inet_timewait_sock *tw = NULL; + inet_get_local_port_range(net, &low, &high); + high++; /* [32768, 60999] -> [32768, 61000[ */ + remaining = high - low; + if (likely(remaining > 1)) + remaining &= ~1U; - inet_get_local_port_range(net, &low, &high); - remaining = (high - low) + 1; + offset = (hint + port_offset) % remaining; + /* In first pass we try ports of @low parity. + * inet_csk_get_port() does the opposite choice. + */ + offset &= ~1U; +other_parity_scan: + port = low + offset; + for (i = 0; i < remaining; i += 2, port += 2) { + if (unlikely(port >= high)) + port -= remaining; + if (inet_is_local_reserved_port(net, port)) + continue; + head = &hinfo->bhash[inet_bhashfn(net, port, + hinfo->bhash_size)]; + spin_lock_bh(&head->lock); - /* By starting with offset being an even number, - * we tend to leave about 50% of ports for other uses, - * like bind(0). + /* Does not bother with rcv_saddr checks, because + * the established check is already unique enough. */ - offset &= ~1; - - local_bh_disable(); - for (i = 0; i < remaining; i++) { - port = low + (i + offset) % remaining; - if (inet_is_local_reserved_port(net, port)) - continue; - head = &hinfo->bhash[inet_bhashfn(net, port, - hinfo->bhash_size)]; - spin_lock(&head->lock); - - /* Does not bother with rcv_saddr checks, - * because the established check is already - * unique enough. - */ - inet_bind_bucket_for_each(tb, &head->chain) { - if (net_eq(ib_net(tb), net) && - tb->port == port) { - if (tb->fastreuse >= 0 || - tb->fastreuseport >= 0) - goto next_port; - WARN_ON(hlist_empty(&tb->owners)); - if (!check_established(death_row, sk, - port, &tw)) - goto ok; + inet_bind_bucket_for_each(tb, &head->chain) { + if (net_eq(ib_net(tb), net) && tb->port == port) { + if (tb->fastreuse >= 0 || + tb->fastreuseport >= 0) goto next_port; - } + WARN_ON(hlist_empty(&tb->owners)); + if (!check_established(death_row, sk, + port, &tw)) + goto ok; + goto next_port; } - - tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep, - net, head, port); - if (!tb) { - spin_unlock(&head->lock); - break; - } - tb->fastreuse = -1; - tb->fastreuseport = -1; - goto ok; - - next_port: - spin_unlock(&head->lock); } - local_bh_enable(); - - return -EADDRNOTAVAIL; -ok: - hint += (i + 2) & ~1; - - /* Head lock still held and bh's disabled */ - inet_bind_hash(sk, tb, port); - if (sk_unhashed(sk)) { - inet_sk(sk)->inet_sport = htons(port); - inet_ehash_nolisten(sk, (struct sock *)tw); + tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep, + net, head, port); + if (!tb) { + spin_unlock_bh(&head->lock); + return -ENOMEM; } - if (tw) - inet_twsk_bind_unhash(tw, hinfo); - spin_unlock(&head->lock); + tb->fastreuse = -1; + tb->fastreuseport = -1; + goto ok; +next_port: + spin_unlock_bh(&head->lock); + cond_resched(); + } - if (tw) - inet_twsk_deschedule_put(tw); + offset++; + if ((offset & 1) && remaining > 1) + goto other_parity_scan; - ret = 0; - goto out; - } + return -EADDRNOTAVAIL; - head = &hinfo->bhash[inet_bhashfn(net, snum, hinfo->bhash_size)]; - tb = inet_csk(sk)->icsk_bind_hash; - spin_lock_bh(&head->lock); - if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) { - inet_ehash_nolisten(sk, NULL); - spin_unlock_bh(&head->lock); - return 0; - } else { - spin_unlock(&head->lock); - /* No definite answer... Walk to established hash table */ - ret = check_established(death_row, sk, snum, NULL); -out: - local_bh_enable(); - return ret; +ok: + hint += i + 2; + + /* Head lock still held and bh's disabled */ + inet_bind_hash(sk, tb, port); + if (sk_unhashed(sk)) { + inet_sk(sk)->inet_sport = htons(port); + inet_ehash_nolisten(sk, (struct sock *)tw); } + if (tw) + inet_twsk_bind_unhash(tw, hinfo); + spin_unlock(&head->lock); + if (tw) + inet_twsk_deschedule_put(tw); + local_bh_enable(); + return 0; } /* diff --git a/net/ipv4/inet_lro.c b/net/ipv4/inet_lro.c deleted file mode 100644 index f17ea49b2..000000000 --- a/net/ipv4/inet_lro.c +++ /dev/null @@ -1,374 +0,0 @@ -/* - * linux/net/ipv4/inet_lro.c - * - * Large Receive Offload (ipv4 / tcp) - * - * (C) Copyright IBM Corp. 2007 - * - * Authors: - * Jan-Bernd Themann <themann@de.ibm.com> - * Christoph Raisch <raisch@de.ibm.com> - * - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2, or (at your option) - * any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - */ - - -#include <linux/module.h> -#include <linux/if_vlan.h> -#include <linux/inet_lro.h> -#include <net/checksum.h> - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Jan-Bernd Themann <themann@de.ibm.com>"); -MODULE_DESCRIPTION("Large Receive Offload (ipv4 / tcp)"); - -#define TCP_HDR_LEN(tcph) (tcph->doff << 2) -#define IP_HDR_LEN(iph) (iph->ihl << 2) -#define TCP_PAYLOAD_LENGTH(iph, tcph) \ - (ntohs(iph->tot_len) - IP_HDR_LEN(iph) - TCP_HDR_LEN(tcph)) - -#define IPH_LEN_WO_OPTIONS 5 -#define TCPH_LEN_WO_OPTIONS 5 -#define TCPH_LEN_W_TIMESTAMP 8 - -#define LRO_MAX_PG_HLEN 64 - -#define LRO_INC_STATS(lro_mgr, attr) { lro_mgr->stats.attr++; } - -/* - * Basic tcp checks whether packet is suitable for LRO - */ - -static int lro_tcp_ip_check(const struct iphdr *iph, const struct tcphdr *tcph, - int len, const struct net_lro_desc *lro_desc) -{ - /* check ip header: don't aggregate padded frames */ - if (ntohs(iph->tot_len) != len) - return -1; - - if (TCP_PAYLOAD_LENGTH(iph, tcph) == 0) - return -1; - - if (iph->ihl != IPH_LEN_WO_OPTIONS) - return -1; - - if (tcph->cwr || tcph->ece || tcph->urg || !tcph->ack || - tcph->rst || tcph->syn || tcph->fin) - return -1; - - if (INET_ECN_is_ce(ipv4_get_dsfield(iph))) - return -1; - - if (tcph->doff != TCPH_LEN_WO_OPTIONS && - tcph->doff != TCPH_LEN_W_TIMESTAMP) - return -1; - - /* check tcp options (only timestamp allowed) */ - if (tcph->doff == TCPH_LEN_W_TIMESTAMP) { - __be32 *topt = (__be32 *)(tcph + 1); - - if (*topt != htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) - | (TCPOPT_TIMESTAMP << 8) - | TCPOLEN_TIMESTAMP)) - return -1; - - /* timestamp should be in right order */ - topt++; - if (lro_desc && after(ntohl(lro_desc->tcp_rcv_tsval), - ntohl(*topt))) - return -1; - - /* timestamp reply should not be zero */ - topt++; - if (*topt == 0) - return -1; - } - - return 0; -} - -static void lro_update_tcp_ip_header(struct net_lro_desc *lro_desc) -{ - struct iphdr *iph = lro_desc->iph; - struct tcphdr *tcph = lro_desc->tcph; - __be32 *p; - __wsum tcp_hdr_csum; - - tcph->ack_seq = lro_desc->tcp_ack; - tcph->window = lro_desc->tcp_window; - - if (lro_desc->tcp_saw_tstamp) { - p = (__be32 *)(tcph + 1); - *(p+2) = lro_desc->tcp_rcv_tsecr; - } - - csum_replace2(&iph->check, iph->tot_len, htons(lro_desc->ip_tot_len)); - iph->tot_len = htons(lro_desc->ip_tot_len); - - tcph->check = 0; - tcp_hdr_csum = csum_partial(tcph, TCP_HDR_LEN(tcph), 0); - lro_desc->data_csum = csum_add(lro_desc->data_csum, tcp_hdr_csum); - tcph->check = csum_tcpudp_magic(iph->saddr, iph->daddr, - lro_desc->ip_tot_len - - IP_HDR_LEN(iph), IPPROTO_TCP, - lro_desc->data_csum); -} - -static __wsum lro_tcp_data_csum(struct iphdr *iph, struct tcphdr *tcph, int len) -{ - __wsum tcp_csum; - __wsum tcp_hdr_csum; - __wsum tcp_ps_hdr_csum; - - tcp_csum = ~csum_unfold(tcph->check); - tcp_hdr_csum = csum_partial(tcph, TCP_HDR_LEN(tcph), tcp_csum); - - tcp_ps_hdr_csum = csum_tcpudp_nofold(iph->saddr, iph->daddr, - len + TCP_HDR_LEN(tcph), - IPPROTO_TCP, 0); - - return csum_sub(csum_sub(tcp_csum, tcp_hdr_csum), - tcp_ps_hdr_csum); -} - -static void lro_init_desc(struct net_lro_desc *lro_desc, struct sk_buff *skb, - struct iphdr *iph, struct tcphdr *tcph) -{ - int nr_frags; - __be32 *ptr; - u32 tcp_data_len = TCP_PAYLOAD_LENGTH(iph, tcph); - - nr_frags = skb_shinfo(skb)->nr_frags; - lro_desc->parent = skb; - lro_desc->next_frag = &(skb_shinfo(skb)->frags[nr_frags]); - lro_desc->iph = iph; - lro_desc->tcph = tcph; - lro_desc->tcp_next_seq = ntohl(tcph->seq) + tcp_data_len; - lro_desc->tcp_ack = tcph->ack_seq; - lro_desc->tcp_window = tcph->window; - - lro_desc->pkt_aggr_cnt = 1; - lro_desc->ip_tot_len = ntohs(iph->tot_len); - - if (tcph->doff == 8) { - ptr = (__be32 *)(tcph+1); - lro_desc->tcp_saw_tstamp = 1; - lro_desc->tcp_rcv_tsval = *(ptr+1); - lro_desc->tcp_rcv_tsecr = *(ptr+2); - } - - lro_desc->mss = tcp_data_len; - lro_desc->active = 1; - - lro_desc->data_csum = lro_tcp_data_csum(iph, tcph, - tcp_data_len); -} - -static inline void lro_clear_desc(struct net_lro_desc *lro_desc) -{ - memset(lro_desc, 0, sizeof(struct net_lro_desc)); -} - -static void lro_add_common(struct net_lro_desc *lro_desc, struct iphdr *iph, - struct tcphdr *tcph, int tcp_data_len) -{ - struct sk_buff *parent = lro_desc->parent; - __be32 *topt; - - lro_desc->pkt_aggr_cnt++; - lro_desc->ip_tot_len += tcp_data_len; - lro_desc->tcp_next_seq += tcp_data_len; - lro_desc->tcp_window = tcph->window; - lro_desc->tcp_ack = tcph->ack_seq; - - /* don't update tcp_rcv_tsval, would not work with PAWS */ - if (lro_desc->tcp_saw_tstamp) { - topt = (__be32 *) (tcph + 1); - lro_desc->tcp_rcv_tsecr = *(topt + 2); - } - - lro_desc->data_csum = csum_block_add(lro_desc->data_csum, - lro_tcp_data_csum(iph, tcph, - tcp_data_len), - parent->len); - - parent->len += tcp_data_len; - parent->data_len += tcp_data_len; - if (tcp_data_len > lro_desc->mss) - lro_desc->mss = tcp_data_len; -} - -static void lro_add_packet(struct net_lro_desc *lro_desc, struct sk_buff *skb, - struct iphdr *iph, struct tcphdr *tcph) -{ - struct sk_buff *parent = lro_desc->parent; - int tcp_data_len = TCP_PAYLOAD_LENGTH(iph, tcph); - - lro_add_common(lro_desc, iph, tcph, tcp_data_len); - - skb_pull(skb, (skb->len - tcp_data_len)); - parent->truesize += skb->truesize; - - if (lro_desc->last_skb) - lro_desc->last_skb->next = skb; - else - skb_shinfo(parent)->frag_list = skb; - - lro_desc->last_skb = skb; -} - - -static int lro_check_tcp_conn(struct net_lro_desc *lro_desc, - struct iphdr *iph, - struct tcphdr *tcph) -{ - if ((lro_desc->iph->saddr != iph->saddr) || - (lro_desc->iph->daddr != iph->daddr) || - (lro_desc->tcph->source != tcph->source) || - (lro_desc->tcph->dest != tcph->dest)) - return -1; - return 0; -} - -static struct net_lro_desc *lro_get_desc(struct net_lro_mgr *lro_mgr, - struct net_lro_desc *lro_arr, - struct iphdr *iph, - struct tcphdr *tcph) -{ - struct net_lro_desc *lro_desc = NULL; - struct net_lro_desc *tmp; - int max_desc = lro_mgr->max_desc; - int i; - - for (i = 0; i < max_desc; i++) { - tmp = &lro_arr[i]; - if (tmp->active) - if (!lro_check_tcp_conn(tmp, iph, tcph)) { - lro_desc = tmp; - goto out; - } - } - - for (i = 0; i < max_desc; i++) { - if (!lro_arr[i].active) { - lro_desc = &lro_arr[i]; - goto out; - } - } - - LRO_INC_STATS(lro_mgr, no_desc); -out: - return lro_desc; -} - -static void lro_flush(struct net_lro_mgr *lro_mgr, - struct net_lro_desc *lro_desc) -{ - if (lro_desc->pkt_aggr_cnt > 1) - lro_update_tcp_ip_header(lro_desc); - - skb_shinfo(lro_desc->parent)->gso_size = lro_desc->mss; - - if (lro_mgr->features & LRO_F_NAPI) - netif_receive_skb(lro_desc->parent); - else - netif_rx(lro_desc->parent); - - LRO_INC_STATS(lro_mgr, flushed); - lro_clear_desc(lro_desc); -} - -static int __lro_proc_skb(struct net_lro_mgr *lro_mgr, struct sk_buff *skb, - void *priv) -{ - struct net_lro_desc *lro_desc; - struct iphdr *iph; - struct tcphdr *tcph; - u64 flags; - int vlan_hdr_len = 0; - - if (!lro_mgr->get_skb_header || - lro_mgr->get_skb_header(skb, (void *)&iph, (void *)&tcph, - &flags, priv)) - goto out; - - if (!(flags & LRO_IPV4) || !(flags & LRO_TCP)) - goto out; - - lro_desc = lro_get_desc(lro_mgr, lro_mgr->lro_arr, iph, tcph); - if (!lro_desc) - goto out; - - if ((skb->protocol == htons(ETH_P_8021Q)) && - !(lro_mgr->features & LRO_F_EXTRACT_VLAN_ID)) - vlan_hdr_len = VLAN_HLEN; - - if (!lro_desc->active) { /* start new lro session */ - if (lro_tcp_ip_check(iph, tcph, skb->len - vlan_hdr_len, NULL)) - goto out; - - skb->ip_summed = lro_mgr->ip_summed_aggr; - lro_init_desc(lro_desc, skb, iph, tcph); - LRO_INC_STATS(lro_mgr, aggregated); - return 0; - } - - if (lro_desc->tcp_next_seq != ntohl(tcph->seq)) - goto out2; - - if (lro_tcp_ip_check(iph, tcph, skb->len, lro_desc)) - goto out2; - - lro_add_packet(lro_desc, skb, iph, tcph); - LRO_INC_STATS(lro_mgr, aggregated); - - if ((lro_desc->pkt_aggr_cnt >= lro_mgr->max_aggr) || - lro_desc->parent->len > (0xFFFF - lro_mgr->dev->mtu)) - lro_flush(lro_mgr, lro_desc); - - return 0; - -out2: /* send aggregated SKBs to stack */ - lro_flush(lro_mgr, lro_desc); - -out: - return 1; -} - -void lro_receive_skb(struct net_lro_mgr *lro_mgr, - struct sk_buff *skb, - void *priv) -{ - if (__lro_proc_skb(lro_mgr, skb, priv)) { - if (lro_mgr->features & LRO_F_NAPI) - netif_receive_skb(skb); - else - netif_rx(skb); - } -} -EXPORT_SYMBOL(lro_receive_skb); - -void lro_flush_all(struct net_lro_mgr *lro_mgr) -{ - int i; - struct net_lro_desc *lro_desc = lro_mgr->lro_arr; - - for (i = 0; i < lro_mgr->max_desc; i++) { - if (lro_desc[i].active) - lro_flush(lro_mgr, &lro_desc[i]); - } -} -EXPORT_SYMBOL(lro_flush_all); diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index da0d7ce85..af18f1e48 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -71,7 +71,6 @@ static int ip_forward_finish(struct net *net, struct sock *sk, struct sk_buff *s if (unlikely(opt->optlen)) ip_forward_options(skb); - skb_sender_cpu_clear(skb); return dst_output(net, sk, skb); } diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 187c6fcc3..efbd47d1a 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -54,8 +54,6 @@ * code now. If you change something here, _PLEASE_ update ipv6/reassembly.c * as well. Or notify me, at least. --ANK */ - -static int sysctl_ipfrag_max_dist __read_mostly = 64; static const char ip_frag_cache_name[] = "ip4-frags"; struct ipfrag_skb_cb @@ -150,7 +148,7 @@ static void ip4_frag_init(struct inet_frag_queue *q, const void *a) qp->daddr = arg->iph->daddr; qp->vif = arg->vif; qp->user = arg->user; - qp->peer = sysctl_ipfrag_max_dist ? + qp->peer = q->net->max_dist ? inet_getpeer_v4(net->ipv4.peers, arg->iph->saddr, arg->vif, 1) : NULL; } @@ -275,7 +273,7 @@ static struct ipq *ip_find(struct net *net, struct iphdr *iph, static int ip_frag_too_far(struct ipq *qp) { struct inet_peer *peer = qp->peer; - unsigned int max = sysctl_ipfrag_max_dist; + unsigned int max = qp->q.net->max_dist; unsigned int start, end; int rc; @@ -749,6 +747,14 @@ static struct ctl_table ip4_frags_ns_ctl_table[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, + { + .procname = "ipfrag_max_dist", + .data = &init_net.ipv4.frags.max_dist, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero + }, { } }; @@ -762,14 +768,6 @@ static struct ctl_table ip4_frags_ctl_table[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, - { - .procname = "ipfrag_max_dist", - .data = &sysctl_ipfrag_max_dist, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &zero - }, { } }; @@ -790,10 +788,7 @@ static int __net_init ip4_frags_ns_ctl_register(struct net *net) table[1].data = &net->ipv4.frags.low_thresh; table[1].extra2 = &net->ipv4.frags.high_thresh; table[2].data = &net->ipv4.frags.timeout; - - /* Don't export sysctls to unprivileged users */ - if (net->user_ns != &init_user_ns) - table[0].procname = NULL; + table[3].data = &net->ipv4.frags.max_dist; } hdr = register_net_sysctl(net, "net/ipv4", table); @@ -865,6 +860,8 @@ static int __net_init ipv4_frags_init_net(struct net *net) */ net->ipv4.frags.timeout = IP_FRAG_TIME; + net->ipv4.frags.max_dist = 64; + res = inet_frags_init_net(&net->ipv4.frags); if (res) return res; diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 41ba68de4..4cc84212c 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -179,6 +179,7 @@ static __be16 tnl_flags_to_gre_flags(__be16 tflags) return flags; } +/* Fills in tpi and returns header length to be pulled. */ static int parse_gre_header(struct sk_buff *skb, struct tnl_ptk_info *tpi, bool *csum_err) { @@ -238,7 +239,7 @@ static int parse_gre_header(struct sk_buff *skb, struct tnl_ptk_info *tpi, return -EINVAL; } } - return iptunnel_pull_header(skb, hdr_len, tpi->proto); + return hdr_len; } static void ipgre_err(struct sk_buff *skb, u32 info, @@ -341,7 +342,7 @@ static void gre_err(struct sk_buff *skb, u32 info) struct tnl_ptk_info tpi; bool csum_err = false; - if (parse_gre_header(skb, &tpi, &csum_err)) { + if (parse_gre_header(skb, &tpi, &csum_err) < 0) { if (!csum_err) /* ignore csum errors. */ return; } @@ -397,7 +398,10 @@ static int ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi) iph->saddr, iph->daddr, tpi->key); if (tunnel) { - skb_pop_mac_header(skb); + if (tunnel->dev->type != ARPHRD_NONE) + skb_pop_mac_header(skb); + else + skb_reset_mac_header(skb); if (tunnel->collect_md) { __be16 flags; __be64 tun_id; @@ -419,6 +423,7 @@ static int gre_rcv(struct sk_buff *skb) { struct tnl_ptk_info tpi; bool csum_err = false; + int hdr_len; #ifdef CONFIG_NET_IPGRE_BROADCAST if (ipv4_is_multicast(ip_hdr(skb)->daddr)) { @@ -428,7 +433,10 @@ static int gre_rcv(struct sk_buff *skb) } #endif - if (parse_gre_header(skb, &tpi, &csum_err) < 0) + hdr_len = parse_gre_header(skb, &tpi, &csum_err); + if (hdr_len < 0) + goto drop; + if (iptunnel_pull_header(skb, hdr_len, tpi.proto, false) < 0) goto drop; if (ipgre_rcv(skb, &tpi) == PACKET_RCVD) @@ -440,6 +448,17 @@ drop: return 0; } +static __sum16 gre_checksum(struct sk_buff *skb) +{ + __wsum csum; + + if (skb->ip_summed == CHECKSUM_PARTIAL) + csum = lco_csum(skb); + else + csum = skb_checksum(skb, 0, skb->len, 0); + return csum_fold(csum); +} + static void build_header(struct sk_buff *skb, int hdr_len, __be16 flags, __be16 proto, __be32 key, __be32 seq) { @@ -467,8 +486,7 @@ static void build_header(struct sk_buff *skb, int hdr_len, __be16 flags, !(skb_shinfo(skb)->gso_type & (SKB_GSO_GRE | SKB_GSO_GRE_CSUM))) { *ptr = 0; - *(__sum16 *)ptr = csum_fold(skb_checksum(skb, 0, - skb->len, 0)); + *(__sum16 *)ptr = gre_checksum(skb); } } } @@ -493,8 +511,7 @@ static void __gre_xmit(struct sk_buff *skb, struct net_device *dev, static struct sk_buff *gre_handle_offloads(struct sk_buff *skb, bool csum) { - return iptunnel_handle_offloads(skb, csum, - csum ? SKB_GSO_GRE_CSUM : SKB_GSO_GRE); + return iptunnel_handle_offloads(skb, csum ? SKB_GSO_GRE_CSUM : SKB_GSO_GRE); } static struct rtable *gre_get_rt(struct sk_buff *skb, @@ -514,15 +531,17 @@ static struct rtable *gre_get_rt(struct sk_buff *skb, return ip_route_output_key(net, fl); } -static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev) +static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev, + __be16 proto) { struct ip_tunnel_info *tun_info; const struct ip_tunnel_key *key; + struct rtable *rt = NULL; struct flowi4 fl; - struct rtable *rt; int min_headroom; int tunnel_hlen; __be16 df, flags; + bool use_cache; int err; tun_info = skb_tunnel_info(skb); @@ -531,9 +550,17 @@ static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev) goto err_free_skb; key = &tun_info->key; - rt = gre_get_rt(skb, dev, &fl, key); - if (IS_ERR(rt)) - goto err_free_skb; + use_cache = ip_tunnel_dst_cache_usable(skb, tun_info); + if (use_cache) + rt = dst_cache_get_ip4(&tun_info->dst_cache, &fl.saddr); + if (!rt) { + rt = gre_get_rt(skb, dev, &fl, key); + if (IS_ERR(rt)) + goto err_free_skb; + if (use_cache) + dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst, + fl.saddr); + } tunnel_hlen = ip_gre_calc_hlen(key->tun_flags); @@ -557,7 +584,7 @@ static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev) } flags = tun_info->key.tun_flags & (TUNNEL_CSUM | TUNNEL_KEY); - build_header(skb, tunnel_hlen, flags, htons(ETH_P_TEB), + build_header(skb, tunnel_hlen, flags, proto, tunnel_id_to_key(tun_info->key.tun_id), 0); df = key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0; @@ -598,7 +625,7 @@ static netdev_tx_t ipgre_xmit(struct sk_buff *skb, const struct iphdr *tnl_params; if (tunnel->collect_md) { - gre_fb_xmit(skb, dev); + gre_fb_xmit(skb, dev, skb->protocol); return NETDEV_TX_OK; } @@ -642,7 +669,7 @@ static netdev_tx_t gre_tap_xmit(struct sk_buff *skb, struct ip_tunnel *tunnel = netdev_priv(dev); if (tunnel->collect_md) { - gre_fb_xmit(skb, dev); + gre_fb_xmit(skb, dev, htons(ETH_P_TEB)); return NETDEV_TX_OK; } @@ -844,9 +871,16 @@ static void __gre_tunnel_init(struct net_device *dev) dev->hw_features |= GRE_FEATURES; if (!(tunnel->parms.o_flags & TUNNEL_SEQ)) { - /* TCP offload with GRE SEQ is not supported. */ - dev->features |= NETIF_F_GSO_SOFTWARE; - dev->hw_features |= NETIF_F_GSO_SOFTWARE; + /* TCP offload with GRE SEQ is not supported, nor + * can we support 2 levels of outer headers requiring + * an update. + */ + if (!(tunnel->parms.o_flags & TUNNEL_CSUM) || + (tunnel->encap.type == TUNNEL_ENCAP_NONE)) { + dev->features |= NETIF_F_GSO_SOFTWARE; + dev->hw_features |= NETIF_F_GSO_SOFTWARE; + } + /* Can use a lockless transmit, unless we generate * output sequences */ @@ -868,7 +902,7 @@ static int ipgre_tunnel_init(struct net_device *dev) netif_keep_dst(dev); dev->addr_len = 4; - if (iph->daddr) { + if (iph->daddr && !tunnel->collect_md) { #ifdef CONFIG_NET_IPGRE_BROADCAST if (ipv4_is_multicast(iph->daddr)) { if (!iph->saddr) @@ -877,8 +911,9 @@ static int ipgre_tunnel_init(struct net_device *dev) dev->header_ops = &ipgre_header_ops; } #endif - } else + } else if (!tunnel->collect_md) { dev->header_ops = &ipgre_header_ops; + } return ip_tunnel_init(dev); } @@ -921,6 +956,11 @@ static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[]) if (flags & (GRE_VERSION|GRE_ROUTING)) return -EINVAL; + if (data[IFLA_GRE_COLLECT_METADATA] && + data[IFLA_GRE_ENCAP_TYPE] && + nla_get_u16(data[IFLA_GRE_ENCAP_TYPE]) != TUNNEL_ENCAP_NONE) + return -EINVAL; + return 0; } @@ -994,6 +1034,8 @@ static void ipgre_netlink_parms(struct net_device *dev, struct ip_tunnel *t = netdev_priv(dev); t->collect_md = true; + if (dev->type == ARPHRD_IPGRE) + dev->type = ARPHRD_NONE; } } diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index d77eb0c3b..e3d782746 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -308,15 +308,12 @@ drop: return true; } -int sysctl_ip_early_demux __read_mostly = 1; -EXPORT_SYMBOL(sysctl_ip_early_demux); - static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { const struct iphdr *iph = ip_hdr(skb); struct rtable *rt; - if (sysctl_ip_early_demux && + if (net->ipv4.sysctl_ip_early_demux && !skb_dst(skb) && !skb->sk && !ip_is_fragment(iph)) { @@ -362,8 +359,31 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) rt = skb_rtable(skb); if (rt->rt_type == RTN_MULTICAST) { IP_UPD_PO_STATS_BH(net, IPSTATS_MIB_INMCAST, skb->len); - } else if (rt->rt_type == RTN_BROADCAST) + } else if (rt->rt_type == RTN_BROADCAST) { IP_UPD_PO_STATS_BH(net, IPSTATS_MIB_INBCAST, skb->len); + } else if (skb->pkt_type == PACKET_BROADCAST || + skb->pkt_type == PACKET_MULTICAST) { + struct in_device *in_dev = __in_dev_get_rcu(skb->dev); + + /* RFC 1122 3.3.6: + * + * When a host sends a datagram to a link-layer broadcast + * address, the IP destination address MUST be a legal IP + * broadcast or IP multicast address. + * + * A host SHOULD silently discard a datagram that is received + * via a link-layer broadcast (see Section 2.4) but does not + * specify an IP multicast or broadcast destination address. + * + * This doesn't explicitly say L2 *broadcast*, but broadcast is + * in a way a form of multicast and the most common use case for + * this is 802.11 protecting against cross-station spoofing (the + * so-called "hole-196" attack) so do it for both. + */ + if (in_dev && + IN_DEV_ORCONF(in_dev, DROP_UNICAST_IN_L2_MULTICAST)) + goto drop; + } return dst_input(skb); diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index bd2467923..4d158ff1d 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c @@ -58,10 +58,9 @@ void ip_options_build(struct sk_buff *skb, struct ip_options *opt, if (opt->ts_needaddr) ip_rt_get_source(iph+opt->ts+iph[opt->ts+2]-9, skb, rt); if (opt->ts_needtime) { - struct timespec tv; __be32 midtime; - getnstimeofday(&tv); - midtime = htonl((tv.tv_sec % 86400) * MSEC_PER_SEC + tv.tv_nsec / NSEC_PER_MSEC); + + midtime = inet_current_timestamp(); memcpy(iph+opt->ts+iph[opt->ts+2]-5, &midtime, 4); } return; @@ -415,11 +414,10 @@ int ip_options_compile(struct net *net, break; } if (timeptr) { - struct timespec tv; - u32 midtime; - getnstimeofday(&tv); - midtime = (tv.tv_sec % 86400) * MSEC_PER_SEC + tv.tv_nsec / NSEC_PER_MSEC; - put_unaligned_be32(midtime, timeptr); + __be32 midtime; + + midtime = inet_current_timestamp(); + memcpy(timeptr, &midtime, 4); opt->is_changed = 1; } } else if ((optptr[3]&0xF) != IPOPT_TS_PRESPEC) { diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 565bf64b2..124bf0a66 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -79,9 +79,6 @@ #include <linux/netlink.h> #include <linux/tcp.h> -int sysctl_ip_default_ttl __read_mostly = IPDEFTTL; -EXPORT_SYMBOL(sysctl_ip_default_ttl); - static int ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, unsigned int mtu, diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index a50124260..035ad645a 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -573,6 +573,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval, unsigned int optlen) { struct inet_sock *inet = inet_sk(sk); + struct net *net = sock_net(sk); int val = 0, err; bool needs_rtnl = setsockopt_needs_rtnl(optname); @@ -912,7 +913,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, } /* numsrc >= (1G-4) overflow in 32 bits */ if (msf->imsf_numsrc >= 0x3ffffffcU || - msf->imsf_numsrc > sysctl_igmp_max_msf) { + msf->imsf_numsrc > net->ipv4.sysctl_igmp_max_msf) { kfree(msf); err = -ENOBUFS; break; @@ -1067,7 +1068,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, /* numsrc >= (4G-140)/128 overflow in 32 bits */ if (gsf->gf_numsrc >= 0x1ffffff || - gsf->gf_numsrc > sysctl_igmp_max_msf) { + gsf->gf_numsrc > net->ipv4.sysctl_igmp_max_msf) { err = -ENOBUFS; goto mc_msf_out; } @@ -1342,10 +1343,13 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname, val = inet->tos; break; case IP_TTL: + { + struct net *net = sock_net(sk); val = (inet->uc_ttl == -1 ? - sysctl_ip_default_ttl : + net->ipv4.sysctl_ip_default_ttl : inet->uc_ttl); break; + } case IP_HDRINCL: val = inet->hdrincl; break; diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 336e6892a..a69ed94bd 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -68,61 +68,6 @@ static unsigned int ip_tunnel_hash(__be32 key, __be32 remote) IP_TNL_HASH_BITS); } -static void __tunnel_dst_set(struct ip_tunnel_dst *idst, - struct dst_entry *dst, __be32 saddr) -{ - struct dst_entry *old_dst; - - dst_clone(dst); - old_dst = xchg((__force struct dst_entry **)&idst->dst, dst); - dst_release(old_dst); - idst->saddr = saddr; -} - -static noinline void tunnel_dst_set(struct ip_tunnel *t, - struct dst_entry *dst, __be32 saddr) -{ - __tunnel_dst_set(raw_cpu_ptr(t->dst_cache), dst, saddr); -} - -static void tunnel_dst_reset(struct ip_tunnel *t) -{ - tunnel_dst_set(t, NULL, 0); -} - -void ip_tunnel_dst_reset_all(struct ip_tunnel *t) -{ - int i; - - for_each_possible_cpu(i) - __tunnel_dst_set(per_cpu_ptr(t->dst_cache, i), NULL, 0); -} -EXPORT_SYMBOL(ip_tunnel_dst_reset_all); - -static struct rtable *tunnel_rtable_get(struct ip_tunnel *t, - u32 cookie, __be32 *saddr) -{ - struct ip_tunnel_dst *idst; - struct dst_entry *dst; - - rcu_read_lock(); - idst = raw_cpu_ptr(t->dst_cache); - dst = rcu_dereference(idst->dst); - if (dst && !atomic_inc_not_zero(&dst->__refcnt)) - dst = NULL; - if (dst) { - if (!dst->obsolete || dst->ops->check(dst, cookie)) { - *saddr = idst->saddr; - } else { - tunnel_dst_reset(t); - dst_release(dst); - dst = NULL; - } - } - rcu_read_unlock(); - return (struct rtable *)dst; -} - static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p, __be16 flags, __be32 key) { @@ -381,11 +326,12 @@ static int ip_tunnel_bind_dev(struct net_device *dev) if (!IS_ERR(rt)) { tdev = rt->dst.dev; - tunnel_dst_set(tunnel, &rt->dst, fl4.saddr); ip_rt_put(rt); } if (dev->type != ARPHRD_ETHER) dev->flags |= IFF_POINTOPOINT; + + dst_cache_reset(&tunnel->dst_cache); } if (!tdev && tunnel->parms.link) @@ -731,7 +677,8 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0) goto tx_error; - rt = connected ? tunnel_rtable_get(tunnel, 0, &fl4.saddr) : NULL; + rt = connected ? dst_cache_get_ip4(&tunnel->dst_cache, &fl4.saddr) : + NULL; if (!rt) { rt = ip_route_output_key(tunnel->net, &fl4); @@ -741,7 +688,8 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, goto tx_error; } if (connected) - tunnel_dst_set(tunnel, &rt->dst, fl4.saddr); + dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst, + fl4.saddr); } if (rt->dst.dev == dev) { @@ -837,7 +785,7 @@ static void ip_tunnel_update(struct ip_tunnel_net *itn, if (set_mtu) dev->mtu = mtu; } - ip_tunnel_dst_reset_all(t); + dst_cache_reset(&t->dst_cache); netdev_state_change(dev); } @@ -976,7 +924,7 @@ static void ip_tunnel_dev_free(struct net_device *dev) struct ip_tunnel *tunnel = netdev_priv(dev); gro_cells_destroy(&tunnel->gro_cells); - free_percpu(tunnel->dst_cache); + dst_cache_destroy(&tunnel->dst_cache); free_percpu(dev->tstats); free_netdev(dev); } @@ -1170,15 +1118,15 @@ int ip_tunnel_init(struct net_device *dev) if (!dev->tstats) return -ENOMEM; - tunnel->dst_cache = alloc_percpu(struct ip_tunnel_dst); - if (!tunnel->dst_cache) { + err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL); + if (err) { free_percpu(dev->tstats); - return -ENOMEM; + return err; } err = gro_cells_init(&tunnel->gro_cells, dev); if (err) { - free_percpu(tunnel->dst_cache); + dst_cache_destroy(&tunnel->dst_cache); free_percpu(dev->tstats); return err; } @@ -1208,7 +1156,7 @@ void ip_tunnel_uninit(struct net_device *dev) if (itn->fb_tunnel_dev != dev) ip_tunnel_del(itn, netdev_priv(dev)); - ip_tunnel_dst_reset_all(tunnel); + dst_cache_reset(&tunnel->dst_cache); } EXPORT_SYMBOL_GPL(ip_tunnel_uninit); diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index 859d415c0..6165f30c4 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -86,7 +86,8 @@ void iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb, } EXPORT_SYMBOL_GPL(iptunnel_xmit); -int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto) +int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto, + bool xnet) { if (unlikely(!pskb_may_pull(skb, hdr_len))) return -ENOMEM; @@ -109,14 +110,12 @@ int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto) skb->protocol = inner_proto; } - nf_reset(skb); - secpath_reset(skb); skb_clear_hash_if_not_l4(skb); - skb_dst_drop(skb); skb->vlan_tci = 0; skb_set_queue_mapping(skb, 0); - skb->pkt_type = PACKET_HOST; - return 0; + skb_scrub_packet(skb, xnet); + + return iptunnel_pull_offloads(skb); } EXPORT_SYMBOL_GPL(iptunnel_pull_header); @@ -148,7 +147,6 @@ struct metadata_dst *iptunnel_metadata_reply(struct metadata_dst *md, EXPORT_SYMBOL_GPL(iptunnel_metadata_reply); struct sk_buff *iptunnel_handle_offloads(struct sk_buff *skb, - bool csum_help, int gso_type_mask) { int err; @@ -166,20 +164,15 @@ struct sk_buff *iptunnel_handle_offloads(struct sk_buff *skb, return skb; } - /* If packet is not gso and we are resolving any partial checksum, - * clear encapsulation flag. This allows setting CHECKSUM_PARTIAL - * on the outer header without confusing devices that implement - * NETIF_F_IP_CSUM with encapsulation. - */ - if (csum_help) - skb->encapsulation = 0; - - if (skb->ip_summed == CHECKSUM_PARTIAL && csum_help) { - err = skb_checksum_help(skb); - if (unlikely(err)) - goto error; - } else if (skb->ip_summed != CHECKSUM_PARTIAL) + if (skb->ip_summed != CHECKSUM_PARTIAL) { skb->ip_summed = CHECKSUM_NONE; + /* We clear encapsulation here to prevent badly-written + * drivers potentially deciding to offload an inner checksum + * if we set CHECKSUM_PARTIAL on the outer header. + * This should go away when the drivers are all fixed. + */ + skb->encapsulation = 0; + } return skb; error: @@ -379,8 +372,8 @@ static int ip6_tun_fill_encap_info(struct sk_buff *skb, if (nla_put_be64(skb, LWTUNNEL_IP6_ID, tun_info->key.tun_id) || nla_put_in6_addr(skb, LWTUNNEL_IP6_DST, &tun_info->key.u.ipv6.dst) || nla_put_in6_addr(skb, LWTUNNEL_IP6_SRC, &tun_info->key.u.ipv6.src) || - nla_put_u8(skb, LWTUNNEL_IP6_HOPLIMIT, tun_info->key.tos) || - nla_put_u8(skb, LWTUNNEL_IP6_TC, tun_info->key.ttl) || + nla_put_u8(skb, LWTUNNEL_IP6_TC, tun_info->key.tos) || + nla_put_u8(skb, LWTUNNEL_IP6_HOPLIMIT, tun_info->key.ttl) || nla_put_be16(skb, LWTUNNEL_IP6_FLAGS, tun_info->key.tun_flags)) return -ENOMEM; @@ -406,6 +399,12 @@ static const struct lwtunnel_encap_ops ip6_tun_lwt_ops = { void __init ip_tunnel_core_init(void) { + /* If you land here, make sure whether increasing ip_tunnel_info's + * options_len is a reasonable choice with its usage in front ends + * (f.e., it's part of flow keys, etc). + */ + BUILD_BUG_ON(IP_TUNNEL_OPTS_MAX != 255); + lwtunnel_encap_add_ops(&ip_tun_lwt_ops, LWTUNNEL_ENCAP_IP); lwtunnel_encap_add_ops(&ip6_tun_lwt_ops, LWTUNNEL_ENCAP_IP6); } diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index 5cf10b777..a917903d5 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -156,6 +156,7 @@ static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev, struct dst_entry *dst = skb_dst(skb); struct net_device *tdev; /* Device to other host */ int err; + int mtu; if (!dst) { dev->stats.tx_carrier_errors++; @@ -192,6 +193,23 @@ static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev, tunnel->err_count = 0; } + mtu = dst_mtu(dst); + if (skb->len > mtu) { + skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu); + if (skb->protocol == htons(ETH_P_IP)) { + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, + htonl(mtu)); + } else { + if (mtu < IPV6_MIN_MTU) + mtu = IPV6_MIN_MTU; + + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); + } + + dst_release(dst); + goto tx_error; + } + skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(dev))); skb_dst_set(skb, dst); skb->dev = skb_dst(skb)->dev; diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 4044da61e..ec51d0216 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -195,7 +195,7 @@ static int ipip_rcv(struct sk_buff *skb) if (tunnel) { if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) goto drop; - if (iptunnel_pull_header(skb, 0, tpi.proto)) + if (iptunnel_pull_header(skb, 0, tpi.proto, false)) goto drop; return ip_tunnel_rcv(tunnel, skb, &tpi, NULL, log_ecn_error); } @@ -219,7 +219,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) if (unlikely(skb->protocol != htons(ETH_P_IP))) goto tx_error; - skb = iptunnel_handle_offloads(skb, false, SKB_GSO_IPIP); + skb = iptunnel_handle_offloads(skb, SKB_GSO_IPIP); if (IS_ERR(skb)) goto out; diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index b488cac9c..4133b0f51 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -359,11 +359,12 @@ unsigned int arpt_do_table(struct sk_buff *skb, } /* All zeroes == unconditional rule. */ -static inline bool unconditional(const struct arpt_arp *arp) +static inline bool unconditional(const struct arpt_entry *e) { static const struct arpt_arp uncond; - return memcmp(arp, &uncond, sizeof(uncond)) == 0; + return e->target_offset == sizeof(struct arpt_entry) && + memcmp(&e->arp, &uncond, sizeof(uncond)) == 0; } /* Figures out from what hook each rule can be called: returns 0 if @@ -402,11 +403,10 @@ static int mark_source_chains(const struct xt_table_info *newinfo, |= ((1 << hook) | (1 << NF_ARP_NUMHOOKS)); /* Unconditional return/END. */ - if ((e->target_offset == sizeof(struct arpt_entry) && + if ((unconditional(e) && (strcmp(t->target.u.user.name, XT_STANDARD_TARGET) == 0) && - t->verdict < 0 && unconditional(&e->arp)) || - visited) { + t->verdict < 0) || visited) { unsigned int oldpos, size; if ((strcmp(t->target.u.user.name, @@ -474,14 +474,12 @@ next: return 1; } -static inline int check_entry(const struct arpt_entry *e, const char *name) +static inline int check_entry(const struct arpt_entry *e) { const struct xt_entry_target *t; - if (!arp_checkentry(&e->arp)) { - duprintf("arp_tables: arp check failed %p %s.\n", e, name); + if (!arp_checkentry(&e->arp)) return -EINVAL; - } if (e->target_offset + sizeof(struct xt_entry_target) > e->next_offset) return -EINVAL; @@ -522,10 +520,6 @@ find_check_entry(struct arpt_entry *e, const char *name, unsigned int size) struct xt_target *target; int ret; - ret = check_entry(e, name); - if (ret) - return ret; - e->counters.pcnt = xt_percpu_counter_alloc(); if (IS_ERR_VALUE(e->counters.pcnt)) return -ENOMEM; @@ -557,7 +551,7 @@ static bool check_underflow(const struct arpt_entry *e) const struct xt_entry_target *t; unsigned int verdict; - if (!unconditional(&e->arp)) + if (!unconditional(e)) return false; t = arpt_get_target_c(e); if (strcmp(t->u.user.name, XT_STANDARD_TARGET) != 0) @@ -576,9 +570,11 @@ static inline int check_entry_size_and_hooks(struct arpt_entry *e, unsigned int valid_hooks) { unsigned int h; + int err; if ((unsigned long)e % __alignof__(struct arpt_entry) != 0 || - (unsigned char *)e + sizeof(struct arpt_entry) >= limit) { + (unsigned char *)e + sizeof(struct arpt_entry) >= limit || + (unsigned char *)e + e->next_offset > limit) { duprintf("Bad offset %p\n", e); return -EINVAL; } @@ -590,6 +586,10 @@ static inline int check_entry_size_and_hooks(struct arpt_entry *e, return -EINVAL; } + err = check_entry(e); + if (err) + return err; + /* Check hooks & underflows */ for (h = 0; h < NF_ARP_NUMHOOKS; h++) { if (!(valid_hooks & (1 << h))) @@ -598,9 +598,9 @@ static inline int check_entry_size_and_hooks(struct arpt_entry *e, newinfo->hook_entry[h] = hook_entries[h]; if ((unsigned char *)e - base == underflows[h]) { if (!check_underflow(e)) { - pr_err("Underflows must be unconditional and " - "use the STANDARD target with " - "ACCEPT/DROP\n"); + pr_debug("Underflows must be unconditional and " + "use the STANDARD target with " + "ACCEPT/DROP\n"); return -EINVAL; } newinfo->underflow[h] = underflows[h]; @@ -969,6 +969,7 @@ static int get_entries(struct net *net, struct arpt_get_entries __user *uptr, sizeof(struct arpt_get_entries) + get.size); return -EINVAL; } + get.name[sizeof(get.name) - 1] = '\0'; t = xt_find_table_lock(net, NFPROTO_ARP, get.name); if (!IS_ERR_OR_NULL(t)) { @@ -1233,7 +1234,8 @@ check_compat_entry_size_and_hooks(struct compat_arpt_entry *e, duprintf("check_compat_entry_size_and_hooks %p\n", e); if ((unsigned long)e % __alignof__(struct compat_arpt_entry) != 0 || - (unsigned char *)e + sizeof(struct compat_arpt_entry) >= limit) { + (unsigned char *)e + sizeof(struct compat_arpt_entry) >= limit || + (unsigned char *)e + e->next_offset > limit) { duprintf("Bad offset %p, limit = %p\n", e, limit); return -EINVAL; } @@ -1246,7 +1248,7 @@ check_compat_entry_size_and_hooks(struct compat_arpt_entry *e, } /* For purposes of check_entry casting the compat entry is fine */ - ret = check_entry((struct arpt_entry *)e, name); + ret = check_entry((struct arpt_entry *)e); if (ret) return ret; @@ -1662,6 +1664,7 @@ static int compat_get_entries(struct net *net, *len, sizeof(get) + get.size); return -EINVAL; } + get.name[sizeof(get.name) - 1] = '\0'; xt_compat_lock(NFPROTO_ARP); t = xt_find_table_lock(net, NFPROTO_ARP, get.name); @@ -1780,9 +1783,29 @@ static int do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len return ret; } -struct xt_table *arpt_register_table(struct net *net, - const struct xt_table *table, - const struct arpt_replace *repl) +static void __arpt_unregister_table(struct xt_table *table) +{ + struct xt_table_info *private; + void *loc_cpu_entry; + struct module *table_owner = table->me; + struct arpt_entry *iter; + + private = xt_unregister_table(table); + + /* Decrease module usage counts and free resources */ + loc_cpu_entry = private->entries; + xt_entry_foreach(iter, loc_cpu_entry, private->size) + cleanup_entry(iter); + if (private->number > private->initial_entries) + module_put(table_owner); + xt_free_table_info(private); +} + +int arpt_register_table(struct net *net, + const struct xt_table *table, + const struct arpt_replace *repl, + const struct nf_hook_ops *ops, + struct xt_table **res) { int ret; struct xt_table_info *newinfo; @@ -1791,10 +1814,8 @@ struct xt_table *arpt_register_table(struct net *net, struct xt_table *new_table; newinfo = xt_alloc_table_info(repl->size); - if (!newinfo) { - ret = -ENOMEM; - goto out; - } + if (!newinfo) + return -ENOMEM; loc_cpu_entry = newinfo->entries; memcpy(loc_cpu_entry, repl->entries, repl->size); @@ -1809,30 +1830,28 @@ struct xt_table *arpt_register_table(struct net *net, ret = PTR_ERR(new_table); goto out_free; } - return new_table; + + /* set res now, will see skbs right after nf_register_net_hooks */ + WRITE_ONCE(*res, new_table); + + ret = nf_register_net_hooks(net, ops, hweight32(table->valid_hooks)); + if (ret != 0) { + __arpt_unregister_table(new_table); + *res = NULL; + } + + return ret; out_free: xt_free_table_info(newinfo); -out: - return ERR_PTR(ret); + return ret; } -void arpt_unregister_table(struct xt_table *table) +void arpt_unregister_table(struct net *net, struct xt_table *table, + const struct nf_hook_ops *ops) { - struct xt_table_info *private; - void *loc_cpu_entry; - struct module *table_owner = table->me; - struct arpt_entry *iter; - - private = xt_unregister_table(table); - - /* Decrease module usage counts and free resources */ - loc_cpu_entry = private->entries; - xt_entry_foreach(iter, loc_cpu_entry, private->size) - cleanup_entry(iter); - if (private->number > private->initial_entries) - module_put(table_owner); - xt_free_table_info(private); + nf_unregister_net_hooks(net, ops, hweight32(table->valid_hooks)); + __arpt_unregister_table(table); } /* The built-in targets: standard (NULL) and error. */ diff --git a/net/ipv4/netfilter/arptable_filter.c b/net/ipv4/netfilter/arptable_filter.c index 1897ee160..8f8713b43 100644 --- a/net/ipv4/netfilter/arptable_filter.c +++ b/net/ipv4/netfilter/arptable_filter.c @@ -17,12 +17,15 @@ MODULE_DESCRIPTION("arptables filter table"); #define FILTER_VALID_HOOKS ((1 << NF_ARP_IN) | (1 << NF_ARP_OUT) | \ (1 << NF_ARP_FORWARD)) +static int __net_init arptable_filter_table_init(struct net *net); + static const struct xt_table packet_filter = { .name = "filter", .valid_hooks = FILTER_VALID_HOOKS, .me = THIS_MODULE, .af = NFPROTO_ARP, .priority = NF_IP_PRI_FILTER, + .table_init = arptable_filter_table_init, }; /* The work comes in here from netfilter.c */ @@ -35,26 +38,32 @@ arptable_filter_hook(void *priv, struct sk_buff *skb, static struct nf_hook_ops *arpfilter_ops __read_mostly; -static int __net_init arptable_filter_net_init(struct net *net) +static int __net_init arptable_filter_table_init(struct net *net) { struct arpt_replace *repl; - + int err; + + if (net->ipv4.arptable_filter) + return 0; + repl = arpt_alloc_initial_table(&packet_filter); if (repl == NULL) return -ENOMEM; - net->ipv4.arptable_filter = - arpt_register_table(net, &packet_filter, repl); + err = arpt_register_table(net, &packet_filter, repl, arpfilter_ops, + &net->ipv4.arptable_filter); kfree(repl); - return PTR_ERR_OR_ZERO(net->ipv4.arptable_filter); + return err; } static void __net_exit arptable_filter_net_exit(struct net *net) { - arpt_unregister_table(net->ipv4.arptable_filter); + if (!net->ipv4.arptable_filter) + return; + arpt_unregister_table(net, net->ipv4.arptable_filter, arpfilter_ops); + net->ipv4.arptable_filter = NULL; } static struct pernet_operations arptable_filter_net_ops = { - .init = arptable_filter_net_init, .exit = arptable_filter_net_exit, }; @@ -62,26 +71,29 @@ static int __init arptable_filter_init(void) { int ret; + arpfilter_ops = xt_hook_ops_alloc(&packet_filter, arptable_filter_hook); + if (IS_ERR(arpfilter_ops)) + return PTR_ERR(arpfilter_ops); + ret = register_pernet_subsys(&arptable_filter_net_ops); - if (ret < 0) + if (ret < 0) { + kfree(arpfilter_ops); return ret; + } - arpfilter_ops = xt_hook_link(&packet_filter, arptable_filter_hook); - if (IS_ERR(arpfilter_ops)) { - ret = PTR_ERR(arpfilter_ops); - goto cleanup_table; + ret = arptable_filter_table_init(&init_net); + if (ret) { + unregister_pernet_subsys(&arptable_filter_net_ops); + kfree(arpfilter_ops); } - return ret; -cleanup_table: - unregister_pernet_subsys(&arptable_filter_net_ops); return ret; } static void __exit arptable_filter_fini(void) { - xt_hook_unlink(&packet_filter, arpfilter_ops); unregister_pernet_subsys(&arptable_filter_net_ops); + kfree(arpfilter_ops); } module_init(arptable_filter_init); diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index b99affad6..631c100a1 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -168,11 +168,12 @@ get_entry(const void *base, unsigned int offset) /* All zeroes == unconditional rule. */ /* Mildly perf critical (only if packet tracing is on) */ -static inline bool unconditional(const struct ipt_ip *ip) +static inline bool unconditional(const struct ipt_entry *e) { static const struct ipt_ip uncond; - return memcmp(ip, &uncond, sizeof(uncond)) == 0; + return e->target_offset == sizeof(struct ipt_entry) && + memcmp(&e->ip, &uncond, sizeof(uncond)) == 0; #undef FWINV } @@ -229,11 +230,10 @@ get_chainname_rulenum(const struct ipt_entry *s, const struct ipt_entry *e, } else if (s == e) { (*rulenum)++; - if (s->target_offset == sizeof(struct ipt_entry) && + if (unconditional(s) && strcmp(t->target.u.kernel.target->name, XT_STANDARD_TARGET) == 0 && - t->verdict < 0 && - unconditional(&s->ip)) { + t->verdict < 0) { /* Tail of chains: STANDARD target (return/policy) */ *comment = *chainname == hookname ? comments[NF_IP_TRACE_COMMENT_POLICY] @@ -476,11 +476,10 @@ mark_source_chains(const struct xt_table_info *newinfo, e->comefrom |= ((1 << hook) | (1 << NF_INET_NUMHOOKS)); /* Unconditional return/END. */ - if ((e->target_offset == sizeof(struct ipt_entry) && + if ((unconditional(e) && (strcmp(t->target.u.user.name, XT_STANDARD_TARGET) == 0) && - t->verdict < 0 && unconditional(&e->ip)) || - visited) { + t->verdict < 0) || visited) { unsigned int oldpos, size; if ((strcmp(t->target.u.user.name, @@ -569,14 +568,12 @@ static void cleanup_match(struct xt_entry_match *m, struct net *net) } static int -check_entry(const struct ipt_entry *e, const char *name) +check_entry(const struct ipt_entry *e) { const struct xt_entry_target *t; - if (!ip_checkentry(&e->ip)) { - duprintf("ip check failed %p %s.\n", e, name); + if (!ip_checkentry(&e->ip)) return -EINVAL; - } if (e->target_offset + sizeof(struct xt_entry_target) > e->next_offset) @@ -666,10 +663,6 @@ find_check_entry(struct ipt_entry *e, struct net *net, const char *name, struct xt_mtchk_param mtpar; struct xt_entry_match *ematch; - ret = check_entry(e, name); - if (ret) - return ret; - e->counters.pcnt = xt_percpu_counter_alloc(); if (IS_ERR_VALUE(e->counters.pcnt)) return -ENOMEM; @@ -721,7 +714,7 @@ static bool check_underflow(const struct ipt_entry *e) const struct xt_entry_target *t; unsigned int verdict; - if (!unconditional(&e->ip)) + if (!unconditional(e)) return false; t = ipt_get_target_c(e); if (strcmp(t->u.user.name, XT_STANDARD_TARGET) != 0) @@ -741,9 +734,11 @@ check_entry_size_and_hooks(struct ipt_entry *e, unsigned int valid_hooks) { unsigned int h; + int err; if ((unsigned long)e % __alignof__(struct ipt_entry) != 0 || - (unsigned char *)e + sizeof(struct ipt_entry) >= limit) { + (unsigned char *)e + sizeof(struct ipt_entry) >= limit || + (unsigned char *)e + e->next_offset > limit) { duprintf("Bad offset %p\n", e); return -EINVAL; } @@ -755,6 +750,10 @@ check_entry_size_and_hooks(struct ipt_entry *e, return -EINVAL; } + err = check_entry(e); + if (err) + return err; + /* Check hooks & underflows */ for (h = 0; h < NF_INET_NUMHOOKS; h++) { if (!(valid_hooks & (1 << h))) @@ -763,9 +762,9 @@ check_entry_size_and_hooks(struct ipt_entry *e, newinfo->hook_entry[h] = hook_entries[h]; if ((unsigned char *)e - base == underflows[h]) { if (!check_underflow(e)) { - pr_err("Underflows must be unconditional and " - "use the STANDARD target with " - "ACCEPT/DROP\n"); + pr_debug("Underflows must be unconditional and " + "use the STANDARD target with " + "ACCEPT/DROP\n"); return -EINVAL; } newinfo->underflow[h] = underflows[h]; @@ -1157,6 +1156,7 @@ get_entries(struct net *net, struct ipt_get_entries __user *uptr, *len, sizeof(get) + get.size); return -EINVAL; } + get.name[sizeof(get.name) - 1] = '\0'; t = xt_find_table_lock(net, AF_INET, get.name); if (!IS_ERR_OR_NULL(t)) { @@ -1493,7 +1493,8 @@ check_compat_entry_size_and_hooks(struct compat_ipt_entry *e, duprintf("check_compat_entry_size_and_hooks %p\n", e); if ((unsigned long)e % __alignof__(struct compat_ipt_entry) != 0 || - (unsigned char *)e + sizeof(struct compat_ipt_entry) >= limit) { + (unsigned char *)e + sizeof(struct compat_ipt_entry) >= limit || + (unsigned char *)e + e->next_offset > limit) { duprintf("Bad offset %p, limit = %p\n", e, limit); return -EINVAL; } @@ -1506,7 +1507,7 @@ check_compat_entry_size_and_hooks(struct compat_ipt_entry *e, } /* For purposes of check_entry casting the compat entry is fine */ - ret = check_entry((struct ipt_entry *)e, name); + ret = check_entry((struct ipt_entry *)e); if (ret) return ret; @@ -1935,6 +1936,7 @@ compat_get_entries(struct net *net, struct compat_ipt_get_entries __user *uptr, *len, sizeof(get) + get.size); return -EINVAL; } + get.name[sizeof(get.name) - 1] = '\0'; xt_compat_lock(AF_INET); t = xt_find_table_lock(net, AF_INET, get.name); @@ -2062,9 +2064,27 @@ do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) return ret; } -struct xt_table *ipt_register_table(struct net *net, - const struct xt_table *table, - const struct ipt_replace *repl) +static void __ipt_unregister_table(struct net *net, struct xt_table *table) +{ + struct xt_table_info *private; + void *loc_cpu_entry; + struct module *table_owner = table->me; + struct ipt_entry *iter; + + private = xt_unregister_table(table); + + /* Decrease module usage counts and free resources */ + loc_cpu_entry = private->entries; + xt_entry_foreach(iter, loc_cpu_entry, private->size) + cleanup_entry(iter, net); + if (private->number > private->initial_entries) + module_put(table_owner); + xt_free_table_info(private); +} + +int ipt_register_table(struct net *net, const struct xt_table *table, + const struct ipt_replace *repl, + const struct nf_hook_ops *ops, struct xt_table **res) { int ret; struct xt_table_info *newinfo; @@ -2073,10 +2093,8 @@ struct xt_table *ipt_register_table(struct net *net, struct xt_table *new_table; newinfo = xt_alloc_table_info(repl->size); - if (!newinfo) { - ret = -ENOMEM; - goto out; - } + if (!newinfo) + return -ENOMEM; loc_cpu_entry = newinfo->entries; memcpy(loc_cpu_entry, repl->entries, repl->size); @@ -2091,30 +2109,27 @@ struct xt_table *ipt_register_table(struct net *net, goto out_free; } - return new_table; + /* set res now, will see skbs right after nf_register_net_hooks */ + WRITE_ONCE(*res, new_table); + + ret = nf_register_net_hooks(net, ops, hweight32(table->valid_hooks)); + if (ret != 0) { + __ipt_unregister_table(net, new_table); + *res = NULL; + } + + return ret; out_free: xt_free_table_info(newinfo); -out: - return ERR_PTR(ret); + return ret; } -void ipt_unregister_table(struct net *net, struct xt_table *table) +void ipt_unregister_table(struct net *net, struct xt_table *table, + const struct nf_hook_ops *ops) { - struct xt_table_info *private; - void *loc_cpu_entry; - struct module *table_owner = table->me; - struct ipt_entry *iter; - - private = xt_unregister_table(table); - - /* Decrease module usage counts and free resources */ - loc_cpu_entry = private->entries; - xt_entry_foreach(iter, loc_cpu_entry, private->size) - cleanup_entry(iter, net); - if (private->number > private->initial_entries) - module_put(table_owner); - xt_free_table_info(private); + nf_unregister_net_hooks(net, ops, hweight32(table->valid_hooks)); + __ipt_unregister_table(net, table); } /* Returns 1 if the type and code is matched by the range, 0 otherwise */ diff --git a/net/ipv4/netfilter/ipt_SYNPROXY.c b/net/ipv4/netfilter/ipt_SYNPROXY.c index 5fdc55651..db5b87509 100644 --- a/net/ipv4/netfilter/ipt_SYNPROXY.c +++ b/net/ipv4/netfilter/ipt_SYNPROXY.c @@ -18,7 +18,8 @@ #include <net/netfilter/nf_conntrack_synproxy.h> static struct iphdr * -synproxy_build_ip(struct sk_buff *skb, __be32 saddr, __be32 daddr) +synproxy_build_ip(struct net *net, struct sk_buff *skb, __be32 saddr, + __be32 daddr) { struct iphdr *iph; @@ -29,7 +30,7 @@ synproxy_build_ip(struct sk_buff *skb, __be32 saddr, __be32 daddr) iph->tos = 0; iph->id = 0; iph->frag_off = htons(IP_DF); - iph->ttl = sysctl_ip_default_ttl; + iph->ttl = net->ipv4.sysctl_ip_default_ttl; iph->protocol = IPPROTO_TCP; iph->check = 0; iph->saddr = saddr; @@ -39,14 +40,12 @@ synproxy_build_ip(struct sk_buff *skb, __be32 saddr, __be32 daddr) } static void -synproxy_send_tcp(const struct synproxy_net *snet, +synproxy_send_tcp(struct net *net, const struct sk_buff *skb, struct sk_buff *nskb, struct nf_conntrack *nfct, enum ip_conntrack_info ctinfo, struct iphdr *niph, struct tcphdr *nth, unsigned int tcp_hdr_size) { - struct net *net = nf_ct_net(snet->tmpl); - nth->check = ~tcp_v4_check(tcp_hdr_size, niph->saddr, niph->daddr, 0); nskb->ip_summed = CHECKSUM_PARTIAL; nskb->csum_start = (unsigned char *)nth - nskb->head; @@ -71,7 +70,7 @@ free_nskb: } static void -synproxy_send_client_synack(const struct synproxy_net *snet, +synproxy_send_client_synack(struct net *net, const struct sk_buff *skb, const struct tcphdr *th, const struct synproxy_options *opts) { @@ -90,7 +89,7 @@ synproxy_send_client_synack(const struct synproxy_net *snet, return; skb_reserve(nskb, MAX_TCP_HEADER); - niph = synproxy_build_ip(nskb, iph->daddr, iph->saddr); + niph = synproxy_build_ip(net, nskb, iph->daddr, iph->saddr); skb_reset_transport_header(nskb); nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size); @@ -108,15 +107,16 @@ synproxy_send_client_synack(const struct synproxy_net *snet, synproxy_build_options(nth, opts); - synproxy_send_tcp(snet, skb, nskb, skb->nfct, IP_CT_ESTABLISHED_REPLY, + synproxy_send_tcp(net, skb, nskb, skb->nfct, IP_CT_ESTABLISHED_REPLY, niph, nth, tcp_hdr_size); } static void -synproxy_send_server_syn(const struct synproxy_net *snet, +synproxy_send_server_syn(struct net *net, const struct sk_buff *skb, const struct tcphdr *th, const struct synproxy_options *opts, u32 recv_seq) { + struct synproxy_net *snet = synproxy_pernet(net); struct sk_buff *nskb; struct iphdr *iph, *niph; struct tcphdr *nth; @@ -131,7 +131,7 @@ synproxy_send_server_syn(const struct synproxy_net *snet, return; skb_reserve(nskb, MAX_TCP_HEADER); - niph = synproxy_build_ip(nskb, iph->saddr, iph->daddr); + niph = synproxy_build_ip(net, nskb, iph->saddr, iph->daddr); skb_reset_transport_header(nskb); nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size); @@ -152,12 +152,12 @@ synproxy_send_server_syn(const struct synproxy_net *snet, synproxy_build_options(nth, opts); - synproxy_send_tcp(snet, skb, nskb, &snet->tmpl->ct_general, IP_CT_NEW, + synproxy_send_tcp(net, skb, nskb, &snet->tmpl->ct_general, IP_CT_NEW, niph, nth, tcp_hdr_size); } static void -synproxy_send_server_ack(const struct synproxy_net *snet, +synproxy_send_server_ack(struct net *net, const struct ip_ct_tcp *state, const struct sk_buff *skb, const struct tcphdr *th, const struct synproxy_options *opts) @@ -176,7 +176,7 @@ synproxy_send_server_ack(const struct synproxy_net *snet, return; skb_reserve(nskb, MAX_TCP_HEADER); - niph = synproxy_build_ip(nskb, iph->daddr, iph->saddr); + niph = synproxy_build_ip(net, nskb, iph->daddr, iph->saddr); skb_reset_transport_header(nskb); nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size); @@ -192,11 +192,11 @@ synproxy_send_server_ack(const struct synproxy_net *snet, synproxy_build_options(nth, opts); - synproxy_send_tcp(snet, skb, nskb, NULL, 0, niph, nth, tcp_hdr_size); + synproxy_send_tcp(net, skb, nskb, NULL, 0, niph, nth, tcp_hdr_size); } static void -synproxy_send_client_ack(const struct synproxy_net *snet, +synproxy_send_client_ack(struct net *net, const struct sk_buff *skb, const struct tcphdr *th, const struct synproxy_options *opts) { @@ -214,7 +214,7 @@ synproxy_send_client_ack(const struct synproxy_net *snet, return; skb_reserve(nskb, MAX_TCP_HEADER); - niph = synproxy_build_ip(nskb, iph->saddr, iph->daddr); + niph = synproxy_build_ip(net, nskb, iph->saddr, iph->daddr); skb_reset_transport_header(nskb); nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size); @@ -230,15 +230,16 @@ synproxy_send_client_ack(const struct synproxy_net *snet, synproxy_build_options(nth, opts); - synproxy_send_tcp(snet, skb, nskb, skb->nfct, IP_CT_ESTABLISHED_REPLY, + synproxy_send_tcp(net, skb, nskb, skb->nfct, IP_CT_ESTABLISHED_REPLY, niph, nth, tcp_hdr_size); } static bool -synproxy_recv_client_ack(const struct synproxy_net *snet, +synproxy_recv_client_ack(struct net *net, const struct sk_buff *skb, const struct tcphdr *th, struct synproxy_options *opts, u32 recv_seq) { + struct synproxy_net *snet = synproxy_pernet(net); int mss; mss = __cookie_v4_check(ip_hdr(skb), th, ntohl(th->ack_seq) - 1); @@ -254,7 +255,7 @@ synproxy_recv_client_ack(const struct synproxy_net *snet, if (opts->options & XT_SYNPROXY_OPT_TIMESTAMP) synproxy_check_timestamp_cookie(opts); - synproxy_send_server_syn(snet, skb, th, opts, recv_seq); + synproxy_send_server_syn(net, skb, th, opts, recv_seq); return true; } @@ -262,7 +263,8 @@ static unsigned int synproxy_tg4(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_synproxy_info *info = par->targinfo; - struct synproxy_net *snet = synproxy_pernet(par->net); + struct net *net = par->net; + struct synproxy_net *snet = synproxy_pernet(net); struct synproxy_options opts = {}; struct tcphdr *th, _th; @@ -291,12 +293,12 @@ synproxy_tg4(struct sk_buff *skb, const struct xt_action_param *par) XT_SYNPROXY_OPT_SACK_PERM | XT_SYNPROXY_OPT_ECN); - synproxy_send_client_synack(snet, skb, th, &opts); + synproxy_send_client_synack(net, skb, th, &opts); return NF_DROP; } else if (th->ack && !(th->fin || th->rst || th->syn)) { /* ACK from client */ - synproxy_recv_client_ack(snet, skb, th, &opts, ntohl(th->seq)); + synproxy_recv_client_ack(net, skb, th, &opts, ntohl(th->seq)); return NF_DROP; } @@ -307,7 +309,8 @@ static unsigned int ipv4_synproxy_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *nhs) { - struct synproxy_net *snet = synproxy_pernet(nhs->net); + struct net *net = nhs->net; + struct synproxy_net *snet = synproxy_pernet(net); enum ip_conntrack_info ctinfo; struct nf_conn *ct; struct nf_conn_synproxy *synproxy; @@ -364,7 +367,7 @@ static unsigned int ipv4_synproxy_hook(void *priv, * therefore we need to add 1 to make the SYN sequence * number match the one of first SYN. */ - if (synproxy_recv_client_ack(snet, skb, th, &opts, + if (synproxy_recv_client_ack(net, skb, th, &opts, ntohl(th->seq) + 1)) this_cpu_inc(snet->stats->cookie_retrans); @@ -390,12 +393,12 @@ static unsigned int ipv4_synproxy_hook(void *priv, XT_SYNPROXY_OPT_SACK_PERM); swap(opts.tsval, opts.tsecr); - synproxy_send_server_ack(snet, state, skb, th, &opts); + synproxy_send_server_ack(net, state, skb, th, &opts); nf_ct_seqadj_init(ct, ctinfo, synproxy->isn - ntohl(th->seq)); swap(opts.tsval, opts.tsecr); - synproxy_send_client_ack(snet, skb, th, &opts); + synproxy_send_client_ack(net, skb, th, &opts); consume_skb(skb); return NF_STOLEN; diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c index 397ef2dd1..7667f223d 100644 --- a/net/ipv4/netfilter/iptable_filter.c +++ b/net/ipv4/netfilter/iptable_filter.c @@ -23,6 +23,7 @@ MODULE_DESCRIPTION("iptables filter table"); #define FILTER_VALID_HOOKS ((1 << NF_INET_LOCAL_IN) | \ (1 << NF_INET_FORWARD) | \ (1 << NF_INET_LOCAL_OUT)) +static int __net_init iptable_filter_table_init(struct net *net); static const struct xt_table packet_filter = { .name = "filter", @@ -30,6 +31,7 @@ static const struct xt_table packet_filter = { .me = THIS_MODULE, .af = NFPROTO_IPV4, .priority = NF_IP_PRI_FILTER, + .table_init = iptable_filter_table_init, }; static unsigned int @@ -48,12 +50,16 @@ iptable_filter_hook(void *priv, struct sk_buff *skb, static struct nf_hook_ops *filter_ops __read_mostly; /* Default to forward because I got too much mail already. */ -static bool forward = true; +static bool forward __read_mostly = true; module_param(forward, bool, 0000); -static int __net_init iptable_filter_net_init(struct net *net) +static int __net_init iptable_filter_table_init(struct net *net) { struct ipt_replace *repl; + int err; + + if (net->ipv4.iptable_filter) + return 0; repl = ipt_alloc_initial_table(&packet_filter); if (repl == NULL) @@ -62,15 +68,26 @@ static int __net_init iptable_filter_net_init(struct net *net) ((struct ipt_standard *)repl->entries)[1].target.verdict = forward ? -NF_ACCEPT - 1 : -NF_DROP - 1; - net->ipv4.iptable_filter = - ipt_register_table(net, &packet_filter, repl); + err = ipt_register_table(net, &packet_filter, repl, filter_ops, + &net->ipv4.iptable_filter); kfree(repl); - return PTR_ERR_OR_ZERO(net->ipv4.iptable_filter); + return err; +} + +static int __net_init iptable_filter_net_init(struct net *net) +{ + if (net == &init_net || !forward) + return iptable_filter_table_init(net); + + return 0; } static void __net_exit iptable_filter_net_exit(struct net *net) { - ipt_unregister_table(net, net->ipv4.iptable_filter); + if (!net->ipv4.iptable_filter) + return; + ipt_unregister_table(net, net->ipv4.iptable_filter, filter_ops); + net->ipv4.iptable_filter = NULL; } static struct pernet_operations iptable_filter_net_ops = { @@ -82,24 +99,21 @@ static int __init iptable_filter_init(void) { int ret; + filter_ops = xt_hook_ops_alloc(&packet_filter, iptable_filter_hook); + if (IS_ERR(filter_ops)) + return PTR_ERR(filter_ops); + ret = register_pernet_subsys(&iptable_filter_net_ops); if (ret < 0) - return ret; - - /* Register hooks */ - filter_ops = xt_hook_link(&packet_filter, iptable_filter_hook); - if (IS_ERR(filter_ops)) { - ret = PTR_ERR(filter_ops); - unregister_pernet_subsys(&iptable_filter_net_ops); - } + kfree(filter_ops); return ret; } static void __exit iptable_filter_fini(void) { - xt_hook_unlink(&packet_filter, filter_ops); unregister_pernet_subsys(&iptable_filter_net_ops); + kfree(filter_ops); } module_init(iptable_filter_init); diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c index ba5d392a1..57fc97cda 100644 --- a/net/ipv4/netfilter/iptable_mangle.c +++ b/net/ipv4/netfilter/iptable_mangle.c @@ -28,12 +28,15 @@ MODULE_DESCRIPTION("iptables mangle table"); (1 << NF_INET_LOCAL_OUT) | \ (1 << NF_INET_POST_ROUTING)) +static int __net_init iptable_mangle_table_init(struct net *net); + static const struct xt_table packet_mangler = { .name = "mangle", .valid_hooks = MANGLE_VALID_HOOKS, .me = THIS_MODULE, .af = NFPROTO_IPV4, .priority = NF_IP_PRI_MANGLE, + .table_init = iptable_mangle_table_init, }; static unsigned int @@ -92,27 +95,32 @@ iptable_mangle_hook(void *priv, } static struct nf_hook_ops *mangle_ops __read_mostly; - -static int __net_init iptable_mangle_net_init(struct net *net) +static int __net_init iptable_mangle_table_init(struct net *net) { struct ipt_replace *repl; + int ret; + + if (net->ipv4.iptable_mangle) + return 0; repl = ipt_alloc_initial_table(&packet_mangler); if (repl == NULL) return -ENOMEM; - net->ipv4.iptable_mangle = - ipt_register_table(net, &packet_mangler, repl); + ret = ipt_register_table(net, &packet_mangler, repl, mangle_ops, + &net->ipv4.iptable_mangle); kfree(repl); - return PTR_ERR_OR_ZERO(net->ipv4.iptable_mangle); + return ret; } static void __net_exit iptable_mangle_net_exit(struct net *net) { - ipt_unregister_table(net, net->ipv4.iptable_mangle); + if (!net->ipv4.iptable_mangle) + return; + ipt_unregister_table(net, net->ipv4.iptable_mangle, mangle_ops); + net->ipv4.iptable_mangle = NULL; } static struct pernet_operations iptable_mangle_net_ops = { - .init = iptable_mangle_net_init, .exit = iptable_mangle_net_exit, }; @@ -120,15 +128,22 @@ static int __init iptable_mangle_init(void) { int ret; + mangle_ops = xt_hook_ops_alloc(&packet_mangler, iptable_mangle_hook); + if (IS_ERR(mangle_ops)) { + ret = PTR_ERR(mangle_ops); + return ret; + } + ret = register_pernet_subsys(&iptable_mangle_net_ops); - if (ret < 0) + if (ret < 0) { + kfree(mangle_ops); return ret; + } - /* Register hooks */ - mangle_ops = xt_hook_link(&packet_mangler, iptable_mangle_hook); - if (IS_ERR(mangle_ops)) { - ret = PTR_ERR(mangle_ops); + ret = iptable_mangle_table_init(&init_net); + if (ret) { unregister_pernet_subsys(&iptable_mangle_net_ops); + kfree(mangle_ops); } return ret; @@ -136,8 +151,8 @@ static int __init iptable_mangle_init(void) static void __exit iptable_mangle_fini(void) { - xt_hook_unlink(&packet_mangler, mangle_ops); unregister_pernet_subsys(&iptable_mangle_net_ops); + kfree(mangle_ops); } module_init(iptable_mangle_init); diff --git a/net/ipv4/netfilter/iptable_nat.c b/net/ipv4/netfilter/iptable_nat.c index ae2cd2752..138a24bc7 100644 --- a/net/ipv4/netfilter/iptable_nat.c +++ b/net/ipv4/netfilter/iptable_nat.c @@ -18,6 +18,8 @@ #include <net/netfilter/nf_nat_core.h> #include <net/netfilter/nf_nat_l3proto.h> +static int __net_init iptable_nat_table_init(struct net *net); + static const struct xt_table nf_nat_ipv4_table = { .name = "nat", .valid_hooks = (1 << NF_INET_PRE_ROUTING) | @@ -26,6 +28,7 @@ static const struct xt_table nf_nat_ipv4_table = { (1 << NF_INET_LOCAL_IN), .me = THIS_MODULE, .af = NFPROTO_IPV4, + .table_init = iptable_nat_table_init, }; static unsigned int iptable_nat_do_chain(void *priv, @@ -95,50 +98,50 @@ static struct nf_hook_ops nf_nat_ipv4_ops[] __read_mostly = { }, }; -static int __net_init iptable_nat_net_init(struct net *net) +static int __net_init iptable_nat_table_init(struct net *net) { struct ipt_replace *repl; + int ret; + + if (net->ipv4.nat_table) + return 0; repl = ipt_alloc_initial_table(&nf_nat_ipv4_table); if (repl == NULL) return -ENOMEM; - net->ipv4.nat_table = ipt_register_table(net, &nf_nat_ipv4_table, repl); + ret = ipt_register_table(net, &nf_nat_ipv4_table, repl, + nf_nat_ipv4_ops, &net->ipv4.nat_table); kfree(repl); - return PTR_ERR_OR_ZERO(net->ipv4.nat_table); + return ret; } static void __net_exit iptable_nat_net_exit(struct net *net) { - ipt_unregister_table(net, net->ipv4.nat_table); + if (!net->ipv4.nat_table) + return; + ipt_unregister_table(net, net->ipv4.nat_table, nf_nat_ipv4_ops); + net->ipv4.nat_table = NULL; } static struct pernet_operations iptable_nat_net_ops = { - .init = iptable_nat_net_init, .exit = iptable_nat_net_exit, }; static int __init iptable_nat_init(void) { - int err; + int ret = register_pernet_subsys(&iptable_nat_net_ops); - err = register_pernet_subsys(&iptable_nat_net_ops); - if (err < 0) - goto err1; + if (ret) + return ret; - err = nf_register_hooks(nf_nat_ipv4_ops, ARRAY_SIZE(nf_nat_ipv4_ops)); - if (err < 0) - goto err2; - return 0; - -err2: - unregister_pernet_subsys(&iptable_nat_net_ops); -err1: - return err; + ret = iptable_nat_table_init(&init_net); + if (ret) + unregister_pernet_subsys(&iptable_nat_net_ops); + return ret; } static void __exit iptable_nat_exit(void) { - nf_unregister_hooks(nf_nat_ipv4_ops, ARRAY_SIZE(nf_nat_ipv4_ops)); unregister_pernet_subsys(&iptable_nat_net_ops); } diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c index 1ba02811a..2642ecd26 100644 --- a/net/ipv4/netfilter/iptable_raw.c +++ b/net/ipv4/netfilter/iptable_raw.c @@ -10,12 +10,15 @@ #define RAW_VALID_HOOKS ((1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_OUT)) +static int __net_init iptable_raw_table_init(struct net *net); + static const struct xt_table packet_raw = { .name = "raw", .valid_hooks = RAW_VALID_HOOKS, .me = THIS_MODULE, .af = NFPROTO_IPV4, .priority = NF_IP_PRI_RAW, + .table_init = iptable_raw_table_init, }; /* The work comes in here from netfilter.c. */ @@ -34,26 +37,32 @@ iptable_raw_hook(void *priv, struct sk_buff *skb, static struct nf_hook_ops *rawtable_ops __read_mostly; -static int __net_init iptable_raw_net_init(struct net *net) +static int __net_init iptable_raw_table_init(struct net *net) { struct ipt_replace *repl; + int ret; + + if (net->ipv4.iptable_raw) + return 0; repl = ipt_alloc_initial_table(&packet_raw); if (repl == NULL) return -ENOMEM; - net->ipv4.iptable_raw = - ipt_register_table(net, &packet_raw, repl); + ret = ipt_register_table(net, &packet_raw, repl, rawtable_ops, + &net->ipv4.iptable_raw); kfree(repl); - return PTR_ERR_OR_ZERO(net->ipv4.iptable_raw); + return ret; } static void __net_exit iptable_raw_net_exit(struct net *net) { - ipt_unregister_table(net, net->ipv4.iptable_raw); + if (!net->ipv4.iptable_raw) + return; + ipt_unregister_table(net, net->ipv4.iptable_raw, rawtable_ops); + net->ipv4.iptable_raw = NULL; } static struct pernet_operations iptable_raw_net_ops = { - .init = iptable_raw_net_init, .exit = iptable_raw_net_exit, }; @@ -61,15 +70,20 @@ static int __init iptable_raw_init(void) { int ret; + rawtable_ops = xt_hook_ops_alloc(&packet_raw, iptable_raw_hook); + if (IS_ERR(rawtable_ops)) + return PTR_ERR(rawtable_ops); + ret = register_pernet_subsys(&iptable_raw_net_ops); - if (ret < 0) + if (ret < 0) { + kfree(rawtable_ops); return ret; + } - /* Register hooks */ - rawtable_ops = xt_hook_link(&packet_raw, iptable_raw_hook); - if (IS_ERR(rawtable_ops)) { - ret = PTR_ERR(rawtable_ops); + ret = iptable_raw_table_init(&init_net); + if (ret) { unregister_pernet_subsys(&iptable_raw_net_ops); + kfree(rawtable_ops); } return ret; @@ -77,8 +91,8 @@ static int __init iptable_raw_init(void) static void __exit iptable_raw_fini(void) { - xt_hook_unlink(&packet_raw, rawtable_ops); unregister_pernet_subsys(&iptable_raw_net_ops); + kfree(rawtable_ops); } module_init(iptable_raw_init); diff --git a/net/ipv4/netfilter/iptable_security.c b/net/ipv4/netfilter/iptable_security.c index c2e23d5e9..ff226596e 100644 --- a/net/ipv4/netfilter/iptable_security.c +++ b/net/ipv4/netfilter/iptable_security.c @@ -28,12 +28,15 @@ MODULE_DESCRIPTION("iptables security table, for MAC rules"); (1 << NF_INET_FORWARD) | \ (1 << NF_INET_LOCAL_OUT) +static int __net_init iptable_security_table_init(struct net *net); + static const struct xt_table security_table = { .name = "security", .valid_hooks = SECURITY_VALID_HOOKS, .me = THIS_MODULE, .af = NFPROTO_IPV4, .priority = NF_IP_PRI_SECURITY, + .table_init = iptable_security_table_init, }; static unsigned int @@ -51,26 +54,33 @@ iptable_security_hook(void *priv, struct sk_buff *skb, static struct nf_hook_ops *sectbl_ops __read_mostly; -static int __net_init iptable_security_net_init(struct net *net) +static int __net_init iptable_security_table_init(struct net *net) { struct ipt_replace *repl; + int ret; + + if (net->ipv4.iptable_security) + return 0; repl = ipt_alloc_initial_table(&security_table); if (repl == NULL) return -ENOMEM; - net->ipv4.iptable_security = - ipt_register_table(net, &security_table, repl); + ret = ipt_register_table(net, &security_table, repl, sectbl_ops, + &net->ipv4.iptable_security); kfree(repl); - return PTR_ERR_OR_ZERO(net->ipv4.iptable_security); + return ret; } static void __net_exit iptable_security_net_exit(struct net *net) { - ipt_unregister_table(net, net->ipv4.iptable_security); + if (!net->ipv4.iptable_security) + return; + + ipt_unregister_table(net, net->ipv4.iptable_security, sectbl_ops); + net->ipv4.iptable_security = NULL; } static struct pernet_operations iptable_security_net_ops = { - .init = iptable_security_net_init, .exit = iptable_security_net_exit, }; @@ -78,27 +88,29 @@ static int __init iptable_security_init(void) { int ret; + sectbl_ops = xt_hook_ops_alloc(&security_table, iptable_security_hook); + if (IS_ERR(sectbl_ops)) + return PTR_ERR(sectbl_ops); + ret = register_pernet_subsys(&iptable_security_net_ops); - if (ret < 0) + if (ret < 0) { + kfree(sectbl_ops); return ret; - - sectbl_ops = xt_hook_link(&security_table, iptable_security_hook); - if (IS_ERR(sectbl_ops)) { - ret = PTR_ERR(sectbl_ops); - goto cleanup_table; } - return ret; + ret = iptable_security_table_init(&init_net); + if (ret) { + unregister_pernet_subsys(&iptable_security_net_ops); + kfree(sectbl_ops); + } -cleanup_table: - unregister_pernet_subsys(&iptable_security_net_ops); return ret; } static void __exit iptable_security_fini(void) { - xt_hook_unlink(&security_table, sectbl_ops); unregister_pernet_subsys(&iptable_security_net_ops); + kfree(sectbl_ops); } module_init(iptable_security_init); diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c index a04dee536..d88da36b3 100644 --- a/net/ipv4/netfilter/nf_defrag_ipv4.c +++ b/net/ipv4/netfilter/nf_defrag_ipv4.c @@ -31,10 +31,8 @@ static int nf_ct_ipv4_gather_frags(struct net *net, struct sk_buff *skb, err = ip_defrag(net, skb, user); local_bh_enable(); - if (!err) { - ip_send_check(ip_hdr(skb)); + if (!err) skb->ignore_df = 1; - } return err; } diff --git a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c index 61c7cc22e..f8aad03d6 100644 --- a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c @@ -127,29 +127,15 @@ static void nf_nat_ipv4_csum_recalc(struct sk_buff *skb, u8 proto, void *data, __sum16 *check, int datalen, int oldlen) { - const struct iphdr *iph = ip_hdr(skb); - struct rtable *rt = skb_rtable(skb); - if (skb->ip_summed != CHECKSUM_PARTIAL) { - if (!(rt->rt_flags & RTCF_LOCAL) && - (!skb->dev || skb->dev->features & - (NETIF_F_IP_CSUM | NETIF_F_HW_CSUM))) { - skb->ip_summed = CHECKSUM_PARTIAL; - skb->csum_start = skb_headroom(skb) + - skb_network_offset(skb) + - ip_hdrlen(skb); - skb->csum_offset = (void *)check - data; - *check = ~csum_tcpudp_magic(iph->saddr, iph->daddr, - datalen, proto, 0); - } else { - *check = 0; - *check = csum_tcpudp_magic(iph->saddr, iph->daddr, - datalen, proto, - csum_partial(data, datalen, - 0)); - if (proto == IPPROTO_UDP && !*check) - *check = CSUM_MANGLED_0; - } + const struct iphdr *iph = ip_hdr(skb); + + skb->ip_summed = CHECKSUM_PARTIAL; + skb->csum_start = skb_headroom(skb) + skb_network_offset(skb) + + ip_hdrlen(skb); + skb->csum_offset = (void *)check - data; + *check = ~csum_tcpudp_magic(iph->saddr, iph->daddr, datalen, + proto, 0); } else inet_proto_csum_replace2(check, skb, htons(oldlen), htons(datalen), true); diff --git a/net/ipv4/netfilter/nft_masq_ipv4.c b/net/ipv4/netfilter/nft_masq_ipv4.c index b72ffc58e..51ced81b6 100644 --- a/net/ipv4/netfilter/nft_masq_ipv4.c +++ b/net/ipv4/netfilter/nft_masq_ipv4.c @@ -25,7 +25,12 @@ static void nft_masq_ipv4_eval(const struct nft_expr *expr, memset(&range, 0, sizeof(range)); range.flags = priv->flags; - + if (priv->sreg_proto_min) { + range.min_proto.all = + *(__be16 *)®s->data[priv->sreg_proto_min]; + range.max_proto.all = + *(__be16 *)®s->data[priv->sreg_proto_max]; + } regs->verdict.code = nf_nat_masquerade_ipv4(pkt->skb, pkt->hook, &range, pkt->out); } diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index d3a27165f..cf9700b1a 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -145,10 +145,12 @@ fail: } EXPORT_SYMBOL_GPL(ping_get_port); -void ping_hash(struct sock *sk) +int ping_hash(struct sock *sk) { pr_debug("ping_hash(sk->port=%u)\n", inet_sk(sk)->inet_num); BUG(); /* "Please do not press this button again." */ + + return 0; } void ping_unhash(struct sock *sk) @@ -1140,13 +1142,6 @@ static int ping_v4_seq_show(struct seq_file *seq, void *v) return 0; } -static const struct seq_operations ping_v4_seq_ops = { - .show = ping_v4_seq_show, - .start = ping_v4_seq_start, - .next = ping_seq_next, - .stop = ping_seq_stop, -}; - static int ping_seq_open(struct inode *inode, struct file *file) { struct ping_seq_afinfo *afinfo = PDE_DATA(inode); diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 3abd9d7a3..9f665b63a 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -390,7 +390,7 @@ static int snmp_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "\nIp: %d %d", IPV4_DEVCONF_ALL(net, FORWARDING) ? 1 : 2, - sysctl_ip_default_ttl); + net->ipv4.sysctl_ip_default_ttl); BUILD_BUG_ON(offsetof(struct ipstats_mib, mibs) != 0); for (i = 0; snmp4_ipstats_list[i].name != NULL; i++) diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 7113bae4e..8d22de740 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -93,7 +93,7 @@ static struct raw_hashinfo raw_v4_hashinfo = { .lock = __RW_LOCK_UNLOCKED(raw_v4_hashinfo.lock), }; -void raw_hash_sk(struct sock *sk) +int raw_hash_sk(struct sock *sk) { struct raw_hashinfo *h = sk->sk_prot->h.raw_hash; struct hlist_head *head; @@ -104,6 +104,8 @@ void raw_hash_sk(struct sock *sk) sk_add_node(sk, head); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); write_unlock_bh(&h->lock); + + return 0; } EXPORT_SYMBOL_GPL(raw_hash_sk); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 02c62299d..60398a937 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1438,9 +1438,9 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr, #endif } -static struct rtable *rt_dst_alloc(struct net_device *dev, - unsigned int flags, u16 type, - bool nopolicy, bool noxfrm, bool will_cache) +struct rtable *rt_dst_alloc(struct net_device *dev, + unsigned int flags, u16 type, + bool nopolicy, bool noxfrm, bool will_cache) { struct rtable *rt; @@ -1468,6 +1468,7 @@ static struct rtable *rt_dst_alloc(struct net_device *dev, return rt; } +EXPORT_SYMBOL(rt_dst_alloc); /* called in rcu_read_lock() section */ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, @@ -2045,6 +2046,18 @@ static struct rtable *__mkroute_output(const struct fib_result *res, */ if (fi && res->prefixlen < 4) fi = NULL; + } else if ((type == RTN_LOCAL) && (orig_oif != 0) && + (orig_oif != dev_out->ifindex)) { + /* For local routes that require a particular output interface + * we do not want to cache the result. Caching the result + * causes incorrect behaviour when there are multiple source + * addresses on the interface, the end result being that if the + * intended recipient is waiting on that interface for the + * packet he won't receive it because it will be delivered on + * the loopback interface and the IP_PKTINFO ipi_ifindex will + * be set to the loopback interface as well. + */ + fi = NULL; } fnhe = NULL; diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 643a86c49..4c04f0933 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -19,8 +19,6 @@ #include <net/tcp.h> #include <net/route.h> -extern int sysctl_tcp_syncookies; - static u32 syncookie_secret[2][16-4+SHA_DIGEST_WORDS] __read_mostly; #define COOKIEBITS 24 /* Upper bits store count */ @@ -50,8 +48,7 @@ static u32 syncookie_secret[2][16-4+SHA_DIGEST_WORDS] __read_mostly; #define TSBITS 6 #define TSMASK (((__u32)1 << TSBITS) - 1) -static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS], - ipv4_cookie_scratch); +static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS], ipv4_cookie_scratch); static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport, u32 count, int c) @@ -307,7 +304,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) __u8 rcv_wscale; struct flowi4 fl4; - if (!sysctl_tcp_syncookies || !th->ack || th->rst) + if (!sock_net(sk)->ipv4.sysctl_tcp_syncookies || !th->ack || th->rst) goto out; if (tcp_synq_no_recent_overflow(sk)) diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 4d367b413..1e1fe6086 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -283,31 +283,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, { - .procname = "ip_default_ttl", - .data = &sysctl_ip_default_ttl, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &ip_ttl_min, - .extra2 = &ip_ttl_max, - }, - { - .procname = "tcp_syn_retries", - .data = &sysctl_tcp_syn_retries, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &tcp_syn_retries_min, - .extra2 = &tcp_syn_retries_max - }, - { - .procname = "tcp_synack_retries", - .data = &sysctl_tcp_synack_retries, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, - { .procname = "tcp_max_orphans", .data = &sysctl_tcp_max_orphans, .maxlen = sizeof(int), @@ -322,51 +297,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, { - .procname = "ip_early_demux", - .data = &sysctl_ip_early_demux, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, - { - .procname = "ip_dynaddr", - .data = &sysctl_ip_dynaddr, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, - { - .procname = "tcp_retries1", - .data = &sysctl_tcp_retries1, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra2 = &tcp_retr1_max - }, - { - .procname = "tcp_retries2", - .data = &sysctl_tcp_retries2, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, - { - .procname = "tcp_fin_timeout", - .data = &sysctl_tcp_fin_timeout, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, -#ifdef CONFIG_SYN_COOKIES - { - .procname = "tcp_syncookies", - .data = &sysctl_tcp_syncookies, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, -#endif - { .procname = "tcp_fastopen", .data = &sysctl_tcp_fastopen, .maxlen = sizeof(int), @@ -415,30 +345,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, { - .procname = "igmp_max_memberships", - .data = &sysctl_igmp_max_memberships, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, - { - .procname = "igmp_max_msf", - .data = &sysctl_igmp_max_msf, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, -#ifdef CONFIG_IP_MULTICAST - { - .procname = "igmp_qrv", - .data = &sysctl_igmp_qrv, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &one - }, -#endif - { .procname = "inet_peer_threshold", .data = &inet_peer_threshold, .maxlen = sizeof(int), @@ -460,13 +366,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec_jiffies, }, { - .procname = "tcp_orphan_retries", - .data = &sysctl_tcp_orphan_retries, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, - { .procname = "tcp_fack", .data = &sysctl_tcp_fack, .maxlen = sizeof(int), @@ -481,13 +380,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec, }, { - .procname = "tcp_reordering", - .data = &sysctl_tcp_reordering, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, - { .procname = "tcp_max_reordering", .data = &sysctl_tcp_max_reordering, .maxlen = sizeof(int), @@ -517,13 +409,6 @@ static struct ctl_table ipv4_table[] = { .extra1 = &one, }, { - .procname = "tcp_notsent_lowat", - .data = &sysctl_tcp_notsent_lowat, - .maxlen = sizeof(sysctl_tcp_notsent_lowat), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { .procname = "tcp_rmem", .data = &sysctl_tcp_rmem, .maxlen = sizeof(sysctl_tcp_rmem), @@ -845,6 +730,29 @@ static struct ctl_table ipv4_net_table[] = { .proc_handler = proc_dointvec }, { + .procname = "ip_dynaddr", + .data = &init_net.ipv4.sysctl_ip_dynaddr, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "ip_early_demux", + .data = &init_net.ipv4.sysctl_ip_early_demux, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "ip_default_ttl", + .data = &init_net.ipv4.sysctl_ip_default_ttl, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &ip_ttl_min, + .extra2 = &ip_ttl_max, + }, + { .procname = "ip_local_port_range", .maxlen = sizeof(init_net.ipv4.ip_local_ports.range), .data = &init_net.ipv4.ip_local_ports.range, @@ -934,12 +842,36 @@ static struct ctl_table ipv4_net_table[] = { }, { .procname = "igmp_link_local_mcast_reports", - .data = &sysctl_igmp_llm_reports, + .data = &init_net.ipv4.sysctl_igmp_llm_reports, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "igmp_max_memberships", + .data = &init_net.ipv4.sysctl_igmp_max_memberships, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec }, { + .procname = "igmp_max_msf", + .data = &init_net.ipv4.sysctl_igmp_max_msf, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, +#ifdef CONFIG_IP_MULTICAST + { + .procname = "igmp_qrv", + .data = &init_net.ipv4.sysctl_igmp_qrv, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &one + }, +#endif + { .procname = "tcp_keepalive_time", .data = &init_net.ipv4.sysctl_tcp_keepalive_time, .maxlen = sizeof(int), @@ -960,6 +892,74 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, + { + .procname = "tcp_syn_retries", + .data = &init_net.ipv4.sysctl_tcp_syn_retries, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &tcp_syn_retries_min, + .extra2 = &tcp_syn_retries_max + }, + { + .procname = "tcp_synack_retries", + .data = &init_net.ipv4.sysctl_tcp_synack_retries, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, +#ifdef CONFIG_SYN_COOKIES + { + .procname = "tcp_syncookies", + .data = &init_net.ipv4.sysctl_tcp_syncookies, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, +#endif + { + .procname = "tcp_reordering", + .data = &init_net.ipv4.sysctl_tcp_reordering, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "tcp_retries1", + .data = &init_net.ipv4.sysctl_tcp_retries1, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra2 = &tcp_retr1_max + }, + { + .procname = "tcp_retries2", + .data = &init_net.ipv4.sysctl_tcp_retries2, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "tcp_orphan_retries", + .data = &init_net.ipv4.sysctl_tcp_orphan_retries, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "tcp_fin_timeout", + .data = &init_net.ipv4.sysctl_tcp_fin_timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "tcp_notsent_lowat", + .data = &init_net.ipv4.sysctl_tcp_notsent_lowat, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, { } }; @@ -988,6 +988,10 @@ static __net_init int ipv4_sysctl_init_net(struct net *net) if (!net->ipv4.sysctl_local_reserved_ports) goto err_ports; + net->ipv4.sysctl_ip_default_ttl = IPDEFTTL; + net->ipv4.sysctl_ip_dynaddr = 0; + net->ipv4.sysctl_ip_early_demux = 1; + return 0; err_ports: diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 0da50fe0a..32c9d8c25 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -247,6 +247,7 @@ #define pr_fmt(fmt) "TCP: " fmt +#include <crypto/hash.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/types.h> @@ -266,7 +267,6 @@ #include <linux/swap.h> #include <linux/cache.h> #include <linux/err.h> -#include <linux/crypto.h> #include <linux/time.h> #include <linux/slab.h> #include <linux/vmalloc.h> @@ -283,8 +283,6 @@ #include <asm/unaligned.h> #include <net/busy_poll.h> -int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT; - int sysctl_tcp_min_tso_segs __read_mostly = 2; int sysctl_tcp_autocorking __read_mostly = 1; @@ -407,7 +405,7 @@ void tcp_init_sock(struct sock *sk) tp->mss_cache = TCP_MSS_DEFAULT; u64_stats_init(&tp->syncp); - tp->reordering = sysctl_tcp_reordering; + tp->reordering = sock_net(sk)->ipv4.sysctl_tcp_reordering; tcp_enable_early_retrans(tp); tcp_assign_congestion_control(sk); @@ -559,20 +557,7 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg) return -EINVAL; slow = lock_sock_fast(sk); - if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) - answ = 0; - else if (sock_flag(sk, SOCK_URGINLINE) || - !tp->urg_data || - before(tp->urg_seq, tp->copied_seq) || - !before(tp->urg_seq, tp->rcv_nxt)) { - - answ = tp->rcv_nxt - tp->copied_seq; - - /* Subtract 1, if FIN was received */ - if (answ && sock_flag(sk, SOCK_DONE)) - answ--; - } else - answ = tp->urg_seq - tp->copied_seq; + answ = tcp_inq(sk); unlock_sock_fast(sk, slow); break; case SIOCATMARK: @@ -1467,8 +1452,10 @@ static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off) while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) { offset = seq - TCP_SKB_CB(skb)->seq; - if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN) + if (unlikely(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) { + pr_err_once("%s: found a SYN, please report !\n", __func__); offset--; + } if (offset < skb->len || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)) { *off = offset; return skb; @@ -1658,8 +1645,10 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, break; offset = *seq - TCP_SKB_CB(skb)->seq; - if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN) + if (unlikely(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) { + pr_err_once("%s: found a SYN, please report !\n", __func__); offset--; + } if (offset < skb->len) goto found_ok_skb; if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) @@ -2364,6 +2353,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, { struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); + struct net *net = sock_net(sk); int val; int err = 0; @@ -2620,7 +2610,7 @@ stealth_integrity_out_1: case TCP_LINGER2: if (val < 0) tp->linger2 = -1; - else if (val > sysctl_tcp_fin_timeout / HZ) + else if (val > net->ipv4.sysctl_tcp_fin_timeout / HZ) tp->linger2 = 0; else tp->linger2 = val * HZ; @@ -2749,6 +2739,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) const struct inet_connection_sock *icsk = inet_csk(sk); u32 now = tcp_time_stamp; unsigned int start; + int notsent_bytes; u64 rate64; u32 rate; @@ -2829,6 +2820,13 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) } while (u64_stats_fetch_retry_irq(&tp->syncp, start)); info->tcpi_segs_out = tp->segs_out; info->tcpi_segs_in = tp->segs_in; + + notsent_bytes = READ_ONCE(tp->write_seq) - READ_ONCE(tp->snd_nxt); + info->tcpi_notsent_bytes = max(0, notsent_bytes); + + info->tcpi_min_rtt = tcp_min_rtt(tp); + info->tcpi_data_segs_in = tp->data_segs_in; + info->tcpi_data_segs_out = tp->data_segs_out; } EXPORT_SYMBOL_GPL(tcp_get_info); @@ -2837,6 +2835,7 @@ static int do_tcp_getsockopt(struct sock *sk, int level, { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); + struct net *net = sock_net(sk); int val, len; if (get_user(len, optlen)) @@ -2871,12 +2870,12 @@ static int do_tcp_getsockopt(struct sock *sk, int level, val = keepalive_probes(tp); break; case TCP_SYNCNT: - val = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries; + val = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_syn_retries; break; case TCP_LINGER2: val = tp->linger2; if (val >= 0) - val = (val ? : sysctl_tcp_fin_timeout) / HZ; + val = (val ? : net->ipv4.sysctl_tcp_fin_timeout) / HZ; break; case TCP_DEFER_ACCEPT: val = retrans_to_secs(icsk->icsk_accept_queue.rskq_defer_accept, @@ -3053,17 +3052,26 @@ static bool tcp_md5sig_pool_populated = false; static void __tcp_alloc_md5sig_pool(void) { + struct crypto_ahash *hash; int cpu; + hash = crypto_alloc_ahash("md5", 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(hash)) + return; + for_each_possible_cpu(cpu) { - if (!per_cpu(tcp_md5sig_pool, cpu).md5_desc.tfm) { - struct crypto_hash *hash; + struct ahash_request *req; - hash = crypto_alloc_hash("md5", 0, CRYPTO_ALG_ASYNC); - if (IS_ERR(hash)) - return; - per_cpu(tcp_md5sig_pool, cpu).md5_desc.tfm = hash; - } + if (per_cpu(tcp_md5sig_pool, cpu).md5_req) + continue; + + req = ahash_request_alloc(hash, GFP_KERNEL); + if (!req) + return; + + ahash_request_set_callback(req, 0, NULL, NULL); + + per_cpu(tcp_md5sig_pool, cpu).md5_req = req; } /* before setting tcp_md5sig_pool_populated, we must commit all writes * to memory. See smp_rmb() in tcp_get_md5sig_pool() @@ -3113,7 +3121,6 @@ int tcp_md5_hash_header(struct tcp_md5sig_pool *hp, { struct scatterlist sg; struct tcphdr hdr; - int err; /* We are not allowed to change tcphdr, make a local copy */ memcpy(&hdr, th, sizeof(hdr)); @@ -3121,8 +3128,8 @@ int tcp_md5_hash_header(struct tcp_md5sig_pool *hp, /* options aren't included in the hash */ sg_init_one(&sg, &hdr, sizeof(hdr)); - err = crypto_hash_update(&hp->md5_desc, &sg, sizeof(hdr)); - return err; + ahash_request_set_crypt(hp->md5_req, &sg, NULL, sizeof(hdr)); + return crypto_ahash_update(hp->md5_req); } EXPORT_SYMBOL(tcp_md5_hash_header); @@ -3131,7 +3138,7 @@ int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp, { struct scatterlist sg; const struct tcphdr *tp = tcp_hdr(skb); - struct hash_desc *desc = &hp->md5_desc; + struct ahash_request *req = hp->md5_req; unsigned int i; const unsigned int head_data_len = skb_headlen(skb) > header_len ? skb_headlen(skb) - header_len : 0; @@ -3141,7 +3148,8 @@ int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp, sg_init_table(&sg, 1); sg_set_buf(&sg, ((u8 *) tp) + header_len, head_data_len); - if (crypto_hash_update(desc, &sg, head_data_len)) + ahash_request_set_crypt(req, &sg, NULL, head_data_len); + if (crypto_ahash_update(req)) return 1; for (i = 0; i < shi->nr_frags; ++i) { @@ -3151,7 +3159,8 @@ int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp, sg_set_page(&sg, page, skb_frag_size(f), offset_in_page(offset)); - if (crypto_hash_update(desc, &sg, skb_frag_size(f))) + ahash_request_set_crypt(req, &sg, NULL, skb_frag_size(f)); + if (crypto_ahash_update(req)) return 1; } @@ -3168,7 +3177,8 @@ int tcp_md5_hash_key(struct tcp_md5sig_pool *hp, const struct tcp_md5sig_key *ke struct scatterlist sg; sg_init_one(&sg, key->key, key->keylen); - return crypto_hash_update(&hp->md5_desc, &sg, key->keylen); + ahash_request_set_crypt(hp->md5_req, &sg, NULL, key->keylen); + return crypto_ahash_update(hp->md5_req); } EXPORT_SYMBOL(tcp_md5_hash_key); diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index 55be6ac70..cffd8f9ed 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -1,3 +1,4 @@ +#include <linux/crypto.h> #include <linux/err.h> #include <linux/init.h> #include <linux/kernel.h> @@ -124,6 +125,49 @@ static bool tcp_fastopen_cookie_gen(struct request_sock *req, return false; } + +/* If an incoming SYN or SYNACK frame contains a payload and/or FIN, + * queue this additional data / FIN. + */ +void tcp_fastopen_add_skb(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (TCP_SKB_CB(skb)->end_seq == tp->rcv_nxt) + return; + + skb = skb_clone(skb, GFP_ATOMIC); + if (!skb) + return; + + skb_dst_drop(skb); + /* segs_in has been initialized to 1 in tcp_create_openreq_child(). + * Hence, reset segs_in to 0 before calling tcp_segs_in() + * to avoid double counting. Also, tcp_segs_in() expects + * skb->len to include the tcp_hdrlen. Hence, it should + * be called before __skb_pull(). + */ + tp->segs_in = 0; + tcp_segs_in(tp, skb); + __skb_pull(skb, tcp_hdrlen(skb)); + skb_set_owner_r(skb, sk); + + TCP_SKB_CB(skb)->seq++; + TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_SYN; + + tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; + __skb_queue_tail(&sk->sk_receive_queue, skb); + tp->syn_data_acked = 1; + + /* u64_stats_update_begin(&tp->syncp) not needed here, + * as we certainly are not changing upper 32bit value (0) + */ + tp->bytes_received = skb->len; + + if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) + tcp_fin(sk); +} + static struct sock *tcp_fastopen_create_child(struct sock *sk, struct sk_buff *skb, struct dst_entry *dst, @@ -132,7 +176,6 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk, struct tcp_sock *tp; struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue; struct sock *child; - u32 end_seq; bool own_req; req->num_retrans = 0; @@ -178,35 +221,11 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk, tcp_init_metrics(child); tcp_init_buffer_space(child); - /* Queue the data carried in the SYN packet. - * We used to play tricky games with skb_get(). - * With lockless listener, it is a dead end. - * Do not think about it. - * - * XXX (TFO) - we honor a zero-payload TFO request for now, - * (any reason not to?) but no need to queue the skb since - * there is no data. How about SYN+FIN? - */ - end_seq = TCP_SKB_CB(skb)->end_seq; - if (end_seq != TCP_SKB_CB(skb)->seq + 1) { - struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); - - if (likely(skb2)) { - skb_dst_drop(skb2); - __skb_pull(skb2, tcp_hdrlen(skb)); - skb_set_owner_r(skb2, child); - __skb_queue_tail(&child->sk_receive_queue, skb2); - tp->syn_data_acked = 1; - - /* u64_stats_update_begin(&tp->syncp) not needed here, - * as we certainly are not changing upper 32bit value (0) - */ - tp->bytes_received = end_seq - TCP_SKB_CB(skb)->seq - 1; - } else { - end_seq = TCP_SKB_CB(skb)->seq + 1; - } - } - tcp_rsk(req)->rcv_nxt = tp->rcv_nxt = end_seq; + tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1; + + tcp_fastopen_add_skb(child, skb); + + tcp_rsk(req)->rcv_nxt = tp->rcv_nxt; /* tcp_conn_request() is sending the SYNACK, * and queues the child into listener accept queue. */ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index b3a47bc08..131005f82 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -83,9 +83,7 @@ EXPORT_SYMBOL(sysctl_tcp_timestamps); int sysctl_tcp_window_scaling __read_mostly = 1; int sysctl_tcp_sack __read_mostly = 1; int sysctl_tcp_fack __read_mostly = 1; -int sysctl_tcp_reordering __read_mostly = TCP_FASTRETRANS_THRESH; int sysctl_tcp_max_reordering __read_mostly = 300; -EXPORT_SYMBOL(sysctl_tcp_reordering); int sysctl_tcp_dsack __read_mostly = 1; int sysctl_tcp_app_win __read_mostly = 31; int sysctl_tcp_adv_win_scale __read_mostly = 1; @@ -129,6 +127,10 @@ int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2; #define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH) #define TCP_HP_BITS (~(TCP_RESERVED_BITS|TCP_FLAG_PSH)) +#define REXMIT_NONE 0 /* no loss recovery to do */ +#define REXMIT_LOST 1 /* retransmit packets marked lost */ +#define REXMIT_NEW 2 /* FRTO-style transmit of unsent/new packets */ + /* Adapt the MSS value used to make delayed ack decision to the * real world. */ @@ -1213,6 +1215,7 @@ static u8 tcp_sacktag_one(struct sock *sk, sacked |= TCPCB_SACKED_ACKED; state->flag |= FLAG_DATA_SACKED; tp->sacked_out += pcount; + tp->delivered += pcount; /* Out-of-order packets delivered */ fack_count += pcount; @@ -1309,6 +1312,7 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, if (skb == tcp_highest_sack(sk)) tcp_advance_highest_sack(sk, skb); + tcp_skb_collapse_tstamp(prev, skb); tcp_unlink_write_queue(skb, sk); sk_wmem_free_skb(sk, skb); @@ -1824,8 +1828,12 @@ static void tcp_check_reno_reordering(struct sock *sk, const int addend) static void tcp_add_reno_sack(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); + u32 prior_sacked = tp->sacked_out; + tp->sacked_out++; tcp_check_reno_reordering(sk, 0); + if (tp->sacked_out > prior_sacked) + tp->delivered++; /* Some out-of-order packet is delivered */ tcp_verify_left_out(tp); } @@ -1837,6 +1845,7 @@ static void tcp_remove_reno_sacks(struct sock *sk, int acked) if (acked > 0) { /* One ACK acked hole. The rest eat duplicate ACKs. */ + tp->delivered += max_t(int, acked - tp->sacked_out, 1); if (acked - 1 >= tp->sacked_out) tp->sacked_out = 0; else @@ -1876,6 +1885,7 @@ void tcp_enter_loss(struct sock *sk) { const struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); + struct net *net = sock_net(sk); struct sk_buff *skb; bool new_recovery = icsk->icsk_ca_state < TCP_CA_Recovery; bool is_reneg; /* is receiver reneging on SACKs? */ @@ -1926,9 +1936,9 @@ void tcp_enter_loss(struct sock *sk) * suggests that the degree of reordering is over-estimated. */ if (icsk->icsk_ca_state <= TCP_CA_Disorder && - tp->sacked_out >= sysctl_tcp_reordering) + tp->sacked_out >= net->ipv4.sysctl_tcp_reordering) tp->reordering = min_t(unsigned int, tp->reordering, - sysctl_tcp_reordering); + net->ipv4.sysctl_tcp_reordering); tcp_set_ca_state(sk, TCP_CA_Loss); tp->high_seq = tp->snd_nxt; tcp_ecn_queue_cwr(tp); @@ -2112,6 +2122,7 @@ static bool tcp_time_to_recover(struct sock *sk, int flag) { struct tcp_sock *tp = tcp_sk(sk); __u32 packets_out; + int tcp_reordering = sock_net(sk)->ipv4.sysctl_tcp_reordering; /* Trick#1: The loss is proven. */ if (tp->lost_out) @@ -2126,7 +2137,7 @@ static bool tcp_time_to_recover(struct sock *sk, int flag) */ packets_out = tp->packets_out; if (packets_out <= tp->reordering && - tp->sacked_out >= max_t(__u32, packets_out/2, sysctl_tcp_reordering) && + tp->sacked_out >= max_t(__u32, packets_out/2, tcp_reordering) && !tcp_may_send_now(sk)) { /* We have nothing to send. This connection is limited * either by receiver window or by application. @@ -2470,14 +2481,12 @@ static void tcp_init_cwnd_reduction(struct sock *sk) tcp_ecn_queue_cwr(tp); } -static void tcp_cwnd_reduction(struct sock *sk, const int prior_unsacked, - int fast_rexmit, int flag) +static void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, + int flag) { struct tcp_sock *tp = tcp_sk(sk); int sndcnt = 0; int delta = tp->snd_ssthresh - tcp_packets_in_flight(tp); - int newly_acked_sacked = prior_unsacked - - (tp->packets_out - tp->sacked_out); if (newly_acked_sacked <= 0 || WARN_ON_ONCE(!tp->prior_cwnd)) return; @@ -2495,7 +2504,8 @@ static void tcp_cwnd_reduction(struct sock *sk, const int prior_unsacked, } else { sndcnt = min(delta, newly_acked_sacked); } - sndcnt = max(sndcnt, (fast_rexmit ? 1 : 0)); + /* Force a fast retransmit upon entering fast recovery */ + sndcnt = max(sndcnt, (tp->prr_out ? 0 : 1)); tp->snd_cwnd = tcp_packets_in_flight(tp) + sndcnt; } @@ -2540,7 +2550,7 @@ static void tcp_try_keep_open(struct sock *sk) } } -static void tcp_try_to_open(struct sock *sk, int flag, const int prior_unsacked) +static void tcp_try_to_open(struct sock *sk, int flag) { struct tcp_sock *tp = tcp_sk(sk); @@ -2554,8 +2564,6 @@ static void tcp_try_to_open(struct sock *sk, int flag, const int prior_unsacked) if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) { tcp_try_keep_open(sk); - } else { - tcp_cwnd_reduction(sk, prior_unsacked, 0, flag); } } @@ -2665,7 +2673,8 @@ static void tcp_enter_recovery(struct sock *sk, bool ece_ack) /* Process an ACK in CA_Loss state. Move to CA_Open if lost data are * recovered or spurious. Otherwise retransmits more on partial ACKs. */ -static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack) +static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack, + int *rexmit) { struct tcp_sock *tp = tcp_sk(sk); bool recovered = !before(tp->snd_una, tp->high_seq); @@ -2687,10 +2696,15 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack) tp->frto = 0; /* Step 3.a. loss was real */ } else if (flag & FLAG_SND_UNA_ADVANCED && !recovered) { tp->high_seq = tp->snd_nxt; - __tcp_push_pending_frames(sk, tcp_current_mss(sk), - TCP_NAGLE_OFF); - if (after(tp->snd_nxt, tp->high_seq)) - return; /* Step 2.b */ + /* Step 2.b. Try send new data (but deferred until cwnd + * is updated in tcp_ack()). Otherwise fall back to + * the conventional recovery. + */ + if (tcp_send_head(sk) && + after(tcp_wnd_end(tp), tp->snd_nxt)) { + *rexmit = REXMIT_NEW; + return; + } tp->frto = 0; } } @@ -2709,12 +2723,11 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack) else if (flag & FLAG_SND_UNA_ADVANCED) tcp_reset_reno_sack(tp); } - tcp_xmit_retransmit_queue(sk); + *rexmit = REXMIT_LOST; } /* Undo during fast recovery after partial ACK. */ -static bool tcp_try_undo_partial(struct sock *sk, const int acked, - const int prior_unsacked, int flag) +static bool tcp_try_undo_partial(struct sock *sk, const int acked) { struct tcp_sock *tp = tcp_sk(sk); @@ -2729,10 +2742,8 @@ static bool tcp_try_undo_partial(struct sock *sk, const int acked, * can undo. Otherwise we clock out new packets but do not * mark more packets lost or retransmit more. */ - if (tp->retrans_out) { - tcp_cwnd_reduction(sk, prior_unsacked, 0, flag); + if (tp->retrans_out) return true; - } if (!tcp_any_retrans_done(sk)) tp->retrans_stamp = 0; @@ -2751,21 +2762,21 @@ static bool tcp_try_undo_partial(struct sock *sk, const int acked, * taking into account both packets sitting in receiver's buffer and * packets lost by network. * - * Besides that it does CWND reduction, when packet loss is detected - * and changes state of machine. + * Besides that it updates the congestion state when packet loss or ECN + * is detected. But it does not reduce the cwnd, it is done by the + * congestion control later. * * It does _not_ decide what to send, it is made in function * tcp_xmit_retransmit_queue(). */ static void tcp_fastretrans_alert(struct sock *sk, const int acked, - const int prior_unsacked, - bool is_dupack, int flag) + bool is_dupack, int *ack_flag, int *rexmit) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); + int fast_rexmit = 0, flag = *ack_flag; bool do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) && (tcp_fackets_out(tp) > tp->reordering)); - int fast_rexmit = 0; if (WARN_ON(!tp->packets_out && tp->sacked_out)) tp->sacked_out = 0; @@ -2812,8 +2823,10 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked, /* Use RACK to detect loss */ if (sysctl_tcp_recovery & TCP_RACK_LOST_RETRANS && - tcp_rack_mark_lost(sk)) + tcp_rack_mark_lost(sk)) { flag |= FLAG_LOST_RETRANS; + *ack_flag |= FLAG_LOST_RETRANS; + } /* E. Process state. */ switch (icsk->icsk_ca_state) { @@ -2822,7 +2835,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked, if (tcp_is_reno(tp) && is_dupack) tcp_add_reno_sack(sk); } else { - if (tcp_try_undo_partial(sk, acked, prior_unsacked, flag)) + if (tcp_try_undo_partial(sk, acked)) return; /* Partial ACK arrived. Force fast retransmit. */ do_lost = tcp_is_reno(tp) || @@ -2834,7 +2847,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked, } break; case TCP_CA_Loss: - tcp_process_loss(sk, flag, is_dupack); + tcp_process_loss(sk, flag, is_dupack, rexmit); if (icsk->icsk_ca_state != TCP_CA_Open && !(flag & FLAG_LOST_RETRANS)) return; @@ -2851,7 +2864,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked, tcp_try_undo_dsack(sk); if (!tcp_time_to_recover(sk, flag)) { - tcp_try_to_open(sk, flag, prior_unsacked); + tcp_try_to_open(sk, flag); return; } @@ -2873,8 +2886,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked, if (do_lost) tcp_update_scoreboard(sk, fast_rexmit); - tcp_cwnd_reduction(sk, prior_unsacked, fast_rexmit, flag); - tcp_xmit_retransmit_queue(sk); + *rexmit = REXMIT_LOST; } /* Kathleen Nichols' algorithm for tracking the minimum value of @@ -3090,7 +3102,8 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb, shinfo = skb_shinfo(skb); if ((shinfo->tx_flags & SKBTX_ACK_TSTAMP) && - between(shinfo->tskey, prior_snd_una, tcp_sk(sk)->snd_una - 1)) + !before(shinfo->tskey, prior_snd_una) && + before(shinfo->tskey, tcp_sk(sk)->snd_una)) __skb_tstamp_tx(skb, NULL, sk, SCM_TSTAMP_ACK); } @@ -3099,7 +3112,7 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb, * arrived at the other end. */ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, - u32 prior_snd_una, + u32 prior_snd_una, int *acked, struct tcp_sacktag_state *sack) { const struct inet_connection_sock *icsk = inet_csk(sk); @@ -3157,10 +3170,13 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, flag |= FLAG_ORIG_SACK_ACKED; } - if (sacked & TCPCB_SACKED_ACKED) + if (sacked & TCPCB_SACKED_ACKED) { tp->sacked_out -= acked_pcount; - else if (tcp_is_sack(tp) && !tcp_skb_spurious_retrans(tp, skb)) - tcp_rack_advance(tp, &skb->skb_mstamp, sacked); + } else if (tcp_is_sack(tp)) { + tp->delivered += acked_pcount; + if (!tcp_skb_spurious_retrans(tp, skb)) + tcp_rack_advance(tp, &skb->skb_mstamp, sacked); + } if (sacked & TCPCB_LOST) tp->lost_out -= acked_pcount; @@ -3269,6 +3285,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, } } #endif + *acked = pkts_acked; return flag; } @@ -3302,21 +3319,36 @@ static inline bool tcp_ack_is_dubious(const struct sock *sk, const int flag) /* Decide wheather to run the increase function of congestion control. */ static inline bool tcp_may_raise_cwnd(const struct sock *sk, const int flag) { - if (tcp_in_cwnd_reduction(sk)) - return false; - /* If reordering is high then always grow cwnd whenever data is * delivered regardless of its ordering. Otherwise stay conservative * and only grow cwnd on in-order delivery (RFC5681). A stretched ACK w/ * new SACK or ECE mark may first advance cwnd here and later reduce * cwnd in tcp_fastretrans_alert() based on more states. */ - if (tcp_sk(sk)->reordering > sysctl_tcp_reordering) + if (tcp_sk(sk)->reordering > sock_net(sk)->ipv4.sysctl_tcp_reordering) return flag & FLAG_FORWARD_PROGRESS; return flag & FLAG_DATA_ACKED; } +/* The "ultimate" congestion control function that aims to replace the rigid + * cwnd increase and decrease control (tcp_cong_avoid,tcp_*cwnd_reduction). + * It's called toward the end of processing an ACK with precise rate + * information. All transmission or retransmission are delayed afterwards. + */ +static void tcp_cong_control(struct sock *sk, u32 ack, u32 acked_sacked, + int flag) +{ + if (tcp_in_cwnd_reduction(sk)) { + /* Reduce cwnd if state mandates */ + tcp_cwnd_reduction(sk, acked_sacked, flag); + } else if (tcp_may_raise_cwnd(sk, flag)) { + /* Advance cwnd if state allows */ + tcp_cong_avoid(sk, ack, acked_sacked); + } + tcp_update_pacing_rate(sk); +} + /* Check that window update is acceptable. * The function assumes that snd_una<=ack<=snd_next. */ @@ -3512,6 +3544,27 @@ static inline void tcp_in_ack_event(struct sock *sk, u32 flags) icsk->icsk_ca_ops->in_ack_event(sk, flags); } +/* Congestion control has updated the cwnd already. So if we're in + * loss recovery then now we do any new sends (for FRTO) or + * retransmits (for CA_Loss or CA_recovery) that make sense. + */ +static void tcp_xmit_recovery(struct sock *sk, int rexmit) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (rexmit == REXMIT_NONE) + return; + + if (unlikely(rexmit == 2)) { + __tcp_push_pending_frames(sk, tcp_current_mss(sk), + TCP_NAGLE_OFF); + if (after(tp->snd_nxt, tp->high_seq)) + return; + tp->frto = 0; + } + tcp_xmit_retransmit_queue(sk); +} + /* This routine deals with incoming acks, but not outgoing ones. */ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) { @@ -3524,8 +3577,9 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) bool is_dupack = false; u32 prior_fackets; int prior_packets = tp->packets_out; - const int prior_unsacked = tp->packets_out - tp->sacked_out; + u32 prior_delivered = tp->delivered; int acked = 0; /* Number of packets newly acked */ + int rexmit = REXMIT_NONE; /* Flag to (re)transmit to recover losses */ sack_state.first_sackt.v64 = 0; @@ -3614,23 +3668,16 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) goto no_queue; /* See if we can take anything off of the retransmit queue. */ - acked = tp->packets_out; - flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, + flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, &acked, &sack_state); - acked -= tp->packets_out; if (tcp_ack_is_dubious(sk, flag)) { is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP)); - tcp_fastretrans_alert(sk, acked, prior_unsacked, - is_dupack, flag); + tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit); } if (tp->tlp_high_seq) tcp_process_tlp_ack(sk, ack, flag); - /* Advance cwnd if state allows */ - if (tcp_may_raise_cwnd(sk, flag)) - tcp_cong_avoid(sk, ack, acked); - if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) { struct dst_entry *dst = __sk_dst_get(sk); if (dst) @@ -3639,14 +3686,14 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) if (icsk->icsk_pending == ICSK_TIME_RETRANS) tcp_schedule_loss_probe(sk); - tcp_update_pacing_rate(sk); + tcp_cong_control(sk, ack, tp->delivered - prior_delivered, flag); + tcp_xmit_recovery(sk, rexmit); return 1; no_queue: /* If data was DSACKed, see if we can undo a cwnd reduction. */ if (flag & FLAG_DSACKING_ACK) - tcp_fastretrans_alert(sk, acked, prior_unsacked, - is_dupack, flag); + tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit); /* If this ack opens up a zero window, clear backoff. It was * being used to time the probes, and is probably far higher than * it needs to be for normal retransmission. @@ -3669,8 +3716,8 @@ old_ack: if (TCP_SKB_CB(skb)->sacked) { flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, &sack_state); - tcp_fastretrans_alert(sk, acked, prior_unsacked, - is_dupack, flag); + tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit); + tcp_xmit_recovery(sk, rexmit); } SOCK_DEBUG(sk, "Ack %u before %u:%u\n", ack, tp->snd_una, tp->snd_nxt); @@ -4042,7 +4089,7 @@ void tcp_reset(struct sock *sk) * * If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT. */ -static void tcp_fin(struct sock *sk) +void tcp_fin(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); @@ -5598,6 +5645,9 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack, tp->syn_data_acked = tp->syn_data; if (tp->syn_data_acked) NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVE); + + tcp_fastopen_add_skb(sk, synack); + return false; } @@ -6204,9 +6254,10 @@ static bool tcp_syn_flood_action(const struct sock *sk, struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue; const char *msg = "Dropping request"; bool want_cookie = false; + struct net *net = sock_net(sk); #ifdef CONFIG_SYN_COOKIES - if (sysctl_tcp_syncookies) { + if (net->ipv4.sysctl_tcp_syncookies) { msg = "Sending cookies"; want_cookie = true; NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES); @@ -6215,7 +6266,7 @@ static bool tcp_syn_flood_action(const struct sock *sk, NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP); if (!queue->synflood_warned && - sysctl_tcp_syncookies != 2 && + net->ipv4.sysctl_tcp_syncookies != 2 && xchg(&queue->synflood_warned, 1) == 0) pr_info("%s: Possible SYN flooding on port %d. %s. Check SNMP counters.\n", proto, ntohs(tcp_hdr(skb)->dest), msg); @@ -6248,6 +6299,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, __u32 isn = TCP_SKB_CB(skb)->tcp_tw_isn; struct tcp_options_received tmp_opt; struct tcp_sock *tp = tcp_sk(sk); + struct net *net = sock_net(sk); struct sock *fastopen_sk = NULL; struct dst_entry *dst = NULL; struct request_sock *req; @@ -6258,7 +6310,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, * limitations, they conserve resources and peer is * evidently real one. */ - if ((sysctl_tcp_syncookies == 2 || + if ((net->ipv4.sysctl_tcp_syncookies == 2 || inet_csk_reqsk_queue_is_full(sk)) && !isn) { want_cookie = tcp_syn_flood_action(sk, skb, rsk_ops->slab_name); if (!want_cookie) @@ -6324,7 +6376,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, } } /* Kill the following clause, if you dislike this way. */ - else if (!sysctl_tcp_syncookies && + else if (!net->ipv4.sysctl_tcp_syncookies && (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) < (sysctl_max_syn_backlog >> 2)) && !tcp_peer_is_proven(req, dst, false, diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index b8f3908dd..dfd153fbd 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -82,7 +82,7 @@ #include <linux/proc_fs.h> #include <linux/seq_file.h> -#include <linux/crypto.h> +#include <crypto/hash.h> #include <linux/scatterlist.h> int sysctl_tcp_tw_reuse __read_mostly; @@ -656,8 +656,8 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) * Incoming packet is checked with md5 hash with finding key, * no RST generated if md5 hash doesn't match. */ - sk1 = __inet_lookup_listener(net, - &tcp_hashinfo, ip_hdr(skb)->saddr, + sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0, + ip_hdr(skb)->saddr, th->source, ip_hdr(skb)->daddr, ntohs(th->source), inet_iif(skb)); /* don't send rst if it can't find key */ @@ -879,7 +879,6 @@ static void tcp_v4_reqsk_destructor(struct request_sock *req) kfree(inet_rsk(req)->opt); } - #ifdef CONFIG_TCP_MD5SIG /* * RFC2385 MD5 checksumming requires a mapping of @@ -1053,21 +1052,22 @@ static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp, bp->len = cpu_to_be16(nbytes); sg_init_one(&sg, bp, sizeof(*bp)); - return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp)); + ahash_request_set_crypt(hp->md5_req, &sg, NULL, sizeof(*bp)); + return crypto_ahash_update(hp->md5_req); } static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, __be32 daddr, __be32 saddr, const struct tcphdr *th) { struct tcp_md5sig_pool *hp; - struct hash_desc *desc; + struct ahash_request *req; hp = tcp_get_md5sig_pool(); if (!hp) goto clear_hash_noput; - desc = &hp->md5_desc; + req = hp->md5_req; - if (crypto_hash_init(desc)) + if (crypto_ahash_init(req)) goto clear_hash; if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2)) goto clear_hash; @@ -1075,7 +1075,8 @@ static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, goto clear_hash; if (tcp_md5_hash_key(hp, key)) goto clear_hash; - if (crypto_hash_final(desc, md5_hash)) + ahash_request_set_crypt(req, NULL, md5_hash, 0); + if (crypto_ahash_final(req)) goto clear_hash; tcp_put_md5sig_pool(); @@ -1093,7 +1094,7 @@ int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key, const struct sk_buff *skb) { struct tcp_md5sig_pool *hp; - struct hash_desc *desc; + struct ahash_request *req; const struct tcphdr *th = tcp_hdr(skb); __be32 saddr, daddr; @@ -1109,9 +1110,9 @@ int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key, hp = tcp_get_md5sig_pool(); if (!hp) goto clear_hash_noput; - desc = &hp->md5_desc; + req = hp->md5_req; - if (crypto_hash_init(desc)) + if (crypto_ahash_init(req)) goto clear_hash; if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len)) @@ -1122,7 +1123,8 @@ int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key, goto clear_hash; if (tcp_md5_hash_key(hp, key)) goto clear_hash; - if (crypto_hash_final(desc, md5_hash)) + ahash_request_set_crypt(req, NULL, md5_hash, 0); + if (crypto_ahash_final(req)) goto clear_hash; tcp_put_md5sig_pool(); @@ -1612,7 +1614,8 @@ int tcp_v4_rcv(struct sk_buff *skb) TCP_SKB_CB(skb)->sacked = 0; lookup: - sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest); + sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source, + th->dest); if (!sk) goto no_tcp_socket; @@ -1675,7 +1678,7 @@ process: sk_incoming_cpu_update(sk); bh_lock_sock_nested(sk); - tcp_sk(sk)->segs_in += max_t(u16, 1, skb_shinfo(skb)->gso_segs); + tcp_segs_in(tcp_sk(sk), skb); ret = 0; if (!sock_owned_by_user(sk)) { if (!tcp_prequeue(sk, skb)) @@ -1728,7 +1731,8 @@ do_time_wait: switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) { case TCP_TW_SYN: { struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev), - &tcp_hashinfo, + &tcp_hashinfo, skb, + __tcp_hdrlen(th), iph->saddr, th->source, iph->daddr, th->dest, inet_iif(skb)); @@ -2420,6 +2424,16 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES; net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL; + net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES; + net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES; + net->ipv4.sysctl_tcp_syncookies = 1; + net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH; + net->ipv4.sysctl_tcp_retries1 = TCP_RETR1; + net->ipv4.sysctl_tcp_retries2 = TCP_RETR2; + net->ipv4.sysctl_tcp_orphan_retries = 0; + net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT; + net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX; + return 0; fail: tcp_sk_exit(net); diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index a726d7853..7b7eec439 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -369,6 +369,7 @@ void tcp_update_metrics(struct sock *sk) const struct inet_connection_sock *icsk = inet_csk(sk); struct dst_entry *dst = __sk_dst_get(sk); struct tcp_sock *tp = tcp_sk(sk); + struct net *net = sock_net(sk); struct tcp_metrics_block *tm; unsigned long rtt; u32 val; @@ -473,7 +474,7 @@ void tcp_update_metrics(struct sock *sk) if (!tcp_metric_locked(tm, TCP_METRIC_REORDERING)) { val = tcp_metric_get(tm, TCP_METRIC_REORDERING); if (val < tp->reordering && - tp->reordering != sysctl_tcp_reordering) + tp->reordering != net->ipv4.sysctl_tcp_reordering) tcp_metric_set(tm, TCP_METRIC_REORDERING, tp->reordering); } diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 9b02af213..acb366dd6 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -27,9 +27,6 @@ #include <net/inet_common.h> #include <net/xfrm.h> -int sysctl_tcp_syncookies __read_mostly = 1; -EXPORT_SYMBOL(sysctl_tcp_syncookies); - int sysctl_tcp_abort_on_overflow __read_mostly; struct inet_timewait_death_row tcp_death_row = { @@ -815,7 +812,7 @@ int tcp_child_process(struct sock *parent, struct sock *child, int ret = 0; int state = child->sk_state; - tcp_sk(child)->segs_in += max_t(u16, 1, skb_shinfo(skb)->gso_segs); + tcp_segs_in(tcp_sk(child), skb); if (!sock_owned_by_user(child)) { ret = tcp_rcv_state_process(child, skb); /* Wakeup parent, send SIGIO */ diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index 9864a2dba..773083b7f 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c @@ -135,7 +135,9 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb, th->fin = th->psh = 0; th->check = newcheck; - if (skb->ip_summed != CHECKSUM_PARTIAL) + if (skb->ip_summed == CHECKSUM_PARTIAL) + gso_reset_checksum(skb, ~th->check); + else th->check = gso_make_checksum(skb, ~th->check); seq += mss; @@ -169,7 +171,9 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb, skb->data_len); th->check = ~csum_fold((__force __wsum)((__force u32)th->check + (__force u32)delta)); - if (skb->ip_summed != CHECKSUM_PARTIAL) + if (skb->ip_summed == CHECKSUM_PARTIAL) + gso_reset_checksum(skb, ~th->check); + else th->check = gso_make_checksum(skb, ~th->check); out: return segs; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 20f2812a9..c0c1dac81 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -62,9 +62,6 @@ int sysctl_tcp_tso_win_divisor __read_mostly = 3; /* By default, RFC2861 behavior. */ int sysctl_tcp_slow_start_after_idle __read_mostly = 1; -unsigned int sysctl_tcp_notsent_lowat __read_mostly = UINT_MAX; -EXPORT_SYMBOL(sysctl_tcp_notsent_lowat); - static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, int push_one, gfp_t gfp); @@ -1013,8 +1010,10 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, if (likely(tcb->tcp_flags & TCPHDR_ACK)) tcp_event_ack_sent(sk, tcp_skb_pcount(skb)); - if (skb->len != tcp_header_size) + if (skb->len != tcp_header_size) { tcp_event_data_sent(tp, sk); + tp->data_segs_out += tcp_skb_pcount(skb); + } if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq) TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS, @@ -2449,6 +2448,20 @@ u32 __tcp_select_window(struct sock *sk) return window; } +void tcp_skb_collapse_tstamp(struct sk_buff *skb, + const struct sk_buff *next_skb) +{ + const struct skb_shared_info *next_shinfo = skb_shinfo(next_skb); + u8 tsflags = next_shinfo->tx_flags & SKBTX_ANY_TSTAMP; + + if (unlikely(tsflags)) { + struct skb_shared_info *shinfo = skb_shinfo(skb); + + shinfo->tx_flags |= tsflags; + shinfo->tskey = next_shinfo->tskey; + } +} + /* Collapses two adjacent SKB's during retransmission. */ static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb) { @@ -2492,6 +2505,8 @@ static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb) tcp_adjust_pcount(sk, next_skb, tcp_skb_pcount(next_skb)); + tcp_skb_collapse_tstamp(skb, next_skb); + sk_wmem_free_skb(sk, next_skb); } @@ -2632,8 +2647,10 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) */ if (unlikely((NET_IP_ALIGN && ((unsigned long)skb->data & 3)) || skb_headroom(skb) >= 0xFFFF)) { - struct sk_buff *nskb = __pskb_copy(skb, MAX_TCP_HEADER, - GFP_ATOMIC); + struct sk_buff *nskb; + + skb_mstamp_get(&skb->skb_mstamp); + nskb = __pskb_copy(skb, MAX_TCP_HEADER, GFP_ATOMIC); err = nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) : -ENOBUFS; } else { @@ -3491,6 +3508,7 @@ void tcp_send_probe0(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); + struct net *net = sock_net(sk); unsigned long probe_max; int err; @@ -3504,7 +3522,7 @@ void tcp_send_probe0(struct sock *sk) } if (err <= 0) { - if (icsk->icsk_backoff < sysctl_tcp_retries2) + if (icsk->icsk_backoff < net->ipv4.sysctl_tcp_retries2) icsk->icsk_backoff++; icsk->icsk_probes_out++; probe_max = TCP_RTO_MAX; diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c index ebf5ff575..f6c50af24 100644 --- a/net/ipv4/tcp_probe.c +++ b/net/ipv4/tcp_probe.c @@ -187,13 +187,13 @@ static int tcpprobe_sprint(char *tbuf, int n) { const struct tcp_log *p = tcp_probe.log + tcp_probe.tail; - struct timespec tv - = ktime_to_timespec(ktime_sub(p->tstamp, tcp_probe.start)); + struct timespec64 ts + = ktime_to_timespec64(ktime_sub(p->tstamp, tcp_probe.start)); return scnprintf(tbuf, n, "%lu.%09lu %pISpc %pISpc %d %#x %#x %u %u %u %u %u\n", - (unsigned long)tv.tv_sec, - (unsigned long)tv.tv_nsec, + (unsigned long)ts.tv_sec, + (unsigned long)ts.tv_nsec, &p->src, &p->dst, p->length, p->snd_nxt, p->snd_una, p->snd_cwnd, p->ssthresh, p->snd_wnd, p->srtt, p->rcv_wnd); } diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index a4730a28b..49bc474f8 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -22,11 +22,6 @@ #include <linux/gfp.h> #include <net/tcp.h> -int sysctl_tcp_syn_retries __read_mostly = TCP_SYN_RETRIES; -int sysctl_tcp_synack_retries __read_mostly = TCP_SYNACK_RETRIES; -int sysctl_tcp_retries1 __read_mostly = TCP_RETR1; -int sysctl_tcp_retries2 __read_mostly = TCP_RETR2; -int sysctl_tcp_orphan_retries __read_mostly; int sysctl_tcp_thin_linear_timeouts __read_mostly; static void tcp_write_err(struct sock *sk) @@ -82,7 +77,7 @@ static int tcp_out_of_resources(struct sock *sk, bool do_reset) /* Calculate maximal number or retries on an orphaned socket. */ static int tcp_orphan_retries(struct sock *sk, bool alive) { - int retries = sysctl_tcp_orphan_retries; /* May be zero. */ + int retries = sock_net(sk)->ipv4.sysctl_tcp_orphan_retries; /* May be zero. */ /* We know from an ICMP that something is wrong. */ if (sk->sk_err_soft && !alive) @@ -157,6 +152,7 @@ static int tcp_write_timeout(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); + struct net *net = sock_net(sk); int retry_until; bool do_reset, syn_set = false; @@ -169,10 +165,10 @@ static int tcp_write_timeout(struct sock *sk) NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVEFAIL); } - retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries; + retry_until = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_syn_retries; syn_set = true; } else { - if (retransmits_timed_out(sk, sysctl_tcp_retries1, 0, 0)) { + if (retransmits_timed_out(sk, net->ipv4.sysctl_tcp_retries1, 0, 0)) { /* Some middle-boxes may black-hole Fast Open _after_ * the handshake. Therefore we conservatively disable * Fast Open on this path on recurring timeouts with @@ -181,7 +177,7 @@ static int tcp_write_timeout(struct sock *sk) if (tp->syn_data_acked && tp->bytes_acked <= tp->rx_opt.mss_clamp) { tcp_fastopen_cache_set(sk, 0, NULL, true, 0); - if (icsk->icsk_retransmits == sysctl_tcp_retries1) + if (icsk->icsk_retransmits == net->ipv4.sysctl_tcp_retries1) NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVEFAIL); } @@ -191,7 +187,7 @@ static int tcp_write_timeout(struct sock *sk) dst_negative_advice(sk); } - retry_until = sysctl_tcp_retries2; + retry_until = net->ipv4.sysctl_tcp_retries2; if (sock_flag(sk, SOCK_DEAD)) { const bool alive = icsk->icsk_rto < TCP_RTO_MAX; @@ -305,7 +301,7 @@ static void tcp_probe_timer(struct sock *sk) (s32)(tcp_time_stamp - start_ts) > icsk->icsk_user_timeout) goto abort; - max_probes = sysctl_tcp_retries2; + max_probes = sock_net(sk)->ipv4.sysctl_tcp_retries2; if (sock_flag(sk, SOCK_DEAD)) { const bool alive = inet_csk_rto_backoff(icsk, TCP_RTO_MAX) < TCP_RTO_MAX; @@ -332,7 +328,7 @@ static void tcp_fastopen_synack_timer(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); int max_retries = icsk->icsk_syn_retries ? : - sysctl_tcp_synack_retries + 1; /* add one more retry for fastopen */ + sock_net(sk)->ipv4.sysctl_tcp_synack_retries + 1; /* add one more retry for fastopen */ struct request_sock *req; req = tcp_sk(sk)->fastopen_rsk; @@ -360,6 +356,7 @@ static void tcp_fastopen_synack_timer(struct sock *sk) void tcp_retransmit_timer(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); + struct net *net = sock_net(sk); struct inet_connection_sock *icsk = inet_csk(sk); if (tp->fastopen_rsk) { @@ -490,7 +487,7 @@ out_reset_timer: icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX); } inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX); - if (retransmits_timed_out(sk, sysctl_tcp_retries1 + 1, 0, 0)) + if (retransmits_timed_out(sk, net->ipv4.sysctl_tcp_retries1 + 1, 0, 0)) __sk_dst_reset(sk); out:; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index eb8933bc0..a2e7f55a1 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -339,8 +339,13 @@ found: hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash); spin_lock(&hslot2->lock); - hlist_nulls_add_head_rcu(&udp_sk(sk)->udp_portaddr_node, - &hslot2->head); + if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport && + sk->sk_family == AF_INET6) + hlist_nulls_add_tail_rcu(&udp_sk(sk)->udp_portaddr_node, + &hslot2->head); + else + hlist_nulls_add_head_rcu(&udp_sk(sk)->udp_portaddr_node, + &hslot2->head); hslot2->count++; spin_unlock(&hslot2->lock); } @@ -356,8 +361,8 @@ EXPORT_SYMBOL(udp_lib_get_port); * match_wildcard == false: addresses must be exactly the same, i.e. * 0.0.0.0 only equals to 0.0.0.0 */ -static int ipv4_rcv_saddr_equal(const struct sock *sk1, const struct sock *sk2, - bool match_wildcard) +int ipv4_rcv_saddr_equal(const struct sock *sk1, const struct sock *sk2, + bool match_wildcard) { struct inet_sock *inet1 = inet_sk(sk1), *inet2 = inet_sk(sk2); @@ -848,32 +853,20 @@ void udp_set_csum(bool nocheck, struct sk_buff *skb, { struct udphdr *uh = udp_hdr(skb); - if (nocheck) + if (nocheck) { uh->check = 0; - else if (skb_is_gso(skb)) + } else if (skb_is_gso(skb)) { uh->check = ~udp_v4_check(len, saddr, daddr, 0); - else if (skb_dst(skb) && skb_dst(skb)->dev && - (skb_dst(skb)->dev->features & - (NETIF_F_IP_CSUM | NETIF_F_HW_CSUM))) { - - BUG_ON(skb->ip_summed == CHECKSUM_PARTIAL); - + } else if (skb->ip_summed == CHECKSUM_PARTIAL) { + uh->check = 0; + uh->check = udp_v4_check(len, saddr, daddr, lco_csum(skb)); + if (uh->check == 0) + uh->check = CSUM_MANGLED_0; + } else { skb->ip_summed = CHECKSUM_PARTIAL; skb->csum_start = skb_transport_header(skb) - skb->head; skb->csum_offset = offsetof(struct udphdr, check); uh->check = ~udp_v4_check(len, saddr, daddr, 0); - } else { - __wsum csum; - - BUG_ON(skb->ip_summed == CHECKSUM_PARTIAL); - - uh->check = 0; - csum = skb_checksum(skb, 0, len, 0); - uh->check = udp_v4_check(len, saddr, daddr, csum); - if (uh->check == 0) - uh->check = CSUM_MANGLED_0; - - skb->ip_summed = CHECKSUM_UNNECESSARY; } } EXPORT_SYMBOL(udp_set_csum); diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 4c519c1dc..e330c0e56 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -32,42 +32,65 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, netdev_features_t features), __be16 new_protocol, bool is_ipv6) { + int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb); + bool remcsum, need_csum, offload_csum, ufo; struct sk_buff *segs = ERR_PTR(-EINVAL); + struct udphdr *uh = udp_hdr(skb); u16 mac_offset = skb->mac_header; - int mac_len = skb->mac_len; - int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb); __be16 protocol = skb->protocol; - netdev_features_t enc_features; + u16 mac_len = skb->mac_len; int udp_offset, outer_hlen; - unsigned int oldlen; - bool need_csum = !!(skb_shinfo(skb)->gso_type & - SKB_GSO_UDP_TUNNEL_CSUM); - bool remcsum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_TUNNEL_REMCSUM); - bool offload_csum = false, dont_encap = (need_csum || remcsum); - - oldlen = (u16)~skb->len; + __wsum partial; if (unlikely(!pskb_may_pull(skb, tnl_hlen))) goto out; + /* Adjust partial header checksum to negate old length. + * We cannot rely on the value contained in uh->len as it is + * possible that the actual value exceeds the boundaries of the + * 16 bit length field due to the header being added outside of an + * IP or IPv6 frame that was already limited to 64K - 1. + */ + partial = csum_sub(csum_unfold(uh->check), + (__force __wsum)htonl(skb->len)); + + /* setup inner skb. */ skb->encapsulation = 0; + SKB_GSO_CB(skb)->encap_level = 0; __skb_pull(skb, tnl_hlen); skb_reset_mac_header(skb); skb_set_network_header(skb, skb_inner_network_offset(skb)); skb->mac_len = skb_inner_network_offset(skb); skb->protocol = new_protocol; + + need_csum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL_CSUM); skb->encap_hdr_csum = need_csum; + + remcsum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_TUNNEL_REMCSUM); skb->remcsum_offload = remcsum; + ufo = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP); + /* Try to offload checksum if possible */ offload_csum = !!(need_csum && - ((skb->dev->features & NETIF_F_HW_CSUM) || - (skb->dev->features & (is_ipv6 ? - NETIF_F_IPV6_CSUM : NETIF_F_IP_CSUM)))); + (skb->dev->features & + (is_ipv6 ? (NETIF_F_HW_CSUM | NETIF_F_IPV6_CSUM) : + (NETIF_F_HW_CSUM | NETIF_F_IP_CSUM)))); + + features &= skb->dev->hw_enc_features; + + /* The only checksum offload we care about from here on out is the + * outer one so strip the existing checksum feature flags and + * instead set the flag based on our outer checksum offload value. + */ + if (remcsum || ufo) { + features &= ~NETIF_F_CSUM_MASK; + if (!need_csum || offload_csum) + features |= NETIF_F_HW_CSUM; + } /* segment inner packet. */ - enc_features = skb->dev->hw_enc_features & features; - segs = gso_inner_segment(skb, enc_features); + segs = gso_inner_segment(skb, features); if (IS_ERR_OR_NULL(segs)) { skb_gso_error_unwind(skb, protocol, tnl_hlen, mac_offset, mac_len); @@ -78,17 +101,13 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, udp_offset = outer_hlen - tnl_hlen; skb = segs; do { - struct udphdr *uh; - int len; - __be32 delta; + __be16 len; - if (dont_encap) { - skb->encapsulation = 0; + if (remcsum) skb->ip_summed = CHECKSUM_NONE; - } else { - /* Only set up inner headers if we might be offloading - * inner checksum. - */ + + /* Set up inner headers if we are offloading inner checksum */ + if (skb->ip_summed == CHECKSUM_PARTIAL) { skb_reset_inner_headers(skb); skb->encapsulation = 1; } @@ -96,43 +115,27 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, skb->mac_len = mac_len; skb->protocol = protocol; - skb_push(skb, outer_hlen); + __skb_push(skb, outer_hlen); skb_reset_mac_header(skb); skb_set_network_header(skb, mac_len); skb_set_transport_header(skb, udp_offset); - len = skb->len - udp_offset; + len = htons(skb->len - udp_offset); uh = udp_hdr(skb); - uh->len = htons(len); + uh->len = len; if (!need_csum) continue; - delta = htonl(oldlen + len); + uh->check = ~csum_fold(csum_add(partial, (__force __wsum)len)); - uh->check = ~csum_fold((__force __wsum) - ((__force u32)uh->check + - (__force u32)delta)); - if (offload_csum) { - skb->ip_summed = CHECKSUM_PARTIAL; - skb->csum_start = skb_transport_header(skb) - skb->head; - skb->csum_offset = offsetof(struct udphdr, check); - } else if (remcsum) { - /* Need to calculate checksum from scratch, - * inner checksums are never when doing - * remote_checksum_offload. - */ - - skb->csum = skb_checksum(skb, udp_offset, - skb->len - udp_offset, - 0); - uh->check = csum_fold(skb->csum); - if (uh->check == 0) - uh->check = CSUM_MANGLED_0; - } else { + if (skb->encapsulation || !offload_csum) { uh->check = gso_make_checksum(skb, ~uh->check); - if (uh->check == 0) uh->check = CSUM_MANGLED_0; + } else { + skb->ip_summed = CHECKSUM_PARTIAL; + skb->csum_start = skb_transport_header(skb) - skb->head; + skb->csum_offset = offsetof(struct udphdr, check); } } while ((skb = skb->next)); out: @@ -235,6 +238,13 @@ static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, skb->ip_summed = CHECKSUM_NONE; + /* If there is no outer header we can fake a checksum offload + * due to the fact that we have already done the checksum in + * software prior to segmenting the frame. + */ + if (!skb->encap_hdr_csum) + features |= NETIF_F_HW_CSUM; + /* Fragment the skb. IP headers of the fragments are updated in * inet_gso_segment() */ @@ -302,14 +312,14 @@ struct sk_buff **udp_gro_receive(struct sk_buff **head, struct sk_buff *skb, unsigned int off = skb_gro_offset(skb); int flush = 1; - if (NAPI_GRO_CB(skb)->udp_mark || + if (NAPI_GRO_CB(skb)->encap_mark || (skb->ip_summed != CHECKSUM_PARTIAL && NAPI_GRO_CB(skb)->csum_cnt == 0 && !NAPI_GRO_CB(skb)->csum_valid)) goto out; - /* mark that this skb passed once through the udp gro layer */ - NAPI_GRO_CB(skb)->udp_mark = 1; + /* mark that this skb passed once through the tunnel gro layer */ + NAPI_GRO_CB(skb)->encap_mark = 1; rcu_read_lock(); uo_priv = rcu_dereference(udp_offload_base); @@ -389,6 +399,11 @@ int udp_gro_complete(struct sk_buff *skb, int nhoff) uh->len = newlen; + /* Set encapsulation before calling into inner gro_complete() functions + * to make them set up the inner offsets. + */ + skb->encapsulation = 1; + rcu_read_lock(); uo_priv = rcu_dereference(udp_offload_base); @@ -411,9 +426,6 @@ int udp_gro_complete(struct sk_buff *skb, int nhoff) if (skb->remcsum_offload) skb_shinfo(skb)->gso_type |= SKB_GSO_TUNNEL_REMCSUM; - skb->encapsulation = 1; - skb_set_inner_mac_header(skb, nhoff + sizeof(struct udphdr)); - return err; } diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig index 40c897515..11e875ffd 100644 --- a/net/ipv6/Kconfig +++ b/net/ipv6/Kconfig @@ -207,6 +207,7 @@ config IPV6_NDISC_NODETYPE config IPV6_TUNNEL tristate "IPv6: IP-in-IPv6 tunnel (RFC2473)" select INET6_TUNNEL + select DST_CACHE ---help--- Support for IPv6-in-IPv6 and IPv4-in-IPv6 tunnels described in RFC 2473. diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index bdd7eac43..8ec4b3089 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -216,6 +216,7 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = { }, .use_oif_addrs_only = 0, .ignore_routes_with_linkdown = 0, + .keep_addr_on_down = 0, }; static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { @@ -260,6 +261,7 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { }, .use_oif_addrs_only = 0, .ignore_routes_with_linkdown = 0, + .keep_addr_on_down = 0, }; /* Check if a valid qdisc is available */ @@ -471,18 +473,21 @@ static int inet6_netconf_msgsize_devconf(int type) { int size = NLMSG_ALIGN(sizeof(struct netconfmsg)) + nla_total_size(4); /* NETCONFA_IFINDEX */ + bool all = false; - /* type -1 is used for ALL */ - if (type == -1 || type == NETCONFA_FORWARDING) + if (type == NETCONFA_ALL) + all = true; + + if (all || type == NETCONFA_FORWARDING) size += nla_total_size(4); #ifdef CONFIG_IPV6_MROUTE - if (type == -1 || type == NETCONFA_MC_FORWARDING) + if (all || type == NETCONFA_MC_FORWARDING) size += nla_total_size(4); #endif - if (type == -1 || type == NETCONFA_PROXY_NEIGH) + if (all || type == NETCONFA_PROXY_NEIGH) size += nla_total_size(4); - if (type == -1 || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN) + if (all || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN) size += nla_total_size(4); return size; @@ -495,33 +500,36 @@ static int inet6_netconf_fill_devconf(struct sk_buff *skb, int ifindex, { struct nlmsghdr *nlh; struct netconfmsg *ncm; + bool all = false; nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct netconfmsg), flags); if (!nlh) return -EMSGSIZE; + if (type == NETCONFA_ALL) + all = true; + ncm = nlmsg_data(nlh); ncm->ncm_family = AF_INET6; if (nla_put_s32(skb, NETCONFA_IFINDEX, ifindex) < 0) goto nla_put_failure; - /* type -1 is used for ALL */ - if ((type == -1 || type == NETCONFA_FORWARDING) && + if ((all || type == NETCONFA_FORWARDING) && nla_put_s32(skb, NETCONFA_FORWARDING, devconf->forwarding) < 0) goto nla_put_failure; #ifdef CONFIG_IPV6_MROUTE - if ((type == -1 || type == NETCONFA_MC_FORWARDING) && + if ((all || type == NETCONFA_MC_FORWARDING) && nla_put_s32(skb, NETCONFA_MC_FORWARDING, devconf->mc_forwarding) < 0) goto nla_put_failure; #endif - if ((type == -1 || type == NETCONFA_PROXY_NEIGH) && + if ((all || type == NETCONFA_PROXY_NEIGH) && nla_put_s32(skb, NETCONFA_PROXY_NEIGH, devconf->proxy_ndp) < 0) goto nla_put_failure; - if ((type == -1 || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN) && + if ((all || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN) && nla_put_s32(skb, NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN, devconf->ignore_routes_with_linkdown) < 0) goto nla_put_failure; @@ -607,14 +615,14 @@ static int inet6_netconf_get_devconf(struct sk_buff *in_skb, } err = -ENOBUFS; - skb = nlmsg_new(inet6_netconf_msgsize_devconf(-1), GFP_ATOMIC); + skb = nlmsg_new(inet6_netconf_msgsize_devconf(NETCONFA_ALL), GFP_ATOMIC); if (!skb) goto errout; err = inet6_netconf_fill_devconf(skb, ifindex, devconf, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, RTM_NEWNETCONF, 0, - -1); + NETCONFA_ALL); if (err < 0) { /* -EMSGSIZE implies BUG in inet6_netconf_msgsize_devconf() */ WARN_ON(err == -EMSGSIZE); @@ -658,7 +666,7 @@ static int inet6_netconf_dump_devconf(struct sk_buff *skb, cb->nlh->nlmsg_seq, RTM_NEWNETCONF, NLM_F_MULTI, - -1) < 0) { + NETCONFA_ALL) < 0) { rcu_read_unlock(); goto done; } @@ -674,7 +682,7 @@ cont: NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RTM_NEWNETCONF, NLM_F_MULTI, - -1) < 0) + NETCONFA_ALL) < 0) goto done; else h++; @@ -685,7 +693,7 @@ cont: NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RTM_NEWNETCONF, NLM_F_MULTI, - -1) < 0) + NETCONFA_ALL) < 0) goto done; else h++; @@ -3168,10 +3176,60 @@ static void addrconf_gre_config(struct net_device *dev) } #endif +static int fixup_permanent_addr(struct inet6_dev *idev, + struct inet6_ifaddr *ifp) +{ + if (!ifp->rt) { + struct rt6_info *rt; + + rt = addrconf_dst_alloc(idev, &ifp->addr, false); + if (unlikely(IS_ERR(rt))) + return PTR_ERR(rt); + + ifp->rt = rt; + } + + if (!(ifp->flags & IFA_F_NOPREFIXROUTE)) { + addrconf_prefix_route(&ifp->addr, ifp->prefix_len, + idev->dev, 0, 0); + } + + addrconf_dad_start(ifp); + + return 0; +} + +static void addrconf_permanent_addr(struct net_device *dev) +{ + struct inet6_ifaddr *ifp, *tmp; + struct inet6_dev *idev; + + idev = __in6_dev_get(dev); + if (!idev) + return; + + write_lock_bh(&idev->lock); + + list_for_each_entry_safe(ifp, tmp, &idev->addr_list, if_list) { + if ((ifp->flags & IFA_F_PERMANENT) && + fixup_permanent_addr(idev, ifp) < 0) { + write_unlock_bh(&idev->lock); + ipv6_del_addr(ifp); + write_lock_bh(&idev->lock); + + net_info_ratelimited("%s: Failed to add prefix route for address %pI6c; dropping\n", + idev->dev->name, &ifp->addr); + } + } + + write_unlock_bh(&idev->lock); +} + static int addrconf_notify(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct netdev_notifier_changeupper_info *info; struct inet6_dev *idev = __in6_dev_get(dev); int run_pending = 0; int err; @@ -3220,6 +3278,9 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, break; if (event == NETDEV_UP) { + /* restore routes for permanent addresses */ + addrconf_permanent_addr(dev); + if (!addrconf_qdisc_ok(dev)) { /* device is not ready yet. */ pr_info("ADDRCONF(NETDEV_UP): %s: link is not ready\n", @@ -3327,6 +3388,15 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, if (idev) addrconf_type_change(dev, event); break; + + case NETDEV_CHANGEUPPER: + info = ptr; + + /* flush all routes if dev is linked to or unlinked from + * an L3 master device (e.g., VRF) + */ + if (info->upper_dev && netif_is_l3_master(info->upper_dev)) + addrconf_ifdown(dev, 0); } return NOTIFY_OK; @@ -3352,11 +3422,20 @@ static void addrconf_type_change(struct net_device *dev, unsigned long event) ipv6_mc_unmap(idev); } +static bool addr_is_local(const struct in6_addr *addr) +{ + return ipv6_addr_type(addr) & + (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK); +} + static int addrconf_ifdown(struct net_device *dev, int how) { struct net *net = dev_net(dev); struct inet6_dev *idev; - struct inet6_ifaddr *ifa; + struct inet6_ifaddr *ifa, *tmp; + struct list_head del_list; + int _keep_addr; + bool keep_addr; int state, i; ASSERT_RTNL(); @@ -3383,6 +3462,16 @@ static int addrconf_ifdown(struct net_device *dev, int how) } + /* aggregate the system setting and interface setting */ + _keep_addr = net->ipv6.devconf_all->keep_addr_on_down; + if (!_keep_addr) + _keep_addr = idev->cnf.keep_addr_on_down; + + /* combine the user config with event to determine if permanent + * addresses are to be removed from address hash table + */ + keep_addr = !(how || _keep_addr <= 0); + /* Step 2: clear hash table */ for (i = 0; i < IN6_ADDR_HSIZE; i++) { struct hlist_head *h = &inet6_addr_lst[i]; @@ -3391,9 +3480,16 @@ static int addrconf_ifdown(struct net_device *dev, int how) restart: hlist_for_each_entry_rcu(ifa, h, addr_lst) { if (ifa->idev == idev) { - hlist_del_init_rcu(&ifa->addr_lst); addrconf_del_dad_work(ifa); - goto restart; + /* combined flag + permanent flag decide if + * address is retained on a down event + */ + if (!keep_addr || + !(ifa->flags & IFA_F_PERMANENT) || + addr_is_local(&ifa->addr)) { + hlist_del_init_rcu(&ifa->addr_lst); + goto restart; + } } } spin_unlock_bh(&addrconf_hash_lock); @@ -3427,31 +3523,62 @@ restart: write_lock_bh(&idev->lock); } - while (!list_empty(&idev->addr_list)) { - ifa = list_first_entry(&idev->addr_list, - struct inet6_ifaddr, if_list); - addrconf_del_dad_work(ifa); + /* re-combine the user config with event to determine if permanent + * addresses are to be removed from the interface list + */ + keep_addr = (!how && _keep_addr > 0); - list_del(&ifa->if_list); + INIT_LIST_HEAD(&del_list); + list_for_each_entry_safe(ifa, tmp, &idev->addr_list, if_list) { + struct rt6_info *rt = NULL; - write_unlock_bh(&idev->lock); + addrconf_del_dad_work(ifa); + write_unlock_bh(&idev->lock); spin_lock_bh(&ifa->lock); - state = ifa->state; - ifa->state = INET6_IFADDR_STATE_DEAD; + + if (keep_addr && (ifa->flags & IFA_F_PERMANENT) && + !addr_is_local(&ifa->addr)) { + /* set state to skip the notifier below */ + state = INET6_IFADDR_STATE_DEAD; + ifa->state = 0; + if (!(ifa->flags & IFA_F_NODAD)) + ifa->flags |= IFA_F_TENTATIVE; + + rt = ifa->rt; + ifa->rt = NULL; + } else { + state = ifa->state; + ifa->state = INET6_IFADDR_STATE_DEAD; + + list_del(&ifa->if_list); + list_add(&ifa->if_list, &del_list); + } + spin_unlock_bh(&ifa->lock); + if (rt) + ip6_del_rt(rt); + if (state != INET6_IFADDR_STATE_DEAD) { __ipv6_ifa_notify(RTM_DELADDR, ifa); inet6addr_notifier_call_chain(NETDEV_DOWN, ifa); } - in6_ifa_put(ifa); write_lock_bh(&idev->lock); } write_unlock_bh(&idev->lock); + /* now clean up addresses to be removed */ + while (!list_empty(&del_list)) { + ifa = list_first_entry(&del_list, + struct inet6_ifaddr, if_list); + list_del(&ifa->if_list); + + in6_ifa_put(ifa); + } + /* Step 5: Discard anycast and multicast list */ if (how) { ipv6_ac_destroy_dev(idev); @@ -4714,6 +4841,9 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf, array[DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN] = cnf->ignore_routes_with_linkdown; /* we omit DEVCONF_STABLE_SECRET for now */ array[DEVCONF_USE_OIF_ADDRS_ONLY] = cnf->use_oif_addrs_only; + array[DEVCONF_DROP_UNICAST_IN_L2_MULTICAST] = cnf->drop_unicast_in_l2_multicast; + array[DEVCONF_DROP_UNSOLICITED_NA] = cnf->drop_unsolicited_na; + array[DEVCONF_KEEP_ADDR_ON_DOWN] = cnf->keep_addr_on_down; } static inline size_t inet6_ifla6_size(void) @@ -5195,10 +5325,10 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp) if (rt) ip6_del_rt(rt); } - dst_hold(&ifp->rt->dst); - - ip6_del_rt(ifp->rt); - + if (ifp->rt) { + dst_hold(&ifp->rt->dst); + ip6_del_rt(ifp->rt); + } rt_genid_bump_ipv6(net); break; } @@ -5788,6 +5918,28 @@ static struct addrconf_sysctl_table .proc_handler = addrconf_sysctl_ignore_routes_with_linkdown, }, { + .procname = "drop_unicast_in_l2_multicast", + .data = &ipv6_devconf.drop_unicast_in_l2_multicast, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "drop_unsolicited_na", + .data = &ipv6_devconf.drop_unsolicited_na, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "keep_addr_on_down", + .data = &ipv6_devconf.keep_addr_on_down, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + + }, + { /* sentinel */ } }, diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 9f5137cd6..b11c37cfd 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -235,7 +235,11 @@ lookup_protocol: * creation time automatically shares. */ inet->inet_sport = htons(inet->inet_num); - sk->sk_prot->hash(sk); + err = sk->sk_prot->hash(sk); + if (err) { + sk_common_release(sk); + goto out; + } } if (sk->sk_prot->init) { err = sk->sk_prot->init(sk); diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index 428162155..9dd3882fe 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -40,18 +40,114 @@ static bool ipv6_mapped_addr_any(const struct in6_addr *a) return ipv6_addr_v4mapped(a) && (a->s6_addr32[3] == 0); } +static void ip6_datagram_flow_key_init(struct flowi6 *fl6, struct sock *sk) +{ + struct inet_sock *inet = inet_sk(sk); + struct ipv6_pinfo *np = inet6_sk(sk); + + memset(fl6, 0, sizeof(*fl6)); + fl6->flowi6_proto = sk->sk_protocol; + fl6->daddr = sk->sk_v6_daddr; + fl6->saddr = np->saddr; + fl6->flowi6_oif = sk->sk_bound_dev_if; + fl6->flowi6_mark = sk->sk_mark; + fl6->fl6_dport = inet->inet_dport; + fl6->fl6_sport = inet->inet_sport; + fl6->flowlabel = np->flow_label; + + if (!fl6->flowi6_oif) + fl6->flowi6_oif = np->sticky_pktinfo.ipi6_ifindex; + + if (!fl6->flowi6_oif && ipv6_addr_is_multicast(&fl6->daddr)) + fl6->flowi6_oif = np->mcast_oif; + + security_sk_classify_flow(sk, flowi6_to_flowi(fl6)); +} + +int ip6_datagram_dst_update(struct sock *sk, bool fix_sk_saddr) +{ + struct ip6_flowlabel *flowlabel = NULL; + struct in6_addr *final_p, final; + struct ipv6_txoptions *opt; + struct dst_entry *dst; + struct inet_sock *inet = inet_sk(sk); + struct ipv6_pinfo *np = inet6_sk(sk); + struct flowi6 fl6; + int err = 0; + + if (np->sndflow && (np->flow_label & IPV6_FLOWLABEL_MASK)) { + flowlabel = fl6_sock_lookup(sk, np->flow_label); + if (!flowlabel) + return -EINVAL; + } + ip6_datagram_flow_key_init(&fl6, sk); + + rcu_read_lock(); + opt = flowlabel ? flowlabel->opt : rcu_dereference(np->opt); + final_p = fl6_update_dst(&fl6, opt, &final); + rcu_read_unlock(); + + dst = ip6_dst_lookup_flow(sk, &fl6, final_p); + if (IS_ERR(dst)) { + err = PTR_ERR(dst); + goto out; + } + + if (fix_sk_saddr) { + if (ipv6_addr_any(&np->saddr)) + np->saddr = fl6.saddr; + + if (ipv6_addr_any(&sk->sk_v6_rcv_saddr)) { + sk->sk_v6_rcv_saddr = fl6.saddr; + inet->inet_rcv_saddr = LOOPBACK4_IPV6; + if (sk->sk_prot->rehash) + sk->sk_prot->rehash(sk); + } + } + + ip6_dst_store(sk, dst, + ipv6_addr_equal(&fl6.daddr, &sk->sk_v6_daddr) ? + &sk->sk_v6_daddr : NULL, +#ifdef CONFIG_IPV6_SUBTREES + ipv6_addr_equal(&fl6.saddr, &np->saddr) ? + &np->saddr : +#endif + NULL); + +out: + fl6_sock_release(flowlabel); + return err; +} + +void ip6_datagram_release_cb(struct sock *sk) +{ + struct dst_entry *dst; + + if (ipv6_addr_v4mapped(&sk->sk_v6_daddr)) + return; + + rcu_read_lock(); + dst = __sk_dst_get(sk); + if (!dst || !dst->obsolete || + dst->ops->check(dst, inet6_sk(sk)->dst_cookie)) { + rcu_read_unlock(); + return; + } + rcu_read_unlock(); + + ip6_datagram_dst_update(sk, false); +} +EXPORT_SYMBOL_GPL(ip6_datagram_release_cb); + static int __ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { struct sockaddr_in6 *usin = (struct sockaddr_in6 *) uaddr; struct inet_sock *inet = inet_sk(sk); struct ipv6_pinfo *np = inet6_sk(sk); - struct in6_addr *daddr, *final_p, final; - struct dst_entry *dst; - struct flowi6 fl6; - struct ip6_flowlabel *flowlabel = NULL; - struct ipv6_txoptions *opt; + struct in6_addr *daddr; int addr_type; int err; + __be32 fl6_flowlabel = 0; if (usin->sin6_family == AF_INET) { if (__ipv6_only_sock(sk)) @@ -66,15 +162,8 @@ static int __ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int a if (usin->sin6_family != AF_INET6) return -EAFNOSUPPORT; - memset(&fl6, 0, sizeof(fl6)); - if (np->sndflow) { - fl6.flowlabel = usin->sin6_flowinfo&IPV6_FLOWINFO_MASK; - if (fl6.flowlabel&IPV6_FLOWLABEL_MASK) { - flowlabel = fl6_sock_lookup(sk, fl6.flowlabel); - if (!flowlabel) - return -EINVAL; - } - } + if (np->sndflow) + fl6_flowlabel = usin->sin6_flowinfo & IPV6_FLOWINFO_MASK; addr_type = ipv6_addr_type(&usin->sin6_addr); @@ -145,7 +234,7 @@ ipv4_connected: } sk->sk_v6_daddr = *daddr; - np->flow_label = fl6.flowlabel; + np->flow_label = fl6_flowlabel; inet->inet_dport = usin->sin6_port; @@ -154,59 +243,13 @@ ipv4_connected: * destination cache for it. */ - fl6.flowi6_proto = sk->sk_protocol; - fl6.daddr = sk->sk_v6_daddr; - fl6.saddr = np->saddr; - fl6.flowi6_oif = sk->sk_bound_dev_if; - fl6.flowi6_mark = sk->sk_mark; - fl6.fl6_dport = inet->inet_dport; - fl6.fl6_sport = inet->inet_sport; - - if (!fl6.flowi6_oif) - fl6.flowi6_oif = np->sticky_pktinfo.ipi6_ifindex; - - if (!fl6.flowi6_oif && (addr_type&IPV6_ADDR_MULTICAST)) - fl6.flowi6_oif = np->mcast_oif; - - security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); - - rcu_read_lock(); - opt = flowlabel ? flowlabel->opt : rcu_dereference(np->opt); - final_p = fl6_update_dst(&fl6, opt, &final); - rcu_read_unlock(); - - dst = ip6_dst_lookup_flow(sk, &fl6, final_p); - err = 0; - if (IS_ERR(dst)) { - err = PTR_ERR(dst); + err = ip6_datagram_dst_update(sk, true); + if (err) goto out; - } - - /* source address lookup done in ip6_dst_lookup */ - - if (ipv6_addr_any(&np->saddr)) - np->saddr = fl6.saddr; - - if (ipv6_addr_any(&sk->sk_v6_rcv_saddr)) { - sk->sk_v6_rcv_saddr = fl6.saddr; - inet->inet_rcv_saddr = LOOPBACK4_IPV6; - if (sk->sk_prot->rehash) - sk->sk_prot->rehash(sk); - } - - ip6_dst_store(sk, dst, - ipv6_addr_equal(&fl6.daddr, &sk->sk_v6_daddr) ? - &sk->sk_v6_daddr : NULL, -#ifdef CONFIG_IPV6_SUBTREES - ipv6_addr_equal(&fl6.saddr, &np->saddr) ? - &np->saddr : -#endif - NULL); sk->sk_state = TCP_ESTABLISHED; sk_set_txhash(sk); out: - fl6_sock_release(flowlabel); return err; } diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 0a37ddc7a..0013cacf7 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -445,6 +445,8 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info) if (__ipv6_addr_needs_scope_id(addr_type)) iif = skb->dev->ifindex; + else + iif = l3mdev_master_ifindex(skb->dev); /* * Must not send error if the source does not uniquely @@ -499,9 +501,6 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info) else if (!fl6.flowi6_oif) fl6.flowi6_oif = np->ucast_oif; - if (!fl6.flowi6_oif) - fl6.flowi6_oif = l3mdev_master_ifindex(skb->dev); - dst = icmpv6_route_lookup(net, skb, sk, &fl6); if (IS_ERR(dst)) goto out; diff --git a/net/ipv6/ila/ila_common.c b/net/ipv6/ila/ila_common.c index 32dc9aab7..30613050e 100644 --- a/net/ipv6/ila/ila_common.c +++ b/net/ipv6/ila/ila_common.c @@ -99,5 +99,6 @@ static void __exit ila_fini(void) module_init(ila_init); module_exit(ila_fini); +MODULE_ALIAS_RTNL_LWT(ILA); MODULE_AUTHOR("Tom Herbert <tom@herbertland.com>"); MODULE_LICENSE("GPL"); diff --git a/net/ipv6/ila/ila_lwt.c b/net/ipv6/ila/ila_lwt.c index 2ae3c4fd8..41f18de5d 100644 --- a/net/ipv6/ila/ila_lwt.c +++ b/net/ipv6/ila/ila_lwt.c @@ -120,8 +120,7 @@ nla_put_failure: static int ila_encap_nlsize(struct lwtunnel_state *lwtstate) { - /* No encapsulation overhead */ - return 0; + return nla_total_size(sizeof(u64)); /* ILA_ATTR_LOCATOR */ } static int ila_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b) diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c index 36c3f0155..532c3ef28 100644 --- a/net/ipv6/inet6_connection_sock.c +++ b/net/ipv6/inet6_connection_sock.c @@ -26,6 +26,7 @@ #include <net/ip6_route.h> #include <net/sock.h> #include <net/inet6_connection_sock.h> +#include <net/sock_reuseport.h> int inet6_csk_bind_conflict(const struct sock *sk, const struct inet_bind_bucket *tb, bool relax) @@ -48,6 +49,7 @@ int inet6_csk_bind_conflict(const struct sock *sk, if ((!reuse || !sk2->sk_reuse || sk2->sk_state == TCP_LISTEN) && (!reuseport || !sk2->sk_reuseport || + rcu_access_pointer(sk->sk_reuseport_cb) || (sk2->sk_state != TCP_TIME_WAIT && !uid_eq(uid, sock_i_uid((struct sock *)sk2))))) { diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index 21ace5a2b..70f2628be 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -17,11 +17,13 @@ #include <linux/module.h> #include <linux/random.h> +#include <net/addrconf.h> #include <net/inet_connection_sock.h> #include <net/inet_hashtables.h> #include <net/inet6_hashtables.h> #include <net/secure_seq.h> #include <net/ip.h> +#include <net/sock_reuseport.h> u32 inet6_ehashfn(const struct net *net, const struct in6_addr *laddr, const u16 lport, @@ -121,7 +123,9 @@ static inline int compute_score(struct sock *sk, struct net *net, } struct sock *inet6_lookup_listener(struct net *net, - struct inet_hashinfo *hashinfo, const struct in6_addr *saddr, + struct inet_hashinfo *hashinfo, + struct sk_buff *skb, int doff, + const struct in6_addr *saddr, const __be16 sport, const struct in6_addr *daddr, const unsigned short hnum, const int dif) { @@ -129,6 +133,7 @@ struct sock *inet6_lookup_listener(struct net *net, const struct hlist_nulls_node *node; struct sock *result; int score, hiscore, matches = 0, reuseport = 0; + bool select_ok = true; u32 phash = 0; unsigned int hash = inet_lhashfn(net, hnum); struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash]; @@ -146,6 +151,15 @@ begin: if (reuseport) { phash = inet6_ehashfn(net, daddr, hnum, saddr, sport); + if (select_ok) { + struct sock *sk2; + sk2 = reuseport_select_sock(sk, phash, + skb, doff); + if (sk2) { + result = sk2; + goto found; + } + } matches = 1; } } else if (score == hiscore && reuseport) { @@ -163,11 +177,13 @@ begin: if (get_nulls_value(node) != hash + LISTENING_NULLS_BASE) goto begin; if (result) { +found: if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt))) result = NULL; else if (unlikely(compute_score(result, net, hnum, daddr, dif) < hiscore)) { sock_put(result); + select_ok = false; goto begin; } } @@ -177,6 +193,7 @@ begin: EXPORT_SYMBOL_GPL(inet6_lookup_listener); struct sock *inet6_lookup(struct net *net, struct inet_hashinfo *hashinfo, + struct sk_buff *skb, int doff, const struct in6_addr *saddr, const __be16 sport, const struct in6_addr *daddr, const __be16 dport, const int dif) @@ -184,7 +201,8 @@ struct sock *inet6_lookup(struct net *net, struct inet_hashinfo *hashinfo, struct sock *sk; local_bh_disable(); - sk = __inet6_lookup(net, hashinfo, saddr, sport, daddr, ntohs(dport), dif); + sk = __inet6_lookup(net, hashinfo, skb, doff, saddr, sport, daddr, + ntohs(dport), dif); local_bh_enable(); return sk; @@ -274,3 +292,59 @@ int inet6_hash_connect(struct inet_timewait_death_row *death_row, __inet6_check_established); } EXPORT_SYMBOL_GPL(inet6_hash_connect); + +int inet6_hash(struct sock *sk) +{ + if (sk->sk_state != TCP_CLOSE) { + local_bh_disable(); + __inet_hash(sk, NULL, ipv6_rcv_saddr_equal); + local_bh_enable(); + } + + return 0; +} +EXPORT_SYMBOL_GPL(inet6_hash); + +/* match_wildcard == true: IPV6_ADDR_ANY equals to any IPv6 addresses if IPv6 + * only, and any IPv4 addresses if not IPv6 only + * match_wildcard == false: addresses must be exactly the same, i.e. + * IPV6_ADDR_ANY only equals to IPV6_ADDR_ANY, + * and 0.0.0.0 equals to 0.0.0.0 only + */ +int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2, + bool match_wildcard) +{ + const struct in6_addr *sk2_rcv_saddr6 = inet6_rcv_saddr(sk2); + int sk2_ipv6only = inet_v6_ipv6only(sk2); + int addr_type = ipv6_addr_type(&sk->sk_v6_rcv_saddr); + int addr_type2 = sk2_rcv_saddr6 ? ipv6_addr_type(sk2_rcv_saddr6) : IPV6_ADDR_MAPPED; + + /* if both are mapped, treat as IPv4 */ + if (addr_type == IPV6_ADDR_MAPPED && addr_type2 == IPV6_ADDR_MAPPED) { + if (!sk2_ipv6only) { + if (sk->sk_rcv_saddr == sk2->sk_rcv_saddr) + return 1; + if (!sk->sk_rcv_saddr || !sk2->sk_rcv_saddr) + return match_wildcard; + } + return 0; + } + + if (addr_type == IPV6_ADDR_ANY && addr_type2 == IPV6_ADDR_ANY) + return 1; + + if (addr_type2 == IPV6_ADDR_ANY && match_wildcard && + !(sk2_ipv6only && addr_type == IPV6_ADDR_MAPPED)) + return 1; + + if (addr_type == IPV6_ADDR_ANY && match_wildcard && + !(ipv6_only_sock(sk) && addr_type2 == IPV6_ADDR_MAPPED)) + return 1; + + if (sk2_rcv_saddr6 && + ipv6_addr_equal(&sk->sk_v6_rcv_saddr, sk2_rcv_saddr6)) + return 1; + + return 0; +} +EXPORT_SYMBOL_GPL(ipv6_rcv_saddr_equal); diff --git a/net/ipv6/ip6_checksum.c b/net/ipv6/ip6_checksum.c index 9a4d7322f..b2025bf3d 100644 --- a/net/ipv6/ip6_checksum.c +++ b/net/ipv6/ip6_checksum.c @@ -6,8 +6,7 @@ #ifndef _HAVE_ARCH_IPV6_CSUM __sum16 csum_ipv6_magic(const struct in6_addr *saddr, const struct in6_addr *daddr, - __u32 len, unsigned short proto, - __wsum csum) + __u32 len, __u8 proto, __wsum csum) { int carry; @@ -98,27 +97,16 @@ void udp6_set_csum(bool nocheck, struct sk_buff *skb, uh->check = 0; else if (skb_is_gso(skb)) uh->check = ~udp_v6_check(len, saddr, daddr, 0); - else if (skb_dst(skb) && skb_dst(skb)->dev && - (skb_dst(skb)->dev->features & NETIF_F_IPV6_CSUM)) { - - BUG_ON(skb->ip_summed == CHECKSUM_PARTIAL); - + else if (skb->ip_summed == CHECKSUM_PARTIAL) { + uh->check = 0; + uh->check = udp_v6_check(len, saddr, daddr, lco_csum(skb)); + if (uh->check == 0) + uh->check = CSUM_MANGLED_0; + } else { skb->ip_summed = CHECKSUM_PARTIAL; skb->csum_start = skb_transport_header(skb) - skb->head; skb->csum_offset = offsetof(struct udphdr, check); uh->check = ~udp_v6_check(len, saddr, daddr, 0); - } else { - __wsum csum; - - BUG_ON(skb->ip_summed == CHECKSUM_PARTIAL); - - uh->check = 0; - csum = skb_checksum(skb, 0, len, 0); - uh->check = udp_v6_check(len, saddr, daddr, csum); - if (uh->check == 0) - uh->check = CSUM_MANGLED_0; - - skb->ip_summed = CHECKSUM_UNNECESSARY; } } EXPORT_SYMBOL(udp6_set_csum); diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 0c7e276c2..ea071fad6 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -55,8 +55,6 @@ struct fib6_cleaner { void *arg; }; -static DEFINE_RWLOCK(fib6_walker_lock); - #ifdef CONFIG_IPV6_SUBTREES #define FWS_INIT FWS_S #else @@ -66,7 +64,7 @@ static DEFINE_RWLOCK(fib6_walker_lock); static void fib6_prune_clones(struct net *net, struct fib6_node *fn); static struct rt6_info *fib6_find_prefix(struct net *net, struct fib6_node *fn); static struct fib6_node *fib6_repair_tree(struct net *net, struct fib6_node *fn); -static int fib6_walk(struct fib6_walker *w); +static int fib6_walk(struct net *net, struct fib6_walker *w); static int fib6_walk_continue(struct fib6_walker *w); /* @@ -78,21 +76,21 @@ static int fib6_walk_continue(struct fib6_walker *w); static void fib6_gc_timer_cb(unsigned long arg); -static LIST_HEAD(fib6_walkers); -#define FOR_WALKERS(w) list_for_each_entry(w, &fib6_walkers, lh) +#define FOR_WALKERS(net, w) \ + list_for_each_entry(w, &(net)->ipv6.fib6_walkers, lh) -static void fib6_walker_link(struct fib6_walker *w) +static void fib6_walker_link(struct net *net, struct fib6_walker *w) { - write_lock_bh(&fib6_walker_lock); - list_add(&w->lh, &fib6_walkers); - write_unlock_bh(&fib6_walker_lock); + write_lock_bh(&net->ipv6.fib6_walker_lock); + list_add(&w->lh, &net->ipv6.fib6_walkers); + write_unlock_bh(&net->ipv6.fib6_walker_lock); } -static void fib6_walker_unlink(struct fib6_walker *w) +static void fib6_walker_unlink(struct net *net, struct fib6_walker *w) { - write_lock_bh(&fib6_walker_lock); + write_lock_bh(&net->ipv6.fib6_walker_lock); list_del(&w->lh); - write_unlock_bh(&fib6_walker_lock); + write_unlock_bh(&net->ipv6.fib6_walker_lock); } static int fib6_new_sernum(struct net *net) @@ -325,12 +323,13 @@ static int fib6_dump_node(struct fib6_walker *w) static void fib6_dump_end(struct netlink_callback *cb) { + struct net *net = sock_net(cb->skb->sk); struct fib6_walker *w = (void *)cb->args[2]; if (w) { if (cb->args[4]) { cb->args[4] = 0; - fib6_walker_unlink(w); + fib6_walker_unlink(net, w); } cb->args[2] = 0; kfree(w); @@ -348,6 +347,7 @@ static int fib6_dump_done(struct netlink_callback *cb) static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb, struct netlink_callback *cb) { + struct net *net = sock_net(skb->sk); struct fib6_walker *w; int res; @@ -359,7 +359,7 @@ static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb, w->skip = 0; read_lock_bh(&table->tb6_lock); - res = fib6_walk(w); + res = fib6_walk(net, w); read_unlock_bh(&table->tb6_lock); if (res > 0) { cb->args[4] = 1; @@ -379,7 +379,7 @@ static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb, res = fib6_walk_continue(w); read_unlock_bh(&table->tb6_lock); if (res <= 0) { - fib6_walker_unlink(w); + fib6_walker_unlink(net, w); cb->args[4] = 0; } } @@ -1340,8 +1340,8 @@ static struct fib6_node *fib6_repair_tree(struct net *net, } #endif - read_lock(&fib6_walker_lock); - FOR_WALKERS(w) { + read_lock(&net->ipv6.fib6_walker_lock); + FOR_WALKERS(net, w) { if (!child) { if (w->root == fn) { w->root = w->node = NULL; @@ -1368,7 +1368,7 @@ static struct fib6_node *fib6_repair_tree(struct net *net, } } } - read_unlock(&fib6_walker_lock); + read_unlock(&net->ipv6.fib6_walker_lock); node_free(fn); if (pn->fn_flags & RTN_RTINFO || FIB6_SUBTREE(pn)) @@ -1411,8 +1411,8 @@ static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp, } /* Adjust walkers */ - read_lock(&fib6_walker_lock); - FOR_WALKERS(w) { + read_lock(&net->ipv6.fib6_walker_lock); + FOR_WALKERS(net, w) { if (w->state == FWS_C && w->leaf == rt) { RT6_TRACE("walker %p adjusted by delroute\n", w); w->leaf = rt->dst.rt6_next; @@ -1420,7 +1420,7 @@ static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp, w->state = FWS_U; } } - read_unlock(&fib6_walker_lock); + read_unlock(&net->ipv6.fib6_walker_lock); rt->dst.rt6_next = NULL; @@ -1588,17 +1588,17 @@ skip: } } -static int fib6_walk(struct fib6_walker *w) +static int fib6_walk(struct net *net, struct fib6_walker *w) { int res; w->state = FWS_INIT; w->node = w->root; - fib6_walker_link(w); + fib6_walker_link(net, w); res = fib6_walk_continue(w); if (res <= 0) - fib6_walker_unlink(w); + fib6_walker_unlink(net, w); return res; } @@ -1668,7 +1668,7 @@ static void fib6_clean_tree(struct net *net, struct fib6_node *root, c.arg = arg; c.net = net; - fib6_walk(&c.w); + fib6_walk(net, &c.w); } static void __fib6_clean_all(struct net *net, @@ -1725,14 +1725,15 @@ static void fib6_flush_trees(struct net *net) * Garbage collection */ -static struct fib6_gc_args +struct fib6_gc_args { int timeout; int more; -} gc_args; +}; static int fib6_age(struct rt6_info *rt, void *arg) { + struct fib6_gc_args *gc_args = arg; unsigned long now = jiffies; /* @@ -1748,10 +1749,10 @@ static int fib6_age(struct rt6_info *rt, void *arg) RT6_TRACE("expiring %p\n", rt); return -1; } - gc_args.more++; + gc_args->more++; } else if (rt->rt6i_flags & RTF_CACHE) { if (atomic_read(&rt->dst.__refcnt) == 0 && - time_after_eq(now, rt->dst.lastuse + gc_args.timeout)) { + time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) { RT6_TRACE("aging clone %p\n", rt); return -1; } else if (rt->rt6i_flags & RTF_GATEWAY) { @@ -1769,21 +1770,20 @@ static int fib6_age(struct rt6_info *rt, void *arg) return -1; } } - gc_args.more++; + gc_args->more++; } return 0; } -static DEFINE_SPINLOCK(fib6_gc_lock); - void fib6_run_gc(unsigned long expires, struct net *net, bool force) { + struct fib6_gc_args gc_args; unsigned long now; if (force) { - spin_lock_bh(&fib6_gc_lock); - } else if (!spin_trylock_bh(&fib6_gc_lock)) { + spin_lock_bh(&net->ipv6.fib6_gc_lock); + } else if (!spin_trylock_bh(&net->ipv6.fib6_gc_lock)) { mod_timer(&net->ipv6.ip6_fib_timer, jiffies + HZ); return; } @@ -1792,7 +1792,7 @@ void fib6_run_gc(unsigned long expires, struct net *net, bool force) gc_args.more = icmp6_dst_gc(); - fib6_clean_all(net, fib6_age, NULL); + fib6_clean_all(net, fib6_age, &gc_args); now = jiffies; net->ipv6.ip6_rt_last_gc = now; @@ -1802,7 +1802,7 @@ void fib6_run_gc(unsigned long expires, struct net *net, bool force) + net->ipv6.sysctl.ip6_rt_gc_interval)); else del_timer(&net->ipv6.ip6_fib_timer); - spin_unlock_bh(&fib6_gc_lock); + spin_unlock_bh(&net->ipv6.fib6_gc_lock); } static void fib6_gc_timer_cb(unsigned long arg) @@ -1814,6 +1814,9 @@ static int __net_init fib6_net_init(struct net *net) { size_t size = sizeof(struct hlist_head) * FIB6_TABLE_HASHSZ; + spin_lock_init(&net->ipv6.fib6_gc_lock); + rwlock_init(&net->ipv6.fib6_walker_lock); + INIT_LIST_HEAD(&net->ipv6.fib6_walkers); setup_timer(&net->ipv6.ip6_fib_timer, fib6_gc_timer_cb, (unsigned long)net); net->ipv6.rt6_stats = kzalloc(sizeof(*net->ipv6.rt6_stats), GFP_KERNEL); @@ -1974,7 +1977,8 @@ static int ipv6_route_yield(struct fib6_walker *w) return 0; } -static void ipv6_route_seq_setup_walk(struct ipv6_route_iter *iter) +static void ipv6_route_seq_setup_walk(struct ipv6_route_iter *iter, + struct net *net) { memset(&iter->w, 0, sizeof(iter->w)); iter->w.func = ipv6_route_yield; @@ -1984,7 +1988,7 @@ static void ipv6_route_seq_setup_walk(struct ipv6_route_iter *iter) iter->w.args = iter; iter->sernum = iter->w.root->fn_sernum; INIT_LIST_HEAD(&iter->w.lh); - fib6_walker_link(&iter->w); + fib6_walker_link(net, &iter->w); } static struct fib6_table *ipv6_route_seq_next_table(struct fib6_table *tbl, @@ -2045,16 +2049,16 @@ iter_table: ++*pos; return iter->w.leaf; } else if (r < 0) { - fib6_walker_unlink(&iter->w); + fib6_walker_unlink(net, &iter->w); return NULL; } - fib6_walker_unlink(&iter->w); + fib6_walker_unlink(net, &iter->w); iter->tbl = ipv6_route_seq_next_table(iter->tbl, net); if (!iter->tbl) return NULL; - ipv6_route_seq_setup_walk(iter); + ipv6_route_seq_setup_walk(iter, net); goto iter_table; } @@ -2069,7 +2073,7 @@ static void *ipv6_route_seq_start(struct seq_file *seq, loff_t *pos) iter->skip = *pos; if (iter->tbl) { - ipv6_route_seq_setup_walk(iter); + ipv6_route_seq_setup_walk(iter, net); return ipv6_route_seq_next(seq, NULL, pos); } else { return NULL; @@ -2085,10 +2089,11 @@ static bool ipv6_route_iter_active(struct ipv6_route_iter *iter) static void ipv6_route_seq_stop(struct seq_file *seq, void *v) __releases(RCU_BH) { + struct net *net = seq_file_net(seq); struct ipv6_route_iter *iter = seq->private; if (ipv6_route_iter_active(iter)) - fib6_walker_unlink(&iter->w); + fib6_walker_unlink(net, &iter->w); rcu_read_unlock_bh(); } diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index c0d4dc1c5..4e636e60a 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -360,7 +360,7 @@ static void ip6gre_tunnel_uninit(struct net_device *dev) struct ip6gre_net *ign = net_generic(t->net, ip6gre_net_id); ip6gre_tunnel_unlink(ign, t); - ip6_tnl_dst_reset(t); + dst_cache_reset(&t->dst_cache); dev_put(dev); } @@ -633,7 +633,7 @@ static netdev_tx_t ip6gre_xmit2(struct sk_buff *skb, } if (!fl6->flowi6_mark) - dst = ip6_tnl_dst_get(tunnel); + dst = dst_cache_get(&tunnel->dst_cache); if (!dst) { dst = ip6_route_output(net, NULL, fl6); @@ -702,7 +702,7 @@ static netdev_tx_t ip6gre_xmit2(struct sk_buff *skb, } if (!fl6->flowi6_mark && ndst) - ip6_tnl_dst_set(tunnel, ndst); + dst_cache_set_ip6(&tunnel->dst_cache, ndst, &fl6->saddr); skb_dst_set(skb, dst); proto = NEXTHDR_GRE; @@ -1011,7 +1011,7 @@ static int ip6gre_tnl_change(struct ip6_tnl *t, t->parms.o_key = p->o_key; t->parms.i_flags = p->i_flags; t->parms.o_flags = p->o_flags; - ip6_tnl_dst_reset(t); + dst_cache_reset(&t->dst_cache); ip6gre_tnl_link_config(t, set_mtu); return 0; } @@ -1221,7 +1221,7 @@ static void ip6gre_dev_free(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); - ip6_tnl_dst_destroy(t); + dst_cache_destroy(&t->dst_cache); free_percpu(dev->tstats); free_netdev(dev); } @@ -1259,7 +1259,7 @@ static int ip6gre_tunnel_init_common(struct net_device *dev) if (!dev->tstats) return -ENOMEM; - ret = ip6_tnl_dst_init(tunnel); + ret = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL); if (ret) { free_percpu(dev->tstats); dev->tstats = NULL; diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index 9075acf08..c05c425c2 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -49,7 +49,7 @@ int ip6_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { - if (sysctl_ip_early_demux && !skb_dst(skb) && skb->sk == NULL) { + if (net->ipv4.sysctl_ip_early_demux && !skb_dst(skb) && skb->sk == NULL) { const struct inet6_protocol *ipprot; ipprot = rcu_dereference(inet6_protos[ipv6_hdr(skb)->nexthdr]); @@ -134,6 +134,16 @@ int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt IPV6_ADDR_MC_SCOPE(&hdr->daddr) == 1) goto err; + /* If enabled, drop unicast packets that were encapsulated in link-layer + * multicast or broadcast to protected against the so-called "hole-196" + * attack in 802.11 wireless. + */ + if (!ipv6_addr_is_multicast(&hdr->daddr) && + (skb->pkt_type == PACKET_BROADCAST || + skb->pkt_type == PACKET_MULTICAST) && + idev->cnf.drop_unicast_in_l2_multicast) + goto err; + /* RFC4291 2.7 * Nodes must not originate a packet to a multicast address whose scope * field contains the reserved value 0; if such a packet is received, it diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c index eeca943f1..82e9f3076 100644 --- a/net/ipv6/ip6_offload.c +++ b/net/ipv6/ip6_offload.c @@ -258,6 +258,19 @@ out: return pp; } +static struct sk_buff **sit_gro_receive(struct sk_buff **head, + struct sk_buff *skb) +{ + if (NAPI_GRO_CB(skb)->encap_mark) { + NAPI_GRO_CB(skb)->flush = 1; + return NULL; + } + + NAPI_GRO_CB(skb)->encap_mark = 1; + + return ipv6_gro_receive(head, skb); +} + static int ipv6_gro_complete(struct sk_buff *skb, int nhoff) { const struct net_offload *ops; @@ -302,7 +315,7 @@ static struct packet_offload ipv6_packet_offload __read_mostly = { static const struct net_offload sit_offload = { .callbacks = { .gso_segment = ipv6_gso_segment, - .gro_receive = ipv6_gro_receive, + .gro_receive = sit_gro_receive, .gro_complete = sit_gro_complete, }, }; diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 2a6606c93..bc972e715 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -332,7 +332,6 @@ static int ip6_forward_proxy_check(struct sk_buff *skb) static inline int ip6_forward_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { - skb_sender_cpu_clear(skb); return dst_output(net, sk, skb); } diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 3991b21e2..1f20345cb 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -122,97 +122,6 @@ static struct net_device_stats *ip6_get_stats(struct net_device *dev) return &dev->stats; } -/* - * Locking : hash tables are protected by RCU and RTNL - */ - -static void ip6_tnl_per_cpu_dst_set(struct ip6_tnl_dst *idst, - struct dst_entry *dst) -{ - write_seqlock_bh(&idst->lock); - dst_release(rcu_dereference_protected( - idst->dst, - lockdep_is_held(&idst->lock.lock))); - if (dst) { - dst_hold(dst); - idst->cookie = rt6_get_cookie((struct rt6_info *)dst); - } else { - idst->cookie = 0; - } - rcu_assign_pointer(idst->dst, dst); - write_sequnlock_bh(&idst->lock); -} - -struct dst_entry *ip6_tnl_dst_get(struct ip6_tnl *t) -{ - struct ip6_tnl_dst *idst; - struct dst_entry *dst; - unsigned int seq; - u32 cookie; - - idst = raw_cpu_ptr(t->dst_cache); - - rcu_read_lock(); - do { - seq = read_seqbegin(&idst->lock); - dst = rcu_dereference(idst->dst); - cookie = idst->cookie; - } while (read_seqretry(&idst->lock, seq)); - - if (dst && !atomic_inc_not_zero(&dst->__refcnt)) - dst = NULL; - rcu_read_unlock(); - - if (dst && dst->obsolete && !dst->ops->check(dst, cookie)) { - ip6_tnl_per_cpu_dst_set(idst, NULL); - dst_release(dst); - dst = NULL; - } - return dst; -} -EXPORT_SYMBOL_GPL(ip6_tnl_dst_get); - -void ip6_tnl_dst_reset(struct ip6_tnl *t) -{ - int i; - - for_each_possible_cpu(i) - ip6_tnl_per_cpu_dst_set(per_cpu_ptr(t->dst_cache, i), NULL); -} -EXPORT_SYMBOL_GPL(ip6_tnl_dst_reset); - -void ip6_tnl_dst_set(struct ip6_tnl *t, struct dst_entry *dst) -{ - ip6_tnl_per_cpu_dst_set(raw_cpu_ptr(t->dst_cache), dst); - -} -EXPORT_SYMBOL_GPL(ip6_tnl_dst_set); - -void ip6_tnl_dst_destroy(struct ip6_tnl *t) -{ - if (!t->dst_cache) - return; - - ip6_tnl_dst_reset(t); - free_percpu(t->dst_cache); -} -EXPORT_SYMBOL_GPL(ip6_tnl_dst_destroy); - -int ip6_tnl_dst_init(struct ip6_tnl *t) -{ - int i; - - t->dst_cache = alloc_percpu(struct ip6_tnl_dst); - if (!t->dst_cache) - return -ENOMEM; - - for_each_possible_cpu(i) - seqlock_init(&per_cpu_ptr(t->dst_cache, i)->lock); - - return 0; -} -EXPORT_SYMBOL_GPL(ip6_tnl_dst_init); - /** * ip6_tnl_lookup - fetch tunnel matching the end-point addresses * @remote: the address of the tunnel exit-point @@ -329,7 +238,7 @@ static void ip6_dev_free(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); - ip6_tnl_dst_destroy(t); + dst_cache_destroy(&t->dst_cache); free_percpu(dev->tstats); free_netdev(dev); } @@ -462,7 +371,7 @@ ip6_tnl_dev_uninit(struct net_device *dev) RCU_INIT_POINTER(ip6n->tnls_wc[0], NULL); else ip6_tnl_unlink(ip6n, t); - ip6_tnl_dst_reset(t); + dst_cache_reset(&t->dst_cache); dev_put(dev); } @@ -1069,7 +978,7 @@ static int ip6_tnl_xmit2(struct sk_buff *skb, memcpy(&fl6->daddr, addr6, sizeof(fl6->daddr)); neigh_release(neigh); } else if (!fl6->flowi6_mark) - dst = ip6_tnl_dst_get(t); + dst = dst_cache_get(&t->dst_cache); if (!ip6_tnl_xmit_ctl(t, &fl6->saddr, &fl6->daddr)) goto tx_err_link_failure; @@ -1133,7 +1042,7 @@ static int ip6_tnl_xmit2(struct sk_buff *skb, } if (!fl6->flowi6_mark && ndst) - ip6_tnl_dst_set(t, ndst); + dst_cache_set_ip6(&t->dst_cache, ndst, &fl6->saddr); skb_dst_set(skb, dst); skb->transport_header = skb->network_header; @@ -1368,7 +1277,7 @@ ip6_tnl_change(struct ip6_tnl *t, const struct __ip6_tnl_parm *p) t->parms.flowinfo = p->flowinfo; t->parms.link = p->link; t->parms.proto = p->proto; - ip6_tnl_dst_reset(t); + dst_cache_reset(&t->dst_cache); ip6_tnl_link_config(t); return 0; } @@ -1639,7 +1548,7 @@ ip6_tnl_dev_init_gen(struct net_device *dev) if (!dev->tstats) return -ENOMEM; - ret = ip6_tnl_dst_init(t); + ret = dst_cache_init(&t->dst_cache, GFP_KERNEL); if (ret) { free_percpu(dev->tstats); dev->tstats = NULL; diff --git a/net/ipv6/ip6_udp_tunnel.c b/net/ipv6/ip6_udp_tunnel.c index 14dacf1df..a7520528e 100644 --- a/net/ipv6/ip6_udp_tunnel.c +++ b/net/ipv6/ip6_udp_tunnel.c @@ -73,8 +73,8 @@ int udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, struct net_device *dev, struct in6_addr *saddr, struct in6_addr *daddr, - __u8 prio, __u8 ttl, __be16 src_port, - __be16 dst_port, bool nocheck) + __u8 prio, __u8 ttl, __be32 label, + __be16 src_port, __be16 dst_port, bool nocheck) { struct udphdr *uh; struct ipv6hdr *ip6h; @@ -98,7 +98,7 @@ int udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sock *sk, __skb_push(skb, sizeof(*ip6h)); skb_reset_network_header(skb); ip6h = ipv6_hdr(skb); - ip6_flow_hdr(ip6h, prio, htonl(0)); + ip6_flow_hdr(ip6h, prio, label); ip6h->payload_len = htons(skb->len); ip6h->nexthdr = IPPROTO_UDP; ip6h->hop_limit = ttl; diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index 0a8610b33..d90a11f14 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -640,7 +640,7 @@ vti6_tnl_change(struct ip6_tnl *t, const struct __ip6_tnl_parm *p) t->parms.i_key = p->i_key; t->parms.o_key = p->o_key; t->parms.proto = p->proto; - ip6_tnl_dst_reset(t); + dst_cache_reset(&t->dst_cache); vti6_link_config(t); return 0; } diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 84afb9a77..c245895a3 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -883,6 +883,7 @@ static void ndisc_recv_na(struct sk_buff *skb) offsetof(struct nd_msg, opt)); struct ndisc_options ndopts; struct net_device *dev = skb->dev; + struct inet6_dev *idev = __in6_dev_get(dev); struct inet6_ifaddr *ifp; struct neighbour *neigh; @@ -902,6 +903,14 @@ static void ndisc_recv_na(struct sk_buff *skb) return; } + /* For some 802.11 wireless deployments (and possibly other networks), + * there will be a NA proxy and unsolicitd packets are attacks + * and thus should not be accepted. + */ + if (!msg->icmph.icmp6_solicited && idev && + idev->cnf.drop_unsolicited_na) + return; + if (!ndisc_parse_options(msg->opt, ndoptlen, &ndopts)) { ND_PRINTK(2, warn, "NS: invalid ND option\n"); return; diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 99425cf28..86b67b70b 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -198,11 +198,12 @@ get_entry(const void *base, unsigned int offset) /* All zeroes == unconditional rule. */ /* Mildly perf critical (only if packet tracing is on) */ -static inline bool unconditional(const struct ip6t_ip6 *ipv6) +static inline bool unconditional(const struct ip6t_entry *e) { static const struct ip6t_ip6 uncond; - return memcmp(ipv6, &uncond, sizeof(uncond)) == 0; + return e->target_offset == sizeof(struct ip6t_entry) && + memcmp(&e->ipv6, &uncond, sizeof(uncond)) == 0; } static inline const struct xt_entry_target * @@ -258,11 +259,10 @@ get_chainname_rulenum(const struct ip6t_entry *s, const struct ip6t_entry *e, } else if (s == e) { (*rulenum)++; - if (s->target_offset == sizeof(struct ip6t_entry) && + if (unconditional(s) && strcmp(t->target.u.kernel.target->name, XT_STANDARD_TARGET) == 0 && - t->verdict < 0 && - unconditional(&s->ipv6)) { + t->verdict < 0) { /* Tail of chains: STANDARD target (return/policy) */ *comment = *chainname == hookname ? comments[NF_IP6_TRACE_COMMENT_POLICY] @@ -488,11 +488,10 @@ mark_source_chains(const struct xt_table_info *newinfo, e->comefrom |= ((1 << hook) | (1 << NF_INET_NUMHOOKS)); /* Unconditional return/END. */ - if ((e->target_offset == sizeof(struct ip6t_entry) && + if ((unconditional(e) && (strcmp(t->target.u.user.name, XT_STANDARD_TARGET) == 0) && - t->verdict < 0 && - unconditional(&e->ipv6)) || visited) { + t->verdict < 0) || visited) { unsigned int oldpos, size; if ((strcmp(t->target.u.user.name, @@ -581,14 +580,12 @@ static void cleanup_match(struct xt_entry_match *m, struct net *net) } static int -check_entry(const struct ip6t_entry *e, const char *name) +check_entry(const struct ip6t_entry *e) { const struct xt_entry_target *t; - if (!ip6_checkentry(&e->ipv6)) { - duprintf("ip_tables: ip check failed %p %s.\n", e, name); + if (!ip6_checkentry(&e->ipv6)) return -EINVAL; - } if (e->target_offset + sizeof(struct xt_entry_target) > e->next_offset) @@ -679,10 +676,6 @@ find_check_entry(struct ip6t_entry *e, struct net *net, const char *name, struct xt_mtchk_param mtpar; struct xt_entry_match *ematch; - ret = check_entry(e, name); - if (ret) - return ret; - e->counters.pcnt = xt_percpu_counter_alloc(); if (IS_ERR_VALUE(e->counters.pcnt)) return -ENOMEM; @@ -733,7 +726,7 @@ static bool check_underflow(const struct ip6t_entry *e) const struct xt_entry_target *t; unsigned int verdict; - if (!unconditional(&e->ipv6)) + if (!unconditional(e)) return false; t = ip6t_get_target_c(e); if (strcmp(t->u.user.name, XT_STANDARD_TARGET) != 0) @@ -753,9 +746,11 @@ check_entry_size_and_hooks(struct ip6t_entry *e, unsigned int valid_hooks) { unsigned int h; + int err; if ((unsigned long)e % __alignof__(struct ip6t_entry) != 0 || - (unsigned char *)e + sizeof(struct ip6t_entry) >= limit) { + (unsigned char *)e + sizeof(struct ip6t_entry) >= limit || + (unsigned char *)e + e->next_offset > limit) { duprintf("Bad offset %p\n", e); return -EINVAL; } @@ -767,6 +762,10 @@ check_entry_size_and_hooks(struct ip6t_entry *e, return -EINVAL; } + err = check_entry(e); + if (err) + return err; + /* Check hooks & underflows */ for (h = 0; h < NF_INET_NUMHOOKS; h++) { if (!(valid_hooks & (1 << h))) @@ -775,9 +774,9 @@ check_entry_size_and_hooks(struct ip6t_entry *e, newinfo->hook_entry[h] = hook_entries[h]; if ((unsigned char *)e - base == underflows[h]) { if (!check_underflow(e)) { - pr_err("Underflows must be unconditional and " - "use the STANDARD target with " - "ACCEPT/DROP\n"); + pr_debug("Underflows must be unconditional and " + "use the STANDARD target with " + "ACCEPT/DROP\n"); return -EINVAL; } newinfo->underflow[h] = underflows[h]; @@ -1169,6 +1168,7 @@ get_entries(struct net *net, struct ip6t_get_entries __user *uptr, *len, sizeof(get) + get.size); return -EINVAL; } + get.name[sizeof(get.name) - 1] = '\0'; t = xt_find_table_lock(net, AF_INET6, get.name); if (!IS_ERR_OR_NULL(t)) { @@ -1505,7 +1505,8 @@ check_compat_entry_size_and_hooks(struct compat_ip6t_entry *e, duprintf("check_compat_entry_size_and_hooks %p\n", e); if ((unsigned long)e % __alignof__(struct compat_ip6t_entry) != 0 || - (unsigned char *)e + sizeof(struct compat_ip6t_entry) >= limit) { + (unsigned char *)e + sizeof(struct compat_ip6t_entry) >= limit || + (unsigned char *)e + e->next_offset > limit) { duprintf("Bad offset %p, limit = %p\n", e, limit); return -EINVAL; } @@ -1518,7 +1519,7 @@ check_compat_entry_size_and_hooks(struct compat_ip6t_entry *e, } /* For purposes of check_entry casting the compat entry is fine */ - ret = check_entry((struct ip6t_entry *)e, name); + ret = check_entry((struct ip6t_entry *)e); if (ret) return ret; @@ -1944,6 +1945,7 @@ compat_get_entries(struct net *net, struct compat_ip6t_get_entries __user *uptr, *len, sizeof(get) + get.size); return -EINVAL; } + get.name[sizeof(get.name) - 1] = '\0'; xt_compat_lock(AF_INET6); t = xt_find_table_lock(net, AF_INET6, get.name); @@ -2071,9 +2073,28 @@ do_ip6t_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) return ret; } -struct xt_table *ip6t_register_table(struct net *net, - const struct xt_table *table, - const struct ip6t_replace *repl) +static void __ip6t_unregister_table(struct net *net, struct xt_table *table) +{ + struct xt_table_info *private; + void *loc_cpu_entry; + struct module *table_owner = table->me; + struct ip6t_entry *iter; + + private = xt_unregister_table(table); + + /* Decrease module usage counts and free resources */ + loc_cpu_entry = private->entries; + xt_entry_foreach(iter, loc_cpu_entry, private->size) + cleanup_entry(iter, net); + if (private->number > private->initial_entries) + module_put(table_owner); + xt_free_table_info(private); +} + +int ip6t_register_table(struct net *net, const struct xt_table *table, + const struct ip6t_replace *repl, + const struct nf_hook_ops *ops, + struct xt_table **res) { int ret; struct xt_table_info *newinfo; @@ -2082,10 +2103,8 @@ struct xt_table *ip6t_register_table(struct net *net, struct xt_table *new_table; newinfo = xt_alloc_table_info(repl->size); - if (!newinfo) { - ret = -ENOMEM; - goto out; - } + if (!newinfo) + return -ENOMEM; loc_cpu_entry = newinfo->entries; memcpy(loc_cpu_entry, repl->entries, repl->size); @@ -2099,30 +2118,28 @@ struct xt_table *ip6t_register_table(struct net *net, ret = PTR_ERR(new_table); goto out_free; } - return new_table; + + /* set res now, will see skbs right after nf_register_net_hooks */ + WRITE_ONCE(*res, new_table); + + ret = nf_register_net_hooks(net, ops, hweight32(table->valid_hooks)); + if (ret != 0) { + __ip6t_unregister_table(net, new_table); + *res = NULL; + } + + return ret; out_free: xt_free_table_info(newinfo); -out: - return ERR_PTR(ret); + return ret; } -void ip6t_unregister_table(struct net *net, struct xt_table *table) +void ip6t_unregister_table(struct net *net, struct xt_table *table, + const struct nf_hook_ops *ops) { - struct xt_table_info *private; - void *loc_cpu_entry; - struct module *table_owner = table->me; - struct ip6t_entry *iter; - - private = xt_unregister_table(table); - - /* Decrease module usage counts and free resources */ - loc_cpu_entry = private->entries; - xt_entry_foreach(iter, loc_cpu_entry, private->size) - cleanup_entry(iter, net); - if (private->number > private->initial_entries) - module_put(table_owner); - xt_free_table_info(private); + nf_unregister_net_hooks(net, ops, hweight32(table->valid_hooks)); + __ip6t_unregister_table(net, table); } /* Returns 1 if the type and code is matched by the range, 0 otherwise */ diff --git a/net/ipv6/netfilter/ip6table_filter.c b/net/ipv6/netfilter/ip6table_filter.c index 8b277b983..1343077dd 100644 --- a/net/ipv6/netfilter/ip6table_filter.c +++ b/net/ipv6/netfilter/ip6table_filter.c @@ -22,12 +22,15 @@ MODULE_DESCRIPTION("ip6tables filter table"); (1 << NF_INET_FORWARD) | \ (1 << NF_INET_LOCAL_OUT)) +static int __net_init ip6table_filter_table_init(struct net *net); + static const struct xt_table packet_filter = { .name = "filter", .valid_hooks = FILTER_VALID_HOOKS, .me = THIS_MODULE, .af = NFPROTO_IPV6, .priority = NF_IP6_PRI_FILTER, + .table_init = ip6table_filter_table_init, }; /* The work comes in here from netfilter.c. */ @@ -44,9 +47,13 @@ static struct nf_hook_ops *filter_ops __read_mostly; static bool forward = true; module_param(forward, bool, 0000); -static int __net_init ip6table_filter_net_init(struct net *net) +static int __net_init ip6table_filter_table_init(struct net *net) { struct ip6t_replace *repl; + int err; + + if (net->ipv6.ip6table_filter) + return 0; repl = ip6t_alloc_initial_table(&packet_filter); if (repl == NULL) @@ -55,15 +62,26 @@ static int __net_init ip6table_filter_net_init(struct net *net) ((struct ip6t_standard *)repl->entries)[1].target.verdict = forward ? -NF_ACCEPT - 1 : -NF_DROP - 1; - net->ipv6.ip6table_filter = - ip6t_register_table(net, &packet_filter, repl); + err = ip6t_register_table(net, &packet_filter, repl, filter_ops, + &net->ipv6.ip6table_filter); kfree(repl); - return PTR_ERR_OR_ZERO(net->ipv6.ip6table_filter); + return err; +} + +static int __net_init ip6table_filter_net_init(struct net *net) +{ + if (net == &init_net || !forward) + return ip6table_filter_table_init(net); + + return 0; } static void __net_exit ip6table_filter_net_exit(struct net *net) { - ip6t_unregister_table(net, net->ipv6.ip6table_filter); + if (!net->ipv6.ip6table_filter) + return; + ip6t_unregister_table(net, net->ipv6.ip6table_filter, filter_ops); + net->ipv6.ip6table_filter = NULL; } static struct pernet_operations ip6table_filter_net_ops = { @@ -75,28 +93,21 @@ static int __init ip6table_filter_init(void) { int ret; + filter_ops = xt_hook_ops_alloc(&packet_filter, ip6table_filter_hook); + if (IS_ERR(filter_ops)) + return PTR_ERR(filter_ops); + ret = register_pernet_subsys(&ip6table_filter_net_ops); if (ret < 0) - return ret; - - /* Register hooks */ - filter_ops = xt_hook_link(&packet_filter, ip6table_filter_hook); - if (IS_ERR(filter_ops)) { - ret = PTR_ERR(filter_ops); - goto cleanup_table; - } + kfree(filter_ops); return ret; - - cleanup_table: - unregister_pernet_subsys(&ip6table_filter_net_ops); - return ret; } static void __exit ip6table_filter_fini(void) { - xt_hook_unlink(&packet_filter, filter_ops); unregister_pernet_subsys(&ip6table_filter_net_ops); + kfree(filter_ops); } module_init(ip6table_filter_init); diff --git a/net/ipv6/netfilter/ip6table_mangle.c b/net/ipv6/netfilter/ip6table_mangle.c index abe278b07..cb2b28883 100644 --- a/net/ipv6/netfilter/ip6table_mangle.c +++ b/net/ipv6/netfilter/ip6table_mangle.c @@ -23,12 +23,15 @@ MODULE_DESCRIPTION("ip6tables mangle table"); (1 << NF_INET_LOCAL_OUT) | \ (1 << NF_INET_POST_ROUTING)) +static int __net_init ip6table_mangle_table_init(struct net *net); + static const struct xt_table packet_mangler = { .name = "mangle", .valid_hooks = MANGLE_VALID_HOOKS, .me = THIS_MODULE, .af = NFPROTO_IPV6, .priority = NF_IP6_PRI_MANGLE, + .table_init = ip6table_mangle_table_init, }; static unsigned int @@ -88,26 +91,33 @@ ip6table_mangle_hook(void *priv, struct sk_buff *skb, } static struct nf_hook_ops *mangle_ops __read_mostly; -static int __net_init ip6table_mangle_net_init(struct net *net) +static int __net_init ip6table_mangle_table_init(struct net *net) { struct ip6t_replace *repl; + int ret; + + if (net->ipv6.ip6table_mangle) + return 0; repl = ip6t_alloc_initial_table(&packet_mangler); if (repl == NULL) return -ENOMEM; - net->ipv6.ip6table_mangle = - ip6t_register_table(net, &packet_mangler, repl); + ret = ip6t_register_table(net, &packet_mangler, repl, mangle_ops, + &net->ipv6.ip6table_mangle); kfree(repl); - return PTR_ERR_OR_ZERO(net->ipv6.ip6table_mangle); + return ret; } static void __net_exit ip6table_mangle_net_exit(struct net *net) { - ip6t_unregister_table(net, net->ipv6.ip6table_mangle); + if (!net->ipv6.ip6table_mangle) + return; + + ip6t_unregister_table(net, net->ipv6.ip6table_mangle, mangle_ops); + net->ipv6.ip6table_mangle = NULL; } static struct pernet_operations ip6table_mangle_net_ops = { - .init = ip6table_mangle_net_init, .exit = ip6table_mangle_net_exit, }; @@ -115,28 +125,28 @@ static int __init ip6table_mangle_init(void) { int ret; + mangle_ops = xt_hook_ops_alloc(&packet_mangler, ip6table_mangle_hook); + if (IS_ERR(mangle_ops)) + return PTR_ERR(mangle_ops); + ret = register_pernet_subsys(&ip6table_mangle_net_ops); - if (ret < 0) + if (ret < 0) { + kfree(mangle_ops); return ret; - - /* Register hooks */ - mangle_ops = xt_hook_link(&packet_mangler, ip6table_mangle_hook); - if (IS_ERR(mangle_ops)) { - ret = PTR_ERR(mangle_ops); - goto cleanup_table; } - return ret; - - cleanup_table: - unregister_pernet_subsys(&ip6table_mangle_net_ops); + ret = ip6table_mangle_table_init(&init_net); + if (ret) { + unregister_pernet_subsys(&ip6table_mangle_net_ops); + kfree(mangle_ops); + } return ret; } static void __exit ip6table_mangle_fini(void) { - xt_hook_unlink(&packet_mangler, mangle_ops); unregister_pernet_subsys(&ip6table_mangle_net_ops); + kfree(mangle_ops); } module_init(ip6table_mangle_init); diff --git a/net/ipv6/netfilter/ip6table_nat.c b/net/ipv6/netfilter/ip6table_nat.c index de2a10a56..7d2bd9402 100644 --- a/net/ipv6/netfilter/ip6table_nat.c +++ b/net/ipv6/netfilter/ip6table_nat.c @@ -20,6 +20,8 @@ #include <net/netfilter/nf_nat_core.h> #include <net/netfilter/nf_nat_l3proto.h> +static int __net_init ip6table_nat_table_init(struct net *net); + static const struct xt_table nf_nat_ipv6_table = { .name = "nat", .valid_hooks = (1 << NF_INET_PRE_ROUTING) | @@ -28,6 +30,7 @@ static const struct xt_table nf_nat_ipv6_table = { (1 << NF_INET_LOCAL_IN), .me = THIS_MODULE, .af = NFPROTO_IPV6, + .table_init = ip6table_nat_table_init, }; static unsigned int ip6table_nat_do_chain(void *priv, @@ -97,50 +100,50 @@ static struct nf_hook_ops nf_nat_ipv6_ops[] __read_mostly = { }, }; -static int __net_init ip6table_nat_net_init(struct net *net) +static int __net_init ip6table_nat_table_init(struct net *net) { struct ip6t_replace *repl; + int ret; + + if (net->ipv6.ip6table_nat) + return 0; repl = ip6t_alloc_initial_table(&nf_nat_ipv6_table); if (repl == NULL) return -ENOMEM; - net->ipv6.ip6table_nat = ip6t_register_table(net, &nf_nat_ipv6_table, repl); + ret = ip6t_register_table(net, &nf_nat_ipv6_table, repl, + nf_nat_ipv6_ops, &net->ipv6.ip6table_nat); kfree(repl); - return PTR_ERR_OR_ZERO(net->ipv6.ip6table_nat); + return ret; } static void __net_exit ip6table_nat_net_exit(struct net *net) { - ip6t_unregister_table(net, net->ipv6.ip6table_nat); + if (!net->ipv6.ip6table_nat) + return; + ip6t_unregister_table(net, net->ipv6.ip6table_nat, nf_nat_ipv6_ops); + net->ipv6.ip6table_nat = NULL; } static struct pernet_operations ip6table_nat_net_ops = { - .init = ip6table_nat_net_init, .exit = ip6table_nat_net_exit, }; static int __init ip6table_nat_init(void) { - int err; + int ret = register_pernet_subsys(&ip6table_nat_net_ops); - err = register_pernet_subsys(&ip6table_nat_net_ops); - if (err < 0) - goto err1; + if (ret) + return ret; - err = nf_register_hooks(nf_nat_ipv6_ops, ARRAY_SIZE(nf_nat_ipv6_ops)); - if (err < 0) - goto err2; - return 0; - -err2: - unregister_pernet_subsys(&ip6table_nat_net_ops); -err1: - return err; + ret = ip6table_nat_table_init(&init_net); + if (ret) + unregister_pernet_subsys(&ip6table_nat_net_ops); + return ret; } static void __exit ip6table_nat_exit(void) { - nf_unregister_hooks(nf_nat_ipv6_ops, ARRAY_SIZE(nf_nat_ipv6_ops)); unregister_pernet_subsys(&ip6table_nat_net_ops); } diff --git a/net/ipv6/netfilter/ip6table_raw.c b/net/ipv6/netfilter/ip6table_raw.c index 902196356..d4bc56443 100644 --- a/net/ipv6/netfilter/ip6table_raw.c +++ b/net/ipv6/netfilter/ip6table_raw.c @@ -9,12 +9,15 @@ #define RAW_VALID_HOOKS ((1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_OUT)) +static int __net_init ip6table_raw_table_init(struct net *net); + static const struct xt_table packet_raw = { .name = "raw", .valid_hooks = RAW_VALID_HOOKS, .me = THIS_MODULE, .af = NFPROTO_IPV6, .priority = NF_IP6_PRI_RAW, + .table_init = ip6table_raw_table_init, }; /* The work comes in here from netfilter.c. */ @@ -27,26 +30,32 @@ ip6table_raw_hook(void *priv, struct sk_buff *skb, static struct nf_hook_ops *rawtable_ops __read_mostly; -static int __net_init ip6table_raw_net_init(struct net *net) +static int __net_init ip6table_raw_table_init(struct net *net) { struct ip6t_replace *repl; + int ret; + + if (net->ipv6.ip6table_raw) + return 0; repl = ip6t_alloc_initial_table(&packet_raw); if (repl == NULL) return -ENOMEM; - net->ipv6.ip6table_raw = - ip6t_register_table(net, &packet_raw, repl); + ret = ip6t_register_table(net, &packet_raw, repl, rawtable_ops, + &net->ipv6.ip6table_raw); kfree(repl); - return PTR_ERR_OR_ZERO(net->ipv6.ip6table_raw); + return ret; } static void __net_exit ip6table_raw_net_exit(struct net *net) { - ip6t_unregister_table(net, net->ipv6.ip6table_raw); + if (!net->ipv6.ip6table_raw) + return; + ip6t_unregister_table(net, net->ipv6.ip6table_raw, rawtable_ops); + net->ipv6.ip6table_raw = NULL; } static struct pernet_operations ip6table_raw_net_ops = { - .init = ip6table_raw_net_init, .exit = ip6table_raw_net_exit, }; @@ -54,28 +63,29 @@ static int __init ip6table_raw_init(void) { int ret; + /* Register hooks */ + rawtable_ops = xt_hook_ops_alloc(&packet_raw, ip6table_raw_hook); + if (IS_ERR(rawtable_ops)) + return PTR_ERR(rawtable_ops); + ret = register_pernet_subsys(&ip6table_raw_net_ops); - if (ret < 0) + if (ret < 0) { + kfree(rawtable_ops); return ret; - - /* Register hooks */ - rawtable_ops = xt_hook_link(&packet_raw, ip6table_raw_hook); - if (IS_ERR(rawtable_ops)) { - ret = PTR_ERR(rawtable_ops); - goto cleanup_table; } - return ret; - - cleanup_table: - unregister_pernet_subsys(&ip6table_raw_net_ops); + ret = ip6table_raw_table_init(&init_net); + if (ret) { + unregister_pernet_subsys(&ip6table_raw_net_ops); + kfree(rawtable_ops); + } return ret; } static void __exit ip6table_raw_fini(void) { - xt_hook_unlink(&packet_raw, rawtable_ops); unregister_pernet_subsys(&ip6table_raw_net_ops); + kfree(rawtable_ops); } module_init(ip6table_raw_init); diff --git a/net/ipv6/netfilter/ip6table_security.c b/net/ipv6/netfilter/ip6table_security.c index 0d856fedf..cf26ccb04 100644 --- a/net/ipv6/netfilter/ip6table_security.c +++ b/net/ipv6/netfilter/ip6table_security.c @@ -27,12 +27,15 @@ MODULE_DESCRIPTION("ip6tables security table, for MAC rules"); (1 << NF_INET_FORWARD) | \ (1 << NF_INET_LOCAL_OUT) +static int __net_init ip6table_security_table_init(struct net *net); + static const struct xt_table security_table = { .name = "security", .valid_hooks = SECURITY_VALID_HOOKS, .me = THIS_MODULE, .af = NFPROTO_IPV6, .priority = NF_IP6_PRI_SECURITY, + .table_init = ip6table_security_table_init, }; static unsigned int @@ -44,26 +47,32 @@ ip6table_security_hook(void *priv, struct sk_buff *skb, static struct nf_hook_ops *sectbl_ops __read_mostly; -static int __net_init ip6table_security_net_init(struct net *net) +static int __net_init ip6table_security_table_init(struct net *net) { struct ip6t_replace *repl; + int ret; + + if (net->ipv6.ip6table_security) + return 0; repl = ip6t_alloc_initial_table(&security_table); if (repl == NULL) return -ENOMEM; - net->ipv6.ip6table_security = - ip6t_register_table(net, &security_table, repl); + ret = ip6t_register_table(net, &security_table, repl, sectbl_ops, + &net->ipv6.ip6table_security); kfree(repl); - return PTR_ERR_OR_ZERO(net->ipv6.ip6table_security); + return ret; } static void __net_exit ip6table_security_net_exit(struct net *net) { - ip6t_unregister_table(net, net->ipv6.ip6table_security); + if (!net->ipv6.ip6table_security) + return; + ip6t_unregister_table(net, net->ipv6.ip6table_security, sectbl_ops); + net->ipv6.ip6table_security = NULL; } static struct pernet_operations ip6table_security_net_ops = { - .init = ip6table_security_net_init, .exit = ip6table_security_net_exit, }; @@ -71,27 +80,28 @@ static int __init ip6table_security_init(void) { int ret; + sectbl_ops = xt_hook_ops_alloc(&security_table, ip6table_security_hook); + if (IS_ERR(sectbl_ops)) + return PTR_ERR(sectbl_ops); + ret = register_pernet_subsys(&ip6table_security_net_ops); - if (ret < 0) + if (ret < 0) { + kfree(sectbl_ops); return ret; - - sectbl_ops = xt_hook_link(&security_table, ip6table_security_hook); - if (IS_ERR(sectbl_ops)) { - ret = PTR_ERR(sectbl_ops); - goto cleanup_table; } - return ret; - -cleanup_table: - unregister_pernet_subsys(&ip6table_security_net_ops); + ret = ip6table_security_table_init(&init_net); + if (ret) { + unregister_pernet_subsys(&ip6table_security_net_ops); + kfree(sectbl_ops); + } return ret; } static void __exit ip6table_security_fini(void) { - xt_hook_unlink(&security_table, sectbl_ops); unregister_pernet_subsys(&ip6table_security_net_ops); + kfree(sectbl_ops); } module_init(ip6table_security_init); diff --git a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c index 6ce309928..e0be97e63 100644 --- a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c +++ b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c @@ -131,29 +131,15 @@ static void nf_nat_ipv6_csum_recalc(struct sk_buff *skb, u8 proto, void *data, __sum16 *check, int datalen, int oldlen) { - const struct ipv6hdr *ipv6h = ipv6_hdr(skb); - struct rt6_info *rt = (struct rt6_info *)skb_dst(skb); - if (skb->ip_summed != CHECKSUM_PARTIAL) { - if (!(rt->rt6i_flags & RTF_LOCAL) && - (!skb->dev || skb->dev->features & - (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))) { - skb->ip_summed = CHECKSUM_PARTIAL; - skb->csum_start = skb_headroom(skb) + - skb_network_offset(skb) + - (data - (void *)skb->data); - skb->csum_offset = (void *)check - data; - *check = ~csum_ipv6_magic(&ipv6h->saddr, &ipv6h->daddr, - datalen, proto, 0); - } else { - *check = 0; - *check = csum_ipv6_magic(&ipv6h->saddr, &ipv6h->daddr, - datalen, proto, - csum_partial(data, datalen, - 0)); - if (proto == IPPROTO_UDP && !*check) - *check = CSUM_MANGLED_0; - } + const struct ipv6hdr *ipv6h = ipv6_hdr(skb); + + skb->ip_summed = CHECKSUM_PARTIAL; + skb->csum_start = skb_headroom(skb) + skb_network_offset(skb) + + (data - (void *)skb->data); + skb->csum_offset = (void *)check - data; + *check = ~csum_ipv6_magic(&ipv6h->saddr, &ipv6h->daddr, + datalen, proto, 0); } else inet_proto_csum_replace2(check, skb, htons(oldlen), htons(datalen), true); diff --git a/net/ipv6/netfilter/nft_masq_ipv6.c b/net/ipv6/netfilter/nft_masq_ipv6.c index cd1ac1637..9597ffb74 100644 --- a/net/ipv6/netfilter/nft_masq_ipv6.c +++ b/net/ipv6/netfilter/nft_masq_ipv6.c @@ -26,7 +26,12 @@ static void nft_masq_ipv6_eval(const struct nft_expr *expr, memset(&range, 0, sizeof(range)); range.flags = priv->flags; - + if (priv->sreg_proto_min) { + range.min_proto.all = + *(__be16 *)®s->data[priv->sreg_proto_min]; + range.max_proto.all = + *(__be16 *)®s->data[priv->sreg_proto_max]; + } regs->verdict.code = nf_nat_masquerade_ipv6(pkt->skb, &range, pkt->out); } diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c index 263a5164a..c382db7a2 100644 --- a/net/ipv6/ping.c +++ b/net/ipv6/ping.c @@ -26,35 +26,6 @@ #include <net/transp_v6.h> #include <net/ping.h> -struct proto pingv6_prot = { - .name = "PINGv6", - .owner = THIS_MODULE, - .init = ping_init_sock, - .close = ping_close, - .connect = ip6_datagram_connect_v6_only, - .disconnect = udp_disconnect, - .setsockopt = ipv6_setsockopt, - .getsockopt = ipv6_getsockopt, - .sendmsg = ping_v6_sendmsg, - .recvmsg = ping_recvmsg, - .bind = ping_bind, - .backlog_rcv = ping_queue_rcv_skb, - .hash = ping_hash, - .unhash = ping_unhash, - .get_port = ping_get_port, - .obj_size = sizeof(struct raw6_sock), -}; -EXPORT_SYMBOL_GPL(pingv6_prot); - -static struct inet_protosw pingv6_protosw = { - .type = SOCK_DGRAM, - .protocol = IPPROTO_ICMPV6, - .prot = &pingv6_prot, - .ops = &inet6_dgram_ops, - .flags = INET_PROTOSW_REUSE, -}; - - /* Compatibility glue so we can support IPv6 when it's compiled as a module */ static int dummy_ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) @@ -77,7 +48,7 @@ static int dummy_ipv6_chk_addr(struct net *net, const struct in6_addr *addr, return 0; } -int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) +static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { struct inet_sock *inet = inet_sk(sk); struct ipv6_pinfo *np = inet6_sk(sk); @@ -192,6 +163,34 @@ int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) return len; } +struct proto pingv6_prot = { + .name = "PINGv6", + .owner = THIS_MODULE, + .init = ping_init_sock, + .close = ping_close, + .connect = ip6_datagram_connect_v6_only, + .disconnect = udp_disconnect, + .setsockopt = ipv6_setsockopt, + .getsockopt = ipv6_getsockopt, + .sendmsg = ping_v6_sendmsg, + .recvmsg = ping_recvmsg, + .bind = ping_bind, + .backlog_rcv = ping_queue_rcv_skb, + .hash = ping_hash, + .unhash = ping_unhash, + .get_port = ping_get_port, + .obj_size = sizeof(struct raw6_sock), +}; +EXPORT_SYMBOL_GPL(pingv6_prot); + +static struct inet_protosw pingv6_protosw = { + .type = SOCK_DGRAM, + .protocol = IPPROTO_ICMPV6, + .prot = &pingv6_prot, + .ops = &inet6_dgram_ops, + .flags = INET_PROTOSW_REUSE, +}; + #ifdef CONFIG_PROC_FS static void *ping_v6_seq_start(struct seq_file *seq, loff_t *pos) { diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index 18f3498a6..e2ea31175 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -496,10 +496,8 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev, IP6CB(head)->flags |= IP6SKB_FRAGMENTED; /* Yes, and fold redundant checksum back. 8) */ - if (head->ip_summed == CHECKSUM_COMPLETE) - head->csum = csum_partial(skb_network_header(head), - skb_network_header_len(head), - head->csum); + skb_postpush_rcsum(head, skb_network_header(head), + skb_network_header_len(head)); rcu_read_lock(); IP6_INC_STATS_BH(net, __in6_dev_get(dev), IPSTATS_MIB_REASMOKS); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index ed4466392..6f32944e0 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -338,9 +338,9 @@ static struct rt6_info *__ip6_dst_alloc(struct net *net, return rt; } -static struct rt6_info *ip6_dst_alloc(struct net *net, - struct net_device *dev, - int flags) +struct rt6_info *ip6_dst_alloc(struct net *net, + struct net_device *dev, + int flags) { struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags); @@ -364,6 +364,7 @@ static struct rt6_info *ip6_dst_alloc(struct net *net, return rt; } +EXPORT_SYMBOL(ip6_dst_alloc); static void ip6_dst_destroy(struct dst_entry *dst) { @@ -1417,8 +1418,20 @@ EXPORT_SYMBOL_GPL(ip6_update_pmtu); void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu) { + struct dst_entry *dst; + ip6_update_pmtu(skb, sock_net(sk), mtu, sk->sk_bound_dev_if, sk->sk_mark); + + dst = __sk_dst_get(sk); + if (!dst || !dst->obsolete || + dst->ops->check(dst, inet6_sk(sk)->dst_cookie)) + return; + + bh_lock_sock(sk); + if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr)) + ip6_datagram_dst_update(sk, false); + bh_unlock_sock(sk); } EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu); @@ -1737,6 +1750,8 @@ static int ip6_convert_metrics(struct mx6_config *mxc, } else { val = nla_get_u32(nla); } + if (type == RTAX_HOPLIMIT && val > 255) + val = 255; if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK)) goto err; diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 2066d1c25..83384308d 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -475,7 +475,7 @@ static void ipip6_tunnel_uninit(struct net_device *dev) ipip6_tunnel_unlink(sitn, tunnel); ipip6_tunnel_del_prl(tunnel, NULL); } - ip_tunnel_dst_reset_all(tunnel); + dst_cache_reset(&tunnel->dst_cache); dev_put(dev); } @@ -681,14 +681,16 @@ static int ipip6_rcv(struct sk_buff *skb) skb->mac_header = skb->network_header; skb_reset_network_header(skb); IPCB(skb)->flags = 0; - skb->protocol = htons(ETH_P_IPV6); + skb->dev = tunnel->dev; if (packet_is_spoofed(skb, iph, tunnel)) { tunnel->dev->stats.rx_errors++; goto out; } - __skb_tunnel_rx(skb, tunnel->dev, tunnel->net); + if (iptunnel_pull_header(skb, 0, htons(ETH_P_IPV6), + !net_eq(tunnel->net, dev_net(tunnel->dev)))) + goto out; err = IP_ECN_decapsulate(iph, skb); if (unlikely(err)) { @@ -740,7 +742,7 @@ static int ipip_rcv(struct sk_buff *skb) if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) goto drop; - if (iptunnel_pull_header(skb, 0, tpi.proto)) + if (iptunnel_pull_header(skb, 0, tpi.proto, false)) goto drop; return ip_tunnel_rcv(tunnel, skb, &tpi, NULL, log_ecn_error); } @@ -911,7 +913,7 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb, goto tx_error; } - skb = iptunnel_handle_offloads(skb, false, SKB_GSO_SIT); + skb = iptunnel_handle_offloads(skb, SKB_GSO_SIT); if (IS_ERR(skb)) { ip_rt_put(rt); goto out; @@ -1000,7 +1002,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) struct ip_tunnel *tunnel = netdev_priv(dev); const struct iphdr *tiph = &tunnel->parms.iph; - skb = iptunnel_handle_offloads(skb, false, SKB_GSO_IPIP); + skb = iptunnel_handle_offloads(skb, SKB_GSO_IPIP); if (IS_ERR(skb)) goto out; @@ -1093,7 +1095,7 @@ static void ipip6_tunnel_update(struct ip_tunnel *t, struct ip_tunnel_parm *p) t->parms.link = p->link; ipip6_tunnel_bind_dev(t->dev); } - ip_tunnel_dst_reset_all(t); + dst_cache_reset(&t->dst_cache); netdev_state_change(t->dev); } @@ -1124,7 +1126,7 @@ static int ipip6_tunnel_update_6rd(struct ip_tunnel *t, t->ip6rd.relay_prefix = relay_prefix; t->ip6rd.prefixlen = ip6rd->prefixlen; t->ip6rd.relay_prefixlen = ip6rd->relay_prefixlen; - ip_tunnel_dst_reset_all(t); + dst_cache_reset(&t->dst_cache); netdev_state_change(t->dev); return 0; } @@ -1278,7 +1280,7 @@ ipip6_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) err = ipip6_tunnel_add_prl(t, &prl, cmd == SIOCCHGPRL); break; } - ip_tunnel_dst_reset_all(t); + dst_cache_reset(&t->dst_cache); netdev_state_change(dev); break; @@ -1339,7 +1341,7 @@ static void ipip6_dev_free(struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); - free_percpu(tunnel->dst_cache); + dst_cache_destroy(&tunnel->dst_cache); free_percpu(dev->tstats); free_netdev(dev); } @@ -1372,6 +1374,7 @@ static void ipip6_tunnel_setup(struct net_device *dev) static int ipip6_tunnel_init(struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); + int err; tunnel->dev = dev; tunnel->net = dev_net(dev); @@ -1382,10 +1385,10 @@ static int ipip6_tunnel_init(struct net_device *dev) if (!dev->tstats) return -ENOMEM; - tunnel->dst_cache = alloc_percpu(struct ip_tunnel_dst); - if (!tunnel->dst_cache) { + err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL); + if (err) { free_percpu(dev->tstats); - return -ENOMEM; + return err; } return 0; diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index 2906ef207..aab91fa86 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -41,8 +41,7 @@ static __u16 const msstab[] = { 9000 - 60, }; -static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS], - ipv6_cookie_scratch); +static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS], ipv6_cookie_scratch); static u32 cookie_hash(const struct in6_addr *saddr, const struct in6_addr *daddr, __be16 sport, __be16 dport, u32 count, int c) @@ -148,7 +147,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) struct dst_entry *dst; __u8 rcv_wscale; - if (!sysctl_tcp_syncookies || !th->ack || th->rst) + if (!sock_net(sk)->ipv4.sysctl_tcp_syncookies || !th->ack || th->rst) goto out; if (tcp_synq_no_recent_overflow(sk)) diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 62e167ec1..31959f7fa 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -67,7 +67,7 @@ #include <linux/proc_fs.h> #include <linux/seq_file.h> -#include <linux/crypto.h> +#include <crypto/hash.h> #include <linux/scatterlist.h> static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb); @@ -557,7 +557,8 @@ static int tcp_v6_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp, bp->len = cpu_to_be32(nbytes); sg_init_one(&sg, bp, sizeof(*bp)); - return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp)); + ahash_request_set_crypt(hp->md5_req, &sg, NULL, sizeof(*bp)); + return crypto_ahash_update(hp->md5_req); } static int tcp_v6_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key, @@ -565,14 +566,14 @@ static int tcp_v6_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key, const struct tcphdr *th) { struct tcp_md5sig_pool *hp; - struct hash_desc *desc; + struct ahash_request *req; hp = tcp_get_md5sig_pool(); if (!hp) goto clear_hash_noput; - desc = &hp->md5_desc; + req = hp->md5_req; - if (crypto_hash_init(desc)) + if (crypto_ahash_init(req)) goto clear_hash; if (tcp_v6_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2)) goto clear_hash; @@ -580,7 +581,8 @@ static int tcp_v6_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key, goto clear_hash; if (tcp_md5_hash_key(hp, key)) goto clear_hash; - if (crypto_hash_final(desc, md5_hash)) + ahash_request_set_crypt(req, NULL, md5_hash, 0); + if (crypto_ahash_final(req)) goto clear_hash; tcp_put_md5sig_pool(); @@ -600,7 +602,7 @@ static int tcp_v6_md5_hash_skb(char *md5_hash, { const struct in6_addr *saddr, *daddr; struct tcp_md5sig_pool *hp; - struct hash_desc *desc; + struct ahash_request *req; const struct tcphdr *th = tcp_hdr(skb); if (sk) { /* valid for establish/request sockets */ @@ -615,9 +617,9 @@ static int tcp_v6_md5_hash_skb(char *md5_hash, hp = tcp_get_md5sig_pool(); if (!hp) goto clear_hash_noput; - desc = &hp->md5_desc; + req = hp->md5_req; - if (crypto_hash_init(desc)) + if (crypto_ahash_init(req)) goto clear_hash; if (tcp_v6_md5_hash_pseudoheader(hp, daddr, saddr, skb->len)) @@ -628,7 +630,8 @@ static int tcp_v6_md5_hash_skb(char *md5_hash, goto clear_hash; if (tcp_md5_hash_key(hp, key)) goto clear_hash; - if (crypto_hash_final(desc, md5_hash)) + ahash_request_set_crypt(req, NULL, md5_hash, 0); + if (crypto_ahash_final(req)) goto clear_hash; tcp_put_md5sig_pool(); @@ -823,8 +826,13 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 fl6.flowi6_proto = IPPROTO_TCP; if (rt6_need_strict(&fl6.daddr) && !oif) fl6.flowi6_oif = tcp_v6_iif(skb); - else + else { + if (!oif && netif_index_is_l3_master(net, skb->skb_iif)) + oif = skb->skb_iif; + fl6.flowi6_oif = oif; + } + fl6.flowi6_mark = IP6_REPLY_MARK(net, skb->mark); fl6.fl6_dport = t1->dest; fl6.fl6_sport = t1->source; @@ -883,7 +891,8 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb) * no RST generated if md5 hash doesn't match. */ sk1 = inet6_lookup_listener(dev_net(skb_dst(skb)->dev), - &tcp_hashinfo, &ipv6h->saddr, + &tcp_hashinfo, NULL, 0, + &ipv6h->saddr, th->source, &ipv6h->daddr, ntohs(th->source), tcp_v6_iif(skb)); if (!sk1) @@ -1400,8 +1409,8 @@ static int tcp_v6_rcv(struct sk_buff *skb) hdr = ipv6_hdr(skb); lookup: - sk = __inet6_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest, - inet6_iif(skb)); + sk = __inet6_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), + th->source, th->dest, inet6_iif(skb)); if (!sk) goto no_tcp_socket; @@ -1466,7 +1475,7 @@ process: sk_incoming_cpu_update(sk); bh_lock_sock_nested(sk); - tcp_sk(sk)->segs_in += max_t(u16, 1, skb_shinfo(skb)->gso_segs); + tcp_segs_in(tcp_sk(sk), skb); ret = 0; if (!sock_owned_by_user(sk)) { if (!tcp_prequeue(sk, skb)) @@ -1525,6 +1534,7 @@ do_time_wait: struct sock *sk2; sk2 = inet6_lookup_listener(dev_net(skb->dev), &tcp_hashinfo, + skb, __tcp_hdrlen(th), &ipv6_hdr(skb)->saddr, th->source, &ipv6_hdr(skb)->daddr, ntohs(th->dest), tcp_v6_iif(skb)); @@ -1890,7 +1900,7 @@ struct proto tcpv6_prot = { .sendpage = tcp_sendpage, .backlog_rcv = tcp_v6_do_rcv, .release_cb = tcp_release_cb, - .hash = inet_hash, + .hash = inet6_hash, .unhash = inet_unhash, .get_port = inet_csk_get_port, .enter_memory_pressure = tcp_enter_memory_pressure, diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 6794120f5..6bc5c664f 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -37,6 +37,7 @@ #include <linux/slab.h> #include <asm/uaccess.h> +#include <net/addrconf.h> #include <net/ndisc.h> #include <net/protocol.h> #include <net/transp_v6.h> @@ -77,49 +78,6 @@ static u32 udp6_ehashfn(const struct net *net, udp_ipv6_hash_secret + net_hash_mix(net)); } -/* match_wildcard == true: IPV6_ADDR_ANY equals to any IPv6 addresses if IPv6 - * only, and any IPv4 addresses if not IPv6 only - * match_wildcard == false: addresses must be exactly the same, i.e. - * IPV6_ADDR_ANY only equals to IPV6_ADDR_ANY, - * and 0.0.0.0 equals to 0.0.0.0 only - */ -int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2, - bool match_wildcard) -{ - const struct in6_addr *sk2_rcv_saddr6 = inet6_rcv_saddr(sk2); - int sk2_ipv6only = inet_v6_ipv6only(sk2); - int addr_type = ipv6_addr_type(&sk->sk_v6_rcv_saddr); - int addr_type2 = sk2_rcv_saddr6 ? ipv6_addr_type(sk2_rcv_saddr6) : IPV6_ADDR_MAPPED; - - /* if both are mapped, treat as IPv4 */ - if (addr_type == IPV6_ADDR_MAPPED && addr_type2 == IPV6_ADDR_MAPPED) { - if (!sk2_ipv6only) { - if (sk->sk_rcv_saddr == sk2->sk_rcv_saddr) - return 1; - if (!sk->sk_rcv_saddr || !sk2->sk_rcv_saddr) - return match_wildcard; - } - return 0; - } - - if (addr_type == IPV6_ADDR_ANY && addr_type2 == IPV6_ADDR_ANY) - return 1; - - if (addr_type2 == IPV6_ADDR_ANY && match_wildcard && - !(sk2_ipv6only && addr_type == IPV6_ADDR_MAPPED)) - return 1; - - if (addr_type == IPV6_ADDR_ANY && match_wildcard && - !(ipv6_only_sock(sk) && addr_type2 == IPV6_ADDR_MAPPED)) - return 1; - - if (sk2_rcv_saddr6 && - ipv6_addr_equal(&sk->sk_v6_rcv_saddr, sk2_rcv_saddr6)) - return 1; - - return 0; -} - static u32 udp6_portaddr_hash(const struct net *net, const struct in6_addr *addr6, unsigned int port) @@ -590,6 +548,7 @@ void __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt, const struct in6_addr *daddr = &hdr->daddr; struct udphdr *uh = (struct udphdr *)(skb->data+offset); struct sock *sk; + int harderr; int err; struct net *net = dev_net(skb->dev); @@ -601,26 +560,27 @@ void __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt, return; } + harderr = icmpv6_err_convert(type, code, &err); + np = inet6_sk(sk); + if (type == ICMPV6_PKT_TOOBIG) { if (!ip6_sk_accept_pmtu(sk)) goto out; ip6_sk_update_pmtu(skb, sk, info); + if (np->pmtudisc != IPV6_PMTUDISC_DONT) + harderr = 1; } if (type == NDISC_REDIRECT) { ip6_sk_redirect(skb, sk); goto out; } - np = inet6_sk(sk); - - if (!icmpv6_err_convert(type, code, &err) && !np->recverr) - goto out; - - if (sk->sk_state != TCP_ESTABLISHED && !np->recverr) - goto out; - - if (np->recverr) + if (!np->recverr) { + if (!harderr || sk->sk_state != TCP_ESTABLISHED) + goto out; + } else { ipv6_icmp_error(sk, skb, err, uh->dest, ntohl(info), (u8 *)(uh+1)); + } sk->sk_err = err; sk->sk_error_report(sk); @@ -1579,6 +1539,7 @@ struct proto udpv6_prot = { .sendmsg = udpv6_sendmsg, .recvmsg = udpv6_recvmsg, .backlog_rcv = __udpv6_queue_rcv_skb, + .release_cb = ip6_datagram_release_cb, .hash = udp_lib_hash, .unhash = udp_lib_unhash, .rehash = udp_v6_rehash, diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index 7441e1e63..2b0fbe692 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -81,12 +81,18 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb, csum = skb_checksum(skb, 0, skb->len, 0); uh->check = udp_v6_check(skb->len, &ipv6h->saddr, &ipv6h->daddr, csum); - if (uh->check == 0) uh->check = CSUM_MANGLED_0; skb->ip_summed = CHECKSUM_NONE; + /* If there is no outer header we can fake a checksum offload + * due to the fact that we have already done the checksum in + * software prior to segmenting the frame. + */ + if (!skb->encap_hdr_csum) + features |= NETIF_F_HW_CSUM; + /* Check if there is enough headroom to insert fragment header. */ tnl_hlen = skb_tnl_header_len(skb); if (skb->mac_header < (tnl_hlen + frag_hdr_sz)) { diff --git a/net/irda/ircomm/ircomm_tty.c b/net/irda/ircomm/ircomm_tty.c index a4237707f..da126ee6d 100644 --- a/net/irda/ircomm/ircomm_tty.c +++ b/net/irda/ircomm/ircomm_tty.c @@ -287,14 +287,14 @@ static int ircomm_tty_block_til_ready(struct ircomm_tty_cb *self, if (filp->f_flags & O_NONBLOCK) { /* nonblock mode is set */ - if (tty->termios.c_cflag & CBAUD) + if (C_BAUD(tty)) tty_port_raise_dtr_rts(port); port->flags |= ASYNC_NORMAL_ACTIVE; pr_debug("%s(), O_NONBLOCK requested!\n", __func__); return 0; } - if (tty->termios.c_cflag & CLOCAL) { + if (C_CLOCAL(tty)) { pr_debug("%s(), doing CLOCAL!\n", __func__); do_clocal = 1; } @@ -806,7 +806,7 @@ static void ircomm_tty_throttle(struct tty_struct *tty) ircomm_tty_send_xchar(tty, STOP_CHAR(tty)); /* Hardware flow control? */ - if (tty->termios.c_cflag & CRTSCTS) { + if (C_CRTSCTS(tty)) { self->settings.dte &= ~IRCOMM_RTS; self->settings.dte |= IRCOMM_DELTA_RTS; @@ -831,12 +831,11 @@ static void ircomm_tty_unthrottle(struct tty_struct *tty) IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return;); /* Using software flow control? */ - if (I_IXOFF(tty)) { + if (I_IXOFF(tty)) ircomm_tty_send_xchar(tty, START_CHAR(tty)); - } /* Using hardware flow control? */ - if (tty->termios.c_cflag & CRTSCTS) { + if (C_CRTSCTS(tty)) { self->settings.dte |= (IRCOMM_RTS|IRCOMM_DELTA_RTS); ircomm_param_request(self, IRCOMM_DTE, TRUE); @@ -1268,10 +1267,6 @@ static void ircomm_tty_line_info(struct ircomm_tty_cb *self, struct seq_file *m) seq_printf(m, "%cASYNC_LOW_LATENCY", sep); sep = '|'; } - if (self->port.flags & ASYNC_CLOSING) { - seq_printf(m, "%cASYNC_CLOSING", sep); - sep = '|'; - } if (self->port.flags & ASYNC_NORMAL_ACTIVE) { seq_printf(m, "%cASYNC_NORMAL_ACTIVE", sep); sep = '|'; diff --git a/net/irda/ircomm/ircomm_tty_ioctl.c b/net/irda/ircomm/ircomm_tty_ioctl.c index 75ccdbd07..d3687aaa2 100644 --- a/net/irda/ircomm/ircomm_tty_ioctl.c +++ b/net/irda/ircomm/ircomm_tty_ioctl.c @@ -158,26 +158,21 @@ void ircomm_tty_set_termios(struct tty_struct *tty, ircomm_tty_change_speed(self, tty); /* Handle transition to B0 status */ - if ((old_termios->c_cflag & CBAUD) && - !(cflag & CBAUD)) { + if ((old_termios->c_cflag & CBAUD) && !(cflag & CBAUD)) { self->settings.dte &= ~(IRCOMM_DTR|IRCOMM_RTS); ircomm_param_request(self, IRCOMM_DTE, TRUE); } /* Handle transition away from B0 status */ - if (!(old_termios->c_cflag & CBAUD) && - (cflag & CBAUD)) { + if (!(old_termios->c_cflag & CBAUD) && (cflag & CBAUD)) { self->settings.dte |= IRCOMM_DTR; - if (!(tty->termios.c_cflag & CRTSCTS) || - !test_bit(TTY_THROTTLED, &tty->flags)) { + if (!C_CRTSCTS(tty) || !test_bit(TTY_THROTTLED, &tty->flags)) self->settings.dte |= IRCOMM_RTS; - } ircomm_param_request(self, IRCOMM_DTE, TRUE); } /* Handle turning off CRTSCTS */ - if ((old_termios->c_cflag & CRTSCTS) && - !(tty->termios.c_cflag & CRTSCTS)) + if ((old_termios->c_cflag & CRTSCTS) && !C_CRTSCTS(tty)) { tty->hw_stopped = 0; ircomm_tty_start(tty); diff --git a/net/kcm/Kconfig b/net/kcm/Kconfig new file mode 100644 index 000000000..5db94d940 --- /dev/null +++ b/net/kcm/Kconfig @@ -0,0 +1,10 @@ + +config AF_KCM + tristate "KCM sockets" + depends on INET + select BPF_SYSCALL + ---help--- + KCM (Kernel Connection Multiplexor) sockets provide a method + for multiplexing messages of a message based application + protocol over kernel connectons (e.g. TCP connections). + diff --git a/net/kcm/Makefile b/net/kcm/Makefile new file mode 100644 index 000000000..71256133e --- /dev/null +++ b/net/kcm/Makefile @@ -0,0 +1,3 @@ +obj-$(CONFIG_AF_KCM) += kcm.o + +kcm-y := kcmsock.o kcmproc.o diff --git a/net/kcm/kcmproc.c b/net/kcm/kcmproc.c new file mode 100644 index 000000000..738008726 --- /dev/null +++ b/net/kcm/kcmproc.c @@ -0,0 +1,426 @@ +#include <linux/in.h> +#include <linux/inet.h> +#include <linux/list.h> +#include <linux/module.h> +#include <linux/net.h> +#include <linux/proc_fs.h> +#include <linux/rculist.h> +#include <linux/seq_file.h> +#include <linux/socket.h> +#include <net/inet_sock.h> +#include <net/kcm.h> +#include <net/net_namespace.h> +#include <net/netns/generic.h> +#include <net/tcp.h> + +#ifdef CONFIG_PROC_FS +struct kcm_seq_muxinfo { + char *name; + const struct file_operations *seq_fops; + const struct seq_operations seq_ops; +}; + +static struct kcm_mux *kcm_get_first(struct seq_file *seq) +{ + struct net *net = seq_file_net(seq); + struct kcm_net *knet = net_generic(net, kcm_net_id); + + return list_first_or_null_rcu(&knet->mux_list, + struct kcm_mux, kcm_mux_list); +} + +static struct kcm_mux *kcm_get_next(struct kcm_mux *mux) +{ + struct kcm_net *knet = mux->knet; + + return list_next_or_null_rcu(&knet->mux_list, &mux->kcm_mux_list, + struct kcm_mux, kcm_mux_list); +} + +static struct kcm_mux *kcm_get_idx(struct seq_file *seq, loff_t pos) +{ + struct net *net = seq_file_net(seq); + struct kcm_net *knet = net_generic(net, kcm_net_id); + struct kcm_mux *m; + + list_for_each_entry_rcu(m, &knet->mux_list, kcm_mux_list) { + if (!pos) + return m; + --pos; + } + return NULL; +} + +static void *kcm_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + void *p; + + if (v == SEQ_START_TOKEN) + p = kcm_get_first(seq); + else + p = kcm_get_next(v); + ++*pos; + return p; +} + +static void *kcm_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(rcu) +{ + rcu_read_lock(); + + if (!*pos) + return SEQ_START_TOKEN; + else + return kcm_get_idx(seq, *pos - 1); +} + +static void kcm_seq_stop(struct seq_file *seq, void *v) + __releases(rcu) +{ + rcu_read_unlock(); +} + +struct kcm_proc_mux_state { + struct seq_net_private p; + int idx; +}; + +static int kcm_seq_open(struct inode *inode, struct file *file) +{ + struct kcm_seq_muxinfo *muxinfo = PDE_DATA(inode); + int err; + + err = seq_open_net(inode, file, &muxinfo->seq_ops, + sizeof(struct kcm_proc_mux_state)); + if (err < 0) + return err; + return err; +} + +static void kcm_format_mux_header(struct seq_file *seq) +{ + struct net *net = seq_file_net(seq); + struct kcm_net *knet = net_generic(net, kcm_net_id); + + seq_printf(seq, + "*** KCM statistics (%d MUX) ****\n", + knet->count); + + seq_printf(seq, + "%-14s %-10s %-16s %-10s %-16s %-8s %-8s %-8s %-8s %s", + "Object", + "RX-Msgs", + "RX-Bytes", + "TX-Msgs", + "TX-Bytes", + "Recv-Q", + "Rmem", + "Send-Q", + "Smem", + "Status"); + + /* XXX: pdsts header stuff here */ + seq_puts(seq, "\n"); +} + +static void kcm_format_sock(struct kcm_sock *kcm, struct seq_file *seq, + int i, int *len) +{ + seq_printf(seq, + " kcm-%-7u %-10llu %-16llu %-10llu %-16llu %-8d %-8d %-8d %-8s ", + kcm->index, + kcm->stats.rx_msgs, + kcm->stats.rx_bytes, + kcm->stats.tx_msgs, + kcm->stats.tx_bytes, + kcm->sk.sk_receive_queue.qlen, + sk_rmem_alloc_get(&kcm->sk), + kcm->sk.sk_write_queue.qlen, + "-"); + + if (kcm->tx_psock) + seq_printf(seq, "Psck-%u ", kcm->tx_psock->index); + + if (kcm->tx_wait) + seq_puts(seq, "TxWait "); + + if (kcm->tx_wait_more) + seq_puts(seq, "WMore "); + + if (kcm->rx_wait) + seq_puts(seq, "RxWait "); + + seq_puts(seq, "\n"); +} + +static void kcm_format_psock(struct kcm_psock *psock, struct seq_file *seq, + int i, int *len) +{ + seq_printf(seq, + " psock-%-5u %-10llu %-16llu %-10llu %-16llu %-8d %-8d %-8d %-8d ", + psock->index, + psock->stats.rx_msgs, + psock->stats.rx_bytes, + psock->stats.tx_msgs, + psock->stats.tx_bytes, + psock->sk->sk_receive_queue.qlen, + atomic_read(&psock->sk->sk_rmem_alloc), + psock->sk->sk_write_queue.qlen, + atomic_read(&psock->sk->sk_wmem_alloc)); + + if (psock->done) + seq_puts(seq, "Done "); + + if (psock->tx_stopped) + seq_puts(seq, "TxStop "); + + if (psock->rx_stopped) + seq_puts(seq, "RxStop "); + + if (psock->tx_kcm) + seq_printf(seq, "Rsvd-%d ", psock->tx_kcm->index); + + if (psock->ready_rx_msg) + seq_puts(seq, "RdyRx "); + + seq_puts(seq, "\n"); +} + +static void +kcm_format_mux(struct kcm_mux *mux, loff_t idx, struct seq_file *seq) +{ + int i, len; + struct kcm_sock *kcm; + struct kcm_psock *psock; + + /* mux information */ + seq_printf(seq, + "%-6s%-8s %-10llu %-16llu %-10llu %-16llu %-8s %-8s %-8s %-8s ", + "mux", "", + mux->stats.rx_msgs, + mux->stats.rx_bytes, + mux->stats.tx_msgs, + mux->stats.tx_bytes, + "-", "-", "-", "-"); + + seq_printf(seq, "KCMs: %d, Psocks %d\n", + mux->kcm_socks_cnt, mux->psocks_cnt); + + /* kcm sock information */ + i = 0; + spin_lock_bh(&mux->lock); + list_for_each_entry(kcm, &mux->kcm_socks, kcm_sock_list) { + kcm_format_sock(kcm, seq, i, &len); + i++; + } + i = 0; + list_for_each_entry(psock, &mux->psocks, psock_list) { + kcm_format_psock(psock, seq, i, &len); + i++; + } + spin_unlock_bh(&mux->lock); +} + +static int kcm_seq_show(struct seq_file *seq, void *v) +{ + struct kcm_proc_mux_state *mux_state; + + mux_state = seq->private; + if (v == SEQ_START_TOKEN) { + mux_state->idx = 0; + kcm_format_mux_header(seq); + } else { + kcm_format_mux(v, mux_state->idx, seq); + mux_state->idx++; + } + return 0; +} + +static const struct file_operations kcm_seq_fops = { + .owner = THIS_MODULE, + .open = kcm_seq_open, + .read = seq_read, + .llseek = seq_lseek, +}; + +static struct kcm_seq_muxinfo kcm_seq_muxinfo = { + .name = "kcm", + .seq_fops = &kcm_seq_fops, + .seq_ops = { + .show = kcm_seq_show, + .start = kcm_seq_start, + .next = kcm_seq_next, + .stop = kcm_seq_stop, + } +}; + +static int kcm_proc_register(struct net *net, struct kcm_seq_muxinfo *muxinfo) +{ + struct proc_dir_entry *p; + int rc = 0; + + p = proc_create_data(muxinfo->name, S_IRUGO, net->proc_net, + muxinfo->seq_fops, muxinfo); + if (!p) + rc = -ENOMEM; + return rc; +} +EXPORT_SYMBOL(kcm_proc_register); + +static void kcm_proc_unregister(struct net *net, + struct kcm_seq_muxinfo *muxinfo) +{ + remove_proc_entry(muxinfo->name, net->proc_net); +} +EXPORT_SYMBOL(kcm_proc_unregister); + +static int kcm_stats_seq_show(struct seq_file *seq, void *v) +{ + struct kcm_psock_stats psock_stats; + struct kcm_mux_stats mux_stats; + struct kcm_mux *mux; + struct kcm_psock *psock; + struct net *net = seq->private; + struct kcm_net *knet = net_generic(net, kcm_net_id); + + memset(&mux_stats, 0, sizeof(mux_stats)); + memset(&psock_stats, 0, sizeof(psock_stats)); + + mutex_lock(&knet->mutex); + + aggregate_mux_stats(&knet->aggregate_mux_stats, &mux_stats); + aggregate_psock_stats(&knet->aggregate_psock_stats, + &psock_stats); + + list_for_each_entry_rcu(mux, &knet->mux_list, kcm_mux_list) { + spin_lock_bh(&mux->lock); + aggregate_mux_stats(&mux->stats, &mux_stats); + aggregate_psock_stats(&mux->aggregate_psock_stats, + &psock_stats); + list_for_each_entry(psock, &mux->psocks, psock_list) + aggregate_psock_stats(&psock->stats, &psock_stats); + spin_unlock_bh(&mux->lock); + } + + mutex_unlock(&knet->mutex); + + seq_printf(seq, + "%-8s %-10s %-16s %-10s %-16s %-10s %-10s %-10s %-10s %-10s\n", + "MUX", + "RX-Msgs", + "RX-Bytes", + "TX-Msgs", + "TX-Bytes", + "TX-Retries", + "Attach", + "Unattach", + "UnattchRsvd", + "RX-RdyDrops"); + + seq_printf(seq, + "%-8s %-10llu %-16llu %-10llu %-16llu %-10u %-10u %-10u %-10u %-10u\n", + "", + mux_stats.rx_msgs, + mux_stats.rx_bytes, + mux_stats.tx_msgs, + mux_stats.tx_bytes, + mux_stats.tx_retries, + mux_stats.psock_attach, + mux_stats.psock_unattach_rsvd, + mux_stats.psock_unattach, + mux_stats.rx_ready_drops); + + seq_printf(seq, + "%-8s %-10s %-16s %-10s %-16s %-10s %-10s %-10s %-10s %-10s %-10s %-10s %-10s %-10s\n", + "Psock", + "RX-Msgs", + "RX-Bytes", + "TX-Msgs", + "TX-Bytes", + "Reserved", + "Unreserved", + "RX-Aborts", + "RX-MemFail", + "RX-NeedMor", + "RX-BadLen", + "RX-TooBig", + "RX-Timeout", + "TX-Aborts"); + + seq_printf(seq, + "%-8s %-10llu %-16llu %-10llu %-16llu %-10llu %-10llu %-10u %-10u %-10u %-10u %-10u %-10u %-10u\n", + "", + psock_stats.rx_msgs, + psock_stats.rx_bytes, + psock_stats.tx_msgs, + psock_stats.tx_bytes, + psock_stats.reserved, + psock_stats.unreserved, + psock_stats.rx_aborts, + psock_stats.rx_mem_fail, + psock_stats.rx_need_more_hdr, + psock_stats.rx_bad_hdr_len, + psock_stats.rx_msg_too_big, + psock_stats.rx_msg_timeouts, + psock_stats.tx_aborts); + + return 0; +} + +static int kcm_stats_seq_open(struct inode *inode, struct file *file) +{ + return single_open_net(inode, file, kcm_stats_seq_show); +} + +static const struct file_operations kcm_stats_seq_fops = { + .owner = THIS_MODULE, + .open = kcm_stats_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release_net, +}; + +static int kcm_proc_init_net(struct net *net) +{ + int err; + + if (!proc_create("kcm_stats", S_IRUGO, net->proc_net, + &kcm_stats_seq_fops)) { + err = -ENOMEM; + goto out_kcm_stats; + } + + err = kcm_proc_register(net, &kcm_seq_muxinfo); + if (err) + goto out_kcm; + + return 0; + +out_kcm: + remove_proc_entry("kcm_stats", net->proc_net); +out_kcm_stats: + return err; +} + +static void kcm_proc_exit_net(struct net *net) +{ + kcm_proc_unregister(net, &kcm_seq_muxinfo); + remove_proc_entry("kcm_stats", net->proc_net); +} + +static struct pernet_operations kcm_net_ops = { + .init = kcm_proc_init_net, + .exit = kcm_proc_exit_net, +}; + +int __init kcm_proc_init(void) +{ + return register_pernet_subsys(&kcm_net_ops); +} + +void __exit kcm_proc_exit(void) +{ + unregister_pernet_subsys(&kcm_net_ops); +} + +#endif /* CONFIG_PROC_FS */ diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c new file mode 100644 index 000000000..40662d732 --- /dev/null +++ b/net/kcm/kcmsock.c @@ -0,0 +1,2409 @@ +#include <linux/bpf.h> +#include <linux/errno.h> +#include <linux/errqueue.h> +#include <linux/file.h> +#include <linux/in.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/net.h> +#include <linux/netdevice.h> +#include <linux/poll.h> +#include <linux/rculist.h> +#include <linux/skbuff.h> +#include <linux/socket.h> +#include <linux/uaccess.h> +#include <linux/workqueue.h> +#include <net/kcm.h> +#include <net/netns/generic.h> +#include <net/sock.h> +#include <net/tcp.h> +#include <uapi/linux/kcm.h> + +unsigned int kcm_net_id; + +static struct kmem_cache *kcm_psockp __read_mostly; +static struct kmem_cache *kcm_muxp __read_mostly; +static struct workqueue_struct *kcm_wq; + +static inline struct kcm_sock *kcm_sk(const struct sock *sk) +{ + return (struct kcm_sock *)sk; +} + +static inline struct kcm_tx_msg *kcm_tx_msg(struct sk_buff *skb) +{ + return (struct kcm_tx_msg *)skb->cb; +} + +static inline struct kcm_rx_msg *kcm_rx_msg(struct sk_buff *skb) +{ + return (struct kcm_rx_msg *)((void *)skb->cb + + offsetof(struct qdisc_skb_cb, data)); +} + +static void report_csk_error(struct sock *csk, int err) +{ + csk->sk_err = EPIPE; + csk->sk_error_report(csk); +} + +/* Callback lock held */ +static void kcm_abort_rx_psock(struct kcm_psock *psock, int err, + struct sk_buff *skb) +{ + struct sock *csk = psock->sk; + + /* Unrecoverable error in receive */ + + del_timer(&psock->rx_msg_timer); + + if (psock->rx_stopped) + return; + + psock->rx_stopped = 1; + KCM_STATS_INCR(psock->stats.rx_aborts); + + /* Report an error on the lower socket */ + report_csk_error(csk, err); +} + +static void kcm_abort_tx_psock(struct kcm_psock *psock, int err, + bool wakeup_kcm) +{ + struct sock *csk = psock->sk; + struct kcm_mux *mux = psock->mux; + + /* Unrecoverable error in transmit */ + + spin_lock_bh(&mux->lock); + + if (psock->tx_stopped) { + spin_unlock_bh(&mux->lock); + return; + } + + psock->tx_stopped = 1; + KCM_STATS_INCR(psock->stats.tx_aborts); + + if (!psock->tx_kcm) { + /* Take off psocks_avail list */ + list_del(&psock->psock_avail_list); + } else if (wakeup_kcm) { + /* In this case psock is being aborted while outside of + * write_msgs and psock is reserved. Schedule tx_work + * to handle the failure there. Need to commit tx_stopped + * before queuing work. + */ + smp_mb(); + + queue_work(kcm_wq, &psock->tx_kcm->tx_work); + } + + spin_unlock_bh(&mux->lock); + + /* Report error on lower socket */ + report_csk_error(csk, err); +} + +/* RX mux lock held. */ +static void kcm_update_rx_mux_stats(struct kcm_mux *mux, + struct kcm_psock *psock) +{ + KCM_STATS_ADD(mux->stats.rx_bytes, + psock->stats.rx_bytes - psock->saved_rx_bytes); + mux->stats.rx_msgs += + psock->stats.rx_msgs - psock->saved_rx_msgs; + psock->saved_rx_msgs = psock->stats.rx_msgs; + psock->saved_rx_bytes = psock->stats.rx_bytes; +} + +static void kcm_update_tx_mux_stats(struct kcm_mux *mux, + struct kcm_psock *psock) +{ + KCM_STATS_ADD(mux->stats.tx_bytes, + psock->stats.tx_bytes - psock->saved_tx_bytes); + mux->stats.tx_msgs += + psock->stats.tx_msgs - psock->saved_tx_msgs; + psock->saved_tx_msgs = psock->stats.tx_msgs; + psock->saved_tx_bytes = psock->stats.tx_bytes; +} + +static int kcm_queue_rcv_skb(struct sock *sk, struct sk_buff *skb); + +/* KCM is ready to receive messages on its queue-- either the KCM is new or + * has become unblocked after being blocked on full socket buffer. Queue any + * pending ready messages on a psock. RX mux lock held. + */ +static void kcm_rcv_ready(struct kcm_sock *kcm) +{ + struct kcm_mux *mux = kcm->mux; + struct kcm_psock *psock; + struct sk_buff *skb; + + if (unlikely(kcm->rx_wait || kcm->rx_psock || kcm->rx_disabled)) + return; + + while (unlikely((skb = __skb_dequeue(&mux->rx_hold_queue)))) { + if (kcm_queue_rcv_skb(&kcm->sk, skb)) { + /* Assuming buffer limit has been reached */ + skb_queue_head(&mux->rx_hold_queue, skb); + WARN_ON(!sk_rmem_alloc_get(&kcm->sk)); + return; + } + } + + while (!list_empty(&mux->psocks_ready)) { + psock = list_first_entry(&mux->psocks_ready, struct kcm_psock, + psock_ready_list); + + if (kcm_queue_rcv_skb(&kcm->sk, psock->ready_rx_msg)) { + /* Assuming buffer limit has been reached */ + WARN_ON(!sk_rmem_alloc_get(&kcm->sk)); + return; + } + + /* Consumed the ready message on the psock. Schedule rx_work to + * get more messages. + */ + list_del(&psock->psock_ready_list); + psock->ready_rx_msg = NULL; + + /* Commit clearing of ready_rx_msg for queuing work */ + smp_mb(); + + queue_work(kcm_wq, &psock->rx_work); + } + + /* Buffer limit is okay now, add to ready list */ + list_add_tail(&kcm->wait_rx_list, + &kcm->mux->kcm_rx_waiters); + kcm->rx_wait = true; +} + +static void kcm_rfree(struct sk_buff *skb) +{ + struct sock *sk = skb->sk; + struct kcm_sock *kcm = kcm_sk(sk); + struct kcm_mux *mux = kcm->mux; + unsigned int len = skb->truesize; + + sk_mem_uncharge(sk, len); + atomic_sub(len, &sk->sk_rmem_alloc); + + /* For reading rx_wait and rx_psock without holding lock */ + smp_mb__after_atomic(); + + if (!kcm->rx_wait && !kcm->rx_psock && + sk_rmem_alloc_get(sk) < sk->sk_rcvlowat) { + spin_lock_bh(&mux->rx_lock); + kcm_rcv_ready(kcm); + spin_unlock_bh(&mux->rx_lock); + } +} + +static int kcm_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) +{ + struct sk_buff_head *list = &sk->sk_receive_queue; + + if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) + return -ENOMEM; + + if (!sk_rmem_schedule(sk, skb, skb->truesize)) + return -ENOBUFS; + + skb->dev = NULL; + + skb_orphan(skb); + skb->sk = sk; + skb->destructor = kcm_rfree; + atomic_add(skb->truesize, &sk->sk_rmem_alloc); + sk_mem_charge(sk, skb->truesize); + + skb_queue_tail(list, skb); + + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_data_ready(sk); + + return 0; +} + +/* Requeue received messages for a kcm socket to other kcm sockets. This is + * called with a kcm socket is receive disabled. + * RX mux lock held. + */ +static void requeue_rx_msgs(struct kcm_mux *mux, struct sk_buff_head *head) +{ + struct sk_buff *skb; + struct kcm_sock *kcm; + + while ((skb = __skb_dequeue(head))) { + /* Reset destructor to avoid calling kcm_rcv_ready */ + skb->destructor = sock_rfree; + skb_orphan(skb); +try_again: + if (list_empty(&mux->kcm_rx_waiters)) { + skb_queue_tail(&mux->rx_hold_queue, skb); + continue; + } + + kcm = list_first_entry(&mux->kcm_rx_waiters, + struct kcm_sock, wait_rx_list); + + if (kcm_queue_rcv_skb(&kcm->sk, skb)) { + /* Should mean socket buffer full */ + list_del(&kcm->wait_rx_list); + kcm->rx_wait = false; + + /* Commit rx_wait to read in kcm_free */ + smp_wmb(); + + goto try_again; + } + } +} + +/* Lower sock lock held */ +static struct kcm_sock *reserve_rx_kcm(struct kcm_psock *psock, + struct sk_buff *head) +{ + struct kcm_mux *mux = psock->mux; + struct kcm_sock *kcm; + + WARN_ON(psock->ready_rx_msg); + + if (psock->rx_kcm) + return psock->rx_kcm; + + spin_lock_bh(&mux->rx_lock); + + if (psock->rx_kcm) { + spin_unlock_bh(&mux->rx_lock); + return psock->rx_kcm; + } + + kcm_update_rx_mux_stats(mux, psock); + + if (list_empty(&mux->kcm_rx_waiters)) { + psock->ready_rx_msg = head; + list_add_tail(&psock->psock_ready_list, + &mux->psocks_ready); + spin_unlock_bh(&mux->rx_lock); + return NULL; + } + + kcm = list_first_entry(&mux->kcm_rx_waiters, + struct kcm_sock, wait_rx_list); + list_del(&kcm->wait_rx_list); + kcm->rx_wait = false; + + psock->rx_kcm = kcm; + kcm->rx_psock = psock; + + spin_unlock_bh(&mux->rx_lock); + + return kcm; +} + +static void kcm_done(struct kcm_sock *kcm); + +static void kcm_done_work(struct work_struct *w) +{ + kcm_done(container_of(w, struct kcm_sock, done_work)); +} + +/* Lower sock held */ +static void unreserve_rx_kcm(struct kcm_psock *psock, + bool rcv_ready) +{ + struct kcm_sock *kcm = psock->rx_kcm; + struct kcm_mux *mux = psock->mux; + + if (!kcm) + return; + + spin_lock_bh(&mux->rx_lock); + + psock->rx_kcm = NULL; + kcm->rx_psock = NULL; + + /* Commit kcm->rx_psock before sk_rmem_alloc_get to sync with + * kcm_rfree + */ + smp_mb(); + + if (unlikely(kcm->done)) { + spin_unlock_bh(&mux->rx_lock); + + /* Need to run kcm_done in a task since we need to qcquire + * callback locks which may already be held here. + */ + INIT_WORK(&kcm->done_work, kcm_done_work); + schedule_work(&kcm->done_work); + return; + } + + if (unlikely(kcm->rx_disabled)) { + requeue_rx_msgs(mux, &kcm->sk.sk_receive_queue); + } else if (rcv_ready || unlikely(!sk_rmem_alloc_get(&kcm->sk))) { + /* Check for degenerative race with rx_wait that all + * data was dequeued (accounted for in kcm_rfree). + */ + kcm_rcv_ready(kcm); + } + spin_unlock_bh(&mux->rx_lock); +} + +static void kcm_start_rx_timer(struct kcm_psock *psock) +{ + if (psock->sk->sk_rcvtimeo) + mod_timer(&psock->rx_msg_timer, psock->sk->sk_rcvtimeo); +} + +/* Macro to invoke filter function. */ +#define KCM_RUN_FILTER(prog, ctx) \ + (*prog->bpf_func)(ctx, prog->insnsi) + +/* Lower socket lock held */ +static int kcm_tcp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb, + unsigned int orig_offset, size_t orig_len) +{ + struct kcm_psock *psock = (struct kcm_psock *)desc->arg.data; + struct kcm_rx_msg *rxm; + struct kcm_sock *kcm; + struct sk_buff *head, *skb; + size_t eaten = 0, cand_len; + ssize_t extra; + int err; + bool cloned_orig = false; + + if (psock->ready_rx_msg) + return 0; + + head = psock->rx_skb_head; + if (head) { + /* Message already in progress */ + + rxm = kcm_rx_msg(head); + if (unlikely(rxm->early_eaten)) { + /* Already some number of bytes on the receive sock + * data saved in rx_skb_head, just indicate they + * are consumed. + */ + eaten = orig_len <= rxm->early_eaten ? + orig_len : rxm->early_eaten; + rxm->early_eaten -= eaten; + + return eaten; + } + + if (unlikely(orig_offset)) { + /* Getting data with a non-zero offset when a message is + * in progress is not expected. If it does happen, we + * need to clone and pull since we can't deal with + * offsets in the skbs for a message expect in the head. + */ + orig_skb = skb_clone(orig_skb, GFP_ATOMIC); + if (!orig_skb) { + KCM_STATS_INCR(psock->stats.rx_mem_fail); + desc->error = -ENOMEM; + return 0; + } + if (!pskb_pull(orig_skb, orig_offset)) { + KCM_STATS_INCR(psock->stats.rx_mem_fail); + kfree_skb(orig_skb); + desc->error = -ENOMEM; + return 0; + } + cloned_orig = true; + orig_offset = 0; + } + + if (!psock->rx_skb_nextp) { + /* We are going to append to the frags_list of head. + * Need to unshare the frag_list. + */ + err = skb_unclone(head, GFP_ATOMIC); + if (err) { + KCM_STATS_INCR(psock->stats.rx_mem_fail); + desc->error = err; + return 0; + } + + if (unlikely(skb_shinfo(head)->frag_list)) { + /* We can't append to an sk_buff that already + * has a frag_list. We create a new head, point + * the frag_list of that to the old head, and + * then are able to use the old head->next for + * appending to the message. + */ + if (WARN_ON(head->next)) { + desc->error = -EINVAL; + return 0; + } + + skb = alloc_skb(0, GFP_ATOMIC); + if (!skb) { + KCM_STATS_INCR(psock->stats.rx_mem_fail); + desc->error = -ENOMEM; + return 0; + } + skb->len = head->len; + skb->data_len = head->len; + skb->truesize = head->truesize; + *kcm_rx_msg(skb) = *kcm_rx_msg(head); + psock->rx_skb_nextp = &head->next; + skb_shinfo(skb)->frag_list = head; + psock->rx_skb_head = skb; + head = skb; + } else { + psock->rx_skb_nextp = + &skb_shinfo(head)->frag_list; + } + } + } + + while (eaten < orig_len) { + /* Always clone since we will consume something */ + skb = skb_clone(orig_skb, GFP_ATOMIC); + if (!skb) { + KCM_STATS_INCR(psock->stats.rx_mem_fail); + desc->error = -ENOMEM; + break; + } + + cand_len = orig_len - eaten; + + head = psock->rx_skb_head; + if (!head) { + head = skb; + psock->rx_skb_head = head; + /* Will set rx_skb_nextp on next packet if needed */ + psock->rx_skb_nextp = NULL; + rxm = kcm_rx_msg(head); + memset(rxm, 0, sizeof(*rxm)); + rxm->offset = orig_offset + eaten; + } else { + /* Unclone since we may be appending to an skb that we + * already share a frag_list with. + */ + err = skb_unclone(skb, GFP_ATOMIC); + if (err) { + KCM_STATS_INCR(psock->stats.rx_mem_fail); + desc->error = err; + break; + } + + rxm = kcm_rx_msg(head); + *psock->rx_skb_nextp = skb; + psock->rx_skb_nextp = &skb->next; + head->data_len += skb->len; + head->len += skb->len; + head->truesize += skb->truesize; + } + + if (!rxm->full_len) { + ssize_t len; + + len = KCM_RUN_FILTER(psock->bpf_prog, head); + + if (!len) { + /* Need more header to determine length */ + if (!rxm->accum_len) { + /* Start RX timer for new message */ + kcm_start_rx_timer(psock); + } + rxm->accum_len += cand_len; + eaten += cand_len; + KCM_STATS_INCR(psock->stats.rx_need_more_hdr); + WARN_ON(eaten != orig_len); + break; + } else if (len > psock->sk->sk_rcvbuf) { + /* Message length exceeds maximum allowed */ + KCM_STATS_INCR(psock->stats.rx_msg_too_big); + desc->error = -EMSGSIZE; + psock->rx_skb_head = NULL; + kcm_abort_rx_psock(psock, EMSGSIZE, head); + break; + } else if (len <= (ssize_t)head->len - + skb->len - rxm->offset) { + /* Length must be into new skb (and also + * greater than zero) + */ + KCM_STATS_INCR(psock->stats.rx_bad_hdr_len); + desc->error = -EPROTO; + psock->rx_skb_head = NULL; + kcm_abort_rx_psock(psock, EPROTO, head); + break; + } + + rxm->full_len = len; + } + + extra = (ssize_t)(rxm->accum_len + cand_len) - rxm->full_len; + + if (extra < 0) { + /* Message not complete yet. */ + if (rxm->full_len - rxm->accum_len > + tcp_inq(psock->sk)) { + /* Don't have the whole messages in the socket + * buffer. Set psock->rx_need_bytes to wait for + * the rest of the message. Also, set "early + * eaten" since we've already buffered the skb + * but don't consume yet per tcp_read_sock. + */ + + if (!rxm->accum_len) { + /* Start RX timer for new message */ + kcm_start_rx_timer(psock); + } + + psock->rx_need_bytes = rxm->full_len - + rxm->accum_len; + rxm->accum_len += cand_len; + rxm->early_eaten = cand_len; + KCM_STATS_ADD(psock->stats.rx_bytes, cand_len); + desc->count = 0; /* Stop reading socket */ + break; + } + rxm->accum_len += cand_len; + eaten += cand_len; + WARN_ON(eaten != orig_len); + break; + } + + /* Positive extra indicates ore bytes than needed for the + * message + */ + + WARN_ON(extra > cand_len); + + eaten += (cand_len - extra); + + /* Hurray, we have a new message! */ + del_timer(&psock->rx_msg_timer); + psock->rx_skb_head = NULL; + KCM_STATS_INCR(psock->stats.rx_msgs); + +try_queue: + kcm = reserve_rx_kcm(psock, head); + if (!kcm) { + /* Unable to reserve a KCM, message is held in psock. */ + break; + } + + if (kcm_queue_rcv_skb(&kcm->sk, head)) { + /* Should mean socket buffer full */ + unreserve_rx_kcm(psock, false); + goto try_queue; + } + } + + if (cloned_orig) + kfree_skb(orig_skb); + + KCM_STATS_ADD(psock->stats.rx_bytes, eaten); + + return eaten; +} + +/* Called with lock held on lower socket */ +static int psock_tcp_read_sock(struct kcm_psock *psock) +{ + read_descriptor_t desc; + + desc.arg.data = psock; + desc.error = 0; + desc.count = 1; /* give more than one skb per call */ + + /* sk should be locked here, so okay to do tcp_read_sock */ + tcp_read_sock(psock->sk, &desc, kcm_tcp_recv); + + unreserve_rx_kcm(psock, true); + + return desc.error; +} + +/* Lower sock lock held */ +static void psock_tcp_data_ready(struct sock *sk) +{ + struct kcm_psock *psock; + + read_lock_bh(&sk->sk_callback_lock); + + psock = (struct kcm_psock *)sk->sk_user_data; + if (unlikely(!psock || psock->rx_stopped)) + goto out; + + if (psock->ready_rx_msg) + goto out; + + if (psock->rx_need_bytes) { + if (tcp_inq(sk) >= psock->rx_need_bytes) + psock->rx_need_bytes = 0; + else + goto out; + } + + if (psock_tcp_read_sock(psock) == -ENOMEM) + queue_delayed_work(kcm_wq, &psock->rx_delayed_work, 0); + +out: + read_unlock_bh(&sk->sk_callback_lock); +} + +static void do_psock_rx_work(struct kcm_psock *psock) +{ + read_descriptor_t rd_desc; + struct sock *csk = psock->sk; + + /* We need the read lock to synchronize with psock_tcp_data_ready. We + * need the socket lock for calling tcp_read_sock. + */ + lock_sock(csk); + read_lock_bh(&csk->sk_callback_lock); + + if (unlikely(csk->sk_user_data != psock)) + goto out; + + if (unlikely(psock->rx_stopped)) + goto out; + + if (psock->ready_rx_msg) + goto out; + + rd_desc.arg.data = psock; + + if (psock_tcp_read_sock(psock) == -ENOMEM) + queue_delayed_work(kcm_wq, &psock->rx_delayed_work, 0); + +out: + read_unlock_bh(&csk->sk_callback_lock); + release_sock(csk); +} + +static void psock_rx_work(struct work_struct *w) +{ + do_psock_rx_work(container_of(w, struct kcm_psock, rx_work)); +} + +static void psock_rx_delayed_work(struct work_struct *w) +{ + do_psock_rx_work(container_of(w, struct kcm_psock, + rx_delayed_work.work)); +} + +static void psock_tcp_state_change(struct sock *sk) +{ + /* TCP only does a POLLIN for a half close. Do a POLLHUP here + * since application will normally not poll with POLLIN + * on the TCP sockets. + */ + + report_csk_error(sk, EPIPE); +} + +static void psock_tcp_write_space(struct sock *sk) +{ + struct kcm_psock *psock; + struct kcm_mux *mux; + struct kcm_sock *kcm; + + read_lock_bh(&sk->sk_callback_lock); + + psock = (struct kcm_psock *)sk->sk_user_data; + if (unlikely(!psock)) + goto out; + + mux = psock->mux; + + spin_lock_bh(&mux->lock); + + /* Check if the socket is reserved so someone is waiting for sending. */ + kcm = psock->tx_kcm; + if (kcm) + queue_work(kcm_wq, &kcm->tx_work); + + spin_unlock_bh(&mux->lock); +out: + read_unlock_bh(&sk->sk_callback_lock); +} + +static void unreserve_psock(struct kcm_sock *kcm); + +/* kcm sock is locked. */ +static struct kcm_psock *reserve_psock(struct kcm_sock *kcm) +{ + struct kcm_mux *mux = kcm->mux; + struct kcm_psock *psock; + + psock = kcm->tx_psock; + + smp_rmb(); /* Must read tx_psock before tx_wait */ + + if (psock) { + WARN_ON(kcm->tx_wait); + if (unlikely(psock->tx_stopped)) + unreserve_psock(kcm); + else + return kcm->tx_psock; + } + + spin_lock_bh(&mux->lock); + + /* Check again under lock to see if psock was reserved for this + * psock via psock_unreserve. + */ + psock = kcm->tx_psock; + if (unlikely(psock)) { + WARN_ON(kcm->tx_wait); + spin_unlock_bh(&mux->lock); + return kcm->tx_psock; + } + + if (!list_empty(&mux->psocks_avail)) { + psock = list_first_entry(&mux->psocks_avail, + struct kcm_psock, + psock_avail_list); + list_del(&psock->psock_avail_list); + if (kcm->tx_wait) { + list_del(&kcm->wait_psock_list); + kcm->tx_wait = false; + } + kcm->tx_psock = psock; + psock->tx_kcm = kcm; + KCM_STATS_INCR(psock->stats.reserved); + } else if (!kcm->tx_wait) { + list_add_tail(&kcm->wait_psock_list, + &mux->kcm_tx_waiters); + kcm->tx_wait = true; + } + + spin_unlock_bh(&mux->lock); + + return psock; +} + +/* mux lock held */ +static void psock_now_avail(struct kcm_psock *psock) +{ + struct kcm_mux *mux = psock->mux; + struct kcm_sock *kcm; + + if (list_empty(&mux->kcm_tx_waiters)) { + list_add_tail(&psock->psock_avail_list, + &mux->psocks_avail); + } else { + kcm = list_first_entry(&mux->kcm_tx_waiters, + struct kcm_sock, + wait_psock_list); + list_del(&kcm->wait_psock_list); + kcm->tx_wait = false; + psock->tx_kcm = kcm; + + /* Commit before changing tx_psock since that is read in + * reserve_psock before queuing work. + */ + smp_mb(); + + kcm->tx_psock = psock; + KCM_STATS_INCR(psock->stats.reserved); + queue_work(kcm_wq, &kcm->tx_work); + } +} + +/* kcm sock is locked. */ +static void unreserve_psock(struct kcm_sock *kcm) +{ + struct kcm_psock *psock; + struct kcm_mux *mux = kcm->mux; + + spin_lock_bh(&mux->lock); + + psock = kcm->tx_psock; + + if (WARN_ON(!psock)) { + spin_unlock_bh(&mux->lock); + return; + } + + smp_rmb(); /* Read tx_psock before tx_wait */ + + kcm_update_tx_mux_stats(mux, psock); + + WARN_ON(kcm->tx_wait); + + kcm->tx_psock = NULL; + psock->tx_kcm = NULL; + KCM_STATS_INCR(psock->stats.unreserved); + + if (unlikely(psock->tx_stopped)) { + if (psock->done) { + /* Deferred free */ + list_del(&psock->psock_list); + mux->psocks_cnt--; + sock_put(psock->sk); + fput(psock->sk->sk_socket->file); + kmem_cache_free(kcm_psockp, psock); + } + + /* Don't put back on available list */ + + spin_unlock_bh(&mux->lock); + + return; + } + + psock_now_avail(psock); + + spin_unlock_bh(&mux->lock); +} + +static void kcm_report_tx_retry(struct kcm_sock *kcm) +{ + struct kcm_mux *mux = kcm->mux; + + spin_lock_bh(&mux->lock); + KCM_STATS_INCR(mux->stats.tx_retries); + spin_unlock_bh(&mux->lock); +} + +/* Write any messages ready on the kcm socket. Called with kcm sock lock + * held. Return bytes actually sent or error. + */ +static int kcm_write_msgs(struct kcm_sock *kcm) +{ + struct sock *sk = &kcm->sk; + struct kcm_psock *psock; + struct sk_buff *skb, *head; + struct kcm_tx_msg *txm; + unsigned short fragidx, frag_offset; + unsigned int sent, total_sent = 0; + int ret = 0; + + kcm->tx_wait_more = false; + psock = kcm->tx_psock; + if (unlikely(psock && psock->tx_stopped)) { + /* A reserved psock was aborted asynchronously. Unreserve + * it and we'll retry the message. + */ + unreserve_psock(kcm); + kcm_report_tx_retry(kcm); + if (skb_queue_empty(&sk->sk_write_queue)) + return 0; + + kcm_tx_msg(skb_peek(&sk->sk_write_queue))->sent = 0; + + } else if (skb_queue_empty(&sk->sk_write_queue)) { + return 0; + } + + head = skb_peek(&sk->sk_write_queue); + txm = kcm_tx_msg(head); + + if (txm->sent) { + /* Send of first skbuff in queue already in progress */ + if (WARN_ON(!psock)) { + ret = -EINVAL; + goto out; + } + sent = txm->sent; + frag_offset = txm->frag_offset; + fragidx = txm->fragidx; + skb = txm->frag_skb; + + goto do_frag; + } + +try_again: + psock = reserve_psock(kcm); + if (!psock) + goto out; + + do { + skb = head; + txm = kcm_tx_msg(head); + sent = 0; + +do_frag_list: + if (WARN_ON(!skb_shinfo(skb)->nr_frags)) { + ret = -EINVAL; + goto out; + } + + for (fragidx = 0; fragidx < skb_shinfo(skb)->nr_frags; + fragidx++) { + skb_frag_t *frag; + + frag_offset = 0; +do_frag: + frag = &skb_shinfo(skb)->frags[fragidx]; + if (WARN_ON(!frag->size)) { + ret = -EINVAL; + goto out; + } + + ret = kernel_sendpage(psock->sk->sk_socket, + frag->page.p, + frag->page_offset + frag_offset, + frag->size - frag_offset, + MSG_DONTWAIT); + if (ret <= 0) { + if (ret == -EAGAIN) { + /* Save state to try again when there's + * write space on the socket + */ + txm->sent = sent; + txm->frag_offset = frag_offset; + txm->fragidx = fragidx; + txm->frag_skb = skb; + + ret = 0; + goto out; + } + + /* Hard failure in sending message, abort this + * psock since it has lost framing + * synchonization and retry sending the + * message from the beginning. + */ + kcm_abort_tx_psock(psock, ret ? -ret : EPIPE, + true); + unreserve_psock(kcm); + + txm->sent = 0; + kcm_report_tx_retry(kcm); + ret = 0; + + goto try_again; + } + + sent += ret; + frag_offset += ret; + KCM_STATS_ADD(psock->stats.tx_bytes, ret); + if (frag_offset < frag->size) { + /* Not finished with this frag */ + goto do_frag; + } + } + + if (skb == head) { + if (skb_has_frag_list(skb)) { + skb = skb_shinfo(skb)->frag_list; + goto do_frag_list; + } + } else if (skb->next) { + skb = skb->next; + goto do_frag_list; + } + + /* Successfully sent the whole packet, account for it. */ + skb_dequeue(&sk->sk_write_queue); + kfree_skb(head); + sk->sk_wmem_queued -= sent; + total_sent += sent; + KCM_STATS_INCR(psock->stats.tx_msgs); + } while ((head = skb_peek(&sk->sk_write_queue))); +out: + if (!head) { + /* Done with all queued messages. */ + WARN_ON(!skb_queue_empty(&sk->sk_write_queue)); + unreserve_psock(kcm); + } + + /* Check if write space is available */ + sk->sk_write_space(sk); + + return total_sent ? : ret; +} + +static void kcm_tx_work(struct work_struct *w) +{ + struct kcm_sock *kcm = container_of(w, struct kcm_sock, tx_work); + struct sock *sk = &kcm->sk; + int err; + + lock_sock(sk); + + /* Primarily for SOCK_DGRAM sockets, also handle asynchronous tx + * aborts + */ + err = kcm_write_msgs(kcm); + if (err < 0) { + /* Hard failure in write, report error on KCM socket */ + pr_warn("KCM: Hard failure on kcm_write_msgs %d\n", err); + report_csk_error(&kcm->sk, -err); + goto out; + } + + /* Primarily for SOCK_SEQPACKET sockets */ + if (likely(sk->sk_socket) && + test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) { + clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags); + sk->sk_write_space(sk); + } + +out: + release_sock(sk); +} + +static void kcm_push(struct kcm_sock *kcm) +{ + if (kcm->tx_wait_more) + kcm_write_msgs(kcm); +} + +static ssize_t kcm_sendpage(struct socket *sock, struct page *page, + int offset, size_t size, int flags) + +{ + struct sock *sk = sock->sk; + struct kcm_sock *kcm = kcm_sk(sk); + struct sk_buff *skb = NULL, *head = NULL; + long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); + bool eor; + int err = 0; + int i; + + if (flags & MSG_SENDPAGE_NOTLAST) + flags |= MSG_MORE; + + /* No MSG_EOR from splice, only look at MSG_MORE */ + eor = !(flags & MSG_MORE); + + lock_sock(sk); + + sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); + + err = -EPIPE; + if (sk->sk_err) + goto out_error; + + if (kcm->seq_skb) { + /* Previously opened message */ + head = kcm->seq_skb; + skb = kcm_tx_msg(head)->last_skb; + i = skb_shinfo(skb)->nr_frags; + + if (skb_can_coalesce(skb, i, page, offset)) { + skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], size); + skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG; + goto coalesced; + } + + if (i >= MAX_SKB_FRAGS) { + struct sk_buff *tskb; + + tskb = alloc_skb(0, sk->sk_allocation); + while (!tskb) { + kcm_push(kcm); + err = sk_stream_wait_memory(sk, &timeo); + if (err) + goto out_error; + } + + if (head == skb) + skb_shinfo(head)->frag_list = tskb; + else + skb->next = tskb; + + skb = tskb; + skb->ip_summed = CHECKSUM_UNNECESSARY; + i = 0; + } + } else { + /* Call the sk_stream functions to manage the sndbuf mem. */ + if (!sk_stream_memory_free(sk)) { + kcm_push(kcm); + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); + err = sk_stream_wait_memory(sk, &timeo); + if (err) + goto out_error; + } + + head = alloc_skb(0, sk->sk_allocation); + while (!head) { + kcm_push(kcm); + err = sk_stream_wait_memory(sk, &timeo); + if (err) + goto out_error; + } + + skb = head; + i = 0; + } + + get_page(page); + skb_fill_page_desc(skb, i, page, offset, size); + skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG; + +coalesced: + skb->len += size; + skb->data_len += size; + skb->truesize += size; + sk->sk_wmem_queued += size; + sk_mem_charge(sk, size); + + if (head != skb) { + head->len += size; + head->data_len += size; + head->truesize += size; + } + + if (eor) { + bool not_busy = skb_queue_empty(&sk->sk_write_queue); + + /* Message complete, queue it on send buffer */ + __skb_queue_tail(&sk->sk_write_queue, head); + kcm->seq_skb = NULL; + KCM_STATS_INCR(kcm->stats.tx_msgs); + + if (flags & MSG_BATCH) { + kcm->tx_wait_more = true; + } else if (kcm->tx_wait_more || not_busy) { + err = kcm_write_msgs(kcm); + if (err < 0) { + /* We got a hard error in write_msgs but have + * already queued this message. Report an error + * in the socket, but don't affect return value + * from sendmsg + */ + pr_warn("KCM: Hard failure on kcm_write_msgs\n"); + report_csk_error(&kcm->sk, -err); + } + } + } else { + /* Message not complete, save state */ + kcm->seq_skb = head; + kcm_tx_msg(head)->last_skb = skb; + } + + KCM_STATS_ADD(kcm->stats.tx_bytes, size); + + release_sock(sk); + return size; + +out_error: + kcm_push(kcm); + + err = sk_stream_error(sk, flags, err); + + /* make sure we wake any epoll edge trigger waiter */ + if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 && err == -EAGAIN)) + sk->sk_write_space(sk); + + release_sock(sk); + return err; +} + +static int kcm_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) +{ + struct sock *sk = sock->sk; + struct kcm_sock *kcm = kcm_sk(sk); + struct sk_buff *skb = NULL, *head = NULL; + size_t copy, copied = 0; + long timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); + int eor = (sock->type == SOCK_DGRAM) ? + !(msg->msg_flags & MSG_MORE) : !!(msg->msg_flags & MSG_EOR); + int err = -EPIPE; + + lock_sock(sk); + + /* Per tcp_sendmsg this should be in poll */ + sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); + + if (sk->sk_err) + goto out_error; + + if (kcm->seq_skb) { + /* Previously opened message */ + head = kcm->seq_skb; + skb = kcm_tx_msg(head)->last_skb; + goto start; + } + + /* Call the sk_stream functions to manage the sndbuf mem. */ + if (!sk_stream_memory_free(sk)) { + kcm_push(kcm); + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); + err = sk_stream_wait_memory(sk, &timeo); + if (err) + goto out_error; + } + + /* New message, alloc head skb */ + head = alloc_skb(0, sk->sk_allocation); + while (!head) { + kcm_push(kcm); + err = sk_stream_wait_memory(sk, &timeo); + if (err) + goto out_error; + + head = alloc_skb(0, sk->sk_allocation); + } + + skb = head; + + /* Set ip_summed to CHECKSUM_UNNECESSARY to avoid calling + * csum_and_copy_from_iter from skb_do_copy_data_nocache. + */ + skb->ip_summed = CHECKSUM_UNNECESSARY; + +start: + while (msg_data_left(msg)) { + bool merge = true; + int i = skb_shinfo(skb)->nr_frags; + struct page_frag *pfrag = sk_page_frag(sk); + + if (!sk_page_frag_refill(sk, pfrag)) + goto wait_for_memory; + + if (!skb_can_coalesce(skb, i, pfrag->page, + pfrag->offset)) { + if (i == MAX_SKB_FRAGS) { + struct sk_buff *tskb; + + tskb = alloc_skb(0, sk->sk_allocation); + if (!tskb) + goto wait_for_memory; + + if (head == skb) + skb_shinfo(head)->frag_list = tskb; + else + skb->next = tskb; + + skb = tskb; + skb->ip_summed = CHECKSUM_UNNECESSARY; + continue; + } + merge = false; + } + + copy = min_t(int, msg_data_left(msg), + pfrag->size - pfrag->offset); + + if (!sk_wmem_schedule(sk, copy)) + goto wait_for_memory; + + err = skb_copy_to_page_nocache(sk, &msg->msg_iter, skb, + pfrag->page, + pfrag->offset, + copy); + if (err) + goto out_error; + + /* Update the skb. */ + if (merge) { + skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy); + } else { + skb_fill_page_desc(skb, i, pfrag->page, + pfrag->offset, copy); + get_page(pfrag->page); + } + + pfrag->offset += copy; + copied += copy; + if (head != skb) { + head->len += copy; + head->data_len += copy; + } + + continue; + +wait_for_memory: + kcm_push(kcm); + err = sk_stream_wait_memory(sk, &timeo); + if (err) + goto out_error; + } + + if (eor) { + bool not_busy = skb_queue_empty(&sk->sk_write_queue); + + /* Message complete, queue it on send buffer */ + __skb_queue_tail(&sk->sk_write_queue, head); + kcm->seq_skb = NULL; + KCM_STATS_INCR(kcm->stats.tx_msgs); + + if (msg->msg_flags & MSG_BATCH) { + kcm->tx_wait_more = true; + } else if (kcm->tx_wait_more || not_busy) { + err = kcm_write_msgs(kcm); + if (err < 0) { + /* We got a hard error in write_msgs but have + * already queued this message. Report an error + * in the socket, but don't affect return value + * from sendmsg + */ + pr_warn("KCM: Hard failure on kcm_write_msgs\n"); + report_csk_error(&kcm->sk, -err); + } + } + } else { + /* Message not complete, save state */ +partial_message: + kcm->seq_skb = head; + kcm_tx_msg(head)->last_skb = skb; + } + + KCM_STATS_ADD(kcm->stats.tx_bytes, copied); + + release_sock(sk); + return copied; + +out_error: + kcm_push(kcm); + + if (copied && sock->type == SOCK_SEQPACKET) { + /* Wrote some bytes before encountering an + * error, return partial success. + */ + goto partial_message; + } + + if (head != kcm->seq_skb) + kfree_skb(head); + + err = sk_stream_error(sk, msg->msg_flags, err); + + /* make sure we wake any epoll edge trigger waiter */ + if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 && err == -EAGAIN)) + sk->sk_write_space(sk); + + release_sock(sk); + return err; +} + +static struct sk_buff *kcm_wait_data(struct sock *sk, int flags, + long timeo, int *err) +{ + struct sk_buff *skb; + + while (!(skb = skb_peek(&sk->sk_receive_queue))) { + if (sk->sk_err) { + *err = sock_error(sk); + return NULL; + } + + if (sock_flag(sk, SOCK_DONE)) + return NULL; + + if ((flags & MSG_DONTWAIT) || !timeo) { + *err = -EAGAIN; + return NULL; + } + + sk_wait_data(sk, &timeo, NULL); + + /* Handle signals */ + if (signal_pending(current)) { + *err = sock_intr_errno(timeo); + return NULL; + } + } + + return skb; +} + +static int kcm_recvmsg(struct socket *sock, struct msghdr *msg, + size_t len, int flags) +{ + struct sock *sk = sock->sk; + struct kcm_sock *kcm = kcm_sk(sk); + int err = 0; + long timeo; + struct kcm_rx_msg *rxm; + int copied = 0; + struct sk_buff *skb; + + timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); + + lock_sock(sk); + + skb = kcm_wait_data(sk, flags, timeo, &err); + if (!skb) + goto out; + + /* Okay, have a message on the receive queue */ + + rxm = kcm_rx_msg(skb); + + if (len > rxm->full_len) + len = rxm->full_len; + + err = skb_copy_datagram_msg(skb, rxm->offset, msg, len); + if (err < 0) + goto out; + + copied = len; + if (likely(!(flags & MSG_PEEK))) { + KCM_STATS_ADD(kcm->stats.rx_bytes, copied); + if (copied < rxm->full_len) { + if (sock->type == SOCK_DGRAM) { + /* Truncated message */ + msg->msg_flags |= MSG_TRUNC; + goto msg_finished; + } + rxm->offset += copied; + rxm->full_len -= copied; + } else { +msg_finished: + /* Finished with message */ + msg->msg_flags |= MSG_EOR; + KCM_STATS_INCR(kcm->stats.rx_msgs); + skb_unlink(skb, &sk->sk_receive_queue); + kfree_skb(skb); + } + } + +out: + release_sock(sk); + + return copied ? : err; +} + +static ssize_t kcm_sock_splice(struct sock *sk, + struct pipe_inode_info *pipe, + struct splice_pipe_desc *spd) +{ + int ret; + + release_sock(sk); + ret = splice_to_pipe(pipe, spd); + lock_sock(sk); + + return ret; +} + +static ssize_t kcm_splice_read(struct socket *sock, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) +{ + struct sock *sk = sock->sk; + struct kcm_sock *kcm = kcm_sk(sk); + long timeo; + struct kcm_rx_msg *rxm; + int err = 0; + size_t copied; + struct sk_buff *skb; + + /* Only support splice for SOCKSEQPACKET */ + + timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); + + lock_sock(sk); + + skb = kcm_wait_data(sk, flags, timeo, &err); + if (!skb) + goto err_out; + + /* Okay, have a message on the receive queue */ + + rxm = kcm_rx_msg(skb); + + if (len > rxm->full_len) + len = rxm->full_len; + + copied = skb_splice_bits(skb, sk, rxm->offset, pipe, len, flags, + kcm_sock_splice); + if (copied < 0) { + err = copied; + goto err_out; + } + + KCM_STATS_ADD(kcm->stats.rx_bytes, copied); + + rxm->offset += copied; + rxm->full_len -= copied; + + /* We have no way to return MSG_EOR. If all the bytes have been + * read we still leave the message in the receive socket buffer. + * A subsequent recvmsg needs to be done to return MSG_EOR and + * finish reading the message. + */ + + release_sock(sk); + + return copied; + +err_out: + release_sock(sk); + + return err; +} + +/* kcm sock lock held */ +static void kcm_recv_disable(struct kcm_sock *kcm) +{ + struct kcm_mux *mux = kcm->mux; + + if (kcm->rx_disabled) + return; + + spin_lock_bh(&mux->rx_lock); + + kcm->rx_disabled = 1; + + /* If a psock is reserved we'll do cleanup in unreserve */ + if (!kcm->rx_psock) { + if (kcm->rx_wait) { + list_del(&kcm->wait_rx_list); + kcm->rx_wait = false; + } + + requeue_rx_msgs(mux, &kcm->sk.sk_receive_queue); + } + + spin_unlock_bh(&mux->rx_lock); +} + +/* kcm sock lock held */ +static void kcm_recv_enable(struct kcm_sock *kcm) +{ + struct kcm_mux *mux = kcm->mux; + + if (!kcm->rx_disabled) + return; + + spin_lock_bh(&mux->rx_lock); + + kcm->rx_disabled = 0; + kcm_rcv_ready(kcm); + + spin_unlock_bh(&mux->rx_lock); +} + +static int kcm_setsockopt(struct socket *sock, int level, int optname, + char __user *optval, unsigned int optlen) +{ + struct kcm_sock *kcm = kcm_sk(sock->sk); + int val, valbool; + int err = 0; + + if (level != SOL_KCM) + return -ENOPROTOOPT; + + if (optlen < sizeof(int)) + return -EINVAL; + + if (get_user(val, (int __user *)optval)) + return -EINVAL; + + valbool = val ? 1 : 0; + + switch (optname) { + case KCM_RECV_DISABLE: + lock_sock(&kcm->sk); + if (valbool) + kcm_recv_disable(kcm); + else + kcm_recv_enable(kcm); + release_sock(&kcm->sk); + break; + default: + err = -ENOPROTOOPT; + } + + return err; +} + +static int kcm_getsockopt(struct socket *sock, int level, int optname, + char __user *optval, int __user *optlen) +{ + struct kcm_sock *kcm = kcm_sk(sock->sk); + int val, len; + + if (level != SOL_KCM) + return -ENOPROTOOPT; + + if (get_user(len, optlen)) + return -EFAULT; + + len = min_t(unsigned int, len, sizeof(int)); + if (len < 0) + return -EINVAL; + + switch (optname) { + case KCM_RECV_DISABLE: + val = kcm->rx_disabled; + break; + default: + return -ENOPROTOOPT; + } + + if (put_user(len, optlen)) + return -EFAULT; + if (copy_to_user(optval, &val, len)) + return -EFAULT; + return 0; +} + +static void init_kcm_sock(struct kcm_sock *kcm, struct kcm_mux *mux) +{ + struct kcm_sock *tkcm; + struct list_head *head; + int index = 0; + + /* For SOCK_SEQPACKET sock type, datagram_poll checks the sk_state, so + * we set sk_state, otherwise epoll_wait always returns right away with + * POLLHUP + */ + kcm->sk.sk_state = TCP_ESTABLISHED; + + /* Add to mux's kcm sockets list */ + kcm->mux = mux; + spin_lock_bh(&mux->lock); + + head = &mux->kcm_socks; + list_for_each_entry(tkcm, &mux->kcm_socks, kcm_sock_list) { + if (tkcm->index != index) + break; + head = &tkcm->kcm_sock_list; + index++; + } + + list_add(&kcm->kcm_sock_list, head); + kcm->index = index; + + mux->kcm_socks_cnt++; + spin_unlock_bh(&mux->lock); + + INIT_WORK(&kcm->tx_work, kcm_tx_work); + + spin_lock_bh(&mux->rx_lock); + kcm_rcv_ready(kcm); + spin_unlock_bh(&mux->rx_lock); +} + +static void kcm_rx_msg_timeout(unsigned long arg) +{ + struct kcm_psock *psock = (struct kcm_psock *)arg; + + /* Message assembly timed out */ + KCM_STATS_INCR(psock->stats.rx_msg_timeouts); + kcm_abort_rx_psock(psock, ETIMEDOUT, NULL); +} + +static int kcm_attach(struct socket *sock, struct socket *csock, + struct bpf_prog *prog) +{ + struct kcm_sock *kcm = kcm_sk(sock->sk); + struct kcm_mux *mux = kcm->mux; + struct sock *csk; + struct kcm_psock *psock = NULL, *tpsock; + struct list_head *head; + int index = 0; + + if (csock->ops->family != PF_INET && + csock->ops->family != PF_INET6) + return -EINVAL; + + csk = csock->sk; + if (!csk) + return -EINVAL; + + /* Only support TCP for now */ + if (csk->sk_protocol != IPPROTO_TCP) + return -EINVAL; + + psock = kmem_cache_zalloc(kcm_psockp, GFP_KERNEL); + if (!psock) + return -ENOMEM; + + psock->mux = mux; + psock->sk = csk; + psock->bpf_prog = prog; + + setup_timer(&psock->rx_msg_timer, kcm_rx_msg_timeout, + (unsigned long)psock); + + INIT_WORK(&psock->rx_work, psock_rx_work); + INIT_DELAYED_WORK(&psock->rx_delayed_work, psock_rx_delayed_work); + + sock_hold(csk); + + write_lock_bh(&csk->sk_callback_lock); + psock->save_data_ready = csk->sk_data_ready; + psock->save_write_space = csk->sk_write_space; + psock->save_state_change = csk->sk_state_change; + csk->sk_user_data = psock; + csk->sk_data_ready = psock_tcp_data_ready; + csk->sk_write_space = psock_tcp_write_space; + csk->sk_state_change = psock_tcp_state_change; + write_unlock_bh(&csk->sk_callback_lock); + + /* Finished initialization, now add the psock to the MUX. */ + spin_lock_bh(&mux->lock); + head = &mux->psocks; + list_for_each_entry(tpsock, &mux->psocks, psock_list) { + if (tpsock->index != index) + break; + head = &tpsock->psock_list; + index++; + } + + list_add(&psock->psock_list, head); + psock->index = index; + + KCM_STATS_INCR(mux->stats.psock_attach); + mux->psocks_cnt++; + psock_now_avail(psock); + spin_unlock_bh(&mux->lock); + + /* Schedule RX work in case there are already bytes queued */ + queue_work(kcm_wq, &psock->rx_work); + + return 0; +} + +static int kcm_attach_ioctl(struct socket *sock, struct kcm_attach *info) +{ + struct socket *csock; + struct bpf_prog *prog; + int err; + + csock = sockfd_lookup(info->fd, &err); + if (!csock) + return -ENOENT; + + prog = bpf_prog_get(info->bpf_fd); + if (IS_ERR(prog)) { + err = PTR_ERR(prog); + goto out; + } + + if (prog->type != BPF_PROG_TYPE_SOCKET_FILTER) { + bpf_prog_put(prog); + err = -EINVAL; + goto out; + } + + err = kcm_attach(sock, csock, prog); + if (err) { + bpf_prog_put(prog); + goto out; + } + + /* Keep reference on file also */ + + return 0; +out: + fput(csock->file); + return err; +} + +static void kcm_unattach(struct kcm_psock *psock) +{ + struct sock *csk = psock->sk; + struct kcm_mux *mux = psock->mux; + + /* Stop getting callbacks from TCP socket. After this there should + * be no way to reserve a kcm for this psock. + */ + write_lock_bh(&csk->sk_callback_lock); + csk->sk_user_data = NULL; + csk->sk_data_ready = psock->save_data_ready; + csk->sk_write_space = psock->save_write_space; + csk->sk_state_change = psock->save_state_change; + psock->rx_stopped = 1; + + if (WARN_ON(psock->rx_kcm)) { + write_unlock_bh(&csk->sk_callback_lock); + return; + } + + spin_lock_bh(&mux->rx_lock); + + /* Stop receiver activities. After this point psock should not be + * able to get onto ready list either through callbacks or work. + */ + if (psock->ready_rx_msg) { + list_del(&psock->psock_ready_list); + kfree_skb(psock->ready_rx_msg); + psock->ready_rx_msg = NULL; + KCM_STATS_INCR(mux->stats.rx_ready_drops); + } + + spin_unlock_bh(&mux->rx_lock); + + write_unlock_bh(&csk->sk_callback_lock); + + del_timer_sync(&psock->rx_msg_timer); + cancel_work_sync(&psock->rx_work); + cancel_delayed_work_sync(&psock->rx_delayed_work); + + bpf_prog_put(psock->bpf_prog); + + kfree_skb(psock->rx_skb_head); + psock->rx_skb_head = NULL; + + spin_lock_bh(&mux->lock); + + aggregate_psock_stats(&psock->stats, &mux->aggregate_psock_stats); + + KCM_STATS_INCR(mux->stats.psock_unattach); + + if (psock->tx_kcm) { + /* psock was reserved. Just mark it finished and we will clean + * up in the kcm paths, we need kcm lock which can not be + * acquired here. + */ + KCM_STATS_INCR(mux->stats.psock_unattach_rsvd); + spin_unlock_bh(&mux->lock); + + /* We are unattaching a socket that is reserved. Abort the + * socket since we may be out of sync in sending on it. We need + * to do this without the mux lock. + */ + kcm_abort_tx_psock(psock, EPIPE, false); + + spin_lock_bh(&mux->lock); + if (!psock->tx_kcm) { + /* psock now unreserved in window mux was unlocked */ + goto no_reserved; + } + psock->done = 1; + + /* Commit done before queuing work to process it */ + smp_mb(); + + /* Queue tx work to make sure psock->done is handled */ + queue_work(kcm_wq, &psock->tx_kcm->tx_work); + spin_unlock_bh(&mux->lock); + } else { +no_reserved: + if (!psock->tx_stopped) + list_del(&psock->psock_avail_list); + list_del(&psock->psock_list); + mux->psocks_cnt--; + spin_unlock_bh(&mux->lock); + + sock_put(csk); + fput(csk->sk_socket->file); + kmem_cache_free(kcm_psockp, psock); + } +} + +static int kcm_unattach_ioctl(struct socket *sock, struct kcm_unattach *info) +{ + struct kcm_sock *kcm = kcm_sk(sock->sk); + struct kcm_mux *mux = kcm->mux; + struct kcm_psock *psock; + struct socket *csock; + struct sock *csk; + int err; + + csock = sockfd_lookup(info->fd, &err); + if (!csock) + return -ENOENT; + + csk = csock->sk; + if (!csk) { + err = -EINVAL; + goto out; + } + + err = -ENOENT; + + spin_lock_bh(&mux->lock); + + list_for_each_entry(psock, &mux->psocks, psock_list) { + if (psock->sk != csk) + continue; + + /* Found the matching psock */ + + if (psock->unattaching || WARN_ON(psock->done)) { + err = -EALREADY; + break; + } + + psock->unattaching = 1; + + spin_unlock_bh(&mux->lock); + + kcm_unattach(psock); + + err = 0; + goto out; + } + + spin_unlock_bh(&mux->lock); + +out: + fput(csock->file); + return err; +} + +static struct proto kcm_proto = { + .name = "KCM", + .owner = THIS_MODULE, + .obj_size = sizeof(struct kcm_sock), +}; + +/* Clone a kcm socket. */ +static int kcm_clone(struct socket *osock, struct kcm_clone *info, + struct socket **newsockp) +{ + struct socket *newsock; + struct sock *newsk; + struct file *newfile; + int err, newfd; + + err = -ENFILE; + newsock = sock_alloc(); + if (!newsock) + goto out; + + newsock->type = osock->type; + newsock->ops = osock->ops; + + __module_get(newsock->ops->owner); + + newfd = get_unused_fd_flags(0); + if (unlikely(newfd < 0)) { + err = newfd; + goto out_fd_fail; + } + + newfile = sock_alloc_file(newsock, 0, osock->sk->sk_prot_creator->name); + if (unlikely(IS_ERR(newfile))) { + err = PTR_ERR(newfile); + goto out_sock_alloc_fail; + } + + newsk = sk_alloc(sock_net(osock->sk), PF_KCM, GFP_KERNEL, + &kcm_proto, true); + if (!newsk) { + err = -ENOMEM; + goto out_sk_alloc_fail; + } + + sock_init_data(newsock, newsk); + init_kcm_sock(kcm_sk(newsk), kcm_sk(osock->sk)->mux); + + fd_install(newfd, newfile); + *newsockp = newsock; + info->fd = newfd; + + return 0; + +out_sk_alloc_fail: + fput(newfile); +out_sock_alloc_fail: + put_unused_fd(newfd); +out_fd_fail: + sock_release(newsock); +out: + return err; +} + +static int kcm_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) +{ + int err; + + switch (cmd) { + case SIOCKCMATTACH: { + struct kcm_attach info; + + if (copy_from_user(&info, (void __user *)arg, sizeof(info))) + err = -EFAULT; + + err = kcm_attach_ioctl(sock, &info); + + break; + } + case SIOCKCMUNATTACH: { + struct kcm_unattach info; + + if (copy_from_user(&info, (void __user *)arg, sizeof(info))) + err = -EFAULT; + + err = kcm_unattach_ioctl(sock, &info); + + break; + } + case SIOCKCMCLONE: { + struct kcm_clone info; + struct socket *newsock = NULL; + + if (copy_from_user(&info, (void __user *)arg, sizeof(info))) + err = -EFAULT; + + err = kcm_clone(sock, &info, &newsock); + + if (!err) { + if (copy_to_user((void __user *)arg, &info, + sizeof(info))) { + err = -EFAULT; + sock_release(newsock); + } + } + + break; + } + default: + err = -ENOIOCTLCMD; + break; + } + + return err; +} + +static void free_mux(struct rcu_head *rcu) +{ + struct kcm_mux *mux = container_of(rcu, + struct kcm_mux, rcu); + + kmem_cache_free(kcm_muxp, mux); +} + +static void release_mux(struct kcm_mux *mux) +{ + struct kcm_net *knet = mux->knet; + struct kcm_psock *psock, *tmp_psock; + + /* Release psocks */ + list_for_each_entry_safe(psock, tmp_psock, + &mux->psocks, psock_list) { + if (!WARN_ON(psock->unattaching)) + kcm_unattach(psock); + } + + if (WARN_ON(mux->psocks_cnt)) + return; + + __skb_queue_purge(&mux->rx_hold_queue); + + mutex_lock(&knet->mutex); + aggregate_mux_stats(&mux->stats, &knet->aggregate_mux_stats); + aggregate_psock_stats(&mux->aggregate_psock_stats, + &knet->aggregate_psock_stats); + list_del_rcu(&mux->kcm_mux_list); + knet->count--; + mutex_unlock(&knet->mutex); + + call_rcu(&mux->rcu, free_mux); +} + +static void kcm_done(struct kcm_sock *kcm) +{ + struct kcm_mux *mux = kcm->mux; + struct sock *sk = &kcm->sk; + int socks_cnt; + + spin_lock_bh(&mux->rx_lock); + if (kcm->rx_psock) { + /* Cleanup in unreserve_rx_kcm */ + WARN_ON(kcm->done); + kcm->rx_disabled = 1; + kcm->done = 1; + spin_unlock_bh(&mux->rx_lock); + return; + } + + if (kcm->rx_wait) { + list_del(&kcm->wait_rx_list); + kcm->rx_wait = false; + } + /* Move any pending receive messages to other kcm sockets */ + requeue_rx_msgs(mux, &sk->sk_receive_queue); + + spin_unlock_bh(&mux->rx_lock); + + if (WARN_ON(sk_rmem_alloc_get(sk))) + return; + + /* Detach from MUX */ + spin_lock_bh(&mux->lock); + + list_del(&kcm->kcm_sock_list); + mux->kcm_socks_cnt--; + socks_cnt = mux->kcm_socks_cnt; + + spin_unlock_bh(&mux->lock); + + if (!socks_cnt) { + /* We are done with the mux now. */ + release_mux(mux); + } + + WARN_ON(kcm->rx_wait); + + sock_put(&kcm->sk); +} + +/* Called by kcm_release to close a KCM socket. + * If this is the last KCM socket on the MUX, destroy the MUX. + */ +static int kcm_release(struct socket *sock) +{ + struct sock *sk = sock->sk; + struct kcm_sock *kcm; + struct kcm_mux *mux; + struct kcm_psock *psock; + + if (!sk) + return 0; + + kcm = kcm_sk(sk); + mux = kcm->mux; + + sock_orphan(sk); + kfree_skb(kcm->seq_skb); + + lock_sock(sk); + /* Purge queue under lock to avoid race condition with tx_work trying + * to act when queue is nonempty. If tx_work runs after this point + * it will just return. + */ + __skb_queue_purge(&sk->sk_write_queue); + release_sock(sk); + + spin_lock_bh(&mux->lock); + if (kcm->tx_wait) { + /* Take of tx_wait list, after this point there should be no way + * that a psock will be assigned to this kcm. + */ + list_del(&kcm->wait_psock_list); + kcm->tx_wait = false; + } + spin_unlock_bh(&mux->lock); + + /* Cancel work. After this point there should be no outside references + * to the kcm socket. + */ + cancel_work_sync(&kcm->tx_work); + + lock_sock(sk); + psock = kcm->tx_psock; + if (psock) { + /* A psock was reserved, so we need to kill it since it + * may already have some bytes queued from a message. We + * need to do this after removing kcm from tx_wait list. + */ + kcm_abort_tx_psock(psock, EPIPE, false); + unreserve_psock(kcm); + } + release_sock(sk); + + WARN_ON(kcm->tx_wait); + WARN_ON(kcm->tx_psock); + + sock->sk = NULL; + + kcm_done(kcm); + + return 0; +} + +static const struct proto_ops kcm_dgram_ops = { + .family = PF_KCM, + .owner = THIS_MODULE, + .release = kcm_release, + .bind = sock_no_bind, + .connect = sock_no_connect, + .socketpair = sock_no_socketpair, + .accept = sock_no_accept, + .getname = sock_no_getname, + .poll = datagram_poll, + .ioctl = kcm_ioctl, + .listen = sock_no_listen, + .shutdown = sock_no_shutdown, + .setsockopt = kcm_setsockopt, + .getsockopt = kcm_getsockopt, + .sendmsg = kcm_sendmsg, + .recvmsg = kcm_recvmsg, + .mmap = sock_no_mmap, + .sendpage = kcm_sendpage, +}; + +static const struct proto_ops kcm_seqpacket_ops = { + .family = PF_KCM, + .owner = THIS_MODULE, + .release = kcm_release, + .bind = sock_no_bind, + .connect = sock_no_connect, + .socketpair = sock_no_socketpair, + .accept = sock_no_accept, + .getname = sock_no_getname, + .poll = datagram_poll, + .ioctl = kcm_ioctl, + .listen = sock_no_listen, + .shutdown = sock_no_shutdown, + .setsockopt = kcm_setsockopt, + .getsockopt = kcm_getsockopt, + .sendmsg = kcm_sendmsg, + .recvmsg = kcm_recvmsg, + .mmap = sock_no_mmap, + .sendpage = kcm_sendpage, + .splice_read = kcm_splice_read, +}; + +/* Create proto operation for kcm sockets */ +static int kcm_create(struct net *net, struct socket *sock, + int protocol, int kern) +{ + struct kcm_net *knet = net_generic(net, kcm_net_id); + struct sock *sk; + struct kcm_mux *mux; + + switch (sock->type) { + case SOCK_DGRAM: + sock->ops = &kcm_dgram_ops; + break; + case SOCK_SEQPACKET: + sock->ops = &kcm_seqpacket_ops; + break; + default: + return -ESOCKTNOSUPPORT; + } + + if (protocol != KCMPROTO_CONNECTED) + return -EPROTONOSUPPORT; + + sk = sk_alloc(net, PF_KCM, GFP_KERNEL, &kcm_proto, kern); + if (!sk) + return -ENOMEM; + + /* Allocate a kcm mux, shared between KCM sockets */ + mux = kmem_cache_zalloc(kcm_muxp, GFP_KERNEL); + if (!mux) { + sk_free(sk); + return -ENOMEM; + } + + spin_lock_init(&mux->lock); + spin_lock_init(&mux->rx_lock); + INIT_LIST_HEAD(&mux->kcm_socks); + INIT_LIST_HEAD(&mux->kcm_rx_waiters); + INIT_LIST_HEAD(&mux->kcm_tx_waiters); + + INIT_LIST_HEAD(&mux->psocks); + INIT_LIST_HEAD(&mux->psocks_ready); + INIT_LIST_HEAD(&mux->psocks_avail); + + mux->knet = knet; + + /* Add new MUX to list */ + mutex_lock(&knet->mutex); + list_add_rcu(&mux->kcm_mux_list, &knet->mux_list); + knet->count++; + mutex_unlock(&knet->mutex); + + skb_queue_head_init(&mux->rx_hold_queue); + + /* Init KCM socket */ + sock_init_data(sock, sk); + init_kcm_sock(kcm_sk(sk), mux); + + return 0; +} + +static struct net_proto_family kcm_family_ops = { + .family = PF_KCM, + .create = kcm_create, + .owner = THIS_MODULE, +}; + +static __net_init int kcm_init_net(struct net *net) +{ + struct kcm_net *knet = net_generic(net, kcm_net_id); + + INIT_LIST_HEAD_RCU(&knet->mux_list); + mutex_init(&knet->mutex); + + return 0; +} + +static __net_exit void kcm_exit_net(struct net *net) +{ + struct kcm_net *knet = net_generic(net, kcm_net_id); + + /* All KCM sockets should be closed at this point, which should mean + * that all multiplexors and psocks have been destroyed. + */ + WARN_ON(!list_empty(&knet->mux_list)); +} + +static struct pernet_operations kcm_net_ops = { + .init = kcm_init_net, + .exit = kcm_exit_net, + .id = &kcm_net_id, + .size = sizeof(struct kcm_net), +}; + +static int __init kcm_init(void) +{ + int err = -ENOMEM; + + kcm_muxp = kmem_cache_create("kcm_mux_cache", + sizeof(struct kcm_mux), 0, + SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); + if (!kcm_muxp) + goto fail; + + kcm_psockp = kmem_cache_create("kcm_psock_cache", + sizeof(struct kcm_psock), 0, + SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); + if (!kcm_psockp) + goto fail; + + kcm_wq = create_singlethread_workqueue("kkcmd"); + if (!kcm_wq) + goto fail; + + err = proto_register(&kcm_proto, 1); + if (err) + goto fail; + + err = sock_register(&kcm_family_ops); + if (err) + goto sock_register_fail; + + err = register_pernet_device(&kcm_net_ops); + if (err) + goto net_ops_fail; + + err = kcm_proc_init(); + if (err) + goto proc_init_fail; + + return 0; + +proc_init_fail: + unregister_pernet_device(&kcm_net_ops); + +net_ops_fail: + sock_unregister(PF_KCM); + +sock_register_fail: + proto_unregister(&kcm_proto); + +fail: + kmem_cache_destroy(kcm_muxp); + kmem_cache_destroy(kcm_psockp); + + if (kcm_wq) + destroy_workqueue(kcm_wq); + + return err; +} + +static void __exit kcm_exit(void) +{ + kcm_proc_exit(); + unregister_pernet_device(&kcm_net_ops); + sock_unregister(PF_KCM); + proto_unregister(&kcm_proto); + destroy_workqueue(kcm_wq); + + kmem_cache_destroy(kcm_muxp); + kmem_cache_destroy(kcm_psockp); +} + +module_init(kcm_init); +module_exit(kcm_exit); + +MODULE_LICENSE("GPL"); +MODULE_ALIAS_NETPROTO(PF_KCM); + diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index afca2eb4d..6edfa9980 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -1376,9 +1376,9 @@ static int l2tp_tunnel_sock_create(struct net *net, memcpy(&udp_conf.peer_ip6, cfg->peer_ip6, sizeof(udp_conf.peer_ip6)); udp_conf.use_udp6_tx_checksums = - cfg->udp6_zero_tx_checksums; + ! cfg->udp6_zero_tx_checksums; udp_conf.use_udp6_rx_checksums = - cfg->udp6_zero_rx_checksums; + ! cfg->udp6_zero_rx_checksums; } else #endif { diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c index 9ee4ddb6b..cd479903d 100644 --- a/net/l2tp/l2tp_ip6.c +++ b/net/l2tp/l2tp_ip6.c @@ -25,6 +25,7 @@ #include <net/udp.h> #include <net/inet_common.h> #include <net/inet_hashtables.h> +#include <net/inet6_hashtables.h> #include <net/tcp_states.h> #include <net/protocol.h> #include <net/xfrm.h> @@ -720,7 +721,7 @@ static struct proto l2tp_ip6_prot = { .sendmsg = l2tp_ip6_sendmsg, .recvmsg = l2tp_ip6_recvmsg, .backlog_rcv = l2tp_ip6_backlog_recv, - .hash = inet_hash, + .hash = inet6_hash, .unhash = inet_unhash, .obj_size = sizeof(struct l2tp_ip6_sock), #ifdef CONFIG_COMPAT diff --git a/net/l3mdev/l3mdev.c b/net/l3mdev/l3mdev.c index 8e5ead366..e925037fa 100644 --- a/net/l3mdev/l3mdev.c +++ b/net/l3mdev/l3mdev.c @@ -17,7 +17,7 @@ * @dev: targeted interface */ -int l3mdev_master_ifindex_rcu(struct net_device *dev) +int l3mdev_master_ifindex_rcu(const struct net_device *dev) { int ifindex = 0; @@ -28,8 +28,15 @@ int l3mdev_master_ifindex_rcu(struct net_device *dev) ifindex = dev->ifindex; } else if (netif_is_l3_slave(dev)) { struct net_device *master; + struct net_device *_dev = (struct net_device *)dev; - master = netdev_master_upper_dev_get_rcu(dev); + /* netdev_master_upper_dev_get_rcu calls + * list_first_or_null_rcu to walk the upper dev list. + * list_first_or_null_rcu does not handle a const arg. We aren't + * making changes, just want the master device from that list so + * typecast to remove the const + */ + master = netdev_master_upper_dev_get_rcu(_dev); if (master) ifindex = master->ifindex; } diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index 8dab4e569..8ae3ed97d 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -38,7 +38,7 @@ static u16 llc_ui_sap_link_no_max[256]; static struct sockaddr_llc llc_ui_addrnull; static const struct proto_ops llc_ui_ops; -static int llc_ui_wait_for_conn(struct sock *sk, long timeout); +static long llc_ui_wait_for_conn(struct sock *sk, long timeout); static int llc_ui_wait_for_disc(struct sock *sk, long timeout); static int llc_ui_wait_for_busy_core(struct sock *sk, long timeout); @@ -551,7 +551,7 @@ static int llc_ui_wait_for_disc(struct sock *sk, long timeout) return rc; } -static int llc_ui_wait_for_conn(struct sock *sk, long timeout) +static long llc_ui_wait_for_conn(struct sock *sk, long timeout) { DEFINE_WAIT(wait); @@ -626,6 +626,7 @@ static void llc_cmsg_rcv(struct msghdr *msg, struct sk_buff *skb) if (llc->cmsg_flags & LLC_CMSG_PKTINFO) { struct llc_pktinfo info; + memset(&info, 0, sizeof(info)); info.lpi_ifindex = llc_sk(skb->sk)->dev->ifindex; llc_pdu_decode_dsap(skb, &info.lpi_sap); llc_pdu_decode_da(skb, info.lpi_mac); diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c index 367784be5..3a8f881b2 100644 --- a/net/mac80211/agg-rx.c +++ b/net/mac80211/agg-rx.c @@ -7,6 +7,7 @@ * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> * Copyright 2007, Michael Wu <flamingice@sourmilk.net> * Copyright 2007-2010, Intel Corporation + * Copyright(c) 2015 Intel Deutschland GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -61,16 +62,25 @@ void ___ieee80211_stop_rx_ba_session(struct sta_info *sta, u16 tid, { struct ieee80211_local *local = sta->local; struct tid_ampdu_rx *tid_rx; + struct ieee80211_ampdu_params params = { + .sta = &sta->sta, + .action = IEEE80211_AMPDU_RX_STOP, + .tid = tid, + .amsdu = false, + .timeout = 0, + .ssn = 0, + }; lockdep_assert_held(&sta->ampdu_mlme.mtx); tid_rx = rcu_dereference_protected(sta->ampdu_mlme.tid_rx[tid], lockdep_is_held(&sta->ampdu_mlme.mtx)); - if (!tid_rx) + if (!test_bit(tid, sta->ampdu_mlme.agg_session_valid)) return; RCU_INIT_POINTER(sta->ampdu_mlme.tid_rx[tid], NULL); + __clear_bit(tid, sta->ampdu_mlme.agg_session_valid); ht_dbg(sta->sdata, "Rx BA session stop requested for %pM tid %u %s reason: %d\n", @@ -78,8 +88,7 @@ void ___ieee80211_stop_rx_ba_session(struct sta_info *sta, u16 tid, initiator == WLAN_BACK_RECIPIENT ? "recipient" : "inititator", (int)reason); - if (drv_ampdu_action(local, sta->sdata, IEEE80211_AMPDU_RX_STOP, - &sta->sta, tid, NULL, 0, false)) + if (drv_ampdu_action(local, sta->sdata, ¶ms)) sdata_info(sta->sdata, "HW problem - can not stop rx aggregation for %pM tid %d\n", sta->sta.addr, tid); @@ -89,6 +98,13 @@ void ___ieee80211_stop_rx_ba_session(struct sta_info *sta, u16 tid, ieee80211_send_delba(sta->sdata, sta->sta.addr, tid, WLAN_BACK_RECIPIENT, reason); + /* + * return here in case tid_rx is not assigned - which will happen if + * IEEE80211_HW_SUPPORTS_REORDERING_BUFFER is set. + */ + if (!tid_rx) + return; + del_timer_sync(&tid_rx->session_timer); /* make sure ieee80211_sta_reorder_release() doesn't re-arm the timer */ @@ -237,6 +253,15 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta, { struct ieee80211_local *local = sta->sdata->local; struct tid_ampdu_rx *tid_agg_rx; + struct ieee80211_ampdu_params params = { + .sta = &sta->sta, + .action = IEEE80211_AMPDU_RX_START, + .tid = tid, + .amsdu = false, + .timeout = timeout, + .ssn = start_seq_num, + }; + int i, ret = -EOPNOTSUPP; u16 status = WLAN_STATUS_REQUEST_DECLINED; @@ -275,11 +300,12 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta, /* make sure the size doesn't exceed the maximum supported by the hw */ if (buf_size > local->hw.max_rx_aggregation_subframes) buf_size = local->hw.max_rx_aggregation_subframes; + params.buf_size = buf_size; /* examine state machine */ mutex_lock(&sta->ampdu_mlme.mtx); - if (sta->ampdu_mlme.tid_rx[tid]) { + if (test_bit(tid, sta->ampdu_mlme.agg_session_valid)) { ht_dbg_ratelimited(sta->sdata, "unexpected AddBA Req from %pM on tid %u\n", sta->sta.addr, tid); @@ -290,6 +316,16 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta, false); } + if (ieee80211_hw_check(&local->hw, SUPPORTS_REORDERING_BUFFER)) { + ret = drv_ampdu_action(local, sta->sdata, ¶ms); + ht_dbg(sta->sdata, + "Rx A-MPDU request on %pM tid %d result %d\n", + sta->sta.addr, tid, ret); + if (!ret) + status = WLAN_STATUS_SUCCESS; + goto end; + } + /* prepare A-MPDU MLME for Rx aggregation */ tid_agg_rx = kzalloc(sizeof(*tid_agg_rx), GFP_KERNEL); if (!tid_agg_rx) @@ -322,8 +358,7 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta, for (i = 0; i < buf_size; i++) __skb_queue_head_init(&tid_agg_rx->reorder_buf[i]); - ret = drv_ampdu_action(local, sta->sdata, IEEE80211_AMPDU_RX_START, - &sta->sta, tid, &start_seq_num, 0, false); + ret = drv_ampdu_action(local, sta->sdata, ¶ms); ht_dbg(sta->sdata, "Rx A-MPDU request on %pM tid %d result %d\n", sta->sta.addr, tid, ret); if (ret) { @@ -341,6 +376,7 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta, tid_agg_rx->timeout = timeout; tid_agg_rx->stored_mpdu_num = 0; tid_agg_rx->auto_seq = auto_seq; + tid_agg_rx->reorder_buf_filtered = 0; status = WLAN_STATUS_SUCCESS; /* activate it for RX */ @@ -352,6 +388,8 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta, } end: + if (status == WLAN_STATUS_SUCCESS) + __set_bit(tid, sta->ampdu_mlme.agg_session_valid); mutex_unlock(&sta->ampdu_mlme.mtx); end_no_lock: diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c index ff757181b..4932e9f24 100644 --- a/net/mac80211/agg-tx.c +++ b/net/mac80211/agg-tx.c @@ -7,6 +7,7 @@ * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> * Copyright 2007, Michael Wu <flamingice@sourmilk.net> * Copyright 2007-2010, Intel Corporation + * Copyright(c) 2015 Intel Deutschland GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -295,7 +296,14 @@ int ___ieee80211_stop_tx_ba_session(struct sta_info *sta, u16 tid, { struct ieee80211_local *local = sta->local; struct tid_ampdu_tx *tid_tx; - enum ieee80211_ampdu_mlme_action action; + struct ieee80211_ampdu_params params = { + .sta = &sta->sta, + .tid = tid, + .buf_size = 0, + .amsdu = false, + .timeout = 0, + .ssn = 0, + }; int ret; lockdep_assert_held(&sta->ampdu_mlme.mtx); @@ -304,10 +312,10 @@ int ___ieee80211_stop_tx_ba_session(struct sta_info *sta, u16 tid, case AGG_STOP_DECLINED: case AGG_STOP_LOCAL_REQUEST: case AGG_STOP_PEER_REQUEST: - action = IEEE80211_AMPDU_TX_STOP_CONT; + params.action = IEEE80211_AMPDU_TX_STOP_CONT; break; case AGG_STOP_DESTROY_STA: - action = IEEE80211_AMPDU_TX_STOP_FLUSH; + params.action = IEEE80211_AMPDU_TX_STOP_FLUSH; break; default: WARN_ON_ONCE(1); @@ -330,9 +338,8 @@ int ___ieee80211_stop_tx_ba_session(struct sta_info *sta, u16 tid, spin_unlock_bh(&sta->lock); if (reason != AGG_STOP_DESTROY_STA) return -EALREADY; - ret = drv_ampdu_action(local, sta->sdata, - IEEE80211_AMPDU_TX_STOP_FLUSH_CONT, - &sta->sta, tid, NULL, 0, false); + params.action = IEEE80211_AMPDU_TX_STOP_FLUSH_CONT; + ret = drv_ampdu_action(local, sta->sdata, ¶ms); WARN_ON_ONCE(ret); return 0; } @@ -381,8 +388,7 @@ int ___ieee80211_stop_tx_ba_session(struct sta_info *sta, u16 tid, WLAN_BACK_INITIATOR; tid_tx->tx_stop = reason == AGG_STOP_LOCAL_REQUEST; - ret = drv_ampdu_action(local, sta->sdata, action, - &sta->sta, tid, NULL, 0, false); + ret = drv_ampdu_action(local, sta->sdata, ¶ms); /* HW shall not deny going back to legacy */ if (WARN_ON(ret)) { @@ -445,7 +451,14 @@ void ieee80211_tx_ba_session_handle_start(struct sta_info *sta, int tid) struct tid_ampdu_tx *tid_tx; struct ieee80211_local *local = sta->local; struct ieee80211_sub_if_data *sdata = sta->sdata; - u16 start_seq_num; + struct ieee80211_ampdu_params params = { + .sta = &sta->sta, + .action = IEEE80211_AMPDU_TX_START, + .tid = tid, + .buf_size = 0, + .amsdu = false, + .timeout = 0, + }; int ret; tid_tx = rcu_dereference_protected_tid_tx(sta, tid); @@ -467,10 +480,8 @@ void ieee80211_tx_ba_session_handle_start(struct sta_info *sta, int tid) */ synchronize_net(); - start_seq_num = sta->tid_seq[tid] >> 4; - - ret = drv_ampdu_action(local, sdata, IEEE80211_AMPDU_TX_START, - &sta->sta, tid, &start_seq_num, 0, false); + params.ssn = sta->tid_seq[tid] >> 4; + ret = drv_ampdu_action(local, sdata, ¶ms); if (ret) { ht_dbg(sdata, "BA request denied - HW unavailable for %pM tid %d\n", @@ -499,7 +510,7 @@ void ieee80211_tx_ba_session_handle_start(struct sta_info *sta, int tid) /* send AddBA request */ ieee80211_send_addba_request(sdata, sta->sta.addr, tid, - tid_tx->dialog_token, start_seq_num, + tid_tx->dialog_token, params.ssn, IEEE80211_MAX_AMPDU_BUF, tid_tx->timeout); } @@ -684,18 +695,24 @@ static void ieee80211_agg_tx_operational(struct ieee80211_local *local, struct sta_info *sta, u16 tid) { struct tid_ampdu_tx *tid_tx; + struct ieee80211_ampdu_params params = { + .sta = &sta->sta, + .action = IEEE80211_AMPDU_TX_OPERATIONAL, + .tid = tid, + .timeout = 0, + .ssn = 0, + }; lockdep_assert_held(&sta->ampdu_mlme.mtx); tid_tx = rcu_dereference_protected_tid_tx(sta, tid); + params.buf_size = tid_tx->buf_size; + params.amsdu = tid_tx->amsdu; ht_dbg(sta->sdata, "Aggregation is on for %pM tid %d\n", sta->sta.addr, tid); - drv_ampdu_action(local, sta->sdata, - IEEE80211_AMPDU_TX_OPERATIONAL, - &sta->sta, tid, NULL, tid_tx->buf_size, - tid_tx->amsdu); + drv_ampdu_action(local, sta->sdata, ¶ms); /* * synchronize with TX path, while splicing the TX path diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 166a29fe6..fe1704c4e 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -339,8 +339,9 @@ static int ieee80211_get_key(struct wiphy *wiphy, struct net_device *dev, switch (key->conf.cipher) { case WLAN_CIPHER_SUITE_TKIP: - iv32 = key->u.tkip.tx.iv32; - iv16 = key->u.tkip.tx.iv16; + pn64 = atomic64_read(&key->conf.tx_pn); + iv32 = TKIP_PN_TO_IV32(pn64); + iv16 = TKIP_PN_TO_IV16(pn64); if (key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE && !(key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_IV)) { @@ -1131,6 +1132,34 @@ static int sta_apply_parameters(struct ieee80211_local *local, sta->sta.max_sp = params->max_sp; } + /* The sender might not have sent the last bit, consider it to be 0 */ + if (params->ext_capab_len >= 8) { + u8 val = (params->ext_capab[7] & + WLAN_EXT_CAPA8_MAX_MSDU_IN_AMSDU_LSB) >> 7; + + /* we did get all the bits, take the MSB as well */ + if (params->ext_capab_len >= 9) { + u8 val_msb = params->ext_capab[8] & + WLAN_EXT_CAPA9_MAX_MSDU_IN_AMSDU_MSB; + val_msb <<= 1; + val |= val_msb; + } + + switch (val) { + case 1: + sta->sta.max_amsdu_subframes = 32; + break; + case 2: + sta->sta.max_amsdu_subframes = 16; + break; + case 3: + sta->sta.max_amsdu_subframes = 8; + break; + default: + sta->sta.max_amsdu_subframes = 0; + } + } + /* * cfg80211 validates this (1-2007) and allows setting the AID * only when creating a new station entry @@ -1160,6 +1189,7 @@ static int sta_apply_parameters(struct ieee80211_local *local, ieee80211_ht_cap_ie_to_sta_ht_cap(sdata, sband, params->ht_capa, sta); + /* VHT can override some HT caps such as the A-MSDU max length */ if (params->vht_capa) ieee80211_vht_cap_ie_to_sta_vht_cap(sdata, sband, params->vht_capa, sta); diff --git a/net/mac80211/chan.c b/net/mac80211/chan.c index 1d1b9b7bd..74142d07a 100644 --- a/net/mac80211/chan.c +++ b/net/mac80211/chan.c @@ -231,7 +231,7 @@ ieee80211_get_max_required_bw(struct ieee80211_sub_if_data *sdata) !(sta->sdata->bss && sta->sdata->bss == sdata->bss)) continue; - if (!sta->uploaded) + if (!sta->uploaded || !test_sta_flag(sta, WLAN_STA_ASSOC)) continue; max_bw = max(max_bw, ieee80211_get_sta_bw(&sta->sta)); @@ -343,8 +343,10 @@ static void ieee80211_change_chanctx(struct ieee80211_local *local, struct ieee80211_chanctx *ctx, const struct cfg80211_chan_def *chandef) { - if (cfg80211_chandef_identical(&ctx->conf.def, chandef)) + if (cfg80211_chandef_identical(&ctx->conf.def, chandef)) { + ieee80211_recalc_chanctx_min_def(local, ctx); return; + } WARN_ON(!cfg80211_chandef_compatible(&ctx->conf.def, chandef)); diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c index 3e24d0ddb..4ab5c522c 100644 --- a/net/mac80211/debugfs.c +++ b/net/mac80211/debugfs.c @@ -126,6 +126,7 @@ static const char *hw_flag_names[] = { FLAG(SUPPORTS_AMSDU_IN_AMPDU), FLAG(BEACON_TX_STATUS), FLAG(NEEDS_UNIQUE_STA_ADDR), + FLAG(SUPPORTS_REORDERING_BUFFER), #undef FLAG }; diff --git a/net/mac80211/debugfs_key.c b/net/mac80211/debugfs_key.c index 7961e7d0b..a2ef95f16 100644 --- a/net/mac80211/debugfs_key.c +++ b/net/mac80211/debugfs_key.c @@ -132,9 +132,10 @@ static ssize_t key_tx_spec_read(struct file *file, char __user *userbuf, len = scnprintf(buf, sizeof(buf), "\n"); break; case WLAN_CIPHER_SUITE_TKIP: + pn = atomic64_read(&key->conf.tx_pn); len = scnprintf(buf, sizeof(buf), "%08x %04x\n", - key->u.tkip.tx.iv32, - key->u.tkip.tx.iv16); + TKIP_PN_TO_IV32(pn), + TKIP_PN_TO_IV16(pn)); break; case WLAN_CIPHER_SUITE_CCMP: case WLAN_CIPHER_SUITE_CCMP_256: diff --git a/net/mac80211/driver-ops.c b/net/mac80211/driver-ops.c index ca1fe5576..c258f1041 100644 --- a/net/mac80211/driver-ops.c +++ b/net/mac80211/driver-ops.c @@ -284,9 +284,7 @@ int drv_switch_vif_chanctx(struct ieee80211_local *local, int drv_ampdu_action(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, - enum ieee80211_ampdu_mlme_action action, - struct ieee80211_sta *sta, u16 tid, - u16 *ssn, u8 buf_size, bool amsdu) + struct ieee80211_ampdu_params *params) { int ret = -EOPNOTSUPP; @@ -296,12 +294,10 @@ int drv_ampdu_action(struct ieee80211_local *local, if (!check_sdata_in_driver(sdata)) return -EIO; - trace_drv_ampdu_action(local, sdata, action, sta, tid, - ssn, buf_size, amsdu); + trace_drv_ampdu_action(local, sdata, params); if (local->ops->ampdu_action) - ret = local->ops->ampdu_action(&local->hw, &sdata->vif, action, - sta, tid, ssn, buf_size, amsdu); + ret = local->ops->ampdu_action(&local->hw, &sdata->vif, params); trace_drv_return_int(local, ret); diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h index 154ce4b13..18b0d65ba 100644 --- a/net/mac80211/driver-ops.h +++ b/net/mac80211/driver-ops.h @@ -585,9 +585,7 @@ static inline int drv_tx_last_beacon(struct ieee80211_local *local) int drv_ampdu_action(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, - enum ieee80211_ampdu_mlme_action action, - struct ieee80211_sta *sta, u16 tid, - u16 *ssn, u8 buf_size, bool amsdu); + struct ieee80211_ampdu_params *params); static inline int drv_get_survey(struct ieee80211_local *local, int idx, struct survey_info *survey) diff --git a/net/mac80211/ht.c b/net/mac80211/ht.c index 7a76ce639..f4a528773 100644 --- a/net/mac80211/ht.c +++ b/net/mac80211/ht.c @@ -230,6 +230,11 @@ bool ieee80211_ht_cap_ie_to_sta_ht_cap(struct ieee80211_sub_if_data *sdata, /* set Rx highest rate */ ht_cap.mcs.rx_highest = ht_cap_ie->mcs.rx_highest; + if (ht_cap.cap & IEEE80211_HT_CAP_MAX_AMSDU) + sta->sta.max_amsdu_len = IEEE80211_MAX_MPDU_LEN_HT_7935; + else + sta->sta.max_amsdu_len = IEEE80211_MAX_MPDU_LEN_HT_3839; + apply: changed = memcmp(&sta->sta.ht_cap, &ht_cap, sizeof(ht_cap)); diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c index 1b33d8990..fc3238376 100644 --- a/net/mac80211/ibss.c +++ b/net/mac80211/ibss.c @@ -1051,9 +1051,8 @@ static void ieee80211_update_sta_info(struct ieee80211_sub_if_data *sdata, struct cfg80211_chan_def chandef; enum ieee80211_sta_rx_bandwidth bw = sta->sta.bandwidth; - ieee80211_ht_oper_to_chandef(channel, - elems->ht_operation, - &chandef); + cfg80211_chandef_create(&chandef, channel, NL80211_CHAN_NO_HT); + ieee80211_chandef_ht_oper(elems->ht_operation, &chandef); memcpy(&htcap_ie, elems->ht_cap_elem, sizeof(htcap_ie)); rates_updated |= ieee80211_ht_cap_ie_to_sta_ht_cap(sdata, sband, @@ -1067,9 +1066,8 @@ static void ieee80211_update_sta_info(struct ieee80211_sub_if_data *sdata, struct ieee80211_vht_cap cap_ie; struct ieee80211_sta_vht_cap cap = sta->sta.vht_cap; - ieee80211_vht_oper_to_chandef(channel, - elems->vht_operation, - &chandef); + ieee80211_chandef_vht_oper(elems->vht_operation, + &chandef); memcpy(&cap_ie, elems->vht_cap_elem, sizeof(cap_ie)); ieee80211_vht_cap_ie_to_sta_vht_cap(sdata, sband, &cap_ie, sta); diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index f006f4a44..422003540 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -716,7 +716,6 @@ struct ieee80211_if_mesh { * back to wireless media and to the local net stack. * @IEEE80211_SDATA_DISCONNECT_RESUME: Disconnect after resume. * @IEEE80211_SDATA_IN_DRIVER: indicates interface was added to driver - * @IEEE80211_SDATA_MU_MIMO_OWNER: indicates interface owns MU-MIMO capability */ enum ieee80211_sub_if_data_flags { IEEE80211_SDATA_ALLMULTI = BIT(0), @@ -724,7 +723,6 @@ enum ieee80211_sub_if_data_flags { IEEE80211_SDATA_DONT_BRIDGE_PACKETS = BIT(3), IEEE80211_SDATA_DISCONNECT_RESUME = BIT(4), IEEE80211_SDATA_IN_DRIVER = BIT(5), - IEEE80211_SDATA_MU_MIMO_OWNER = BIT(6), }; /** @@ -804,6 +802,7 @@ enum txq_info_flags { struct txq_info { struct sk_buff_head queue; unsigned long flags; + unsigned long byte_cnt; /* keep last! */ struct ieee80211_txq txq; @@ -1466,7 +1465,13 @@ ieee80211_have_rx_timestamp(struct ieee80211_rx_status *status) { WARN_ON_ONCE(status->flag & RX_FLAG_MACTIME_START && status->flag & RX_FLAG_MACTIME_END); - return status->flag & (RX_FLAG_MACTIME_START | RX_FLAG_MACTIME_END); + if (status->flag & (RX_FLAG_MACTIME_START | RX_FLAG_MACTIME_END)) + return true; + /* can't handle HT/VHT preamble yet */ + if (status->flag & RX_FLAG_MACTIME_PLCP_START && + !(status->flag & (RX_FLAG_HT | RX_FLAG_VHT))) + return true; + return false; } u64 ieee80211_calculate_rx_timestamp(struct ieee80211_local *local, @@ -1714,6 +1719,12 @@ ieee80211_vht_cap_ie_to_sta_vht_cap(struct ieee80211_sub_if_data *sdata, enum ieee80211_sta_rx_bandwidth ieee80211_sta_cap_rx_bw(struct sta_info *sta); enum ieee80211_sta_rx_bandwidth ieee80211_sta_cur_vht_bw(struct sta_info *sta); void ieee80211_sta_set_rx_nss(struct sta_info *sta); +enum ieee80211_sta_rx_bandwidth +ieee80211_chan_width_to_rx_bw(enum nl80211_chan_width width); +enum nl80211_chan_width ieee80211_sta_cap_chan_bw(struct sta_info *sta); +void ieee80211_sta_set_rx_nss(struct sta_info *sta); +void ieee80211_process_mu_groups(struct ieee80211_sub_if_data *sdata, + struct ieee80211_mgmt *mgmt); u32 __ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, u8 opmode, enum ieee80211_band band); @@ -1829,20 +1840,6 @@ static inline void ieee802_11_parse_elems(const u8 *start, size_t len, ieee802_11_parse_elems_crc(start, len, action, elems, 0, 0); } -static inline bool ieee80211_rx_reorder_ready(struct sk_buff_head *frames) -{ - struct sk_buff *tail = skb_peek_tail(frames); - struct ieee80211_rx_status *status; - - if (!tail) - return false; - - status = IEEE80211_SKB_RXCB(tail); - if (status->flag & RX_FLAG_AMSDU_MORE) - return false; - - return true; -} extern const int ieee802_1d_to_ac[8]; @@ -1986,12 +1983,10 @@ int ieee80211_add_ext_srates_ie(struct ieee80211_sub_if_data *sdata, u8 *ieee80211_add_wmm_info_ie(u8 *buf, u8 qosinfo); /* channel management */ -void ieee80211_ht_oper_to_chandef(struct ieee80211_channel *control_chan, - const struct ieee80211_ht_operation *ht_oper, - struct cfg80211_chan_def *chandef); -void ieee80211_vht_oper_to_chandef(struct ieee80211_channel *control_chan, - const struct ieee80211_vht_operation *oper, - struct cfg80211_chan_def *chandef); +bool ieee80211_chandef_ht_oper(const struct ieee80211_ht_operation *ht_oper, + struct cfg80211_chan_def *chandef); +bool ieee80211_chandef_vht_oper(const struct ieee80211_vht_operation *oper, + struct cfg80211_chan_def *chandef); u32 ieee80211_chandef_downgrade(struct cfg80211_chan_def *c); int __must_check diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index bcb0a1b64..e1cb22c16 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -979,6 +979,7 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, spin_lock_bh(&txqi->queue.lock); ieee80211_purge_tx_queue(&local->hw, &txqi->queue); + txqi->byte_cnt = 0; spin_unlock_bh(&txqi->queue.lock); atomic_set(&sdata->txqs_len[txqi->txq.ac], 0); @@ -1274,6 +1275,16 @@ static void ieee80211_iface_work(struct work_struct *work) } } mutex_unlock(&local->sta_mtx); + } else if (ieee80211_is_action(mgmt->frame_control) && + mgmt->u.action.category == WLAN_CATEGORY_VHT) { + switch (mgmt->u.action.u.vht_group_notif.action_code) { + case WLAN_VHT_ACTION_GROUPID_MGMT: + ieee80211_process_mu_groups(sdata, mgmt); + break; + default: + WARN_ON(1); + break; + } } else if (ieee80211_is_data_qos(mgmt->frame_control)) { struct ieee80211_hdr *hdr = (void *)mgmt; /* diff --git a/net/mac80211/key.c b/net/mac80211/key.c index 5e5bc599d..3df7b0392 100644 --- a/net/mac80211/key.c +++ b/net/mac80211/key.c @@ -932,50 +932,6 @@ void ieee80211_gtk_rekey_notify(struct ieee80211_vif *vif, const u8 *bssid, } EXPORT_SYMBOL_GPL(ieee80211_gtk_rekey_notify); -void ieee80211_get_key_tx_seq(struct ieee80211_key_conf *keyconf, - struct ieee80211_key_seq *seq) -{ - struct ieee80211_key *key; - u64 pn64; - - if (WARN_ON(!(keyconf->flags & IEEE80211_KEY_FLAG_GENERATE_IV))) - return; - - key = container_of(keyconf, struct ieee80211_key, conf); - - switch (key->conf.cipher) { - case WLAN_CIPHER_SUITE_TKIP: - seq->tkip.iv32 = key->u.tkip.tx.iv32; - seq->tkip.iv16 = key->u.tkip.tx.iv16; - break; - case WLAN_CIPHER_SUITE_CCMP: - case WLAN_CIPHER_SUITE_CCMP_256: - case WLAN_CIPHER_SUITE_AES_CMAC: - case WLAN_CIPHER_SUITE_BIP_CMAC_256: - BUILD_BUG_ON(offsetof(typeof(*seq), ccmp) != - offsetof(typeof(*seq), aes_cmac)); - case WLAN_CIPHER_SUITE_BIP_GMAC_128: - case WLAN_CIPHER_SUITE_BIP_GMAC_256: - BUILD_BUG_ON(offsetof(typeof(*seq), ccmp) != - offsetof(typeof(*seq), aes_gmac)); - case WLAN_CIPHER_SUITE_GCMP: - case WLAN_CIPHER_SUITE_GCMP_256: - BUILD_BUG_ON(offsetof(typeof(*seq), ccmp) != - offsetof(typeof(*seq), gcmp)); - pn64 = atomic64_read(&key->conf.tx_pn); - seq->ccmp.pn[5] = pn64; - seq->ccmp.pn[4] = pn64 >> 8; - seq->ccmp.pn[3] = pn64 >> 16; - seq->ccmp.pn[2] = pn64 >> 24; - seq->ccmp.pn[1] = pn64 >> 32; - seq->ccmp.pn[0] = pn64 >> 40; - break; - default: - WARN_ON(1); - } -} -EXPORT_SYMBOL(ieee80211_get_key_tx_seq); - void ieee80211_get_key_rx_seq(struct ieee80211_key_conf *keyconf, int tid, struct ieee80211_key_seq *seq) { @@ -1029,48 +985,6 @@ void ieee80211_get_key_rx_seq(struct ieee80211_key_conf *keyconf, } EXPORT_SYMBOL(ieee80211_get_key_rx_seq); -void ieee80211_set_key_tx_seq(struct ieee80211_key_conf *keyconf, - struct ieee80211_key_seq *seq) -{ - struct ieee80211_key *key; - u64 pn64; - - key = container_of(keyconf, struct ieee80211_key, conf); - - switch (key->conf.cipher) { - case WLAN_CIPHER_SUITE_TKIP: - key->u.tkip.tx.iv32 = seq->tkip.iv32; - key->u.tkip.tx.iv16 = seq->tkip.iv16; - break; - case WLAN_CIPHER_SUITE_CCMP: - case WLAN_CIPHER_SUITE_CCMP_256: - case WLAN_CIPHER_SUITE_AES_CMAC: - case WLAN_CIPHER_SUITE_BIP_CMAC_256: - BUILD_BUG_ON(offsetof(typeof(*seq), ccmp) != - offsetof(typeof(*seq), aes_cmac)); - case WLAN_CIPHER_SUITE_BIP_GMAC_128: - case WLAN_CIPHER_SUITE_BIP_GMAC_256: - BUILD_BUG_ON(offsetof(typeof(*seq), ccmp) != - offsetof(typeof(*seq), aes_gmac)); - case WLAN_CIPHER_SUITE_GCMP: - case WLAN_CIPHER_SUITE_GCMP_256: - BUILD_BUG_ON(offsetof(typeof(*seq), ccmp) != - offsetof(typeof(*seq), gcmp)); - pn64 = (u64)seq->ccmp.pn[5] | - ((u64)seq->ccmp.pn[4] << 8) | - ((u64)seq->ccmp.pn[3] << 16) | - ((u64)seq->ccmp.pn[2] << 24) | - ((u64)seq->ccmp.pn[1] << 32) | - ((u64)seq->ccmp.pn[0] << 40); - atomic64_set(&key->conf.tx_pn, pn64); - break; - default: - WARN_ON(1); - break; - } -} -EXPORT_SYMBOL_GPL(ieee80211_set_key_tx_seq); - void ieee80211_set_key_rx_seq(struct ieee80211_key_conf *keyconf, int tid, struct ieee80211_key_seq *seq) { diff --git a/net/mac80211/key.h b/net/mac80211/key.h index 9951ef063..4aa20cef0 100644 --- a/net/mac80211/key.h +++ b/net/mac80211/key.h @@ -44,13 +44,17 @@ enum ieee80211_internal_tkip_state { }; struct tkip_ctx { - u32 iv32; /* current iv32 */ - u16 iv16; /* current iv16 */ u16 p1k[5]; /* p1k cache */ u32 p1k_iv32; /* iv32 for which p1k computed */ enum ieee80211_internal_tkip_state state; }; +struct tkip_ctx_rx { + struct tkip_ctx ctx; + u32 iv32; /* current iv32 */ + u16 iv16; /* current iv16 */ +}; + struct ieee80211_key { struct ieee80211_local *local; struct ieee80211_sub_if_data *sdata; @@ -71,7 +75,7 @@ struct ieee80211_key { struct tkip_ctx tx; /* last received RSC */ - struct tkip_ctx rx[IEEE80211_NUM_TIDS]; + struct tkip_ctx_rx rx[IEEE80211_NUM_TIDS]; /* number of mic failures */ u32 mic_failures; diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c index 6f85b6ab8..d32cefcb6 100644 --- a/net/mac80211/mesh.c +++ b/net/mac80211/mesh.c @@ -91,11 +91,10 @@ bool mesh_matches_local(struct ieee80211_sub_if_data *sdata, if (sdata->vif.bss_conf.basic_rates != basic_rates) return false; - ieee80211_ht_oper_to_chandef(sdata->vif.bss_conf.chandef.chan, - ie->ht_operation, &sta_chan_def); - - ieee80211_vht_oper_to_chandef(sdata->vif.bss_conf.chandef.chan, - ie->vht_operation, &sta_chan_def); + cfg80211_chandef_create(&sta_chan_def, sdata->vif.bss_conf.chandef.chan, + NL80211_CHAN_NO_HT); + ieee80211_chandef_ht_oper(ie->ht_operation, &sta_chan_def); + ieee80211_chandef_vht_oper(ie->vht_operation, &sta_chan_def); if (!cfg80211_chandef_compatible(&sdata->vif.bss_conf.chandef, &sta_chan_def)) diff --git a/net/mac80211/mesh.h b/net/mac80211/mesh.h index 4a8019f79..87c017a3b 100644 --- a/net/mac80211/mesh.h +++ b/net/mac80211/mesh.h @@ -137,8 +137,6 @@ struct mesh_path { * @copy_node: function to copy nodes of the table * @size_order: determines size of the table, there will be 2^size_order hash * buckets - * @mean_chain_len: maximum average length for the hash buckets' list, if it is - * reached, the table will grow * @known_gates: list of known mesh gates and their mpaths by the station. The * gate's mpath may or may not be resolved and active. * @@ -154,7 +152,6 @@ struct mesh_table { void (*free_node) (struct hlist_node *p, bool free_leafs); int (*copy_node) (struct hlist_node *p, struct mesh_table *newtbl); int size_order; - int mean_chain_len; struct hlist_head *known_gates; spinlock_t gates_lock; diff --git a/net/mac80211/mesh_hwmp.c b/net/mac80211/mesh_hwmp.c index c6be0b4f4..002244bca 100644 --- a/net/mac80211/mesh_hwmp.c +++ b/net/mac80211/mesh_hwmp.c @@ -205,9 +205,9 @@ static void prepare_frame_for_deferred_tx(struct ieee80211_sub_if_data *sdata, struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; - skb_set_mac_header(skb, 0); - skb_set_network_header(skb, 0); - skb_set_transport_header(skb, 0); + skb_reset_mac_header(skb); + skb_reset_network_header(skb); + skb_reset_transport_header(skb); /* Send all internal mgmt frames on VO. Accordingly set TID to 7. */ skb_set_queue_mapping(skb, IEEE80211_AC_VO); @@ -530,7 +530,7 @@ static void hwmp_preq_frame_process(struct ieee80211_sub_if_data *sdata, const u8 *target_addr, *orig_addr; const u8 *da; u8 target_flags, ttl, flags; - u32 orig_sn, target_sn, lifetime, target_metric; + u32 orig_sn, target_sn, lifetime, target_metric = 0; bool reply = false; bool forward = true; bool root_is_gate; diff --git a/net/mac80211/mesh_pathtbl.c b/net/mac80211/mesh_pathtbl.c index dadf8dc6f..2ba7aa56b 100644 --- a/net/mac80211/mesh_pathtbl.c +++ b/net/mac80211/mesh_pathtbl.c @@ -55,16 +55,21 @@ int mpp_paths_generation; static DEFINE_RWLOCK(pathtbl_resize_lock); +static inline struct mesh_table *resize_dereference_paths( + struct mesh_table __rcu *table) +{ + return rcu_dereference_protected(table, + lockdep_is_held(&pathtbl_resize_lock)); +} + static inline struct mesh_table *resize_dereference_mesh_paths(void) { - return rcu_dereference_protected(mesh_paths, - lockdep_is_held(&pathtbl_resize_lock)); + return resize_dereference_paths(mesh_paths); } static inline struct mesh_table *resize_dereference_mpp_paths(void) { - return rcu_dereference_protected(mpp_paths, - lockdep_is_held(&pathtbl_resize_lock)); + return resize_dereference_paths(mpp_paths); } /* @@ -160,11 +165,10 @@ static int mesh_table_grow(struct mesh_table *oldtbl, int i; if (atomic_read(&oldtbl->entries) - < oldtbl->mean_chain_len * (oldtbl->hash_mask + 1)) + < MEAN_CHAIN_LEN * (oldtbl->hash_mask + 1)) return -EAGAIN; newtbl->free_node = oldtbl->free_node; - newtbl->mean_chain_len = oldtbl->mean_chain_len; newtbl->copy_node = oldtbl->copy_node; newtbl->known_gates = oldtbl->known_gates; atomic_set(&newtbl->entries, atomic_read(&oldtbl->entries)); @@ -585,7 +589,7 @@ struct mesh_path *mesh_path_add(struct ieee80211_sub_if_data *sdata, hlist_add_head_rcu(&new_node->list, bucket); if (atomic_inc_return(&tbl->entries) >= - tbl->mean_chain_len * (tbl->hash_mask + 1)) + MEAN_CHAIN_LEN * (tbl->hash_mask + 1)) grow = 1; mesh_paths_generation++; @@ -714,7 +718,7 @@ int mpp_path_add(struct ieee80211_sub_if_data *sdata, hlist_add_head_rcu(&new_node->list, bucket); if (atomic_inc_return(&tbl->entries) >= - tbl->mean_chain_len * (tbl->hash_mask + 1)) + MEAN_CHAIN_LEN * (tbl->hash_mask + 1)) grow = 1; spin_unlock(&tbl->hashwlock[hash_idx]); @@ -835,6 +839,29 @@ void mesh_path_flush_by_nexthop(struct sta_info *sta) rcu_read_unlock(); } +static void mpp_flush_by_proxy(struct ieee80211_sub_if_data *sdata, + const u8 *proxy) +{ + struct mesh_table *tbl; + struct mesh_path *mpp; + struct mpath_node *node; + int i; + + rcu_read_lock(); + read_lock_bh(&pathtbl_resize_lock); + tbl = resize_dereference_mpp_paths(); + for_each_mesh_entry(tbl, node, i) { + mpp = node->mpath; + if (ether_addr_equal(mpp->mpp, proxy)) { + spin_lock(&tbl->hashwlock[i]); + __mesh_path_del(tbl, node); + spin_unlock(&tbl->hashwlock[i]); + } + } + read_unlock_bh(&pathtbl_resize_lock); + rcu_read_unlock(); +} + static void table_flush_by_iface(struct mesh_table *tbl, struct ieee80211_sub_if_data *sdata) { @@ -876,14 +903,17 @@ void mesh_path_flush_by_iface(struct ieee80211_sub_if_data *sdata) } /** - * mesh_path_del - delete a mesh path from the table + * table_path_del - delete a path from the mesh or mpp table * - * @addr: dst address (ETH_ALEN length) + * @tbl: mesh or mpp path table * @sdata: local subif + * @addr: dst address (ETH_ALEN length) * * Returns: 0 if successful */ -int mesh_path_del(struct ieee80211_sub_if_data *sdata, const u8 *addr) +static int table_path_del(struct mesh_table __rcu *rcu_tbl, + struct ieee80211_sub_if_data *sdata, + const u8 *addr) { struct mesh_table *tbl; struct mesh_path *mpath; @@ -892,8 +922,7 @@ int mesh_path_del(struct ieee80211_sub_if_data *sdata, const u8 *addr) int hash_idx; int err = 0; - read_lock_bh(&pathtbl_resize_lock); - tbl = resize_dereference_mesh_paths(); + tbl = resize_dereference_paths(rcu_tbl); hash_idx = mesh_table_hash(addr, sdata, tbl); bucket = &tbl->hash_buckets[hash_idx]; @@ -909,9 +938,50 @@ int mesh_path_del(struct ieee80211_sub_if_data *sdata, const u8 *addr) err = -ENXIO; enddel: - mesh_paths_generation++; spin_unlock(&tbl->hashwlock[hash_idx]); + return err; +} + +/** + * mesh_path_del - delete a mesh path from the table + * + * @addr: dst address (ETH_ALEN length) + * @sdata: local subif + * + * Returns: 0 if successful + */ +int mesh_path_del(struct ieee80211_sub_if_data *sdata, const u8 *addr) +{ + int err = 0; + + /* flush relevant mpp entries first */ + mpp_flush_by_proxy(sdata, addr); + + read_lock_bh(&pathtbl_resize_lock); + err = table_path_del(mesh_paths, sdata, addr); + mesh_paths_generation++; read_unlock_bh(&pathtbl_resize_lock); + + return err; +} + +/** + * mpp_path_del - delete a mesh proxy path from the table + * + * @addr: addr address (ETH_ALEN length) + * @sdata: local subif + * + * Returns: 0 if successful + */ +static int mpp_path_del(struct ieee80211_sub_if_data *sdata, const u8 *addr) +{ + int err = 0; + + read_lock_bh(&pathtbl_resize_lock); + err = table_path_del(mpp_paths, sdata, addr); + mpp_paths_generation++; + read_unlock_bh(&pathtbl_resize_lock); + return err; } @@ -1076,7 +1146,6 @@ int mesh_pathtbl_init(void) return -ENOMEM; tbl_path->free_node = &mesh_path_node_free; tbl_path->copy_node = &mesh_path_node_copy; - tbl_path->mean_chain_len = MEAN_CHAIN_LEN; tbl_path->known_gates = kzalloc(sizeof(struct hlist_head), GFP_ATOMIC); if (!tbl_path->known_gates) { ret = -ENOMEM; @@ -1092,7 +1161,6 @@ int mesh_pathtbl_init(void) } tbl_mpp->free_node = &mesh_path_node_free; tbl_mpp->copy_node = &mesh_path_node_copy; - tbl_mpp->mean_chain_len = MEAN_CHAIN_LEN; tbl_mpp->known_gates = kzalloc(sizeof(struct hlist_head), GFP_ATOMIC); if (!tbl_mpp->known_gates) { ret = -ENOMEM; @@ -1131,6 +1199,17 @@ void mesh_path_expire(struct ieee80211_sub_if_data *sdata) time_after(jiffies, mpath->exp_time + MESH_PATH_EXPIRE)) mesh_path_del(mpath->sdata, mpath->dst); } + + tbl = rcu_dereference(mpp_paths); + for_each_mesh_entry(tbl, node, i) { + if (node->mpath->sdata != sdata) + continue; + mpath = node->mpath; + if ((!(mpath->flags & MESH_PATH_FIXED)) && + time_after(jiffies, mpath->exp_time + MESH_PATH_EXPIRE)) + mpp_path_del(mpath->sdata, mpath->dst); + } + rcu_read_unlock(); } diff --git a/net/mac80211/mesh_plink.c b/net/mac80211/mesh_plink.c index bd3d55eb2..a07e93c21 100644 --- a/net/mac80211/mesh_plink.c +++ b/net/mac80211/mesh_plink.c @@ -976,6 +976,10 @@ mesh_plink_get_event(struct ieee80211_sub_if_data *sdata, mpl_dbg(sdata, "Mesh plink error: no more free plinks\n"); goto out; } + + /* new matching peer */ + event = OPN_ACPT; + goto out; } else { if (!test_sta_flag(sta, WLAN_STA_AUTH)) { mpl_dbg(sdata, "Mesh plink: Action frame from non-authed peer\n"); @@ -985,12 +989,6 @@ mesh_plink_get_event(struct ieee80211_sub_if_data *sdata, goto out; } - /* new matching peer */ - if (!sta) { - event = OPN_ACPT; - goto out; - } - switch (ftype) { case WLAN_SP_MESH_PEERING_OPEN: if (!matches_local) diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index bfbb1acaf..281b8d6e5 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -6,7 +6,7 @@ * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> * Copyright 2007, Michael Wu <flamingice@sourmilk.net> * Copyright 2013-2014 Intel Mobile Communications GmbH - * Copyright (C) 2015 Intel Deutschland GmbH + * Copyright (C) 2015 - 2016 Intel Deutschland GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -196,16 +196,7 @@ ieee80211_determine_chantype(struct ieee80211_sub_if_data *sdata, /* check 40 MHz support, if we have it */ if (sta_ht_cap.cap & IEEE80211_HT_CAP_SUP_WIDTH_20_40) { - switch (ht_oper->ht_param & IEEE80211_HT_PARAM_CHA_SEC_OFFSET) { - case IEEE80211_HT_PARAM_CHA_SEC_ABOVE: - chandef->width = NL80211_CHAN_WIDTH_40; - chandef->center_freq1 += 10; - break; - case IEEE80211_HT_PARAM_CHA_SEC_BELOW: - chandef->width = NL80211_CHAN_WIDTH_40; - chandef->center_freq1 -= 10; - break; - } + ieee80211_chandef_ht_oper(ht_oper, chandef); } else { /* 40 MHz (and 80 MHz) must be supported for VHT */ ret = IEEE80211_STA_DISABLE_VHT; @@ -219,35 +210,11 @@ ieee80211_determine_chantype(struct ieee80211_sub_if_data *sdata, goto out; } - vht_chandef.chan = channel; - vht_chandef.center_freq1 = - ieee80211_channel_to_frequency(vht_oper->center_freq_seg1_idx, - channel->band); - vht_chandef.center_freq2 = 0; - - switch (vht_oper->chan_width) { - case IEEE80211_VHT_CHANWIDTH_USE_HT: - vht_chandef.width = chandef->width; - vht_chandef.center_freq1 = chandef->center_freq1; - break; - case IEEE80211_VHT_CHANWIDTH_80MHZ: - vht_chandef.width = NL80211_CHAN_WIDTH_80; - break; - case IEEE80211_VHT_CHANWIDTH_160MHZ: - vht_chandef.width = NL80211_CHAN_WIDTH_160; - break; - case IEEE80211_VHT_CHANWIDTH_80P80MHZ: - vht_chandef.width = NL80211_CHAN_WIDTH_80P80; - vht_chandef.center_freq2 = - ieee80211_channel_to_frequency( - vht_oper->center_freq_seg2_idx, - channel->band); - break; - default: + vht_chandef = *chandef; + if (!ieee80211_chandef_vht_oper(vht_oper, &vht_chandef)) { if (!(ifmgd->flags & IEEE80211_STA_DISABLE_VHT)) sdata_info(sdata, - "AP VHT operation IE has invalid channel width (%d), disable VHT\n", - vht_oper->chan_width); + "AP VHT information is invalid, disable VHT\n"); ret = IEEE80211_STA_DISABLE_VHT; goto out; } @@ -592,7 +559,7 @@ static void ieee80211_add_vht_ie(struct ieee80211_sub_if_data *sdata, struct ieee80211_sub_if_data *other; list_for_each_entry_rcu(other, &local->interfaces, list) { - if (other->flags & IEEE80211_SDATA_MU_MIMO_OWNER) { + if (other->vif.mu_mimo_owner) { disable_mu_mimo = true; break; } @@ -600,7 +567,7 @@ static void ieee80211_add_vht_ie(struct ieee80211_sub_if_data *sdata, if (disable_mu_mimo) cap &= ~IEEE80211_VHT_CAP_MU_BEAMFORMEE_CAPABLE; else - sdata->flags |= IEEE80211_SDATA_MU_MIMO_OWNER; + sdata->vif.mu_mimo_owner = true; } mask = IEEE80211_VHT_CAP_BEAMFORMEE_STS_MASK; @@ -1638,8 +1605,7 @@ void ieee80211_dynamic_ps_timer(unsigned long data) void ieee80211_dfs_cac_timer_work(struct work_struct *work) { - struct delayed_work *delayed_work = - container_of(work, struct delayed_work, work); + struct delayed_work *delayed_work = to_delayed_work(work); struct ieee80211_sub_if_data *sdata = container_of(delayed_work, struct ieee80211_sub_if_data, dfs_cac_timer_work); @@ -2079,7 +2045,14 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata, memset(&ifmgd->ht_capa_mask, 0, sizeof(ifmgd->ht_capa_mask)); memset(&ifmgd->vht_capa, 0, sizeof(ifmgd->vht_capa)); memset(&ifmgd->vht_capa_mask, 0, sizeof(ifmgd->vht_capa_mask)); - sdata->flags &= ~IEEE80211_SDATA_MU_MIMO_OWNER; + + /* reset MU-MIMO ownership and group data */ + memset(sdata->vif.bss_conf.mu_group.membership, 0, + sizeof(sdata->vif.bss_conf.mu_group.membership)); + memset(sdata->vif.bss_conf.mu_group.position, 0, + sizeof(sdata->vif.bss_conf.mu_group.position)); + changed |= BSS_CHANGED_MU_GROUPS; + sdata->vif.mu_mimo_owner = false; sdata->ap_power_level = IEEE80211_UNSET_POWER_LEVEL; @@ -2536,7 +2509,8 @@ static void ieee80211_destroy_assoc_data(struct ieee80211_sub_if_data *sdata, eth_zero_addr(sdata->u.mgd.bssid); ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_BSSID); sdata->u.mgd.flags = 0; - sdata->flags &= ~IEEE80211_SDATA_MU_MIMO_OWNER; + sdata->vif.mu_mimo_owner = false; + mutex_lock(&sdata->local->mtx); ieee80211_vif_release_channel(sdata); mutex_unlock(&sdata->local->mtx); @@ -3571,6 +3545,9 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata, elems.ht_cap_elem, elems.ht_operation, elems.vht_operation, bssid, &changed)) { mutex_unlock(&local->sta_mtx); + sdata_info(sdata, + "failed to follow AP %pM bandwidth change, disconnect\n", + bssid); ieee80211_set_disassoc(sdata, IEEE80211_STYPE_DEAUTH, WLAN_REASON_DEAUTH_LEAVING, true, deauth_buf); @@ -3946,11 +3923,9 @@ void ieee80211_sta_work(struct ieee80211_sub_if_data *sdata) * We actually lost the connection ... or did we? * Let's make sure! */ - wiphy_debug(local->hw.wiphy, - "%s: No probe response from AP %pM" - " after %dms, disconnecting.\n", - sdata->name, - bssid, probe_wait_ms); + mlme_dbg(sdata, + "No probe response from AP %pM after %dms, disconnecting.\n", + bssid, probe_wait_ms); ieee80211_sta_connection_lost(sdata, bssid, WLAN_REASON_DISASSOC_DUE_TO_INACTIVITY, false); @@ -4536,6 +4511,9 @@ int ieee80211_mgd_auth(struct ieee80211_sub_if_data *sdata, if (ifmgd->associated) { u8 frame_buf[IEEE80211_DEAUTH_FRAME_LEN]; + sdata_info(sdata, + "disconnect from AP %pM for new auth to %pM\n", + ifmgd->associated->bssid, req->bss->bssid); ieee80211_set_disassoc(sdata, IEEE80211_STYPE_DEAUTH, WLAN_REASON_UNSPECIFIED, false, frame_buf); @@ -4604,6 +4582,9 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata, if (ifmgd->associated) { u8 frame_buf[IEEE80211_DEAUTH_FRAME_LEN]; + sdata_info(sdata, + "disconnect from AP %pM for new assoc to %pM\n", + ifmgd->associated->bssid, req->bss->bssid); ieee80211_set_disassoc(sdata, IEEE80211_STYPE_DEAUTH, WLAN_REASON_UNSPECIFIED, false, frame_buf); diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 261df74ea..dc27becb9 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -4,6 +4,7 @@ * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> * Copyright 2007-2010 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH + * Copyright(c) 2015 - 2016 Intel Deutschland GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -18,6 +19,7 @@ #include <linux/etherdevice.h> #include <linux/rcupdate.h> #include <linux/export.h> +#include <linux/bitops.h> #include <net/mac80211.h> #include <net/ieee80211_radiotap.h> #include <asm/unaligned.h> @@ -122,7 +124,8 @@ static inline bool should_drop_frame(struct sk_buff *skb, int present_fcs_len, hdr = (void *)(skb->data + rtap_vendor_space); if (status->flag & (RX_FLAG_FAILED_FCS_CRC | - RX_FLAG_FAILED_PLCP_CRC)) + RX_FLAG_FAILED_PLCP_CRC | + RX_FLAG_ONLY_MONITOR)) return true; if (unlikely(skb->len < 16 + present_fcs_len + rtap_vendor_space)) @@ -507,7 +510,7 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb, return NULL; } - if (!local->monitors) { + if (!local->monitors || (status->flag & RX_FLAG_SKIP_MONITOR)) { if (should_drop_frame(origskb, present_fcs_len, rtap_vendor_space)) { dev_kfree_skb(origskb); @@ -797,6 +800,26 @@ static ieee80211_rx_result ieee80211_rx_mesh_check(struct ieee80211_rx_data *rx) return RX_CONTINUE; } +static inline bool ieee80211_rx_reorder_ready(struct tid_ampdu_rx *tid_agg_rx, + int index) +{ + struct sk_buff_head *frames = &tid_agg_rx->reorder_buf[index]; + struct sk_buff *tail = skb_peek_tail(frames); + struct ieee80211_rx_status *status; + + if (tid_agg_rx->reorder_buf_filtered & BIT_ULL(index)) + return true; + + if (!tail) + return false; + + status = IEEE80211_SKB_RXCB(tail); + if (status->flag & RX_FLAG_AMSDU_MORE) + return false; + + return true; +} + static void ieee80211_release_reorder_frame(struct ieee80211_sub_if_data *sdata, struct tid_ampdu_rx *tid_agg_rx, int index, @@ -811,7 +834,7 @@ static void ieee80211_release_reorder_frame(struct ieee80211_sub_if_data *sdata, if (skb_queue_empty(skb_list)) goto no_frame; - if (!ieee80211_rx_reorder_ready(skb_list)) { + if (!ieee80211_rx_reorder_ready(tid_agg_rx, index)) { __skb_queue_purge(skb_list); goto no_frame; } @@ -825,6 +848,7 @@ static void ieee80211_release_reorder_frame(struct ieee80211_sub_if_data *sdata, } no_frame: + tid_agg_rx->reorder_buf_filtered &= ~BIT_ULL(index); tid_agg_rx->head_seq_num = ieee80211_sn_inc(tid_agg_rx->head_seq_num); } @@ -865,7 +889,7 @@ static void ieee80211_sta_reorder_release(struct ieee80211_sub_if_data *sdata, /* release the buffer until next missing frame */ index = tid_agg_rx->head_seq_num % tid_agg_rx->buf_size; - if (!ieee80211_rx_reorder_ready(&tid_agg_rx->reorder_buf[index]) && + if (!ieee80211_rx_reorder_ready(tid_agg_rx, index) && tid_agg_rx->stored_mpdu_num) { /* * No buffers ready to be released, but check whether any @@ -874,8 +898,7 @@ static void ieee80211_sta_reorder_release(struct ieee80211_sub_if_data *sdata, int skipped = 1; for (j = (index + 1) % tid_agg_rx->buf_size; j != index; j = (j + 1) % tid_agg_rx->buf_size) { - if (!ieee80211_rx_reorder_ready( - &tid_agg_rx->reorder_buf[j])) { + if (!ieee80211_rx_reorder_ready(tid_agg_rx, j)) { skipped++; continue; } @@ -902,8 +925,7 @@ static void ieee80211_sta_reorder_release(struct ieee80211_sub_if_data *sdata, skipped) & IEEE80211_SN_MASK; skipped = 0; } - } else while (ieee80211_rx_reorder_ready( - &tid_agg_rx->reorder_buf[index])) { + } else while (ieee80211_rx_reorder_ready(tid_agg_rx, index)) { ieee80211_release_reorder_frame(sdata, tid_agg_rx, index, frames); index = tid_agg_rx->head_seq_num % tid_agg_rx->buf_size; @@ -914,8 +936,7 @@ static void ieee80211_sta_reorder_release(struct ieee80211_sub_if_data *sdata, for (; j != (index - 1) % tid_agg_rx->buf_size; j = (j + 1) % tid_agg_rx->buf_size) { - if (ieee80211_rx_reorder_ready( - &tid_agg_rx->reorder_buf[j])) + if (ieee80211_rx_reorder_ready(tid_agg_rx, j)) break; } @@ -986,7 +1007,7 @@ static bool ieee80211_sta_manage_reorder_buf(struct ieee80211_sub_if_data *sdata index = mpdu_seq_num % tid_agg_rx->buf_size; /* check if we already stored this frame */ - if (ieee80211_rx_reorder_ready(&tid_agg_rx->reorder_buf[index])) { + if (ieee80211_rx_reorder_ready(tid_agg_rx, index)) { dev_kfree_skb(skb); goto out; } @@ -1099,6 +1120,9 @@ ieee80211_rx_h_check_dup(struct ieee80211_rx_data *rx) struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)rx->skb->data; struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb); + if (status->flag & RX_FLAG_DUP_VALIDATED) + return RX_CONTINUE; + /* * Drop duplicate 802.11 retransmissions * (IEEE 802.11-2012: 9.3.2.10 "Duplicate detection and recovery") @@ -2217,9 +2241,6 @@ ieee80211_rx_h_amsdu(struct ieee80211_rx_data *rx) skb->dev = dev; __skb_queue_head_init(&frame_list); - if (skb_linearize(skb)) - return RX_DROP_UNUSABLE; - ieee80211_amsdu_to_8023s(skb, &frame_list, dev->dev_addr, rx->sdata->vif.type, rx->local->hw.extra_tx_headroom, true); @@ -2308,6 +2329,7 @@ ieee80211_rx_h_mesh_fwding(struct ieee80211_rx_data *rx) spin_lock_bh(&mppath->state_lock); if (!ether_addr_equal(mppath->mpp, mpp_addr)) memcpy(mppath->mpp, mpp_addr, ETH_ALEN); + mppath->exp_time = jiffies; spin_unlock_bh(&mppath->state_lock); } rcu_read_unlock(); @@ -2757,6 +2779,11 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx) opmode, status->band); goto handled; } + case WLAN_VHT_ACTION_GROUPID_MGMT: { + if (len < IEEE80211_MIN_ACTION_SIZE + 25) + goto invalid; + goto queue; + } default: break; } @@ -3092,7 +3119,7 @@ static void ieee80211_rx_cooked_monitor(struct ieee80211_rx_data *rx, ieee80211_add_rx_radiotap_header(local, skb, rate, needed_headroom, false); - skb_set_mac_header(skb, 0); + skb_reset_mac_header(skb); skb->ip_summed = CHECKSUM_UNNECESSARY; skb->pkt_type = PACKET_OTHERHOST; skb->protocol = htons(ETH_P_802_2); @@ -3294,6 +3321,85 @@ void ieee80211_release_reorder_timeout(struct sta_info *sta, int tid) ieee80211_rx_handlers(&rx, &frames); } +void ieee80211_mark_rx_ba_filtered_frames(struct ieee80211_sta *pubsta, u8 tid, + u16 ssn, u64 filtered, + u16 received_mpdus) +{ + struct sta_info *sta; + struct tid_ampdu_rx *tid_agg_rx; + struct sk_buff_head frames; + struct ieee80211_rx_data rx = { + /* This is OK -- must be QoS data frame */ + .security_idx = tid, + .seqno_idx = tid, + }; + int i, diff; + + if (WARN_ON(!pubsta || tid >= IEEE80211_NUM_TIDS)) + return; + + __skb_queue_head_init(&frames); + + sta = container_of(pubsta, struct sta_info, sta); + + rx.sta = sta; + rx.sdata = sta->sdata; + rx.local = sta->local; + + rcu_read_lock(); + tid_agg_rx = rcu_dereference(sta->ampdu_mlme.tid_rx[tid]); + if (!tid_agg_rx) + goto out; + + spin_lock_bh(&tid_agg_rx->reorder_lock); + + if (received_mpdus >= IEEE80211_SN_MODULO >> 1) { + int release; + + /* release all frames in the reorder buffer */ + release = (tid_agg_rx->head_seq_num + tid_agg_rx->buf_size) % + IEEE80211_SN_MODULO; + ieee80211_release_reorder_frames(sta->sdata, tid_agg_rx, + release, &frames); + /* update ssn to match received ssn */ + tid_agg_rx->head_seq_num = ssn; + } else { + ieee80211_release_reorder_frames(sta->sdata, tid_agg_rx, ssn, + &frames); + } + + /* handle the case that received ssn is behind the mac ssn. + * it can be tid_agg_rx->buf_size behind and still be valid */ + diff = (tid_agg_rx->head_seq_num - ssn) & IEEE80211_SN_MASK; + if (diff >= tid_agg_rx->buf_size) { + tid_agg_rx->reorder_buf_filtered = 0; + goto release; + } + filtered = filtered >> diff; + ssn += diff; + + /* update bitmap */ + for (i = 0; i < tid_agg_rx->buf_size; i++) { + int index = (ssn + i) % tid_agg_rx->buf_size; + + tid_agg_rx->reorder_buf_filtered &= ~BIT_ULL(index); + if (filtered & BIT_ULL(i)) + tid_agg_rx->reorder_buf_filtered |= BIT_ULL(index); + } + + /* now process also frames that the filter marking released */ + ieee80211_sta_reorder_release(sta->sdata, tid_agg_rx, &frames); + +release: + spin_unlock_bh(&tid_agg_rx->reorder_lock); + + ieee80211_rx_handlers(&rx, &frames); + + out: + rcu_read_unlock(); +} +EXPORT_SYMBOL(ieee80211_mark_rx_ba_filtered_frames); + /* main receive path */ static bool ieee80211_accept_frame(struct ieee80211_rx_data *rx) diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index 23ed038cf..861b93ffb 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -67,6 +67,7 @@ static const struct rhashtable_params sta_rht_params = { .nelem_hint = 3, /* start small */ + .insecure_elasticity = true, /* Disable chain-length checks. */ .automatic_shrinking = true, .head_offset = offsetof(struct sta_info, hash_node), .key_offset = offsetof(struct sta_info, addr), @@ -116,6 +117,7 @@ static void __cleanup_single_sta(struct sta_info *sta) ieee80211_purge_tx_queue(&local->hw, &txqi->queue); atomic_sub(n, &sdata->txqs_len[txqi->txq.ac]); + txqi->byte_cnt = 0; } } @@ -538,7 +540,6 @@ static int sta_info_insert_finish(struct sta_info *sta) __acquires(RCU) /* accept BA sessions now */ clear_sta_flag(sta, WLAN_STA_BLOCK_BA); - ieee80211_recalc_min_chandef(sdata); ieee80211_sta_debugfs_add(sta); rate_control_add_sta_debugfs(sta); @@ -565,6 +566,7 @@ static int sta_info_insert_finish(struct sta_info *sta) __acquires(RCU) __cleanup_single_sta(sta); out_err: mutex_unlock(&local->sta_mtx); + kfree(sinfo); rcu_read_lock(); return err; } @@ -952,7 +954,6 @@ static void __sta_info_destroy_part2(struct sta_info *sta) rate_control_remove_sta_debugfs(sta); ieee80211_sta_debugfs_remove(sta); - ieee80211_recalc_min_chandef(sdata); cleanup_single_sta(sta); } @@ -1819,14 +1820,17 @@ int sta_info_move_state(struct sta_info *sta, clear_bit(WLAN_STA_AUTH, &sta->_flags); break; case IEEE80211_STA_AUTH: - if (sta->sta_state == IEEE80211_STA_NONE) + if (sta->sta_state == IEEE80211_STA_NONE) { set_bit(WLAN_STA_AUTH, &sta->_flags); - else if (sta->sta_state == IEEE80211_STA_ASSOC) + } else if (sta->sta_state == IEEE80211_STA_ASSOC) { clear_bit(WLAN_STA_ASSOC, &sta->_flags); + ieee80211_recalc_min_chandef(sta->sdata); + } break; case IEEE80211_STA_ASSOC: if (sta->sta_state == IEEE80211_STA_AUTH) { set_bit(WLAN_STA_ASSOC, &sta->_flags); + ieee80211_recalc_min_chandef(sta->sdata); } else if (sta->sta_state == IEEE80211_STA_AUTHORIZED) { if (sta->sdata->vif.type == NL80211_IFTYPE_AP || (sta->sdata->vif.type == NL80211_IFTYPE_AP_VLAN && diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h index d6051629e..62193f4bc 100644 --- a/net/mac80211/sta_info.h +++ b/net/mac80211/sta_info.h @@ -1,6 +1,7 @@ /* * Copyright 2002-2005, Devicescape Software, Inc. * Copyright 2013-2014 Intel Mobile Communications GmbH + * Copyright(c) 2015 Intel Deutschland GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -167,6 +168,8 @@ struct tid_ampdu_tx { * * @reorder_buf: buffer to reorder incoming aggregated MPDUs. An MPDU may be an * A-MSDU with individually reported subframes. + * @reorder_buf_filtered: bitmap indicating where there are filtered frames in + * the reorder buffer that should be ignored when releasing frames * @reorder_time: jiffies when skb was added * @session_timer: check if peer keeps Tx-ing on the TID (by timeout value) * @reorder_timer: releases expired frames from the reorder buffer. @@ -194,6 +197,7 @@ struct tid_ampdu_tx { struct tid_ampdu_rx { struct rcu_head rcu_head; spinlock_t reorder_lock; + u64 reorder_buf_filtered; struct sk_buff_head *reorder_buf; unsigned long *reorder_time; struct timer_list session_timer; @@ -212,20 +216,21 @@ struct tid_ampdu_rx { /** * struct sta_ampdu_mlme - STA aggregation information. * + * @mtx: mutex to protect all TX data (except non-NULL assignments + * to tid_tx[idx], which are protected by the sta spinlock) + * tid_start_tx is also protected by sta->lock. * @tid_rx: aggregation info for Rx per TID -- RCU protected - * @tid_tx: aggregation info for Tx per TID - * @tid_start_tx: sessions where start was requested - * @addba_req_num: number of times addBA request has been sent. - * @last_addba_req_time: timestamp of the last addBA request. - * @dialog_token_allocator: dialog token enumerator for each new session; - * @work: work struct for starting/stopping aggregation * @tid_rx_timer_expired: bitmap indicating on which TIDs the * RX timer expired until the work for it runs * @tid_rx_stop_requested: bitmap indicating which BA sessions per TID the * driver requested to close until the work for it runs - * @mtx: mutex to protect all TX data (except non-NULL assignments - * to tid_tx[idx], which are protected by the sta spinlock) - * tid_start_tx is also protected by sta->lock. + * @agg_session_valid: bitmap indicating which TID has a rx BA session open on + * @work: work struct for starting/stopping aggregation + * @tid_tx: aggregation info for Tx per TID + * @tid_start_tx: sessions where start was requested + * @last_addba_req_time: timestamp of the last addBA request. + * @addba_req_num: number of times addBA request has been sent. + * @dialog_token_allocator: dialog token enumerator for each new session; */ struct sta_ampdu_mlme { struct mutex mtx; @@ -233,6 +238,7 @@ struct sta_ampdu_mlme { struct tid_ampdu_rx __rcu *tid_rx[IEEE80211_NUM_TIDS]; unsigned long tid_rx_timer_expired[BITS_TO_LONGS(IEEE80211_NUM_TIDS)]; unsigned long tid_rx_stop_requested[BITS_TO_LONGS(IEEE80211_NUM_TIDS)]; + unsigned long agg_session_valid[BITS_TO_LONGS(IEEE80211_NUM_TIDS)]; /* tx */ struct work_struct work; struct tid_ampdu_tx __rcu *tid_tx[IEEE80211_NUM_TIDS]; @@ -371,7 +377,6 @@ DECLARE_EWMA(signal, 1024, 8) * @uploaded: set to true when sta is uploaded to the driver * @sta: station information we share with the driver * @sta_state: duplicates information about station state (for debug) - * @beacon_loss_count: number of times beacon loss has triggered * @rcu_head: RCU head used for freeing this station struct * @cur_max_bandwidth: maximum bandwidth to use for TX to the station, * taken from HT/VHT capabilities or VHT operating mode notification diff --git a/net/mac80211/status.c b/net/mac80211/status.c index 6101deb80..8b1b2ea03 100644 --- a/net/mac80211/status.c +++ b/net/mac80211/status.c @@ -697,7 +697,7 @@ void ieee80211_tx_monitor(struct ieee80211_local *local, struct sk_buff *skb, rtap_len, shift); /* XXX: is this sufficient for BPF? */ - skb_set_mac_header(skb, 0); + skb_reset_mac_header(skb); skb->ip_summed = CHECKSUM_UNNECESSARY; skb->pkt_type = PACKET_OTHERHOST; skb->protocol = htons(ETH_P_802_2); diff --git a/net/mac80211/tdls.c b/net/mac80211/tdls.c index c9eeb3f12..a29ea813b 100644 --- a/net/mac80211/tdls.c +++ b/net/mac80211/tdls.c @@ -4,7 +4,7 @@ * Copyright 2006-2010 Johannes Berg <johannes@sipsolutions.net> * Copyright 2014, Intel Corporation * Copyright 2014 Intel Mobile Communications GmbH - * Copyright 2015 Intel Deutschland GmbH + * Copyright 2015 - 2016 Intel Deutschland GmbH * * This file is GPLv2 as found in COPYING. */ @@ -15,6 +15,7 @@ #include <linux/rtnetlink.h> #include "ieee80211_i.h" #include "driver-ops.h" +#include "rate.h" /* give usermode some time for retries in setting up the TDLS session */ #define TDLS_PEER_SETUP_TIMEOUT (15 * HZ) @@ -302,7 +303,7 @@ ieee80211_tdls_chandef_vht_upgrade(struct ieee80211_sub_if_data *sdata, /* IEEE802.11ac-2013 Table E-4 */ u16 centers_80mhz[] = { 5210, 5290, 5530, 5610, 5690, 5775 }; struct cfg80211_chan_def uc = sta->tdls_chandef; - enum nl80211_chan_width max_width = ieee80211_get_sta_bw(&sta->sta); + enum nl80211_chan_width max_width = ieee80211_sta_cap_chan_bw(sta); int i; /* only support upgrading non-narrow channels up to 80Mhz */ @@ -313,7 +314,7 @@ ieee80211_tdls_chandef_vht_upgrade(struct ieee80211_sub_if_data *sdata, if (max_width > NL80211_CHAN_WIDTH_80) max_width = NL80211_CHAN_WIDTH_80; - if (uc.width == max_width) + if (uc.width >= max_width) return; /* * Channel usage constrains in the IEEE802.11ac-2013 specification only @@ -324,6 +325,7 @@ ieee80211_tdls_chandef_vht_upgrade(struct ieee80211_sub_if_data *sdata, for (i = 0; i < ARRAY_SIZE(centers_80mhz); i++) if (abs(uc.chan->center_freq - centers_80mhz[i]) <= 30) { uc.center_freq1 = centers_80mhz[i]; + uc.center_freq2 = 0; uc.width = NL80211_CHAN_WIDTH_80; break; } @@ -332,7 +334,7 @@ ieee80211_tdls_chandef_vht_upgrade(struct ieee80211_sub_if_data *sdata, return; /* proceed to downgrade the chandef until usable or the same */ - while (uc.width > max_width && + while (uc.width > max_width || !cfg80211_reg_can_beacon_relax(sdata->local->hw.wiphy, &uc, sdata->wdev.iftype)) ieee80211_chandef_downgrade(&uc); @@ -1242,18 +1244,44 @@ int ieee80211_tdls_mgmt(struct wiphy *wiphy, struct net_device *dev, return ret; } -static void iee80211_tdls_recalc_chanctx(struct ieee80211_sub_if_data *sdata) +static void iee80211_tdls_recalc_chanctx(struct ieee80211_sub_if_data *sdata, + struct sta_info *sta) { struct ieee80211_local *local = sdata->local; struct ieee80211_chanctx_conf *conf; struct ieee80211_chanctx *ctx; + enum nl80211_chan_width width; + struct ieee80211_supported_band *sband; mutex_lock(&local->chanctx_mtx); conf = rcu_dereference_protected(sdata->vif.chanctx_conf, lockdep_is_held(&local->chanctx_mtx)); if (conf) { + width = conf->def.width; + sband = local->hw.wiphy->bands[conf->def.chan->band]; ctx = container_of(conf, struct ieee80211_chanctx, conf); ieee80211_recalc_chanctx_chantype(local, ctx); + + /* if width changed and a peer is given, update its BW */ + if (width != conf->def.width && sta && + test_sta_flag(sta, WLAN_STA_TDLS_WIDER_BW)) { + enum ieee80211_sta_rx_bandwidth bw; + + bw = ieee80211_chan_width_to_rx_bw(conf->def.width); + bw = min(bw, ieee80211_sta_cap_rx_bw(sta)); + if (bw != sta->sta.bandwidth) { + sta->sta.bandwidth = bw; + rate_control_rate_update(local, sband, sta, + IEEE80211_RC_BW_CHANGED); + /* + * if a TDLS peer BW was updated, we need to + * recalc the chandef width again, to get the + * correct chanctx min_def + */ + ieee80211_recalc_chanctx_chantype(local, ctx); + } + } + } mutex_unlock(&local->chanctx_mtx); } @@ -1350,8 +1378,6 @@ int ieee80211_tdls_oper(struct wiphy *wiphy, struct net_device *dev, break; } - iee80211_tdls_recalc_chanctx(sdata); - mutex_lock(&local->sta_mtx); sta = sta_info_get(sdata, peer); if (!sta) { @@ -1360,6 +1386,7 @@ int ieee80211_tdls_oper(struct wiphy *wiphy, struct net_device *dev, break; } + iee80211_tdls_recalc_chanctx(sdata, sta); iee80211_tdls_recalc_ht_protection(sdata, sta); set_sta_flag(sta, WLAN_STA_TDLS_PEER_AUTH); @@ -1390,7 +1417,7 @@ int ieee80211_tdls_oper(struct wiphy *wiphy, struct net_device *dev, iee80211_tdls_recalc_ht_protection(sdata, NULL); mutex_unlock(&local->sta_mtx); - iee80211_tdls_recalc_chanctx(sdata); + iee80211_tdls_recalc_chanctx(sdata, NULL); break; default: ret = -ENOTSUPP; diff --git a/net/mac80211/tkip.c b/net/mac80211/tkip.c index 0ae207771..b3622823b 100644 --- a/net/mac80211/tkip.c +++ b/net/mac80211/tkip.c @@ -1,6 +1,7 @@ /* * Copyright 2002-2004, Instant802 Networks, Inc. * Copyright 2005, Devicescape Software, Inc. + * Copyright (C) 2016 Intel Deutschland GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -142,15 +143,14 @@ static void tkip_mixing_phase2(const u8 *tk, struct tkip_ctx *ctx, /* Add TKIP IV and Ext. IV at @pos. @iv0, @iv1, and @iv2 are the first octets * of the IV. Returns pointer to the octet following IVs (i.e., beginning of * the packet payload). */ -u8 *ieee80211_tkip_add_iv(u8 *pos, struct ieee80211_key *key) +u8 *ieee80211_tkip_add_iv(u8 *pos, struct ieee80211_key_conf *keyconf, u64 pn) { - lockdep_assert_held(&key->u.tkip.txlock); - - pos = write_tkip_iv(pos, key->u.tkip.tx.iv16); - *pos++ = (key->conf.keyidx << 6) | (1 << 5) /* Ext IV */; - put_unaligned_le32(key->u.tkip.tx.iv32, pos); + pos = write_tkip_iv(pos, TKIP_PN_TO_IV16(pn)); + *pos++ = (keyconf->keyidx << 6) | (1 << 5) /* Ext IV */; + put_unaligned_le32(TKIP_PN_TO_IV32(pn), pos); return pos + 4; } +EXPORT_SYMBOL_GPL(ieee80211_tkip_add_iv); static void ieee80211_compute_tkip_p1k(struct ieee80211_key *key, u32 iv32) { @@ -250,6 +250,7 @@ int ieee80211_tkip_decrypt_data(struct crypto_cipher *tfm, u8 rc4key[16], keyid, *pos = payload; int res; const u8 *tk = &key->conf.key[NL80211_TKIP_DATA_OFFSET_ENCR_KEY]; + struct tkip_ctx_rx *rx_ctx = &key->u.tkip.rx[queue]; if (payload_len < 12) return -1; @@ -265,37 +266,36 @@ int ieee80211_tkip_decrypt_data(struct crypto_cipher *tfm, if ((keyid >> 6) != key->conf.keyidx) return TKIP_DECRYPT_INVALID_KEYIDX; - if (key->u.tkip.rx[queue].state != TKIP_STATE_NOT_INIT && - (iv32 < key->u.tkip.rx[queue].iv32 || - (iv32 == key->u.tkip.rx[queue].iv32 && - iv16 <= key->u.tkip.rx[queue].iv16))) + if (rx_ctx->ctx.state != TKIP_STATE_NOT_INIT && + (iv32 < rx_ctx->iv32 || + (iv32 == rx_ctx->iv32 && iv16 <= rx_ctx->iv16))) return TKIP_DECRYPT_REPLAY; if (only_iv) { res = TKIP_DECRYPT_OK; - key->u.tkip.rx[queue].state = TKIP_STATE_PHASE1_HW_UPLOADED; + rx_ctx->ctx.state = TKIP_STATE_PHASE1_HW_UPLOADED; goto done; } - if (key->u.tkip.rx[queue].state == TKIP_STATE_NOT_INIT || - key->u.tkip.rx[queue].iv32 != iv32) { + if (rx_ctx->ctx.state == TKIP_STATE_NOT_INIT || + rx_ctx->iv32 != iv32) { /* IV16 wrapped around - perform TKIP phase 1 */ - tkip_mixing_phase1(tk, &key->u.tkip.rx[queue], ta, iv32); + tkip_mixing_phase1(tk, &rx_ctx->ctx, ta, iv32); } if (key->local->ops->update_tkip_key && key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE && - key->u.tkip.rx[queue].state != TKIP_STATE_PHASE1_HW_UPLOADED) { + rx_ctx->ctx.state != TKIP_STATE_PHASE1_HW_UPLOADED) { struct ieee80211_sub_if_data *sdata = key->sdata; if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) sdata = container_of(key->sdata->bss, struct ieee80211_sub_if_data, u.ap); drv_update_tkip_key(key->local, sdata, &key->conf, key->sta, - iv32, key->u.tkip.rx[queue].p1k); - key->u.tkip.rx[queue].state = TKIP_STATE_PHASE1_HW_UPLOADED; + iv32, rx_ctx->ctx.p1k); + rx_ctx->ctx.state = TKIP_STATE_PHASE1_HW_UPLOADED; } - tkip_mixing_phase2(tk, &key->u.tkip.rx[queue], iv16, rc4key); + tkip_mixing_phase2(tk, &rx_ctx->ctx, iv16, rc4key); res = ieee80211_wep_decrypt_data(tfm, rc4key, 16, pos, payload_len - 12); done: diff --git a/net/mac80211/tkip.h b/net/mac80211/tkip.h index e3ecb659b..a1bcbfbef 100644 --- a/net/mac80211/tkip.h +++ b/net/mac80211/tkip.h @@ -13,8 +13,6 @@ #include <linux/crypto.h> #include "key.h" -u8 *ieee80211_tkip_add_iv(u8 *pos, struct ieee80211_key *key); - int ieee80211_tkip_encrypt_data(struct crypto_cipher *tfm, struct ieee80211_key *key, struct sk_buff *skb, diff --git a/net/mac80211/trace.h b/net/mac80211/trace.h index a6b444277..2b0a17ee9 100644 --- a/net/mac80211/trace.h +++ b/net/mac80211/trace.h @@ -80,7 +80,23 @@ #define KEY_PR_FMT " cipher:0x%x, flags=%#x, keyidx=%d, hw_key_idx=%d" #define KEY_PR_ARG __entry->cipher, __entry->flags, __entry->keyidx, __entry->hw_key_idx - +#define AMPDU_ACTION_ENTRY __field(enum ieee80211_ampdu_mlme_action, \ + ieee80211_ampdu_mlme_action) \ + STA_ENTRY \ + __field(u16, tid) \ + __field(u16, ssn) \ + __field(u8, buf_size) \ + __field(bool, amsdu) \ + __field(u16, timeout) +#define AMPDU_ACTION_ASSIGN STA_NAMED_ASSIGN(params->sta); \ + __entry->tid = params->tid; \ + __entry->ssn = params->ssn; \ + __entry->buf_size = params->buf_size; \ + __entry->amsdu = params->amsdu; \ + __entry->timeout = params->timeout; +#define AMPDU_ACTION_PR_FMT STA_PR_FMT " tid %d, ssn %d, buf_size %u, amsdu %d, timeout %d" +#define AMPDU_ACTION_PR_ARG STA_PR_ARG, __entry->tid, __entry->ssn, \ + __entry->buf_size, __entry->amsdu, __entry->timeout /* * Tracing for driver callbacks. @@ -970,38 +986,25 @@ DEFINE_EVENT(local_only_evt, drv_tx_last_beacon, TRACE_EVENT(drv_ampdu_action, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, - enum ieee80211_ampdu_mlme_action action, - struct ieee80211_sta *sta, u16 tid, - u16 *ssn, u8 buf_size, bool amsdu), + struct ieee80211_ampdu_params *params), - TP_ARGS(local, sdata, action, sta, tid, ssn, buf_size, amsdu), + TP_ARGS(local, sdata, params), TP_STRUCT__entry( LOCAL_ENTRY - STA_ENTRY - __field(u32, action) - __field(u16, tid) - __field(u16, ssn) - __field(u8, buf_size) - __field(bool, amsdu) VIF_ENTRY + AMPDU_ACTION_ENTRY ), TP_fast_assign( LOCAL_ASSIGN; VIF_ASSIGN; - STA_ASSIGN; - __entry->action = action; - __entry->tid = tid; - __entry->ssn = ssn ? *ssn : 0; - __entry->buf_size = buf_size; - __entry->amsdu = amsdu; + AMPDU_ACTION_ASSIGN; ), TP_printk( - LOCAL_PR_FMT VIF_PR_FMT STA_PR_FMT " action:%d tid:%d buf:%d amsdu:%d", - LOCAL_PR_ARG, VIF_PR_ARG, STA_PR_ARG, __entry->action, - __entry->tid, __entry->buf_size, __entry->amsdu + LOCAL_PR_FMT VIF_PR_FMT AMPDU_ACTION_PR_FMT, + LOCAL_PR_ARG, VIF_PR_ARG, AMPDU_ACTION_PR_ARG ) ); diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 3311ce0f3..21f660239 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -710,6 +710,10 @@ ieee80211_tx_h_rate_ctrl(struct ieee80211_tx_data *tx) info->control.short_preamble = txrc.short_preamble; + /* don't ask rate control when rate already injected via radiotap */ + if (info->control.flags & IEEE80211_TX_CTRL_RATE_INJECT) + return TX_CONTINUE; + if (tx->sta) assoc = test_sta_flag(tx->sta, WLAN_STA_ASSOC); @@ -1112,11 +1116,15 @@ static bool ieee80211_tx_prep_agg(struct ieee80211_tx_data *tx, reset_agg_timer = true; } else { queued = true; + if (info->flags & IEEE80211_TX_CTL_NO_PS_BUFFER) { + clear_sta_flag(tx->sta, WLAN_STA_SP); + ps_dbg(tx->sta->sdata, + "STA %pM aid %d: SP frame queued, close the SP w/o telling the peer\n", + tx->sta->sta.addr, tx->sta->sta.aid); + } info->control.vif = &tx->sdata->vif; info->flags |= IEEE80211_TX_INTFL_NEED_TXPROCESSING; - info->flags &= ~IEEE80211_TX_TEMPORARY_FLAGS | - IEEE80211_TX_CTL_NO_PS_BUFFER | - IEEE80211_TX_STATUS_EOSP; + info->flags &= ~IEEE80211_TX_TEMPORARY_FLAGS; __skb_queue_tail(&tid_tx->pending, skb); if (skb_queue_len(&tid_tx->pending) > STA_MAX_TX_BUFFER) purge_skb = __skb_dequeue(&tid_tx->pending); @@ -1243,7 +1251,8 @@ static void ieee80211_drv_tx(struct ieee80211_local *local, struct txq_info *txqi; u8 ac; - if (info->control.flags & IEEE80211_TX_CTRL_PS_RESPONSE) + if ((info->flags & IEEE80211_TX_CTL_SEND_AFTER_DTIM) || + (info->control.flags & IEEE80211_TX_CTRL_PS_RESPONSE)) goto tx_normal; if (!ieee80211_is_data(hdr->frame_control)) @@ -1266,7 +1275,11 @@ static void ieee80211_drv_tx(struct ieee80211_local *local, if (atomic_read(&sdata->txqs_len[ac]) >= local->hw.txq_ac_max_pending) netif_stop_subqueue(sdata->dev, ac); - skb_queue_tail(&txqi->queue, skb); + spin_lock_bh(&txqi->queue.lock); + txqi->byte_cnt += skb->len; + __skb_queue_tail(&txqi->queue, skb); + spin_unlock_bh(&txqi->queue.lock); + drv_wake_tx_queue(local, txqi); return; @@ -1294,6 +1307,8 @@ struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw, if (!skb) goto out; + txqi->byte_cnt -= skb->len; + atomic_dec(&sdata->txqs_len[ac]); if (__netif_subqueue_stopped(sdata->dev, ac)) ieee80211_propagate_queue_wake(local, sdata->vif.hw_queue[ac]); @@ -1665,15 +1680,24 @@ void ieee80211_xmit(struct ieee80211_sub_if_data *sdata, ieee80211_tx(sdata, sta, skb, false); } -static bool ieee80211_parse_tx_radiotap(struct sk_buff *skb) +static bool ieee80211_parse_tx_radiotap(struct ieee80211_local *local, + struct sk_buff *skb) { struct ieee80211_radiotap_iterator iterator; struct ieee80211_radiotap_header *rthdr = (struct ieee80211_radiotap_header *) skb->data; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); + struct ieee80211_supported_band *sband = + local->hw.wiphy->bands[info->band]; int ret = ieee80211_radiotap_iterator_init(&iterator, rthdr, skb->len, NULL); u16 txflags; + u16 rate = 0; + bool rate_found = false; + u8 rate_retries = 0; + u16 rate_flags = 0; + u8 mcs_known, mcs_flags; + int i; info->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT | IEEE80211_TX_CTL_DONTFRAG; @@ -1724,6 +1748,35 @@ static bool ieee80211_parse_tx_radiotap(struct sk_buff *skb) info->flags |= IEEE80211_TX_CTL_NO_ACK; break; + case IEEE80211_RADIOTAP_RATE: + rate = *iterator.this_arg; + rate_flags = 0; + rate_found = true; + break; + + case IEEE80211_RADIOTAP_DATA_RETRIES: + rate_retries = *iterator.this_arg; + break; + + case IEEE80211_RADIOTAP_MCS: + mcs_known = iterator.this_arg[0]; + mcs_flags = iterator.this_arg[1]; + if (!(mcs_known & IEEE80211_RADIOTAP_MCS_HAVE_MCS)) + break; + + rate_found = true; + rate = iterator.this_arg[2]; + rate_flags = IEEE80211_TX_RC_MCS; + + if (mcs_known & IEEE80211_RADIOTAP_MCS_HAVE_GI && + mcs_flags & IEEE80211_RADIOTAP_MCS_SGI) + rate_flags |= IEEE80211_TX_RC_SHORT_GI; + + if (mcs_known & IEEE80211_RADIOTAP_MCS_HAVE_BW && + mcs_flags & IEEE80211_RADIOTAP_MCS_BW_40) + rate_flags |= IEEE80211_TX_RC_40_MHZ_WIDTH; + break; + /* * Please update the file * Documentation/networking/mac80211-injection.txt @@ -1738,6 +1791,32 @@ static bool ieee80211_parse_tx_radiotap(struct sk_buff *skb) if (ret != -ENOENT) /* ie, if we didn't simply run out of fields */ return false; + if (rate_found) { + info->control.flags |= IEEE80211_TX_CTRL_RATE_INJECT; + + for (i = 0; i < IEEE80211_TX_MAX_RATES; i++) { + info->control.rates[i].idx = -1; + info->control.rates[i].flags = 0; + info->control.rates[i].count = 0; + } + + if (rate_flags & IEEE80211_TX_RC_MCS) { + info->control.rates[0].idx = rate; + } else { + for (i = 0; i < sband->n_bitrates; i++) { + if (rate * 5 != sband->bitrates[i].bitrate) + continue; + + info->control.rates[0].idx = i; + break; + } + } + + info->control.rates[0].flags = rate_flags; + info->control.rates[0].count = min_t(u8, rate_retries + 1, + local->hw.max_rate_tries); + } + /* * remove the radiotap header * iterator->_max_length was sanity-checked against @@ -1818,10 +1897,6 @@ netdev_tx_t ieee80211_monitor_start_xmit(struct sk_buff *skb, info->flags = IEEE80211_TX_CTL_REQ_TX_STATUS | IEEE80211_TX_CTL_INJECTED; - /* process and remove the injection radiotap header */ - if (!ieee80211_parse_tx_radiotap(skb)) - goto fail; - rcu_read_lock(); /* @@ -1883,6 +1958,11 @@ netdev_tx_t ieee80211_monitor_start_xmit(struct sk_buff *skb, goto fail_rcu; info->band = chandef->chan->band; + + /* process and remove the injection radiotap header */ + if (!ieee80211_parse_tx_radiotap(local, skb)) + goto fail_rcu; + ieee80211_xmit(sdata, NULL, skb); rcu_read_unlock(); @@ -2099,8 +2179,11 @@ static struct sk_buff *ieee80211_build_hdr(struct ieee80211_sub_if_data *sdata, mpp_lookup = true; } - if (mpp_lookup) + if (mpp_lookup) { mppath = mpp_path_lookup(sdata, skb->data); + if (mppath) + mppath->exp_time = jiffies; + } if (mppath && mpath) mesh_path_del(mpath->sdata, mpath->dst); @@ -2380,7 +2463,7 @@ static struct sk_buff *ieee80211_build_hdr(struct ieee80211_sub_if_data *sdata, /* Update skb pointers to various headers since this modified frame * is going to go through Linux networking code that may potentially * need things like pointer to IP header. */ - skb_set_mac_header(skb, 0); + skb_reset_mac_header(skb); skb_set_network_header(skb, nh_pos); skb_set_transport_header(skb, h_pos); @@ -3895,9 +3978,9 @@ void __ieee80211_tx_skb_tid_band(struct ieee80211_sub_if_data *sdata, { int ac = ieee802_1d_to_ac[tid & 7]; - skb_set_mac_header(skb, 0); - skb_set_network_header(skb, 0); - skb_set_transport_header(skb, 0); + skb_reset_mac_header(skb); + skb_reset_network_header(skb); + skb_reset_transport_header(skb); skb_set_queue_mapping(skb, ac); skb->priority = tid; diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 58f58bd52..7390de494 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -4,7 +4,7 @@ * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> * Copyright 2007 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH - * Copyright (C) 2015 Intel Deutschland GmbH + * Copyright (C) 2015-2016 Intel Deutschland GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -1928,6 +1928,9 @@ int ieee80211_reconfig(struct ieee80211_local *local) BSS_CHANGED_IDLE | BSS_CHANGED_TXPOWER; + if (sdata->vif.mu_mimo_owner) + changed |= BSS_CHANGED_MU_GROUPS; + switch (sdata->vif.type) { case NL80211_IFTYPE_STATION: changed |= BSS_CHANGED_ASSOC | @@ -2371,10 +2374,23 @@ u8 *ieee80211_ie_build_vht_oper(u8 *pos, struct ieee80211_sta_vht_cap *vht_cap, switch (chandef->width) { case NL80211_CHAN_WIDTH_160: - vht_oper->chan_width = IEEE80211_VHT_CHANWIDTH_160MHZ; + /* + * Convert 160 MHz channel width to new style as interop + * workaround. + */ + vht_oper->chan_width = IEEE80211_VHT_CHANWIDTH_80MHZ; + vht_oper->center_freq_seg2_idx = vht_oper->center_freq_seg1_idx; + if (chandef->chan->center_freq < chandef->center_freq1) + vht_oper->center_freq_seg1_idx -= 8; + else + vht_oper->center_freq_seg1_idx += 8; break; case NL80211_CHAN_WIDTH_80P80: - vht_oper->chan_width = IEEE80211_VHT_CHANWIDTH_80P80MHZ; + /* + * Convert 80+80 MHz channel width to new style as interop + * workaround. + */ + vht_oper->chan_width = IEEE80211_VHT_CHANWIDTH_80MHZ; break; case NL80211_CHAN_WIDTH_80: vht_oper->chan_width = IEEE80211_VHT_CHANWIDTH_80MHZ; @@ -2390,17 +2406,13 @@ u8 *ieee80211_ie_build_vht_oper(u8 *pos, struct ieee80211_sta_vht_cap *vht_cap, return pos + sizeof(struct ieee80211_vht_operation); } -void ieee80211_ht_oper_to_chandef(struct ieee80211_channel *control_chan, - const struct ieee80211_ht_operation *ht_oper, - struct cfg80211_chan_def *chandef) +bool ieee80211_chandef_ht_oper(const struct ieee80211_ht_operation *ht_oper, + struct cfg80211_chan_def *chandef) { enum nl80211_channel_type channel_type; - if (!ht_oper) { - cfg80211_chandef_create(chandef, control_chan, - NL80211_CHAN_NO_HT); - return; - } + if (!ht_oper) + return false; switch (ht_oper->ht_param & IEEE80211_HT_PARAM_CHA_SEC_OFFSET) { case IEEE80211_HT_PARAM_CHA_SEC_NONE: @@ -2414,42 +2426,66 @@ void ieee80211_ht_oper_to_chandef(struct ieee80211_channel *control_chan, break; default: channel_type = NL80211_CHAN_NO_HT; + return false; } - cfg80211_chandef_create(chandef, control_chan, channel_type); + cfg80211_chandef_create(chandef, chandef->chan, channel_type); + return true; } -void ieee80211_vht_oper_to_chandef(struct ieee80211_channel *control_chan, - const struct ieee80211_vht_operation *oper, - struct cfg80211_chan_def *chandef) +bool ieee80211_chandef_vht_oper(const struct ieee80211_vht_operation *oper, + struct cfg80211_chan_def *chandef) { + struct cfg80211_chan_def new = *chandef; + int cf1, cf2; + if (!oper) - return; + return false; - chandef->chan = control_chan; + cf1 = ieee80211_channel_to_frequency(oper->center_freq_seg1_idx, + chandef->chan->band); + cf2 = ieee80211_channel_to_frequency(oper->center_freq_seg2_idx, + chandef->chan->band); switch (oper->chan_width) { case IEEE80211_VHT_CHANWIDTH_USE_HT: break; case IEEE80211_VHT_CHANWIDTH_80MHZ: - chandef->width = NL80211_CHAN_WIDTH_80; + new.width = NL80211_CHAN_WIDTH_80; + new.center_freq1 = cf1; + /* If needed, adjust based on the newer interop workaround. */ + if (oper->center_freq_seg2_idx) { + unsigned int diff; + + diff = abs(oper->center_freq_seg2_idx - + oper->center_freq_seg1_idx); + if (diff == 8) { + new.width = NL80211_CHAN_WIDTH_160; + new.center_freq1 = cf2; + } else if (diff > 8) { + new.width = NL80211_CHAN_WIDTH_80P80; + new.center_freq2 = cf2; + } + } break; case IEEE80211_VHT_CHANWIDTH_160MHZ: - chandef->width = NL80211_CHAN_WIDTH_160; + new.width = NL80211_CHAN_WIDTH_160; + new.center_freq1 = cf1; break; case IEEE80211_VHT_CHANWIDTH_80P80MHZ: - chandef->width = NL80211_CHAN_WIDTH_80P80; + new.width = NL80211_CHAN_WIDTH_80P80; + new.center_freq1 = cf1; + new.center_freq2 = cf2; break; default: - break; + return false; } - chandef->center_freq1 = - ieee80211_channel_to_frequency(oper->center_freq_seg1_idx, - control_chan->band); - chandef->center_freq2 = - ieee80211_channel_to_frequency(oper->center_freq_seg2_idx, - control_chan->band); + if (!cfg80211_chandef_valid(&new)) + return false; + + *chandef = new; + return true; } int ieee80211_parse_bitrates(struct cfg80211_chan_def *chandef, @@ -2672,6 +2708,18 @@ u64 ieee80211_calculate_rx_timestamp(struct ieee80211_local *local, sband = local->hw.wiphy->bands[status->band]; bitrate = sband->bitrates[status->rate_idx].bitrate; ri.legacy = DIV_ROUND_UP(bitrate, (1 << shift)); + + if (status->flag & RX_FLAG_MACTIME_PLCP_START) { + /* TODO: handle HT/VHT preambles */ + if (status->band == IEEE80211_BAND_5GHZ) { + ts += 20 << shift; + mpdu_offset += 2; + } else if (status->flag & RX_FLAG_SHORTPRE) { + ts += 96; + } else { + ts += 192; + } + } } rate = cfg80211_calculate_bitrate(&ri); @@ -3357,3 +3405,17 @@ void ieee80211_init_tx_queue(struct ieee80211_sub_if_data *sdata, txqi->txq.ac = IEEE80211_AC_BE; } } + +void ieee80211_txq_get_depth(struct ieee80211_txq *txq, + unsigned long *frame_cnt, + unsigned long *byte_cnt) +{ + struct txq_info *txqi = to_txq_info(txq); + + if (frame_cnt) + *frame_cnt = txqi->queue.qlen; + + if (byte_cnt) + *byte_cnt = txqi->byte_cnt; +} +EXPORT_SYMBOL(ieee80211_txq_get_depth); diff --git a/net/mac80211/vht.c b/net/mac80211/vht.c index c38b2f07a..e590e2ef9 100644 --- a/net/mac80211/vht.c +++ b/net/mac80211/vht.c @@ -1,6 +1,9 @@ /* * VHT handling * + * Portions of this file + * Copyright(c) 2015 - 2016 Intel Deutschland GmbH + * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. @@ -278,6 +281,23 @@ ieee80211_vht_cap_ie_to_sta_vht_cap(struct ieee80211_sub_if_data *sdata, } sta->sta.bandwidth = ieee80211_sta_cur_vht_bw(sta); + + /* If HT IE reported 3839 bytes only, stay with that size. */ + if (sta->sta.max_amsdu_len == IEEE80211_MAX_MPDU_LEN_HT_3839) + return; + + switch (vht_cap->cap & IEEE80211_VHT_CAP_MAX_MPDU_MASK) { + case IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_11454: + sta->sta.max_amsdu_len = IEEE80211_MAX_MPDU_LEN_VHT_11454; + break; + case IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_7991: + sta->sta.max_amsdu_len = IEEE80211_MAX_MPDU_LEN_VHT_7991; + break; + case IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_3895: + default: + sta->sta.max_amsdu_len = IEEE80211_MAX_MPDU_LEN_VHT_3895; + break; + } } enum ieee80211_sta_rx_bandwidth ieee80211_sta_cap_rx_bw(struct sta_info *sta) @@ -299,7 +319,30 @@ enum ieee80211_sta_rx_bandwidth ieee80211_sta_cap_rx_bw(struct sta_info *sta) return IEEE80211_STA_RX_BW_80; } -static enum ieee80211_sta_rx_bandwidth +enum nl80211_chan_width ieee80211_sta_cap_chan_bw(struct sta_info *sta) +{ + struct ieee80211_sta_vht_cap *vht_cap = &sta->sta.vht_cap; + u32 cap_width; + + if (!vht_cap->vht_supported) { + if (!sta->sta.ht_cap.ht_supported) + return NL80211_CHAN_WIDTH_20_NOHT; + + return sta->sta.ht_cap.cap & IEEE80211_HT_CAP_SUP_WIDTH_20_40 ? + NL80211_CHAN_WIDTH_40 : NL80211_CHAN_WIDTH_20; + } + + cap_width = vht_cap->cap & IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK; + + if (cap_width == IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160MHZ) + return NL80211_CHAN_WIDTH_160; + else if (cap_width == IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160_80PLUS80MHZ) + return NL80211_CHAN_WIDTH_80P80; + + return NL80211_CHAN_WIDTH_80; +} + +enum ieee80211_sta_rx_bandwidth ieee80211_chan_width_to_rx_bw(enum nl80211_chan_width width) { switch (width) { @@ -327,10 +370,7 @@ enum ieee80211_sta_rx_bandwidth ieee80211_sta_cur_vht_bw(struct sta_info *sta) bw = ieee80211_sta_cap_rx_bw(sta); bw = min(bw, sta->cur_max_bandwidth); - - /* do not cap the BW of TDLS WIDER_BW peers by the bss */ - if (!test_sta_flag(sta, WLAN_STA_TDLS_WIDER_BW)) - bw = min(bw, ieee80211_chan_width_to_rx_bw(bss_width)); + bw = min(bw, ieee80211_chan_width_to_rx_bw(bss_width)); return bw; } @@ -425,6 +465,43 @@ u32 __ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata, return changed; } +void ieee80211_process_mu_groups(struct ieee80211_sub_if_data *sdata, + struct ieee80211_mgmt *mgmt) +{ + struct ieee80211_bss_conf *bss_conf = &sdata->vif.bss_conf; + + if (!sdata->vif.mu_mimo_owner) + return; + + if (!memcmp(mgmt->u.action.u.vht_group_notif.position, + bss_conf->mu_group.position, WLAN_USER_POSITION_LEN) && + !memcmp(mgmt->u.action.u.vht_group_notif.membership, + bss_conf->mu_group.membership, WLAN_MEMBERSHIP_LEN)) + return; + + memcpy(bss_conf->mu_group.membership, + mgmt->u.action.u.vht_group_notif.membership, + WLAN_MEMBERSHIP_LEN); + memcpy(bss_conf->mu_group.position, + mgmt->u.action.u.vht_group_notif.position, + WLAN_USER_POSITION_LEN); + + ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_MU_GROUPS); +} + +void ieee80211_update_mu_groups(struct ieee80211_vif *vif, + const u8 *membership, const u8 *position) +{ + struct ieee80211_bss_conf *bss_conf = &vif->bss_conf; + + if (WARN_ON_ONCE(!vif->mu_mimo_owner)) + return; + + memcpy(bss_conf->mu_group.membership, membership, WLAN_MEMBERSHIP_LEN); + memcpy(bss_conf->mu_group.position, position, WLAN_USER_POSITION_LEN); +} +EXPORT_SYMBOL_GPL(ieee80211_update_mu_groups); + void ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, u8 opmode, enum ieee80211_band band) diff --git a/net/mac80211/wpa.c b/net/mac80211/wpa.c index d824c3897..18848258a 100644 --- a/net/mac80211/wpa.c +++ b/net/mac80211/wpa.c @@ -1,6 +1,7 @@ /* * Copyright 2002-2004, Instant802 Networks, Inc. * Copyright 2008, Jouni Malinen <j@w1.fi> + * Copyright (C) 2016 Intel Deutschland GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -183,7 +184,6 @@ mic_fail_no_key: return RX_DROP_UNUSABLE; } - static int tkip_encrypt_skb(struct ieee80211_tx_data *tx, struct sk_buff *skb) { struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; @@ -191,6 +191,7 @@ static int tkip_encrypt_skb(struct ieee80211_tx_data *tx, struct sk_buff *skb) struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); unsigned int hdrlen; int len, tail; + u64 pn; u8 *pos; if (info->control.hw_key && @@ -222,12 +223,8 @@ static int tkip_encrypt_skb(struct ieee80211_tx_data *tx, struct sk_buff *skb) return 0; /* Increase IV for the frame */ - spin_lock(&key->u.tkip.txlock); - key->u.tkip.tx.iv16++; - if (key->u.tkip.tx.iv16 == 0) - key->u.tkip.tx.iv32++; - pos = ieee80211_tkip_add_iv(pos, key); - spin_unlock(&key->u.tkip.txlock); + pn = atomic64_inc_return(&key->conf.tx_pn); + pos = ieee80211_tkip_add_iv(pos, &key->conf, pn); /* hwaccel - with software IV */ if (info->control.hw_key) diff --git a/net/mac802154/llsec.c b/net/mac802154/llsec.c index a13d02b7c..6a3e1c218 100644 --- a/net/mac802154/llsec.c +++ b/net/mac802154/llsec.c @@ -17,9 +17,9 @@ #include <linux/err.h> #include <linux/bug.h> #include <linux/completion.h> -#include <linux/crypto.h> #include <linux/ieee802154.h> #include <crypto/aead.h> +#include <crypto/skcipher.h> #include "ieee802154_i.h" #include "llsec.h" @@ -144,18 +144,18 @@ llsec_key_alloc(const struct ieee802154_llsec_key *template) goto err_tfm; } - key->tfm0 = crypto_alloc_blkcipher("ctr(aes)", 0, CRYPTO_ALG_ASYNC); + key->tfm0 = crypto_alloc_skcipher("ctr(aes)", 0, CRYPTO_ALG_ASYNC); if (IS_ERR(key->tfm0)) goto err_tfm; - if (crypto_blkcipher_setkey(key->tfm0, template->key, - IEEE802154_LLSEC_KEY_SIZE)) + if (crypto_skcipher_setkey(key->tfm0, template->key, + IEEE802154_LLSEC_KEY_SIZE)) goto err_tfm0; return key; err_tfm0: - crypto_free_blkcipher(key->tfm0); + crypto_free_skcipher(key->tfm0); err_tfm: for (i = 0; i < ARRAY_SIZE(key->tfm); i++) if (key->tfm[i]) @@ -175,7 +175,7 @@ static void llsec_key_release(struct kref *ref) for (i = 0; i < ARRAY_SIZE(key->tfm); i++) crypto_free_aead(key->tfm[i]); - crypto_free_blkcipher(key->tfm0); + crypto_free_skcipher(key->tfm0); kzfree(key); } @@ -620,15 +620,17 @@ llsec_do_encrypt_unauth(struct sk_buff *skb, const struct mac802154_llsec *sec, { u8 iv[16]; struct scatterlist src; - struct blkcipher_desc req = { - .tfm = key->tfm0, - .info = iv, - .flags = 0, - }; + SKCIPHER_REQUEST_ON_STACK(req, key->tfm0); + int err; llsec_geniv(iv, sec->params.hwaddr, &hdr->sec); sg_init_one(&src, skb->data, skb->len); - return crypto_blkcipher_encrypt_iv(&req, &src, &src, skb->len); + skcipher_request_set_tfm(req, key->tfm0); + skcipher_request_set_callback(req, 0, NULL, NULL); + skcipher_request_set_crypt(req, &src, &src, skb->len, iv); + err = crypto_skcipher_encrypt(req); + skcipher_request_zero(req); + return err; } static struct crypto_aead* @@ -830,11 +832,8 @@ llsec_do_decrypt_unauth(struct sk_buff *skb, const struct mac802154_llsec *sec, unsigned char *data; int datalen; struct scatterlist src; - struct blkcipher_desc req = { - .tfm = key->tfm0, - .info = iv, - .flags = 0, - }; + SKCIPHER_REQUEST_ON_STACK(req, key->tfm0); + int err; llsec_geniv(iv, dev_addr, &hdr->sec); data = skb_mac_header(skb) + skb->mac_len; @@ -842,7 +841,13 @@ llsec_do_decrypt_unauth(struct sk_buff *skb, const struct mac802154_llsec *sec, sg_init_one(&src, data, datalen); - return crypto_blkcipher_decrypt_iv(&req, &src, &src, datalen); + skcipher_request_set_tfm(req, key->tfm0); + skcipher_request_set_callback(req, 0, NULL, NULL); + skcipher_request_set_crypt(req, &src, &src, datalen, iv); + + err = crypto_skcipher_decrypt(req); + skcipher_request_zero(req); + return err; } static int diff --git a/net/mac802154/llsec.h b/net/mac802154/llsec.h index 950578e1d..6f3b658e3 100644 --- a/net/mac802154/llsec.h +++ b/net/mac802154/llsec.h @@ -19,7 +19,6 @@ #include <linux/slab.h> #include <linux/hashtable.h> -#include <linux/crypto.h> #include <linux/kref.h> #include <linux/spinlock.h> #include <net/af_ieee802154.h> @@ -30,7 +29,7 @@ struct mac802154_llsec_key { /* one tfm for each authsize (4/8/16) */ struct crypto_aead *tfm[3]; - struct crypto_blkcipher *tfm0; + struct crypto_skcipher *tfm0; struct kref ref; }; diff --git a/net/mac802154/main.c b/net/mac802154/main.c index e8cab5bb8..87da85ae5 100644 --- a/net/mac802154/main.c +++ b/net/mac802154/main.c @@ -218,7 +218,6 @@ void ieee802154_unregister_hw(struct ieee802154_hw *hw) tasklet_kill(&local->tasklet); flush_workqueue(local->workqueue); - destroy_workqueue(local->workqueue); rtnl_lock(); @@ -226,6 +225,7 @@ void ieee802154_unregister_hw(struct ieee802154_hw *hw) rtnl_unlock(); + destroy_workqueue(local->workqueue); wpan_phy_unregister(local->phy); } EXPORT_SYMBOL(ieee802154_unregister_hw); diff --git a/net/mpls/mpls_iptunnel.c b/net/mpls/mpls_iptunnel.c index fb31aa87d..644a8da6d 100644 --- a/net/mpls/mpls_iptunnel.c +++ b/net/mpls/mpls_iptunnel.c @@ -227,5 +227,6 @@ static void __exit mpls_iptunnel_exit(void) } module_exit(mpls_iptunnel_exit); +MODULE_ALIAS_RTNL_LWT(MPLS); MODULE_DESCRIPTION("MultiProtocol Label Switching IP Tunnels"); MODULE_LICENSE("GPL v2"); diff --git a/net/netfilter/ipset/ip_set_bitmap_gen.h b/net/netfilter/ipset/ip_set_bitmap_gen.h index b0bc475f6..2e8e7e5fb 100644 --- a/net/netfilter/ipset/ip_set_bitmap_gen.h +++ b/net/netfilter/ipset/ip_set_bitmap_gen.h @@ -95,7 +95,7 @@ mtype_head(struct ip_set *set, struct sk_buff *skb) if (!nested) goto nla_put_failure; if (mtype_do_head(skb, map) || - nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref - 1)) || + nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref)) || nla_put_net32(skb, IPSET_ATTR_MEMSIZE, htonl(memsize))) goto nla_put_failure; if (unlikely(ip_set_put_flags(skb, set))) diff --git a/net/netfilter/ipset/ip_set_bitmap_ipmac.c b/net/netfilter/ipset/ip_set_bitmap_ipmac.c index 29dde2083..9a065f672 100644 --- a/net/netfilter/ipset/ip_set_bitmap_ipmac.c +++ b/net/netfilter/ipset/ip_set_bitmap_ipmac.c @@ -267,6 +267,8 @@ bitmap_ipmac_uadt(struct ip_set *set, struct nlattr *tb[], e.id = ip_to_id(map, ip); if (tb[IPSET_ATTR_ETHER]) { + if (nla_len(tb[IPSET_ATTR_ETHER]) != ETH_ALEN) + return -IPSET_ERR_PROTOCOL; memcpy(e.ether, nla_data(tb[IPSET_ATTR_ETHER]), ETH_ALEN); e.add_mac = 1; } diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index 95db43fc0..a748b0c2c 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -497,6 +497,26 @@ __ip_set_put(struct ip_set *set) write_unlock_bh(&ip_set_ref_lock); } +/* set->ref can be swapped out by ip_set_swap, netlink events (like dump) need + * a separate reference counter + */ +static inline void +__ip_set_get_netlink(struct ip_set *set) +{ + write_lock_bh(&ip_set_ref_lock); + set->ref_netlink++; + write_unlock_bh(&ip_set_ref_lock); +} + +static inline void +__ip_set_put_netlink(struct ip_set *set) +{ + write_lock_bh(&ip_set_ref_lock); + BUG_ON(set->ref_netlink == 0); + set->ref_netlink--; + write_unlock_bh(&ip_set_ref_lock); +} + /* Add, del and test set entries from kernel. * * The set behind the index must exist and must be referenced @@ -985,6 +1005,9 @@ static int ip_set_destroy(struct net *net, struct sock *ctnl, if (unlikely(protocol_failed(attr))) return -IPSET_ERR_PROTOCOL; + /* Must wait for flush to be really finished in list:set */ + rcu_barrier(); + /* Commands are serialized and references are * protected by the ip_set_ref_lock. * External systems (i.e. xt_set) must call @@ -999,7 +1022,7 @@ static int ip_set_destroy(struct net *net, struct sock *ctnl, if (!attr[IPSET_ATTR_SETNAME]) { for (i = 0; i < inst->ip_set_max; i++) { s = ip_set(inst, i); - if (s && s->ref) { + if (s && (s->ref || s->ref_netlink)) { ret = -IPSET_ERR_BUSY; goto out; } @@ -1021,7 +1044,7 @@ static int ip_set_destroy(struct net *net, struct sock *ctnl, if (!s) { ret = -ENOENT; goto out; - } else if (s->ref) { + } else if (s->ref || s->ref_netlink) { ret = -IPSET_ERR_BUSY; goto out; } @@ -1168,6 +1191,9 @@ static int ip_set_swap(struct net *net, struct sock *ctnl, struct sk_buff *skb, from->family == to->family)) return -IPSET_ERR_TYPE_MISMATCH; + if (from->ref_netlink || to->ref_netlink) + return -EBUSY; + strncpy(from_name, from->name, IPSET_MAXNAMELEN); strncpy(from->name, to->name, IPSET_MAXNAMELEN); strncpy(to->name, from_name, IPSET_MAXNAMELEN); @@ -1203,7 +1229,7 @@ ip_set_dump_done(struct netlink_callback *cb) if (set->variant->uref) set->variant->uref(set, cb, false); pr_debug("release set %s\n", set->name); - __ip_set_put_byindex(inst, index); + __ip_set_put_netlink(set); } return 0; } @@ -1325,7 +1351,7 @@ dump_last: if (!cb->args[IPSET_CB_ARG0]) { /* Start listing: make sure set won't be destroyed */ pr_debug("reference set\n"); - set->ref++; + set->ref_netlink++; } write_unlock_bh(&ip_set_ref_lock); nlh = start_msg(skb, NETLINK_CB(cb->skb).portid, @@ -1393,7 +1419,7 @@ release_refcount: if (set->variant->uref) set->variant->uref(set, cb, false); pr_debug("release set %s\n", set->name); - __ip_set_put_byindex(inst, index); + __ip_set_put_netlink(set); cb->args[IPSET_CB_ARG0] = 0; } out: diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h index e5336ab36..d32fd6b03 100644 --- a/net/netfilter/ipset/ip_set_hash_gen.h +++ b/net/netfilter/ipset/ip_set_hash_gen.h @@ -1082,7 +1082,7 @@ mtype_head(struct ip_set *set, struct sk_buff *skb) if (nla_put_u32(skb, IPSET_ATTR_MARKMASK, h->markmask)) goto nla_put_failure; #endif - if (nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref - 1)) || + if (nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref)) || nla_put_net32(skb, IPSET_ATTR_MEMSIZE, htonl(memsize))) goto nla_put_failure; if (unlikely(ip_set_put_flags(skb, set))) diff --git a/net/netfilter/ipset/ip_set_hash_mac.c b/net/netfilter/ipset/ip_set_hash_mac.c index f1e7d2c0f..8f004edad 100644 --- a/net/netfilter/ipset/ip_set_hash_mac.c +++ b/net/netfilter/ipset/ip_set_hash_mac.c @@ -110,7 +110,8 @@ hash_mac4_uadt(struct ip_set *set, struct nlattr *tb[], if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); - if (unlikely(!tb[IPSET_ATTR_ETHER])) + if (unlikely(!tb[IPSET_ATTR_ETHER] || + nla_len(tb[IPSET_ATTR_ETHER]) != ETH_ALEN)) return -IPSET_ERR_PROTOCOL; ret = ip_set_get_extensions(set, tb, &ext); diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c index bbede95c9..a2a89e4e0 100644 --- a/net/netfilter/ipset/ip_set_list_set.c +++ b/net/netfilter/ipset/ip_set_list_set.c @@ -30,6 +30,7 @@ MODULE_ALIAS("ip_set_list:set"); struct set_elem { struct rcu_head rcu; struct list_head list; + struct ip_set *set; /* Sigh, in order to cleanup reference */ ip_set_id_t id; } __aligned(__alignof__(u64)); @@ -151,30 +152,29 @@ list_set_kadt(struct ip_set *set, const struct sk_buff *skb, /* Userspace interfaces: we are protected by the nfnl mutex */ static void -__list_set_del(struct ip_set *set, struct set_elem *e) +__list_set_del_rcu(struct rcu_head * rcu) { + struct set_elem *e = container_of(rcu, struct set_elem, rcu); + struct ip_set *set = e->set; struct list_set *map = set->data; ip_set_put_byindex(map->net, e->id); - /* We may call it, because we don't have a to be destroyed - * extension which is used by the kernel. - */ ip_set_ext_destroy(set, e); - kfree_rcu(e, rcu); + kfree(e); } static inline void list_set_del(struct ip_set *set, struct set_elem *e) { list_del_rcu(&e->list); - __list_set_del(set, e); + call_rcu(&e->rcu, __list_set_del_rcu); } static inline void -list_set_replace(struct ip_set *set, struct set_elem *e, struct set_elem *old) +list_set_replace(struct set_elem *e, struct set_elem *old) { list_replace_rcu(&old->list, &e->list); - __list_set_del(set, old); + call_rcu(&old->rcu, __list_set_del_rcu); } static void @@ -244,9 +244,6 @@ list_set_uadd(struct ip_set *set, void *value, const struct ip_set_ext *ext, struct set_elem *e, *n, *prev, *next; bool flag_exist = flags & IPSET_FLAG_EXIST; - if (SET_WITH_TIMEOUT(set)) - set_cleanup_entries(set); - /* Find where to add the new entry */ n = prev = next = NULL; list_for_each_entry(e, &map->members, list) { @@ -301,10 +298,11 @@ list_set_uadd(struct ip_set *set, void *value, const struct ip_set_ext *ext, if (!e) return -ENOMEM; e->id = d->id; + e->set = set; INIT_LIST_HEAD(&e->list); list_set_init_extensions(set, ext, e); if (n) - list_set_replace(set, e, n); + list_set_replace(e, n); else if (next) list_add_tail_rcu(&e->list, &next->list); else if (prev) @@ -431,6 +429,7 @@ list_set_destroy(struct ip_set *set) if (SET_WITH_TIMEOUT(set)) del_timer_sync(&map->gc); + list_for_each_entry_safe(e, n, &map->members, list) { list_del(&e->list); ip_set_put_byindex(map->net, e->id); @@ -450,14 +449,16 @@ list_set_head(struct ip_set *set, struct sk_buff *skb) struct set_elem *e; u32 n = 0; - list_for_each_entry(e, &map->members, list) + rcu_read_lock(); + list_for_each_entry_rcu(e, &map->members, list) n++; + rcu_read_unlock(); nested = ipset_nest_start(skb, IPSET_ATTR_DATA); if (!nested) goto nla_put_failure; if (nla_put_net32(skb, IPSET_ATTR_SIZE, htonl(map->size)) || - nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref - 1)) || + nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref)) || nla_put_net32(skb, IPSET_ATTR_MEMSIZE, htonl(sizeof(*map) + n * set->dsize))) goto nla_put_failure; @@ -483,33 +484,25 @@ list_set_list(const struct ip_set *set, atd = ipset_nest_start(skb, IPSET_ATTR_ADT); if (!atd) return -EMSGSIZE; - list_for_each_entry(e, &map->members, list) { - if (i == first) - break; - i++; - } rcu_read_lock(); - list_for_each_entry_from(e, &map->members, list) { - i++; - if (SET_WITH_TIMEOUT(set) && - ip_set_timeout_expired(ext_timeout(e, set))) + list_for_each_entry_rcu(e, &map->members, list) { + if (i < first || + (SET_WITH_TIMEOUT(set) && + ip_set_timeout_expired(ext_timeout(e, set)))) { + i++; continue; + } nested = ipset_nest_start(skb, IPSET_ATTR_DATA); - if (!nested) { - if (i == first) { - nla_nest_cancel(skb, atd); - ret = -EMSGSIZE; - goto out; - } + if (!nested) goto nla_put_failure; - } if (nla_put_string(skb, IPSET_ATTR_NAME, ip_set_name_byindex(map->net, e->id))) goto nla_put_failure; if (ip_set_put_extensions(skb, set, e, true)) goto nla_put_failure; ipset_nest_end(skb, nested); + i++; } ipset_nest_end(skb, atd); @@ -520,10 +513,12 @@ list_set_list(const struct ip_set *set, nla_put_failure: nla_nest_cancel(skb, nested); if (unlikely(i == first)) { + nla_nest_cancel(skb, atd); cb->args[IPSET_CB_ARG0] = 0; ret = -EMSGSIZE; + } else { + cb->args[IPSET_CB_ARG0] = i; } - cb->args[IPSET_CB_ARG0] = i - 1; ipset_nest_end(skb, atd); out: rcu_read_unlock(); diff --git a/net/netfilter/ipvs/ip_vs_app.c b/net/netfilter/ipvs/ip_vs_app.c index 0328f7250..299edc6ad 100644 --- a/net/netfilter/ipvs/ip_vs_app.c +++ b/net/netfilter/ipvs/ip_vs_app.c @@ -605,17 +605,13 @@ static const struct file_operations ip_vs_app_fops = { int __net_init ip_vs_app_net_init(struct netns_ipvs *ipvs) { - struct net *net = ipvs->net; - INIT_LIST_HEAD(&ipvs->app_list); - proc_create("ip_vs_app", 0, net->proc_net, &ip_vs_app_fops); + proc_create("ip_vs_app", 0, ipvs->net->proc_net, &ip_vs_app_fops); return 0; } void __net_exit ip_vs_app_net_cleanup(struct netns_ipvs *ipvs) { - struct net *net = ipvs->net; - unregister_ip_vs_app(ipvs, NULL /* all */); - remove_proc_entry("ip_vs_app", net->proc_net); + remove_proc_entry("ip_vs_app", ipvs->net->proc_net); } diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 4da560005..b9a4082af 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -1089,6 +1089,7 @@ static inline bool is_new_conn_expected(const struct ip_vs_conn *cp, switch (cp->protocol) { case IPPROTO_TCP: return (cp->state == IP_VS_TCP_S_TIME_WAIT) || + (cp->state == IP_VS_TCP_S_CLOSE) || ((conn_reuse_mode & 2) && (cp->state == IP_VS_TCP_S_FIN_WAIT) && (cp->flags & IP_VS_CONN_F_NOOUTPUT)); diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index e7c1b052c..404b2a4f4 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -1376,8 +1376,6 @@ static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup) struct ip_vs_pe *old_pe; struct netns_ipvs *ipvs = svc->ipvs; - pr_info("%s: enter\n", __func__); - /* Count only IPv4 services for old get/setsockopt interface */ if (svc->af == AF_INET) ipvs->num_services--; @@ -3947,7 +3945,6 @@ static struct notifier_block ip_vs_dst_notifier = { int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs) { - struct net *net = ipvs->net; int i, idx; /* Initialize rs_table */ @@ -3974,9 +3971,9 @@ int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs) spin_lock_init(&ipvs->tot_stats.lock); - proc_create("ip_vs", 0, net->proc_net, &ip_vs_info_fops); - proc_create("ip_vs_stats", 0, net->proc_net, &ip_vs_stats_fops); - proc_create("ip_vs_stats_percpu", 0, net->proc_net, + proc_create("ip_vs", 0, ipvs->net->proc_net, &ip_vs_info_fops); + proc_create("ip_vs_stats", 0, ipvs->net->proc_net, &ip_vs_stats_fops); + proc_create("ip_vs_stats_percpu", 0, ipvs->net->proc_net, &ip_vs_stats_percpu_fops); if (ip_vs_control_net_init_sysctl(ipvs)) @@ -3991,13 +3988,11 @@ err: void __net_exit ip_vs_control_net_cleanup(struct netns_ipvs *ipvs) { - struct net *net = ipvs->net; - ip_vs_trash_cleanup(ipvs); ip_vs_control_net_cleanup_sysctl(ipvs); - remove_proc_entry("ip_vs_stats_percpu", net->proc_net); - remove_proc_entry("ip_vs_stats", net->proc_net); - remove_proc_entry("ip_vs", net->proc_net); + remove_proc_entry("ip_vs_stats_percpu", ipvs->net->proc_net); + remove_proc_entry("ip_vs_stats", ipvs->net->proc_net); + remove_proc_entry("ip_vs", ipvs->net->proc_net); free_percpu(ipvs->tot_stats.cpustats); } diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index 3264cb49b..dc196a0f5 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -531,8 +531,6 @@ static inline int ip_vs_tunnel_xmit_prepare(struct sk_buff *skb, if (ret == NF_ACCEPT) { nf_reset(skb); skb_forward_csum(skb); - if (!skb->sk) - skb_sender_cpu_clear(skb); } return ret; } @@ -573,8 +571,6 @@ static inline int ip_vs_nat_send_or_cont(int pf, struct sk_buff *skb, if (!local) { skb_forward_csum(skb); - if (!skb->sk) - skb_sender_cpu_clear(skb); NF_HOOK(pf, NF_INET_LOCAL_OUT, cp->ipvs->net, NULL, skb, NULL, skb_dst(skb)->dev, dst_output); } else @@ -595,8 +591,6 @@ static inline int ip_vs_send_or_cont(int pf, struct sk_buff *skb, if (!local) { ip_vs_drop_early_demux_sk(skb); skb_forward_csum(skb); - if (!skb->sk) - skb_sender_cpu_clear(skb); NF_HOOK(pf, NF_INET_LOCAL_OUT, cp->ipvs->net, NULL, skb, NULL, skb_dst(skb)->dev, dst_output); } else @@ -1019,8 +1013,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, if (IS_ERR(skb)) goto tx_error; - skb = iptunnel_handle_offloads( - skb, false, __tun_gso_type_mask(AF_INET, cp->af)); + skb = iptunnel_handle_offloads(skb, __tun_gso_type_mask(AF_INET, cp->af)); if (IS_ERR(skb)) goto tx_error; @@ -1112,8 +1105,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, if (IS_ERR(skb)) goto tx_error; - skb = iptunnel_handle_offloads( - skb, false, __tun_gso_type_mask(AF_INET6, cp->af)); + skb = iptunnel_handle_offloads(skb, __tun_gso_type_mask(AF_INET6, cp->af)); if (IS_ERR(skb)) goto tx_error; diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index f60b4fdee..e27fd17c6 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -66,7 +66,7 @@ EXPORT_SYMBOL_GPL(nf_conntrack_locks); __cacheline_aligned_in_smp DEFINE_SPINLOCK(nf_conntrack_expect_lock); EXPORT_SYMBOL_GPL(nf_conntrack_expect_lock); -static __read_mostly spinlock_t nf_conntrack_locks_all_lock; +static __read_mostly DEFINE_SPINLOCK(nf_conntrack_locks_all_lock); static __read_mostly bool nf_conntrack_locks_all; void nf_conntrack_lock(spinlock_t *lock) __acquires(lock) @@ -74,8 +74,7 @@ void nf_conntrack_lock(spinlock_t *lock) __acquires(lock) spin_lock(lock); while (unlikely(nf_conntrack_locks_all)) { spin_unlock(lock); - spin_lock(&nf_conntrack_locks_all_lock); - spin_unlock(&nf_conntrack_locks_all_lock); + spin_unlock_wait(&nf_conntrack_locks_all_lock); spin_lock(lock); } } @@ -121,8 +120,7 @@ static void nf_conntrack_all_lock(void) nf_conntrack_locks_all = true; for (i = 0; i < CONNTRACK_LOCKS; i++) { - spin_lock(&nf_conntrack_locks[i]); - spin_unlock(&nf_conntrack_locks[i]); + spin_unlock_wait(&nf_conntrack_locks[i]); } } @@ -1780,6 +1778,7 @@ void nf_conntrack_init_end(void) int nf_conntrack_init_net(struct net *net) { + static atomic64_t unique_id; int ret = -ENOMEM; int cpu; @@ -1802,7 +1801,8 @@ int nf_conntrack_init_net(struct net *net) if (!net->ct.stat) goto err_pcpu_lists; - net->ct.slabname = kasprintf(GFP_KERNEL, "nf_conntrack_%p", net); + net->ct.slabname = kasprintf(GFP_KERNEL, "nf_conntrack_%llu", + (u64)atomic64_inc_return(&unique_id)); if (!net->ct.slabname) goto err_slabname; diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index 278f3b935..7cc1d9c22 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -410,6 +410,8 @@ static void tcp_options(const struct sk_buff *skb, length--; continue; default: + if (length < 2) + return; opsize=*ptr++; if (opsize < 2) /* "silly options" */ return; @@ -470,6 +472,8 @@ static void tcp_sack(const struct sk_buff *skb, unsigned int dataoff, length--; continue; default: + if (length < 2) + return; opsize = *ptr++; if (opsize < 2) /* "silly options" */ return; diff --git a/net/netfilter/nf_dup_netdev.c b/net/netfilter/nf_dup_netdev.c index 8414ee1a0..7ec697239 100644 --- a/net/netfilter/nf_dup_netdev.c +++ b/net/netfilter/nf_dup_netdev.c @@ -31,7 +31,6 @@ void nf_dup_netdev_egress(const struct nft_pktinfo *pkt, int oif) skb_push(skb, skb->mac_len); skb->dev = dev; - skb_sender_cpu_clear(skb); dev_queue_xmit(skb); } EXPORT_SYMBOL_GPL(nf_dup_netdev_egress); diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index 857ae8963..2278d9ab7 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -127,13 +127,6 @@ int nfnetlink_has_listeners(struct net *net, unsigned int group) } EXPORT_SYMBOL_GPL(nfnetlink_has_listeners); -struct sk_buff *nfnetlink_alloc_skb(struct net *net, unsigned int size, - u32 dst_portid, gfp_t gfp_mask) -{ - return netlink_alloc_skb(net->nfnl, size, dst_portid, gfp_mask); -} -EXPORT_SYMBOL_GPL(nfnetlink_alloc_skb); - int nfnetlink_send(struct sk_buff *skb, struct net *net, u32 portid, unsigned int group, int echo, gfp_t flags) { diff --git a/net/netfilter/nfnetlink_acct.c b/net/netfilter/nfnetlink_acct.c index 5274b04c4..dbd0803b1 100644 --- a/net/netfilter/nfnetlink_acct.c +++ b/net/netfilter/nfnetlink_acct.c @@ -96,6 +96,8 @@ static int nfnl_acct_new(struct net *net, struct sock *nfnl, return -EINVAL; if (flags & NFACCT_F_OVERQUOTA) return -EINVAL; + if ((flags & NFACCT_F_QUOTA) && !tb[NFACCT_QUOTA]) + return -EINVAL; size += sizeof(u64); } @@ -242,6 +244,9 @@ nfacct_filter_alloc(const struct nlattr * const attr) if (err < 0) return ERR_PTR(err); + if (!tb[NFACCT_FILTER_MASK] || !tb[NFACCT_FILTER_VALUE]) + return ERR_PTR(-EINVAL); + filter = kzalloc(sizeof(struct nfacct_filter), GFP_KERNEL); if (!filter) return ERR_PTR(-ENOMEM); diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index 8ca932057..11f81c838 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -330,14 +330,13 @@ nfulnl_alloc_skb(struct net *net, u32 peer_portid, unsigned int inst_size, * message. WARNING: has to be <= 128k due to slab restrictions */ n = max(inst_size, pkt_size); - skb = nfnetlink_alloc_skb(net, n, peer_portid, GFP_ATOMIC); + skb = alloc_skb(n, GFP_ATOMIC); if (!skb) { if (n > pkt_size) { /* try to allocate only as much as we need for current * packet */ - skb = nfnetlink_alloc_skb(net, pkt_size, - peer_portid, GFP_ATOMIC); + skb = alloc_skb(pkt_size, GFP_ATOMIC); } } diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index 1d3936587..cb5b630a6 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -301,7 +301,7 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, __be32 **packet_id_ptr) { size_t size; - size_t data_len = 0, cap_len = 0, rem_len = 0; + size_t data_len = 0, cap_len = 0; unsigned int hlen = 0; struct sk_buff *skb; struct nlattr *nla; @@ -361,7 +361,6 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, hlen = min_t(unsigned int, hlen, data_len); size += sizeof(struct nlattr) + hlen; cap_len = entskb->len; - rem_len = data_len - hlen; break; } @@ -386,8 +385,7 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, size += nla_total_size(seclen); } - skb = __netlink_alloc_skb(net->nfnl, size, rem_len, queue->peer_portid, - GFP_ATOMIC); + skb = alloc_skb(size, GFP_ATOMIC); if (!skb) { skb_tx_error(entskb); return NULL; @@ -584,7 +582,12 @@ __nfqnl_enqueue_packet(struct net *net, struct nfqnl_instance *queue, /* nfnetlink_unicast will either free the nskb or add it to a socket */ err = nfnetlink_unicast(nskb, net, queue->peer_portid, MSG_DONTWAIT); if (err < 0) { - queue->queue_user_dropped++; + if (queue->flags & NFQA_CFG_F_FAIL_OPEN) { + failopen = 1; + err = 0; + } else { + queue->queue_user_dropped++; + } goto err_out_unlock; } diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c index 454841baa..6228c422c 100644 --- a/net/netfilter/nft_compat.c +++ b/net/netfilter/nft_compat.c @@ -660,6 +660,9 @@ nft_match_select_ops(const struct nft_ctx *ctx, if (IS_ERR(match)) return ERR_PTR(-ENOENT); + if (match->matchsize > nla_len(tb[NFTA_MATCH_INFO])) + return ERR_PTR(-EINVAL); + /* This is the first time we use this match, allocate operations */ nft_match = kzalloc(sizeof(struct nft_xt), GFP_KERNEL); if (nft_match == NULL) @@ -740,6 +743,9 @@ nft_target_select_ops(const struct nft_ctx *ctx, if (IS_ERR(target)) return ERR_PTR(-ENOENT); + if (target->targetsize > nla_len(tb[NFTA_TARGET_INFO])) + return ERR_PTR(-EINVAL); + /* This is the first time we use this target, allocate operations */ nft_target = kzalloc(sizeof(struct nft_xt), GFP_KERNEL); if (nft_target == NULL) diff --git a/net/netfilter/nft_masq.c b/net/netfilter/nft_masq.c index 9aea747b4..81b5ad616 100644 --- a/net/netfilter/nft_masq.c +++ b/net/netfilter/nft_masq.c @@ -17,7 +17,9 @@ #include <net/netfilter/nft_masq.h> const struct nla_policy nft_masq_policy[NFTA_MASQ_MAX + 1] = { - [NFTA_MASQ_FLAGS] = { .type = NLA_U32 }, + [NFTA_MASQ_FLAGS] = { .type = NLA_U32 }, + [NFTA_MASQ_REG_PROTO_MIN] = { .type = NLA_U32 }, + [NFTA_MASQ_REG_PROTO_MAX] = { .type = NLA_U32 }, }; EXPORT_SYMBOL_GPL(nft_masq_policy); @@ -40,6 +42,7 @@ int nft_masq_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { + u32 plen = FIELD_SIZEOF(struct nf_nat_range, min_addr.all); struct nft_masq *priv = nft_expr_priv(expr); int err; @@ -47,12 +50,32 @@ int nft_masq_init(const struct nft_ctx *ctx, if (err) return err; - if (tb[NFTA_MASQ_FLAGS] == NULL) - return 0; - - priv->flags = ntohl(nla_get_be32(tb[NFTA_MASQ_FLAGS])); - if (priv->flags & ~NF_NAT_RANGE_MASK) - return -EINVAL; + if (tb[NFTA_MASQ_FLAGS]) { + priv->flags = ntohl(nla_get_be32(tb[NFTA_MASQ_FLAGS])); + if (priv->flags & ~NF_NAT_RANGE_MASK) + return -EINVAL; + } + + if (tb[NFTA_MASQ_REG_PROTO_MIN]) { + priv->sreg_proto_min = + nft_parse_register(tb[NFTA_MASQ_REG_PROTO_MIN]); + + err = nft_validate_register_load(priv->sreg_proto_min, plen); + if (err < 0) + return err; + + if (tb[NFTA_MASQ_REG_PROTO_MAX]) { + priv->sreg_proto_max = + nft_parse_register(tb[NFTA_MASQ_REG_PROTO_MAX]); + + err = nft_validate_register_load(priv->sreg_proto_max, + plen); + if (err < 0) + return err; + } else { + priv->sreg_proto_max = priv->sreg_proto_min; + } + } return 0; } @@ -62,12 +85,18 @@ int nft_masq_dump(struct sk_buff *skb, const struct nft_expr *expr) { const struct nft_masq *priv = nft_expr_priv(expr); - if (priv->flags == 0) - return 0; - - if (nla_put_be32(skb, NFTA_MASQ_FLAGS, htonl(priv->flags))) + if (priv->flags != 0 && + nla_put_be32(skb, NFTA_MASQ_FLAGS, htonl(priv->flags))) goto nla_put_failure; + if (priv->sreg_proto_min) { + if (nft_dump_register(skb, NFTA_MASQ_REG_PROTO_MIN, + priv->sreg_proto_min) || + nft_dump_register(skb, NFTA_MASQ_REG_PROTO_MAX, + priv->sreg_proto_max)) + goto nla_put_failure; + } + return 0; nla_put_failure: diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index fe885bf27..16c50b0dd 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -28,6 +28,8 @@ #include <uapi/linux/netfilter_bridge.h> /* NF_BR_PRE_ROUTING */ +static DEFINE_PER_CPU(struct rnd_state, nft_prandom_state); + void nft_meta_get_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) @@ -181,6 +183,11 @@ void nft_meta_get_eval(const struct nft_expr *expr, *dest = sock_cgroup_classid(&sk->sk_cgrp_data); break; #endif + case NFT_META_PRANDOM: { + struct rnd_state *state = this_cpu_ptr(&nft_prandom_state); + *dest = prandom_u32_state(state); + break; + } default: WARN_ON(1); goto err; @@ -277,6 +284,10 @@ int nft_meta_get_init(const struct nft_ctx *ctx, case NFT_META_OIFNAME: len = IFNAMSIZ; break; + case NFT_META_PRANDOM: + prandom_init_once(&nft_prandom_state); + len = sizeof(u32); + break; default: return -EOPNOTSUPP; } diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index c8a0b7da5..582c9cfd6 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -659,6 +659,9 @@ struct xt_table_info *xt_alloc_table_info(unsigned int size) struct xt_table_info *info = NULL; size_t sz = sizeof(*info) + size; + if (sz < sizeof(*info)) + return NULL; + /* Pedantry: prevent them from hitting BUG() in vmalloc.c --RR */ if ((SMP_ALIGN(size) >> PAGE_SHIFT) + 2 > totalram_pages) return NULL; @@ -694,12 +697,45 @@ EXPORT_SYMBOL(xt_free_table_info); struct xt_table *xt_find_table_lock(struct net *net, u_int8_t af, const char *name) { - struct xt_table *t; + struct xt_table *t, *found = NULL; mutex_lock(&xt[af].mutex); list_for_each_entry(t, &net->xt.tables[af], list) if (strcmp(t->name, name) == 0 && try_module_get(t->me)) return t; + + if (net == &init_net) + goto out; + + /* Table doesn't exist in this netns, re-try init */ + list_for_each_entry(t, &init_net.xt.tables[af], list) { + if (strcmp(t->name, name)) + continue; + if (!try_module_get(t->me)) + return NULL; + + mutex_unlock(&xt[af].mutex); + if (t->table_init(net) != 0) { + module_put(t->me); + return NULL; + } + + found = t; + + mutex_lock(&xt[af].mutex); + break; + } + + if (!found) + goto out; + + /* and once again: */ + list_for_each_entry(t, &net->xt.tables[af], list) + if (strcmp(t->name, name) == 0) + return t; + + module_put(found->me); + out: mutex_unlock(&xt[af].mutex); return NULL; } @@ -1170,20 +1206,20 @@ static const struct file_operations xt_target_ops = { #endif /* CONFIG_PROC_FS */ /** - * xt_hook_link - set up hooks for a new table + * xt_hook_ops_alloc - set up hooks for a new table * @table: table with metadata needed to set up hooks * @fn: Hook function * - * This function will take care of creating and registering the necessary - * Netfilter hooks for XT tables. + * This function will create the nf_hook_ops that the x_table needs + * to hand to xt_hook_link_net(). */ -struct nf_hook_ops *xt_hook_link(const struct xt_table *table, nf_hookfn *fn) +struct nf_hook_ops * +xt_hook_ops_alloc(const struct xt_table *table, nf_hookfn *fn) { unsigned int hook_mask = table->valid_hooks; uint8_t i, num_hooks = hweight32(hook_mask); uint8_t hooknum; struct nf_hook_ops *ops; - int ret; ops = kmalloc(sizeof(*ops) * num_hooks, GFP_KERNEL); if (ops == NULL) @@ -1200,27 +1236,9 @@ struct nf_hook_ops *xt_hook_link(const struct xt_table *table, nf_hookfn *fn) ++i; } - ret = nf_register_hooks(ops, num_hooks); - if (ret < 0) { - kfree(ops); - return ERR_PTR(ret); - } - return ops; } -EXPORT_SYMBOL_GPL(xt_hook_link); - -/** - * xt_hook_unlink - remove hooks for a table - * @ops: nf_hook_ops array as returned by nf_hook_link - * @hook_mask: the very same mask that was passed to nf_hook_link - */ -void xt_hook_unlink(const struct xt_table *table, struct nf_hook_ops *ops) -{ - nf_unregister_hooks(ops, hweight32(table->valid_hooks)); - kfree(ops); -} -EXPORT_SYMBOL_GPL(xt_hook_unlink); +EXPORT_SYMBOL_GPL(xt_hook_ops_alloc); int xt_proto_init(struct net *net, u_int8_t af) { diff --git a/net/netfilter/xt_IDLETIMER.c b/net/netfilter/xt_IDLETIMER.c index 29d2c31f4..daf45da44 100644 --- a/net/netfilter/xt_IDLETIMER.c +++ b/net/netfilter/xt_IDLETIMER.c @@ -236,6 +236,7 @@ static void idletimer_tg_destroy(const struct xt_tgdtor_param *par) list_del(&info->timer->entry); del_timer_sync(&info->timer->timer); + cancel_work_sync(&info->timer->work); sysfs_remove_file(idletimer_tg_kobj, &info->timer->attr.attr); kfree(info->timer->attr.attr.name); kfree(info->timer); diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c index 3ab591e73..7f4414d26 100644 --- a/net/netfilter/xt_TPROXY.c +++ b/net/netfilter/xt_TPROXY.c @@ -105,19 +105,24 @@ tproxy_laddr4(struct sk_buff *skb, __be32 user_laddr, __be32 daddr) * belonging to established connections going through that one. */ static inline struct sock * -nf_tproxy_get_sock_v4(struct net *net, const u8 protocol, +nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb, void *hp, + const u8 protocol, const __be32 saddr, const __be32 daddr, const __be16 sport, const __be16 dport, const struct net_device *in, const enum nf_tproxy_lookup_t lookup_type) { struct sock *sk; + struct tcphdr *tcph; switch (protocol) { case IPPROTO_TCP: switch (lookup_type) { case NFT_LOOKUP_LISTENER: - sk = inet_lookup_listener(net, &tcp_hashinfo, + tcph = hp; + sk = inet_lookup_listener(net, &tcp_hashinfo, skb, + ip_hdrlen(skb) + + __tcp_hdrlen(tcph), saddr, sport, daddr, dport, in->ifindex); @@ -169,19 +174,23 @@ nf_tproxy_get_sock_v4(struct net *net, const u8 protocol, #ifdef XT_TPROXY_HAVE_IPV6 static inline struct sock * -nf_tproxy_get_sock_v6(struct net *net, const u8 protocol, +nf_tproxy_get_sock_v6(struct net *net, struct sk_buff *skb, int thoff, void *hp, + const u8 protocol, const struct in6_addr *saddr, const struct in6_addr *daddr, const __be16 sport, const __be16 dport, const struct net_device *in, const enum nf_tproxy_lookup_t lookup_type) { struct sock *sk; + struct tcphdr *tcph; switch (protocol) { case IPPROTO_TCP: switch (lookup_type) { case NFT_LOOKUP_LISTENER: - sk = inet6_lookup_listener(net, &tcp_hashinfo, + tcph = hp; + sk = inet6_lookup_listener(net, &tcp_hashinfo, skb, + thoff + __tcp_hdrlen(tcph), saddr, sport, daddr, ntohs(dport), in->ifindex); @@ -267,7 +276,7 @@ tproxy_handle_time_wait4(struct net *net, struct sk_buff *skb, * to a listener socket if there's one */ struct sock *sk2; - sk2 = nf_tproxy_get_sock_v4(net, iph->protocol, + sk2 = nf_tproxy_get_sock_v4(net, skb, hp, iph->protocol, iph->saddr, laddr ? laddr : iph->daddr, hp->source, lport ? lport : hp->dest, skb->dev, NFT_LOOKUP_LISTENER); @@ -305,7 +314,7 @@ tproxy_tg4(struct net *net, struct sk_buff *skb, __be32 laddr, __be16 lport, * addresses, this happens if the redirect already happened * and the current packet belongs to an already established * connection */ - sk = nf_tproxy_get_sock_v4(net, iph->protocol, + sk = nf_tproxy_get_sock_v4(net, skb, hp, iph->protocol, iph->saddr, iph->daddr, hp->source, hp->dest, skb->dev, NFT_LOOKUP_ESTABLISHED); @@ -321,7 +330,7 @@ tproxy_tg4(struct net *net, struct sk_buff *skb, __be32 laddr, __be16 lport, else if (!sk) /* no, there's no established connection, check if * there's a listener on the redirected addr/port */ - sk = nf_tproxy_get_sock_v4(net, iph->protocol, + sk = nf_tproxy_get_sock_v4(net, skb, hp, iph->protocol, iph->saddr, laddr, hp->source, lport, skb->dev, NFT_LOOKUP_LISTENER); @@ -429,7 +438,7 @@ tproxy_handle_time_wait6(struct sk_buff *skb, int tproto, int thoff, * to a listener socket if there's one */ struct sock *sk2; - sk2 = nf_tproxy_get_sock_v6(par->net, tproto, + sk2 = nf_tproxy_get_sock_v6(par->net, skb, thoff, hp, tproto, &iph->saddr, tproxy_laddr6(skb, &tgi->laddr.in6, &iph->daddr), hp->source, @@ -472,7 +481,7 @@ tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par) * addresses, this happens if the redirect already happened * and the current packet belongs to an already established * connection */ - sk = nf_tproxy_get_sock_v6(par->net, tproto, + sk = nf_tproxy_get_sock_v6(par->net, skb, thoff, hp, tproto, &iph->saddr, &iph->daddr, hp->source, hp->dest, par->in, NFT_LOOKUP_ESTABLISHED); @@ -487,8 +496,8 @@ tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par) else if (!sk) /* no there's no established connection, check if * there's a listener on the redirected addr/port */ - sk = nf_tproxy_get_sock_v6(par->net, tproto, - &iph->saddr, laddr, + sk = nf_tproxy_get_sock_v6(par->net, skb, thoff, hp, + tproto, &iph->saddr, laddr, hp->source, lport, par->in, NFT_LOOKUP_LISTENER); diff --git a/net/netfilter/xt_osf.c b/net/netfilter/xt_osf.c index 4e3c3affd..2455b69b5 100644 --- a/net/netfilter/xt_osf.c +++ b/net/netfilter/xt_osf.c @@ -262,7 +262,6 @@ xt_osf_match_packet(const struct sk_buff *skb, struct xt_action_param *p) if (f->opt[optnum].kind == (*optp)) { __u32 len = f->opt[optnum].length; const __u8 *optend = optp + len; - int loop_cont = 0; fmatch = FMATCH_OK; @@ -275,7 +274,6 @@ xt_osf_match_packet(const struct sk_buff *skb, struct xt_action_param *p) mss = ntohs((__force __be16)mss); break; case OSFOPT_TS: - loop_cont = 1; break; } diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c index 2ec08f04b..49d14ecad 100644 --- a/net/netfilter/xt_socket.c +++ b/net/netfilter/xt_socket.c @@ -112,14 +112,15 @@ extract_icmp4_fields(const struct sk_buff *skb, * box. */ static struct sock * -xt_socket_get_sock_v4(struct net *net, const u8 protocol, +xt_socket_get_sock_v4(struct net *net, struct sk_buff *skb, const int doff, + const u8 protocol, const __be32 saddr, const __be32 daddr, const __be16 sport, const __be16 dport, const struct net_device *in) { switch (protocol) { case IPPROTO_TCP: - return __inet_lookup(net, &tcp_hashinfo, + return __inet_lookup(net, &tcp_hashinfo, skb, doff, saddr, sport, daddr, dport, in->ifindex); case IPPROTO_UDP: @@ -148,6 +149,8 @@ static struct sock *xt_socket_lookup_slow_v4(struct net *net, const struct net_device *indev) { const struct iphdr *iph = ip_hdr(skb); + struct sk_buff *data_skb = NULL; + int doff = 0; __be32 uninitialized_var(daddr), uninitialized_var(saddr); __be16 uninitialized_var(dport), uninitialized_var(sport); u8 uninitialized_var(protocol); @@ -169,6 +172,10 @@ static struct sock *xt_socket_lookup_slow_v4(struct net *net, sport = hp->source; daddr = iph->daddr; dport = hp->dest; + data_skb = (struct sk_buff *)skb; + doff = iph->protocol == IPPROTO_TCP ? + ip_hdrlen(skb) + __tcp_hdrlen((struct tcphdr *)hp) : + ip_hdrlen(skb) + sizeof(*hp); } else if (iph->protocol == IPPROTO_ICMP) { if (extract_icmp4_fields(skb, &protocol, &saddr, &daddr, @@ -198,8 +205,8 @@ static struct sock *xt_socket_lookup_slow_v4(struct net *net, } #endif - return xt_socket_get_sock_v4(net, protocol, saddr, daddr, - sport, dport, indev); + return xt_socket_get_sock_v4(net, data_skb, doff, protocol, saddr, + daddr, sport, dport, indev); } static bool @@ -318,14 +325,15 @@ extract_icmp6_fields(const struct sk_buff *skb, } static struct sock * -xt_socket_get_sock_v6(struct net *net, const u8 protocol, +xt_socket_get_sock_v6(struct net *net, struct sk_buff *skb, int doff, + const u8 protocol, const struct in6_addr *saddr, const struct in6_addr *daddr, const __be16 sport, const __be16 dport, const struct net_device *in) { switch (protocol) { case IPPROTO_TCP: - return inet6_lookup(net, &tcp_hashinfo, + return inet6_lookup(net, &tcp_hashinfo, skb, doff, saddr, sport, daddr, dport, in->ifindex); case IPPROTO_UDP: @@ -343,6 +351,8 @@ static struct sock *xt_socket_lookup_slow_v6(struct net *net, __be16 uninitialized_var(dport), uninitialized_var(sport); const struct in6_addr *daddr = NULL, *saddr = NULL; struct ipv6hdr *iph = ipv6_hdr(skb); + struct sk_buff *data_skb = NULL; + int doff = 0; int thoff = 0, tproto; tproto = ipv6_find_hdr(skb, &thoff, -1, NULL, NULL); @@ -362,6 +372,10 @@ static struct sock *xt_socket_lookup_slow_v6(struct net *net, sport = hp->source; daddr = &iph->daddr; dport = hp->dest; + data_skb = (struct sk_buff *)skb; + doff = tproto == IPPROTO_TCP ? + thoff + __tcp_hdrlen((struct tcphdr *)hp) : + thoff + sizeof(*hp); } else if (tproto == IPPROTO_ICMPV6) { struct ipv6hdr ipv6_var; @@ -373,7 +387,7 @@ static struct sock *xt_socket_lookup_slow_v6(struct net *net, return NULL; } - return xt_socket_get_sock_v6(net, tproto, saddr, daddr, + return xt_socket_get_sock_v6(net, data_skb, doff, tproto, saddr, daddr, sport, dport, indev); } diff --git a/net/netlabel/netlabel_domainhash.c b/net/netlabel/netlabel_domainhash.c index f0cb92f3d..ada674222 100644 --- a/net/netlabel/netlabel_domainhash.c +++ b/net/netlabel/netlabel_domainhash.c @@ -55,8 +55,8 @@ struct netlbl_domhsh_tbl { static DEFINE_SPINLOCK(netlbl_domhsh_lock); #define netlbl_domhsh_rcu_deref(p) \ rcu_dereference_check(p, lockdep_is_held(&netlbl_domhsh_lock)) -static struct netlbl_domhsh_tbl *netlbl_domhsh = NULL; -static struct netlbl_dom_map *netlbl_domhsh_def = NULL; +static struct netlbl_domhsh_tbl *netlbl_domhsh; +static struct netlbl_dom_map *netlbl_domhsh_def; /* * Domain Hash Table Helper Functions diff --git a/net/netlabel/netlabel_unlabeled.c b/net/netlabel/netlabel_unlabeled.c index b0380927f..9eaa9a1e8 100644 --- a/net/netlabel/netlabel_unlabeled.c +++ b/net/netlabel/netlabel_unlabeled.c @@ -116,11 +116,11 @@ struct netlbl_unlhsh_walk_arg { static DEFINE_SPINLOCK(netlbl_unlhsh_lock); #define netlbl_unlhsh_rcu_deref(p) \ rcu_dereference_check(p, lockdep_is_held(&netlbl_unlhsh_lock)) -static struct netlbl_unlhsh_tbl *netlbl_unlhsh = NULL; -static struct netlbl_unlhsh_iface *netlbl_unlhsh_def = NULL; +static struct netlbl_unlhsh_tbl *netlbl_unlhsh; +static struct netlbl_unlhsh_iface *netlbl_unlhsh_def; /* Accept unlabeled packets flag */ -static u8 netlabel_unlabel_acceptflg = 0; +static u8 netlabel_unlabel_acceptflg; /* NetLabel Generic NETLINK unlabeled family */ static struct genl_family netlbl_unlabel_gnl_family = { diff --git a/net/netlink/Kconfig b/net/netlink/Kconfig index 2c5e95e9b..5d6e8c05b 100644 --- a/net/netlink/Kconfig +++ b/net/netlink/Kconfig @@ -2,15 +2,6 @@ # Netlink Sockets # -config NETLINK_MMAP - bool "NETLINK: mmaped IO" - ---help--- - This option enables support for memory mapped netlink IO. This - reduces overhead by avoiding copying data between kernel- and - userspace. - - If unsure, say N. - config NETLINK_DIAG tristate "NETLINK: socket monitoring interface" default n diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index d2bc03f0b..330ebd600 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -225,7 +225,7 @@ static int __netlink_deliver_tap_skb(struct sk_buff *skb, dev_hold(dev); - if (netlink_skb_is_mmaped(skb) || is_vmalloc_addr(skb->head)) + if (is_vmalloc_addr(skb->head)) nskb = netlink_to_full_skb(skb, GFP_ATOMIC); else nskb = skb_clone(skb, GFP_ATOMIC); @@ -300,610 +300,8 @@ static void netlink_rcv_wake(struct sock *sk) wake_up_interruptible(&nlk->wait); } -#ifdef CONFIG_NETLINK_MMAP -static bool netlink_rx_is_mmaped(struct sock *sk) -{ - return nlk_sk(sk)->rx_ring.pg_vec != NULL; -} - -static bool netlink_tx_is_mmaped(struct sock *sk) -{ - return nlk_sk(sk)->tx_ring.pg_vec != NULL; -} - -static __pure struct page *pgvec_to_page(const void *addr) -{ - if (is_vmalloc_addr(addr)) - return vmalloc_to_page(addr); - else - return virt_to_page(addr); -} - -static void free_pg_vec(void **pg_vec, unsigned int order, unsigned int len) -{ - unsigned int i; - - for (i = 0; i < len; i++) { - if (pg_vec[i] != NULL) { - if (is_vmalloc_addr(pg_vec[i])) - vfree(pg_vec[i]); - else - free_pages((unsigned long)pg_vec[i], order); - } - } - kfree(pg_vec); -} - -static void *alloc_one_pg_vec_page(unsigned long order) -{ - void *buffer; - gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP | __GFP_ZERO | - __GFP_NOWARN | __GFP_NORETRY; - - buffer = (void *)__get_free_pages(gfp_flags, order); - if (buffer != NULL) - return buffer; - - buffer = vzalloc((1 << order) * PAGE_SIZE); - if (buffer != NULL) - return buffer; - - gfp_flags &= ~__GFP_NORETRY; - return (void *)__get_free_pages(gfp_flags, order); -} - -static void **alloc_pg_vec(struct netlink_sock *nlk, - struct nl_mmap_req *req, unsigned int order) -{ - unsigned int block_nr = req->nm_block_nr; - unsigned int i; - void **pg_vec; - - pg_vec = kcalloc(block_nr, sizeof(void *), GFP_KERNEL); - if (pg_vec == NULL) - return NULL; - - for (i = 0; i < block_nr; i++) { - pg_vec[i] = alloc_one_pg_vec_page(order); - if (pg_vec[i] == NULL) - goto err1; - } - - return pg_vec; -err1: - free_pg_vec(pg_vec, order, block_nr); - return NULL; -} - - -static void -__netlink_set_ring(struct sock *sk, struct nl_mmap_req *req, bool tx_ring, void **pg_vec, - unsigned int order) -{ - struct netlink_sock *nlk = nlk_sk(sk); - struct sk_buff_head *queue; - struct netlink_ring *ring; - - queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue; - ring = tx_ring ? &nlk->tx_ring : &nlk->rx_ring; - - spin_lock_bh(&queue->lock); - - ring->frame_max = req->nm_frame_nr - 1; - ring->head = 0; - ring->frame_size = req->nm_frame_size; - ring->pg_vec_pages = req->nm_block_size / PAGE_SIZE; - - swap(ring->pg_vec_len, req->nm_block_nr); - swap(ring->pg_vec_order, order); - swap(ring->pg_vec, pg_vec); - - __skb_queue_purge(queue); - spin_unlock_bh(&queue->lock); - - WARN_ON(atomic_read(&nlk->mapped)); - - if (pg_vec) - free_pg_vec(pg_vec, order, req->nm_block_nr); -} - -static int netlink_set_ring(struct sock *sk, struct nl_mmap_req *req, - bool tx_ring) -{ - struct netlink_sock *nlk = nlk_sk(sk); - struct netlink_ring *ring; - void **pg_vec = NULL; - unsigned int order = 0; - - ring = tx_ring ? &nlk->tx_ring : &nlk->rx_ring; - - if (atomic_read(&nlk->mapped)) - return -EBUSY; - if (atomic_read(&ring->pending)) - return -EBUSY; - - if (req->nm_block_nr) { - if (ring->pg_vec != NULL) - return -EBUSY; - - if ((int)req->nm_block_size <= 0) - return -EINVAL; - if (!PAGE_ALIGNED(req->nm_block_size)) - return -EINVAL; - if (req->nm_frame_size < NL_MMAP_HDRLEN) - return -EINVAL; - if (!IS_ALIGNED(req->nm_frame_size, NL_MMAP_MSG_ALIGNMENT)) - return -EINVAL; - - ring->frames_per_block = req->nm_block_size / - req->nm_frame_size; - if (ring->frames_per_block == 0) - return -EINVAL; - if (ring->frames_per_block * req->nm_block_nr != - req->nm_frame_nr) - return -EINVAL; - - order = get_order(req->nm_block_size); - pg_vec = alloc_pg_vec(nlk, req, order); - if (pg_vec == NULL) - return -ENOMEM; - } else { - if (req->nm_frame_nr) - return -EINVAL; - } - - mutex_lock(&nlk->pg_vec_lock); - if (atomic_read(&nlk->mapped) == 0) { - __netlink_set_ring(sk, req, tx_ring, pg_vec, order); - mutex_unlock(&nlk->pg_vec_lock); - return 0; - } - - mutex_unlock(&nlk->pg_vec_lock); - - if (pg_vec) - free_pg_vec(pg_vec, order, req->nm_block_nr); - - return -EBUSY; -} - -static void netlink_mm_open(struct vm_area_struct *vma) -{ - struct file *file = vma->vm_file; - struct socket *sock = file->private_data; - struct sock *sk = sock->sk; - - if (sk) - atomic_inc(&nlk_sk(sk)->mapped); -} - -static void netlink_mm_close(struct vm_area_struct *vma) -{ - struct file *file = vma->vm_file; - struct socket *sock = file->private_data; - struct sock *sk = sock->sk; - - if (sk) - atomic_dec(&nlk_sk(sk)->mapped); -} - -static const struct vm_operations_struct netlink_mmap_ops = { - .open = netlink_mm_open, - .close = netlink_mm_close, -}; - -static int netlink_mmap(struct file *file, struct socket *sock, - struct vm_area_struct *vma) -{ - struct sock *sk = sock->sk; - struct netlink_sock *nlk = nlk_sk(sk); - struct netlink_ring *ring; - unsigned long start, size, expected; - unsigned int i; - int err = -EINVAL; - - if (vma->vm_pgoff) - return -EINVAL; - - mutex_lock(&nlk->pg_vec_lock); - - expected = 0; - for (ring = &nlk->rx_ring; ring <= &nlk->tx_ring; ring++) { - if (ring->pg_vec == NULL) - continue; - expected += ring->pg_vec_len * ring->pg_vec_pages * PAGE_SIZE; - } - - if (expected == 0) - goto out; - - size = vma->vm_end - vma->vm_start; - if (size != expected) - goto out; - - start = vma->vm_start; - for (ring = &nlk->rx_ring; ring <= &nlk->tx_ring; ring++) { - if (ring->pg_vec == NULL) - continue; - - for (i = 0; i < ring->pg_vec_len; i++) { - struct page *page; - void *kaddr = ring->pg_vec[i]; - unsigned int pg_num; - - for (pg_num = 0; pg_num < ring->pg_vec_pages; pg_num++) { - page = pgvec_to_page(kaddr); - err = vm_insert_page(vma, start, page); - if (err < 0) - goto out; - start += PAGE_SIZE; - kaddr += PAGE_SIZE; - } - } - } - - atomic_inc(&nlk->mapped); - vma->vm_ops = &netlink_mmap_ops; - err = 0; -out: - mutex_unlock(&nlk->pg_vec_lock); - return err; -} - -static void netlink_frame_flush_dcache(const struct nl_mmap_hdr *hdr, unsigned int nm_len) -{ -#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1 - struct page *p_start, *p_end; - - /* First page is flushed through netlink_{get,set}_status */ - p_start = pgvec_to_page(hdr + PAGE_SIZE); - p_end = pgvec_to_page((void *)hdr + NL_MMAP_HDRLEN + nm_len - 1); - while (p_start <= p_end) { - flush_dcache_page(p_start); - p_start++; - } -#endif -} - -static enum nl_mmap_status netlink_get_status(const struct nl_mmap_hdr *hdr) -{ - smp_rmb(); - flush_dcache_page(pgvec_to_page(hdr)); - return hdr->nm_status; -} - -static void netlink_set_status(struct nl_mmap_hdr *hdr, - enum nl_mmap_status status) -{ - smp_mb(); - hdr->nm_status = status; - flush_dcache_page(pgvec_to_page(hdr)); -} - -static struct nl_mmap_hdr * -__netlink_lookup_frame(const struct netlink_ring *ring, unsigned int pos) -{ - unsigned int pg_vec_pos, frame_off; - - pg_vec_pos = pos / ring->frames_per_block; - frame_off = pos % ring->frames_per_block; - - return ring->pg_vec[pg_vec_pos] + (frame_off * ring->frame_size); -} - -static struct nl_mmap_hdr * -netlink_lookup_frame(const struct netlink_ring *ring, unsigned int pos, - enum nl_mmap_status status) -{ - struct nl_mmap_hdr *hdr; - - hdr = __netlink_lookup_frame(ring, pos); - if (netlink_get_status(hdr) != status) - return NULL; - - return hdr; -} - -static struct nl_mmap_hdr * -netlink_current_frame(const struct netlink_ring *ring, - enum nl_mmap_status status) -{ - return netlink_lookup_frame(ring, ring->head, status); -} - -static void netlink_increment_head(struct netlink_ring *ring) -{ - ring->head = ring->head != ring->frame_max ? ring->head + 1 : 0; -} - -static void netlink_forward_ring(struct netlink_ring *ring) -{ - unsigned int head = ring->head; - const struct nl_mmap_hdr *hdr; - - do { - hdr = __netlink_lookup_frame(ring, ring->head); - if (hdr->nm_status == NL_MMAP_STATUS_UNUSED) - break; - if (hdr->nm_status != NL_MMAP_STATUS_SKIP) - break; - netlink_increment_head(ring); - } while (ring->head != head); -} - -static bool netlink_has_valid_frame(struct netlink_ring *ring) -{ - unsigned int head = ring->head, pos = head; - const struct nl_mmap_hdr *hdr; - - do { - hdr = __netlink_lookup_frame(ring, pos); - if (hdr->nm_status == NL_MMAP_STATUS_VALID) - return true; - pos = pos != 0 ? pos - 1 : ring->frame_max; - } while (pos != head); - - return false; -} - -static bool netlink_dump_space(struct netlink_sock *nlk) -{ - struct netlink_ring *ring = &nlk->rx_ring; - struct nl_mmap_hdr *hdr; - unsigned int n; - - hdr = netlink_current_frame(ring, NL_MMAP_STATUS_UNUSED); - if (hdr == NULL) - return false; - - n = ring->head + ring->frame_max / 2; - if (n > ring->frame_max) - n -= ring->frame_max; - - hdr = __netlink_lookup_frame(ring, n); - - return hdr->nm_status == NL_MMAP_STATUS_UNUSED; -} - -static unsigned int netlink_poll(struct file *file, struct socket *sock, - poll_table *wait) -{ - struct sock *sk = sock->sk; - struct netlink_sock *nlk = nlk_sk(sk); - unsigned int mask; - int err; - - if (nlk->rx_ring.pg_vec != NULL) { - /* Memory mapped sockets don't call recvmsg(), so flow control - * for dumps is performed here. A dump is allowed to continue - * if at least half the ring is unused. - */ - while (nlk->cb_running && netlink_dump_space(nlk)) { - err = netlink_dump(sk); - if (err < 0) { - sk->sk_err = -err; - sk->sk_error_report(sk); - break; - } - } - netlink_rcv_wake(sk); - } - - mask = datagram_poll(file, sock, wait); - - /* We could already have received frames in the normal receive - * queue, that will show up as NL_MMAP_STATUS_COPY in the ring, - * so if mask contains pollin/etc already, there's no point - * walking the ring. - */ - if ((mask & (POLLIN | POLLRDNORM)) != (POLLIN | POLLRDNORM)) { - spin_lock_bh(&sk->sk_receive_queue.lock); - if (nlk->rx_ring.pg_vec) { - if (netlink_has_valid_frame(&nlk->rx_ring)) - mask |= POLLIN | POLLRDNORM; - } - spin_unlock_bh(&sk->sk_receive_queue.lock); - } - - spin_lock_bh(&sk->sk_write_queue.lock); - if (nlk->tx_ring.pg_vec) { - if (netlink_current_frame(&nlk->tx_ring, NL_MMAP_STATUS_UNUSED)) - mask |= POLLOUT | POLLWRNORM; - } - spin_unlock_bh(&sk->sk_write_queue.lock); - - return mask; -} - -static struct nl_mmap_hdr *netlink_mmap_hdr(struct sk_buff *skb) -{ - return (struct nl_mmap_hdr *)(skb->head - NL_MMAP_HDRLEN); -} - -static void netlink_ring_setup_skb(struct sk_buff *skb, struct sock *sk, - struct netlink_ring *ring, - struct nl_mmap_hdr *hdr) -{ - unsigned int size; - void *data; - - size = ring->frame_size - NL_MMAP_HDRLEN; - data = (void *)hdr + NL_MMAP_HDRLEN; - - skb->head = data; - skb->data = data; - skb_reset_tail_pointer(skb); - skb->end = skb->tail + size; - skb->len = 0; - - skb->destructor = netlink_skb_destructor; - NETLINK_CB(skb).flags |= NETLINK_SKB_MMAPED; - NETLINK_CB(skb).sk = sk; -} - -static int netlink_mmap_sendmsg(struct sock *sk, struct msghdr *msg, - u32 dst_portid, u32 dst_group, - struct scm_cookie *scm) -{ - struct netlink_sock *nlk = nlk_sk(sk); - struct netlink_ring *ring; - struct nl_mmap_hdr *hdr; - struct sk_buff *skb; - unsigned int maxlen; - int err = 0, len = 0; - - mutex_lock(&nlk->pg_vec_lock); - - ring = &nlk->tx_ring; - maxlen = ring->frame_size - NL_MMAP_HDRLEN; - - do { - unsigned int nm_len; - - hdr = netlink_current_frame(ring, NL_MMAP_STATUS_VALID); - if (hdr == NULL) { - if (!(msg->msg_flags & MSG_DONTWAIT) && - atomic_read(&nlk->tx_ring.pending)) - schedule(); - continue; - } - - nm_len = ACCESS_ONCE(hdr->nm_len); - if (nm_len > maxlen) { - err = -EINVAL; - goto out; - } - - netlink_frame_flush_dcache(hdr, nm_len); - - skb = alloc_skb(nm_len, GFP_KERNEL); - if (skb == NULL) { - err = -ENOBUFS; - goto out; - } - __skb_put(skb, nm_len); - memcpy(skb->data, (void *)hdr + NL_MMAP_HDRLEN, nm_len); - netlink_set_status(hdr, NL_MMAP_STATUS_UNUSED); - - netlink_increment_head(ring); - - NETLINK_CB(skb).portid = nlk->portid; - NETLINK_CB(skb).dst_group = dst_group; - NETLINK_CB(skb).creds = scm->creds; - - err = security_netlink_send(sk, skb); - if (err) { - kfree_skb(skb); - goto out; - } - - if (unlikely(dst_group)) { - atomic_inc(&skb->users); - netlink_broadcast(sk, skb, dst_portid, dst_group, - GFP_KERNEL); - } - err = netlink_unicast(sk, skb, dst_portid, - msg->msg_flags & MSG_DONTWAIT); - if (err < 0) - goto out; - len += err; - - } while (hdr != NULL || - (!(msg->msg_flags & MSG_DONTWAIT) && - atomic_read(&nlk->tx_ring.pending))); - - if (len > 0) - err = len; -out: - mutex_unlock(&nlk->pg_vec_lock); - return err; -} - -static void netlink_queue_mmaped_skb(struct sock *sk, struct sk_buff *skb) -{ - struct nl_mmap_hdr *hdr; - - hdr = netlink_mmap_hdr(skb); - hdr->nm_len = skb->len; - hdr->nm_group = NETLINK_CB(skb).dst_group; - hdr->nm_pid = NETLINK_CB(skb).creds.pid; - hdr->nm_uid = from_kuid(sk_user_ns(sk), NETLINK_CB(skb).creds.uid); - hdr->nm_gid = from_kgid(sk_user_ns(sk), NETLINK_CB(skb).creds.gid); - netlink_frame_flush_dcache(hdr, hdr->nm_len); - netlink_set_status(hdr, NL_MMAP_STATUS_VALID); - - NETLINK_CB(skb).flags |= NETLINK_SKB_DELIVERED; - kfree_skb(skb); -} - -static void netlink_ring_set_copied(struct sock *sk, struct sk_buff *skb) -{ - struct netlink_sock *nlk = nlk_sk(sk); - struct netlink_ring *ring = &nlk->rx_ring; - struct nl_mmap_hdr *hdr; - - spin_lock_bh(&sk->sk_receive_queue.lock); - hdr = netlink_current_frame(ring, NL_MMAP_STATUS_UNUSED); - if (hdr == NULL) { - spin_unlock_bh(&sk->sk_receive_queue.lock); - kfree_skb(skb); - netlink_overrun(sk); - return; - } - netlink_increment_head(ring); - __skb_queue_tail(&sk->sk_receive_queue, skb); - spin_unlock_bh(&sk->sk_receive_queue.lock); - - hdr->nm_len = skb->len; - hdr->nm_group = NETLINK_CB(skb).dst_group; - hdr->nm_pid = NETLINK_CB(skb).creds.pid; - hdr->nm_uid = from_kuid(sk_user_ns(sk), NETLINK_CB(skb).creds.uid); - hdr->nm_gid = from_kgid(sk_user_ns(sk), NETLINK_CB(skb).creds.gid); - netlink_set_status(hdr, NL_MMAP_STATUS_COPY); -} - -#else /* CONFIG_NETLINK_MMAP */ -#define netlink_rx_is_mmaped(sk) false -#define netlink_tx_is_mmaped(sk) false -#define netlink_mmap sock_no_mmap -#define netlink_poll datagram_poll -#define netlink_mmap_sendmsg(sk, msg, dst_portid, dst_group, scm) 0 -#endif /* CONFIG_NETLINK_MMAP */ - static void netlink_skb_destructor(struct sk_buff *skb) { -#ifdef CONFIG_NETLINK_MMAP - struct nl_mmap_hdr *hdr; - struct netlink_ring *ring; - struct sock *sk; - - /* If a packet from the kernel to userspace was freed because of an - * error without being delivered to userspace, the kernel must reset - * the status. In the direction userspace to kernel, the status is - * always reset here after the packet was processed and freed. - */ - if (netlink_skb_is_mmaped(skb)) { - hdr = netlink_mmap_hdr(skb); - sk = NETLINK_CB(skb).sk; - - if (NETLINK_CB(skb).flags & NETLINK_SKB_TX) { - netlink_set_status(hdr, NL_MMAP_STATUS_UNUSED); - ring = &nlk_sk(sk)->tx_ring; - } else { - if (!(NETLINK_CB(skb).flags & NETLINK_SKB_DELIVERED)) { - hdr->nm_len = 0; - netlink_set_status(hdr, NL_MMAP_STATUS_VALID); - } - ring = &nlk_sk(sk)->rx_ring; - } - - WARN_ON(atomic_read(&ring->pending) == 0); - atomic_dec(&ring->pending); - sock_put(sk); - - skb->head = NULL; - } -#endif if (is_vmalloc_addr(skb->head)) { if (!skb->cloned || !atomic_dec_return(&(skb_shinfo(skb)->dataref))) @@ -937,18 +335,6 @@ static void netlink_sock_destruct(struct sock *sk) } skb_queue_purge(&sk->sk_receive_queue); -#ifdef CONFIG_NETLINK_MMAP - if (1) { - struct nl_mmap_req req; - - memset(&req, 0, sizeof(req)); - if (nlk->rx_ring.pg_vec) - __netlink_set_ring(sk, &req, false, NULL, 0); - memset(&req, 0, sizeof(req)); - if (nlk->tx_ring.pg_vec) - __netlink_set_ring(sk, &req, true, NULL, 0); - } -#endif /* CONFIG_NETLINK_MMAP */ if (!sock_flag(sk, SOCK_DEAD)) { printk(KERN_ERR "Freeing alive netlink socket %p\n", sk); @@ -1194,9 +580,6 @@ static int __netlink_create(struct net *net, struct socket *sock, mutex_init(nlk->cb_mutex); } init_waitqueue_head(&nlk->wait); -#ifdef CONFIG_NETLINK_MMAP - mutex_init(&nlk->pg_vec_lock); -#endif sk->sk_destruct = netlink_sock_destruct; sk->sk_protocol = protocol; @@ -1650,6 +1033,14 @@ static int netlink_getname(struct socket *sock, struct sockaddr *addr, return 0; } +static int netlink_ioctl(struct socket *sock, unsigned int cmd, + unsigned long arg) +{ + /* try to hand this ioctl down to the NIC drivers. + */ + return -ENOIOCTLCMD; +} + static struct sock *netlink_getsockbyportid(struct sock *ssk, u32 portid) { struct sock *sock; @@ -1728,8 +1119,7 @@ int netlink_attachskb(struct sock *sk, struct sk_buff *skb, nlk = nlk_sk(sk); if ((atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || - test_bit(NETLINK_S_CONGESTED, &nlk->state)) && - !netlink_skb_is_mmaped(skb)) { + test_bit(NETLINK_S_CONGESTED, &nlk->state))) { DECLARE_WAITQUEUE(wait, current); if (!*timeo) { if (!ssk || netlink_is_kernel(ssk)) @@ -1767,14 +1157,7 @@ static int __netlink_sendskb(struct sock *sk, struct sk_buff *skb) netlink_deliver_tap(skb); -#ifdef CONFIG_NETLINK_MMAP - if (netlink_skb_is_mmaped(skb)) - netlink_queue_mmaped_skb(sk, skb); - else if (netlink_rx_is_mmaped(sk)) - netlink_ring_set_copied(sk, skb); - else -#endif /* CONFIG_NETLINK_MMAP */ - skb_queue_tail(&sk->sk_receive_queue, skb); + skb_queue_tail(&sk->sk_receive_queue, skb); sk->sk_data_ready(sk); return len; } @@ -1798,9 +1181,6 @@ static struct sk_buff *netlink_trim(struct sk_buff *skb, gfp_t allocation) int delta; WARN_ON(skb->sk != NULL); - if (netlink_skb_is_mmaped(skb)) - return skb; - delta = skb->end - skb->tail; if (is_vmalloc_addr(skb->head) || delta * 2 < skb->truesize) return skb; @@ -1876,79 +1256,6 @@ retry: } EXPORT_SYMBOL(netlink_unicast); -struct sk_buff *__netlink_alloc_skb(struct sock *ssk, unsigned int size, - unsigned int ldiff, u32 dst_portid, - gfp_t gfp_mask) -{ -#ifdef CONFIG_NETLINK_MMAP - unsigned int maxlen, linear_size; - struct sock *sk = NULL; - struct sk_buff *skb; - struct netlink_ring *ring; - struct nl_mmap_hdr *hdr; - - sk = netlink_getsockbyportid(ssk, dst_portid); - if (IS_ERR(sk)) - goto out; - - ring = &nlk_sk(sk)->rx_ring; - /* fast-path without atomic ops for common case: non-mmaped receiver */ - if (ring->pg_vec == NULL) - goto out_put; - - /* We need to account the full linear size needed as a ring - * slot cannot have non-linear parts. - */ - linear_size = size + ldiff; - if (ring->frame_size - NL_MMAP_HDRLEN < linear_size) - goto out_put; - - skb = alloc_skb_head(gfp_mask); - if (skb == NULL) - goto err1; - - spin_lock_bh(&sk->sk_receive_queue.lock); - /* check again under lock */ - if (ring->pg_vec == NULL) - goto out_free; - - /* check again under lock */ - maxlen = ring->frame_size - NL_MMAP_HDRLEN; - if (maxlen < linear_size) - goto out_free; - - netlink_forward_ring(ring); - hdr = netlink_current_frame(ring, NL_MMAP_STATUS_UNUSED); - if (hdr == NULL) - goto err2; - - netlink_ring_setup_skb(skb, sk, ring, hdr); - netlink_set_status(hdr, NL_MMAP_STATUS_RESERVED); - atomic_inc(&ring->pending); - netlink_increment_head(ring); - - spin_unlock_bh(&sk->sk_receive_queue.lock); - return skb; - -err2: - kfree_skb(skb); - spin_unlock_bh(&sk->sk_receive_queue.lock); - netlink_overrun(sk); -err1: - sock_put(sk); - return NULL; - -out_free: - kfree_skb(skb); - spin_unlock_bh(&sk->sk_receive_queue.lock); -out_put: - sock_put(sk); -out: -#endif - return alloc_skb(size, gfp_mask); -} -EXPORT_SYMBOL_GPL(__netlink_alloc_skb); - int netlink_has_listeners(struct sock *sk, unsigned int group) { int res = 0; @@ -2225,8 +1532,7 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname, if (level != SOL_NETLINK) return -ENOPROTOOPT; - if (optname != NETLINK_RX_RING && optname != NETLINK_TX_RING && - optlen >= sizeof(int) && + if (optlen >= sizeof(int) && get_user(val, (unsigned int __user *)optval)) return -EFAULT; @@ -2279,25 +1585,6 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname, } err = 0; break; -#ifdef CONFIG_NETLINK_MMAP - case NETLINK_RX_RING: - case NETLINK_TX_RING: { - struct nl_mmap_req req; - - /* Rings might consume more memory than queue limits, require - * CAP_NET_ADMIN. - */ - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - if (optlen < sizeof(req)) - return -EINVAL; - if (copy_from_user(&req, optval, sizeof(req))) - return -EFAULT; - err = netlink_set_ring(sk, &req, - optname == NETLINK_TX_RING); - break; - } -#endif /* CONFIG_NETLINK_MMAP */ case NETLINK_LISTEN_ALL_NSID: if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_BROADCAST)) return -EPERM; @@ -2467,18 +1754,6 @@ static int netlink_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) smp_rmb(); } - /* It's a really convoluted way for userland to ask for mmaped - * sendmsg(), but that's what we've got... - */ - if (netlink_tx_is_mmaped(sk) && - iter_is_iovec(&msg->msg_iter) && - msg->msg_iter.nr_segs == 1 && - msg->msg_iter.iov->iov_base == NULL) { - err = netlink_mmap_sendmsg(sk, msg, dst_portid, dst_group, - &scm); - goto out; - } - err = -EMSGSIZE; if (len > sk->sk_sndbuf - 32) goto out; @@ -2794,8 +2069,7 @@ static int netlink_dump(struct sock *sk) goto errout_skb; } - if (!netlink_rx_is_mmaped(sk) && - atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) + if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) goto errout_skb; /* NLMSG_GOODSIZE is small to avoid high order allocations being @@ -2808,15 +2082,12 @@ static int netlink_dump(struct sock *sk) if (alloc_min_size < nlk->max_recvmsg_len) { alloc_size = nlk->max_recvmsg_len; - skb = netlink_alloc_skb(sk, alloc_size, nlk->portid, - GFP_KERNEL | - __GFP_NOWARN | - __GFP_NORETRY); + skb = alloc_skb(alloc_size, GFP_KERNEL | + __GFP_NOWARN | __GFP_NORETRY); } if (!skb) { alloc_size = alloc_min_size; - skb = netlink_alloc_skb(sk, alloc_size, nlk->portid, - GFP_KERNEL); + skb = alloc_skb(alloc_size, GFP_KERNEL); } if (!skb) goto errout_skb; @@ -2831,8 +2102,7 @@ static int netlink_dump(struct sock *sk) * reasonable static buffer based on the expected largest dump of a * single netdev. The outcome is MSG_TRUNC error. */ - if (!netlink_rx_is_mmaped(sk)) - skb_reserve(skb, skb_tailroom(skb) - alloc_size); + skb_reserve(skb, skb_tailroom(skb) - alloc_size); netlink_skb_set_owner_r(skb, sk); len = cb->dump(skb, cb); @@ -2884,16 +2154,7 @@ int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb, struct netlink_sock *nlk; int ret; - /* Memory mapped dump requests need to be copied to avoid looping - * on the pending state in netlink_mmap_sendmsg() while the CB hold - * a reference to the skb. - */ - if (netlink_skb_is_mmaped(skb)) { - skb = skb_copy(skb, GFP_KERNEL); - if (skb == NULL) - return -ENOBUFS; - } else - atomic_inc(&skb->users); + atomic_inc(&skb->users); sk = netlink_lookup(sock_net(ssk), ssk->sk_protocol, NETLINK_CB(skb).portid); if (sk == NULL) { @@ -2966,8 +2227,7 @@ void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err) if (!(nlk->flags & NETLINK_F_CAP_ACK) && err) payload += nlmsg_len(nlh); - skb = netlink_alloc_skb(in_skb->sk, nlmsg_total_size(payload), - NETLINK_CB(in_skb).portid, GFP_KERNEL); + skb = nlmsg_new(payload, GFP_KERNEL); if (!skb) { struct sock *sk; @@ -3241,15 +2501,15 @@ static const struct proto_ops netlink_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = netlink_getname, - .poll = netlink_poll, - .ioctl = sock_no_ioctl, + .poll = datagram_poll, + .ioctl = netlink_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = netlink_setsockopt, .getsockopt = netlink_getsockopt, .sendmsg = netlink_sendmsg, .recvmsg = netlink_recvmsg, - .mmap = netlink_mmap, + .mmap = sock_no_mmap, .sendpage = sock_no_sendpage, }; diff --git a/net/netlink/af_netlink.h b/net/netlink/af_netlink.h index 14437d9b1..e68ef9ccd 100644 --- a/net/netlink/af_netlink.h +++ b/net/netlink/af_netlink.h @@ -44,12 +44,6 @@ struct netlink_sock { int (*netlink_bind)(struct net *net, int group); void (*netlink_unbind)(struct net *net, int group); struct module *module; -#ifdef CONFIG_NETLINK_MMAP - struct mutex pg_vec_lock; - struct netlink_ring rx_ring; - struct netlink_ring tx_ring; - atomic_t mapped; -#endif /* CONFIG_NETLINK_MMAP */ struct rhash_head node; struct rcu_head rcu; @@ -60,15 +54,6 @@ static inline struct netlink_sock *nlk_sk(struct sock *sk) return container_of(sk, struct netlink_sock, sk); } -static inline bool netlink_skb_is_mmaped(const struct sk_buff *skb) -{ -#ifdef CONFIG_NETLINK_MMAP - return NETLINK_CB(skb).flags & NETLINK_SKB_MMAPED; -#else - return false; -#endif /* CONFIG_NETLINK_MMAP */ -} - struct netlink_table { struct rhashtable hash; struct hlist_head mc_list; diff --git a/net/netlink/diag.c b/net/netlink/diag.c index 3ee63a3cf..8dd836a8d 100644 --- a/net/netlink/diag.c +++ b/net/netlink/diag.c @@ -8,41 +8,6 @@ #include "af_netlink.h" -#ifdef CONFIG_NETLINK_MMAP -static int sk_diag_put_ring(struct netlink_ring *ring, int nl_type, - struct sk_buff *nlskb) -{ - struct netlink_diag_ring ndr; - - ndr.ndr_block_size = ring->pg_vec_pages << PAGE_SHIFT; - ndr.ndr_block_nr = ring->pg_vec_len; - ndr.ndr_frame_size = ring->frame_size; - ndr.ndr_frame_nr = ring->frame_max + 1; - - return nla_put(nlskb, nl_type, sizeof(ndr), &ndr); -} - -static int sk_diag_put_rings_cfg(struct sock *sk, struct sk_buff *nlskb) -{ - struct netlink_sock *nlk = nlk_sk(sk); - int ret; - - mutex_lock(&nlk->pg_vec_lock); - ret = sk_diag_put_ring(&nlk->rx_ring, NETLINK_DIAG_RX_RING, nlskb); - if (!ret) - ret = sk_diag_put_ring(&nlk->tx_ring, NETLINK_DIAG_TX_RING, - nlskb); - mutex_unlock(&nlk->pg_vec_lock); - - return ret; -} -#else -static int sk_diag_put_rings_cfg(struct sock *sk, struct sk_buff *nlskb) -{ - return 0; -} -#endif - static int sk_diag_dump_groups(struct sock *sk, struct sk_buff *nlskb) { struct netlink_sock *nlk = nlk_sk(sk); @@ -87,10 +52,6 @@ static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, sock_diag_put_meminfo(sk, skb, NETLINK_DIAG_MEMINFO)) goto out_nlmsg_trim; - if ((req->ndiag_show & NDIAG_SHOW_RING_CFG) && - sk_diag_put_rings_cfg(sk, skb)) - goto out_nlmsg_trim; - nlmsg_end(skb, nlh); return 0; diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index f830326b3..a09132a69 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -463,26 +463,6 @@ int genl_unregister_family(struct genl_family *family) EXPORT_SYMBOL(genl_unregister_family); /** - * genlmsg_new_unicast - Allocate generic netlink message for unicast - * @payload: size of the message payload - * @info: information on destination - * @flags: the type of memory to allocate - * - * Allocates a new sk_buff large enough to cover the specified payload - * plus required Netlink headers. Will check receiving socket for - * memory mapped i/o capability and use it if enabled. Will fall back - * to non-mapped skb if message size exceeds the frame size of the ring. - */ -struct sk_buff *genlmsg_new_unicast(size_t payload, struct genl_info *info, - gfp_t flags) -{ - size_t len = nlmsg_total_size(genlmsg_total_size(payload)); - - return netlink_alloc_skb(info->dst_sk, len, info->snd_portid, flags); -} -EXPORT_SYMBOL_GPL(genlmsg_new_unicast); - -/** * genlmsg_put - Add generic netlink header to netlink message * @skb: socket buffer holding the message * @portid: netlink portid the message is addressed to @@ -580,6 +560,10 @@ static int genl_family_rcv_msg(struct genl_family *family, !netlink_capable(skb, CAP_NET_ADMIN)) return -EPERM; + if ((ops->flags & GENL_UNS_ADMIN_PERM) && + !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) + return -EPERM; + if ((nlh->nlmsg_flags & NLM_F_DUMP) == NLM_F_DUMP) { int rc; @@ -638,7 +622,6 @@ static int genl_family_rcv_msg(struct genl_family *family, info.genlhdr = nlmsg_data(nlh); info.userhdr = nlmsg_data(nlh) + GENL_HDRLEN; info.attrs = attrbuf; - info.dst_sk = skb->sk; genl_info_net_set(&info, net); memset(&info.user_ptr, 0, sizeof(info.user_ptr)); diff --git a/net/nfc/llcp_commands.c b/net/nfc/llcp_commands.c index 3621a902c..3425532c3 100644 --- a/net/nfc/llcp_commands.c +++ b/net/nfc/llcp_commands.c @@ -663,7 +663,7 @@ int nfc_llcp_send_i_frame(struct nfc_llcp_sock *sock, return -ENOBUFS; } - msg_data = kzalloc(len, GFP_KERNEL); + msg_data = kmalloc(len, GFP_USER | __GFP_NOWARN); if (msg_data == NULL) return -ENOMEM; @@ -729,7 +729,7 @@ int nfc_llcp_send_ui_frame(struct nfc_llcp_sock *sock, u8 ssap, u8 dsap, if (local == NULL) return -ENODEV; - msg_data = kzalloc(len, GFP_KERNEL); + msg_data = kmalloc(len, GFP_USER | __GFP_NOWARN); if (msg_data == NULL) return -ENOMEM; diff --git a/net/nfc/llcp_sock.c b/net/nfc/llcp_sock.c index ecf0a0196..b9edf5fae 100644 --- a/net/nfc/llcp_sock.c +++ b/net/nfc/llcp_sock.c @@ -509,6 +509,11 @@ static int llcp_sock_getname(struct socket *sock, struct sockaddr *uaddr, memset(llcp_addr, 0, sizeof(*llcp_addr)); *len = sizeof(struct sockaddr_nfc_llcp); + lock_sock(sk); + if (!llcp_sock->dev) { + release_sock(sk); + return -EBADFD; + } llcp_addr->sa_family = AF_NFC; llcp_addr->dev_idx = llcp_sock->dev->idx; llcp_addr->target_idx = llcp_sock->target_idx; @@ -518,6 +523,7 @@ static int llcp_sock_getname(struct socket *sock, struct sockaddr *uaddr, llcp_addr->service_name_len = llcp_sock->service_name_len; memcpy(llcp_addr->service_name, llcp_sock->service_name, llcp_addr->service_name_len); + release_sock(sk); return 0; } diff --git a/net/nfc/nci/uart.c b/net/nfc/nci/uart.c index 21d887567..c468eabd6 100644 --- a/net/nfc/nci/uart.c +++ b/net/nfc/nci/uart.c @@ -171,14 +171,7 @@ static int nci_uart_tty_open(struct tty_struct *tty) tty->disc_data = NULL; tty->receive_room = 65536; - /* Flush any pending characters in the driver and line discipline. */ - - /* FIXME: why is this needed. Note don't use ldisc_ref here as the - * open path is before the ldisc is referencable. - */ - - if (tty->ldisc->ops->flush_buffer) - tty->ldisc->ops->flush_buffer(tty); + /* Flush any pending characters in the driver */ tty_driver_flush_buffer(tty); return 0; diff --git a/net/openvswitch/Kconfig b/net/openvswitch/Kconfig index d143aa9f6..ce947292a 100644 --- a/net/openvswitch/Kconfig +++ b/net/openvswitch/Kconfig @@ -6,10 +6,14 @@ config OPENVSWITCH tristate "Open vSwitch" depends on INET depends on !NF_CONNTRACK || \ - (NF_CONNTRACK && (!NF_DEFRAG_IPV6 || NF_DEFRAG_IPV6)) + (NF_CONNTRACK && ((!NF_DEFRAG_IPV6 || NF_DEFRAG_IPV6) && \ + (!NF_NAT || NF_NAT) && \ + (!NF_NAT_IPV4 || NF_NAT_IPV4) && \ + (!NF_NAT_IPV6 || NF_NAT_IPV6))) select LIBCRC32C select MPLS select NET_MPLS_GSO + select DST_CACHE ---help--- Open vSwitch is a multilayer Ethernet switch targeted at virtualized environments. In addition to supporting a variety of features diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 2d59df521..879185fe1 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -158,9 +158,7 @@ static int push_mpls(struct sk_buff *skb, struct sw_flow_key *key, new_mpls_lse = (__be32 *)skb_mpls_header(skb); *new_mpls_lse = mpls->mpls_lse; - if (skb->ip_summed == CHECKSUM_COMPLETE) - skb->csum = csum_add(skb->csum, csum_partial(new_mpls_lse, - MPLS_HLEN, 0)); + skb_postpush_rcsum(skb, new_mpls_lse, MPLS_HLEN); hdr = eth_hdr(skb); hdr->h_proto = mpls->mpls_ethertype; @@ -280,7 +278,7 @@ static int set_eth_addr(struct sk_buff *skb, struct sw_flow_key *flow_key, ether_addr_copy_masked(eth_hdr(skb)->h_dest, key->eth_dst, mask->eth_dst); - ovs_skb_postpush_rcsum(skb, eth_hdr(skb), ETH_ALEN * 2); + skb_postpush_rcsum(skb, eth_hdr(skb), ETH_ALEN * 2); ether_addr_copy(flow_key->eth.src, eth_hdr(skb)->h_source); ether_addr_copy(flow_key->eth.dst, eth_hdr(skb)->h_dest); @@ -463,7 +461,7 @@ static int set_ipv6(struct sk_buff *skb, struct sw_flow_key *flow_key, mask_ipv6_addr(saddr, key->ipv6_src, mask->ipv6_src, masked); if (unlikely(memcmp(saddr, masked, sizeof(masked)))) { - set_ipv6_addr(skb, key->ipv6_proto, saddr, masked, + set_ipv6_addr(skb, flow_key->ip.proto, saddr, masked, true); memcpy(&flow_key->ipv6.addr.src, masked, sizeof(flow_key->ipv6.addr.src)); @@ -485,7 +483,7 @@ static int set_ipv6(struct sk_buff *skb, struct sw_flow_key *flow_key, NULL, &flags) != NEXTHDR_ROUTING); - set_ipv6_addr(skb, key->ipv6_proto, daddr, masked, + set_ipv6_addr(skb, flow_key->ip.proto, daddr, masked, recalc_csum); memcpy(&flow_key->ipv6.addr.dst, masked, sizeof(flow_key->ipv6.addr.dst)); @@ -639,7 +637,7 @@ static int ovs_vport_output(struct net *net, struct sock *sk, struct sk_buff *sk /* Reconstruct the MAC header. */ skb_push(skb, data->l2_len); memcpy(skb->data, &data->l2_data, data->l2_len); - ovs_skb_postpush_rcsum(skb, skb->data, data->l2_len); + skb_postpush_rcsum(skb, skb->data, data->l2_len); skb_reset_mac_header(skb); ovs_vport_send(vport, skb); diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index ee6ff8ffc..10c84d882 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -13,21 +13,31 @@ #include <linux/module.h> #include <linux/openvswitch.h> +#include <linux/tcp.h> +#include <linux/udp.h> +#include <linux/sctp.h> #include <net/ip.h> #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_helper.h> #include <net/netfilter/nf_conntrack_labels.h> +#include <net/netfilter/nf_conntrack_seqadj.h> #include <net/netfilter/nf_conntrack_zones.h> #include <net/netfilter/ipv6/nf_defrag_ipv6.h> +#ifdef CONFIG_NF_NAT_NEEDED +#include <linux/netfilter/nf_nat.h> +#include <net/netfilter/nf_nat_core.h> +#include <net/netfilter/nf_nat_l3proto.h> +#endif + #include "datapath.h" #include "conntrack.h" #include "flow.h" #include "flow_netlink.h" struct ovs_ct_len_tbl { - size_t maxlen; - size_t minlen; + int maxlen; + int minlen; }; /* Metadata mark for masked write to conntrack mark */ @@ -42,15 +52,25 @@ struct md_labels { struct ovs_key_ct_labels mask; }; +enum ovs_ct_nat { + OVS_CT_NAT = 1 << 0, /* NAT for committed connections only. */ + OVS_CT_SRC_NAT = 1 << 1, /* Source NAT for NEW connections. */ + OVS_CT_DST_NAT = 1 << 2, /* Destination NAT for NEW connections. */ +}; + /* Conntrack action context for execution. */ struct ovs_conntrack_info { struct nf_conntrack_helper *helper; struct nf_conntrack_zone zone; struct nf_conn *ct; u8 commit : 1; + u8 nat : 3; /* enum ovs_ct_nat */ u16 family; struct md_mark mark; struct md_labels labels; +#ifdef CONFIG_NF_NAT_NEEDED + struct nf_nat_range range; /* Only present for SRC NAT and DST NAT. */ +#endif }; static void __ovs_ct_free_action(struct ovs_conntrack_info *ct_info); @@ -75,7 +95,6 @@ static u8 ovs_ct_get_state(enum ip_conntrack_info ctinfo) switch (ctinfo) { case IP_CT_ESTABLISHED_REPLY: case IP_CT_RELATED_REPLY: - case IP_CT_NEW_REPLY: ct_state |= OVS_CS_F_REPLY_DIR; break; default: @@ -92,7 +111,6 @@ static u8 ovs_ct_get_state(enum ip_conntrack_info ctinfo) ct_state |= OVS_CS_F_RELATED; break; case IP_CT_NEW: - case IP_CT_NEW_REPLY: ct_state |= OVS_CS_F_NEW; break; default: @@ -139,12 +157,15 @@ static void __ovs_ct_update_key(struct sw_flow_key *key, u8 state, ovs_ct_get_labels(ct, &key->ct.labels); } -/* Update 'key' based on skb->nfct. If 'post_ct' is true, then OVS has - * previously sent the packet to conntrack via the ct action. +/* Update 'key' based on skb->nfct. If 'post_ct' is true, then OVS has + * previously sent the packet to conntrack via the ct action. If + * 'keep_nat_flags' is true, the existing NAT flags retained, else they are + * initialized from the connection status. */ static void ovs_ct_update_key(const struct sk_buff *skb, const struct ovs_conntrack_info *info, - struct sw_flow_key *key, bool post_ct) + struct sw_flow_key *key, bool post_ct, + bool keep_nat_flags) { const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt; enum ip_conntrack_info ctinfo; @@ -154,10 +175,22 @@ static void ovs_ct_update_key(const struct sk_buff *skb, ct = nf_ct_get(skb, &ctinfo); if (ct) { state = ovs_ct_get_state(ctinfo); + /* All unconfirmed entries are NEW connections. */ if (!nf_ct_is_confirmed(ct)) state |= OVS_CS_F_NEW; + /* OVS persists the related flag for the duration of the + * connection. + */ if (ct->master) state |= OVS_CS_F_RELATED; + if (keep_nat_flags) { + state |= key->ct.state & OVS_CS_F_NAT_MASK; + } else { + if (ct->status & IPS_SRC_NAT) + state |= OVS_CS_F_SRC_NAT; + if (ct->status & IPS_DST_NAT) + state |= OVS_CS_F_DST_NAT; + } zone = nf_ct_zone(ct); } else if (post_ct) { state = OVS_CS_F_TRACKED | OVS_CS_F_INVALID; @@ -167,9 +200,12 @@ static void ovs_ct_update_key(const struct sk_buff *skb, __ovs_ct_update_key(key, state, zone, ct); } +/* This is called to initialize CT key fields possibly coming in from the local + * stack. + */ void ovs_ct_fill_key(const struct sk_buff *skb, struct sw_flow_key *key) { - ovs_ct_update_key(skb, NULL, key, false); + ovs_ct_update_key(skb, NULL, key, false, false); } int ovs_ct_put_key(const struct sw_flow_key *key, struct sk_buff *skb) @@ -201,7 +237,6 @@ static int ovs_ct_set_mark(struct sk_buff *skb, struct sw_flow_key *key, struct nf_conn *ct; u32 new_mark; - /* The connection could be invalid, in which case set_mark is no-op. */ ct = nf_ct_get(skb, &ctinfo); if (!ct) @@ -259,6 +294,7 @@ static int ovs_ct_helper(struct sk_buff *skb, u16 proto) enum ip_conntrack_info ctinfo; unsigned int protoff; struct nf_conn *ct; + int err; ct = nf_ct_get(skb, &ctinfo); if (!ct || ctinfo == IP_CT_RELATED_REPLY) @@ -295,7 +331,18 @@ static int ovs_ct_helper(struct sk_buff *skb, u16 proto) return NF_DROP; } - return helper->help(skb, protoff, ct, ctinfo); + err = helper->help(skb, protoff, ct, ctinfo); + if (err != NF_ACCEPT) + return err; + + /* Adjust seqs after helper. This is needed due to some helpers (e.g., + * FTP with NAT) adusting the TCP payload size when mangling IP + * addresses and/or port numbers in the text-based control connection. + */ + if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) && + !nf_ct_seq_adjust(skb, ct, ctinfo, protoff)) + return NF_DROP; + return NF_ACCEPT; } /* Returns 0 on success, -EINPROGRESS if 'skb' is stolen, or other nonzero @@ -320,6 +367,7 @@ static int handle_fragments(struct net *net, struct sw_flow_key *key, } else if (key->eth.type == htons(ETH_P_IPV6)) { enum ip6_defrag_users user = IP6_DEFRAG_CONNTRACK_IN + zone; + skb_orphan(skb); memset(IP6CB(skb), 0, sizeof(struct inet6_skb_parm)); err = nf_ct_frag6_gather(net, skb, user); if (err) @@ -352,14 +400,101 @@ ovs_ct_expect_find(struct net *net, const struct nf_conntrack_zone *zone, return __nf_ct_expect_find(net, zone, &tuple); } +/* This replicates logic from nf_conntrack_core.c that is not exported. */ +static enum ip_conntrack_info +ovs_ct_get_info(const struct nf_conntrack_tuple_hash *h) +{ + const struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); + + if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) + return IP_CT_ESTABLISHED_REPLY; + /* Once we've had two way comms, always ESTABLISHED. */ + if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) + return IP_CT_ESTABLISHED; + if (test_bit(IPS_EXPECTED_BIT, &ct->status)) + return IP_CT_RELATED; + return IP_CT_NEW; +} + +/* Find an existing connection which this packet belongs to without + * re-attributing statistics or modifying the connection state. This allows an + * skb->nfct lost due to an upcall to be recovered during actions execution. + * + * Must be called with rcu_read_lock. + * + * On success, populates skb->nfct and skb->nfctinfo, and returns the + * connection. Returns NULL if there is no existing entry. + */ +static struct nf_conn * +ovs_ct_find_existing(struct net *net, const struct nf_conntrack_zone *zone, + u8 l3num, struct sk_buff *skb) +{ + struct nf_conntrack_l3proto *l3proto; + struct nf_conntrack_l4proto *l4proto; + struct nf_conntrack_tuple tuple; + struct nf_conntrack_tuple_hash *h; + enum ip_conntrack_info ctinfo; + struct nf_conn *ct; + unsigned int dataoff; + u8 protonum; + + l3proto = __nf_ct_l3proto_find(l3num); + if (!l3proto) { + pr_debug("ovs_ct_find_existing: Can't get l3proto\n"); + return NULL; + } + if (l3proto->get_l4proto(skb, skb_network_offset(skb), &dataoff, + &protonum) <= 0) { + pr_debug("ovs_ct_find_existing: Can't get protonum\n"); + return NULL; + } + l4proto = __nf_ct_l4proto_find(l3num, protonum); + if (!l4proto) { + pr_debug("ovs_ct_find_existing: Can't get l4proto\n"); + return NULL; + } + if (!nf_ct_get_tuple(skb, skb_network_offset(skb), dataoff, l3num, + protonum, net, &tuple, l3proto, l4proto)) { + pr_debug("ovs_ct_find_existing: Can't get tuple\n"); + return NULL; + } + + /* look for tuple match */ + h = nf_conntrack_find_get(net, zone, &tuple); + if (!h) + return NULL; /* Not found. */ + + ct = nf_ct_tuplehash_to_ctrack(h); + + ctinfo = ovs_ct_get_info(h); + if (ctinfo == IP_CT_NEW) { + /* This should not happen. */ + WARN_ONCE(1, "ovs_ct_find_existing: new packet for %p\n", ct); + } + skb->nfct = &ct->ct_general; + skb->nfctinfo = ctinfo; + return ct; +} + /* Determine whether skb->nfct is equal to the result of conntrack lookup. */ -static bool skb_nfct_cached(const struct net *net, const struct sk_buff *skb, - const struct ovs_conntrack_info *info) +static bool skb_nfct_cached(struct net *net, + const struct sw_flow_key *key, + const struct ovs_conntrack_info *info, + struct sk_buff *skb) { enum ip_conntrack_info ctinfo; struct nf_conn *ct; ct = nf_ct_get(skb, &ctinfo); + /* If no ct, check if we have evidence that an existing conntrack entry + * might be found for this skb. This happens when we lose a skb->nfct + * due to an upcall. If the connection was not confirmed, it is not + * cached and needs to be run through conntrack again. + */ + if (!ct && key->ct.state & OVS_CS_F_TRACKED && + !(key->ct.state & OVS_CS_F_INVALID) && + key->ct.zone == info->zone.id) + ct = ovs_ct_find_existing(net, &info->zone, info->family, skb); if (!ct) return false; if (!net_eq(net, read_pnet(&ct->ct_net))) @@ -377,6 +512,207 @@ static bool skb_nfct_cached(const struct net *net, const struct sk_buff *skb, return true; } +#ifdef CONFIG_NF_NAT_NEEDED +/* Modelled after nf_nat_ipv[46]_fn(). + * range is only used for new, uninitialized NAT state. + * Returns either NF_ACCEPT or NF_DROP. + */ +static int ovs_ct_nat_execute(struct sk_buff *skb, struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + const struct nf_nat_range *range, + enum nf_nat_manip_type maniptype) +{ + int hooknum, nh_off, err = NF_ACCEPT; + + nh_off = skb_network_offset(skb); + skb_pull(skb, nh_off); + + /* See HOOK2MANIP(). */ + if (maniptype == NF_NAT_MANIP_SRC) + hooknum = NF_INET_LOCAL_IN; /* Source NAT */ + else + hooknum = NF_INET_LOCAL_OUT; /* Destination NAT */ + + switch (ctinfo) { + case IP_CT_RELATED: + case IP_CT_RELATED_REPLY: + if (IS_ENABLED(CONFIG_NF_NAT_IPV4) && + skb->protocol == htons(ETH_P_IP) && + ip_hdr(skb)->protocol == IPPROTO_ICMP) { + if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo, + hooknum)) + err = NF_DROP; + goto push; + } else if (IS_ENABLED(CONFIG_NF_NAT_IPV6) && + skb->protocol == htons(ETH_P_IPV6)) { + __be16 frag_off; + u8 nexthdr = ipv6_hdr(skb)->nexthdr; + int hdrlen = ipv6_skip_exthdr(skb, + sizeof(struct ipv6hdr), + &nexthdr, &frag_off); + + if (hdrlen >= 0 && nexthdr == IPPROTO_ICMPV6) { + if (!nf_nat_icmpv6_reply_translation(skb, ct, + ctinfo, + hooknum, + hdrlen)) + err = NF_DROP; + goto push; + } + } + /* Non-ICMP, fall thru to initialize if needed. */ + case IP_CT_NEW: + /* Seen it before? This can happen for loopback, retrans, + * or local packets. + */ + if (!nf_nat_initialized(ct, maniptype)) { + /* Initialize according to the NAT action. */ + err = (range && range->flags & NF_NAT_RANGE_MAP_IPS) + /* Action is set up to establish a new + * mapping. + */ + ? nf_nat_setup_info(ct, range, maniptype) + : nf_nat_alloc_null_binding(ct, hooknum); + if (err != NF_ACCEPT) + goto push; + } + break; + + case IP_CT_ESTABLISHED: + case IP_CT_ESTABLISHED_REPLY: + break; + + default: + err = NF_DROP; + goto push; + } + + err = nf_nat_packet(ct, ctinfo, hooknum, skb); +push: + skb_push(skb, nh_off); + + return err; +} + +static void ovs_nat_update_key(struct sw_flow_key *key, + const struct sk_buff *skb, + enum nf_nat_manip_type maniptype) +{ + if (maniptype == NF_NAT_MANIP_SRC) { + __be16 src; + + key->ct.state |= OVS_CS_F_SRC_NAT; + if (key->eth.type == htons(ETH_P_IP)) + key->ipv4.addr.src = ip_hdr(skb)->saddr; + else if (key->eth.type == htons(ETH_P_IPV6)) + memcpy(&key->ipv6.addr.src, &ipv6_hdr(skb)->saddr, + sizeof(key->ipv6.addr.src)); + else + return; + + if (key->ip.proto == IPPROTO_UDP) + src = udp_hdr(skb)->source; + else if (key->ip.proto == IPPROTO_TCP) + src = tcp_hdr(skb)->source; + else if (key->ip.proto == IPPROTO_SCTP) + src = sctp_hdr(skb)->source; + else + return; + + key->tp.src = src; + } else { + __be16 dst; + + key->ct.state |= OVS_CS_F_DST_NAT; + if (key->eth.type == htons(ETH_P_IP)) + key->ipv4.addr.dst = ip_hdr(skb)->daddr; + else if (key->eth.type == htons(ETH_P_IPV6)) + memcpy(&key->ipv6.addr.dst, &ipv6_hdr(skb)->daddr, + sizeof(key->ipv6.addr.dst)); + else + return; + + if (key->ip.proto == IPPROTO_UDP) + dst = udp_hdr(skb)->dest; + else if (key->ip.proto == IPPROTO_TCP) + dst = tcp_hdr(skb)->dest; + else if (key->ip.proto == IPPROTO_SCTP) + dst = sctp_hdr(skb)->dest; + else + return; + + key->tp.dst = dst; + } +} + +/* Returns NF_DROP if the packet should be dropped, NF_ACCEPT otherwise. */ +static int ovs_ct_nat(struct net *net, struct sw_flow_key *key, + const struct ovs_conntrack_info *info, + struct sk_buff *skb, struct nf_conn *ct, + enum ip_conntrack_info ctinfo) +{ + enum nf_nat_manip_type maniptype; + int err; + + if (nf_ct_is_untracked(ct)) { + /* A NAT action may only be performed on tracked packets. */ + return NF_ACCEPT; + } + + /* Add NAT extension if not confirmed yet. */ + if (!nf_ct_is_confirmed(ct) && !nf_ct_nat_ext_add(ct)) + return NF_ACCEPT; /* Can't NAT. */ + + /* Determine NAT type. + * Check if the NAT type can be deduced from the tracked connection. + * Make sure new expected connections (IP_CT_RELATED) are NATted only + * when committing. + */ + if (info->nat & OVS_CT_NAT && ctinfo != IP_CT_NEW && + ct->status & IPS_NAT_MASK && + (ctinfo != IP_CT_RELATED || info->commit)) { + /* NAT an established or related connection like before. */ + if (CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY) + /* This is the REPLY direction for a connection + * for which NAT was applied in the forward + * direction. Do the reverse NAT. + */ + maniptype = ct->status & IPS_SRC_NAT + ? NF_NAT_MANIP_DST : NF_NAT_MANIP_SRC; + else + maniptype = ct->status & IPS_SRC_NAT + ? NF_NAT_MANIP_SRC : NF_NAT_MANIP_DST; + } else if (info->nat & OVS_CT_SRC_NAT) { + maniptype = NF_NAT_MANIP_SRC; + } else if (info->nat & OVS_CT_DST_NAT) { + maniptype = NF_NAT_MANIP_DST; + } else { + return NF_ACCEPT; /* Connection is not NATed. */ + } + err = ovs_ct_nat_execute(skb, ct, ctinfo, &info->range, maniptype); + + /* Mark NAT done if successful and update the flow key. */ + if (err == NF_ACCEPT) + ovs_nat_update_key(key, skb, maniptype); + + return err; +} +#else /* !CONFIG_NF_NAT_NEEDED */ +static int ovs_ct_nat(struct net *net, struct sw_flow_key *key, + const struct ovs_conntrack_info *info, + struct sk_buff *skb, struct nf_conn *ct, + enum ip_conntrack_info ctinfo) +{ + return NF_ACCEPT; +} +#endif + +/* Pass 'skb' through conntrack in 'net', using zone configured in 'info', if + * not done already. Update key with new CT state after passing the packet + * through conntrack. + * Note that if the packet is deemed invalid by conntrack, skb->nfct will be + * set to NULL and 0 will be returned. + */ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key, const struct ovs_conntrack_info *info, struct sk_buff *skb) @@ -386,8 +722,13 @@ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key, * actually run the packet through conntrack twice unless it's for a * different zone. */ - if (!skb_nfct_cached(net, skb, info)) { + bool cached = skb_nfct_cached(net, key, info, skb); + enum ip_conntrack_info ctinfo; + struct nf_conn *ct; + + if (!cached) { struct nf_conn *tmpl = info->ct; + int err; /* Associate skb with specified zone. */ if (tmpl) { @@ -398,17 +739,66 @@ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key, skb->nfctinfo = IP_CT_NEW; } - if (nf_conntrack_in(net, info->family, NF_INET_PRE_ROUTING, - skb) != NF_ACCEPT) + /* Repeat if requested, see nf_iterate(). */ + do { + err = nf_conntrack_in(net, info->family, + NF_INET_PRE_ROUTING, skb); + } while (err == NF_REPEAT); + + if (err != NF_ACCEPT) return -ENOENT; - if (ovs_ct_helper(skb, info->family) != NF_ACCEPT) { - WARN_ONCE(1, "helper rejected packet"); + /* Clear CT state NAT flags to mark that we have not yet done + * NAT after the nf_conntrack_in() call. We can actually clear + * the whole state, as it will be re-initialized below. + */ + key->ct.state = 0; + + /* Update the key, but keep the NAT flags. */ + ovs_ct_update_key(skb, info, key, true, true); + } + + ct = nf_ct_get(skb, &ctinfo); + if (ct) { + /* Packets starting a new connection must be NATted before the + * helper, so that the helper knows about the NAT. We enforce + * this by delaying both NAT and helper calls for unconfirmed + * connections until the committing CT action. For later + * packets NAT and Helper may be called in either order. + * + * NAT will be done only if the CT action has NAT, and only + * once per packet (per zone), as guarded by the NAT bits in + * the key->ct.state. + */ + if (info->nat && !(key->ct.state & OVS_CS_F_NAT_MASK) && + (nf_ct_is_confirmed(ct) || info->commit) && + ovs_ct_nat(net, key, info, skb, ct, ctinfo) != NF_ACCEPT) { return -EINVAL; } - } - ovs_ct_update_key(skb, info, key, true); + /* Userspace may decide to perform a ct lookup without a helper + * specified followed by a (recirculate and) commit with one. + * Therefore, for unconfirmed connections which we will commit, + * we need to attach the helper here. + */ + if (!nf_ct_is_confirmed(ct) && info->commit && + info->helper && !nfct_help(ct)) { + int err = __nf_ct_try_assign_helper(ct, info->ct, + GFP_ATOMIC); + if (err) + return err; + } + + /* Call the helper only if: + * - nf_conntrack_in() was executed above ("!cached") for a + * confirmed connection, or + * - When committing an unconfirmed connection. + */ + if ((nf_ct_is_confirmed(ct) ? !cached : info->commit) && + ovs_ct_helper(skb, info->family) != NF_ACCEPT) { + return -EINVAL; + } + } return 0; } @@ -420,19 +810,24 @@ static int ovs_ct_lookup(struct net *net, struct sw_flow_key *key, { struct nf_conntrack_expect *exp; + /* If we pass an expected packet through nf_conntrack_in() the + * expectation is typically removed, but the packet could still be + * lost in upcall processing. To prevent this from happening we + * perform an explicit expectation lookup. Expected connections are + * always new, and will be passed through conntrack only when they are + * committed, as it is OK to remove the expectation at that time. + */ exp = ovs_ct_expect_find(net, &info->zone, info->family, skb); if (exp) { u8 state; + /* NOTE: New connections are NATted and Helped only when + * committed, so we are not calling into NAT here. + */ state = OVS_CS_F_TRACKED | OVS_CS_F_NEW | OVS_CS_F_RELATED; __ovs_ct_update_key(key, state, &info->zone, exp->master); - } else { - int err; - - err = __ovs_ct_lookup(net, key, info, skb); - if (err) - return err; - } + } else + return __ovs_ct_lookup(net, key, info, skb); return 0; } @@ -442,21 +837,12 @@ static int ovs_ct_commit(struct net *net, struct sw_flow_key *key, const struct ovs_conntrack_info *info, struct sk_buff *skb) { - u8 state; int err; - state = key->ct.state; - if (key->ct.zone == info->zone.id && - ((state & OVS_CS_F_TRACKED) && !(state & OVS_CS_F_NEW))) { - /* Previous lookup has shown that this connection is already - * tracked and committed. Skip committing. - */ - return 0; - } - err = __ovs_ct_lookup(net, key, info, skb); if (err) return err; + /* This is a no-op if the connection has already been confirmed. */ if (nf_conntrack_confirm(skb) != NF_ACCEPT) return -EINVAL; @@ -541,6 +927,136 @@ static int ovs_ct_add_helper(struct ovs_conntrack_info *info, const char *name, return 0; } +#ifdef CONFIG_NF_NAT_NEEDED +static int parse_nat(const struct nlattr *attr, + struct ovs_conntrack_info *info, bool log) +{ + struct nlattr *a; + int rem; + bool have_ip_max = false; + bool have_proto_max = false; + bool ip_vers = (info->family == NFPROTO_IPV6); + + nla_for_each_nested(a, attr, rem) { + static const int ovs_nat_attr_lens[OVS_NAT_ATTR_MAX + 1][2] = { + [OVS_NAT_ATTR_SRC] = {0, 0}, + [OVS_NAT_ATTR_DST] = {0, 0}, + [OVS_NAT_ATTR_IP_MIN] = {sizeof(struct in_addr), + sizeof(struct in6_addr)}, + [OVS_NAT_ATTR_IP_MAX] = {sizeof(struct in_addr), + sizeof(struct in6_addr)}, + [OVS_NAT_ATTR_PROTO_MIN] = {sizeof(u16), sizeof(u16)}, + [OVS_NAT_ATTR_PROTO_MAX] = {sizeof(u16), sizeof(u16)}, + [OVS_NAT_ATTR_PERSISTENT] = {0, 0}, + [OVS_NAT_ATTR_PROTO_HASH] = {0, 0}, + [OVS_NAT_ATTR_PROTO_RANDOM] = {0, 0}, + }; + int type = nla_type(a); + + if (type > OVS_NAT_ATTR_MAX) { + OVS_NLERR(log, + "Unknown NAT attribute (type=%d, max=%d).\n", + type, OVS_NAT_ATTR_MAX); + return -EINVAL; + } + + if (nla_len(a) != ovs_nat_attr_lens[type][ip_vers]) { + OVS_NLERR(log, + "NAT attribute type %d has unexpected length (%d != %d).\n", + type, nla_len(a), + ovs_nat_attr_lens[type][ip_vers]); + return -EINVAL; + } + + switch (type) { + case OVS_NAT_ATTR_SRC: + case OVS_NAT_ATTR_DST: + if (info->nat) { + OVS_NLERR(log, + "Only one type of NAT may be specified.\n" + ); + return -ERANGE; + } + info->nat |= OVS_CT_NAT; + info->nat |= ((type == OVS_NAT_ATTR_SRC) + ? OVS_CT_SRC_NAT : OVS_CT_DST_NAT); + break; + + case OVS_NAT_ATTR_IP_MIN: + nla_memcpy(&info->range.min_addr, a, + sizeof(info->range.min_addr)); + info->range.flags |= NF_NAT_RANGE_MAP_IPS; + break; + + case OVS_NAT_ATTR_IP_MAX: + have_ip_max = true; + nla_memcpy(&info->range.max_addr, a, + sizeof(info->range.max_addr)); + info->range.flags |= NF_NAT_RANGE_MAP_IPS; + break; + + case OVS_NAT_ATTR_PROTO_MIN: + info->range.min_proto.all = htons(nla_get_u16(a)); + info->range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED; + break; + + case OVS_NAT_ATTR_PROTO_MAX: + have_proto_max = true; + info->range.max_proto.all = htons(nla_get_u16(a)); + info->range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED; + break; + + case OVS_NAT_ATTR_PERSISTENT: + info->range.flags |= NF_NAT_RANGE_PERSISTENT; + break; + + case OVS_NAT_ATTR_PROTO_HASH: + info->range.flags |= NF_NAT_RANGE_PROTO_RANDOM; + break; + + case OVS_NAT_ATTR_PROTO_RANDOM: + info->range.flags |= NF_NAT_RANGE_PROTO_RANDOM_FULLY; + break; + + default: + OVS_NLERR(log, "Unknown nat attribute (%d).\n", type); + return -EINVAL; + } + } + + if (rem > 0) { + OVS_NLERR(log, "NAT attribute has %d unknown bytes.\n", rem); + return -EINVAL; + } + if (!info->nat) { + /* Do not allow flags if no type is given. */ + if (info->range.flags) { + OVS_NLERR(log, + "NAT flags may be given only when NAT range (SRC or DST) is also specified.\n" + ); + return -EINVAL; + } + info->nat = OVS_CT_NAT; /* NAT existing connections. */ + } else if (!info->commit) { + OVS_NLERR(log, + "NAT attributes may be specified only when CT COMMIT flag is also specified.\n" + ); + return -EINVAL; + } + /* Allow missing IP_MAX. */ + if (info->range.flags & NF_NAT_RANGE_MAP_IPS && !have_ip_max) { + memcpy(&info->range.max_addr, &info->range.min_addr, + sizeof(info->range.max_addr)); + } + /* Allow missing PROTO_MAX. */ + if (info->range.flags & NF_NAT_RANGE_PROTO_SPECIFIED && + !have_proto_max) { + info->range.max_proto.all = info->range.min_proto.all; + } + return 0; +} +#endif + static const struct ovs_ct_len_tbl ovs_ct_attr_lens[OVS_CT_ATTR_MAX + 1] = { [OVS_CT_ATTR_COMMIT] = { .minlen = 0, .maxlen = 0 }, [OVS_CT_ATTR_ZONE] = { .minlen = sizeof(u16), @@ -550,7 +1066,11 @@ static const struct ovs_ct_len_tbl ovs_ct_attr_lens[OVS_CT_ATTR_MAX + 1] = { [OVS_CT_ATTR_LABELS] = { .minlen = sizeof(struct md_labels), .maxlen = sizeof(struct md_labels) }, [OVS_CT_ATTR_HELPER] = { .minlen = 1, - .maxlen = NF_CT_HELPER_NAME_LEN } + .maxlen = NF_CT_HELPER_NAME_LEN }, +#ifdef CONFIG_NF_NAT_NEEDED + /* NAT length is checked when parsing the nested attributes. */ + [OVS_CT_ATTR_NAT] = { .minlen = 0, .maxlen = INT_MAX }, +#endif }; static int parse_ct(const struct nlattr *attr, struct ovs_conntrack_info *info, @@ -617,6 +1137,15 @@ static int parse_ct(const struct nlattr *attr, struct ovs_conntrack_info *info, return -EINVAL; } break; +#ifdef CONFIG_NF_NAT_NEEDED + case OVS_CT_ATTR_NAT: { + int err = parse_nat(a, info, log); + + if (err) + return err; + break; + } +#endif default: OVS_NLERR(log, "Unknown conntrack attr (%d)", type); @@ -704,6 +1233,74 @@ err_free_ct: return err; } +#ifdef CONFIG_NF_NAT_NEEDED +static bool ovs_ct_nat_to_attr(const struct ovs_conntrack_info *info, + struct sk_buff *skb) +{ + struct nlattr *start; + + start = nla_nest_start(skb, OVS_CT_ATTR_NAT); + if (!start) + return false; + + if (info->nat & OVS_CT_SRC_NAT) { + if (nla_put_flag(skb, OVS_NAT_ATTR_SRC)) + return false; + } else if (info->nat & OVS_CT_DST_NAT) { + if (nla_put_flag(skb, OVS_NAT_ATTR_DST)) + return false; + } else { + goto out; + } + + if (info->range.flags & NF_NAT_RANGE_MAP_IPS) { + if (IS_ENABLED(CONFIG_NF_NAT_IPV4) && + info->family == NFPROTO_IPV4) { + if (nla_put_in_addr(skb, OVS_NAT_ATTR_IP_MIN, + info->range.min_addr.ip) || + (info->range.max_addr.ip + != info->range.min_addr.ip && + (nla_put_in_addr(skb, OVS_NAT_ATTR_IP_MAX, + info->range.max_addr.ip)))) + return false; + } else if (IS_ENABLED(CONFIG_NF_NAT_IPV6) && + info->family == NFPROTO_IPV6) { + if (nla_put_in6_addr(skb, OVS_NAT_ATTR_IP_MIN, + &info->range.min_addr.in6) || + (memcmp(&info->range.max_addr.in6, + &info->range.min_addr.in6, + sizeof(info->range.max_addr.in6)) && + (nla_put_in6_addr(skb, OVS_NAT_ATTR_IP_MAX, + &info->range.max_addr.in6)))) + return false; + } else { + return false; + } + } + if (info->range.flags & NF_NAT_RANGE_PROTO_SPECIFIED && + (nla_put_u16(skb, OVS_NAT_ATTR_PROTO_MIN, + ntohs(info->range.min_proto.all)) || + (info->range.max_proto.all != info->range.min_proto.all && + nla_put_u16(skb, OVS_NAT_ATTR_PROTO_MAX, + ntohs(info->range.max_proto.all))))) + return false; + + if (info->range.flags & NF_NAT_RANGE_PERSISTENT && + nla_put_flag(skb, OVS_NAT_ATTR_PERSISTENT)) + return false; + if (info->range.flags & NF_NAT_RANGE_PROTO_RANDOM && + nla_put_flag(skb, OVS_NAT_ATTR_PROTO_HASH)) + return false; + if (info->range.flags & NF_NAT_RANGE_PROTO_RANDOM_FULLY && + nla_put_flag(skb, OVS_NAT_ATTR_PROTO_RANDOM)) + return false; +out: + nla_nest_end(skb, start); + + return true; +} +#endif + int ovs_ct_action_to_attr(const struct ovs_conntrack_info *ct_info, struct sk_buff *skb) { @@ -732,7 +1329,10 @@ int ovs_ct_action_to_attr(const struct ovs_conntrack_info *ct_info, ct_info->helper->name)) return -EMSGSIZE; } - +#ifdef CONFIG_NF_NAT_NEEDED + if (ct_info->nat && !ovs_ct_nat_to_attr(ct_info, skb)) + return -EMSGSIZE; +#endif nla_nest_end(skb, start); return 0; diff --git a/net/openvswitch/conntrack.h b/net/openvswitch/conntrack.h index a7544f405..8f6230bd6 100644 --- a/net/openvswitch/conntrack.h +++ b/net/openvswitch/conntrack.h @@ -37,7 +37,8 @@ void ovs_ct_free_action(const struct nlattr *a); #define CT_SUPPORTED_MASK (OVS_CS_F_NEW | OVS_CS_F_ESTABLISHED | \ OVS_CS_F_RELATED | OVS_CS_F_REPLY_DIR | \ - OVS_CS_F_INVALID | OVS_CS_F_TRACKED) + OVS_CS_F_INVALID | OVS_CS_F_TRACKED | \ + OVS_CS_F_SRC_NAT | OVS_CS_F_DST_NAT) #else #include <linux/errno.h> diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index deadfdab1..0cc66a4e4 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -422,10 +422,6 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb, struct sk_buff *nskb = NULL; struct sk_buff *user_skb = NULL; /* to be queued to userspace */ struct nlattr *nla; - struct genl_info info = { - .dst_sk = ovs_dp_get_net(dp)->genl_sock, - .snd_portid = upcall_info->portid, - }; size_t len; unsigned int hlen; int err, dp_ifindex; @@ -466,7 +462,7 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb, hlen = skb->len; len = upcall_msg_size(upcall_info, hlen); - user_skb = genlmsg_new_unicast(len, &info, GFP_ATOMIC); + user_skb = genlmsg_new(len, GFP_ATOMIC); if (!user_skb) { err = -ENOMEM; goto out; @@ -654,7 +650,7 @@ static const struct nla_policy packet_policy[OVS_PACKET_ATTR_MAX + 1] = { static const struct genl_ops dp_packet_genl_ops[] = { { .cmd = OVS_PACKET_CMD_EXECUTE, - .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ + .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ .policy = packet_policy, .doit = ovs_packet_cmd_execute } @@ -876,7 +872,7 @@ static struct sk_buff *ovs_flow_cmd_alloc_info(const struct sw_flow_actions *act return NULL; len = ovs_flow_cmd_msg_size(acts, sfid, ufid_flags); - skb = genlmsg_new_unicast(len, info, GFP_KERNEL); + skb = genlmsg_new(len, GFP_KERNEL); if (!skb) return ERR_PTR(-ENOMEM); @@ -1100,26 +1096,32 @@ static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info) struct sw_flow_match match; struct sw_flow_id sfid; u32 ufid_flags = ovs_nla_get_ufid_flags(a[OVS_FLOW_ATTR_UFID_FLAGS]); - int error; + int error = 0; bool log = !a[OVS_FLOW_ATTR_PROBE]; bool ufid_present; - /* Extract key. */ - error = -EINVAL; - if (!a[OVS_FLOW_ATTR_KEY]) { - OVS_NLERR(log, "Flow key attribute not present in set flow."); - goto error; - } - ufid_present = ovs_nla_get_ufid(&sfid, a[OVS_FLOW_ATTR_UFID], log); - ovs_match_init(&match, &key, &mask); - error = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY], - a[OVS_FLOW_ATTR_MASK], log); + if (a[OVS_FLOW_ATTR_KEY]) { + ovs_match_init(&match, &key, &mask); + error = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY], + a[OVS_FLOW_ATTR_MASK], log); + } else if (!ufid_present) { + OVS_NLERR(log, + "Flow set message rejected, Key attribute missing."); + error = -EINVAL; + } if (error) goto error; /* Validate actions. */ if (a[OVS_FLOW_ATTR_ACTIONS]) { + if (!a[OVS_FLOW_ATTR_KEY]) { + OVS_NLERR(log, + "Flow key attribute not present in set flow."); + error = -EINVAL; + goto error; + } + acts = get_flow_actions(net, a[OVS_FLOW_ATTR_ACTIONS], &key, &mask, log); if (IS_ERR(acts)) { @@ -1391,12 +1393,12 @@ static const struct nla_policy flow_policy[OVS_FLOW_ATTR_MAX + 1] = { static const struct genl_ops dp_flow_genl_ops[] = { { .cmd = OVS_FLOW_CMD_NEW, - .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ + .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ .policy = flow_policy, .doit = ovs_flow_cmd_new }, { .cmd = OVS_FLOW_CMD_DEL, - .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ + .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ .policy = flow_policy, .doit = ovs_flow_cmd_del }, @@ -1407,7 +1409,7 @@ static const struct genl_ops dp_flow_genl_ops[] = { .dumpit = ovs_flow_cmd_dump }, { .cmd = OVS_FLOW_CMD_SET, - .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ + .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ .policy = flow_policy, .doit = ovs_flow_cmd_set, }, @@ -1481,9 +1483,9 @@ error: return -EMSGSIZE; } -static struct sk_buff *ovs_dp_cmd_alloc_info(struct genl_info *info) +static struct sk_buff *ovs_dp_cmd_alloc_info(void) { - return genlmsg_new_unicast(ovs_dp_cmd_msg_size(), info, GFP_KERNEL); + return genlmsg_new(ovs_dp_cmd_msg_size(), GFP_KERNEL); } /* Called with rcu_read_lock or ovs_mutex. */ @@ -1536,7 +1538,7 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info) if (!a[OVS_DP_ATTR_NAME] || !a[OVS_DP_ATTR_UPCALL_PID]) goto err; - reply = ovs_dp_cmd_alloc_info(info); + reply = ovs_dp_cmd_alloc_info(); if (!reply) return -ENOMEM; @@ -1657,7 +1659,7 @@ static int ovs_dp_cmd_del(struct sk_buff *skb, struct genl_info *info) struct datapath *dp; int err; - reply = ovs_dp_cmd_alloc_info(info); + reply = ovs_dp_cmd_alloc_info(); if (!reply) return -ENOMEM; @@ -1690,7 +1692,7 @@ static int ovs_dp_cmd_set(struct sk_buff *skb, struct genl_info *info) struct datapath *dp; int err; - reply = ovs_dp_cmd_alloc_info(info); + reply = ovs_dp_cmd_alloc_info(); if (!reply) return -ENOMEM; @@ -1723,7 +1725,7 @@ static int ovs_dp_cmd_get(struct sk_buff *skb, struct genl_info *info) struct datapath *dp; int err; - reply = ovs_dp_cmd_alloc_info(info); + reply = ovs_dp_cmd_alloc_info(); if (!reply) return -ENOMEM; @@ -1777,12 +1779,12 @@ static const struct nla_policy datapath_policy[OVS_DP_ATTR_MAX + 1] = { static const struct genl_ops dp_datapath_genl_ops[] = { { .cmd = OVS_DP_CMD_NEW, - .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ + .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ .policy = datapath_policy, .doit = ovs_dp_cmd_new }, { .cmd = OVS_DP_CMD_DEL, - .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ + .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ .policy = datapath_policy, .doit = ovs_dp_cmd_del }, @@ -1793,7 +1795,7 @@ static const struct genl_ops dp_datapath_genl_ops[] = { .dumpit = ovs_dp_cmd_dump }, { .cmd = OVS_DP_CMD_SET, - .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ + .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ .policy = datapath_policy, .doit = ovs_dp_cmd_set, }, @@ -1912,6 +1914,29 @@ static struct vport *lookup_vport(struct net *net, return ERR_PTR(-EINVAL); } +/* Called with ovs_mutex */ +static void update_headroom(struct datapath *dp) +{ + unsigned dev_headroom, max_headroom = 0; + struct net_device *dev; + struct vport *vport; + int i; + + for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) { + hlist_for_each_entry_rcu(vport, &dp->ports[i], dp_hash_node) { + dev = vport->dev; + dev_headroom = netdev_get_fwd_headroom(dev); + if (dev_headroom > max_headroom) + max_headroom = dev_headroom; + } + } + + dp->max_headroom = max_headroom; + for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) + hlist_for_each_entry_rcu(vport, &dp->ports[i], dp_hash_node) + netdev_set_rx_headroom(vport->dev, max_headroom); +} + static int ovs_vport_cmd_new(struct sk_buff *skb, struct genl_info *info) { struct nlattr **a = info->attrs; @@ -1977,6 +2002,12 @@ restart: err = ovs_vport_cmd_fill_info(vport, reply, info->snd_portid, info->snd_seq, 0, OVS_VPORT_CMD_NEW); + + if (netdev_get_fwd_headroom(vport->dev) > dp->max_headroom) + update_headroom(dp); + else + netdev_set_rx_headroom(vport->dev, dp->max_headroom); + BUG_ON(err < 0); ovs_unlock(); @@ -2043,8 +2074,10 @@ exit_unlock_free: static int ovs_vport_cmd_del(struct sk_buff *skb, struct genl_info *info) { + bool must_update_headroom = false; struct nlattr **a = info->attrs; struct sk_buff *reply; + struct datapath *dp; struct vport *vport; int err; @@ -2066,7 +2099,16 @@ static int ovs_vport_cmd_del(struct sk_buff *skb, struct genl_info *info) err = ovs_vport_cmd_fill_info(vport, reply, info->snd_portid, info->snd_seq, 0, OVS_VPORT_CMD_DEL); BUG_ON(err < 0); + + /* the vport deletion may trigger dp headroom update */ + dp = vport->dp; + if (netdev_get_fwd_headroom(vport->dev) == dp->max_headroom) + must_update_headroom = true; + netdev_reset_rx_headroom(vport->dev); ovs_dp_detach_port(vport); + + if (must_update_headroom) + update_headroom(dp); ovs_unlock(); ovs_notify(&dp_vport_genl_family, reply, info); @@ -2158,12 +2200,12 @@ static const struct nla_policy vport_policy[OVS_VPORT_ATTR_MAX + 1] = { static const struct genl_ops dp_vport_genl_ops[] = { { .cmd = OVS_VPORT_CMD_NEW, - .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ + .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ .policy = vport_policy, .doit = ovs_vport_cmd_new }, { .cmd = OVS_VPORT_CMD_DEL, - .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ + .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ .policy = vport_policy, .doit = ovs_vport_cmd_del }, @@ -2174,7 +2216,7 @@ static const struct genl_ops dp_vport_genl_ops[] = { .dumpit = ovs_vport_cmd_dump }, { .cmd = OVS_VPORT_CMD_SET, - .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ + .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ .policy = vport_policy, .doit = ovs_vport_cmd_set, }, diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h index 67bdecd9f..427e39a04 100644 --- a/net/openvswitch/datapath.h +++ b/net/openvswitch/datapath.h @@ -68,6 +68,8 @@ struct dp_stats_percpu { * ovs_mutex and RCU. * @stats_percpu: Per-CPU datapath statistics. * @net: Reference to net namespace. + * @max_headroom: the maximum headroom of all vports in this datapath; it will + * be used by all the internal vports in this dp. * * Context: See the comment on locking at the top of datapath.c for additional * locking information. @@ -89,6 +91,8 @@ struct datapath { possible_net_t net; u32 user_features; + + u32 max_headroom; }; /** diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h index 1d055c559..03378e75a 100644 --- a/net/openvswitch/flow.h +++ b/net/openvswitch/flow.h @@ -55,7 +55,7 @@ struct ovs_tunnel_info { FIELD_SIZEOF(struct sw_flow_key, recirc_id)) struct sw_flow_key { - u8 tun_opts[255]; + u8 tun_opts[IP_TUNNEL_OPTS_MAX]; u8 tun_opts_len; struct ip_tunnel_key tun_key; /* Encapsulating tunnel key. */ struct { diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index d1bd4a45c..689c17264 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -1959,6 +1959,12 @@ static int validate_and_copy_set_tun(const struct nlattr *attr, if (!tun_dst) return -ENOMEM; + err = dst_cache_init(&tun_dst->u.tun_info.dst_cache, GFP_KERNEL); + if (err) { + dst_release((struct dst_entry *)tun_dst); + return err; + } + a = __add_action(sfa, OVS_KEY_ATTR_TUNNEL_INFO, NULL, sizeof(*ovs_tun), log); if (IS_ERR(a)) { @@ -2038,9 +2044,6 @@ static int validate_set(const struct nlattr *a, break; case OVS_KEY_ATTR_TUNNEL: - if (eth_p_mpls(eth_type)) - return -EINVAL; - if (masked) return -EINVAL; /* Masked tunnel set not supported. */ diff --git a/net/openvswitch/vport-geneve.c b/net/openvswitch/vport-geneve.c index 30ab8e127..1a1fcec88 100644 --- a/net/openvswitch/vport-geneve.c +++ b/net/openvswitch/vport-geneve.c @@ -132,6 +132,6 @@ static void __exit ovs_geneve_tnl_exit(void) module_init(ovs_geneve_tnl_init); module_exit(ovs_geneve_tnl_exit); -MODULE_DESCRIPTION("OVS: Geneve swiching port"); +MODULE_DESCRIPTION("OVS: Geneve switching port"); MODULE_LICENSE("GPL"); MODULE_ALIAS("vport-type-5"); diff --git a/net/openvswitch/vport-internal_dev.c b/net/openvswitch/vport-internal_dev.c index ec76398a7..7c8b90bf0 100644 --- a/net/openvswitch/vport-internal_dev.c +++ b/net/openvswitch/vport-internal_dev.c @@ -138,6 +138,11 @@ internal_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats) return stats; } +static void internal_set_rx_headroom(struct net_device *dev, int new_hr) +{ + dev->needed_headroom = new_hr; +} + static const struct net_device_ops internal_dev_netdev_ops = { .ndo_open = internal_dev_open, .ndo_stop = internal_dev_stop, @@ -145,6 +150,7 @@ static const struct net_device_ops internal_dev_netdev_ops = { .ndo_set_mac_address = eth_mac_addr, .ndo_change_mtu = internal_dev_change_mtu, .ndo_get_stats64 = internal_get_stats, + .ndo_set_rx_headroom = internal_set_rx_headroom, }; static struct rtnl_link_ops internal_dev_link_ops __read_mostly = { @@ -158,7 +164,8 @@ static void do_setup(struct net_device *netdev) netdev->netdev_ops = &internal_dev_netdev_ops; netdev->priv_flags &= ~IFF_TX_SKB_SHARING; - netdev->priv_flags |= IFF_LIVE_ADDR_CHANGE | IFF_OPENVSWITCH; + netdev->priv_flags |= IFF_LIVE_ADDR_CHANGE | IFF_OPENVSWITCH | + IFF_PHONY_HEADROOM; netdev->destructor = internal_dev_destructor; netdev->ethtool_ops = &internal_dev_ethtool_ops; netdev->rtnl_link_ops = &internal_dev_link_ops; @@ -199,6 +206,7 @@ static struct vport *internal_dev_create(const struct vport_parms *parms) err = -ENOMEM; goto error_free_netdev; } + vport->dev->needed_headroom = vport->dp->max_headroom; dev_net_set(vport->dev, ovs_dp_get_net(vport->dp)); internal_dev = internal_dev_priv(vport->dev); diff --git a/net/openvswitch/vport-netdev.c b/net/openvswitch/vport-netdev.c index 6a6adf314..4e3972344 100644 --- a/net/openvswitch/vport-netdev.c +++ b/net/openvswitch/vport-netdev.c @@ -58,7 +58,7 @@ static void netdev_port_receive(struct sk_buff *skb) return; skb_push(skb, ETH_HLEN); - ovs_skb_postpush_rcsum(skb, skb->data, ETH_HLEN); + skb_postpush_rcsum(skb, skb->data, ETH_HLEN); ovs_vport_receive(vport, skb, skb_tunnel_info(skb)); return; error: diff --git a/net/openvswitch/vport.h b/net/openvswitch/vport.h index c10899cb9..f01f28a56 100644 --- a/net/openvswitch/vport.h +++ b/net/openvswitch/vport.h @@ -185,13 +185,6 @@ static inline struct vport *vport_from_priv(void *priv) int ovs_vport_receive(struct vport *, struct sk_buff *, const struct ip_tunnel_info *); -static inline void ovs_skb_postpush_rcsum(struct sk_buff *skb, - const void *start, unsigned int len) -{ - if (skb->ip_summed == CHECKSUM_COMPLETE) - skb->csum = csum_add(skb->csum, csum_partial(start, len, 0)); -} - static inline const char *ovs_vport_name(struct vport *vport) { return vport->dev->name; diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index da1ae0e13..18d0becbc 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -557,9 +557,8 @@ static int prb_calc_retire_blk_tmo(struct packet_sock *po, { struct net_device *dev; unsigned int mbits = 0, msec = 0, div = 0, tmo = 0; - struct ethtool_cmd ecmd; + struct ethtool_link_ksettings ecmd; int err; - u32 speed; rtnl_lock(); dev = __dev_get_by_index(sock_net(&po->sk), po->ifindex); @@ -567,19 +566,19 @@ static int prb_calc_retire_blk_tmo(struct packet_sock *po, rtnl_unlock(); return DEFAULT_PRB_RETIRE_TOV; } - err = __ethtool_get_settings(dev, &ecmd); - speed = ethtool_cmd_speed(&ecmd); + err = __ethtool_get_link_ksettings(dev, &ecmd); rtnl_unlock(); if (!err) { /* * If the link speed is so slow you don't really * need to worry about perf anyways */ - if (speed < SPEED_1000 || speed == SPEED_UNKNOWN) { + if (ecmd.base.speed < SPEED_1000 || + ecmd.base.speed == SPEED_UNKNOWN) { return DEFAULT_PRB_RETIRE_TOV; } else { msec = 1; - div = speed / 1000; + div = ecmd.base.speed / 1000; } } @@ -1964,6 +1963,64 @@ static unsigned int run_filter(struct sk_buff *skb, return res; } +static int __packet_rcv_vnet(const struct sk_buff *skb, + struct virtio_net_hdr *vnet_hdr) +{ + *vnet_hdr = (const struct virtio_net_hdr) { 0 }; + + if (skb_is_gso(skb)) { + struct skb_shared_info *sinfo = skb_shinfo(skb); + + /* This is a hint as to how much should be linear. */ + vnet_hdr->hdr_len = + __cpu_to_virtio16(vio_le(), skb_headlen(skb)); + vnet_hdr->gso_size = + __cpu_to_virtio16(vio_le(), sinfo->gso_size); + + if (sinfo->gso_type & SKB_GSO_TCPV4) + vnet_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV4; + else if (sinfo->gso_type & SKB_GSO_TCPV6) + vnet_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV6; + else if (sinfo->gso_type & SKB_GSO_UDP) + vnet_hdr->gso_type = VIRTIO_NET_HDR_GSO_UDP; + else if (sinfo->gso_type & SKB_GSO_FCOE) + return -EINVAL; + else + BUG(); + + if (sinfo->gso_type & SKB_GSO_TCP_ECN) + vnet_hdr->gso_type |= VIRTIO_NET_HDR_GSO_ECN; + } else + vnet_hdr->gso_type = VIRTIO_NET_HDR_GSO_NONE; + + if (skb->ip_summed == CHECKSUM_PARTIAL) { + vnet_hdr->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM; + vnet_hdr->csum_start = __cpu_to_virtio16(vio_le(), + skb_checksum_start_offset(skb)); + vnet_hdr->csum_offset = __cpu_to_virtio16(vio_le(), + skb->csum_offset); + } else if (skb->ip_summed == CHECKSUM_UNNECESSARY) { + vnet_hdr->flags = VIRTIO_NET_HDR_F_DATA_VALID; + } /* else everything is zero */ + + return 0; +} + +static int packet_rcv_vnet(struct msghdr *msg, const struct sk_buff *skb, + size_t *len) +{ + struct virtio_net_hdr vnet_hdr; + + if (*len < sizeof(vnet_hdr)) + return -EINVAL; + *len -= sizeof(vnet_hdr); + + if (__packet_rcv_vnet(skb, &vnet_hdr)) + return -EINVAL; + + return memcpy_to_msg(msg, (void *)&vnet_hdr, sizeof(vnet_hdr)); +} + /* * This function makes lazy skb cloning in hope that most of packets * are discarded by BPF. @@ -2152,7 +2209,9 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, unsigned int maclen = skb_network_offset(skb); netoff = TPACKET_ALIGN(po->tp_hdrlen + (maclen < 16 ? 16 : maclen)) + - po->tp_reserve; + po->tp_reserve; + if (po->has_vnet_hdr) + netoff += sizeof(struct virtio_net_hdr); macoff = netoff - maclen; } if (po->tp_version <= TPACKET_V2) { @@ -2189,7 +2248,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, h.raw = packet_current_rx_frame(po, skb, TP_STATUS_KERNEL, (macoff+snaplen)); if (!h.raw) - goto ring_is_full; + goto drop_n_account; if (po->tp_version <= TPACKET_V2) { packet_increment_rx_head(po, &po->rx_ring); /* @@ -2208,6 +2267,14 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, } spin_unlock(&sk->sk_receive_queue.lock); + if (po->has_vnet_hdr) { + if (__packet_rcv_vnet(skb, h.raw + macoff - + sizeof(struct virtio_net_hdr))) { + spin_lock(&sk->sk_receive_queue.lock); + goto drop_n_account; + } + } + skb_copy_bits(skb, 0, h.raw + macoff, snaplen); if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp))) @@ -2303,7 +2370,7 @@ drop: kfree_skb(skb); return 0; -ring_is_full: +drop_n_account: po->stats.stats1.tp_drops++; spin_unlock(&sk->sk_receive_queue.lock); @@ -2339,15 +2406,92 @@ static void tpacket_set_protocol(const struct net_device *dev, } } +static int __packet_snd_vnet_parse(struct virtio_net_hdr *vnet_hdr, size_t len) +{ + unsigned short gso_type = 0; + + if ((vnet_hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) && + (__virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) + + __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2 > + __virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len))) + vnet_hdr->hdr_len = __cpu_to_virtio16(vio_le(), + __virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) + + __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2); + + if (__virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len) > len) + return -EINVAL; + + if (vnet_hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) { + switch (vnet_hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) { + case VIRTIO_NET_HDR_GSO_TCPV4: + gso_type = SKB_GSO_TCPV4; + break; + case VIRTIO_NET_HDR_GSO_TCPV6: + gso_type = SKB_GSO_TCPV6; + break; + case VIRTIO_NET_HDR_GSO_UDP: + gso_type = SKB_GSO_UDP; + break; + default: + return -EINVAL; + } + + if (vnet_hdr->gso_type & VIRTIO_NET_HDR_GSO_ECN) + gso_type |= SKB_GSO_TCP_ECN; + + if (vnet_hdr->gso_size == 0) + return -EINVAL; + } + + vnet_hdr->gso_type = gso_type; /* changes type, temporary storage */ + return 0; +} + +static int packet_snd_vnet_parse(struct msghdr *msg, size_t *len, + struct virtio_net_hdr *vnet_hdr) +{ + int n; + + if (*len < sizeof(*vnet_hdr)) + return -EINVAL; + *len -= sizeof(*vnet_hdr); + + n = copy_from_iter(vnet_hdr, sizeof(*vnet_hdr), &msg->msg_iter); + if (n != sizeof(*vnet_hdr)) + return -EFAULT; + + return __packet_snd_vnet_parse(vnet_hdr, *len); +} + +static int packet_snd_vnet_gso(struct sk_buff *skb, + struct virtio_net_hdr *vnet_hdr) +{ + if (vnet_hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) { + u16 s = __virtio16_to_cpu(vio_le(), vnet_hdr->csum_start); + u16 o = __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset); + + if (!skb_partial_csum_set(skb, s, o)) + return -EINVAL; + } + + skb_shinfo(skb)->gso_size = + __virtio16_to_cpu(vio_le(), vnet_hdr->gso_size); + skb_shinfo(skb)->gso_type = vnet_hdr->gso_type; + + /* Header must be checked, and gso_segs computed. */ + skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY; + skb_shinfo(skb)->gso_segs = 0; + return 0; +} + static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb, - void *frame, struct net_device *dev, int size_max, - __be16 proto, unsigned char *addr, int hlen) + void *frame, struct net_device *dev, void *data, int tp_len, + __be16 proto, unsigned char *addr, int hlen, int copylen) { union tpacket_uhdr ph; - int to_write, offset, len, tp_len, nr_frags, len_max; + int to_write, offset, len, nr_frags, len_max; struct socket *sock = po->sk.sk_socket; struct page *page; - void *data; int err; ph.raw = frame; @@ -2359,51 +2503,9 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb, sock_tx_timestamp(&po->sk, &skb_shinfo(skb)->tx_flags); skb_shinfo(skb)->destructor_arg = ph.raw; - switch (po->tp_version) { - case TPACKET_V2: - tp_len = ph.h2->tp_len; - break; - default: - tp_len = ph.h1->tp_len; - break; - } - if (unlikely(tp_len > size_max)) { - pr_err("packet size is too long (%d > %d)\n", tp_len, size_max); - return -EMSGSIZE; - } - skb_reserve(skb, hlen); skb_reset_network_header(skb); - if (unlikely(po->tp_tx_has_off)) { - int off_min, off_max, off; - off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll); - off_max = po->tx_ring.frame_size - tp_len; - if (sock->type == SOCK_DGRAM) { - switch (po->tp_version) { - case TPACKET_V2: - off = ph.h2->tp_net; - break; - default: - off = ph.h1->tp_net; - break; - } - } else { - switch (po->tp_version) { - case TPACKET_V2: - off = ph.h2->tp_mac; - break; - default: - off = ph.h1->tp_mac; - break; - } - } - if (unlikely((off < off_min) || (off_max < off))) - return -EINVAL; - data = ph.raw + off; - } else { - data = ph.raw + po->tp_hdrlen - sizeof(struct sockaddr_ll); - } to_write = tp_len; if (sock->type == SOCK_DGRAM) { @@ -2411,10 +2513,11 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb, NULL, tp_len); if (unlikely(err < 0)) return -EINVAL; - } else if (dev->hard_header_len) { - int hdrlen = min_t(int, dev->hard_header_len, tp_len); + } else if (copylen) { + int hdrlen = min_t(int, copylen, tp_len); skb_push(skb, dev->hard_header_len); + skb_put(skb, copylen - dev->hard_header_len); err = skb_store_bits(skb, 0, data, hdrlen); if (unlikely(err)) return err; @@ -2461,10 +2564,66 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb, return tp_len; } +static int tpacket_parse_header(struct packet_sock *po, void *frame, + int size_max, void **data) +{ + union tpacket_uhdr ph; + int tp_len, off; + + ph.raw = frame; + + switch (po->tp_version) { + case TPACKET_V2: + tp_len = ph.h2->tp_len; + break; + default: + tp_len = ph.h1->tp_len; + break; + } + if (unlikely(tp_len > size_max)) { + pr_err("packet size is too long (%d > %d)\n", tp_len, size_max); + return -EMSGSIZE; + } + + if (unlikely(po->tp_tx_has_off)) { + int off_min, off_max; + + off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll); + off_max = po->tx_ring.frame_size - tp_len; + if (po->sk.sk_type == SOCK_DGRAM) { + switch (po->tp_version) { + case TPACKET_V2: + off = ph.h2->tp_net; + break; + default: + off = ph.h1->tp_net; + break; + } + } else { + switch (po->tp_version) { + case TPACKET_V2: + off = ph.h2->tp_mac; + break; + default: + off = ph.h1->tp_mac; + break; + } + } + if (unlikely((off < off_min) || (off_max < off))) + return -EINVAL; + } else { + off = po->tp_hdrlen - sizeof(struct sockaddr_ll); + } + + *data = frame + off; + return tp_len; +} + static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) { struct sk_buff *skb; struct net_device *dev; + struct virtio_net_hdr *vnet_hdr = NULL; __be16 proto; int err, reserve = 0; void *ph; @@ -2472,9 +2631,10 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) bool need_wait = !(msg->msg_flags & MSG_DONTWAIT); int tp_len, size_max; unsigned char *addr; + void *data; int len_sum = 0; int status = TP_STATUS_AVAILABLE; - int hlen, tlen; + int hlen, tlen, copylen = 0; mutex_lock(&po->pg_vec_lock); @@ -2507,7 +2667,7 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) size_max = po->tx_ring.frame_size - (po->tp_hdrlen - sizeof(struct sockaddr_ll)); - if (size_max > dev->mtu + reserve + VLAN_HLEN) + if ((size_max > dev->mtu + reserve + VLAN_HLEN) && !po->has_vnet_hdr) size_max = dev->mtu + reserve + VLAN_HLEN; do { @@ -2519,11 +2679,30 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) continue; } + skb = NULL; + tp_len = tpacket_parse_header(po, ph, size_max, &data); + if (tp_len < 0) + goto tpacket_error; + status = TP_STATUS_SEND_REQUEST; hlen = LL_RESERVED_SPACE(dev); tlen = dev->needed_tailroom; + if (po->has_vnet_hdr) { + vnet_hdr = data; + data += sizeof(*vnet_hdr); + tp_len -= sizeof(*vnet_hdr); + if (tp_len < 0 || + __packet_snd_vnet_parse(vnet_hdr, tp_len)) { + tp_len = -EINVAL; + goto tpacket_error; + } + copylen = __virtio16_to_cpu(vio_le(), + vnet_hdr->hdr_len); + } + copylen = max_t(int, copylen, dev->hard_header_len); skb = sock_alloc_send_skb(&po->sk, - hlen + tlen + sizeof(struct sockaddr_ll), + hlen + tlen + sizeof(struct sockaddr_ll) + + (copylen - dev->hard_header_len), !need_wait, &err); if (unlikely(skb == NULL)) { @@ -2532,14 +2711,16 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) err = len_sum; goto out_status; } - tp_len = tpacket_fill_skb(po, skb, ph, dev, size_max, proto, - addr, hlen); + tp_len = tpacket_fill_skb(po, skb, ph, dev, data, tp_len, proto, + addr, hlen, copylen); if (likely(tp_len >= 0) && tp_len > dev->mtu + reserve && + !po->has_vnet_hdr && !packet_extra_vlan_len_allowed(dev, skb)) tp_len = -EMSGSIZE; if (unlikely(tp_len < 0)) { +tpacket_error: if (po->tp_loss) { __packet_set_status(po, ph, TP_STATUS_AVAILABLE); @@ -2553,6 +2734,11 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) } } + if (po->has_vnet_hdr && packet_snd_vnet_gso(skb, vnet_hdr)) { + tp_len = -EINVAL; + goto tpacket_error; + } + packet_pick_tx_queue(dev, skb); skb->destructor = tpacket_destruct_skb; @@ -2635,12 +2821,9 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) struct sockcm_cookie sockc; struct virtio_net_hdr vnet_hdr = { 0 }; int offset = 0; - int vnet_hdr_len; struct packet_sock *po = pkt_sk(sk); - unsigned short gso_type = 0; int hlen, tlen; int extra_len = 0; - ssize_t n; /* * Get and verify the address. @@ -2678,53 +2861,9 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) if (sock->type == SOCK_RAW) reserve = dev->hard_header_len; if (po->has_vnet_hdr) { - vnet_hdr_len = sizeof(vnet_hdr); - - err = -EINVAL; - if (len < vnet_hdr_len) - goto out_unlock; - - len -= vnet_hdr_len; - - err = -EFAULT; - n = copy_from_iter(&vnet_hdr, vnet_hdr_len, &msg->msg_iter); - if (n != vnet_hdr_len) - goto out_unlock; - - if ((vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) && - (__virtio16_to_cpu(vio_le(), vnet_hdr.csum_start) + - __virtio16_to_cpu(vio_le(), vnet_hdr.csum_offset) + 2 > - __virtio16_to_cpu(vio_le(), vnet_hdr.hdr_len))) - vnet_hdr.hdr_len = __cpu_to_virtio16(vio_le(), - __virtio16_to_cpu(vio_le(), vnet_hdr.csum_start) + - __virtio16_to_cpu(vio_le(), vnet_hdr.csum_offset) + 2); - - err = -EINVAL; - if (__virtio16_to_cpu(vio_le(), vnet_hdr.hdr_len) > len) + err = packet_snd_vnet_parse(msg, &len, &vnet_hdr); + if (err) goto out_unlock; - - if (vnet_hdr.gso_type != VIRTIO_NET_HDR_GSO_NONE) { - switch (vnet_hdr.gso_type & ~VIRTIO_NET_HDR_GSO_ECN) { - case VIRTIO_NET_HDR_GSO_TCPV4: - gso_type = SKB_GSO_TCPV4; - break; - case VIRTIO_NET_HDR_GSO_TCPV6: - gso_type = SKB_GSO_TCPV6; - break; - case VIRTIO_NET_HDR_GSO_UDP: - gso_type = SKB_GSO_UDP; - break; - default: - goto out_unlock; - } - - if (vnet_hdr.gso_type & VIRTIO_NET_HDR_GSO_ECN) - gso_type |= SKB_GSO_TCP_ECN; - - if (vnet_hdr.gso_size == 0) - goto out_unlock; - - } } if (unlikely(sock_flag(sk, SOCK_NOFCS))) { @@ -2736,7 +2875,8 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) } err = -EMSGSIZE; - if (!gso_type && (len > dev->mtu + reserve + VLAN_HLEN + extra_len)) + if (!vnet_hdr.gso_type && + (len > dev->mtu + reserve + VLAN_HLEN + extra_len)) goto out_unlock; err = -ENOBUFS; @@ -2770,7 +2910,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags); - if (!gso_type && (len > dev->mtu + reserve + extra_len) && + if (!vnet_hdr.gso_type && (len > dev->mtu + reserve + extra_len) && !packet_extra_vlan_len_allowed(dev, skb)) { err = -EMSGSIZE; goto out_free; @@ -2784,24 +2924,10 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) packet_pick_tx_queue(dev, skb); if (po->has_vnet_hdr) { - if (vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) { - u16 s = __virtio16_to_cpu(vio_le(), vnet_hdr.csum_start); - u16 o = __virtio16_to_cpu(vio_le(), vnet_hdr.csum_offset); - if (!skb_partial_csum_set(skb, s, o)) { - err = -EINVAL; - goto out_free; - } - } - - skb_shinfo(skb)->gso_size = - __virtio16_to_cpu(vio_le(), vnet_hdr.gso_size); - skb_shinfo(skb)->gso_type = gso_type; - - /* Header must be checked, and gso_segs computed. */ - skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY; - skb_shinfo(skb)->gso_segs = 0; - - len += vnet_hdr_len; + err = packet_snd_vnet_gso(skb, &vnet_hdr); + if (err) + goto out_free; + len += sizeof(vnet_hdr); } skb_probe_transport_header(skb, reserve); @@ -3172,51 +3298,10 @@ static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, packet_rcv_has_room(pkt_sk(sk), NULL); if (pkt_sk(sk)->has_vnet_hdr) { - struct virtio_net_hdr vnet_hdr = { 0 }; - - err = -EINVAL; - vnet_hdr_len = sizeof(vnet_hdr); - if (len < vnet_hdr_len) - goto out_free; - - len -= vnet_hdr_len; - - if (skb_is_gso(skb)) { - struct skb_shared_info *sinfo = skb_shinfo(skb); - - /* This is a hint as to how much should be linear. */ - vnet_hdr.hdr_len = - __cpu_to_virtio16(vio_le(), skb_headlen(skb)); - vnet_hdr.gso_size = - __cpu_to_virtio16(vio_le(), sinfo->gso_size); - if (sinfo->gso_type & SKB_GSO_TCPV4) - vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV4; - else if (sinfo->gso_type & SKB_GSO_TCPV6) - vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV6; - else if (sinfo->gso_type & SKB_GSO_UDP) - vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_UDP; - else if (sinfo->gso_type & SKB_GSO_FCOE) - goto out_free; - else - BUG(); - if (sinfo->gso_type & SKB_GSO_TCP_ECN) - vnet_hdr.gso_type |= VIRTIO_NET_HDR_GSO_ECN; - } else - vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE; - - if (skb->ip_summed == CHECKSUM_PARTIAL) { - vnet_hdr.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM; - vnet_hdr.csum_start = __cpu_to_virtio16(vio_le(), - skb_checksum_start_offset(skb)); - vnet_hdr.csum_offset = __cpu_to_virtio16(vio_le(), - skb->csum_offset); - } else if (skb->ip_summed == CHECKSUM_UNNECESSARY) { - vnet_hdr.flags = VIRTIO_NET_HDR_F_DATA_VALID; - } /* else everything is zero */ - - err = memcpy_to_msg(msg, (void *)&vnet_hdr, vnet_hdr_len); - if (err < 0) + err = packet_rcv_vnet(msg, skb, &len); + if (err) goto out_free; + vnet_hdr_len = sizeof(struct virtio_net_hdr); } /* You lose any data beyond the buffer you gave. If it worries @@ -3436,6 +3521,7 @@ static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq) i->ifindex = mreq->mr_ifindex; i->alen = mreq->mr_alen; memcpy(i->addr, mreq->mr_address, i->alen); + memset(i->addr + i->alen, 0, sizeof(i->addr) - i->alen); i->count = 1; i->next = po->mclist; po->mclist = i; @@ -3547,8 +3633,6 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv } if (optlen < len) return -EINVAL; - if (pkt_sk(sk)->has_vnet_hdr) - return -EINVAL; if (copy_from_user(&req_u.req, optval, len)) return -EFAULT; return packet_set_ring(sk, &req_u, 0, @@ -4068,7 +4152,7 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, /* Opening a Tx-ring is NOT supported in TPACKET_V3 */ if (!closing && tx_ring && (po->tp_version > TPACKET_V2)) { - WARN(1, "Tx-ring is not supported.\n"); + net_warn_ratelimited("Tx-ring is not supported.\n"); goto out; } diff --git a/net/phonet/socket.c b/net/phonet/socket.c index d575ef4e9..ffd5f2297 100644 --- a/net/phonet/socket.c +++ b/net/phonet/socket.c @@ -140,13 +140,15 @@ void pn_deliver_sock_broadcast(struct net *net, struct sk_buff *skb) rcu_read_unlock(); } -void pn_sock_hash(struct sock *sk) +int pn_sock_hash(struct sock *sk) { struct hlist_head *hlist = pn_hash_list(pn_sk(sk)->sobject); mutex_lock(&pnsocks.lock); sk_add_node_rcu(sk, hlist); mutex_unlock(&pnsocks.lock); + + return 0; } EXPORT_SYMBOL(pn_sock_hash); @@ -200,7 +202,7 @@ static int pn_socket_bind(struct socket *sock, struct sockaddr *addr, int len) pn->resource = spn->spn_resource; /* Enable RX on the socket */ - sk->sk_prot->hash(sk); + err = sk->sk_prot->hash(sk); out_port: mutex_unlock(&port_mutex); out: diff --git a/net/rds/Kconfig b/net/rds/Kconfig index f2c670ba7..bffde4b46 100644 --- a/net/rds/Kconfig +++ b/net/rds/Kconfig @@ -4,14 +4,13 @@ config RDS depends on INET ---help--- The RDS (Reliable Datagram Sockets) protocol provides reliable, - sequenced delivery of datagrams over Infiniband, iWARP, - or TCP. + sequenced delivery of datagrams over Infiniband or TCP. config RDS_RDMA - tristate "RDS over Infiniband and iWARP" + tristate "RDS over Infiniband" depends on RDS && INFINIBAND && INFINIBAND_ADDR_TRANS ---help--- - Allow RDS to use Infiniband and iWARP as a transport. + Allow RDS to use Infiniband as a transport. This transport supports RDMA operations. config RDS_TCP diff --git a/net/rds/Makefile b/net/rds/Makefile index 56d3f6023..0e72bec15 100644 --- a/net/rds/Makefile +++ b/net/rds/Makefile @@ -6,9 +6,7 @@ rds-y := af_rds.o bind.o cong.o connection.o info.o message.o \ obj-$(CONFIG_RDS_RDMA) += rds_rdma.o rds_rdma-y := rdma_transport.o \ ib.o ib_cm.o ib_recv.o ib_ring.o ib_send.o ib_stats.o \ - ib_sysctl.o ib_rdma.o \ - iw.o iw_cm.o iw_recv.o iw_ring.o iw_send.o iw_stats.o \ - iw_sysctl.o iw_rdma.o + ib_sysctl.o ib_rdma.o ib_fmr.o ib_frmr.o obj-$(CONFIG_RDS_TCP) += rds_tcp.o diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c index b5476aebd..6beaeb113 100644 --- a/net/rds/af_rds.c +++ b/net/rds/af_rds.c @@ -277,6 +277,27 @@ static int rds_set_transport(struct rds_sock *rs, char __user *optval, return rs->rs_transport ? 0 : -ENOPROTOOPT; } +static int rds_enable_recvtstamp(struct sock *sk, char __user *optval, + int optlen) +{ + int val, valbool; + + if (optlen != sizeof(int)) + return -EFAULT; + + if (get_user(val, (int __user *)optval)) + return -EFAULT; + + valbool = val ? 1 : 0; + + if (valbool) + sock_set_flag(sk, SOCK_RCVTSTAMP); + else + sock_reset_flag(sk, SOCK_RCVTSTAMP); + + return 0; +} + static int rds_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen) { @@ -312,6 +333,11 @@ static int rds_setsockopt(struct socket *sock, int level, int optname, ret = rds_set_transport(rs, optval, optlen); release_sock(sock->sk); break; + case SO_TIMESTAMP: + lock_sock(sock->sk); + ret = rds_enable_recvtstamp(sock->sk, optval, optlen); + release_sock(sock->sk); + break; default: ret = -ENOPROTOOPT; } diff --git a/net/rds/cong.c b/net/rds/cong.c index e6144b824..6641bcf7c 100644 --- a/net/rds/cong.c +++ b/net/rds/cong.c @@ -299,7 +299,7 @@ void rds_cong_set_bit(struct rds_cong_map *map, __be16 port) i = be16_to_cpu(port) / RDS_CONG_MAP_PAGE_BITS; off = be16_to_cpu(port) % RDS_CONG_MAP_PAGE_BITS; - __set_bit_le(off, (void *)map->m_page_addrs[i]); + set_bit_le(off, (void *)map->m_page_addrs[i]); } void rds_cong_clear_bit(struct rds_cong_map *map, __be16 port) @@ -313,7 +313,7 @@ void rds_cong_clear_bit(struct rds_cong_map *map, __be16 port) i = be16_to_cpu(port) / RDS_CONG_MAP_PAGE_BITS; off = be16_to_cpu(port) % RDS_CONG_MAP_PAGE_BITS; - __clear_bit_le(off, (void *)map->m_page_addrs[i]); + clear_bit_le(off, (void *)map->m_page_addrs[i]); } static int rds_cong_test_bit(struct rds_cong_map *map, __be16 port) diff --git a/net/rds/ib.c b/net/rds/ib.c index 9481d55ff..b5342fdda 100644 --- a/net/rds/ib.c +++ b/net/rds/ib.c @@ -42,15 +42,16 @@ #include "rds.h" #include "ib.h" +#include "ib_mr.h" -unsigned int rds_ib_fmr_1m_pool_size = RDS_FMR_1M_POOL_SIZE; -unsigned int rds_ib_fmr_8k_pool_size = RDS_FMR_8K_POOL_SIZE; +unsigned int rds_ib_mr_1m_pool_size = RDS_MR_1M_POOL_SIZE; +unsigned int rds_ib_mr_8k_pool_size = RDS_MR_8K_POOL_SIZE; unsigned int rds_ib_retry_count = RDS_IB_DEFAULT_RETRY_COUNT; -module_param(rds_ib_fmr_1m_pool_size, int, 0444); -MODULE_PARM_DESC(rds_ib_fmr_1m_pool_size, " Max number of 1M fmr per HCA"); -module_param(rds_ib_fmr_8k_pool_size, int, 0444); -MODULE_PARM_DESC(rds_ib_fmr_8k_pool_size, " Max number of 8K fmr per HCA"); +module_param(rds_ib_mr_1m_pool_size, int, 0444); +MODULE_PARM_DESC(rds_ib_mr_1m_pool_size, " Max number of 1M mr per HCA"); +module_param(rds_ib_mr_8k_pool_size, int, 0444); +MODULE_PARM_DESC(rds_ib_mr_8k_pool_size, " Max number of 8K mr per HCA"); module_param(rds_ib_retry_count, int, 0444); MODULE_PARM_DESC(rds_ib_retry_count, " Number of hw retries before reporting an error"); @@ -139,14 +140,20 @@ static void rds_ib_add_one(struct ib_device *device) rds_ibdev->max_wrs = device->attrs.max_qp_wr; rds_ibdev->max_sge = min(device->attrs.max_sge, RDS_IB_MAX_SGE); + rds_ibdev->has_fr = (device->attrs.device_cap_flags & + IB_DEVICE_MEM_MGT_EXTENSIONS); + rds_ibdev->has_fmr = (device->alloc_fmr && device->dealloc_fmr && + device->map_phys_fmr && device->unmap_fmr); + rds_ibdev->use_fastreg = (rds_ibdev->has_fr && !rds_ibdev->has_fmr); + rds_ibdev->fmr_max_remaps = device->attrs.max_map_per_fmr?: 32; - rds_ibdev->max_1m_fmrs = device->attrs.max_mr ? + rds_ibdev->max_1m_mrs = device->attrs.max_mr ? min_t(unsigned int, (device->attrs.max_mr / 2), - rds_ib_fmr_1m_pool_size) : rds_ib_fmr_1m_pool_size; + rds_ib_mr_1m_pool_size) : rds_ib_mr_1m_pool_size; - rds_ibdev->max_8k_fmrs = device->attrs.max_mr ? + rds_ibdev->max_8k_mrs = device->attrs.max_mr ? min_t(unsigned int, ((device->attrs.max_mr / 2) * RDS_MR_8K_SCALE), - rds_ib_fmr_8k_pool_size) : rds_ib_fmr_8k_pool_size; + rds_ib_mr_8k_pool_size) : rds_ib_mr_8k_pool_size; rds_ibdev->max_initiator_depth = device->attrs.max_qp_init_rd_atom; rds_ibdev->max_responder_resources = device->attrs.max_qp_rd_atom; @@ -172,10 +179,14 @@ static void rds_ib_add_one(struct ib_device *device) goto put_dev; } - rdsdebug("RDS/IB: max_mr = %d, max_wrs = %d, max_sge = %d, fmr_max_remaps = %d, max_1m_fmrs = %d, max_8k_fmrs = %d\n", + rdsdebug("RDS/IB: max_mr = %d, max_wrs = %d, max_sge = %d, fmr_max_remaps = %d, max_1m_mrs = %d, max_8k_mrs = %d\n", device->attrs.max_fmr, rds_ibdev->max_wrs, rds_ibdev->max_sge, - rds_ibdev->fmr_max_remaps, rds_ibdev->max_1m_fmrs, - rds_ibdev->max_8k_fmrs); + rds_ibdev->fmr_max_remaps, rds_ibdev->max_1m_mrs, + rds_ibdev->max_8k_mrs); + + pr_info("RDS/IB: %s: %s supported and preferred\n", + device->name, + rds_ibdev->use_fastreg ? "FRMR" : "FMR"); INIT_LIST_HEAD(&rds_ibdev->ipaddr_list); INIT_LIST_HEAD(&rds_ibdev->conn_list); @@ -364,7 +375,7 @@ void rds_ib_exit(void) rds_ib_sysctl_exit(); rds_ib_recv_exit(); rds_trans_unregister(&rds_ib_transport); - rds_ib_fmr_exit(); + rds_ib_mr_exit(); } struct rds_transport rds_ib_transport = { @@ -400,13 +411,13 @@ int rds_ib_init(void) INIT_LIST_HEAD(&rds_ib_devices); - ret = rds_ib_fmr_init(); + ret = rds_ib_mr_init(); if (ret) goto out; ret = ib_register_client(&rds_ib_client); if (ret) - goto out_fmr_exit; + goto out_mr_exit; ret = rds_ib_sysctl_init(); if (ret) @@ -430,8 +441,8 @@ out_sysctl: rds_ib_sysctl_exit(); out_ibreg: rds_ib_unregister_client(); -out_fmr_exit: - rds_ib_fmr_exit(); +out_mr_exit: + rds_ib_mr_exit(); out: return ret; } diff --git a/net/rds/ib.h b/net/rds/ib.h index b3fdebb57..627fb79ae 100644 --- a/net/rds/ib.h +++ b/net/rds/ib.h @@ -9,17 +9,12 @@ #include "rds.h" #include "rdma_transport.h" -#define RDS_FMR_1M_POOL_SIZE (8192 / 2) -#define RDS_FMR_1M_MSG_SIZE 256 -#define RDS_FMR_8K_MSG_SIZE 2 -#define RDS_MR_8K_SCALE (256 / (RDS_FMR_8K_MSG_SIZE + 1)) -#define RDS_FMR_8K_POOL_SIZE (RDS_MR_8K_SCALE * (8192 / 2)) - #define RDS_IB_MAX_SGE 8 #define RDS_IB_RECV_SGE 2 #define RDS_IB_DEFAULT_RECV_WR 1024 #define RDS_IB_DEFAULT_SEND_WR 256 +#define RDS_IB_DEFAULT_FR_WR 512 #define RDS_IB_DEFAULT_RETRY_COUNT 2 @@ -28,7 +23,6 @@ #define RDS_IB_RECYCLE_BATCH_COUNT 32 #define RDS_IB_WC_MAX 32 -#define RDS_IB_SEND_OP BIT_ULL(63) extern struct rw_semaphore rds_ib_devices_lock; extern struct list_head rds_ib_devices; @@ -129,6 +123,9 @@ struct rds_ib_connection { struct ib_wc i_send_wc[RDS_IB_WC_MAX]; struct ib_wc i_recv_wc[RDS_IB_WC_MAX]; + /* To control the number of wrs from fastreg */ + atomic_t i_fastreg_wrs; + /* interrupt handling */ struct tasklet_struct i_send_tasklet; struct tasklet_struct i_recv_tasklet; @@ -207,12 +204,16 @@ struct rds_ib_device { struct list_head conn_list; struct ib_device *dev; struct ib_pd *pd; - unsigned int max_fmrs; + bool has_fmr; + bool has_fr; + bool use_fastreg; + + unsigned int max_mrs; struct rds_ib_mr_pool *mr_1m_pool; struct rds_ib_mr_pool *mr_8k_pool; unsigned int fmr_max_remaps; - unsigned int max_8k_fmrs; - unsigned int max_1m_fmrs; + unsigned int max_8k_mrs; + unsigned int max_1m_mrs; int max_sge; unsigned int max_wrs; unsigned int max_initiator_depth; @@ -266,6 +267,8 @@ struct rds_ib_statistics { uint64_t s_ib_rdma_mr_1m_pool_flush; uint64_t s_ib_rdma_mr_1m_pool_wait; uint64_t s_ib_rdma_mr_1m_pool_depleted; + uint64_t s_ib_rdma_mr_8k_reused; + uint64_t s_ib_rdma_mr_1m_reused; uint64_t s_ib_atomic_cswp; uint64_t s_ib_atomic_fadd; }; @@ -317,8 +320,6 @@ struct rds_ib_device *rds_ib_get_client_data(struct ib_device *device); void rds_ib_dev_put(struct rds_ib_device *rds_ibdev); extern struct ib_client rds_ib_client; -extern unsigned int rds_ib_fmr_1m_pool_size; -extern unsigned int rds_ib_fmr_8k_pool_size; extern unsigned int rds_ib_retry_count; extern spinlock_t ib_nodev_conns_lock; @@ -348,17 +349,7 @@ int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr); void rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn); void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn); void rds_ib_destroy_nodev_conns(void); -struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_dev, - int npages); -void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_connection *iinfo); -void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *); -void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents, - struct rds_sock *rs, u32 *key_ret); -void rds_ib_sync_mr(void *trans_private, int dir); -void rds_ib_free_mr(void *trans_private, int invalidate); -void rds_ib_flush_mrs(void); -int rds_ib_fmr_init(void); -void rds_ib_fmr_exit(void); +void rds_ib_mr_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc); /* ib_recv.c */ int rds_ib_recv_init(void); diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c index da5a7fb98..310cabce2 100644 --- a/net/rds/ib_cm.c +++ b/net/rds/ib_cm.c @@ -194,7 +194,7 @@ static void rds_ib_cm_fill_conn_param(struct rds_connection *conn, dp->dp_protocol_major = RDS_PROTOCOL_MAJOR(protocol_version); dp->dp_protocol_minor = RDS_PROTOCOL_MINOR(protocol_version); dp->dp_protocol_minor_mask = cpu_to_be16(RDS_IB_SUPPORTED_PROTOCOLS); - dp->dp_ack_seq = rds_ib_piggyb_ack(ic); + dp->dp_ack_seq = cpu_to_be64(rds_ib_piggyb_ack(ic)); /* Advertise flow control */ if (ic->i_flowctl) { @@ -236,12 +236,10 @@ static void rds_ib_cq_comp_handler_recv(struct ib_cq *cq, void *context) tasklet_schedule(&ic->i_recv_tasklet); } -static void poll_cq(struct rds_ib_connection *ic, struct ib_cq *cq, - struct ib_wc *wcs, - struct rds_ib_ack_state *ack_state) +static void poll_scq(struct rds_ib_connection *ic, struct ib_cq *cq, + struct ib_wc *wcs) { - int nr; - int i; + int nr, i; struct ib_wc *wc; while ((nr = ib_poll_cq(cq, RDS_IB_WC_MAX, wcs)) > 0) { @@ -251,10 +249,12 @@ static void poll_cq(struct rds_ib_connection *ic, struct ib_cq *cq, (unsigned long long)wc->wr_id, wc->status, wc->byte_len, be32_to_cpu(wc->ex.imm_data)); - if (wc->wr_id & RDS_IB_SEND_OP) + if (wc->wr_id <= ic->i_send_ring.w_nr || + wc->wr_id == RDS_IB_ACK_WR_ID) rds_ib_send_cqe_handler(ic, wc); else - rds_ib_recv_cqe_handler(ic, wc, ack_state); + rds_ib_mr_cqe_handler(ic, wc); + } } } @@ -263,14 +263,12 @@ static void rds_ib_tasklet_fn_send(unsigned long data) { struct rds_ib_connection *ic = (struct rds_ib_connection *)data; struct rds_connection *conn = ic->conn; - struct rds_ib_ack_state state; rds_ib_stats_inc(s_ib_tasklet_call); - memset(&state, 0, sizeof(state)); - poll_cq(ic, ic->i_send_cq, ic->i_send_wc, &state); + poll_scq(ic, ic->i_send_cq, ic->i_send_wc); ib_req_notify_cq(ic->i_send_cq, IB_CQ_NEXT_COMP); - poll_cq(ic, ic->i_send_cq, ic->i_send_wc, &state); + poll_scq(ic, ic->i_send_cq, ic->i_send_wc); if (rds_conn_up(conn) && (!test_bit(RDS_LL_SEND_FULL, &conn->c_flags) || @@ -278,6 +276,25 @@ static void rds_ib_tasklet_fn_send(unsigned long data) rds_send_xmit(ic->conn); } +static void poll_rcq(struct rds_ib_connection *ic, struct ib_cq *cq, + struct ib_wc *wcs, + struct rds_ib_ack_state *ack_state) +{ + int nr, i; + struct ib_wc *wc; + + while ((nr = ib_poll_cq(cq, RDS_IB_WC_MAX, wcs)) > 0) { + for (i = 0; i < nr; i++) { + wc = wcs + i; + rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n", + (unsigned long long)wc->wr_id, wc->status, + wc->byte_len, be32_to_cpu(wc->ex.imm_data)); + + rds_ib_recv_cqe_handler(ic, wc, ack_state); + } + } +} + static void rds_ib_tasklet_fn_recv(unsigned long data) { struct rds_ib_connection *ic = (struct rds_ib_connection *)data; @@ -291,9 +308,9 @@ static void rds_ib_tasklet_fn_recv(unsigned long data) rds_ib_stats_inc(s_ib_tasklet_call); memset(&state, 0, sizeof(state)); - poll_cq(ic, ic->i_recv_cq, ic->i_recv_wc, &state); + poll_rcq(ic, ic->i_recv_cq, ic->i_recv_wc, &state); ib_req_notify_cq(ic->i_recv_cq, IB_CQ_SOLICITED); - poll_cq(ic, ic->i_recv_cq, ic->i_recv_wc, &state); + poll_rcq(ic, ic->i_recv_cq, ic->i_recv_wc, &state); if (state.ack_next_valid) rds_ib_set_ack(ic, state.ack_next, state.ack_required); @@ -351,7 +368,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn) struct ib_qp_init_attr attr; struct ib_cq_init_attr cq_attr = {}; struct rds_ib_device *rds_ibdev; - int ret; + int ret, fr_queue_space; /* * It's normal to see a null device if an incoming connection races @@ -361,6 +378,12 @@ static int rds_ib_setup_qp(struct rds_connection *conn) if (!rds_ibdev) return -EOPNOTSUPP; + /* The fr_queue_space is currently set to 512, to add extra space on + * completion queue and send queue. This extra space is used for FRMR + * registration and invalidation work requests + */ + fr_queue_space = (rds_ibdev->use_fastreg ? RDS_IB_DEFAULT_FR_WR : 0); + /* add the conn now so that connection establishment has the dev */ rds_ib_add_conn(rds_ibdev, conn); @@ -372,7 +395,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn) /* Protection domain and memory range */ ic->i_pd = rds_ibdev->pd; - cq_attr.cqe = ic->i_send_ring.w_nr + 1; + cq_attr.cqe = ic->i_send_ring.w_nr + fr_queue_space + 1; ic->i_send_cq = ib_create_cq(dev, rds_ib_cq_comp_handler_send, rds_ib_cq_event_handler, conn, @@ -412,7 +435,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn) attr.event_handler = rds_ib_qp_event_handler; attr.qp_context = conn; /* + 1 to allow for the single ack message */ - attr.cap.max_send_wr = ic->i_send_ring.w_nr + 1; + attr.cap.max_send_wr = ic->i_send_ring.w_nr + fr_queue_space + 1; attr.cap.max_recv_wr = ic->i_recv_ring.w_nr + 1; attr.cap.max_send_sge = rds_ibdev->max_sge; attr.cap.max_recv_sge = RDS_IB_RECV_SGE; @@ -420,6 +443,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn) attr.qp_type = IB_QPT_RC; attr.send_cq = ic->i_send_cq; attr.recv_cq = ic->i_recv_cq; + atomic_set(&ic->i_fastreg_wrs, RDS_IB_DEFAULT_FR_WR); /* * XXX this can fail if max_*_wr is too large? Are we supposed @@ -739,7 +763,8 @@ void rds_ib_conn_shutdown(struct rds_connection *conn) */ wait_event(rds_ib_ring_empty_wait, rds_ib_ring_empty(&ic->i_recv_ring) && - (atomic_read(&ic->i_signaled_sends) == 0)); + (atomic_read(&ic->i_signaled_sends) == 0) && + (atomic_read(&ic->i_fastreg_wrs) == RDS_IB_DEFAULT_FR_WR)); tasklet_kill(&ic->i_send_tasklet); tasklet_kill(&ic->i_recv_tasklet); diff --git a/net/rds/ib_fmr.c b/net/rds/ib_fmr.c new file mode 100644 index 000000000..4fe8f4fec --- /dev/null +++ b/net/rds/ib_fmr.c @@ -0,0 +1,248 @@ +/* + * Copyright (c) 2016 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "ib_mr.h" + +struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev, int npages) +{ + struct rds_ib_mr_pool *pool; + struct rds_ib_mr *ibmr = NULL; + struct rds_ib_fmr *fmr; + int err = 0; + + if (npages <= RDS_MR_8K_MSG_SIZE) + pool = rds_ibdev->mr_8k_pool; + else + pool = rds_ibdev->mr_1m_pool; + + ibmr = rds_ib_try_reuse_ibmr(pool); + if (ibmr) + return ibmr; + + ibmr = kzalloc_node(sizeof(*ibmr), GFP_KERNEL, + rdsibdev_to_node(rds_ibdev)); + if (!ibmr) { + err = -ENOMEM; + goto out_no_cigar; + } + + fmr = &ibmr->u.fmr; + fmr->fmr = ib_alloc_fmr(rds_ibdev->pd, + (IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_READ | + IB_ACCESS_REMOTE_WRITE | + IB_ACCESS_REMOTE_ATOMIC), + &pool->fmr_attr); + if (IS_ERR(fmr->fmr)) { + err = PTR_ERR(fmr->fmr); + fmr->fmr = NULL; + pr_warn("RDS/IB: %s failed (err=%d)\n", __func__, err); + goto out_no_cigar; + } + + ibmr->pool = pool; + if (pool->pool_type == RDS_IB_MR_8K_POOL) + rds_ib_stats_inc(s_ib_rdma_mr_8k_alloc); + else + rds_ib_stats_inc(s_ib_rdma_mr_1m_alloc); + + return ibmr; + +out_no_cigar: + if (ibmr) { + if (fmr->fmr) + ib_dealloc_fmr(fmr->fmr); + kfree(ibmr); + } + atomic_dec(&pool->item_count); + return ERR_PTR(err); +} + +int rds_ib_map_fmr(struct rds_ib_device *rds_ibdev, struct rds_ib_mr *ibmr, + struct scatterlist *sg, unsigned int nents) +{ + struct ib_device *dev = rds_ibdev->dev; + struct rds_ib_fmr *fmr = &ibmr->u.fmr; + struct scatterlist *scat = sg; + u64 io_addr = 0; + u64 *dma_pages; + u32 len; + int page_cnt, sg_dma_len; + int i, j; + int ret; + + sg_dma_len = ib_dma_map_sg(dev, sg, nents, DMA_BIDIRECTIONAL); + if (unlikely(!sg_dma_len)) { + pr_warn("RDS/IB: %s failed!\n", __func__); + return -EBUSY; + } + + len = 0; + page_cnt = 0; + + for (i = 0; i < sg_dma_len; ++i) { + unsigned int dma_len = ib_sg_dma_len(dev, &scat[i]); + u64 dma_addr = ib_sg_dma_address(dev, &scat[i]); + + if (dma_addr & ~PAGE_MASK) { + if (i > 0) + return -EINVAL; + else + ++page_cnt; + } + if ((dma_addr + dma_len) & ~PAGE_MASK) { + if (i < sg_dma_len - 1) + return -EINVAL; + else + ++page_cnt; + } + + len += dma_len; + } + + page_cnt += len >> PAGE_SHIFT; + if (page_cnt > ibmr->pool->fmr_attr.max_pages) + return -EINVAL; + + dma_pages = kmalloc_node(sizeof(u64) * page_cnt, GFP_ATOMIC, + rdsibdev_to_node(rds_ibdev)); + if (!dma_pages) + return -ENOMEM; + + page_cnt = 0; + for (i = 0; i < sg_dma_len; ++i) { + unsigned int dma_len = ib_sg_dma_len(dev, &scat[i]); + u64 dma_addr = ib_sg_dma_address(dev, &scat[i]); + + for (j = 0; j < dma_len; j += PAGE_SIZE) + dma_pages[page_cnt++] = + (dma_addr & PAGE_MASK) + j; + } + + ret = ib_map_phys_fmr(fmr->fmr, dma_pages, page_cnt, io_addr); + if (ret) + goto out; + + /* Success - we successfully remapped the MR, so we can + * safely tear down the old mapping. + */ + rds_ib_teardown_mr(ibmr); + + ibmr->sg = scat; + ibmr->sg_len = nents; + ibmr->sg_dma_len = sg_dma_len; + ibmr->remap_count++; + + if (ibmr->pool->pool_type == RDS_IB_MR_8K_POOL) + rds_ib_stats_inc(s_ib_rdma_mr_8k_used); + else + rds_ib_stats_inc(s_ib_rdma_mr_1m_used); + ret = 0; + +out: + kfree(dma_pages); + + return ret; +} + +struct rds_ib_mr *rds_ib_reg_fmr(struct rds_ib_device *rds_ibdev, + struct scatterlist *sg, + unsigned long nents, + u32 *key) +{ + struct rds_ib_mr *ibmr = NULL; + struct rds_ib_fmr *fmr; + int ret; + + ibmr = rds_ib_alloc_fmr(rds_ibdev, nents); + if (IS_ERR(ibmr)) + return ibmr; + + ibmr->device = rds_ibdev; + fmr = &ibmr->u.fmr; + ret = rds_ib_map_fmr(rds_ibdev, ibmr, sg, nents); + if (ret == 0) + *key = fmr->fmr->rkey; + else + rds_ib_free_mr(ibmr, 0); + + return ibmr; +} + +void rds_ib_unreg_fmr(struct list_head *list, unsigned int *nfreed, + unsigned long *unpinned, unsigned int goal) +{ + struct rds_ib_mr *ibmr, *next; + struct rds_ib_fmr *fmr; + LIST_HEAD(fmr_list); + int ret = 0; + unsigned int freed = *nfreed; + + /* String all ib_mr's onto one list and hand them to ib_unmap_fmr */ + list_for_each_entry(ibmr, list, unmap_list) { + fmr = &ibmr->u.fmr; + list_add(&fmr->fmr->list, &fmr_list); + } + + ret = ib_unmap_fmr(&fmr_list); + if (ret) + pr_warn("RDS/IB: FMR invalidation failed (err=%d)\n", ret); + + /* Now we can destroy the DMA mapping and unpin any pages */ + list_for_each_entry_safe(ibmr, next, list, unmap_list) { + fmr = &ibmr->u.fmr; + *unpinned += ibmr->sg_len; + __rds_ib_teardown_mr(ibmr); + if (freed < goal || + ibmr->remap_count >= ibmr->pool->fmr_attr.max_maps) { + if (ibmr->pool->pool_type == RDS_IB_MR_8K_POOL) + rds_ib_stats_inc(s_ib_rdma_mr_8k_free); + else + rds_ib_stats_inc(s_ib_rdma_mr_1m_free); + list_del(&ibmr->unmap_list); + ib_dealloc_fmr(fmr->fmr); + kfree(ibmr); + freed++; + } + } + *nfreed = freed; +} + +void rds_ib_free_fmr_list(struct rds_ib_mr *ibmr) +{ + struct rds_ib_mr_pool *pool = ibmr->pool; + + if (ibmr->remap_count >= pool->fmr_attr.max_maps) + llist_add(&ibmr->llnode, &pool->drop_list); + else + llist_add(&ibmr->llnode, &pool->free_list); +} diff --git a/net/rds/ib_frmr.c b/net/rds/ib_frmr.c new file mode 100644 index 000000000..93ff038ea --- /dev/null +++ b/net/rds/ib_frmr.c @@ -0,0 +1,376 @@ +/* + * Copyright (c) 2016 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "ib_mr.h" + +static struct rds_ib_mr *rds_ib_alloc_frmr(struct rds_ib_device *rds_ibdev, + int npages) +{ + struct rds_ib_mr_pool *pool; + struct rds_ib_mr *ibmr = NULL; + struct rds_ib_frmr *frmr; + int err = 0; + + if (npages <= RDS_MR_8K_MSG_SIZE) + pool = rds_ibdev->mr_8k_pool; + else + pool = rds_ibdev->mr_1m_pool; + + ibmr = rds_ib_try_reuse_ibmr(pool); + if (ibmr) + return ibmr; + + ibmr = kzalloc_node(sizeof(*ibmr), GFP_KERNEL, + rdsibdev_to_node(rds_ibdev)); + if (!ibmr) { + err = -ENOMEM; + goto out_no_cigar; + } + + frmr = &ibmr->u.frmr; + frmr->mr = ib_alloc_mr(rds_ibdev->pd, IB_MR_TYPE_MEM_REG, + pool->fmr_attr.max_pages); + if (IS_ERR(frmr->mr)) { + pr_warn("RDS/IB: %s failed to allocate MR", __func__); + goto out_no_cigar; + } + + ibmr->pool = pool; + if (pool->pool_type == RDS_IB_MR_8K_POOL) + rds_ib_stats_inc(s_ib_rdma_mr_8k_alloc); + else + rds_ib_stats_inc(s_ib_rdma_mr_1m_alloc); + + if (atomic_read(&pool->item_count) > pool->max_items_soft) + pool->max_items_soft = pool->max_items; + + frmr->fr_state = FRMR_IS_FREE; + return ibmr; + +out_no_cigar: + kfree(ibmr); + atomic_dec(&pool->item_count); + return ERR_PTR(err); +} + +static void rds_ib_free_frmr(struct rds_ib_mr *ibmr, bool drop) +{ + struct rds_ib_mr_pool *pool = ibmr->pool; + + if (drop) + llist_add(&ibmr->llnode, &pool->drop_list); + else + llist_add(&ibmr->llnode, &pool->free_list); + atomic_add(ibmr->sg_len, &pool->free_pinned); + atomic_inc(&pool->dirty_count); + + /* If we've pinned too many pages, request a flush */ + if (atomic_read(&pool->free_pinned) >= pool->max_free_pinned || + atomic_read(&pool->dirty_count) >= pool->max_items / 5) + queue_delayed_work(rds_ib_mr_wq, &pool->flush_worker, 10); +} + +static int rds_ib_post_reg_frmr(struct rds_ib_mr *ibmr) +{ + struct rds_ib_frmr *frmr = &ibmr->u.frmr; + struct ib_send_wr *failed_wr; + struct ib_reg_wr reg_wr; + int ret; + + while (atomic_dec_return(&ibmr->ic->i_fastreg_wrs) <= 0) { + atomic_inc(&ibmr->ic->i_fastreg_wrs); + cpu_relax(); + } + + ret = ib_map_mr_sg_zbva(frmr->mr, ibmr->sg, ibmr->sg_len, PAGE_SIZE); + if (unlikely(ret != ibmr->sg_len)) + return ret < 0 ? ret : -EINVAL; + + /* Perform a WR for the fast_reg_mr. Each individual page + * in the sg list is added to the fast reg page list and placed + * inside the fast_reg_mr WR. The key used is a rolling 8bit + * counter, which should guarantee uniqueness. + */ + ib_update_fast_reg_key(frmr->mr, ibmr->remap_count++); + frmr->fr_state = FRMR_IS_INUSE; + + memset(®_wr, 0, sizeof(reg_wr)); + reg_wr.wr.wr_id = (unsigned long)(void *)ibmr; + reg_wr.wr.opcode = IB_WR_REG_MR; + reg_wr.wr.num_sge = 0; + reg_wr.mr = frmr->mr; + reg_wr.key = frmr->mr->rkey; + reg_wr.access = IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_READ | + IB_ACCESS_REMOTE_WRITE; + reg_wr.wr.send_flags = IB_SEND_SIGNALED; + + failed_wr = ®_wr.wr; + ret = ib_post_send(ibmr->ic->i_cm_id->qp, ®_wr.wr, &failed_wr); + WARN_ON(failed_wr != ®_wr.wr); + if (unlikely(ret)) { + /* Failure here can be because of -ENOMEM as well */ + frmr->fr_state = FRMR_IS_STALE; + atomic_inc(&ibmr->ic->i_fastreg_wrs); + if (printk_ratelimit()) + pr_warn("RDS/IB: %s returned error(%d)\n", + __func__, ret); + } + return ret; +} + +static int rds_ib_map_frmr(struct rds_ib_device *rds_ibdev, + struct rds_ib_mr_pool *pool, + struct rds_ib_mr *ibmr, + struct scatterlist *sg, unsigned int sg_len) +{ + struct ib_device *dev = rds_ibdev->dev; + struct rds_ib_frmr *frmr = &ibmr->u.frmr; + int i; + u32 len; + int ret = 0; + + /* We want to teardown old ibmr values here and fill it up with + * new sg values + */ + rds_ib_teardown_mr(ibmr); + + ibmr->sg = sg; + ibmr->sg_len = sg_len; + ibmr->sg_dma_len = 0; + frmr->sg_byte_len = 0; + WARN_ON(ibmr->sg_dma_len); + ibmr->sg_dma_len = ib_dma_map_sg(dev, ibmr->sg, ibmr->sg_len, + DMA_BIDIRECTIONAL); + if (unlikely(!ibmr->sg_dma_len)) { + pr_warn("RDS/IB: %s failed!\n", __func__); + return -EBUSY; + } + + frmr->sg_byte_len = 0; + frmr->dma_npages = 0; + len = 0; + + ret = -EINVAL; + for (i = 0; i < ibmr->sg_dma_len; ++i) { + unsigned int dma_len = ib_sg_dma_len(dev, &ibmr->sg[i]); + u64 dma_addr = ib_sg_dma_address(dev, &ibmr->sg[i]); + + frmr->sg_byte_len += dma_len; + if (dma_addr & ~PAGE_MASK) { + if (i > 0) + goto out_unmap; + else + ++frmr->dma_npages; + } + + if ((dma_addr + dma_len) & ~PAGE_MASK) { + if (i < ibmr->sg_dma_len - 1) + goto out_unmap; + else + ++frmr->dma_npages; + } + + len += dma_len; + } + frmr->dma_npages += len >> PAGE_SHIFT; + + if (frmr->dma_npages > ibmr->pool->fmr_attr.max_pages) { + ret = -EMSGSIZE; + goto out_unmap; + } + + ret = rds_ib_post_reg_frmr(ibmr); + if (ret) + goto out_unmap; + + if (ibmr->pool->pool_type == RDS_IB_MR_8K_POOL) + rds_ib_stats_inc(s_ib_rdma_mr_8k_used); + else + rds_ib_stats_inc(s_ib_rdma_mr_1m_used); + + return ret; + +out_unmap: + ib_dma_unmap_sg(rds_ibdev->dev, ibmr->sg, ibmr->sg_len, + DMA_BIDIRECTIONAL); + ibmr->sg_dma_len = 0; + return ret; +} + +static int rds_ib_post_inv(struct rds_ib_mr *ibmr) +{ + struct ib_send_wr *s_wr, *failed_wr; + struct rds_ib_frmr *frmr = &ibmr->u.frmr; + struct rdma_cm_id *i_cm_id = ibmr->ic->i_cm_id; + int ret = -EINVAL; + + if (!i_cm_id || !i_cm_id->qp || !frmr->mr) + goto out; + + if (frmr->fr_state != FRMR_IS_INUSE) + goto out; + + while (atomic_dec_return(&ibmr->ic->i_fastreg_wrs) <= 0) { + atomic_inc(&ibmr->ic->i_fastreg_wrs); + cpu_relax(); + } + + frmr->fr_inv = true; + s_wr = &frmr->fr_wr; + + memset(s_wr, 0, sizeof(*s_wr)); + s_wr->wr_id = (unsigned long)(void *)ibmr; + s_wr->opcode = IB_WR_LOCAL_INV; + s_wr->ex.invalidate_rkey = frmr->mr->rkey; + s_wr->send_flags = IB_SEND_SIGNALED; + + failed_wr = s_wr; + ret = ib_post_send(i_cm_id->qp, s_wr, &failed_wr); + WARN_ON(failed_wr != s_wr); + if (unlikely(ret)) { + frmr->fr_state = FRMR_IS_STALE; + frmr->fr_inv = false; + atomic_inc(&ibmr->ic->i_fastreg_wrs); + pr_err("RDS/IB: %s returned error(%d)\n", __func__, ret); + goto out; + } +out: + return ret; +} + +void rds_ib_mr_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc) +{ + struct rds_ib_mr *ibmr = (void *)(unsigned long)wc->wr_id; + struct rds_ib_frmr *frmr = &ibmr->u.frmr; + + if (wc->status != IB_WC_SUCCESS) { + frmr->fr_state = FRMR_IS_STALE; + if (rds_conn_up(ic->conn)) + rds_ib_conn_error(ic->conn, + "frmr completion <%pI4,%pI4> status %u(%s), vendor_err 0x%x, disconnecting and reconnecting\n", + &ic->conn->c_laddr, + &ic->conn->c_faddr, + wc->status, + ib_wc_status_msg(wc->status), + wc->vendor_err); + } + + if (frmr->fr_inv) { + frmr->fr_state = FRMR_IS_FREE; + frmr->fr_inv = false; + } + + atomic_inc(&ic->i_fastreg_wrs); +} + +void rds_ib_unreg_frmr(struct list_head *list, unsigned int *nfreed, + unsigned long *unpinned, unsigned int goal) +{ + struct rds_ib_mr *ibmr, *next; + struct rds_ib_frmr *frmr; + int ret = 0; + unsigned int freed = *nfreed; + + /* String all ib_mr's onto one list and hand them to ib_unmap_fmr */ + list_for_each_entry(ibmr, list, unmap_list) { + if (ibmr->sg_dma_len) + ret |= rds_ib_post_inv(ibmr); + } + if (ret) + pr_warn("RDS/IB: %s failed (err=%d)\n", __func__, ret); + + /* Now we can destroy the DMA mapping and unpin any pages */ + list_for_each_entry_safe(ibmr, next, list, unmap_list) { + *unpinned += ibmr->sg_len; + frmr = &ibmr->u.frmr; + __rds_ib_teardown_mr(ibmr); + if (freed < goal || frmr->fr_state == FRMR_IS_STALE) { + /* Don't de-allocate if the MR is not free yet */ + if (frmr->fr_state == FRMR_IS_INUSE) + continue; + + if (ibmr->pool->pool_type == RDS_IB_MR_8K_POOL) + rds_ib_stats_inc(s_ib_rdma_mr_8k_free); + else + rds_ib_stats_inc(s_ib_rdma_mr_1m_free); + list_del(&ibmr->unmap_list); + if (frmr->mr) + ib_dereg_mr(frmr->mr); + kfree(ibmr); + freed++; + } + } + *nfreed = freed; +} + +struct rds_ib_mr *rds_ib_reg_frmr(struct rds_ib_device *rds_ibdev, + struct rds_ib_connection *ic, + struct scatterlist *sg, + unsigned long nents, u32 *key) +{ + struct rds_ib_mr *ibmr = NULL; + struct rds_ib_frmr *frmr; + int ret; + + do { + if (ibmr) + rds_ib_free_frmr(ibmr, true); + ibmr = rds_ib_alloc_frmr(rds_ibdev, nents); + if (IS_ERR(ibmr)) + return ibmr; + frmr = &ibmr->u.frmr; + } while (frmr->fr_state != FRMR_IS_FREE); + + ibmr->ic = ic; + ibmr->device = rds_ibdev; + ret = rds_ib_map_frmr(rds_ibdev, ibmr->pool, ibmr, sg, nents); + if (ret == 0) { + *key = frmr->mr->rkey; + } else { + rds_ib_free_frmr(ibmr, false); + ibmr = ERR_PTR(ret); + } + + return ibmr; +} + +void rds_ib_free_frmr_list(struct rds_ib_mr *ibmr) +{ + struct rds_ib_mr_pool *pool = ibmr->pool; + struct rds_ib_frmr *frmr = &ibmr->u.frmr; + + if (frmr->fr_state == FRMR_IS_STALE) + llist_add(&ibmr->llnode, &pool->drop_list); + else + llist_add(&ibmr->llnode, &pool->free_list); +} diff --git a/net/rds/ib_mr.h b/net/rds/ib_mr.h new file mode 100644 index 000000000..1c754f4ac --- /dev/null +++ b/net/rds/ib_mr.h @@ -0,0 +1,148 @@ +/* + * Copyright (c) 2016 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef _RDS_IB_MR_H +#define _RDS_IB_MR_H + +#include <linux/kernel.h> + +#include "rds.h" +#include "ib.h" + +#define RDS_MR_1M_POOL_SIZE (8192 / 2) +#define RDS_MR_1M_MSG_SIZE 256 +#define RDS_MR_8K_MSG_SIZE 2 +#define RDS_MR_8K_SCALE (256 / (RDS_MR_8K_MSG_SIZE + 1)) +#define RDS_MR_8K_POOL_SIZE (RDS_MR_8K_SCALE * (8192 / 2)) + +struct rds_ib_fmr { + struct ib_fmr *fmr; + u64 *dma; +}; + +enum rds_ib_fr_state { + FRMR_IS_FREE, /* mr invalidated & ready for use */ + FRMR_IS_INUSE, /* mr is in use or used & can be invalidated */ + FRMR_IS_STALE, /* Stale MR and needs to be dropped */ +}; + +struct rds_ib_frmr { + struct ib_mr *mr; + enum rds_ib_fr_state fr_state; + bool fr_inv; + struct ib_send_wr fr_wr; + unsigned int dma_npages; + unsigned int sg_byte_len; +}; + +/* This is stored as mr->r_trans_private. */ +struct rds_ib_mr { + struct rds_ib_device *device; + struct rds_ib_mr_pool *pool; + struct rds_ib_connection *ic; + + struct llist_node llnode; + + /* unmap_list is for freeing */ + struct list_head unmap_list; + unsigned int remap_count; + + struct scatterlist *sg; + unsigned int sg_len; + int sg_dma_len; + + union { + struct rds_ib_fmr fmr; + struct rds_ib_frmr frmr; + } u; +}; + +/* Our own little MR pool */ +struct rds_ib_mr_pool { + unsigned int pool_type; + struct mutex flush_lock; /* serialize fmr invalidate */ + struct delayed_work flush_worker; /* flush worker */ + + atomic_t item_count; /* total # of MRs */ + atomic_t dirty_count; /* # dirty of MRs */ + + struct llist_head drop_list; /* MRs not reached max_maps */ + struct llist_head free_list; /* unused MRs */ + struct llist_head clean_list; /* unused & unmapped MRs */ + wait_queue_head_t flush_wait; + + atomic_t free_pinned; /* memory pinned by free MRs */ + unsigned long max_items; + unsigned long max_items_soft; + unsigned long max_free_pinned; + struct ib_fmr_attr fmr_attr; + bool use_fastreg; +}; + +extern struct workqueue_struct *rds_ib_mr_wq; +extern unsigned int rds_ib_mr_1m_pool_size; +extern unsigned int rds_ib_mr_8k_pool_size; +extern bool prefer_frmr; + +struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_dev, + int npages); +void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, + struct rds_info_rdma_connection *iinfo); +void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *); +void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents, + struct rds_sock *rs, u32 *key_ret); +void rds_ib_sync_mr(void *trans_private, int dir); +void rds_ib_free_mr(void *trans_private, int invalidate); +void rds_ib_flush_mrs(void); +int rds_ib_mr_init(void); +void rds_ib_mr_exit(void); + +void __rds_ib_teardown_mr(struct rds_ib_mr *); +void rds_ib_teardown_mr(struct rds_ib_mr *); +struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *, int); +int rds_ib_map_fmr(struct rds_ib_device *, struct rds_ib_mr *, + struct scatterlist *, unsigned int); +struct rds_ib_mr *rds_ib_reuse_mr(struct rds_ib_mr_pool *); +int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *, int, struct rds_ib_mr **); +struct rds_ib_mr *rds_ib_reg_fmr(struct rds_ib_device *, struct scatterlist *, + unsigned long, u32 *); +struct rds_ib_mr *rds_ib_try_reuse_ibmr(struct rds_ib_mr_pool *); +void rds_ib_unreg_fmr(struct list_head *, unsigned int *, + unsigned long *, unsigned int); +void rds_ib_free_fmr_list(struct rds_ib_mr *); +struct rds_ib_mr *rds_ib_reg_frmr(struct rds_ib_device *rds_ibdev, + struct rds_ib_connection *ic, + struct scatterlist *sg, + unsigned long nents, u32 *key); +void rds_ib_unreg_frmr(struct list_head *list, unsigned int *nfreed, + unsigned long *unpinned, unsigned int goal); +void rds_ib_free_frmr_list(struct rds_ib_mr *); +#endif diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c index a2340748e..f7164ac1f 100644 --- a/net/rds/ib_rdma.c +++ b/net/rds/ib_rdma.c @@ -35,78 +35,13 @@ #include <linux/rculist.h> #include <linux/llist.h> -#include "rds.h" -#include "ib.h" +#include "ib_mr.h" + +struct workqueue_struct *rds_ib_mr_wq; static DEFINE_PER_CPU(unsigned long, clean_list_grace); #define CLEAN_LIST_BUSY_BIT 0 -/* - * This is stored as mr->r_trans_private. - */ -struct rds_ib_mr { - struct rds_ib_device *device; - struct rds_ib_mr_pool *pool; - struct ib_fmr *fmr; - - struct llist_node llnode; - - /* unmap_list is for freeing */ - struct list_head unmap_list; - unsigned int remap_count; - - struct scatterlist *sg; - unsigned int sg_len; - u64 *dma; - int sg_dma_len; -}; - -/* - * Our own little FMR pool - */ -struct rds_ib_mr_pool { - unsigned int pool_type; - struct mutex flush_lock; /* serialize fmr invalidate */ - struct delayed_work flush_worker; /* flush worker */ - - atomic_t item_count; /* total # of MRs */ - atomic_t dirty_count; /* # dirty of MRs */ - - struct llist_head drop_list; /* MRs that have reached their max_maps limit */ - struct llist_head free_list; /* unused MRs */ - struct llist_head clean_list; /* global unused & unamapped MRs */ - wait_queue_head_t flush_wait; - - atomic_t free_pinned; /* memory pinned by free MRs */ - unsigned long max_items; - unsigned long max_items_soft; - unsigned long max_free_pinned; - struct ib_fmr_attr fmr_attr; -}; - -static struct workqueue_struct *rds_ib_fmr_wq; - -int rds_ib_fmr_init(void) -{ - rds_ib_fmr_wq = create_workqueue("rds_fmr_flushd"); - if (!rds_ib_fmr_wq) - return -ENOMEM; - return 0; -} - -/* By the time this is called all the IB devices should have been torn down and - * had their pools freed. As each pool is freed its work struct is waited on, - * so the pool flushing work queue should be idle by the time we get here. - */ -void rds_ib_fmr_exit(void) -{ - destroy_workqueue(rds_ib_fmr_wq); -} - -static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all, struct rds_ib_mr **); -static void rds_ib_teardown_mr(struct rds_ib_mr *ibmr); -static void rds_ib_mr_pool_flush_worker(struct work_struct *work); - static struct rds_ib_device *rds_ib_get_device(__be32 ipaddr) { struct rds_ib_device *rds_ibdev; @@ -235,41 +170,6 @@ void rds_ib_destroy_nodev_conns(void) rds_conn_destroy(ic->conn); } -struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev, - int pool_type) -{ - struct rds_ib_mr_pool *pool; - - pool = kzalloc(sizeof(*pool), GFP_KERNEL); - if (!pool) - return ERR_PTR(-ENOMEM); - - pool->pool_type = pool_type; - init_llist_head(&pool->free_list); - init_llist_head(&pool->drop_list); - init_llist_head(&pool->clean_list); - mutex_init(&pool->flush_lock); - init_waitqueue_head(&pool->flush_wait); - INIT_DELAYED_WORK(&pool->flush_worker, rds_ib_mr_pool_flush_worker); - - if (pool_type == RDS_IB_MR_1M_POOL) { - /* +1 allows for unaligned MRs */ - pool->fmr_attr.max_pages = RDS_FMR_1M_MSG_SIZE + 1; - pool->max_items = RDS_FMR_1M_POOL_SIZE; - } else { - /* pool_type == RDS_IB_MR_8K_POOL */ - pool->fmr_attr.max_pages = RDS_FMR_8K_MSG_SIZE + 1; - pool->max_items = RDS_FMR_8K_POOL_SIZE; - } - - pool->max_free_pinned = pool->max_items * pool->fmr_attr.max_pages / 4; - pool->fmr_attr.max_maps = rds_ibdev->fmr_max_remaps; - pool->fmr_attr.page_shift = PAGE_SHIFT; - pool->max_items_soft = rds_ibdev->max_fmrs * 3 / 4; - - return pool; -} - void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_connection *iinfo) { struct rds_ib_mr_pool *pool_1m = rds_ibdev->mr_1m_pool; @@ -278,16 +178,7 @@ void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_co iinfo->rdma_mr_size = pool_1m->fmr_attr.max_pages; } -void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *pool) -{ - cancel_delayed_work_sync(&pool->flush_worker); - rds_ib_flush_mr_pool(pool, 1, NULL); - WARN_ON(atomic_read(&pool->item_count)); - WARN_ON(atomic_read(&pool->free_pinned)); - kfree(pool); -} - -static inline struct rds_ib_mr *rds_ib_reuse_fmr(struct rds_ib_mr_pool *pool) +struct rds_ib_mr *rds_ib_reuse_mr(struct rds_ib_mr_pool *pool) { struct rds_ib_mr *ibmr = NULL; struct llist_node *ret; @@ -297,8 +188,13 @@ static inline struct rds_ib_mr *rds_ib_reuse_fmr(struct rds_ib_mr_pool *pool) flag = this_cpu_ptr(&clean_list_grace); set_bit(CLEAN_LIST_BUSY_BIT, flag); ret = llist_del_first(&pool->clean_list); - if (ret) + if (ret) { ibmr = llist_entry(ret, struct rds_ib_mr, llnode); + if (pool->pool_type == RDS_IB_MR_8K_POOL) + rds_ib_stats_inc(s_ib_rdma_mr_8k_reused); + else + rds_ib_stats_inc(s_ib_rdma_mr_1m_reused); + } clear_bit(CLEAN_LIST_BUSY_BIT, flag); preempt_enable(); @@ -317,190 +213,6 @@ static inline void wait_clean_list_grace(void) } } -static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev, - int npages) -{ - struct rds_ib_mr_pool *pool; - struct rds_ib_mr *ibmr = NULL; - int err = 0, iter = 0; - - if (npages <= RDS_FMR_8K_MSG_SIZE) - pool = rds_ibdev->mr_8k_pool; - else - pool = rds_ibdev->mr_1m_pool; - - if (atomic_read(&pool->dirty_count) >= pool->max_items / 10) - queue_delayed_work(rds_ib_fmr_wq, &pool->flush_worker, 10); - - /* Switch pools if one of the pool is reaching upper limit */ - if (atomic_read(&pool->dirty_count) >= pool->max_items * 9 / 10) { - if (pool->pool_type == RDS_IB_MR_8K_POOL) - pool = rds_ibdev->mr_1m_pool; - else - pool = rds_ibdev->mr_8k_pool; - } - - while (1) { - ibmr = rds_ib_reuse_fmr(pool); - if (ibmr) - return ibmr; - - /* No clean MRs - now we have the choice of either - * allocating a fresh MR up to the limit imposed by the - * driver, or flush any dirty unused MRs. - * We try to avoid stalling in the send path if possible, - * so we allocate as long as we're allowed to. - * - * We're fussy with enforcing the FMR limit, though. If the driver - * tells us we can't use more than N fmrs, we shouldn't start - * arguing with it */ - if (atomic_inc_return(&pool->item_count) <= pool->max_items) - break; - - atomic_dec(&pool->item_count); - - if (++iter > 2) { - if (pool->pool_type == RDS_IB_MR_8K_POOL) - rds_ib_stats_inc(s_ib_rdma_mr_8k_pool_depleted); - else - rds_ib_stats_inc(s_ib_rdma_mr_1m_pool_depleted); - return ERR_PTR(-EAGAIN); - } - - /* We do have some empty MRs. Flush them out. */ - if (pool->pool_type == RDS_IB_MR_8K_POOL) - rds_ib_stats_inc(s_ib_rdma_mr_8k_pool_wait); - else - rds_ib_stats_inc(s_ib_rdma_mr_1m_pool_wait); - rds_ib_flush_mr_pool(pool, 0, &ibmr); - if (ibmr) - return ibmr; - } - - ibmr = kzalloc_node(sizeof(*ibmr), GFP_KERNEL, rdsibdev_to_node(rds_ibdev)); - if (!ibmr) { - err = -ENOMEM; - goto out_no_cigar; - } - - ibmr->fmr = ib_alloc_fmr(rds_ibdev->pd, - (IB_ACCESS_LOCAL_WRITE | - IB_ACCESS_REMOTE_READ | - IB_ACCESS_REMOTE_WRITE| - IB_ACCESS_REMOTE_ATOMIC), - &pool->fmr_attr); - if (IS_ERR(ibmr->fmr)) { - err = PTR_ERR(ibmr->fmr); - ibmr->fmr = NULL; - printk(KERN_WARNING "RDS/IB: ib_alloc_fmr failed (err=%d)\n", err); - goto out_no_cigar; - } - - ibmr->pool = pool; - if (pool->pool_type == RDS_IB_MR_8K_POOL) - rds_ib_stats_inc(s_ib_rdma_mr_8k_alloc); - else - rds_ib_stats_inc(s_ib_rdma_mr_1m_alloc); - - return ibmr; - -out_no_cigar: - if (ibmr) { - if (ibmr->fmr) - ib_dealloc_fmr(ibmr->fmr); - kfree(ibmr); - } - atomic_dec(&pool->item_count); - return ERR_PTR(err); -} - -static int rds_ib_map_fmr(struct rds_ib_device *rds_ibdev, struct rds_ib_mr *ibmr, - struct scatterlist *sg, unsigned int nents) -{ - struct ib_device *dev = rds_ibdev->dev; - struct scatterlist *scat = sg; - u64 io_addr = 0; - u64 *dma_pages; - u32 len; - int page_cnt, sg_dma_len; - int i, j; - int ret; - - sg_dma_len = ib_dma_map_sg(dev, sg, nents, - DMA_BIDIRECTIONAL); - if (unlikely(!sg_dma_len)) { - printk(KERN_WARNING "RDS/IB: dma_map_sg failed!\n"); - return -EBUSY; - } - - len = 0; - page_cnt = 0; - - for (i = 0; i < sg_dma_len; ++i) { - unsigned int dma_len = ib_sg_dma_len(dev, &scat[i]); - u64 dma_addr = ib_sg_dma_address(dev, &scat[i]); - - if (dma_addr & ~PAGE_MASK) { - if (i > 0) - return -EINVAL; - else - ++page_cnt; - } - if ((dma_addr + dma_len) & ~PAGE_MASK) { - if (i < sg_dma_len - 1) - return -EINVAL; - else - ++page_cnt; - } - - len += dma_len; - } - - page_cnt += len >> PAGE_SHIFT; - if (page_cnt > ibmr->pool->fmr_attr.max_pages) - return -EINVAL; - - dma_pages = kmalloc_node(sizeof(u64) * page_cnt, GFP_ATOMIC, - rdsibdev_to_node(rds_ibdev)); - if (!dma_pages) - return -ENOMEM; - - page_cnt = 0; - for (i = 0; i < sg_dma_len; ++i) { - unsigned int dma_len = ib_sg_dma_len(dev, &scat[i]); - u64 dma_addr = ib_sg_dma_address(dev, &scat[i]); - - for (j = 0; j < dma_len; j += PAGE_SIZE) - dma_pages[page_cnt++] = - (dma_addr & PAGE_MASK) + j; - } - - ret = ib_map_phys_fmr(ibmr->fmr, - dma_pages, page_cnt, io_addr); - if (ret) - goto out; - - /* Success - we successfully remapped the MR, so we can - * safely tear down the old mapping. */ - rds_ib_teardown_mr(ibmr); - - ibmr->sg = scat; - ibmr->sg_len = nents; - ibmr->sg_dma_len = sg_dma_len; - ibmr->remap_count++; - - if (ibmr->pool->pool_type == RDS_IB_MR_8K_POOL) - rds_ib_stats_inc(s_ib_rdma_mr_8k_used); - else - rds_ib_stats_inc(s_ib_rdma_mr_1m_used); - ret = 0; - -out: - kfree(dma_pages); - - return ret; -} - void rds_ib_sync_mr(void *trans_private, int direction) { struct rds_ib_mr *ibmr = trans_private; @@ -518,7 +230,7 @@ void rds_ib_sync_mr(void *trans_private, int direction) } } -static void __rds_ib_teardown_mr(struct rds_ib_mr *ibmr) +void __rds_ib_teardown_mr(struct rds_ib_mr *ibmr) { struct rds_ib_device *rds_ibdev = ibmr->device; @@ -549,7 +261,7 @@ static void __rds_ib_teardown_mr(struct rds_ib_mr *ibmr) } } -static void rds_ib_teardown_mr(struct rds_ib_mr *ibmr) +void rds_ib_teardown_mr(struct rds_ib_mr *ibmr) { unsigned int pinned = ibmr->sg_len; @@ -623,17 +335,15 @@ static void list_to_llist_nodes(struct rds_ib_mr_pool *pool, * If the number of MRs allocated exceeds the limit, we also try * to free as many MRs as needed to get back to this limit. */ -static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, - int free_all, struct rds_ib_mr **ibmr_ret) +int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, + int free_all, struct rds_ib_mr **ibmr_ret) { - struct rds_ib_mr *ibmr, *next; + struct rds_ib_mr *ibmr; struct llist_node *clean_nodes; struct llist_node *clean_tail; LIST_HEAD(unmap_list); - LIST_HEAD(fmr_list); unsigned long unpinned = 0; unsigned int nfreed = 0, dirty_to_clean = 0, free_goal; - int ret = 0; if (pool->pool_type == RDS_IB_MR_8K_POOL) rds_ib_stats_inc(s_ib_rdma_mr_8k_pool_flush); @@ -643,7 +353,7 @@ static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, if (ibmr_ret) { DEFINE_WAIT(wait); while (!mutex_trylock(&pool->flush_lock)) { - ibmr = rds_ib_reuse_fmr(pool); + ibmr = rds_ib_reuse_mr(pool); if (ibmr) { *ibmr_ret = ibmr; finish_wait(&pool->flush_wait, &wait); @@ -655,7 +365,7 @@ static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, if (llist_empty(&pool->clean_list)) schedule(); - ibmr = rds_ib_reuse_fmr(pool); + ibmr = rds_ib_reuse_mr(pool); if (ibmr) { *ibmr_ret = ibmr; finish_wait(&pool->flush_wait, &wait); @@ -667,7 +377,7 @@ static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, mutex_lock(&pool->flush_lock); if (ibmr_ret) { - ibmr = rds_ib_reuse_fmr(pool); + ibmr = rds_ib_reuse_mr(pool); if (ibmr) { *ibmr_ret = ibmr; goto out; @@ -687,30 +397,10 @@ static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, if (list_empty(&unmap_list)) goto out; - /* String all ib_mr's onto one list and hand them to ib_unmap_fmr */ - list_for_each_entry(ibmr, &unmap_list, unmap_list) - list_add(&ibmr->fmr->list, &fmr_list); - - ret = ib_unmap_fmr(&fmr_list); - if (ret) - printk(KERN_WARNING "RDS/IB: ib_unmap_fmr failed (err=%d)\n", ret); - - /* Now we can destroy the DMA mapping and unpin any pages */ - list_for_each_entry_safe(ibmr, next, &unmap_list, unmap_list) { - unpinned += ibmr->sg_len; - __rds_ib_teardown_mr(ibmr); - if (nfreed < free_goal || - ibmr->remap_count >= pool->fmr_attr.max_maps) { - if (ibmr->pool->pool_type == RDS_IB_MR_8K_POOL) - rds_ib_stats_inc(s_ib_rdma_mr_8k_free); - else - rds_ib_stats_inc(s_ib_rdma_mr_1m_free); - list_del(&ibmr->unmap_list); - ib_dealloc_fmr(ibmr->fmr); - kfree(ibmr); - nfreed++; - } - } + if (pool->use_fastreg) + rds_ib_unreg_frmr(&unmap_list, &nfreed, &unpinned, free_goal); + else + rds_ib_unreg_fmr(&unmap_list, &nfreed, &unpinned, free_goal); if (!list_empty(&unmap_list)) { /* we have to make sure that none of the things we're about @@ -743,7 +433,47 @@ out: if (waitqueue_active(&pool->flush_wait)) wake_up(&pool->flush_wait); out_nolock: - return ret; + return 0; +} + +struct rds_ib_mr *rds_ib_try_reuse_ibmr(struct rds_ib_mr_pool *pool) +{ + struct rds_ib_mr *ibmr = NULL; + int iter = 0; + + if (atomic_read(&pool->dirty_count) >= pool->max_items_soft / 10) + queue_delayed_work(rds_ib_mr_wq, &pool->flush_worker, 10); + + while (1) { + ibmr = rds_ib_reuse_mr(pool); + if (ibmr) + return ibmr; + + if (atomic_inc_return(&pool->item_count) <= pool->max_items) + break; + + atomic_dec(&pool->item_count); + + if (++iter > 2) { + if (pool->pool_type == RDS_IB_MR_8K_POOL) + rds_ib_stats_inc(s_ib_rdma_mr_8k_pool_depleted); + else + rds_ib_stats_inc(s_ib_rdma_mr_1m_pool_depleted); + return ERR_PTR(-EAGAIN); + } + + /* We do have some empty MRs. Flush them out. */ + if (pool->pool_type == RDS_IB_MR_8K_POOL) + rds_ib_stats_inc(s_ib_rdma_mr_8k_pool_wait); + else + rds_ib_stats_inc(s_ib_rdma_mr_1m_pool_wait); + + rds_ib_flush_mr_pool(pool, 0, &ibmr); + if (ibmr) + return ibmr; + } + + return ibmr; } static void rds_ib_mr_pool_flush_worker(struct work_struct *work) @@ -762,10 +492,10 @@ void rds_ib_free_mr(void *trans_private, int invalidate) rdsdebug("RDS/IB: free_mr nents %u\n", ibmr->sg_len); /* Return it to the pool's free list */ - if (ibmr->remap_count >= pool->fmr_attr.max_maps) - llist_add(&ibmr->llnode, &pool->drop_list); + if (rds_ibdev->use_fastreg) + rds_ib_free_frmr_list(ibmr); else - llist_add(&ibmr->llnode, &pool->free_list); + rds_ib_free_fmr_list(ibmr); atomic_add(ibmr->sg_len, &pool->free_pinned); atomic_inc(&pool->dirty_count); @@ -773,7 +503,7 @@ void rds_ib_free_mr(void *trans_private, int invalidate) /* If we've pinned too many pages, request a flush */ if (atomic_read(&pool->free_pinned) >= pool->max_free_pinned || atomic_read(&pool->dirty_count) >= pool->max_items / 5) - queue_delayed_work(rds_ib_fmr_wq, &pool->flush_worker, 10); + queue_delayed_work(rds_ib_mr_wq, &pool->flush_worker, 10); if (invalidate) { if (likely(!in_interrupt())) { @@ -782,7 +512,7 @@ void rds_ib_free_mr(void *trans_private, int invalidate) /* We get here if the user created a MR marked * as use_once and invalidate at the same time. */ - queue_delayed_work(rds_ib_fmr_wq, + queue_delayed_work(rds_ib_mr_wq, &pool->flush_worker, 10); } } @@ -810,6 +540,7 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents, { struct rds_ib_device *rds_ibdev; struct rds_ib_mr *ibmr = NULL; + struct rds_ib_connection *ic = rs->rs_conn->c_transport_data; int ret; rds_ibdev = rds_ib_get_device(rs->rs_bound_addr); @@ -823,29 +554,81 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents, goto out; } - ibmr = rds_ib_alloc_fmr(rds_ibdev, nents); - if (IS_ERR(ibmr)) { - rds_ib_dev_put(rds_ibdev); - return ibmr; - } - - ret = rds_ib_map_fmr(rds_ibdev, ibmr, sg, nents); - if (ret == 0) - *key_ret = ibmr->fmr->rkey; + if (rds_ibdev->use_fastreg) + ibmr = rds_ib_reg_frmr(rds_ibdev, ic, sg, nents, key_ret); else - printk(KERN_WARNING "RDS/IB: map_fmr failed (errno=%d)\n", ret); - - ibmr->device = rds_ibdev; - rds_ibdev = NULL; + ibmr = rds_ib_reg_fmr(rds_ibdev, sg, nents, key_ret); + if (ibmr) + rds_ibdev = NULL; out: - if (ret) { - if (ibmr) - rds_ib_free_mr(ibmr, 0); - ibmr = ERR_PTR(ret); - } + if (!ibmr) + pr_warn("RDS/IB: rds_ib_get_mr failed (errno=%d)\n", ret); + if (rds_ibdev) rds_ib_dev_put(rds_ibdev); + return ibmr; } +void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *pool) +{ + cancel_delayed_work_sync(&pool->flush_worker); + rds_ib_flush_mr_pool(pool, 1, NULL); + WARN_ON(atomic_read(&pool->item_count)); + WARN_ON(atomic_read(&pool->free_pinned)); + kfree(pool); +} + +struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev, + int pool_type) +{ + struct rds_ib_mr_pool *pool; + + pool = kzalloc(sizeof(*pool), GFP_KERNEL); + if (!pool) + return ERR_PTR(-ENOMEM); + + pool->pool_type = pool_type; + init_llist_head(&pool->free_list); + init_llist_head(&pool->drop_list); + init_llist_head(&pool->clean_list); + mutex_init(&pool->flush_lock); + init_waitqueue_head(&pool->flush_wait); + INIT_DELAYED_WORK(&pool->flush_worker, rds_ib_mr_pool_flush_worker); + + if (pool_type == RDS_IB_MR_1M_POOL) { + /* +1 allows for unaligned MRs */ + pool->fmr_attr.max_pages = RDS_MR_1M_MSG_SIZE + 1; + pool->max_items = RDS_MR_1M_POOL_SIZE; + } else { + /* pool_type == RDS_IB_MR_8K_POOL */ + pool->fmr_attr.max_pages = RDS_MR_8K_MSG_SIZE + 1; + pool->max_items = RDS_MR_8K_POOL_SIZE; + } + + pool->max_free_pinned = pool->max_items * pool->fmr_attr.max_pages / 4; + pool->fmr_attr.max_maps = rds_ibdev->fmr_max_remaps; + pool->fmr_attr.page_shift = PAGE_SHIFT; + pool->max_items_soft = rds_ibdev->max_mrs * 3 / 4; + pool->use_fastreg = rds_ibdev->use_fastreg; + + return pool; +} + +int rds_ib_mr_init(void) +{ + rds_ib_mr_wq = create_workqueue("rds_mr_flushd"); + if (!rds_ib_mr_wq) + return -ENOMEM; + return 0; +} + +/* By the time this is called all the IB devices should have been torn down and + * had their pools freed. As each pool is freed its work struct is waited on, + * so the pool flushing work queue should be idle by the time we get here. + */ +void rds_ib_mr_exit(void) +{ + destroy_workqueue(rds_ib_mr_wq); +} diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c index 977fb8606..abc8cc805 100644 --- a/net/rds/ib_recv.c +++ b/net/rds/ib_recv.c @@ -796,7 +796,7 @@ static void rds_ib_cong_recv(struct rds_connection *conn, addr = kmap_atomic(sg_page(&frag->f_sg)); - src = addr + frag_off; + src = addr + frag->f_sg.offset + frag_off; dst = (void *)map->m_page_addrs[map_page] + map_off; for (k = 0; k < to_copy; k += 8) { /* Record ports that became uncongested, ie diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c index eac30bf48..f27d2c82b 100644 --- a/net/rds/ib_send.c +++ b/net/rds/ib_send.c @@ -195,7 +195,7 @@ void rds_ib_send_init_ring(struct rds_ib_connection *ic) send->s_op = NULL; - send->s_wr.wr_id = i | RDS_IB_SEND_OP; + send->s_wr.wr_id = i; send->s_wr.sg_list = send->s_sge; send->s_wr.ex.imm_data = 0; @@ -263,9 +263,7 @@ void rds_ib_send_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc) oldest = rds_ib_ring_oldest(&ic->i_send_ring); - completed = rds_ib_ring_completed(&ic->i_send_ring, - (wc->wr_id & ~RDS_IB_SEND_OP), - oldest); + completed = rds_ib_ring_completed(&ic->i_send_ring, wc->wr_id, oldest); for (i = 0; i < completed; i++) { send = &ic->i_sends[oldest]; diff --git a/net/rds/ib_stats.c b/net/rds/ib_stats.c index d77e04473..7e78dca1f 100644 --- a/net/rds/ib_stats.c +++ b/net/rds/ib_stats.c @@ -73,6 +73,8 @@ static const char *const rds_ib_stat_names[] = { "ib_rdma_mr_1m_pool_flush", "ib_rdma_mr_1m_pool_wait", "ib_rdma_mr_1m_pool_depleted", + "ib_rdma_mr_8k_reused", + "ib_rdma_mr_1m_reused", "ib_atomic_cswp", "ib_atomic_fadd", }; diff --git a/net/rds/iw.c b/net/rds/iw.c deleted file mode 100644 index f4a9fff82..000000000 --- a/net/rds/iw.c +++ /dev/null @@ -1,312 +0,0 @@ -/* - * Copyright (c) 2006 Oracle. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - */ -#include <linux/kernel.h> -#include <linux/in.h> -#include <linux/if.h> -#include <linux/netdevice.h> -#include <linux/inetdevice.h> -#include <linux/if_arp.h> -#include <linux/delay.h> -#include <linux/slab.h> -#include <linux/module.h> - -#include "rds.h" -#include "iw.h" - -unsigned int fastreg_pool_size = RDS_FASTREG_POOL_SIZE; -unsigned int fastreg_message_size = RDS_FASTREG_SIZE + 1; /* +1 allows for unaligned MRs */ - -module_param(fastreg_pool_size, int, 0444); -MODULE_PARM_DESC(fastreg_pool_size, " Max number of fastreg MRs per device"); -module_param(fastreg_message_size, int, 0444); -MODULE_PARM_DESC(fastreg_message_size, " Max size of a RDMA transfer (fastreg MRs)"); - -struct list_head rds_iw_devices; - -/* NOTE: if also grabbing iwdev lock, grab this first */ -DEFINE_SPINLOCK(iw_nodev_conns_lock); -LIST_HEAD(iw_nodev_conns); - -static void rds_iw_add_one(struct ib_device *device) -{ - struct rds_iw_device *rds_iwdev; - - /* Only handle iwarp devices */ - if (device->node_type != RDMA_NODE_RNIC) - return; - - rds_iwdev = kmalloc(sizeof *rds_iwdev, GFP_KERNEL); - if (!rds_iwdev) - return; - - spin_lock_init(&rds_iwdev->spinlock); - - rds_iwdev->dma_local_lkey = !!(device->attrs.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY); - rds_iwdev->max_wrs = device->attrs.max_qp_wr; - rds_iwdev->max_sge = min(device->attrs.max_sge, RDS_IW_MAX_SGE); - - rds_iwdev->dev = device; - rds_iwdev->pd = ib_alloc_pd(device); - if (IS_ERR(rds_iwdev->pd)) - goto free_dev; - - if (!rds_iwdev->dma_local_lkey) { - rds_iwdev->mr = ib_get_dma_mr(rds_iwdev->pd, - IB_ACCESS_REMOTE_READ | - IB_ACCESS_REMOTE_WRITE | - IB_ACCESS_LOCAL_WRITE); - if (IS_ERR(rds_iwdev->mr)) - goto err_pd; - } else - rds_iwdev->mr = NULL; - - rds_iwdev->mr_pool = rds_iw_create_mr_pool(rds_iwdev); - if (IS_ERR(rds_iwdev->mr_pool)) { - rds_iwdev->mr_pool = NULL; - goto err_mr; - } - - INIT_LIST_HEAD(&rds_iwdev->cm_id_list); - INIT_LIST_HEAD(&rds_iwdev->conn_list); - list_add_tail(&rds_iwdev->list, &rds_iw_devices); - - ib_set_client_data(device, &rds_iw_client, rds_iwdev); - return; - -err_mr: - if (rds_iwdev->mr) - ib_dereg_mr(rds_iwdev->mr); -err_pd: - ib_dealloc_pd(rds_iwdev->pd); -free_dev: - kfree(rds_iwdev); -} - -static void rds_iw_remove_one(struct ib_device *device, void *client_data) -{ - struct rds_iw_device *rds_iwdev = client_data; - struct rds_iw_cm_id *i_cm_id, *next; - - if (!rds_iwdev) - return; - - spin_lock_irq(&rds_iwdev->spinlock); - list_for_each_entry_safe(i_cm_id, next, &rds_iwdev->cm_id_list, list) { - list_del(&i_cm_id->list); - kfree(i_cm_id); - } - spin_unlock_irq(&rds_iwdev->spinlock); - - rds_iw_destroy_conns(rds_iwdev); - - if (rds_iwdev->mr_pool) - rds_iw_destroy_mr_pool(rds_iwdev->mr_pool); - - if (rds_iwdev->mr) - ib_dereg_mr(rds_iwdev->mr); - - ib_dealloc_pd(rds_iwdev->pd); - - list_del(&rds_iwdev->list); - kfree(rds_iwdev); -} - -struct ib_client rds_iw_client = { - .name = "rds_iw", - .add = rds_iw_add_one, - .remove = rds_iw_remove_one -}; - -static int rds_iw_conn_info_visitor(struct rds_connection *conn, - void *buffer) -{ - struct rds_info_rdma_connection *iinfo = buffer; - struct rds_iw_connection *ic; - - /* We will only ever look at IB transports */ - if (conn->c_trans != &rds_iw_transport) - return 0; - - iinfo->src_addr = conn->c_laddr; - iinfo->dst_addr = conn->c_faddr; - - memset(&iinfo->src_gid, 0, sizeof(iinfo->src_gid)); - memset(&iinfo->dst_gid, 0, sizeof(iinfo->dst_gid)); - if (rds_conn_state(conn) == RDS_CONN_UP) { - struct rds_iw_device *rds_iwdev; - struct rdma_dev_addr *dev_addr; - - ic = conn->c_transport_data; - dev_addr = &ic->i_cm_id->route.addr.dev_addr; - - rdma_addr_get_sgid(dev_addr, (union ib_gid *) &iinfo->src_gid); - rdma_addr_get_dgid(dev_addr, (union ib_gid *) &iinfo->dst_gid); - - rds_iwdev = ib_get_client_data(ic->i_cm_id->device, &rds_iw_client); - iinfo->max_send_wr = ic->i_send_ring.w_nr; - iinfo->max_recv_wr = ic->i_recv_ring.w_nr; - iinfo->max_send_sge = rds_iwdev->max_sge; - rds_iw_get_mr_info(rds_iwdev, iinfo); - } - return 1; -} - -static void rds_iw_ic_info(struct socket *sock, unsigned int len, - struct rds_info_iterator *iter, - struct rds_info_lengths *lens) -{ - rds_for_each_conn_info(sock, len, iter, lens, - rds_iw_conn_info_visitor, - sizeof(struct rds_info_rdma_connection)); -} - - -/* - * Early RDS/IB was built to only bind to an address if there is an IPoIB - * device with that address set. - * - * If it were me, I'd advocate for something more flexible. Sending and - * receiving should be device-agnostic. Transports would try and maintain - * connections between peers who have messages queued. Userspace would be - * allowed to influence which paths have priority. We could call userspace - * asserting this policy "routing". - */ -static int rds_iw_laddr_check(struct net *net, __be32 addr) -{ - int ret; - struct rdma_cm_id *cm_id; - struct sockaddr_in sin; - - /* Create a CMA ID and try to bind it. This catches both - * IB and iWARP capable NICs. - */ - cm_id = rdma_create_id(&init_net, NULL, NULL, RDMA_PS_TCP, IB_QPT_RC); - if (IS_ERR(cm_id)) - return PTR_ERR(cm_id); - - memset(&sin, 0, sizeof(sin)); - sin.sin_family = AF_INET; - sin.sin_addr.s_addr = addr; - - /* rdma_bind_addr will only succeed for IB & iWARP devices */ - ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin); - /* due to this, we will claim to support IB devices unless we - check node_type. */ - if (ret || !cm_id->device || - cm_id->device->node_type != RDMA_NODE_RNIC) - ret = -EADDRNOTAVAIL; - - rdsdebug("addr %pI4 ret %d node type %d\n", - &addr, ret, - cm_id->device ? cm_id->device->node_type : -1); - - rdma_destroy_id(cm_id); - - return ret; -} - -void rds_iw_exit(void) -{ - rds_info_deregister_func(RDS_INFO_IWARP_CONNECTIONS, rds_iw_ic_info); - rds_iw_destroy_nodev_conns(); - ib_unregister_client(&rds_iw_client); - rds_iw_sysctl_exit(); - rds_iw_recv_exit(); - rds_trans_unregister(&rds_iw_transport); -} - -struct rds_transport rds_iw_transport = { - .laddr_check = rds_iw_laddr_check, - .xmit_complete = rds_iw_xmit_complete, - .xmit = rds_iw_xmit, - .xmit_rdma = rds_iw_xmit_rdma, - .recv = rds_iw_recv, - .conn_alloc = rds_iw_conn_alloc, - .conn_free = rds_iw_conn_free, - .conn_connect = rds_iw_conn_connect, - .conn_shutdown = rds_iw_conn_shutdown, - .inc_copy_to_user = rds_iw_inc_copy_to_user, - .inc_free = rds_iw_inc_free, - .cm_initiate_connect = rds_iw_cm_initiate_connect, - .cm_handle_connect = rds_iw_cm_handle_connect, - .cm_connect_complete = rds_iw_cm_connect_complete, - .stats_info_copy = rds_iw_stats_info_copy, - .exit = rds_iw_exit, - .get_mr = rds_iw_get_mr, - .sync_mr = rds_iw_sync_mr, - .free_mr = rds_iw_free_mr, - .flush_mrs = rds_iw_flush_mrs, - .t_owner = THIS_MODULE, - .t_name = "iwarp", - .t_type = RDS_TRANS_IWARP, - .t_prefer_loopback = 1, -}; - -int rds_iw_init(void) -{ - int ret; - - INIT_LIST_HEAD(&rds_iw_devices); - - ret = ib_register_client(&rds_iw_client); - if (ret) - goto out; - - ret = rds_iw_sysctl_init(); - if (ret) - goto out_ibreg; - - ret = rds_iw_recv_init(); - if (ret) - goto out_sysctl; - - ret = rds_trans_register(&rds_iw_transport); - if (ret) - goto out_recv; - - rds_info_register_func(RDS_INFO_IWARP_CONNECTIONS, rds_iw_ic_info); - - goto out; - -out_recv: - rds_iw_recv_exit(); -out_sysctl: - rds_iw_sysctl_exit(); -out_ibreg: - ib_unregister_client(&rds_iw_client); -out: - return ret; -} - -MODULE_LICENSE("GPL"); - diff --git a/net/rds/iw.h b/net/rds/iw.h deleted file mode 100644 index 5af01d175..000000000 --- a/net/rds/iw.h +++ /dev/null @@ -1,398 +0,0 @@ -#ifndef _RDS_IW_H -#define _RDS_IW_H - -#include <linux/interrupt.h> -#include <rdma/ib_verbs.h> -#include <rdma/rdma_cm.h> -#include "rds.h" -#include "rdma_transport.h" - -#define RDS_FASTREG_SIZE 20 -#define RDS_FASTREG_POOL_SIZE 2048 - -#define RDS_IW_MAX_SGE 8 -#define RDS_IW_RECV_SGE 2 - -#define RDS_IW_DEFAULT_RECV_WR 1024 -#define RDS_IW_DEFAULT_SEND_WR 256 - -#define RDS_IW_SUPPORTED_PROTOCOLS 0x00000003 /* minor versions supported */ - -extern struct list_head rds_iw_devices; - -/* - * IB posts RDS_FRAG_SIZE fragments of pages to the receive queues to - * try and minimize the amount of memory tied up both the device and - * socket receive queues. - */ -/* page offset of the final full frag that fits in the page */ -#define RDS_PAGE_LAST_OFF (((PAGE_SIZE / RDS_FRAG_SIZE) - 1) * RDS_FRAG_SIZE) -struct rds_page_frag { - struct list_head f_item; - struct page *f_page; - unsigned long f_offset; - dma_addr_t f_mapped; -}; - -struct rds_iw_incoming { - struct list_head ii_frags; - struct rds_incoming ii_inc; -}; - -struct rds_iw_connect_private { - /* Add new fields at the end, and don't permute existing fields. */ - __be32 dp_saddr; - __be32 dp_daddr; - u8 dp_protocol_major; - u8 dp_protocol_minor; - __be16 dp_protocol_minor_mask; /* bitmask */ - __be32 dp_reserved1; - __be64 dp_ack_seq; - __be32 dp_credit; /* non-zero enables flow ctl */ -}; - -struct rds_iw_scatterlist { - struct scatterlist *list; - unsigned int len; - int dma_len; - unsigned int dma_npages; - unsigned int bytes; -}; - -struct rds_iw_mapping { - spinlock_t m_lock; /* protect the mapping struct */ - struct list_head m_list; - struct rds_iw_mr *m_mr; - uint32_t m_rkey; - struct rds_iw_scatterlist m_sg; -}; - -struct rds_iw_send_work { - struct rds_message *s_rm; - - /* We should really put these into a union: */ - struct rm_rdma_op *s_op; - struct rds_iw_mapping *s_mapping; - struct ib_mr *s_mr; - unsigned char s_remap_count; - - union { - struct ib_send_wr s_send_wr; - struct ib_rdma_wr s_rdma_wr; - struct ib_reg_wr s_reg_wr; - }; - struct ib_sge s_sge[RDS_IW_MAX_SGE]; - unsigned long s_queued; -}; - -struct rds_iw_recv_work { - struct rds_iw_incoming *r_iwinc; - struct rds_page_frag *r_frag; - struct ib_recv_wr r_wr; - struct ib_sge r_sge[2]; -}; - -struct rds_iw_work_ring { - u32 w_nr; - u32 w_alloc_ptr; - u32 w_alloc_ctr; - u32 w_free_ptr; - atomic_t w_free_ctr; -}; - -struct rds_iw_device; - -struct rds_iw_connection { - - struct list_head iw_node; - struct rds_iw_device *rds_iwdev; - struct rds_connection *conn; - - /* alphabet soup, IBTA style */ - struct rdma_cm_id *i_cm_id; - struct ib_pd *i_pd; - struct ib_mr *i_mr; - struct ib_cq *i_send_cq; - struct ib_cq *i_recv_cq; - - /* tx */ - struct rds_iw_work_ring i_send_ring; - struct rds_message *i_rm; - struct rds_header *i_send_hdrs; - u64 i_send_hdrs_dma; - struct rds_iw_send_work *i_sends; - - /* rx */ - struct tasklet_struct i_recv_tasklet; - struct mutex i_recv_mutex; - struct rds_iw_work_ring i_recv_ring; - struct rds_iw_incoming *i_iwinc; - u32 i_recv_data_rem; - struct rds_header *i_recv_hdrs; - u64 i_recv_hdrs_dma; - struct rds_iw_recv_work *i_recvs; - struct rds_page_frag i_frag; - u64 i_ack_recv; /* last ACK received */ - - /* sending acks */ - unsigned long i_ack_flags; -#ifdef KERNEL_HAS_ATOMIC64 - atomic64_t i_ack_next; /* next ACK to send */ -#else - spinlock_t i_ack_lock; /* protect i_ack_next */ - u64 i_ack_next; /* next ACK to send */ -#endif - struct rds_header *i_ack; - struct ib_send_wr i_ack_wr; - struct ib_sge i_ack_sge; - u64 i_ack_dma; - unsigned long i_ack_queued; - - /* Flow control related information - * - * Our algorithm uses a pair variables that we need to access - * atomically - one for the send credits, and one posted - * recv credits we need to transfer to remote. - * Rather than protect them using a slow spinlock, we put both into - * a single atomic_t and update it using cmpxchg - */ - atomic_t i_credits; - - /* Protocol version specific information */ - unsigned int i_flowctl:1; /* enable/disable flow ctl */ - unsigned int i_dma_local_lkey:1; - unsigned int i_fastreg_posted:1; /* fastreg posted on this connection */ - /* Batched completions */ - unsigned int i_unsignaled_wrs; - long i_unsignaled_bytes; -}; - -/* This assumes that atomic_t is at least 32 bits */ -#define IB_GET_SEND_CREDITS(v) ((v) & 0xffff) -#define IB_GET_POST_CREDITS(v) ((v) >> 16) -#define IB_SET_SEND_CREDITS(v) ((v) & 0xffff) -#define IB_SET_POST_CREDITS(v) ((v) << 16) - -struct rds_iw_cm_id { - struct list_head list; - struct rdma_cm_id *cm_id; -}; - -struct rds_iw_device { - struct list_head list; - struct list_head cm_id_list; - struct list_head conn_list; - struct ib_device *dev; - struct ib_pd *pd; - struct ib_mr *mr; - struct rds_iw_mr_pool *mr_pool; - int max_sge; - unsigned int max_wrs; - unsigned int dma_local_lkey:1; - spinlock_t spinlock; /* protect the above */ -}; - -/* bits for i_ack_flags */ -#define IB_ACK_IN_FLIGHT 0 -#define IB_ACK_REQUESTED 1 - -/* Magic WR_ID for ACKs */ -#define RDS_IW_ACK_WR_ID ((u64)0xffffffffffffffffULL) -#define RDS_IW_REG_WR_ID ((u64)0xefefefefefefefefULL) -#define RDS_IW_LOCAL_INV_WR_ID ((u64)0xdfdfdfdfdfdfdfdfULL) - -struct rds_iw_statistics { - uint64_t s_iw_connect_raced; - uint64_t s_iw_listen_closed_stale; - uint64_t s_iw_tx_cq_call; - uint64_t s_iw_tx_cq_event; - uint64_t s_iw_tx_ring_full; - uint64_t s_iw_tx_throttle; - uint64_t s_iw_tx_sg_mapping_failure; - uint64_t s_iw_tx_stalled; - uint64_t s_iw_tx_credit_updates; - uint64_t s_iw_rx_cq_call; - uint64_t s_iw_rx_cq_event; - uint64_t s_iw_rx_ring_empty; - uint64_t s_iw_rx_refill_from_cq; - uint64_t s_iw_rx_refill_from_thread; - uint64_t s_iw_rx_alloc_limit; - uint64_t s_iw_rx_credit_updates; - uint64_t s_iw_ack_sent; - uint64_t s_iw_ack_send_failure; - uint64_t s_iw_ack_send_delayed; - uint64_t s_iw_ack_send_piggybacked; - uint64_t s_iw_ack_received; - uint64_t s_iw_rdma_mr_alloc; - uint64_t s_iw_rdma_mr_free; - uint64_t s_iw_rdma_mr_used; - uint64_t s_iw_rdma_mr_pool_flush; - uint64_t s_iw_rdma_mr_pool_wait; - uint64_t s_iw_rdma_mr_pool_depleted; -}; - -extern struct workqueue_struct *rds_iw_wq; - -/* - * Fake ib_dma_sync_sg_for_{cpu,device} as long as ib_verbs.h - * doesn't define it. - */ -static inline void rds_iw_dma_sync_sg_for_cpu(struct ib_device *dev, - struct scatterlist *sg, unsigned int sg_dma_len, int direction) -{ - unsigned int i; - - for (i = 0; i < sg_dma_len; ++i) { - ib_dma_sync_single_for_cpu(dev, - ib_sg_dma_address(dev, &sg[i]), - ib_sg_dma_len(dev, &sg[i]), - direction); - } -} -#define ib_dma_sync_sg_for_cpu rds_iw_dma_sync_sg_for_cpu - -static inline void rds_iw_dma_sync_sg_for_device(struct ib_device *dev, - struct scatterlist *sg, unsigned int sg_dma_len, int direction) -{ - unsigned int i; - - for (i = 0; i < sg_dma_len; ++i) { - ib_dma_sync_single_for_device(dev, - ib_sg_dma_address(dev, &sg[i]), - ib_sg_dma_len(dev, &sg[i]), - direction); - } -} -#define ib_dma_sync_sg_for_device rds_iw_dma_sync_sg_for_device - -static inline u32 rds_iw_local_dma_lkey(struct rds_iw_connection *ic) -{ - return ic->i_dma_local_lkey ? ic->i_cm_id->device->local_dma_lkey : ic->i_mr->lkey; -} - -/* ib.c */ -extern struct rds_transport rds_iw_transport; -extern struct ib_client rds_iw_client; - -extern unsigned int fastreg_pool_size; -extern unsigned int fastreg_message_size; - -extern spinlock_t iw_nodev_conns_lock; -extern struct list_head iw_nodev_conns; - -/* ib_cm.c */ -int rds_iw_conn_alloc(struct rds_connection *conn, gfp_t gfp); -void rds_iw_conn_free(void *arg); -int rds_iw_conn_connect(struct rds_connection *conn); -void rds_iw_conn_shutdown(struct rds_connection *conn); -void rds_iw_state_change(struct sock *sk); -int rds_iw_listen_init(void); -void rds_iw_listen_stop(void); -void __rds_iw_conn_error(struct rds_connection *conn, const char *, ...); -int rds_iw_cm_handle_connect(struct rdma_cm_id *cm_id, - struct rdma_cm_event *event); -int rds_iw_cm_initiate_connect(struct rdma_cm_id *cm_id); -void rds_iw_cm_connect_complete(struct rds_connection *conn, - struct rdma_cm_event *event); - - -#define rds_iw_conn_error(conn, fmt...) \ - __rds_iw_conn_error(conn, KERN_WARNING "RDS/IW: " fmt) - -/* ib_rdma.c */ -int rds_iw_update_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_id); -void rds_iw_add_conn(struct rds_iw_device *rds_iwdev, struct rds_connection *conn); -void rds_iw_remove_conn(struct rds_iw_device *rds_iwdev, struct rds_connection *conn); -void __rds_iw_destroy_conns(struct list_head *list, spinlock_t *list_lock); -static inline void rds_iw_destroy_nodev_conns(void) -{ - __rds_iw_destroy_conns(&iw_nodev_conns, &iw_nodev_conns_lock); -} -static inline void rds_iw_destroy_conns(struct rds_iw_device *rds_iwdev) -{ - __rds_iw_destroy_conns(&rds_iwdev->conn_list, &rds_iwdev->spinlock); -} -struct rds_iw_mr_pool *rds_iw_create_mr_pool(struct rds_iw_device *); -void rds_iw_get_mr_info(struct rds_iw_device *rds_iwdev, struct rds_info_rdma_connection *iinfo); -void rds_iw_destroy_mr_pool(struct rds_iw_mr_pool *); -void *rds_iw_get_mr(struct scatterlist *sg, unsigned long nents, - struct rds_sock *rs, u32 *key_ret); -void rds_iw_sync_mr(void *trans_private, int dir); -void rds_iw_free_mr(void *trans_private, int invalidate); -void rds_iw_flush_mrs(void); - -/* ib_recv.c */ -int rds_iw_recv_init(void); -void rds_iw_recv_exit(void); -int rds_iw_recv(struct rds_connection *conn); -int rds_iw_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp, - gfp_t page_gfp, int prefill); -void rds_iw_inc_free(struct rds_incoming *inc); -int rds_iw_inc_copy_to_user(struct rds_incoming *inc, struct iov_iter *to); -void rds_iw_recv_cq_comp_handler(struct ib_cq *cq, void *context); -void rds_iw_recv_tasklet_fn(unsigned long data); -void rds_iw_recv_init_ring(struct rds_iw_connection *ic); -void rds_iw_recv_clear_ring(struct rds_iw_connection *ic); -void rds_iw_recv_init_ack(struct rds_iw_connection *ic); -void rds_iw_attempt_ack(struct rds_iw_connection *ic); -void rds_iw_ack_send_complete(struct rds_iw_connection *ic); -u64 rds_iw_piggyb_ack(struct rds_iw_connection *ic); - -/* ib_ring.c */ -void rds_iw_ring_init(struct rds_iw_work_ring *ring, u32 nr); -void rds_iw_ring_resize(struct rds_iw_work_ring *ring, u32 nr); -u32 rds_iw_ring_alloc(struct rds_iw_work_ring *ring, u32 val, u32 *pos); -void rds_iw_ring_free(struct rds_iw_work_ring *ring, u32 val); -void rds_iw_ring_unalloc(struct rds_iw_work_ring *ring, u32 val); -int rds_iw_ring_empty(struct rds_iw_work_ring *ring); -int rds_iw_ring_low(struct rds_iw_work_ring *ring); -u32 rds_iw_ring_oldest(struct rds_iw_work_ring *ring); -u32 rds_iw_ring_completed(struct rds_iw_work_ring *ring, u32 wr_id, u32 oldest); -extern wait_queue_head_t rds_iw_ring_empty_wait; - -/* ib_send.c */ -void rds_iw_xmit_complete(struct rds_connection *conn); -int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm, - unsigned int hdr_off, unsigned int sg, unsigned int off); -void rds_iw_send_cq_comp_handler(struct ib_cq *cq, void *context); -void rds_iw_send_init_ring(struct rds_iw_connection *ic); -void rds_iw_send_clear_ring(struct rds_iw_connection *ic); -int rds_iw_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op); -void rds_iw_send_add_credits(struct rds_connection *conn, unsigned int credits); -void rds_iw_advertise_credits(struct rds_connection *conn, unsigned int posted); -int rds_iw_send_grab_credits(struct rds_iw_connection *ic, u32 wanted, - u32 *adv_credits, int need_posted, int max_posted); - -/* ib_stats.c */ -DECLARE_PER_CPU(struct rds_iw_statistics, rds_iw_stats); -#define rds_iw_stats_inc(member) rds_stats_inc_which(rds_iw_stats, member) -unsigned int rds_iw_stats_info_copy(struct rds_info_iterator *iter, - unsigned int avail); - -/* ib_sysctl.c */ -int rds_iw_sysctl_init(void); -void rds_iw_sysctl_exit(void); -extern unsigned long rds_iw_sysctl_max_send_wr; -extern unsigned long rds_iw_sysctl_max_recv_wr; -extern unsigned long rds_iw_sysctl_max_unsig_wrs; -extern unsigned long rds_iw_sysctl_max_unsig_bytes; -extern unsigned long rds_iw_sysctl_max_recv_allocation; -extern unsigned int rds_iw_sysctl_flow_control; - -/* - * Helper functions for getting/setting the header and data SGEs in - * RDS packets (not RDMA) - */ -static inline struct ib_sge * -rds_iw_header_sge(struct rds_iw_connection *ic, struct ib_sge *sge) -{ - return &sge[0]; -} - -static inline struct ib_sge * -rds_iw_data_sge(struct rds_iw_connection *ic, struct ib_sge *sge) -{ - return &sge[1]; -} - -#endif diff --git a/net/rds/iw_cm.c b/net/rds/iw_cm.c deleted file mode 100644 index aea4c911b..000000000 --- a/net/rds/iw_cm.c +++ /dev/null @@ -1,769 +0,0 @@ -/* - * Copyright (c) 2006 Oracle. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - */ -#include <linux/kernel.h> -#include <linux/in.h> -#include <linux/slab.h> -#include <linux/vmalloc.h> -#include <linux/ratelimit.h> - -#include "rds.h" -#include "iw.h" - -/* - * Set the selected protocol version - */ -static void rds_iw_set_protocol(struct rds_connection *conn, unsigned int version) -{ - conn->c_version = version; -} - -/* - * Set up flow control - */ -static void rds_iw_set_flow_control(struct rds_connection *conn, u32 credits) -{ - struct rds_iw_connection *ic = conn->c_transport_data; - - if (rds_iw_sysctl_flow_control && credits != 0) { - /* We're doing flow control */ - ic->i_flowctl = 1; - rds_iw_send_add_credits(conn, credits); - } else { - ic->i_flowctl = 0; - } -} - -/* - * Connection established. - * We get here for both outgoing and incoming connection. - */ -void rds_iw_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_event *event) -{ - const struct rds_iw_connect_private *dp = NULL; - struct rds_iw_connection *ic = conn->c_transport_data; - struct rds_iw_device *rds_iwdev; - int err; - - if (event->param.conn.private_data_len) { - dp = event->param.conn.private_data; - - rds_iw_set_protocol(conn, - RDS_PROTOCOL(dp->dp_protocol_major, - dp->dp_protocol_minor)); - rds_iw_set_flow_control(conn, be32_to_cpu(dp->dp_credit)); - } - - /* update ib_device with this local ipaddr & conn */ - rds_iwdev = ib_get_client_data(ic->i_cm_id->device, &rds_iw_client); - err = rds_iw_update_cm_id(rds_iwdev, ic->i_cm_id); - if (err) - printk(KERN_ERR "rds_iw_update_ipaddr failed (%d)\n", err); - rds_iw_add_conn(rds_iwdev, conn); - - /* If the peer gave us the last packet it saw, process this as if - * we had received a regular ACK. */ - if (dp && dp->dp_ack_seq) - rds_send_drop_acked(conn, be64_to_cpu(dp->dp_ack_seq), NULL); - - printk(KERN_NOTICE "RDS/IW: connected to %pI4<->%pI4 version %u.%u%s\n", - &conn->c_laddr, &conn->c_faddr, - RDS_PROTOCOL_MAJOR(conn->c_version), - RDS_PROTOCOL_MINOR(conn->c_version), - ic->i_flowctl ? ", flow control" : ""); - - rds_connect_complete(conn); -} - -static void rds_iw_cm_fill_conn_param(struct rds_connection *conn, - struct rdma_conn_param *conn_param, - struct rds_iw_connect_private *dp, - u32 protocol_version) -{ - struct rds_iw_connection *ic = conn->c_transport_data; - - memset(conn_param, 0, sizeof(struct rdma_conn_param)); - /* XXX tune these? */ - conn_param->responder_resources = 1; - conn_param->initiator_depth = 1; - - if (dp) { - memset(dp, 0, sizeof(*dp)); - dp->dp_saddr = conn->c_laddr; - dp->dp_daddr = conn->c_faddr; - dp->dp_protocol_major = RDS_PROTOCOL_MAJOR(protocol_version); - dp->dp_protocol_minor = RDS_PROTOCOL_MINOR(protocol_version); - dp->dp_protocol_minor_mask = cpu_to_be16(RDS_IW_SUPPORTED_PROTOCOLS); - dp->dp_ack_seq = rds_iw_piggyb_ack(ic); - - /* Advertise flow control */ - if (ic->i_flowctl) { - unsigned int credits; - - credits = IB_GET_POST_CREDITS(atomic_read(&ic->i_credits)); - dp->dp_credit = cpu_to_be32(credits); - atomic_sub(IB_SET_POST_CREDITS(credits), &ic->i_credits); - } - - conn_param->private_data = dp; - conn_param->private_data_len = sizeof(*dp); - } -} - -static void rds_iw_cq_event_handler(struct ib_event *event, void *data) -{ - rdsdebug("event %u data %p\n", event->event, data); -} - -static void rds_iw_qp_event_handler(struct ib_event *event, void *data) -{ - struct rds_connection *conn = data; - struct rds_iw_connection *ic = conn->c_transport_data; - - rdsdebug("conn %p ic %p event %u\n", conn, ic, event->event); - - switch (event->event) { - case IB_EVENT_COMM_EST: - rdma_notify(ic->i_cm_id, IB_EVENT_COMM_EST); - break; - case IB_EVENT_QP_REQ_ERR: - case IB_EVENT_QP_FATAL: - default: - rdsdebug("Fatal QP Event %u " - "- connection %pI4->%pI4, reconnecting\n", - event->event, &conn->c_laddr, - &conn->c_faddr); - rds_conn_drop(conn); - break; - } -} - -/* - * Create a QP - */ -static int rds_iw_init_qp_attrs(struct ib_qp_init_attr *attr, - struct rds_iw_device *rds_iwdev, - struct rds_iw_work_ring *send_ring, - void (*send_cq_handler)(struct ib_cq *, void *), - struct rds_iw_work_ring *recv_ring, - void (*recv_cq_handler)(struct ib_cq *, void *), - void *context) -{ - struct ib_device *dev = rds_iwdev->dev; - struct ib_cq_init_attr cq_attr = {}; - unsigned int send_size, recv_size; - int ret; - - /* The offset of 1 is to accommodate the additional ACK WR. */ - send_size = min_t(unsigned int, rds_iwdev->max_wrs, rds_iw_sysctl_max_send_wr + 1); - recv_size = min_t(unsigned int, rds_iwdev->max_wrs, rds_iw_sysctl_max_recv_wr + 1); - rds_iw_ring_resize(send_ring, send_size - 1); - rds_iw_ring_resize(recv_ring, recv_size - 1); - - memset(attr, 0, sizeof(*attr)); - attr->event_handler = rds_iw_qp_event_handler; - attr->qp_context = context; - attr->cap.max_send_wr = send_size; - attr->cap.max_recv_wr = recv_size; - attr->cap.max_send_sge = rds_iwdev->max_sge; - attr->cap.max_recv_sge = RDS_IW_RECV_SGE; - attr->sq_sig_type = IB_SIGNAL_REQ_WR; - attr->qp_type = IB_QPT_RC; - - cq_attr.cqe = send_size; - attr->send_cq = ib_create_cq(dev, send_cq_handler, - rds_iw_cq_event_handler, - context, &cq_attr); - if (IS_ERR(attr->send_cq)) { - ret = PTR_ERR(attr->send_cq); - attr->send_cq = NULL; - rdsdebug("ib_create_cq send failed: %d\n", ret); - goto out; - } - - cq_attr.cqe = recv_size; - attr->recv_cq = ib_create_cq(dev, recv_cq_handler, - rds_iw_cq_event_handler, - context, &cq_attr); - if (IS_ERR(attr->recv_cq)) { - ret = PTR_ERR(attr->recv_cq); - attr->recv_cq = NULL; - rdsdebug("ib_create_cq send failed: %d\n", ret); - goto out; - } - - ret = ib_req_notify_cq(attr->send_cq, IB_CQ_NEXT_COMP); - if (ret) { - rdsdebug("ib_req_notify_cq send failed: %d\n", ret); - goto out; - } - - ret = ib_req_notify_cq(attr->recv_cq, IB_CQ_SOLICITED); - if (ret) { - rdsdebug("ib_req_notify_cq recv failed: %d\n", ret); - goto out; - } - -out: - if (ret) { - if (attr->send_cq) - ib_destroy_cq(attr->send_cq); - if (attr->recv_cq) - ib_destroy_cq(attr->recv_cq); - } - return ret; -} - -/* - * This needs to be very careful to not leave IS_ERR pointers around for - * cleanup to trip over. - */ -static int rds_iw_setup_qp(struct rds_connection *conn) -{ - struct rds_iw_connection *ic = conn->c_transport_data; - struct ib_device *dev = ic->i_cm_id->device; - struct ib_qp_init_attr attr; - struct rds_iw_device *rds_iwdev; - int ret; - - /* rds_iw_add_one creates a rds_iw_device object per IB device, - * and allocates a protection domain, memory range and MR pool - * for each. If that fails for any reason, it will not register - * the rds_iwdev at all. - */ - rds_iwdev = ib_get_client_data(dev, &rds_iw_client); - if (!rds_iwdev) { - printk_ratelimited(KERN_NOTICE "RDS/IW: No client_data for device %s\n", - dev->name); - return -EOPNOTSUPP; - } - - /* Protection domain and memory range */ - ic->i_pd = rds_iwdev->pd; - ic->i_mr = rds_iwdev->mr; - - ret = rds_iw_init_qp_attrs(&attr, rds_iwdev, - &ic->i_send_ring, rds_iw_send_cq_comp_handler, - &ic->i_recv_ring, rds_iw_recv_cq_comp_handler, - conn); - if (ret < 0) - goto out; - - ic->i_send_cq = attr.send_cq; - ic->i_recv_cq = attr.recv_cq; - - /* - * XXX this can fail if max_*_wr is too large? Are we supposed - * to back off until we get a value that the hardware can support? - */ - ret = rdma_create_qp(ic->i_cm_id, ic->i_pd, &attr); - if (ret) { - rdsdebug("rdma_create_qp failed: %d\n", ret); - goto out; - } - - ic->i_send_hdrs = ib_dma_alloc_coherent(dev, - ic->i_send_ring.w_nr * - sizeof(struct rds_header), - &ic->i_send_hdrs_dma, GFP_KERNEL); - if (!ic->i_send_hdrs) { - ret = -ENOMEM; - rdsdebug("ib_dma_alloc_coherent send failed\n"); - goto out; - } - - ic->i_recv_hdrs = ib_dma_alloc_coherent(dev, - ic->i_recv_ring.w_nr * - sizeof(struct rds_header), - &ic->i_recv_hdrs_dma, GFP_KERNEL); - if (!ic->i_recv_hdrs) { - ret = -ENOMEM; - rdsdebug("ib_dma_alloc_coherent recv failed\n"); - goto out; - } - - ic->i_ack = ib_dma_alloc_coherent(dev, sizeof(struct rds_header), - &ic->i_ack_dma, GFP_KERNEL); - if (!ic->i_ack) { - ret = -ENOMEM; - rdsdebug("ib_dma_alloc_coherent ack failed\n"); - goto out; - } - - ic->i_sends = vmalloc(ic->i_send_ring.w_nr * sizeof(struct rds_iw_send_work)); - if (!ic->i_sends) { - ret = -ENOMEM; - rdsdebug("send allocation failed\n"); - goto out; - } - rds_iw_send_init_ring(ic); - - ic->i_recvs = vmalloc(ic->i_recv_ring.w_nr * sizeof(struct rds_iw_recv_work)); - if (!ic->i_recvs) { - ret = -ENOMEM; - rdsdebug("recv allocation failed\n"); - goto out; - } - - rds_iw_recv_init_ring(ic); - rds_iw_recv_init_ack(ic); - - /* Post receive buffers - as a side effect, this will update - * the posted credit count. */ - rds_iw_recv_refill(conn, GFP_KERNEL, GFP_HIGHUSER, 1); - - rdsdebug("conn %p pd %p mr %p cq %p %p\n", conn, ic->i_pd, ic->i_mr, - ic->i_send_cq, ic->i_recv_cq); - -out: - return ret; -} - -static u32 rds_iw_protocol_compatible(const struct rds_iw_connect_private *dp) -{ - u16 common; - u32 version = 0; - - /* rdma_cm private data is odd - when there is any private data in the - * request, we will be given a pretty large buffer without telling us the - * original size. The only way to tell the difference is by looking at - * the contents, which are initialized to zero. - * If the protocol version fields aren't set, this is a connection attempt - * from an older version. This could could be 3.0 or 2.0 - we can't tell. - * We really should have changed this for OFED 1.3 :-( */ - if (dp->dp_protocol_major == 0) - return RDS_PROTOCOL_3_0; - - common = be16_to_cpu(dp->dp_protocol_minor_mask) & RDS_IW_SUPPORTED_PROTOCOLS; - if (dp->dp_protocol_major == 3 && common) { - version = RDS_PROTOCOL_3_0; - while ((common >>= 1) != 0) - version++; - } - printk_ratelimited(KERN_NOTICE "RDS: Connection from %pI4 using " - "incompatible protocol version %u.%u\n", - &dp->dp_saddr, - dp->dp_protocol_major, - dp->dp_protocol_minor); - return version; -} - -int rds_iw_cm_handle_connect(struct rdma_cm_id *cm_id, - struct rdma_cm_event *event) -{ - const struct rds_iw_connect_private *dp = event->param.conn.private_data; - struct rds_iw_connect_private dp_rep; - struct rds_connection *conn = NULL; - struct rds_iw_connection *ic = NULL; - struct rdma_conn_param conn_param; - struct rds_iw_device *rds_iwdev; - u32 version; - int err, destroy = 1; - - /* Check whether the remote protocol version matches ours. */ - version = rds_iw_protocol_compatible(dp); - if (!version) - goto out; - - rdsdebug("saddr %pI4 daddr %pI4 RDSv%u.%u\n", - &dp->dp_saddr, &dp->dp_daddr, - RDS_PROTOCOL_MAJOR(version), RDS_PROTOCOL_MINOR(version)); - - /* RDS/IW is not currently netns aware, thus init_net */ - conn = rds_conn_create(&init_net, dp->dp_daddr, dp->dp_saddr, - &rds_iw_transport, GFP_KERNEL); - if (IS_ERR(conn)) { - rdsdebug("rds_conn_create failed (%ld)\n", PTR_ERR(conn)); - conn = NULL; - goto out; - } - - /* - * The connection request may occur while the - * previous connection exist, e.g. in case of failover. - * But as connections may be initiated simultaneously - * by both hosts, we have a random backoff mechanism - - * see the comment above rds_queue_reconnect() - */ - mutex_lock(&conn->c_cm_lock); - if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING)) { - if (rds_conn_state(conn) == RDS_CONN_UP) { - rdsdebug("incoming connect while connecting\n"); - rds_conn_drop(conn); - rds_iw_stats_inc(s_iw_listen_closed_stale); - } else - if (rds_conn_state(conn) == RDS_CONN_CONNECTING) { - /* Wait and see - our connect may still be succeeding */ - rds_iw_stats_inc(s_iw_connect_raced); - } - mutex_unlock(&conn->c_cm_lock); - goto out; - } - - ic = conn->c_transport_data; - - rds_iw_set_protocol(conn, version); - rds_iw_set_flow_control(conn, be32_to_cpu(dp->dp_credit)); - - /* If the peer gave us the last packet it saw, process this as if - * we had received a regular ACK. */ - if (dp->dp_ack_seq) - rds_send_drop_acked(conn, be64_to_cpu(dp->dp_ack_seq), NULL); - - BUG_ON(cm_id->context); - BUG_ON(ic->i_cm_id); - - ic->i_cm_id = cm_id; - cm_id->context = conn; - - rds_iwdev = ib_get_client_data(cm_id->device, &rds_iw_client); - ic->i_dma_local_lkey = rds_iwdev->dma_local_lkey; - - /* We got halfway through setting up the ib_connection, if we - * fail now, we have to take the long route out of this mess. */ - destroy = 0; - - err = rds_iw_setup_qp(conn); - if (err) { - rds_iw_conn_error(conn, "rds_iw_setup_qp failed (%d)\n", err); - mutex_unlock(&conn->c_cm_lock); - goto out; - } - - rds_iw_cm_fill_conn_param(conn, &conn_param, &dp_rep, version); - - /* rdma_accept() calls rdma_reject() internally if it fails */ - err = rdma_accept(cm_id, &conn_param); - mutex_unlock(&conn->c_cm_lock); - if (err) { - rds_iw_conn_error(conn, "rdma_accept failed (%d)\n", err); - goto out; - } - - return 0; - -out: - rdma_reject(cm_id, NULL, 0); - return destroy; -} - - -int rds_iw_cm_initiate_connect(struct rdma_cm_id *cm_id) -{ - struct rds_connection *conn = cm_id->context; - struct rds_iw_connection *ic = conn->c_transport_data; - struct rdma_conn_param conn_param; - struct rds_iw_connect_private dp; - int ret; - - /* If the peer doesn't do protocol negotiation, we must - * default to RDSv3.0 */ - rds_iw_set_protocol(conn, RDS_PROTOCOL_3_0); - ic->i_flowctl = rds_iw_sysctl_flow_control; /* advertise flow control */ - - ret = rds_iw_setup_qp(conn); - if (ret) { - rds_iw_conn_error(conn, "rds_iw_setup_qp failed (%d)\n", ret); - goto out; - } - - rds_iw_cm_fill_conn_param(conn, &conn_param, &dp, RDS_PROTOCOL_VERSION); - - ret = rdma_connect(cm_id, &conn_param); - if (ret) - rds_iw_conn_error(conn, "rdma_connect failed (%d)\n", ret); - -out: - /* Beware - returning non-zero tells the rdma_cm to destroy - * the cm_id. We should certainly not do it as long as we still - * "own" the cm_id. */ - if (ret) { - struct rds_iw_connection *ic = conn->c_transport_data; - - if (ic->i_cm_id == cm_id) - ret = 0; - } - return ret; -} - -int rds_iw_conn_connect(struct rds_connection *conn) -{ - struct rds_iw_connection *ic = conn->c_transport_data; - struct rds_iw_device *rds_iwdev; - struct sockaddr_in src, dest; - int ret; - - /* XXX I wonder what affect the port space has */ - /* delegate cm event handler to rdma_transport */ - ic->i_cm_id = rdma_create_id(&init_net, rds_rdma_cm_event_handler, conn, - RDMA_PS_TCP, IB_QPT_RC); - if (IS_ERR(ic->i_cm_id)) { - ret = PTR_ERR(ic->i_cm_id); - ic->i_cm_id = NULL; - rdsdebug("rdma_create_id() failed: %d\n", ret); - goto out; - } - - rdsdebug("created cm id %p for conn %p\n", ic->i_cm_id, conn); - - src.sin_family = AF_INET; - src.sin_addr.s_addr = (__force u32)conn->c_laddr; - src.sin_port = (__force u16)htons(0); - - /* First, bind to the local address and device. */ - ret = rdma_bind_addr(ic->i_cm_id, (struct sockaddr *) &src); - if (ret) { - rdsdebug("rdma_bind_addr(%pI4) failed: %d\n", - &conn->c_laddr, ret); - rdma_destroy_id(ic->i_cm_id); - ic->i_cm_id = NULL; - goto out; - } - - rds_iwdev = ib_get_client_data(ic->i_cm_id->device, &rds_iw_client); - ic->i_dma_local_lkey = rds_iwdev->dma_local_lkey; - - dest.sin_family = AF_INET; - dest.sin_addr.s_addr = (__force u32)conn->c_faddr; - dest.sin_port = (__force u16)htons(RDS_PORT); - - ret = rdma_resolve_addr(ic->i_cm_id, (struct sockaddr *)&src, - (struct sockaddr *)&dest, - RDS_RDMA_RESOLVE_TIMEOUT_MS); - if (ret) { - rdsdebug("addr resolve failed for cm id %p: %d\n", ic->i_cm_id, - ret); - rdma_destroy_id(ic->i_cm_id); - ic->i_cm_id = NULL; - } - -out: - return ret; -} - -/* - * This is so careful about only cleaning up resources that were built up - * so that it can be called at any point during startup. In fact it - * can be called multiple times for a given connection. - */ -void rds_iw_conn_shutdown(struct rds_connection *conn) -{ - struct rds_iw_connection *ic = conn->c_transport_data; - int err = 0; - struct ib_qp_attr qp_attr; - - rdsdebug("cm %p pd %p cq %p %p qp %p\n", ic->i_cm_id, - ic->i_pd, ic->i_send_cq, ic->i_recv_cq, - ic->i_cm_id ? ic->i_cm_id->qp : NULL); - - if (ic->i_cm_id) { - struct ib_device *dev = ic->i_cm_id->device; - - rdsdebug("disconnecting cm %p\n", ic->i_cm_id); - err = rdma_disconnect(ic->i_cm_id); - if (err) { - /* Actually this may happen quite frequently, when - * an outgoing connect raced with an incoming connect. - */ - rdsdebug("failed to disconnect, cm: %p err %d\n", - ic->i_cm_id, err); - } - - if (ic->i_cm_id->qp) { - qp_attr.qp_state = IB_QPS_ERR; - ib_modify_qp(ic->i_cm_id->qp, &qp_attr, IB_QP_STATE); - } - - wait_event(rds_iw_ring_empty_wait, - rds_iw_ring_empty(&ic->i_send_ring) && - rds_iw_ring_empty(&ic->i_recv_ring)); - - if (ic->i_send_hdrs) - ib_dma_free_coherent(dev, - ic->i_send_ring.w_nr * - sizeof(struct rds_header), - ic->i_send_hdrs, - ic->i_send_hdrs_dma); - - if (ic->i_recv_hdrs) - ib_dma_free_coherent(dev, - ic->i_recv_ring.w_nr * - sizeof(struct rds_header), - ic->i_recv_hdrs, - ic->i_recv_hdrs_dma); - - if (ic->i_ack) - ib_dma_free_coherent(dev, sizeof(struct rds_header), - ic->i_ack, ic->i_ack_dma); - - if (ic->i_sends) - rds_iw_send_clear_ring(ic); - if (ic->i_recvs) - rds_iw_recv_clear_ring(ic); - - if (ic->i_cm_id->qp) - rdma_destroy_qp(ic->i_cm_id); - if (ic->i_send_cq) - ib_destroy_cq(ic->i_send_cq); - if (ic->i_recv_cq) - ib_destroy_cq(ic->i_recv_cq); - - /* - * If associated with an rds_iw_device: - * Move connection back to the nodev list. - * Remove cm_id from the device cm_id list. - */ - if (ic->rds_iwdev) - rds_iw_remove_conn(ic->rds_iwdev, conn); - - rdma_destroy_id(ic->i_cm_id); - - ic->i_cm_id = NULL; - ic->i_pd = NULL; - ic->i_mr = NULL; - ic->i_send_cq = NULL; - ic->i_recv_cq = NULL; - ic->i_send_hdrs = NULL; - ic->i_recv_hdrs = NULL; - ic->i_ack = NULL; - } - BUG_ON(ic->rds_iwdev); - - /* Clear pending transmit */ - if (ic->i_rm) { - rds_message_put(ic->i_rm); - ic->i_rm = NULL; - } - - /* Clear the ACK state */ - clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags); -#ifdef KERNEL_HAS_ATOMIC64 - atomic64_set(&ic->i_ack_next, 0); -#else - ic->i_ack_next = 0; -#endif - ic->i_ack_recv = 0; - - /* Clear flow control state */ - ic->i_flowctl = 0; - atomic_set(&ic->i_credits, 0); - - rds_iw_ring_init(&ic->i_send_ring, rds_iw_sysctl_max_send_wr); - rds_iw_ring_init(&ic->i_recv_ring, rds_iw_sysctl_max_recv_wr); - - if (ic->i_iwinc) { - rds_inc_put(&ic->i_iwinc->ii_inc); - ic->i_iwinc = NULL; - } - - vfree(ic->i_sends); - ic->i_sends = NULL; - vfree(ic->i_recvs); - ic->i_recvs = NULL; - rdsdebug("shutdown complete\n"); -} - -int rds_iw_conn_alloc(struct rds_connection *conn, gfp_t gfp) -{ - struct rds_iw_connection *ic; - unsigned long flags; - - /* XXX too lazy? */ - ic = kzalloc(sizeof(struct rds_iw_connection), gfp); - if (!ic) - return -ENOMEM; - - INIT_LIST_HEAD(&ic->iw_node); - tasklet_init(&ic->i_recv_tasklet, rds_iw_recv_tasklet_fn, - (unsigned long) ic); - mutex_init(&ic->i_recv_mutex); -#ifndef KERNEL_HAS_ATOMIC64 - spin_lock_init(&ic->i_ack_lock); -#endif - - /* - * rds_iw_conn_shutdown() waits for these to be emptied so they - * must be initialized before it can be called. - */ - rds_iw_ring_init(&ic->i_send_ring, rds_iw_sysctl_max_send_wr); - rds_iw_ring_init(&ic->i_recv_ring, rds_iw_sysctl_max_recv_wr); - - ic->conn = conn; - conn->c_transport_data = ic; - - spin_lock_irqsave(&iw_nodev_conns_lock, flags); - list_add_tail(&ic->iw_node, &iw_nodev_conns); - spin_unlock_irqrestore(&iw_nodev_conns_lock, flags); - - - rdsdebug("conn %p conn ic %p\n", conn, conn->c_transport_data); - return 0; -} - -/* - * Free a connection. Connection must be shut down and not set for reconnect. - */ -void rds_iw_conn_free(void *arg) -{ - struct rds_iw_connection *ic = arg; - spinlock_t *lock_ptr; - - rdsdebug("ic %p\n", ic); - - /* - * Conn is either on a dev's list or on the nodev list. - * A race with shutdown() or connect() would cause problems - * (since rds_iwdev would change) but that should never happen. - */ - lock_ptr = ic->rds_iwdev ? &ic->rds_iwdev->spinlock : &iw_nodev_conns_lock; - - spin_lock_irq(lock_ptr); - list_del(&ic->iw_node); - spin_unlock_irq(lock_ptr); - - kfree(ic); -} - -/* - * An error occurred on the connection - */ -void -__rds_iw_conn_error(struct rds_connection *conn, const char *fmt, ...) -{ - va_list ap; - - rds_conn_drop(conn); - - va_start(ap, fmt); - vprintk(fmt, ap); - va_end(ap); -} diff --git a/net/rds/iw_rdma.c b/net/rds/iw_rdma.c deleted file mode 100644 index b09a40c1a..000000000 --- a/net/rds/iw_rdma.c +++ /dev/null @@ -1,837 +0,0 @@ -/* - * Copyright (c) 2006 Oracle. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - */ -#include <linux/kernel.h> -#include <linux/slab.h> -#include <linux/ratelimit.h> - -#include "rds.h" -#include "iw.h" - - -/* - * This is stored as mr->r_trans_private. - */ -struct rds_iw_mr { - struct rds_iw_device *device; - struct rds_iw_mr_pool *pool; - struct rdma_cm_id *cm_id; - - struct ib_mr *mr; - - struct rds_iw_mapping mapping; - unsigned char remap_count; -}; - -/* - * Our own little MR pool - */ -struct rds_iw_mr_pool { - struct rds_iw_device *device; /* back ptr to the device that owns us */ - - struct mutex flush_lock; /* serialize fmr invalidate */ - struct work_struct flush_worker; /* flush worker */ - - spinlock_t list_lock; /* protect variables below */ - atomic_t item_count; /* total # of MRs */ - atomic_t dirty_count; /* # dirty of MRs */ - struct list_head dirty_list; /* dirty mappings */ - struct list_head clean_list; /* unused & unamapped MRs */ - atomic_t free_pinned; /* memory pinned by free MRs */ - unsigned long max_message_size; /* in pages */ - unsigned long max_items; - unsigned long max_items_soft; - unsigned long max_free_pinned; - int max_pages; -}; - -static void rds_iw_flush_mr_pool(struct rds_iw_mr_pool *pool, int free_all); -static void rds_iw_mr_pool_flush_worker(struct work_struct *work); -static int rds_iw_init_reg(struct rds_iw_mr_pool *pool, struct rds_iw_mr *ibmr); -static int rds_iw_map_reg(struct rds_iw_mr_pool *pool, - struct rds_iw_mr *ibmr, - struct scatterlist *sg, unsigned int nents); -static void rds_iw_free_fastreg(struct rds_iw_mr_pool *pool, struct rds_iw_mr *ibmr); -static unsigned int rds_iw_unmap_fastreg_list(struct rds_iw_mr_pool *pool, - struct list_head *unmap_list, - struct list_head *kill_list, - int *unpinned); -static void rds_iw_destroy_fastreg(struct rds_iw_mr_pool *pool, struct rds_iw_mr *ibmr); - -static int rds_iw_get_device(struct sockaddr_in *src, struct sockaddr_in *dst, - struct rds_iw_device **rds_iwdev, - struct rdma_cm_id **cm_id) -{ - struct rds_iw_device *iwdev; - struct rds_iw_cm_id *i_cm_id; - - *rds_iwdev = NULL; - *cm_id = NULL; - - list_for_each_entry(iwdev, &rds_iw_devices, list) { - spin_lock_irq(&iwdev->spinlock); - list_for_each_entry(i_cm_id, &iwdev->cm_id_list, list) { - struct sockaddr_in *src_addr, *dst_addr; - - src_addr = (struct sockaddr_in *)&i_cm_id->cm_id->route.addr.src_addr; - dst_addr = (struct sockaddr_in *)&i_cm_id->cm_id->route.addr.dst_addr; - - rdsdebug("local ipaddr = %x port %d, " - "remote ipaddr = %x port %d" - "..looking for %x port %d, " - "remote ipaddr = %x port %d\n", - src_addr->sin_addr.s_addr, - src_addr->sin_port, - dst_addr->sin_addr.s_addr, - dst_addr->sin_port, - src->sin_addr.s_addr, - src->sin_port, - dst->sin_addr.s_addr, - dst->sin_port); -#ifdef WORKING_TUPLE_DETECTION - if (src_addr->sin_addr.s_addr == src->sin_addr.s_addr && - src_addr->sin_port == src->sin_port && - dst_addr->sin_addr.s_addr == dst->sin_addr.s_addr && - dst_addr->sin_port == dst->sin_port) { -#else - /* FIXME - needs to compare the local and remote - * ipaddr/port tuple, but the ipaddr is the only - * available information in the rds_sock (as the rest are - * zero'ed. It doesn't appear to be properly populated - * during connection setup... - */ - if (src_addr->sin_addr.s_addr == src->sin_addr.s_addr) { -#endif - spin_unlock_irq(&iwdev->spinlock); - *rds_iwdev = iwdev; - *cm_id = i_cm_id->cm_id; - return 0; - } - } - spin_unlock_irq(&iwdev->spinlock); - } - - return 1; -} - -static int rds_iw_add_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_id) -{ - struct rds_iw_cm_id *i_cm_id; - - i_cm_id = kmalloc(sizeof *i_cm_id, GFP_KERNEL); - if (!i_cm_id) - return -ENOMEM; - - i_cm_id->cm_id = cm_id; - - spin_lock_irq(&rds_iwdev->spinlock); - list_add_tail(&i_cm_id->list, &rds_iwdev->cm_id_list); - spin_unlock_irq(&rds_iwdev->spinlock); - - return 0; -} - -static void rds_iw_remove_cm_id(struct rds_iw_device *rds_iwdev, - struct rdma_cm_id *cm_id) -{ - struct rds_iw_cm_id *i_cm_id; - - spin_lock_irq(&rds_iwdev->spinlock); - list_for_each_entry(i_cm_id, &rds_iwdev->cm_id_list, list) { - if (i_cm_id->cm_id == cm_id) { - list_del(&i_cm_id->list); - kfree(i_cm_id); - break; - } - } - spin_unlock_irq(&rds_iwdev->spinlock); -} - - -int rds_iw_update_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_id) -{ - struct sockaddr_in *src_addr, *dst_addr; - struct rds_iw_device *rds_iwdev_old; - struct rdma_cm_id *pcm_id; - int rc; - - src_addr = (struct sockaddr_in *)&cm_id->route.addr.src_addr; - dst_addr = (struct sockaddr_in *)&cm_id->route.addr.dst_addr; - - rc = rds_iw_get_device(src_addr, dst_addr, &rds_iwdev_old, &pcm_id); - if (rc) - rds_iw_remove_cm_id(rds_iwdev, cm_id); - - return rds_iw_add_cm_id(rds_iwdev, cm_id); -} - -void rds_iw_add_conn(struct rds_iw_device *rds_iwdev, struct rds_connection *conn) -{ - struct rds_iw_connection *ic = conn->c_transport_data; - - /* conn was previously on the nodev_conns_list */ - spin_lock_irq(&iw_nodev_conns_lock); - BUG_ON(list_empty(&iw_nodev_conns)); - BUG_ON(list_empty(&ic->iw_node)); - list_del(&ic->iw_node); - - spin_lock(&rds_iwdev->spinlock); - list_add_tail(&ic->iw_node, &rds_iwdev->conn_list); - spin_unlock(&rds_iwdev->spinlock); - spin_unlock_irq(&iw_nodev_conns_lock); - - ic->rds_iwdev = rds_iwdev; -} - -void rds_iw_remove_conn(struct rds_iw_device *rds_iwdev, struct rds_connection *conn) -{ - struct rds_iw_connection *ic = conn->c_transport_data; - - /* place conn on nodev_conns_list */ - spin_lock(&iw_nodev_conns_lock); - - spin_lock_irq(&rds_iwdev->spinlock); - BUG_ON(list_empty(&ic->iw_node)); - list_del(&ic->iw_node); - spin_unlock_irq(&rds_iwdev->spinlock); - - list_add_tail(&ic->iw_node, &iw_nodev_conns); - - spin_unlock(&iw_nodev_conns_lock); - - rds_iw_remove_cm_id(ic->rds_iwdev, ic->i_cm_id); - ic->rds_iwdev = NULL; -} - -void __rds_iw_destroy_conns(struct list_head *list, spinlock_t *list_lock) -{ - struct rds_iw_connection *ic, *_ic; - LIST_HEAD(tmp_list); - - /* avoid calling conn_destroy with irqs off */ - spin_lock_irq(list_lock); - list_splice(list, &tmp_list); - INIT_LIST_HEAD(list); - spin_unlock_irq(list_lock); - - list_for_each_entry_safe(ic, _ic, &tmp_list, iw_node) - rds_conn_destroy(ic->conn); -} - -static void rds_iw_set_scatterlist(struct rds_iw_scatterlist *sg, - struct scatterlist *list, unsigned int sg_len) -{ - sg->list = list; - sg->len = sg_len; - sg->dma_len = 0; - sg->dma_npages = 0; - sg->bytes = 0; -} - -static int rds_iw_map_scatterlist(struct rds_iw_device *rds_iwdev, - struct rds_iw_scatterlist *sg) -{ - struct ib_device *dev = rds_iwdev->dev; - int i, ret; - - WARN_ON(sg->dma_len); - - sg->dma_len = ib_dma_map_sg(dev, sg->list, sg->len, DMA_BIDIRECTIONAL); - if (unlikely(!sg->dma_len)) { - printk(KERN_WARNING "RDS/IW: dma_map_sg failed!\n"); - return -EBUSY; - } - - sg->bytes = 0; - sg->dma_npages = 0; - - ret = -EINVAL; - for (i = 0; i < sg->dma_len; ++i) { - unsigned int dma_len = ib_sg_dma_len(dev, &sg->list[i]); - u64 dma_addr = ib_sg_dma_address(dev, &sg->list[i]); - u64 end_addr; - - sg->bytes += dma_len; - - end_addr = dma_addr + dma_len; - if (dma_addr & PAGE_MASK) { - if (i > 0) - goto out_unmap; - dma_addr &= ~PAGE_MASK; - } - if (end_addr & PAGE_MASK) { - if (i < sg->dma_len - 1) - goto out_unmap; - end_addr = (end_addr + PAGE_MASK) & ~PAGE_MASK; - } - - sg->dma_npages += (end_addr - dma_addr) >> PAGE_SHIFT; - } - - /* Now gather the dma addrs into one list */ - if (sg->dma_npages > fastreg_message_size) - goto out_unmap; - - - - return 0; - -out_unmap: - ib_dma_unmap_sg(rds_iwdev->dev, sg->list, sg->len, DMA_BIDIRECTIONAL); - sg->dma_len = 0; - return ret; -} - - -struct rds_iw_mr_pool *rds_iw_create_mr_pool(struct rds_iw_device *rds_iwdev) -{ - struct rds_iw_mr_pool *pool; - - pool = kzalloc(sizeof(*pool), GFP_KERNEL); - if (!pool) { - printk(KERN_WARNING "RDS/IW: rds_iw_create_mr_pool alloc error\n"); - return ERR_PTR(-ENOMEM); - } - - pool->device = rds_iwdev; - INIT_LIST_HEAD(&pool->dirty_list); - INIT_LIST_HEAD(&pool->clean_list); - mutex_init(&pool->flush_lock); - spin_lock_init(&pool->list_lock); - INIT_WORK(&pool->flush_worker, rds_iw_mr_pool_flush_worker); - - pool->max_message_size = fastreg_message_size; - pool->max_items = fastreg_pool_size; - pool->max_free_pinned = pool->max_items * pool->max_message_size / 4; - pool->max_pages = fastreg_message_size; - - /* We never allow more than max_items MRs to be allocated. - * When we exceed more than max_items_soft, we start freeing - * items more aggressively. - * Make sure that max_items > max_items_soft > max_items / 2 - */ - pool->max_items_soft = pool->max_items * 3 / 4; - - return pool; -} - -void rds_iw_get_mr_info(struct rds_iw_device *rds_iwdev, struct rds_info_rdma_connection *iinfo) -{ - struct rds_iw_mr_pool *pool = rds_iwdev->mr_pool; - - iinfo->rdma_mr_max = pool->max_items; - iinfo->rdma_mr_size = pool->max_pages; -} - -void rds_iw_destroy_mr_pool(struct rds_iw_mr_pool *pool) -{ - flush_workqueue(rds_wq); - rds_iw_flush_mr_pool(pool, 1); - BUG_ON(atomic_read(&pool->item_count)); - BUG_ON(atomic_read(&pool->free_pinned)); - kfree(pool); -} - -static inline struct rds_iw_mr *rds_iw_reuse_fmr(struct rds_iw_mr_pool *pool) -{ - struct rds_iw_mr *ibmr = NULL; - unsigned long flags; - - spin_lock_irqsave(&pool->list_lock, flags); - if (!list_empty(&pool->clean_list)) { - ibmr = list_entry(pool->clean_list.next, struct rds_iw_mr, mapping.m_list); - list_del_init(&ibmr->mapping.m_list); - } - spin_unlock_irqrestore(&pool->list_lock, flags); - - return ibmr; -} - -static struct rds_iw_mr *rds_iw_alloc_mr(struct rds_iw_device *rds_iwdev) -{ - struct rds_iw_mr_pool *pool = rds_iwdev->mr_pool; - struct rds_iw_mr *ibmr = NULL; - int err = 0, iter = 0; - - while (1) { - ibmr = rds_iw_reuse_fmr(pool); - if (ibmr) - return ibmr; - - /* No clean MRs - now we have the choice of either - * allocating a fresh MR up to the limit imposed by the - * driver, or flush any dirty unused MRs. - * We try to avoid stalling in the send path if possible, - * so we allocate as long as we're allowed to. - * - * We're fussy with enforcing the FMR limit, though. If the driver - * tells us we can't use more than N fmrs, we shouldn't start - * arguing with it */ - if (atomic_inc_return(&pool->item_count) <= pool->max_items) - break; - - atomic_dec(&pool->item_count); - - if (++iter > 2) { - rds_iw_stats_inc(s_iw_rdma_mr_pool_depleted); - return ERR_PTR(-EAGAIN); - } - - /* We do have some empty MRs. Flush them out. */ - rds_iw_stats_inc(s_iw_rdma_mr_pool_wait); - rds_iw_flush_mr_pool(pool, 0); - } - - ibmr = kzalloc(sizeof(*ibmr), GFP_KERNEL); - if (!ibmr) { - err = -ENOMEM; - goto out_no_cigar; - } - - spin_lock_init(&ibmr->mapping.m_lock); - INIT_LIST_HEAD(&ibmr->mapping.m_list); - ibmr->mapping.m_mr = ibmr; - - err = rds_iw_init_reg(pool, ibmr); - if (err) - goto out_no_cigar; - - rds_iw_stats_inc(s_iw_rdma_mr_alloc); - return ibmr; - -out_no_cigar: - if (ibmr) { - rds_iw_destroy_fastreg(pool, ibmr); - kfree(ibmr); - } - atomic_dec(&pool->item_count); - return ERR_PTR(err); -} - -void rds_iw_sync_mr(void *trans_private, int direction) -{ - struct rds_iw_mr *ibmr = trans_private; - struct rds_iw_device *rds_iwdev = ibmr->device; - - switch (direction) { - case DMA_FROM_DEVICE: - ib_dma_sync_sg_for_cpu(rds_iwdev->dev, ibmr->mapping.m_sg.list, - ibmr->mapping.m_sg.dma_len, DMA_BIDIRECTIONAL); - break; - case DMA_TO_DEVICE: - ib_dma_sync_sg_for_device(rds_iwdev->dev, ibmr->mapping.m_sg.list, - ibmr->mapping.m_sg.dma_len, DMA_BIDIRECTIONAL); - break; - } -} - -/* - * Flush our pool of MRs. - * At a minimum, all currently unused MRs are unmapped. - * If the number of MRs allocated exceeds the limit, we also try - * to free as many MRs as needed to get back to this limit. - */ -static void rds_iw_flush_mr_pool(struct rds_iw_mr_pool *pool, int free_all) -{ - struct rds_iw_mr *ibmr, *next; - LIST_HEAD(unmap_list); - LIST_HEAD(kill_list); - unsigned long flags; - unsigned int nfreed = 0, ncleaned = 0, unpinned = 0; - - rds_iw_stats_inc(s_iw_rdma_mr_pool_flush); - - mutex_lock(&pool->flush_lock); - - spin_lock_irqsave(&pool->list_lock, flags); - /* Get the list of all mappings to be destroyed */ - list_splice_init(&pool->dirty_list, &unmap_list); - if (free_all) - list_splice_init(&pool->clean_list, &kill_list); - spin_unlock_irqrestore(&pool->list_lock, flags); - - /* Batched invalidate of dirty MRs. - * For FMR based MRs, the mappings on the unmap list are - * actually members of an ibmr (ibmr->mapping). They either - * migrate to the kill_list, or have been cleaned and should be - * moved to the clean_list. - * For fastregs, they will be dynamically allocated, and - * will be destroyed by the unmap function. - */ - if (!list_empty(&unmap_list)) { - ncleaned = rds_iw_unmap_fastreg_list(pool, &unmap_list, - &kill_list, &unpinned); - /* If we've been asked to destroy all MRs, move those - * that were simply cleaned to the kill list */ - if (free_all) - list_splice_init(&unmap_list, &kill_list); - } - - /* Destroy any MRs that are past their best before date */ - list_for_each_entry_safe(ibmr, next, &kill_list, mapping.m_list) { - rds_iw_stats_inc(s_iw_rdma_mr_free); - list_del(&ibmr->mapping.m_list); - rds_iw_destroy_fastreg(pool, ibmr); - kfree(ibmr); - nfreed++; - } - - /* Anything that remains are laundered ibmrs, which we can add - * back to the clean list. */ - if (!list_empty(&unmap_list)) { - spin_lock_irqsave(&pool->list_lock, flags); - list_splice(&unmap_list, &pool->clean_list); - spin_unlock_irqrestore(&pool->list_lock, flags); - } - - atomic_sub(unpinned, &pool->free_pinned); - atomic_sub(ncleaned, &pool->dirty_count); - atomic_sub(nfreed, &pool->item_count); - - mutex_unlock(&pool->flush_lock); -} - -static void rds_iw_mr_pool_flush_worker(struct work_struct *work) -{ - struct rds_iw_mr_pool *pool = container_of(work, struct rds_iw_mr_pool, flush_worker); - - rds_iw_flush_mr_pool(pool, 0); -} - -void rds_iw_free_mr(void *trans_private, int invalidate) -{ - struct rds_iw_mr *ibmr = trans_private; - struct rds_iw_mr_pool *pool = ibmr->device->mr_pool; - - rdsdebug("RDS/IW: free_mr nents %u\n", ibmr->mapping.m_sg.len); - if (!pool) - return; - - /* Return it to the pool's free list */ - rds_iw_free_fastreg(pool, ibmr); - - /* If we've pinned too many pages, request a flush */ - if (atomic_read(&pool->free_pinned) >= pool->max_free_pinned || - atomic_read(&pool->dirty_count) >= pool->max_items / 10) - queue_work(rds_wq, &pool->flush_worker); - - if (invalidate) { - if (likely(!in_interrupt())) { - rds_iw_flush_mr_pool(pool, 0); - } else { - /* We get here if the user created a MR marked - * as use_once and invalidate at the same time. */ - queue_work(rds_wq, &pool->flush_worker); - } - } -} - -void rds_iw_flush_mrs(void) -{ - struct rds_iw_device *rds_iwdev; - - list_for_each_entry(rds_iwdev, &rds_iw_devices, list) { - struct rds_iw_mr_pool *pool = rds_iwdev->mr_pool; - - if (pool) - rds_iw_flush_mr_pool(pool, 0); - } -} - -void *rds_iw_get_mr(struct scatterlist *sg, unsigned long nents, - struct rds_sock *rs, u32 *key_ret) -{ - struct rds_iw_device *rds_iwdev; - struct rds_iw_mr *ibmr = NULL; - struct rdma_cm_id *cm_id; - struct sockaddr_in src = { - .sin_addr.s_addr = rs->rs_bound_addr, - .sin_port = rs->rs_bound_port, - }; - struct sockaddr_in dst = { - .sin_addr.s_addr = rs->rs_conn_addr, - .sin_port = rs->rs_conn_port, - }; - int ret; - - ret = rds_iw_get_device(&src, &dst, &rds_iwdev, &cm_id); - if (ret || !cm_id) { - ret = -ENODEV; - goto out; - } - - if (!rds_iwdev->mr_pool) { - ret = -ENODEV; - goto out; - } - - ibmr = rds_iw_alloc_mr(rds_iwdev); - if (IS_ERR(ibmr)) - return ibmr; - - ibmr->cm_id = cm_id; - ibmr->device = rds_iwdev; - - ret = rds_iw_map_reg(rds_iwdev->mr_pool, ibmr, sg, nents); - if (ret == 0) - *key_ret = ibmr->mr->rkey; - else - printk(KERN_WARNING "RDS/IW: failed to map mr (errno=%d)\n", ret); - -out: - if (ret) { - if (ibmr) - rds_iw_free_mr(ibmr, 0); - ibmr = ERR_PTR(ret); - } - return ibmr; -} - -/* - * iWARP reg handling - * - * The life cycle of a fastreg registration is a bit different from - * FMRs. - * The idea behind fastreg is to have one MR, to which we bind different - * mappings over time. To avoid stalling on the expensive map and invalidate - * operations, these operations are pipelined on the same send queue on - * which we want to send the message containing the r_key. - * - * This creates a bit of a problem for us, as we do not have the destination - * IP in GET_MR, so the connection must be setup prior to the GET_MR call for - * RDMA to be correctly setup. If a fastreg request is present, rds_iw_xmit - * will try to queue a LOCAL_INV (if needed) and a REG_MR work request - * before queuing the SEND. When completions for these arrive, they are - * dispatched to the MR has a bit set showing that RDMa can be performed. - * - * There is another interesting aspect that's related to invalidation. - * The application can request that a mapping is invalidated in FREE_MR. - * The expectation there is that this invalidation step includes ALL - * PREVIOUSLY FREED MRs. - */ -static int rds_iw_init_reg(struct rds_iw_mr_pool *pool, - struct rds_iw_mr *ibmr) -{ - struct rds_iw_device *rds_iwdev = pool->device; - struct ib_mr *mr; - int err; - - mr = ib_alloc_mr(rds_iwdev->pd, IB_MR_TYPE_MEM_REG, - pool->max_message_size); - if (IS_ERR(mr)) { - err = PTR_ERR(mr); - - printk(KERN_WARNING "RDS/IW: ib_alloc_mr failed (err=%d)\n", err); - return err; - } - - ibmr->mr = mr; - return 0; -} - -static int rds_iw_rdma_reg_mr(struct rds_iw_mapping *mapping) -{ - struct rds_iw_mr *ibmr = mapping->m_mr; - struct rds_iw_scatterlist *m_sg = &mapping->m_sg; - struct ib_reg_wr reg_wr; - struct ib_send_wr *failed_wr; - int ret, n; - - n = ib_map_mr_sg_zbva(ibmr->mr, m_sg->list, m_sg->len, PAGE_SIZE); - if (unlikely(n != m_sg->len)) - return n < 0 ? n : -EINVAL; - - reg_wr.wr.next = NULL; - reg_wr.wr.opcode = IB_WR_REG_MR; - reg_wr.wr.wr_id = RDS_IW_REG_WR_ID; - reg_wr.wr.num_sge = 0; - reg_wr.mr = ibmr->mr; - reg_wr.key = mapping->m_rkey; - reg_wr.access = IB_ACCESS_LOCAL_WRITE | - IB_ACCESS_REMOTE_READ | - IB_ACCESS_REMOTE_WRITE; - - /* - * Perform a WR for the reg_mr. Each individual page - * in the sg list is added to the fast reg page list and placed - * inside the reg_mr WR. The key used is a rolling 8bit - * counter, which should guarantee uniqueness. - */ - ib_update_fast_reg_key(ibmr->mr, ibmr->remap_count++); - mapping->m_rkey = ibmr->mr->rkey; - - failed_wr = ®_wr.wr; - ret = ib_post_send(ibmr->cm_id->qp, ®_wr.wr, &failed_wr); - BUG_ON(failed_wr != ®_wr.wr); - if (ret) - printk_ratelimited(KERN_WARNING "RDS/IW: %s:%d ib_post_send returned %d\n", - __func__, __LINE__, ret); - return ret; -} - -static int rds_iw_rdma_fastreg_inv(struct rds_iw_mr *ibmr) -{ - struct ib_send_wr s_wr, *failed_wr; - int ret = 0; - - if (!ibmr->cm_id->qp || !ibmr->mr) - goto out; - - memset(&s_wr, 0, sizeof(s_wr)); - s_wr.wr_id = RDS_IW_LOCAL_INV_WR_ID; - s_wr.opcode = IB_WR_LOCAL_INV; - s_wr.ex.invalidate_rkey = ibmr->mr->rkey; - s_wr.send_flags = IB_SEND_SIGNALED; - - failed_wr = &s_wr; - ret = ib_post_send(ibmr->cm_id->qp, &s_wr, &failed_wr); - if (ret) { - printk_ratelimited(KERN_WARNING "RDS/IW: %s:%d ib_post_send returned %d\n", - __func__, __LINE__, ret); - goto out; - } -out: - return ret; -} - -static int rds_iw_map_reg(struct rds_iw_mr_pool *pool, - struct rds_iw_mr *ibmr, - struct scatterlist *sg, - unsigned int sg_len) -{ - struct rds_iw_device *rds_iwdev = pool->device; - struct rds_iw_mapping *mapping = &ibmr->mapping; - u64 *dma_pages; - int ret = 0; - - rds_iw_set_scatterlist(&mapping->m_sg, sg, sg_len); - - ret = rds_iw_map_scatterlist(rds_iwdev, &mapping->m_sg); - if (ret) { - dma_pages = NULL; - goto out; - } - - if (mapping->m_sg.dma_len > pool->max_message_size) { - ret = -EMSGSIZE; - goto out; - } - - ret = rds_iw_rdma_reg_mr(mapping); - if (ret) - goto out; - - rds_iw_stats_inc(s_iw_rdma_mr_used); - -out: - kfree(dma_pages); - - return ret; -} - -/* - * "Free" a fastreg MR. - */ -static void rds_iw_free_fastreg(struct rds_iw_mr_pool *pool, - struct rds_iw_mr *ibmr) -{ - unsigned long flags; - int ret; - - if (!ibmr->mapping.m_sg.dma_len) - return; - - ret = rds_iw_rdma_fastreg_inv(ibmr); - if (ret) - return; - - /* Try to post the LOCAL_INV WR to the queue. */ - spin_lock_irqsave(&pool->list_lock, flags); - - list_add_tail(&ibmr->mapping.m_list, &pool->dirty_list); - atomic_add(ibmr->mapping.m_sg.len, &pool->free_pinned); - atomic_inc(&pool->dirty_count); - - spin_unlock_irqrestore(&pool->list_lock, flags); -} - -static unsigned int rds_iw_unmap_fastreg_list(struct rds_iw_mr_pool *pool, - struct list_head *unmap_list, - struct list_head *kill_list, - int *unpinned) -{ - struct rds_iw_mapping *mapping, *next; - unsigned int ncleaned = 0; - LIST_HEAD(laundered); - - /* Batched invalidation of fastreg MRs. - * Why do we do it this way, even though we could pipeline unmap - * and remap? The reason is the application semantics - when the - * application requests an invalidation of MRs, it expects all - * previously released R_Keys to become invalid. - * - * If we implement MR reuse naively, we risk memory corruption - * (this has actually been observed). So the default behavior - * requires that a MR goes through an explicit unmap operation before - * we can reuse it again. - * - * We could probably improve on this a little, by allowing immediate - * reuse of a MR on the same socket (eg you could add small - * cache of unused MRs to strct rds_socket - GET_MR could grab one - * of these without requiring an explicit invalidate). - */ - while (!list_empty(unmap_list)) { - unsigned long flags; - - spin_lock_irqsave(&pool->list_lock, flags); - list_for_each_entry_safe(mapping, next, unmap_list, m_list) { - *unpinned += mapping->m_sg.len; - list_move(&mapping->m_list, &laundered); - ncleaned++; - } - spin_unlock_irqrestore(&pool->list_lock, flags); - } - - /* Move all laundered mappings back to the unmap list. - * We do not kill any WRs right now - it doesn't seem the - * fastreg API has a max_remap limit. */ - list_splice_init(&laundered, unmap_list); - - return ncleaned; -} - -static void rds_iw_destroy_fastreg(struct rds_iw_mr_pool *pool, - struct rds_iw_mr *ibmr) -{ - if (ibmr->mr) - ib_dereg_mr(ibmr->mr); -} diff --git a/net/rds/iw_recv.c b/net/rds/iw_recv.c deleted file mode 100644 index a66d1794b..000000000 --- a/net/rds/iw_recv.c +++ /dev/null @@ -1,904 +0,0 @@ -/* - * Copyright (c) 2006 Oracle. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - */ -#include <linux/kernel.h> -#include <linux/slab.h> -#include <linux/pci.h> -#include <linux/dma-mapping.h> -#include <rdma/rdma_cm.h> - -#include "rds.h" -#include "iw.h" - -static struct kmem_cache *rds_iw_incoming_slab; -static struct kmem_cache *rds_iw_frag_slab; -static atomic_t rds_iw_allocation = ATOMIC_INIT(0); - -static void rds_iw_frag_drop_page(struct rds_page_frag *frag) -{ - rdsdebug("frag %p page %p\n", frag, frag->f_page); - __free_page(frag->f_page); - frag->f_page = NULL; -} - -static void rds_iw_frag_free(struct rds_page_frag *frag) -{ - rdsdebug("frag %p page %p\n", frag, frag->f_page); - BUG_ON(frag->f_page); - kmem_cache_free(rds_iw_frag_slab, frag); -} - -/* - * We map a page at a time. Its fragments are posted in order. This - * is called in fragment order as the fragments get send completion events. - * Only the last frag in the page performs the unmapping. - * - * It's OK for ring cleanup to call this in whatever order it likes because - * DMA is not in flight and so we can unmap while other ring entries still - * hold page references in their frags. - */ -static void rds_iw_recv_unmap_page(struct rds_iw_connection *ic, - struct rds_iw_recv_work *recv) -{ - struct rds_page_frag *frag = recv->r_frag; - - rdsdebug("recv %p frag %p page %p\n", recv, frag, frag->f_page); - if (frag->f_mapped) - ib_dma_unmap_page(ic->i_cm_id->device, - frag->f_mapped, - RDS_FRAG_SIZE, DMA_FROM_DEVICE); - frag->f_mapped = 0; -} - -void rds_iw_recv_init_ring(struct rds_iw_connection *ic) -{ - struct rds_iw_recv_work *recv; - u32 i; - - for (i = 0, recv = ic->i_recvs; i < ic->i_recv_ring.w_nr; i++, recv++) { - struct ib_sge *sge; - - recv->r_iwinc = NULL; - recv->r_frag = NULL; - - recv->r_wr.next = NULL; - recv->r_wr.wr_id = i; - recv->r_wr.sg_list = recv->r_sge; - recv->r_wr.num_sge = RDS_IW_RECV_SGE; - - sge = rds_iw_data_sge(ic, recv->r_sge); - sge->addr = 0; - sge->length = RDS_FRAG_SIZE; - sge->lkey = 0; - - sge = rds_iw_header_sge(ic, recv->r_sge); - sge->addr = ic->i_recv_hdrs_dma + (i * sizeof(struct rds_header)); - sge->length = sizeof(struct rds_header); - sge->lkey = 0; - } -} - -static void rds_iw_recv_clear_one(struct rds_iw_connection *ic, - struct rds_iw_recv_work *recv) -{ - if (recv->r_iwinc) { - rds_inc_put(&recv->r_iwinc->ii_inc); - recv->r_iwinc = NULL; - } - if (recv->r_frag) { - rds_iw_recv_unmap_page(ic, recv); - if (recv->r_frag->f_page) - rds_iw_frag_drop_page(recv->r_frag); - rds_iw_frag_free(recv->r_frag); - recv->r_frag = NULL; - } -} - -void rds_iw_recv_clear_ring(struct rds_iw_connection *ic) -{ - u32 i; - - for (i = 0; i < ic->i_recv_ring.w_nr; i++) - rds_iw_recv_clear_one(ic, &ic->i_recvs[i]); - - if (ic->i_frag.f_page) - rds_iw_frag_drop_page(&ic->i_frag); -} - -static int rds_iw_recv_refill_one(struct rds_connection *conn, - struct rds_iw_recv_work *recv, - gfp_t kptr_gfp, gfp_t page_gfp) -{ - struct rds_iw_connection *ic = conn->c_transport_data; - dma_addr_t dma_addr; - struct ib_sge *sge; - int ret = -ENOMEM; - - if (!recv->r_iwinc) { - if (!atomic_add_unless(&rds_iw_allocation, 1, rds_iw_sysctl_max_recv_allocation)) { - rds_iw_stats_inc(s_iw_rx_alloc_limit); - goto out; - } - recv->r_iwinc = kmem_cache_alloc(rds_iw_incoming_slab, - kptr_gfp); - if (!recv->r_iwinc) { - atomic_dec(&rds_iw_allocation); - goto out; - } - INIT_LIST_HEAD(&recv->r_iwinc->ii_frags); - rds_inc_init(&recv->r_iwinc->ii_inc, conn, conn->c_faddr); - } - - if (!recv->r_frag) { - recv->r_frag = kmem_cache_alloc(rds_iw_frag_slab, kptr_gfp); - if (!recv->r_frag) - goto out; - INIT_LIST_HEAD(&recv->r_frag->f_item); - recv->r_frag->f_page = NULL; - } - - if (!ic->i_frag.f_page) { - ic->i_frag.f_page = alloc_page(page_gfp); - if (!ic->i_frag.f_page) - goto out; - ic->i_frag.f_offset = 0; - } - - dma_addr = ib_dma_map_page(ic->i_cm_id->device, - ic->i_frag.f_page, - ic->i_frag.f_offset, - RDS_FRAG_SIZE, - DMA_FROM_DEVICE); - if (ib_dma_mapping_error(ic->i_cm_id->device, dma_addr)) - goto out; - - /* - * Once we get the RDS_PAGE_LAST_OFF frag then rds_iw_frag_unmap() - * must be called on this recv. This happens as completions hit - * in order or on connection shutdown. - */ - recv->r_frag->f_page = ic->i_frag.f_page; - recv->r_frag->f_offset = ic->i_frag.f_offset; - recv->r_frag->f_mapped = dma_addr; - - sge = rds_iw_data_sge(ic, recv->r_sge); - sge->addr = dma_addr; - sge->length = RDS_FRAG_SIZE; - - sge = rds_iw_header_sge(ic, recv->r_sge); - sge->addr = ic->i_recv_hdrs_dma + (recv - ic->i_recvs) * sizeof(struct rds_header); - sge->length = sizeof(struct rds_header); - - get_page(recv->r_frag->f_page); - - if (ic->i_frag.f_offset < RDS_PAGE_LAST_OFF) { - ic->i_frag.f_offset += RDS_FRAG_SIZE; - } else { - put_page(ic->i_frag.f_page); - ic->i_frag.f_page = NULL; - ic->i_frag.f_offset = 0; - } - - ret = 0; -out: - return ret; -} - -/* - * This tries to allocate and post unused work requests after making sure that - * they have all the allocations they need to queue received fragments into - * sockets. The i_recv_mutex is held here so that ring_alloc and _unalloc - * pairs don't go unmatched. - * - * -1 is returned if posting fails due to temporary resource exhaustion. - */ -int rds_iw_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp, - gfp_t page_gfp, int prefill) -{ - struct rds_iw_connection *ic = conn->c_transport_data; - struct rds_iw_recv_work *recv; - struct ib_recv_wr *failed_wr; - unsigned int posted = 0; - int ret = 0; - u32 pos; - - while ((prefill || rds_conn_up(conn)) && - rds_iw_ring_alloc(&ic->i_recv_ring, 1, &pos)) { - if (pos >= ic->i_recv_ring.w_nr) { - printk(KERN_NOTICE "Argh - ring alloc returned pos=%u\n", - pos); - ret = -EINVAL; - break; - } - - recv = &ic->i_recvs[pos]; - ret = rds_iw_recv_refill_one(conn, recv, kptr_gfp, page_gfp); - if (ret) { - ret = -1; - break; - } - - /* XXX when can this fail? */ - ret = ib_post_recv(ic->i_cm_id->qp, &recv->r_wr, &failed_wr); - rdsdebug("recv %p iwinc %p page %p addr %lu ret %d\n", recv, - recv->r_iwinc, recv->r_frag->f_page, - (long) recv->r_frag->f_mapped, ret); - if (ret) { - rds_iw_conn_error(conn, "recv post on " - "%pI4 returned %d, disconnecting and " - "reconnecting\n", &conn->c_faddr, - ret); - ret = -1; - break; - } - - posted++; - } - - /* We're doing flow control - update the window. */ - if (ic->i_flowctl && posted) - rds_iw_advertise_credits(conn, posted); - - if (ret) - rds_iw_ring_unalloc(&ic->i_recv_ring, 1); - return ret; -} - -static void rds_iw_inc_purge(struct rds_incoming *inc) -{ - struct rds_iw_incoming *iwinc; - struct rds_page_frag *frag; - struct rds_page_frag *pos; - - iwinc = container_of(inc, struct rds_iw_incoming, ii_inc); - rdsdebug("purging iwinc %p inc %p\n", iwinc, inc); - - list_for_each_entry_safe(frag, pos, &iwinc->ii_frags, f_item) { - list_del_init(&frag->f_item); - rds_iw_frag_drop_page(frag); - rds_iw_frag_free(frag); - } -} - -void rds_iw_inc_free(struct rds_incoming *inc) -{ - struct rds_iw_incoming *iwinc; - - iwinc = container_of(inc, struct rds_iw_incoming, ii_inc); - - rds_iw_inc_purge(inc); - rdsdebug("freeing iwinc %p inc %p\n", iwinc, inc); - BUG_ON(!list_empty(&iwinc->ii_frags)); - kmem_cache_free(rds_iw_incoming_slab, iwinc); - atomic_dec(&rds_iw_allocation); - BUG_ON(atomic_read(&rds_iw_allocation) < 0); -} - -int rds_iw_inc_copy_to_user(struct rds_incoming *inc, struct iov_iter *to) -{ - struct rds_iw_incoming *iwinc; - struct rds_page_frag *frag; - unsigned long to_copy; - unsigned long frag_off = 0; - int copied = 0; - int ret; - u32 len; - - iwinc = container_of(inc, struct rds_iw_incoming, ii_inc); - frag = list_entry(iwinc->ii_frags.next, struct rds_page_frag, f_item); - len = be32_to_cpu(inc->i_hdr.h_len); - - while (iov_iter_count(to) && copied < len) { - if (frag_off == RDS_FRAG_SIZE) { - frag = list_entry(frag->f_item.next, - struct rds_page_frag, f_item); - frag_off = 0; - } - to_copy = min_t(unsigned long, iov_iter_count(to), - RDS_FRAG_SIZE - frag_off); - to_copy = min_t(unsigned long, to_copy, len - copied); - - /* XXX needs + offset for multiple recvs per page */ - rds_stats_add(s_copy_to_user, to_copy); - ret = copy_page_to_iter(frag->f_page, - frag->f_offset + frag_off, - to_copy, - to); - if (ret != to_copy) - return -EFAULT; - - frag_off += to_copy; - copied += to_copy; - } - - return copied; -} - -/* ic starts out kzalloc()ed */ -void rds_iw_recv_init_ack(struct rds_iw_connection *ic) -{ - struct ib_send_wr *wr = &ic->i_ack_wr; - struct ib_sge *sge = &ic->i_ack_sge; - - sge->addr = ic->i_ack_dma; - sge->length = sizeof(struct rds_header); - sge->lkey = rds_iw_local_dma_lkey(ic); - - wr->sg_list = sge; - wr->num_sge = 1; - wr->opcode = IB_WR_SEND; - wr->wr_id = RDS_IW_ACK_WR_ID; - wr->send_flags = IB_SEND_SIGNALED | IB_SEND_SOLICITED; -} - -/* - * You'd think that with reliable IB connections you wouldn't need to ack - * messages that have been received. The problem is that IB hardware generates - * an ack message before it has DMAed the message into memory. This creates a - * potential message loss if the HCA is disabled for any reason between when it - * sends the ack and before the message is DMAed and processed. This is only a - * potential issue if another HCA is available for fail-over. - * - * When the remote host receives our ack they'll free the sent message from - * their send queue. To decrease the latency of this we always send an ack - * immediately after we've received messages. - * - * For simplicity, we only have one ack in flight at a time. This puts - * pressure on senders to have deep enough send queues to absorb the latency of - * a single ack frame being in flight. This might not be good enough. - * - * This is implemented by have a long-lived send_wr and sge which point to a - * statically allocated ack frame. This ack wr does not fall under the ring - * accounting that the tx and rx wrs do. The QP attribute specifically makes - * room for it beyond the ring size. Send completion notices its special - * wr_id and avoids working with the ring in that case. - */ -#ifndef KERNEL_HAS_ATOMIC64 -static void rds_iw_set_ack(struct rds_iw_connection *ic, u64 seq, - int ack_required) -{ - unsigned long flags; - - spin_lock_irqsave(&ic->i_ack_lock, flags); - ic->i_ack_next = seq; - if (ack_required) - set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); - spin_unlock_irqrestore(&ic->i_ack_lock, flags); -} - -static u64 rds_iw_get_ack(struct rds_iw_connection *ic) -{ - unsigned long flags; - u64 seq; - - clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); - - spin_lock_irqsave(&ic->i_ack_lock, flags); - seq = ic->i_ack_next; - spin_unlock_irqrestore(&ic->i_ack_lock, flags); - - return seq; -} -#else -static void rds_iw_set_ack(struct rds_iw_connection *ic, u64 seq, - int ack_required) -{ - atomic64_set(&ic->i_ack_next, seq); - if (ack_required) { - smp_mb__before_atomic(); - set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); - } -} - -static u64 rds_iw_get_ack(struct rds_iw_connection *ic) -{ - clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); - smp_mb__after_atomic(); - - return atomic64_read(&ic->i_ack_next); -} -#endif - - -static void rds_iw_send_ack(struct rds_iw_connection *ic, unsigned int adv_credits) -{ - struct rds_header *hdr = ic->i_ack; - struct ib_send_wr *failed_wr; - u64 seq; - int ret; - - seq = rds_iw_get_ack(ic); - - rdsdebug("send_ack: ic %p ack %llu\n", ic, (unsigned long long) seq); - rds_message_populate_header(hdr, 0, 0, 0); - hdr->h_ack = cpu_to_be64(seq); - hdr->h_credit = adv_credits; - rds_message_make_checksum(hdr); - ic->i_ack_queued = jiffies; - - ret = ib_post_send(ic->i_cm_id->qp, &ic->i_ack_wr, &failed_wr); - if (unlikely(ret)) { - /* Failed to send. Release the WR, and - * force another ACK. - */ - clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags); - set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); - - rds_iw_stats_inc(s_iw_ack_send_failure); - - rds_iw_conn_error(ic->conn, "sending ack failed\n"); - } else - rds_iw_stats_inc(s_iw_ack_sent); -} - -/* - * There are 3 ways of getting acknowledgements to the peer: - * 1. We call rds_iw_attempt_ack from the recv completion handler - * to send an ACK-only frame. - * However, there can be only one such frame in the send queue - * at any time, so we may have to postpone it. - * 2. When another (data) packet is transmitted while there's - * an ACK in the queue, we piggyback the ACK sequence number - * on the data packet. - * 3. If the ACK WR is done sending, we get called from the - * send queue completion handler, and check whether there's - * another ACK pending (postponed because the WR was on the - * queue). If so, we transmit it. - * - * We maintain 2 variables: - * - i_ack_flags, which keeps track of whether the ACK WR - * is currently in the send queue or not (IB_ACK_IN_FLIGHT) - * - i_ack_next, which is the last sequence number we received - * - * Potentially, send queue and receive queue handlers can run concurrently. - * It would be nice to not have to use a spinlock to synchronize things, - * but the one problem that rules this out is that 64bit updates are - * not atomic on all platforms. Things would be a lot simpler if - * we had atomic64 or maybe cmpxchg64 everywhere. - * - * Reconnecting complicates this picture just slightly. When we - * reconnect, we may be seeing duplicate packets. The peer - * is retransmitting them, because it hasn't seen an ACK for - * them. It is important that we ACK these. - * - * ACK mitigation adds a header flag "ACK_REQUIRED"; any packet with - * this flag set *MUST* be acknowledged immediately. - */ - -/* - * When we get here, we're called from the recv queue handler. - * Check whether we ought to transmit an ACK. - */ -void rds_iw_attempt_ack(struct rds_iw_connection *ic) -{ - unsigned int adv_credits; - - if (!test_bit(IB_ACK_REQUESTED, &ic->i_ack_flags)) - return; - - if (test_and_set_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags)) { - rds_iw_stats_inc(s_iw_ack_send_delayed); - return; - } - - /* Can we get a send credit? */ - if (!rds_iw_send_grab_credits(ic, 1, &adv_credits, 0, RDS_MAX_ADV_CREDIT)) { - rds_iw_stats_inc(s_iw_tx_throttle); - clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags); - return; - } - - clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); - rds_iw_send_ack(ic, adv_credits); -} - -/* - * We get here from the send completion handler, when the - * adapter tells us the ACK frame was sent. - */ -void rds_iw_ack_send_complete(struct rds_iw_connection *ic) -{ - clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags); - rds_iw_attempt_ack(ic); -} - -/* - * This is called by the regular xmit code when it wants to piggyback - * an ACK on an outgoing frame. - */ -u64 rds_iw_piggyb_ack(struct rds_iw_connection *ic) -{ - if (test_and_clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags)) - rds_iw_stats_inc(s_iw_ack_send_piggybacked); - return rds_iw_get_ack(ic); -} - -/* - * It's kind of lame that we're copying from the posted receive pages into - * long-lived bitmaps. We could have posted the bitmaps and rdma written into - * them. But receiving new congestion bitmaps should be a *rare* event, so - * hopefully we won't need to invest that complexity in making it more - * efficient. By copying we can share a simpler core with TCP which has to - * copy. - */ -static void rds_iw_cong_recv(struct rds_connection *conn, - struct rds_iw_incoming *iwinc) -{ - struct rds_cong_map *map; - unsigned int map_off; - unsigned int map_page; - struct rds_page_frag *frag; - unsigned long frag_off; - unsigned long to_copy; - unsigned long copied; - uint64_t uncongested = 0; - void *addr; - - /* catch completely corrupt packets */ - if (be32_to_cpu(iwinc->ii_inc.i_hdr.h_len) != RDS_CONG_MAP_BYTES) - return; - - map = conn->c_fcong; - map_page = 0; - map_off = 0; - - frag = list_entry(iwinc->ii_frags.next, struct rds_page_frag, f_item); - frag_off = 0; - - copied = 0; - - while (copied < RDS_CONG_MAP_BYTES) { - uint64_t *src, *dst; - unsigned int k; - - to_copy = min(RDS_FRAG_SIZE - frag_off, PAGE_SIZE - map_off); - BUG_ON(to_copy & 7); /* Must be 64bit aligned. */ - - addr = kmap_atomic(frag->f_page); - - src = addr + frag_off; - dst = (void *)map->m_page_addrs[map_page] + map_off; - for (k = 0; k < to_copy; k += 8) { - /* Record ports that became uncongested, ie - * bits that changed from 0 to 1. */ - uncongested |= ~(*src) & *dst; - *dst++ = *src++; - } - kunmap_atomic(addr); - - copied += to_copy; - - map_off += to_copy; - if (map_off == PAGE_SIZE) { - map_off = 0; - map_page++; - } - - frag_off += to_copy; - if (frag_off == RDS_FRAG_SIZE) { - frag = list_entry(frag->f_item.next, - struct rds_page_frag, f_item); - frag_off = 0; - } - } - - /* the congestion map is in little endian order */ - uncongested = le64_to_cpu(uncongested); - - rds_cong_map_updated(map, uncongested); -} - -/* - * Rings are posted with all the allocations they'll need to queue the - * incoming message to the receiving socket so this can't fail. - * All fragments start with a header, so we can make sure we're not receiving - * garbage, and we can tell a small 8 byte fragment from an ACK frame. - */ -struct rds_iw_ack_state { - u64 ack_next; - u64 ack_recv; - unsigned int ack_required:1; - unsigned int ack_next_valid:1; - unsigned int ack_recv_valid:1; -}; - -static void rds_iw_process_recv(struct rds_connection *conn, - struct rds_iw_recv_work *recv, u32 byte_len, - struct rds_iw_ack_state *state) -{ - struct rds_iw_connection *ic = conn->c_transport_data; - struct rds_iw_incoming *iwinc = ic->i_iwinc; - struct rds_header *ihdr, *hdr; - - /* XXX shut down the connection if port 0,0 are seen? */ - - rdsdebug("ic %p iwinc %p recv %p byte len %u\n", ic, iwinc, recv, - byte_len); - - if (byte_len < sizeof(struct rds_header)) { - rds_iw_conn_error(conn, "incoming message " - "from %pI4 didn't include a " - "header, disconnecting and " - "reconnecting\n", - &conn->c_faddr); - return; - } - byte_len -= sizeof(struct rds_header); - - ihdr = &ic->i_recv_hdrs[recv - ic->i_recvs]; - - /* Validate the checksum. */ - if (!rds_message_verify_checksum(ihdr)) { - rds_iw_conn_error(conn, "incoming message " - "from %pI4 has corrupted header - " - "forcing a reconnect\n", - &conn->c_faddr); - rds_stats_inc(s_recv_drop_bad_checksum); - return; - } - - /* Process the ACK sequence which comes with every packet */ - state->ack_recv = be64_to_cpu(ihdr->h_ack); - state->ack_recv_valid = 1; - - /* Process the credits update if there was one */ - if (ihdr->h_credit) - rds_iw_send_add_credits(conn, ihdr->h_credit); - - if (ihdr->h_sport == 0 && ihdr->h_dport == 0 && byte_len == 0) { - /* This is an ACK-only packet. The fact that it gets - * special treatment here is that historically, ACKs - * were rather special beasts. - */ - rds_iw_stats_inc(s_iw_ack_received); - - /* - * Usually the frags make their way on to incs and are then freed as - * the inc is freed. We don't go that route, so we have to drop the - * page ref ourselves. We can't just leave the page on the recv - * because that confuses the dma mapping of pages and each recv's use - * of a partial page. We can leave the frag, though, it will be - * reused. - * - * FIXME: Fold this into the code path below. - */ - rds_iw_frag_drop_page(recv->r_frag); - return; - } - - /* - * If we don't already have an inc on the connection then this - * fragment has a header and starts a message.. copy its header - * into the inc and save the inc so we can hang upcoming fragments - * off its list. - */ - if (!iwinc) { - iwinc = recv->r_iwinc; - recv->r_iwinc = NULL; - ic->i_iwinc = iwinc; - - hdr = &iwinc->ii_inc.i_hdr; - memcpy(hdr, ihdr, sizeof(*hdr)); - ic->i_recv_data_rem = be32_to_cpu(hdr->h_len); - - rdsdebug("ic %p iwinc %p rem %u flag 0x%x\n", ic, iwinc, - ic->i_recv_data_rem, hdr->h_flags); - } else { - hdr = &iwinc->ii_inc.i_hdr; - /* We can't just use memcmp here; fragments of a - * single message may carry different ACKs */ - if (hdr->h_sequence != ihdr->h_sequence || - hdr->h_len != ihdr->h_len || - hdr->h_sport != ihdr->h_sport || - hdr->h_dport != ihdr->h_dport) { - rds_iw_conn_error(conn, - "fragment header mismatch; forcing reconnect\n"); - return; - } - } - - list_add_tail(&recv->r_frag->f_item, &iwinc->ii_frags); - recv->r_frag = NULL; - - if (ic->i_recv_data_rem > RDS_FRAG_SIZE) - ic->i_recv_data_rem -= RDS_FRAG_SIZE; - else { - ic->i_recv_data_rem = 0; - ic->i_iwinc = NULL; - - if (iwinc->ii_inc.i_hdr.h_flags == RDS_FLAG_CONG_BITMAP) - rds_iw_cong_recv(conn, iwinc); - else { - rds_recv_incoming(conn, conn->c_faddr, conn->c_laddr, - &iwinc->ii_inc, GFP_ATOMIC); - state->ack_next = be64_to_cpu(hdr->h_sequence); - state->ack_next_valid = 1; - } - - /* Evaluate the ACK_REQUIRED flag *after* we received - * the complete frame, and after bumping the next_rx - * sequence. */ - if (hdr->h_flags & RDS_FLAG_ACK_REQUIRED) { - rds_stats_inc(s_recv_ack_required); - state->ack_required = 1; - } - - rds_inc_put(&iwinc->ii_inc); - } -} - -/* - * Plucking the oldest entry from the ring can be done concurrently with - * the thread refilling the ring. Each ring operation is protected by - * spinlocks and the transient state of refilling doesn't change the - * recording of which entry is oldest. - * - * This relies on IB only calling one cq comp_handler for each cq so that - * there will only be one caller of rds_recv_incoming() per RDS connection. - */ -void rds_iw_recv_cq_comp_handler(struct ib_cq *cq, void *context) -{ - struct rds_connection *conn = context; - struct rds_iw_connection *ic = conn->c_transport_data; - - rdsdebug("conn %p cq %p\n", conn, cq); - - rds_iw_stats_inc(s_iw_rx_cq_call); - - tasklet_schedule(&ic->i_recv_tasklet); -} - -static inline void rds_poll_cq(struct rds_iw_connection *ic, - struct rds_iw_ack_state *state) -{ - struct rds_connection *conn = ic->conn; - struct ib_wc wc; - struct rds_iw_recv_work *recv; - - while (ib_poll_cq(ic->i_recv_cq, 1, &wc) > 0) { - rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n", - (unsigned long long)wc.wr_id, wc.status, wc.byte_len, - be32_to_cpu(wc.ex.imm_data)); - rds_iw_stats_inc(s_iw_rx_cq_event); - - recv = &ic->i_recvs[rds_iw_ring_oldest(&ic->i_recv_ring)]; - - rds_iw_recv_unmap_page(ic, recv); - - /* - * Also process recvs in connecting state because it is possible - * to get a recv completion _before_ the rdmacm ESTABLISHED - * event is processed. - */ - if (rds_conn_up(conn) || rds_conn_connecting(conn)) { - /* We expect errors as the qp is drained during shutdown */ - if (wc.status == IB_WC_SUCCESS) { - rds_iw_process_recv(conn, recv, wc.byte_len, state); - } else { - rds_iw_conn_error(conn, "recv completion on " - "%pI4 had status %u, disconnecting and " - "reconnecting\n", &conn->c_faddr, - wc.status); - } - } - - rds_iw_ring_free(&ic->i_recv_ring, 1); - } -} - -void rds_iw_recv_tasklet_fn(unsigned long data) -{ - struct rds_iw_connection *ic = (struct rds_iw_connection *) data; - struct rds_connection *conn = ic->conn; - struct rds_iw_ack_state state = { 0, }; - - rds_poll_cq(ic, &state); - ib_req_notify_cq(ic->i_recv_cq, IB_CQ_SOLICITED); - rds_poll_cq(ic, &state); - - if (state.ack_next_valid) - rds_iw_set_ack(ic, state.ack_next, state.ack_required); - if (state.ack_recv_valid && state.ack_recv > ic->i_ack_recv) { - rds_send_drop_acked(conn, state.ack_recv, NULL); - ic->i_ack_recv = state.ack_recv; - } - if (rds_conn_up(conn)) - rds_iw_attempt_ack(ic); - - /* If we ever end up with a really empty receive ring, we're - * in deep trouble, as the sender will definitely see RNR - * timeouts. */ - if (rds_iw_ring_empty(&ic->i_recv_ring)) - rds_iw_stats_inc(s_iw_rx_ring_empty); - - /* - * If the ring is running low, then schedule the thread to refill. - */ - if (rds_iw_ring_low(&ic->i_recv_ring)) - queue_delayed_work(rds_wq, &conn->c_recv_w, 0); -} - -int rds_iw_recv(struct rds_connection *conn) -{ - struct rds_iw_connection *ic = conn->c_transport_data; - int ret = 0; - - rdsdebug("conn %p\n", conn); - - /* - * If we get a temporary posting failure in this context then - * we're really low and we want the caller to back off for a bit. - */ - mutex_lock(&ic->i_recv_mutex); - if (rds_iw_recv_refill(conn, GFP_KERNEL, GFP_HIGHUSER, 0)) - ret = -ENOMEM; - else - rds_iw_stats_inc(s_iw_rx_refill_from_thread); - mutex_unlock(&ic->i_recv_mutex); - - if (rds_conn_up(conn)) - rds_iw_attempt_ack(ic); - - return ret; -} - -int rds_iw_recv_init(void) -{ - struct sysinfo si; - int ret = -ENOMEM; - - /* Default to 30% of all available RAM for recv memory */ - si_meminfo(&si); - rds_iw_sysctl_max_recv_allocation = si.totalram / 3 * PAGE_SIZE / RDS_FRAG_SIZE; - - rds_iw_incoming_slab = kmem_cache_create("rds_iw_incoming", - sizeof(struct rds_iw_incoming), - 0, 0, NULL); - if (!rds_iw_incoming_slab) - goto out; - - rds_iw_frag_slab = kmem_cache_create("rds_iw_frag", - sizeof(struct rds_page_frag), - 0, 0, NULL); - if (!rds_iw_frag_slab) - kmem_cache_destroy(rds_iw_incoming_slab); - else - ret = 0; -out: - return ret; -} - -void rds_iw_recv_exit(void) -{ - kmem_cache_destroy(rds_iw_incoming_slab); - kmem_cache_destroy(rds_iw_frag_slab); -} diff --git a/net/rds/iw_ring.c b/net/rds/iw_ring.c deleted file mode 100644 index da8e3b63f..000000000 --- a/net/rds/iw_ring.c +++ /dev/null @@ -1,169 +0,0 @@ -/* - * Copyright (c) 2006 Oracle. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - */ -#include <linux/kernel.h> - -#include "rds.h" -#include "iw.h" - -/* - * Locking for IB rings. - * We assume that allocation is always protected by a mutex - * in the caller (this is a valid assumption for the current - * implementation). - * - * Freeing always happens in an interrupt, and hence only - * races with allocations, but not with other free()s. - * - * The interaction between allocation and freeing is that - * the alloc code has to determine the number of free entries. - * To this end, we maintain two counters; an allocation counter - * and a free counter. Both are allowed to run freely, and wrap - * around. - * The number of used entries is always (alloc_ctr - free_ctr) % NR. - * - * The current implementation makes free_ctr atomic. When the - * caller finds an allocation fails, it should set an "alloc fail" - * bit and retry the allocation. The "alloc fail" bit essentially tells - * the CQ completion handlers to wake it up after freeing some - * more entries. - */ - -/* - * This only happens on shutdown. - */ -DECLARE_WAIT_QUEUE_HEAD(rds_iw_ring_empty_wait); - -void rds_iw_ring_init(struct rds_iw_work_ring *ring, u32 nr) -{ - memset(ring, 0, sizeof(*ring)); - ring->w_nr = nr; - rdsdebug("ring %p nr %u\n", ring, ring->w_nr); -} - -static inline u32 __rds_iw_ring_used(struct rds_iw_work_ring *ring) -{ - u32 diff; - - /* This assumes that atomic_t has at least as many bits as u32 */ - diff = ring->w_alloc_ctr - (u32) atomic_read(&ring->w_free_ctr); - BUG_ON(diff > ring->w_nr); - - return diff; -} - -void rds_iw_ring_resize(struct rds_iw_work_ring *ring, u32 nr) -{ - /* We only ever get called from the connection setup code, - * prior to creating the QP. */ - BUG_ON(__rds_iw_ring_used(ring)); - ring->w_nr = nr; -} - -static int __rds_iw_ring_empty(struct rds_iw_work_ring *ring) -{ - return __rds_iw_ring_used(ring) == 0; -} - -u32 rds_iw_ring_alloc(struct rds_iw_work_ring *ring, u32 val, u32 *pos) -{ - u32 ret = 0, avail; - - avail = ring->w_nr - __rds_iw_ring_used(ring); - - rdsdebug("ring %p val %u next %u free %u\n", ring, val, - ring->w_alloc_ptr, avail); - - if (val && avail) { - ret = min(val, avail); - *pos = ring->w_alloc_ptr; - - ring->w_alloc_ptr = (ring->w_alloc_ptr + ret) % ring->w_nr; - ring->w_alloc_ctr += ret; - } - - return ret; -} - -void rds_iw_ring_free(struct rds_iw_work_ring *ring, u32 val) -{ - ring->w_free_ptr = (ring->w_free_ptr + val) % ring->w_nr; - atomic_add(val, &ring->w_free_ctr); - - if (__rds_iw_ring_empty(ring) && - waitqueue_active(&rds_iw_ring_empty_wait)) - wake_up(&rds_iw_ring_empty_wait); -} - -void rds_iw_ring_unalloc(struct rds_iw_work_ring *ring, u32 val) -{ - ring->w_alloc_ptr = (ring->w_alloc_ptr - val) % ring->w_nr; - ring->w_alloc_ctr -= val; -} - -int rds_iw_ring_empty(struct rds_iw_work_ring *ring) -{ - return __rds_iw_ring_empty(ring); -} - -int rds_iw_ring_low(struct rds_iw_work_ring *ring) -{ - return __rds_iw_ring_used(ring) <= (ring->w_nr >> 1); -} - - -/* - * returns the oldest alloced ring entry. This will be the next one - * freed. This can't be called if there are none allocated. - */ -u32 rds_iw_ring_oldest(struct rds_iw_work_ring *ring) -{ - return ring->w_free_ptr; -} - -/* - * returns the number of completed work requests. - */ - -u32 rds_iw_ring_completed(struct rds_iw_work_ring *ring, u32 wr_id, u32 oldest) -{ - u32 ret; - - if (oldest <= (unsigned long long)wr_id) - ret = (unsigned long long)wr_id - oldest + 1; - else - ret = ring->w_nr - oldest + (unsigned long long)wr_id + 1; - - rdsdebug("ring %p ret %u wr_id %u oldest %u\n", ring, ret, - wr_id, oldest); - return ret; -} diff --git a/net/rds/iw_send.c b/net/rds/iw_send.c deleted file mode 100644 index e20bd503f..000000000 --- a/net/rds/iw_send.c +++ /dev/null @@ -1,981 +0,0 @@ -/* - * Copyright (c) 2006 Oracle. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - */ -#include <linux/kernel.h> -#include <linux/in.h> -#include <linux/device.h> -#include <linux/dmapool.h> -#include <linux/ratelimit.h> - -#include "rds.h" -#include "iw.h" - -static void rds_iw_send_rdma_complete(struct rds_message *rm, - int wc_status) -{ - int notify_status; - - switch (wc_status) { - case IB_WC_WR_FLUSH_ERR: - return; - - case IB_WC_SUCCESS: - notify_status = RDS_RDMA_SUCCESS; - break; - - case IB_WC_REM_ACCESS_ERR: - notify_status = RDS_RDMA_REMOTE_ERROR; - break; - - default: - notify_status = RDS_RDMA_OTHER_ERROR; - break; - } - rds_rdma_send_complete(rm, notify_status); -} - -static void rds_iw_send_unmap_rdma(struct rds_iw_connection *ic, - struct rm_rdma_op *op) -{ - if (op->op_mapped) { - ib_dma_unmap_sg(ic->i_cm_id->device, - op->op_sg, op->op_nents, - op->op_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE); - op->op_mapped = 0; - } -} - -static void rds_iw_send_unmap_rm(struct rds_iw_connection *ic, - struct rds_iw_send_work *send, - int wc_status) -{ - struct rds_message *rm = send->s_rm; - - rdsdebug("ic %p send %p rm %p\n", ic, send, rm); - - ib_dma_unmap_sg(ic->i_cm_id->device, - rm->data.op_sg, rm->data.op_nents, - DMA_TO_DEVICE); - - if (rm->rdma.op_active) { - rds_iw_send_unmap_rdma(ic, &rm->rdma); - - /* If the user asked for a completion notification on this - * message, we can implement three different semantics: - * 1. Notify when we received the ACK on the RDS message - * that was queued with the RDMA. This provides reliable - * notification of RDMA status at the expense of a one-way - * packet delay. - * 2. Notify when the IB stack gives us the completion event for - * the RDMA operation. - * 3. Notify when the IB stack gives us the completion event for - * the accompanying RDS messages. - * Here, we implement approach #3. To implement approach #2, - * call rds_rdma_send_complete from the cq_handler. To implement #1, - * don't call rds_rdma_send_complete at all, and fall back to the notify - * handling in the ACK processing code. - * - * Note: There's no need to explicitly sync any RDMA buffers using - * ib_dma_sync_sg_for_cpu - the completion for the RDMA - * operation itself unmapped the RDMA buffers, which takes care - * of synching. - */ - rds_iw_send_rdma_complete(rm, wc_status); - - if (rm->rdma.op_write) - rds_stats_add(s_send_rdma_bytes, rm->rdma.op_bytes); - else - rds_stats_add(s_recv_rdma_bytes, rm->rdma.op_bytes); - } - - /* If anyone waited for this message to get flushed out, wake - * them up now */ - rds_message_unmapped(rm); - - rds_message_put(rm); - send->s_rm = NULL; -} - -void rds_iw_send_init_ring(struct rds_iw_connection *ic) -{ - struct rds_iw_send_work *send; - u32 i; - - for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) { - struct ib_sge *sge; - - send->s_rm = NULL; - send->s_op = NULL; - send->s_mapping = NULL; - - send->s_send_wr.next = NULL; - send->s_send_wr.wr_id = i; - send->s_send_wr.sg_list = send->s_sge; - send->s_send_wr.num_sge = 1; - send->s_send_wr.opcode = IB_WR_SEND; - send->s_send_wr.send_flags = 0; - send->s_send_wr.ex.imm_data = 0; - - sge = rds_iw_data_sge(ic, send->s_sge); - sge->lkey = 0; - - sge = rds_iw_header_sge(ic, send->s_sge); - sge->addr = ic->i_send_hdrs_dma + (i * sizeof(struct rds_header)); - sge->length = sizeof(struct rds_header); - sge->lkey = 0; - - send->s_mr = ib_alloc_mr(ic->i_pd, IB_MR_TYPE_MEM_REG, - fastreg_message_size); - if (IS_ERR(send->s_mr)) { - printk(KERN_WARNING "RDS/IW: ib_alloc_mr failed\n"); - break; - } - } -} - -void rds_iw_send_clear_ring(struct rds_iw_connection *ic) -{ - struct rds_iw_send_work *send; - u32 i; - - for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) { - BUG_ON(!send->s_mr); - ib_dereg_mr(send->s_mr); - if (send->s_send_wr.opcode == 0xdead) - continue; - if (send->s_rm) - rds_iw_send_unmap_rm(ic, send, IB_WC_WR_FLUSH_ERR); - if (send->s_op) - rds_iw_send_unmap_rdma(ic, send->s_op); - } -} - -/* - * The _oldest/_free ring operations here race cleanly with the alloc/unalloc - * operations performed in the send path. As the sender allocs and potentially - * unallocs the next free entry in the ring it doesn't alter which is - * the next to be freed, which is what this is concerned with. - */ -void rds_iw_send_cq_comp_handler(struct ib_cq *cq, void *context) -{ - struct rds_connection *conn = context; - struct rds_iw_connection *ic = conn->c_transport_data; - struct ib_wc wc; - struct rds_iw_send_work *send; - u32 completed; - u32 oldest; - u32 i; - int ret; - - rdsdebug("cq %p conn %p\n", cq, conn); - rds_iw_stats_inc(s_iw_tx_cq_call); - ret = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); - if (ret) - rdsdebug("ib_req_notify_cq send failed: %d\n", ret); - - while (ib_poll_cq(cq, 1, &wc) > 0) { - rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n", - (unsigned long long)wc.wr_id, wc.status, wc.byte_len, - be32_to_cpu(wc.ex.imm_data)); - rds_iw_stats_inc(s_iw_tx_cq_event); - - if (wc.status != IB_WC_SUCCESS) { - printk(KERN_ERR "WC Error: status = %d opcode = %d\n", wc.status, wc.opcode); - break; - } - - if (wc.opcode == IB_WC_LOCAL_INV && wc.wr_id == RDS_IW_LOCAL_INV_WR_ID) { - ic->i_fastreg_posted = 0; - continue; - } - - if (wc.opcode == IB_WC_REG_MR && wc.wr_id == RDS_IW_REG_WR_ID) { - ic->i_fastreg_posted = 1; - continue; - } - - if (wc.wr_id == RDS_IW_ACK_WR_ID) { - if (time_after(jiffies, ic->i_ack_queued + HZ/2)) - rds_iw_stats_inc(s_iw_tx_stalled); - rds_iw_ack_send_complete(ic); - continue; - } - - oldest = rds_iw_ring_oldest(&ic->i_send_ring); - - completed = rds_iw_ring_completed(&ic->i_send_ring, wc.wr_id, oldest); - - for (i = 0; i < completed; i++) { - send = &ic->i_sends[oldest]; - - /* In the error case, wc.opcode sometimes contains garbage */ - switch (send->s_send_wr.opcode) { - case IB_WR_SEND: - if (send->s_rm) - rds_iw_send_unmap_rm(ic, send, wc.status); - break; - case IB_WR_REG_MR: - case IB_WR_RDMA_WRITE: - case IB_WR_RDMA_READ: - case IB_WR_RDMA_READ_WITH_INV: - /* Nothing to be done - the SG list will be unmapped - * when the SEND completes. */ - break; - default: - printk_ratelimited(KERN_NOTICE - "RDS/IW: %s: unexpected opcode 0x%x in WR!\n", - __func__, send->s_send_wr.opcode); - break; - } - - send->s_send_wr.opcode = 0xdead; - send->s_send_wr.num_sge = 1; - if (time_after(jiffies, send->s_queued + HZ/2)) - rds_iw_stats_inc(s_iw_tx_stalled); - - /* If a RDMA operation produced an error, signal this right - * away. If we don't, the subsequent SEND that goes with this - * RDMA will be canceled with ERR_WFLUSH, and the application - * never learn that the RDMA failed. */ - if (unlikely(wc.status == IB_WC_REM_ACCESS_ERR && send->s_op)) { - struct rds_message *rm; - - rm = rds_send_get_message(conn, send->s_op); - if (rm) - rds_iw_send_rdma_complete(rm, wc.status); - } - - oldest = (oldest + 1) % ic->i_send_ring.w_nr; - } - - rds_iw_ring_free(&ic->i_send_ring, completed); - - if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags) || - test_bit(0, &conn->c_map_queued)) - queue_delayed_work(rds_wq, &conn->c_send_w, 0); - - /* We expect errors as the qp is drained during shutdown */ - if (wc.status != IB_WC_SUCCESS && rds_conn_up(conn)) { - rds_iw_conn_error(conn, - "send completion on %pI4 " - "had status %u, disconnecting and reconnecting\n", - &conn->c_faddr, wc.status); - } - } -} - -/* - * This is the main function for allocating credits when sending - * messages. - * - * Conceptually, we have two counters: - * - send credits: this tells us how many WRs we're allowed - * to submit without overruning the receiver's queue. For - * each SEND WR we post, we decrement this by one. - * - * - posted credits: this tells us how many WRs we recently - * posted to the receive queue. This value is transferred - * to the peer as a "credit update" in a RDS header field. - * Every time we transmit credits to the peer, we subtract - * the amount of transferred credits from this counter. - * - * It is essential that we avoid situations where both sides have - * exhausted their send credits, and are unable to send new credits - * to the peer. We achieve this by requiring that we send at least - * one credit update to the peer before exhausting our credits. - * When new credits arrive, we subtract one credit that is withheld - * until we've posted new buffers and are ready to transmit these - * credits (see rds_iw_send_add_credits below). - * - * The RDS send code is essentially single-threaded; rds_send_xmit - * grabs c_send_lock to ensure exclusive access to the send ring. - * However, the ACK sending code is independent and can race with - * message SENDs. - * - * In the send path, we need to update the counters for send credits - * and the counter of posted buffers atomically - when we use the - * last available credit, we cannot allow another thread to race us - * and grab the posted credits counter. Hence, we have to use a - * spinlock to protect the credit counter, or use atomics. - * - * Spinlocks shared between the send and the receive path are bad, - * because they create unnecessary delays. An early implementation - * using a spinlock showed a 5% degradation in throughput at some - * loads. - * - * This implementation avoids spinlocks completely, putting both - * counters into a single atomic, and updating that atomic using - * atomic_add (in the receive path, when receiving fresh credits), - * and using atomic_cmpxchg when updating the two counters. - */ -int rds_iw_send_grab_credits(struct rds_iw_connection *ic, - u32 wanted, u32 *adv_credits, int need_posted, int max_posted) -{ - unsigned int avail, posted, got = 0, advertise; - long oldval, newval; - - *adv_credits = 0; - if (!ic->i_flowctl) - return wanted; - -try_again: - advertise = 0; - oldval = newval = atomic_read(&ic->i_credits); - posted = IB_GET_POST_CREDITS(oldval); - avail = IB_GET_SEND_CREDITS(oldval); - - rdsdebug("wanted=%u credits=%u posted=%u\n", - wanted, avail, posted); - - /* The last credit must be used to send a credit update. */ - if (avail && !posted) - avail--; - - if (avail < wanted) { - struct rds_connection *conn = ic->i_cm_id->context; - - /* Oops, there aren't that many credits left! */ - set_bit(RDS_LL_SEND_FULL, &conn->c_flags); - got = avail; - } else { - /* Sometimes you get what you want, lalala. */ - got = wanted; - } - newval -= IB_SET_SEND_CREDITS(got); - - /* - * If need_posted is non-zero, then the caller wants - * the posted regardless of whether any send credits are - * available. - */ - if (posted && (got || need_posted)) { - advertise = min_t(unsigned int, posted, max_posted); - newval -= IB_SET_POST_CREDITS(advertise); - } - - /* Finally bill everything */ - if (atomic_cmpxchg(&ic->i_credits, oldval, newval) != oldval) - goto try_again; - - *adv_credits = advertise; - return got; -} - -void rds_iw_send_add_credits(struct rds_connection *conn, unsigned int credits) -{ - struct rds_iw_connection *ic = conn->c_transport_data; - - if (credits == 0) - return; - - rdsdebug("credits=%u current=%u%s\n", - credits, - IB_GET_SEND_CREDITS(atomic_read(&ic->i_credits)), - test_bit(RDS_LL_SEND_FULL, &conn->c_flags) ? ", ll_send_full" : ""); - - atomic_add(IB_SET_SEND_CREDITS(credits), &ic->i_credits); - if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags)) - queue_delayed_work(rds_wq, &conn->c_send_w, 0); - - WARN_ON(IB_GET_SEND_CREDITS(credits) >= 16384); - - rds_iw_stats_inc(s_iw_rx_credit_updates); -} - -void rds_iw_advertise_credits(struct rds_connection *conn, unsigned int posted) -{ - struct rds_iw_connection *ic = conn->c_transport_data; - - if (posted == 0) - return; - - atomic_add(IB_SET_POST_CREDITS(posted), &ic->i_credits); - - /* Decide whether to send an update to the peer now. - * If we would send a credit update for every single buffer we - * post, we would end up with an ACK storm (ACK arrives, - * consumes buffer, we refill the ring, send ACK to remote - * advertising the newly posted buffer... ad inf) - * - * Performance pretty much depends on how often we send - * credit updates - too frequent updates mean lots of ACKs. - * Too infrequent updates, and the peer will run out of - * credits and has to throttle. - * For the time being, 16 seems to be a good compromise. - */ - if (IB_GET_POST_CREDITS(atomic_read(&ic->i_credits)) >= 16) - set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); -} - -static inline void -rds_iw_xmit_populate_wr(struct rds_iw_connection *ic, - struct rds_iw_send_work *send, unsigned int pos, - unsigned long buffer, unsigned int length, - int send_flags) -{ - struct ib_sge *sge; - - WARN_ON(pos != send - ic->i_sends); - - send->s_send_wr.send_flags = send_flags; - send->s_send_wr.opcode = IB_WR_SEND; - send->s_send_wr.num_sge = 2; - send->s_send_wr.next = NULL; - send->s_queued = jiffies; - send->s_op = NULL; - - if (length != 0) { - sge = rds_iw_data_sge(ic, send->s_sge); - sge->addr = buffer; - sge->length = length; - sge->lkey = rds_iw_local_dma_lkey(ic); - - sge = rds_iw_header_sge(ic, send->s_sge); - } else { - /* We're sending a packet with no payload. There is only - * one SGE */ - send->s_send_wr.num_sge = 1; - sge = &send->s_sge[0]; - } - - sge->addr = ic->i_send_hdrs_dma + (pos * sizeof(struct rds_header)); - sge->length = sizeof(struct rds_header); - sge->lkey = rds_iw_local_dma_lkey(ic); -} - -/* - * This can be called multiple times for a given message. The first time - * we see a message we map its scatterlist into the IB device so that - * we can provide that mapped address to the IB scatter gather entries - * in the IB work requests. We translate the scatterlist into a series - * of work requests that fragment the message. These work requests complete - * in order so we pass ownership of the message to the completion handler - * once we send the final fragment. - * - * The RDS core uses the c_send_lock to only enter this function once - * per connection. This makes sure that the tx ring alloc/unalloc pairs - * don't get out of sync and confuse the ring. - */ -int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm, - unsigned int hdr_off, unsigned int sg, unsigned int off) -{ - struct rds_iw_connection *ic = conn->c_transport_data; - struct ib_device *dev = ic->i_cm_id->device; - struct rds_iw_send_work *send = NULL; - struct rds_iw_send_work *first; - struct rds_iw_send_work *prev; - struct ib_send_wr *failed_wr; - struct scatterlist *scat; - u32 pos; - u32 i; - u32 work_alloc; - u32 credit_alloc; - u32 posted; - u32 adv_credits = 0; - int send_flags = 0; - int sent; - int ret; - int flow_controlled = 0; - - BUG_ON(off % RDS_FRAG_SIZE); - BUG_ON(hdr_off != 0 && hdr_off != sizeof(struct rds_header)); - - /* Fastreg support */ - if (rds_rdma_cookie_key(rm->m_rdma_cookie) && !ic->i_fastreg_posted) { - ret = -EAGAIN; - goto out; - } - - /* FIXME we may overallocate here */ - if (be32_to_cpu(rm->m_inc.i_hdr.h_len) == 0) - i = 1; - else - i = ceil(be32_to_cpu(rm->m_inc.i_hdr.h_len), RDS_FRAG_SIZE); - - work_alloc = rds_iw_ring_alloc(&ic->i_send_ring, i, &pos); - if (work_alloc == 0) { - set_bit(RDS_LL_SEND_FULL, &conn->c_flags); - rds_iw_stats_inc(s_iw_tx_ring_full); - ret = -ENOMEM; - goto out; - } - - credit_alloc = work_alloc; - if (ic->i_flowctl) { - credit_alloc = rds_iw_send_grab_credits(ic, work_alloc, &posted, 0, RDS_MAX_ADV_CREDIT); - adv_credits += posted; - if (credit_alloc < work_alloc) { - rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc - credit_alloc); - work_alloc = credit_alloc; - flow_controlled++; - } - if (work_alloc == 0) { - set_bit(RDS_LL_SEND_FULL, &conn->c_flags); - rds_iw_stats_inc(s_iw_tx_throttle); - ret = -ENOMEM; - goto out; - } - } - - /* map the message the first time we see it */ - if (!ic->i_rm) { - /* - printk(KERN_NOTICE "rds_iw_xmit prep msg dport=%u flags=0x%x len=%d\n", - be16_to_cpu(rm->m_inc.i_hdr.h_dport), - rm->m_inc.i_hdr.h_flags, - be32_to_cpu(rm->m_inc.i_hdr.h_len)); - */ - if (rm->data.op_nents) { - rm->data.op_count = ib_dma_map_sg(dev, - rm->data.op_sg, - rm->data.op_nents, - DMA_TO_DEVICE); - rdsdebug("ic %p mapping rm %p: %d\n", ic, rm, rm->data.op_count); - if (rm->data.op_count == 0) { - rds_iw_stats_inc(s_iw_tx_sg_mapping_failure); - rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc); - ret = -ENOMEM; /* XXX ? */ - goto out; - } - } else { - rm->data.op_count = 0; - } - - ic->i_unsignaled_wrs = rds_iw_sysctl_max_unsig_wrs; - ic->i_unsignaled_bytes = rds_iw_sysctl_max_unsig_bytes; - rds_message_addref(rm); - rm->data.op_dmasg = 0; - rm->data.op_dmaoff = 0; - ic->i_rm = rm; - - /* Finalize the header */ - if (test_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags)) - rm->m_inc.i_hdr.h_flags |= RDS_FLAG_ACK_REQUIRED; - if (test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags)) - rm->m_inc.i_hdr.h_flags |= RDS_FLAG_RETRANSMITTED; - - /* If it has a RDMA op, tell the peer we did it. This is - * used by the peer to release use-once RDMA MRs. */ - if (rm->rdma.op_active) { - struct rds_ext_header_rdma ext_hdr; - - ext_hdr.h_rdma_rkey = cpu_to_be32(rm->rdma.op_rkey); - rds_message_add_extension(&rm->m_inc.i_hdr, - RDS_EXTHDR_RDMA, &ext_hdr, sizeof(ext_hdr)); - } - if (rm->m_rdma_cookie) { - rds_message_add_rdma_dest_extension(&rm->m_inc.i_hdr, - rds_rdma_cookie_key(rm->m_rdma_cookie), - rds_rdma_cookie_offset(rm->m_rdma_cookie)); - } - - /* Note - rds_iw_piggyb_ack clears the ACK_REQUIRED bit, so - * we should not do this unless we have a chance of at least - * sticking the header into the send ring. Which is why we - * should call rds_iw_ring_alloc first. */ - rm->m_inc.i_hdr.h_ack = cpu_to_be64(rds_iw_piggyb_ack(ic)); - rds_message_make_checksum(&rm->m_inc.i_hdr); - - /* - * Update adv_credits since we reset the ACK_REQUIRED bit. - */ - rds_iw_send_grab_credits(ic, 0, &posted, 1, RDS_MAX_ADV_CREDIT - adv_credits); - adv_credits += posted; - BUG_ON(adv_credits > 255); - } - - send = &ic->i_sends[pos]; - first = send; - prev = NULL; - scat = &rm->data.op_sg[rm->data.op_dmasg]; - sent = 0; - i = 0; - - /* Sometimes you want to put a fence between an RDMA - * READ and the following SEND. - * We could either do this all the time - * or when requested by the user. Right now, we let - * the application choose. - */ - if (rm->rdma.op_active && rm->rdma.op_fence) - send_flags = IB_SEND_FENCE; - - /* - * We could be copying the header into the unused tail of the page. - * That would need to be changed in the future when those pages might - * be mapped userspace pages or page cache pages. So instead we always - * use a second sge and our long-lived ring of mapped headers. We send - * the header after the data so that the data payload can be aligned on - * the receiver. - */ - - /* handle a 0-len message */ - if (be32_to_cpu(rm->m_inc.i_hdr.h_len) == 0) { - rds_iw_xmit_populate_wr(ic, send, pos, 0, 0, send_flags); - goto add_header; - } - - /* if there's data reference it with a chain of work reqs */ - for (; i < work_alloc && scat != &rm->data.op_sg[rm->data.op_count]; i++) { - unsigned int len; - - send = &ic->i_sends[pos]; - - len = min(RDS_FRAG_SIZE, - ib_sg_dma_len(dev, scat) - rm->data.op_dmaoff); - rds_iw_xmit_populate_wr(ic, send, pos, - ib_sg_dma_address(dev, scat) + rm->data.op_dmaoff, len, - send_flags); - - /* - * We want to delay signaling completions just enough to get - * the batching benefits but not so much that we create dead time - * on the wire. - */ - if (ic->i_unsignaled_wrs-- == 0) { - ic->i_unsignaled_wrs = rds_iw_sysctl_max_unsig_wrs; - send->s_send_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED; - } - - ic->i_unsignaled_bytes -= len; - if (ic->i_unsignaled_bytes <= 0) { - ic->i_unsignaled_bytes = rds_iw_sysctl_max_unsig_bytes; - send->s_send_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED; - } - - /* - * Always signal the last one if we're stopping due to flow control. - */ - if (flow_controlled && i == (work_alloc-1)) - send->s_send_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED; - - rdsdebug("send %p wr %p num_sge %u next %p\n", send, - &send->s_send_wr, send->s_send_wr.num_sge, send->s_send_wr.next); - - sent += len; - rm->data.op_dmaoff += len; - if (rm->data.op_dmaoff == ib_sg_dma_len(dev, scat)) { - scat++; - rm->data.op_dmaoff = 0; - rm->data.op_dmasg++; - } - -add_header: - /* Tack on the header after the data. The header SGE should already - * have been set up to point to the right header buffer. */ - memcpy(&ic->i_send_hdrs[pos], &rm->m_inc.i_hdr, sizeof(struct rds_header)); - - if (0) { - struct rds_header *hdr = &ic->i_send_hdrs[pos]; - - printk(KERN_NOTICE "send WR dport=%u flags=0x%x len=%d\n", - be16_to_cpu(hdr->h_dport), - hdr->h_flags, - be32_to_cpu(hdr->h_len)); - } - if (adv_credits) { - struct rds_header *hdr = &ic->i_send_hdrs[pos]; - - /* add credit and redo the header checksum */ - hdr->h_credit = adv_credits; - rds_message_make_checksum(hdr); - adv_credits = 0; - rds_iw_stats_inc(s_iw_tx_credit_updates); - } - - if (prev) - prev->s_send_wr.next = &send->s_send_wr; - prev = send; - - pos = (pos + 1) % ic->i_send_ring.w_nr; - } - - /* Account the RDS header in the number of bytes we sent, but just once. - * The caller has no concept of fragmentation. */ - if (hdr_off == 0) - sent += sizeof(struct rds_header); - - /* if we finished the message then send completion owns it */ - if (scat == &rm->data.op_sg[rm->data.op_count]) { - prev->s_rm = ic->i_rm; - prev->s_send_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED; - ic->i_rm = NULL; - } - - if (i < work_alloc) { - rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc - i); - work_alloc = i; - } - if (ic->i_flowctl && i < credit_alloc) - rds_iw_send_add_credits(conn, credit_alloc - i); - - /* XXX need to worry about failed_wr and partial sends. */ - failed_wr = &first->s_send_wr; - ret = ib_post_send(ic->i_cm_id->qp, &first->s_send_wr, &failed_wr); - rdsdebug("ic %p first %p (wr %p) ret %d wr %p\n", ic, - first, &first->s_send_wr, ret, failed_wr); - BUG_ON(failed_wr != &first->s_send_wr); - if (ret) { - printk(KERN_WARNING "RDS/IW: ib_post_send to %pI4 " - "returned %d\n", &conn->c_faddr, ret); - rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc); - if (prev->s_rm) { - ic->i_rm = prev->s_rm; - prev->s_rm = NULL; - } - goto out; - } - - ret = sent; -out: - BUG_ON(adv_credits); - return ret; -} - -static int rds_iw_build_send_reg(struct rds_iw_send_work *send, - struct scatterlist *sg, - int sg_nents) -{ - int n; - - n = ib_map_mr_sg(send->s_mr, sg, sg_nents, PAGE_SIZE); - if (unlikely(n != sg_nents)) - return n < 0 ? n : -EINVAL; - - send->s_reg_wr.wr.opcode = IB_WR_REG_MR; - send->s_reg_wr.wr.wr_id = 0; - send->s_reg_wr.wr.num_sge = 0; - send->s_reg_wr.mr = send->s_mr; - send->s_reg_wr.key = send->s_mr->rkey; - send->s_reg_wr.access = IB_ACCESS_REMOTE_WRITE; - - ib_update_fast_reg_key(send->s_mr, send->s_remap_count++); - - return 0; -} - -int rds_iw_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op) -{ - struct rds_iw_connection *ic = conn->c_transport_data; - struct rds_iw_send_work *send = NULL; - struct rds_iw_send_work *first; - struct rds_iw_send_work *prev; - struct ib_send_wr *failed_wr; - struct rds_iw_device *rds_iwdev; - struct scatterlist *scat; - unsigned long len; - u64 remote_addr = op->op_remote_addr; - u32 pos, fr_pos; - u32 work_alloc; - u32 i; - u32 j; - int sent; - int ret; - int num_sge; - int sg_nents; - - rds_iwdev = ib_get_client_data(ic->i_cm_id->device, &rds_iw_client); - - /* map the message the first time we see it */ - if (!op->op_mapped) { - op->op_count = ib_dma_map_sg(ic->i_cm_id->device, - op->op_sg, op->op_nents, (op->op_write) ? - DMA_TO_DEVICE : DMA_FROM_DEVICE); - rdsdebug("ic %p mapping op %p: %d\n", ic, op, op->op_count); - if (op->op_count == 0) { - rds_iw_stats_inc(s_iw_tx_sg_mapping_failure); - ret = -ENOMEM; /* XXX ? */ - goto out; - } - - op->op_mapped = 1; - } - - if (!op->op_write) { - /* Alloc space on the send queue for the fastreg */ - work_alloc = rds_iw_ring_alloc(&ic->i_send_ring, 1, &fr_pos); - if (work_alloc != 1) { - rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc); - rds_iw_stats_inc(s_iw_tx_ring_full); - ret = -ENOMEM; - goto out; - } - } - - /* - * Instead of knowing how to return a partial rdma read/write we insist that there - * be enough work requests to send the entire message. - */ - i = ceil(op->op_count, rds_iwdev->max_sge); - - work_alloc = rds_iw_ring_alloc(&ic->i_send_ring, i, &pos); - if (work_alloc != i) { - rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc); - rds_iw_stats_inc(s_iw_tx_ring_full); - ret = -ENOMEM; - goto out; - } - - send = &ic->i_sends[pos]; - if (!op->op_write) { - first = prev = &ic->i_sends[fr_pos]; - } else { - first = send; - prev = NULL; - } - scat = &op->op_sg[0]; - sent = 0; - num_sge = op->op_count; - sg_nents = 0; - - for (i = 0; i < work_alloc && scat != &op->op_sg[op->op_count]; i++) { - send->s_rdma_wr.wr.send_flags = 0; - send->s_queued = jiffies; - - /* - * We want to delay signaling completions just enough to get - * the batching benefits but not so much that we create dead time on the wire. - */ - if (ic->i_unsignaled_wrs-- == 0) { - ic->i_unsignaled_wrs = rds_iw_sysctl_max_unsig_wrs; - send->s_rdma_wr.wr.send_flags = IB_SEND_SIGNALED; - } - - /* To avoid the need to have the plumbing to invalidate the fastreg_mr used - * for local access after RDS is finished with it, using - * IB_WR_RDMA_READ_WITH_INV will invalidate it after the read has completed. - */ - if (op->op_write) - send->s_rdma_wr.wr.opcode = IB_WR_RDMA_WRITE; - else - send->s_rdma_wr.wr.opcode = IB_WR_RDMA_READ_WITH_INV; - - send->s_rdma_wr.remote_addr = remote_addr; - send->s_rdma_wr.rkey = op->op_rkey; - send->s_op = op; - - if (num_sge > rds_iwdev->max_sge) { - send->s_rdma_wr.wr.num_sge = rds_iwdev->max_sge; - num_sge -= rds_iwdev->max_sge; - } else - send->s_rdma_wr.wr.num_sge = num_sge; - - send->s_rdma_wr.wr.next = NULL; - - if (prev) - prev->s_send_wr.next = &send->s_rdma_wr.wr; - - for (j = 0; j < send->s_rdma_wr.wr.num_sge && - scat != &op->op_sg[op->op_count]; j++) { - len = ib_sg_dma_len(ic->i_cm_id->device, scat); - - if (send->s_rdma_wr.wr.opcode == IB_WR_RDMA_READ_WITH_INV) - sg_nents++; - else { - send->s_sge[j].addr = ib_sg_dma_address(ic->i_cm_id->device, scat); - send->s_sge[j].length = len; - send->s_sge[j].lkey = rds_iw_local_dma_lkey(ic); - } - - sent += len; - rdsdebug("ic %p sent %d remote_addr %llu\n", ic, sent, remote_addr); - remote_addr += len; - - scat++; - } - - if (send->s_rdma_wr.wr.opcode == IB_WR_RDMA_READ_WITH_INV) { - send->s_rdma_wr.wr.num_sge = 1; - send->s_sge[0].addr = conn->c_xmit_rm->m_rs->rs_user_addr; - send->s_sge[0].length = conn->c_xmit_rm->m_rs->rs_user_bytes; - send->s_sge[0].lkey = ic->i_sends[fr_pos].s_mr->lkey; - } - - rdsdebug("send %p wr %p num_sge %u next %p\n", send, - &send->s_rdma_wr, - send->s_rdma_wr.wr.num_sge, - send->s_rdma_wr.wr.next); - - prev = send; - if (++send == &ic->i_sends[ic->i_send_ring.w_nr]) - send = ic->i_sends; - } - - /* if we finished the message then send completion owns it */ - if (scat == &op->op_sg[op->op_count]) - first->s_rdma_wr.wr.send_flags = IB_SEND_SIGNALED; - - if (i < work_alloc) { - rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc - i); - work_alloc = i; - } - - /* On iWARP, local memory access by a remote system (ie, RDMA Read) is not - * recommended. Putting the lkey on the wire is a security hole, as it can - * allow for memory access to all of memory on the remote system. Some - * adapters do not allow using the lkey for this at all. To bypass this use a - * fastreg_mr (or possibly a dma_mr) - */ - if (!op->op_write) { - ret = rds_iw_build_send_reg(&ic->i_sends[fr_pos], - &op->op_sg[0], sg_nents); - if (ret) { - printk(KERN_WARNING "RDS/IW: failed to reg send mem\n"); - goto out; - } - work_alloc++; - } - - failed_wr = &first->s_rdma_wr.wr; - ret = ib_post_send(ic->i_cm_id->qp, &first->s_rdma_wr.wr, &failed_wr); - rdsdebug("ic %p first %p (wr %p) ret %d wr %p\n", ic, - first, &first->s_rdma_wr, ret, failed_wr); - BUG_ON(failed_wr != &first->s_rdma_wr.wr); - if (ret) { - printk(KERN_WARNING "RDS/IW: rdma ib_post_send to %pI4 " - "returned %d\n", &conn->c_faddr, ret); - rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc); - goto out; - } - -out: - return ret; -} - -void rds_iw_xmit_complete(struct rds_connection *conn) -{ - struct rds_iw_connection *ic = conn->c_transport_data; - - /* We may have a pending ACK or window update we were unable - * to send previously (due to flow control). Try again. */ - rds_iw_attempt_ack(ic); -} diff --git a/net/rds/iw_stats.c b/net/rds/iw_stats.c deleted file mode 100644 index 5fe67f6a1..000000000 --- a/net/rds/iw_stats.c +++ /dev/null @@ -1,95 +0,0 @@ -/* - * Copyright (c) 2006 Oracle. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - */ -#include <linux/percpu.h> -#include <linux/seq_file.h> -#include <linux/proc_fs.h> - -#include "rds.h" -#include "iw.h" - -DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_iw_statistics, rds_iw_stats); - -static const char *const rds_iw_stat_names[] = { - "iw_connect_raced", - "iw_listen_closed_stale", - "iw_tx_cq_call", - "iw_tx_cq_event", - "iw_tx_ring_full", - "iw_tx_throttle", - "iw_tx_sg_mapping_failure", - "iw_tx_stalled", - "iw_tx_credit_updates", - "iw_rx_cq_call", - "iw_rx_cq_event", - "iw_rx_ring_empty", - "iw_rx_refill_from_cq", - "iw_rx_refill_from_thread", - "iw_rx_alloc_limit", - "iw_rx_credit_updates", - "iw_ack_sent", - "iw_ack_send_failure", - "iw_ack_send_delayed", - "iw_ack_send_piggybacked", - "iw_ack_received", - "iw_rdma_mr_alloc", - "iw_rdma_mr_free", - "iw_rdma_mr_used", - "iw_rdma_mr_pool_flush", - "iw_rdma_mr_pool_wait", - "iw_rdma_mr_pool_depleted", -}; - -unsigned int rds_iw_stats_info_copy(struct rds_info_iterator *iter, - unsigned int avail) -{ - struct rds_iw_statistics stats = {0, }; - uint64_t *src; - uint64_t *sum; - size_t i; - int cpu; - - if (avail < ARRAY_SIZE(rds_iw_stat_names)) - goto out; - - for_each_online_cpu(cpu) { - src = (uint64_t *)&(per_cpu(rds_iw_stats, cpu)); - sum = (uint64_t *)&stats; - for (i = 0; i < sizeof(stats) / sizeof(uint64_t); i++) - *(sum++) += *(src++); - } - - rds_stats_info_copy(iter, (uint64_t *)&stats, rds_iw_stat_names, - ARRAY_SIZE(rds_iw_stat_names)); -out: - return ARRAY_SIZE(rds_iw_stat_names); -} diff --git a/net/rds/iw_sysctl.c b/net/rds/iw_sysctl.c deleted file mode 100644 index 139239d2c..000000000 --- a/net/rds/iw_sysctl.c +++ /dev/null @@ -1,123 +0,0 @@ -/* - * Copyright (c) 2006 Oracle. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - */ -#include <linux/kernel.h> -#include <linux/sysctl.h> -#include <linux/proc_fs.h> - -#include "iw.h" - -static struct ctl_table_header *rds_iw_sysctl_hdr; - -unsigned long rds_iw_sysctl_max_send_wr = RDS_IW_DEFAULT_SEND_WR; -unsigned long rds_iw_sysctl_max_recv_wr = RDS_IW_DEFAULT_RECV_WR; -unsigned long rds_iw_sysctl_max_recv_allocation = (128 * 1024 * 1024) / RDS_FRAG_SIZE; -static unsigned long rds_iw_sysctl_max_wr_min = 1; -/* hardware will fail CQ creation long before this */ -static unsigned long rds_iw_sysctl_max_wr_max = (u32)~0; - -unsigned long rds_iw_sysctl_max_unsig_wrs = 16; -static unsigned long rds_iw_sysctl_max_unsig_wr_min = 1; -static unsigned long rds_iw_sysctl_max_unsig_wr_max = 64; - -unsigned long rds_iw_sysctl_max_unsig_bytes = (16 << 20); -static unsigned long rds_iw_sysctl_max_unsig_bytes_min = 1; -static unsigned long rds_iw_sysctl_max_unsig_bytes_max = ~0UL; - -unsigned int rds_iw_sysctl_flow_control = 1; - -static struct ctl_table rds_iw_sysctl_table[] = { - { - .procname = "max_send_wr", - .data = &rds_iw_sysctl_max_send_wr, - .maxlen = sizeof(unsigned long), - .mode = 0644, - .proc_handler = proc_doulongvec_minmax, - .extra1 = &rds_iw_sysctl_max_wr_min, - .extra2 = &rds_iw_sysctl_max_wr_max, - }, - { - .procname = "max_recv_wr", - .data = &rds_iw_sysctl_max_recv_wr, - .maxlen = sizeof(unsigned long), - .mode = 0644, - .proc_handler = proc_doulongvec_minmax, - .extra1 = &rds_iw_sysctl_max_wr_min, - .extra2 = &rds_iw_sysctl_max_wr_max, - }, - { - .procname = "max_unsignaled_wr", - .data = &rds_iw_sysctl_max_unsig_wrs, - .maxlen = sizeof(unsigned long), - .mode = 0644, - .proc_handler = proc_doulongvec_minmax, - .extra1 = &rds_iw_sysctl_max_unsig_wr_min, - .extra2 = &rds_iw_sysctl_max_unsig_wr_max, - }, - { - .procname = "max_unsignaled_bytes", - .data = &rds_iw_sysctl_max_unsig_bytes, - .maxlen = sizeof(unsigned long), - .mode = 0644, - .proc_handler = proc_doulongvec_minmax, - .extra1 = &rds_iw_sysctl_max_unsig_bytes_min, - .extra2 = &rds_iw_sysctl_max_unsig_bytes_max, - }, - { - .procname = "max_recv_allocation", - .data = &rds_iw_sysctl_max_recv_allocation, - .maxlen = sizeof(unsigned long), - .mode = 0644, - .proc_handler = proc_doulongvec_minmax, - }, - { - .procname = "flow_control", - .data = &rds_iw_sysctl_flow_control, - .maxlen = sizeof(rds_iw_sysctl_flow_control), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { } -}; - -void rds_iw_sysctl_exit(void) -{ - unregister_net_sysctl_table(rds_iw_sysctl_hdr); -} - -int rds_iw_sysctl_init(void) -{ - rds_iw_sysctl_hdr = register_net_sysctl(&init_net, "net/rds/iw", rds_iw_sysctl_table); - if (!rds_iw_sysctl_hdr) - return -ENOMEM; - return 0; -} diff --git a/net/rds/page.c b/net/rds/page.c index 5a14e6d6a..e2b5a5832 100644 --- a/net/rds/page.c +++ b/net/rds/page.c @@ -42,8 +42,8 @@ struct rds_page_remainder { unsigned long r_offset; }; -static DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_page_remainder, - rds_page_remainders); +static +DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_page_remainder, rds_page_remainders); /* * returns 0 on success or -errno on failure. @@ -135,8 +135,8 @@ int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes, if (rem->r_offset != 0) rds_stats_inc(s_page_remainder_hit); - rem->r_offset += bytes; - if (rem->r_offset == PAGE_SIZE) { + rem->r_offset += ALIGN(bytes, 8); + if (rem->r_offset >= PAGE_SIZE) { __free_page(rem->r_page); rem->r_page = NULL; } diff --git a/net/rds/rdma_transport.c b/net/rds/rdma_transport.c index 9c1fed81b..7220bebcf 100644 --- a/net/rds/rdma_transport.c +++ b/net/rds/rdma_transport.c @@ -49,9 +49,7 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id, rdsdebug("conn %p id %p handling event %u (%s)\n", conn, cm_id, event->event, rdma_event_msg(event->event)); - if (cm_id->device->node_type == RDMA_NODE_RNIC) - trans = &rds_iw_transport; - else + if (cm_id->device->node_type == RDMA_NODE_IB_CA) trans = &rds_ib_transport; /* Prevent shutdown from tearing down the connection @@ -119,6 +117,14 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id, rds_conn_drop(conn); break; + case RDMA_CM_EVENT_TIMEWAIT_EXIT: + if (conn) { + pr_info("RDS: RDMA_CM_EVENT_TIMEWAIT_EXIT event: dropping connection %pI4->%pI4\n", + &conn->c_laddr, &conn->c_faddr); + rds_conn_drop(conn); + } + break; + default: /* things like device disconnect? */ printk(KERN_ERR "RDS: unknown event %u (%s)!\n", @@ -200,10 +206,6 @@ static int rds_rdma_init(void) if (ret) goto out; - ret = rds_iw_init(); - if (ret) - goto err_iw_init; - ret = rds_ib_init(); if (ret) goto err_ib_init; @@ -211,8 +213,6 @@ static int rds_rdma_init(void) goto out; err_ib_init: - rds_iw_exit(); -err_iw_init: rds_rdma_listen_stop(); out: return ret; @@ -224,11 +224,10 @@ static void rds_rdma_exit(void) /* stop listening first to ensure no new connections are attempted */ rds_rdma_listen_stop(); rds_ib_exit(); - rds_iw_exit(); } module_exit(rds_rdma_exit); MODULE_AUTHOR("Oracle Corporation <rds-devel@oss.oracle.com>"); -MODULE_DESCRIPTION("RDS: IB/iWARP transport"); +MODULE_DESCRIPTION("RDS: IB transport"); MODULE_LICENSE("Dual BSD/GPL"); diff --git a/net/rds/rdma_transport.h b/net/rds/rdma_transport.h index faba4e382..ff2010e9d 100644 --- a/net/rds/rdma_transport.h +++ b/net/rds/rdma_transport.h @@ -16,9 +16,4 @@ extern struct rds_transport rds_ib_transport; int rds_ib_init(void); void rds_ib_exit(void); -/* from iw.c */ -extern struct rds_transport rds_iw_transport; -int rds_iw_init(void); -void rds_iw_exit(void); - #endif diff --git a/net/rds/rds.h b/net/rds/rds.h index 0e2797bdc..80256b08e 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -222,6 +222,7 @@ struct rds_incoming { __be32 i_saddr; rds_rdma_cookie_t i_rdma_cookie; + struct timeval i_rx_tstamp; }; struct rds_mr { diff --git a/net/rds/recv.c b/net/rds/recv.c index a00462b0d..c0be1ecd1 100644 --- a/net/rds/recv.c +++ b/net/rds/recv.c @@ -35,6 +35,8 @@ #include <net/sock.h> #include <linux/in.h> #include <linux/export.h> +#include <linux/time.h> +#include <linux/rds.h> #include "rds.h" @@ -46,6 +48,8 @@ void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn, inc->i_conn = conn; inc->i_saddr = saddr; inc->i_rdma_cookie = 0; + inc->i_rx_tstamp.tv_sec = 0; + inc->i_rx_tstamp.tv_usec = 0; } EXPORT_SYMBOL_GPL(rds_inc_init); @@ -228,6 +232,8 @@ void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr, rds_recv_rcvbuf_delta(rs, sk, inc->i_conn->c_lcong, be32_to_cpu(inc->i_hdr.h_len), inc->i_hdr.h_dport); + if (sock_flag(sk, SOCK_RCVTSTAMP)) + do_gettimeofday(&inc->i_rx_tstamp); rds_inc_addref(inc); list_add_tail(&inc->i_item, &rs->rs_recv_queue); __rds_wake_sk_sleep(sk); @@ -381,7 +387,8 @@ static int rds_notify_cong(struct rds_sock *rs, struct msghdr *msghdr) /* * Receive any control messages. */ -static int rds_cmsg_recv(struct rds_incoming *inc, struct msghdr *msg) +static int rds_cmsg_recv(struct rds_incoming *inc, struct msghdr *msg, + struct rds_sock *rs) { int ret = 0; @@ -392,6 +399,15 @@ static int rds_cmsg_recv(struct rds_incoming *inc, struct msghdr *msg) return ret; } + if ((inc->i_rx_tstamp.tv_sec != 0) && + sock_flag(rds_rs_to_sk(rs), SOCK_RCVTSTAMP)) { + ret = put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMP, + sizeof(struct timeval), + &inc->i_rx_tstamp); + if (ret) + return ret; + } + return 0; } @@ -474,7 +490,7 @@ int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, msg->msg_flags |= MSG_TRUNC; } - if (rds_cmsg_recv(inc, msg)) { + if (rds_cmsg_recv(inc, msg, rs)) { ret = -EFAULT; goto out; } diff --git a/net/rds/tcp.c b/net/rds/tcp.c index 9d6ddbacd..86187dad1 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -37,7 +37,6 @@ #include <net/tcp.h> #include <net/net_namespace.h> #include <net/netns/generic.h> -#include <net/tcp.h> #include "rds.h" #include "tcp.h" @@ -53,7 +52,34 @@ static LIST_HEAD(rds_tcp_conn_list); static struct kmem_cache *rds_tcp_conn_slab; -#define RDS_TCP_DEFAULT_BUFSIZE (128 * 1024) +static int rds_tcp_skbuf_handler(struct ctl_table *ctl, int write, + void __user *buffer, size_t *lenp, + loff_t *fpos); + +int rds_tcp_min_sndbuf = SOCK_MIN_SNDBUF; +int rds_tcp_min_rcvbuf = SOCK_MIN_RCVBUF; + +static struct ctl_table rds_tcp_sysctl_table[] = { +#define RDS_TCP_SNDBUF 0 + { + .procname = "rds_tcp_sndbuf", + /* data is per-net pointer */ + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = rds_tcp_skbuf_handler, + .extra1 = &rds_tcp_min_sndbuf, + }, +#define RDS_TCP_RCVBUF 1 + { + .procname = "rds_tcp_rcvbuf", + /* data is per-net pointer */ + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = rds_tcp_skbuf_handler, + .extra1 = &rds_tcp_min_rcvbuf, + }, + { } +}; /* doing it this way avoids calling tcp_sk() */ void rds_tcp_nonagle(struct socket *sock) @@ -67,15 +93,6 @@ void rds_tcp_nonagle(struct socket *sock) set_fs(oldfs); } -/* All module specific customizations to the RDS-TCP socket should be done in - * rds_tcp_tune() and applied after socket creation. In general these - * customizations should be tunable via module_param() - */ -void rds_tcp_tune(struct socket *sock) -{ - rds_tcp_nonagle(sock); -} - u32 rds_tcp_snd_nxt(struct rds_tcp_connection *tc) { return tcp_sk(tc->t_sock->sk)->snd_nxt; @@ -110,7 +127,7 @@ void rds_tcp_restore_callbacks(struct socket *sock, /* * This is the only path that sets tc->t_sock. Send and receive trust that - * it is set. The RDS_CONN_CONNECTED bit protects those paths from being + * it is set. The RDS_CONN_UP bit protects those paths from being * called while it isn't set. */ void rds_tcp_set_callbacks(struct socket *sock, struct rds_connection *conn) @@ -199,6 +216,7 @@ static int rds_tcp_conn_alloc(struct rds_connection *conn, gfp_t gfp) if (!tc) return -ENOMEM; + mutex_init(&tc->t_conn_lock); tc->t_sock = NULL; tc->t_tinc = NULL; tc->t_tinc_hdr_rem = sizeof(struct rds_header); @@ -273,8 +291,34 @@ static int rds_tcp_netid; struct rds_tcp_net { struct socket *rds_tcp_listen_sock; struct work_struct rds_tcp_accept_w; + struct ctl_table_header *rds_tcp_sysctl; + struct ctl_table *ctl_table; + int sndbuf_size; + int rcvbuf_size; }; +/* All module specific customizations to the RDS-TCP socket should be done in + * rds_tcp_tune() and applied after socket creation. + */ +void rds_tcp_tune(struct socket *sock) +{ + struct sock *sk = sock->sk; + struct net *net = sock_net(sk); + struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid); + + rds_tcp_nonagle(sock); + lock_sock(sk); + if (rtn->sndbuf_size > 0) { + sk->sk_sndbuf = rtn->sndbuf_size; + sk->sk_userlocks |= SOCK_SNDBUF_LOCK; + } + if (rtn->rcvbuf_size > 0) { + sk->sk_sndbuf = rtn->rcvbuf_size; + sk->sk_userlocks |= SOCK_RCVBUF_LOCK; + } + release_sock(sk); +} + static void rds_tcp_accept_worker(struct work_struct *work) { struct rds_tcp_net *rtn = container_of(work, @@ -296,20 +340,60 @@ void rds_tcp_accept_work(struct sock *sk) static __net_init int rds_tcp_init_net(struct net *net) { struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid); + struct ctl_table *tbl; + int err = 0; + memset(rtn, 0, sizeof(*rtn)); + + /* {snd, rcv}buf_size default to 0, which implies we let the + * stack pick the value, and permit auto-tuning of buffer size. + */ + if (net == &init_net) { + tbl = rds_tcp_sysctl_table; + } else { + tbl = kmemdup(rds_tcp_sysctl_table, + sizeof(rds_tcp_sysctl_table), GFP_KERNEL); + if (!tbl) { + pr_warn("could not set allocate syctl table\n"); + return -ENOMEM; + } + rtn->ctl_table = tbl; + } + tbl[RDS_TCP_SNDBUF].data = &rtn->sndbuf_size; + tbl[RDS_TCP_RCVBUF].data = &rtn->rcvbuf_size; + rtn->rds_tcp_sysctl = register_net_sysctl(net, "net/rds/tcp", tbl); + if (!rtn->rds_tcp_sysctl) { + pr_warn("could not register sysctl\n"); + err = -ENOMEM; + goto fail; + } rtn->rds_tcp_listen_sock = rds_tcp_listen_init(net); if (!rtn->rds_tcp_listen_sock) { pr_warn("could not set up listen sock\n"); - return -EAFNOSUPPORT; + unregister_net_sysctl_table(rtn->rds_tcp_sysctl); + rtn->rds_tcp_sysctl = NULL; + err = -EAFNOSUPPORT; + goto fail; } INIT_WORK(&rtn->rds_tcp_accept_w, rds_tcp_accept_worker); return 0; + +fail: + if (net != &init_net) + kfree(tbl); + return err; } static void __net_exit rds_tcp_exit_net(struct net *net) { struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid); + if (rtn->rds_tcp_sysctl) + unregister_net_sysctl_table(rtn->rds_tcp_sysctl); + + if (net != &init_net && rtn->ctl_table) + kfree(rtn->ctl_table); + /* If rds_tcp_exit_net() is called as a result of netns deletion, * the rds_tcp_kill_sock() device notifier would already have cleaned * up the listen socket, thus there is no work to do in this function. @@ -384,6 +468,45 @@ static struct notifier_block rds_tcp_dev_notifier = { .priority = -10, /* must be called after other network notifiers */ }; +/* when sysctl is used to modify some kernel socket parameters,this + * function resets the RDS connections in that netns so that we can + * restart with new parameters. The assumption is that such reset + * events are few and far-between. + */ +static void rds_tcp_sysctl_reset(struct net *net) +{ + struct rds_tcp_connection *tc, *_tc; + + spin_lock_irq(&rds_tcp_conn_lock); + list_for_each_entry_safe(tc, _tc, &rds_tcp_conn_list, t_tcp_node) { + struct net *c_net = read_pnet(&tc->conn->c_net); + + if (net != c_net || !tc->t_sock) + continue; + + rds_conn_drop(tc->conn); /* reconnect with new parameters */ + } + spin_unlock_irq(&rds_tcp_conn_lock); +} + +static int rds_tcp_skbuf_handler(struct ctl_table *ctl, int write, + void __user *buffer, size_t *lenp, + loff_t *fpos) +{ + struct net *net = current->nsproxy->net_ns; + int err; + + err = proc_dointvec_minmax(ctl, write, buffer, lenp, fpos); + if (err < 0) { + pr_warn("Invalid input. Must be >= %d\n", + *(int *)(ctl->extra1)); + return err; + } + if (write) + rds_tcp_sysctl_reset(net); + return 0; +} + static void rds_tcp_exit(void) { rds_info_deregister_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info); diff --git a/net/rds/tcp.h b/net/rds/tcp.h index 64f873c0c..41c228300 100644 --- a/net/rds/tcp.h +++ b/net/rds/tcp.h @@ -12,6 +12,10 @@ struct rds_tcp_connection { struct list_head t_tcp_node; struct rds_connection *conn; + /* t_conn_lock synchronizes the connection establishment between + * rds_tcp_accept_one and rds_tcp_conn_connect + */ + struct mutex t_conn_lock; struct socket *t_sock; void *t_orig_write_space; void *t_orig_data_ready; diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c index 5cb16875c..49a3fcfed 100644 --- a/net/rds/tcp_connect.c +++ b/net/rds/tcp_connect.c @@ -78,7 +78,14 @@ int rds_tcp_conn_connect(struct rds_connection *conn) struct socket *sock = NULL; struct sockaddr_in src, dest; int ret; + struct rds_tcp_connection *tc = conn->c_transport_data; + + mutex_lock(&tc->t_conn_lock); + if (rds_conn_up(conn)) { + mutex_unlock(&tc->t_conn_lock); + return 0; + } ret = sock_create_kern(rds_conn_net(conn), PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock); if (ret < 0) @@ -120,6 +127,7 @@ int rds_tcp_conn_connect(struct rds_connection *conn) } out: + mutex_unlock(&tc->t_conn_lock); if (sock) sock_release(sock); return ret; diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c index 0936a4a32..be263cdf2 100644 --- a/net/rds/tcp_listen.c +++ b/net/rds/tcp_listen.c @@ -76,7 +76,9 @@ int rds_tcp_accept_one(struct socket *sock) struct rds_connection *conn; int ret; struct inet_sock *inet; - struct rds_tcp_connection *rs_tcp; + struct rds_tcp_connection *rs_tcp = NULL; + int conn_state; + struct sock *nsk; ret = sock_create_kern(sock_net(sock->sk), sock->sk->sk_family, sock->sk->sk_type, sock->sk->sk_protocol, @@ -115,28 +117,44 @@ int rds_tcp_accept_one(struct socket *sock) * rds_tcp_state_change() will do that cleanup */ rs_tcp = (struct rds_tcp_connection *)conn->c_transport_data; - if (rs_tcp->t_sock && - ntohl(inet->inet_saddr) < ntohl(inet->inet_daddr)) { - struct sock *nsk = new_sock->sk; - - nsk->sk_user_data = NULL; - nsk->sk_prot->disconnect(nsk, 0); - tcp_done(nsk); - new_sock = NULL; - ret = 0; - goto out; - } else if (rs_tcp->t_sock) { - rds_tcp_restore_callbacks(rs_tcp->t_sock, rs_tcp); - conn->c_outgoing = 0; - } - rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING); + mutex_lock(&rs_tcp->t_conn_lock); + conn_state = rds_conn_state(conn); + if (conn_state != RDS_CONN_CONNECTING && conn_state != RDS_CONN_UP) + goto rst_nsk; + if (rs_tcp->t_sock) { + /* Need to resolve a duelling SYN between peers. + * We have an outstanding SYN to this peer, which may + * potentially have transitioned to the RDS_CONN_UP state, + * so we must quiesce any send threads before resetting + * c_transport_data. + */ + wait_event(conn->c_waitq, + !test_bit(RDS_IN_XMIT, &conn->c_flags)); + if (ntohl(inet->inet_saddr) < ntohl(inet->inet_daddr)) { + goto rst_nsk; + } else if (rs_tcp->t_sock) { + rds_tcp_restore_callbacks(rs_tcp->t_sock, rs_tcp); + conn->c_outgoing = 0; + } + } rds_tcp_set_callbacks(new_sock, conn); - rds_connect_complete(conn); + rds_connect_complete(conn); /* marks RDS_CONN_UP */ + new_sock = NULL; + ret = 0; + goto out; +rst_nsk: + /* reset the newly returned accept sock and bail */ + nsk = new_sock->sk; + rds_tcp_stats_inc(s_tcp_listen_closed_stale); + nsk->sk_user_data = NULL; + nsk->sk_prot->disconnect(nsk, 0); + tcp_done(nsk); new_sock = NULL; ret = 0; - out: + if (rs_tcp) + mutex_unlock(&rs_tcp->t_conn_lock); if (new_sock) sock_release(new_sock); return ret; diff --git a/net/rfkill/Kconfig b/net/rfkill/Kconfig index 598d374f6..868f1ad04 100644 --- a/net/rfkill/Kconfig +++ b/net/rfkill/Kconfig @@ -41,5 +41,4 @@ config RFKILL_GPIO default n help If you say yes here you get support of a generic gpio RFKILL - driver. The platform should fill in the appropriate fields in the - rfkill_gpio_platform_data structure and pass that to the driver. + driver. diff --git a/net/rfkill/core.c b/net/rfkill/core.c index cf5b69ab1..03f26e3a6 100644 --- a/net/rfkill/core.c +++ b/net/rfkill/core.c @@ -57,6 +57,8 @@ struct rfkill { bool registered; bool persistent; + bool polling_paused; + bool suspended; const struct rfkill_ops *ops; void *data; @@ -233,29 +235,6 @@ static void rfkill_event(struct rfkill *rfkill) rfkill_send_events(rfkill, RFKILL_OP_CHANGE); } -static bool __rfkill_set_hw_state(struct rfkill *rfkill, - bool blocked, bool *change) -{ - unsigned long flags; - bool prev, any; - - BUG_ON(!rfkill); - - spin_lock_irqsave(&rfkill->lock, flags); - prev = !!(rfkill->state & RFKILL_BLOCK_HW); - if (blocked) - rfkill->state |= RFKILL_BLOCK_HW; - else - rfkill->state &= ~RFKILL_BLOCK_HW; - *change = prev != blocked; - any = !!(rfkill->state & RFKILL_BLOCK_ANY); - spin_unlock_irqrestore(&rfkill->lock, flags); - - rfkill_led_trigger_event(rfkill); - - return any; -} - /** * rfkill_set_block - wrapper for set_block method * @@ -285,7 +264,7 @@ static void rfkill_set_block(struct rfkill *rfkill, bool blocked) spin_lock_irqsave(&rfkill->lock, flags); prev = rfkill->state & RFKILL_BLOCK_SW; - if (rfkill->state & RFKILL_BLOCK_SW) + if (prev) rfkill->state |= RFKILL_BLOCK_SW_PREV; else rfkill->state &= ~RFKILL_BLOCK_SW_PREV; @@ -303,8 +282,8 @@ static void rfkill_set_block(struct rfkill *rfkill, bool blocked) spin_lock_irqsave(&rfkill->lock, flags); if (err) { /* - * Failed -- reset status to _prev, this may be different - * from what set set _PREV to earlier in this function + * Failed -- reset status to _PREV, which may be different + * from what we have set _PREV to earlier in this function * if rfkill_set_sw_state was invoked. */ if (rfkill->state & RFKILL_BLOCK_SW_PREV) @@ -323,6 +302,19 @@ static void rfkill_set_block(struct rfkill *rfkill, bool blocked) rfkill_event(rfkill); } +static void rfkill_update_global_state(enum rfkill_type type, bool blocked) +{ + int i; + + if (type != RFKILL_TYPE_ALL) { + rfkill_global_states[type].cur = blocked; + return; + } + + for (i = 0; i < NUM_RFKILL_TYPES; i++) + rfkill_global_states[i].cur = blocked; +} + #ifdef CONFIG_RFKILL_INPUT static atomic_t rfkill_input_disabled = ATOMIC_INIT(0); @@ -332,8 +324,7 @@ static atomic_t rfkill_input_disabled = ATOMIC_INIT(0); * @blocked: the new state * * This function sets the state of all switches of given type, - * unless a specific switch is claimed by userspace (in which case, - * that switch is left alone) or suspended. + * unless a specific switch is suspended. * * Caller must have acquired rfkill_global_mutex. */ @@ -341,15 +332,7 @@ static void __rfkill_switch_all(const enum rfkill_type type, bool blocked) { struct rfkill *rfkill; - if (type == RFKILL_TYPE_ALL) { - int i; - - for (i = 0; i < NUM_RFKILL_TYPES; i++) - rfkill_global_states[i].cur = blocked; - } else { - rfkill_global_states[type].cur = blocked; - } - + rfkill_update_global_state(type, blocked); list_for_each_entry(rfkill, &rfkill_list, node) { if (rfkill->type != type && type != RFKILL_TYPE_ALL) continue; @@ -477,17 +460,28 @@ bool rfkill_get_global_sw_state(const enum rfkill_type type) } #endif - bool rfkill_set_hw_state(struct rfkill *rfkill, bool blocked) { - bool ret, change; + unsigned long flags; + bool ret, prev; + + BUG_ON(!rfkill); - ret = __rfkill_set_hw_state(rfkill, blocked, &change); + spin_lock_irqsave(&rfkill->lock, flags); + prev = !!(rfkill->state & RFKILL_BLOCK_HW); + if (blocked) + rfkill->state |= RFKILL_BLOCK_HW; + else + rfkill->state &= ~RFKILL_BLOCK_HW; + ret = !!(rfkill->state & RFKILL_BLOCK_ANY); + spin_unlock_irqrestore(&rfkill->lock, flags); + + rfkill_led_trigger_event(rfkill); if (!rfkill->registered) return ret; - if (change) + if (prev != blocked) schedule_work(&rfkill->uevent_work); return ret; @@ -582,6 +576,34 @@ void rfkill_set_states(struct rfkill *rfkill, bool sw, bool hw) } EXPORT_SYMBOL(rfkill_set_states); +static const char * const rfkill_types[] = { + NULL, /* RFKILL_TYPE_ALL */ + "wlan", + "bluetooth", + "ultrawideband", + "wimax", + "wwan", + "gps", + "fm", + "nfc", +}; + +enum rfkill_type rfkill_find_type(const char *name) +{ + int i; + + BUILD_BUG_ON(ARRAY_SIZE(rfkill_types) != NUM_RFKILL_TYPES); + + if (!name) + return RFKILL_TYPE_ALL; + + for (i = 1; i < NUM_RFKILL_TYPES; i++) + if (!strcmp(name, rfkill_types[i])) + return i; + return RFKILL_TYPE_ALL; +} +EXPORT_SYMBOL(rfkill_find_type); + static ssize_t name_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -591,38 +613,12 @@ static ssize_t name_show(struct device *dev, struct device_attribute *attr, } static DEVICE_ATTR_RO(name); -static const char *rfkill_get_type_str(enum rfkill_type type) -{ - BUILD_BUG_ON(NUM_RFKILL_TYPES != RFKILL_TYPE_NFC + 1); - - switch (type) { - case RFKILL_TYPE_WLAN: - return "wlan"; - case RFKILL_TYPE_BLUETOOTH: - return "bluetooth"; - case RFKILL_TYPE_UWB: - return "ultrawideband"; - case RFKILL_TYPE_WIMAX: - return "wimax"; - case RFKILL_TYPE_WWAN: - return "wwan"; - case RFKILL_TYPE_GPS: - return "gps"; - case RFKILL_TYPE_FM: - return "fm"; - case RFKILL_TYPE_NFC: - return "nfc"; - default: - BUG(); - } -} - static ssize_t type_show(struct device *dev, struct device_attribute *attr, char *buf) { struct rfkill *rfkill = to_rfkill(dev); - return sprintf(buf, "%s\n", rfkill_get_type_str(rfkill->type)); + return sprintf(buf, "%s\n", rfkill_types[rfkill->type]); } static DEVICE_ATTR_RO(type); @@ -730,20 +726,12 @@ static ssize_t state_store(struct device *dev, struct device_attribute *attr, } static DEVICE_ATTR_RW(state); -static ssize_t claim_show(struct device *dev, struct device_attribute *attr, - char *buf) -{ - return sprintf(buf, "%d\n", 0); -} -static DEVICE_ATTR_RO(claim); - static struct attribute *rfkill_dev_attrs[] = { &dev_attr_name.attr, &dev_attr_type.attr, &dev_attr_index.attr, &dev_attr_persistent.attr, &dev_attr_state.attr, - &dev_attr_claim.attr, &dev_attr_soft.attr, &dev_attr_hard.attr, NULL, @@ -768,7 +756,7 @@ static int rfkill_dev_uevent(struct device *dev, struct kobj_uevent_env *env) if (error) return error; error = add_uevent_var(env, "RFKILL_TYPE=%s", - rfkill_get_type_str(rfkill->type)); + rfkill_types[rfkill->type]); if (error) return error; spin_lock_irqsave(&rfkill->lock, flags); @@ -786,6 +774,7 @@ void rfkill_pause_polling(struct rfkill *rfkill) if (!rfkill->ops->poll) return; + rfkill->polling_paused = true; cancel_delayed_work_sync(&rfkill->poll_work); } EXPORT_SYMBOL(rfkill_pause_polling); @@ -797,6 +786,11 @@ void rfkill_resume_polling(struct rfkill *rfkill) if (!rfkill->ops->poll) return; + rfkill->polling_paused = false; + + if (rfkill->suspended) + return; + queue_delayed_work(system_power_efficient_wq, &rfkill->poll_work, 0); } @@ -807,7 +801,8 @@ static int rfkill_suspend(struct device *dev) { struct rfkill *rfkill = to_rfkill(dev); - rfkill_pause_polling(rfkill); + rfkill->suspended = true; + cancel_delayed_work_sync(&rfkill->poll_work); return 0; } @@ -817,12 +812,16 @@ static int rfkill_resume(struct device *dev) struct rfkill *rfkill = to_rfkill(dev); bool cur; + rfkill->suspended = false; + if (!rfkill->persistent) { cur = !!(rfkill->state & RFKILL_BLOCK_SW); rfkill_set_block(rfkill, cur); } - rfkill_resume_polling(rfkill); + if (rfkill->ops->poll && !rfkill->polling_paused) + queue_delayed_work(system_power_efficient_wq, + &rfkill->poll_work, 0); return 0; } @@ -1164,15 +1163,8 @@ static ssize_t rfkill_fop_write(struct file *file, const char __user *buf, mutex_lock(&rfkill_global_mutex); - if (ev.op == RFKILL_OP_CHANGE_ALL) { - if (ev.type == RFKILL_TYPE_ALL) { - enum rfkill_type i; - for (i = 0; i < NUM_RFKILL_TYPES; i++) - rfkill_global_states[i].cur = ev.soft; - } else { - rfkill_global_states[ev.type].cur = ev.soft; - } - } + if (ev.op == RFKILL_OP_CHANGE_ALL) + rfkill_update_global_state(ev.type, ev.soft); list_for_each_entry(rfkill, &rfkill_list, node) { if (rfkill->idx != ev.idx && ev.op != RFKILL_OP_CHANGE_ALL) @@ -1261,10 +1253,8 @@ static struct miscdevice rfkill_miscdev = { static int __init rfkill_init(void) { int error; - int i; - for (i = 0; i < NUM_RFKILL_TYPES; i++) - rfkill_global_states[i].cur = !rfkill_default_state; + rfkill_update_global_state(RFKILL_TYPE_ALL, !rfkill_default_state); error = class_register(&rfkill_class); if (error) diff --git a/net/rfkill/rfkill-gpio.c b/net/rfkill/rfkill-gpio.c index 4b1e3f35f..76c01cbd5 100644 --- a/net/rfkill/rfkill-gpio.c +++ b/net/rfkill/rfkill-gpio.c @@ -27,8 +27,6 @@ #include <linux/acpi.h> #include <linux/gpio/consumer.h> -#include <linux/rfkill-gpio.h> - struct rfkill_gpio_data { const char *name; enum rfkill_type type; @@ -81,7 +79,6 @@ static int rfkill_gpio_acpi_probe(struct device *dev, if (!id) return -ENODEV; - rfkill->name = dev_name(dev); rfkill->type = (unsigned)id->driver_data; return acpi_dev_add_driver_gpios(ACPI_COMPANION(dev), @@ -90,24 +87,27 @@ static int rfkill_gpio_acpi_probe(struct device *dev, static int rfkill_gpio_probe(struct platform_device *pdev) { - struct rfkill_gpio_platform_data *pdata = pdev->dev.platform_data; struct rfkill_gpio_data *rfkill; struct gpio_desc *gpio; + const char *type_name; int ret; rfkill = devm_kzalloc(&pdev->dev, sizeof(*rfkill), GFP_KERNEL); if (!rfkill) return -ENOMEM; + device_property_read_string(&pdev->dev, "name", &rfkill->name); + device_property_read_string(&pdev->dev, "type", &type_name); + + if (!rfkill->name) + rfkill->name = dev_name(&pdev->dev); + + rfkill->type = rfkill_find_type(type_name); + if (ACPI_HANDLE(&pdev->dev)) { ret = rfkill_gpio_acpi_probe(&pdev->dev, rfkill); if (ret) return ret; - } else if (pdata) { - rfkill->name = pdata->name; - rfkill->type = pdata->type; - } else { - return -ENODEV; } rfkill->clk = devm_clk_get(&pdev->dev, NULL); @@ -124,10 +124,8 @@ static int rfkill_gpio_probe(struct platform_device *pdev) rfkill->shutdown_gpio = gpio; - /* Make sure at-least one of the GPIO is defined and that - * a name is specified for this instance - */ - if ((!rfkill->reset_gpio && !rfkill->shutdown_gpio) || !rfkill->name) { + /* Make sure at-least one GPIO is defined for this instance */ + if (!rfkill->reset_gpio && !rfkill->shutdown_gpio) { dev_err(&pdev->dev, "invalid platform data\n"); return -EINVAL; } diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 7e2d1057d..9d935fa5a 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -37,7 +37,7 @@ static struct proto rxrpc_proto; static const struct proto_ops rxrpc_rpc_ops; /* local epoch for detecting local-end reset */ -__be32 rxrpc_epoch; +u32 rxrpc_epoch; /* current debugging ID */ atomic_t rxrpc_debug_id; @@ -81,6 +81,8 @@ static int rxrpc_validate_address(struct rxrpc_sock *rx, struct sockaddr_rxrpc *srx, int len) { + unsigned int tail; + if (len < sizeof(struct sockaddr_rxrpc)) return -EINVAL; @@ -103,9 +105,7 @@ static int rxrpc_validate_address(struct rxrpc_sock *rx, _debug("INET: %x @ %pI4", ntohs(srx->transport.sin.sin_port), &srx->transport.sin.sin_addr); - if (srx->transport_len > 8) - memset((void *)&srx->transport + 8, 0, - srx->transport_len - 8); + tail = offsetof(struct sockaddr_rxrpc, transport.sin.__pad); break; case AF_INET6: @@ -113,6 +113,8 @@ static int rxrpc_validate_address(struct rxrpc_sock *rx, return -EAFNOSUPPORT; } + if (tail < len) + memset((void *)srx + tail, 0, len - tail); return 0; } @@ -121,11 +123,10 @@ static int rxrpc_validate_address(struct rxrpc_sock *rx, */ static int rxrpc_bind(struct socket *sock, struct sockaddr *saddr, int len) { - struct sockaddr_rxrpc *srx = (struct sockaddr_rxrpc *) saddr; + struct sockaddr_rxrpc *srx = (struct sockaddr_rxrpc *)saddr; struct sock *sk = sock->sk; struct rxrpc_local *local; struct rxrpc_sock *rx = rxrpc_sk(sk), *prx; - __be16 service_id; int ret; _enter("%p,%p,%d", rx, saddr, len); @@ -143,7 +144,7 @@ static int rxrpc_bind(struct socket *sock, struct sockaddr *saddr, int len) memcpy(&rx->srx, srx, sizeof(rx->srx)); - /* find a local transport endpoint if we don't have one already */ + /* Find or create a local transport endpoint to use */ local = rxrpc_lookup_local(&rx->srx); if (IS_ERR(local)) { ret = PTR_ERR(local); @@ -152,14 +153,12 @@ static int rxrpc_bind(struct socket *sock, struct sockaddr *saddr, int len) rx->local = local; if (srx->srx_service) { - service_id = htons(srx->srx_service); write_lock_bh(&local->services_lock); list_for_each_entry(prx, &local->services, listen_link) { - if (prx->service_id == service_id) + if (prx->srx.srx_service == srx->srx_service) goto service_in_use; } - rx->service_id = service_id; list_add_tail(&rx->listen_link, &local->services); write_unlock_bh(&local->services_lock); @@ -276,7 +275,6 @@ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock, struct rxrpc_transport *trans; struct rxrpc_call *call; struct rxrpc_sock *rx = rxrpc_sk(sock->sk); - __be16 service_id; _enter(",,%x,%lx", key_serial(key), user_call_ID); @@ -299,16 +297,14 @@ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock, atomic_inc(&trans->usage); } - service_id = rx->service_id; - if (srx) - service_id = htons(srx->srx_service); - + if (!srx) + srx = &rx->srx; if (!key) key = rx->key; if (key && !key->payload.data[0]) key = NULL; /* a no-security key */ - bundle = rxrpc_get_bundle(rx, trans, key, service_id, gfp); + bundle = rxrpc_get_bundle(rx, trans, key, srx->srx_service, gfp); if (IS_ERR(bundle)) { call = ERR_CAST(bundle); goto out; @@ -324,7 +320,6 @@ out_notrans: _leave(" = %p", call); return call; } - EXPORT_SYMBOL(rxrpc_kernel_begin_call); /** @@ -340,7 +335,6 @@ void rxrpc_kernel_end_call(struct rxrpc_call *call) rxrpc_remove_user_ID(call->socket, call); rxrpc_put_call(call); } - EXPORT_SYMBOL(rxrpc_kernel_end_call); /** @@ -425,7 +419,6 @@ static int rxrpc_connect(struct socket *sock, struct sockaddr *addr, } rx->trans = trans; - rx->service_id = htons(srx->srx_service); rx->sk.sk_state = RXRPC_CLIENT_CONNECTED; release_sock(&rx->sk); @@ -622,7 +615,7 @@ static int rxrpc_create(struct net *net, struct socket *sock, int protocol, if (!net_eq(net, &init_net)) return -EAFNOSUPPORT; - /* we support transport protocol UDP only */ + /* we support transport protocol UDP/UDP6 only */ if (protocol != PF_INET) return -EPROTONOSUPPORT; @@ -754,7 +747,7 @@ static int rxrpc_release(struct socket *sock) * RxRPC network protocol */ static const struct proto_ops rxrpc_rpc_ops = { - .family = PF_UNIX, + .family = PF_RXRPC, .owner = THIS_MODULE, .release = rxrpc_release, .bind = rxrpc_bind, @@ -778,7 +771,7 @@ static struct proto rxrpc_proto = { .name = "RXRPC", .owner = THIS_MODULE, .obj_size = sizeof(struct rxrpc_sock), - .max_header = sizeof(struct rxrpc_header), + .max_header = sizeof(struct rxrpc_wire_header), }; static const struct net_proto_family rxrpc_family_ops = { @@ -796,7 +789,7 @@ static int __init af_rxrpc_init(void) BUILD_BUG_ON(sizeof(struct rxrpc_skb_priv) > FIELD_SIZEOF(struct sk_buff, cb)); - rxrpc_epoch = htonl(get_seconds()); + rxrpc_epoch = get_seconds(); ret = -ENOMEM; rxrpc_call_jar = kmem_cache_create( diff --git a/net/rxrpc/ar-accept.c b/net/rxrpc/ar-accept.c index 6d79310fc..277731a5e 100644 --- a/net/rxrpc/ar-accept.c +++ b/net/rxrpc/ar-accept.c @@ -27,7 +27,7 @@ * generate a connection-level abort */ static int rxrpc_busy(struct rxrpc_local *local, struct sockaddr_rxrpc *srx, - struct rxrpc_header *hdr) + struct rxrpc_wire_header *whdr) { struct msghdr msg; struct kvec iov[1]; @@ -36,25 +36,21 @@ static int rxrpc_busy(struct rxrpc_local *local, struct sockaddr_rxrpc *srx, _enter("%d,,", local->debug_id); + whdr->type = RXRPC_PACKET_TYPE_BUSY; + whdr->serial = htonl(1); + msg.msg_name = &srx->transport.sin; msg.msg_namelen = sizeof(srx->transport.sin); msg.msg_control = NULL; msg.msg_controllen = 0; msg.msg_flags = 0; - hdr->seq = 0; - hdr->type = RXRPC_PACKET_TYPE_BUSY; - hdr->flags = 0; - hdr->userStatus = 0; - hdr->_rsvd = 0; - - iov[0].iov_base = hdr; - iov[0].iov_len = sizeof(*hdr); + iov[0].iov_base = whdr; + iov[0].iov_len = sizeof(*whdr); len = iov[0].iov_len; - hdr->serial = htonl(1); - _proto("Tx BUSY %%%u", ntohl(hdr->serial)); + _proto("Tx BUSY %%1"); ret = kernel_sendmsg(local->socket, &msg, iov, 1, len); if (ret < 0) { @@ -185,8 +181,8 @@ invalid_service: read_unlock_bh(&local->services_lock); read_lock_bh(&call->state_lock); - if (!test_bit(RXRPC_CALL_RELEASE, &call->flags) && - !test_and_set_bit(RXRPC_CALL_RELEASE, &call->events)) { + if (!test_bit(RXRPC_CALL_RELEASED, &call->flags) && + !test_and_set_bit(RXRPC_CALL_EV_RELEASE, &call->events)) { rxrpc_get_call(call); rxrpc_queue_call(call); } @@ -211,8 +207,8 @@ void rxrpc_accept_incoming_calls(struct work_struct *work) struct rxrpc_skb_priv *sp; struct sockaddr_rxrpc srx; struct rxrpc_sock *rx; + struct rxrpc_wire_header whdr; struct sk_buff *skb; - __be16 service_id; int ret; _enter("%d", local->debug_id); @@ -240,6 +236,19 @@ process_next_packet: sp = rxrpc_skb(skb); + /* Set up a response packet header in case we need it */ + whdr.epoch = htonl(sp->hdr.epoch); + whdr.cid = htonl(sp->hdr.cid); + whdr.callNumber = htonl(sp->hdr.callNumber); + whdr.seq = htonl(sp->hdr.seq); + whdr.serial = 0; + whdr.flags = 0; + whdr.type = 0; + whdr.userStatus = 0; + whdr.securityIndex = sp->hdr.securityIndex; + whdr._rsvd = 0; + whdr.serviceId = htons(sp->hdr.serviceId); + /* determine the remote address */ memset(&srx, 0, sizeof(srx)); srx.srx_family = AF_RXRPC; @@ -256,10 +265,9 @@ process_next_packet: } /* get the socket providing the service */ - service_id = sp->hdr.serviceId; read_lock_bh(&local->services_lock); list_for_each_entry(rx, &local->services, listen_link) { - if (rx->service_id == service_id && + if (rx->srx.srx_service == sp->hdr.serviceId && rx->sk.sk_state != RXRPC_CLOSE) goto found_service; } @@ -267,7 +275,7 @@ process_next_packet: goto invalid_service; found_service: - _debug("found service %hd", ntohs(rx->service_id)); + _debug("found service %hd", rx->srx.srx_service); if (sk_acceptq_is_full(&rx->sk)) goto backlog_full; sk_acceptq_added(&rx->sk); @@ -296,7 +304,7 @@ found_service: backlog_full: read_unlock_bh(&local->services_lock); busy: - rxrpc_busy(local, &srx, &sp->hdr); + rxrpc_busy(local, &srx, &whdr); rxrpc_free_skb(skb); goto process_next_packet; @@ -379,7 +387,7 @@ struct rxrpc_call *rxrpc_accept_call(struct rxrpc_sock *rx, rb_insert_color(&call->sock_node, &rx->calls); if (test_and_set_bit(RXRPC_CALL_HAS_USERID, &call->flags)) BUG(); - if (test_and_set_bit(RXRPC_CALL_ACCEPTED, &call->events)) + if (test_and_set_bit(RXRPC_CALL_EV_ACCEPTED, &call->events)) BUG(); rxrpc_queue_call(call); @@ -395,7 +403,7 @@ struct rxrpc_call *rxrpc_accept_call(struct rxrpc_sock *rx, out_release: _debug("release %p", call); if (!test_bit(RXRPC_CALL_RELEASED, &call->flags) && - !test_and_set_bit(RXRPC_CALL_RELEASE, &call->events)) + !test_and_set_bit(RXRPC_CALL_EV_RELEASE, &call->events)) rxrpc_queue_call(call); out_discard: write_unlock_bh(&call->state_lock); @@ -407,7 +415,7 @@ out: } /* - * handle rejectance of a call by userspace + * Handle rejection of a call by userspace * - reject the call at the front of the queue */ int rxrpc_reject_call(struct rxrpc_sock *rx) @@ -434,7 +442,7 @@ int rxrpc_reject_call(struct rxrpc_sock *rx) switch (call->state) { case RXRPC_CALL_SERVER_ACCEPTING: call->state = RXRPC_CALL_SERVER_BUSY; - if (test_and_set_bit(RXRPC_CALL_REJECT_BUSY, &call->events)) + if (test_and_set_bit(RXRPC_CALL_EV_REJECT_BUSY, &call->events)) rxrpc_queue_call(call); ret = 0; goto out_release; @@ -458,7 +466,7 @@ int rxrpc_reject_call(struct rxrpc_sock *rx) out_release: _debug("release %p", call); if (!test_bit(RXRPC_CALL_RELEASED, &call->flags) && - !test_and_set_bit(RXRPC_CALL_RELEASE, &call->events)) + !test_and_set_bit(RXRPC_CALL_EV_RELEASE, &call->events)) rxrpc_queue_call(call); out_discard: write_unlock_bh(&call->state_lock); @@ -487,7 +495,6 @@ struct rxrpc_call *rxrpc_kernel_accept_call(struct socket *sock, _leave(" = %p", call); return call; } - EXPORT_SYMBOL(rxrpc_kernel_accept_call); /** @@ -506,5 +513,4 @@ int rxrpc_kernel_reject_call(struct socket *sock) _leave(" = %d", ret); return ret; } - EXPORT_SYMBOL(rxrpc_kernel_reject_call); diff --git a/net/rxrpc/ar-ack.c b/net/rxrpc/ar-ack.c index adc555e03..16d967075 100644 --- a/net/rxrpc/ar-ack.c +++ b/net/rxrpc/ar-ack.c @@ -23,7 +23,7 @@ * How long to wait before scheduling ACK generation after seeing a * packet with RXRPC_REQUEST_ACK set (in jiffies). */ -unsigned rxrpc_requested_ack_delay = 1; +unsigned int rxrpc_requested_ack_delay = 1; /* * How long to wait before scheduling an ACK with subtype DELAY (in jiffies). @@ -32,7 +32,7 @@ unsigned rxrpc_requested_ack_delay = 1; * all consumed within this time we will send a DELAY ACK if an ACK was not * requested to let the sender know it doesn't need to resend. */ -unsigned rxrpc_soft_ack_delay = 1 * HZ; +unsigned int rxrpc_soft_ack_delay = 1 * HZ; /* * How long to wait before scheduling an ACK with subtype IDLE (in jiffies). @@ -41,7 +41,7 @@ unsigned rxrpc_soft_ack_delay = 1 * HZ; * further packets aren't immediately received to decide when to send an IDLE * ACK let the other end know that it can free up its Tx buffer space. */ -unsigned rxrpc_idle_ack_delay = 0.5 * HZ; +unsigned int rxrpc_idle_ack_delay = 0.5 * HZ; /* * Receive window size in packets. This indicates the maximum number of @@ -49,19 +49,19 @@ unsigned rxrpc_idle_ack_delay = 0.5 * HZ; * limit is hit, we should generate an EXCEEDS_WINDOW ACK and discard further * packets. */ -unsigned rxrpc_rx_window_size = 32; +unsigned int rxrpc_rx_window_size = 32; /* * Maximum Rx MTU size. This indicates to the sender the size of jumbo packet * made by gluing normal packets together that we're willing to handle. */ -unsigned rxrpc_rx_mtu = 5692; +unsigned int rxrpc_rx_mtu = 5692; /* * The maximum number of fragments in a received jumbo packet that we tell the * sender that we're willing to handle. */ -unsigned rxrpc_rx_jumbo_max = 4; +unsigned int rxrpc_rx_jumbo_max = 4; static const char *rxrpc_acks(u8 reason) { @@ -91,7 +91,7 @@ static const s8 rxrpc_ack_priority[] = { * propose an ACK be sent */ void __rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason, - __be32 serial, bool immediate) + u32 serial, bool immediate) { unsigned long expiry; s8 prior = rxrpc_ack_priority[ack_reason]; @@ -99,8 +99,7 @@ void __rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason, ASSERTCMP(prior, >, 0); _enter("{%d},%s,%%%x,%u", - call->debug_id, rxrpc_acks(ack_reason), ntohl(serial), - immediate); + call->debug_id, rxrpc_acks(ack_reason), serial, immediate); if (prior < rxrpc_ack_priority[call->ackr_reason]) { if (immediate) @@ -139,7 +138,7 @@ void __rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason, expiry = rxrpc_requested_ack_delay; if (!expiry) goto cancel_timer; - if (!immediate || serial == cpu_to_be32(1)) { + if (!immediate || serial == 1) { _debug("run defer timer"); goto run_timer; } @@ -157,11 +156,11 @@ run_timer: return; cancel_timer: - _debug("cancel timer %%%u", ntohl(serial)); + _debug("cancel timer %%%u", serial); try_to_del_timer_sync(&call->ack_timer); read_lock_bh(&call->state_lock); if (call->state <= RXRPC_CALL_COMPLETE && - !test_and_set_bit(RXRPC_CALL_ACK, &call->events)) + !test_and_set_bit(RXRPC_CALL_EV_ACK, &call->events)) rxrpc_queue_call(call); read_unlock_bh(&call->state_lock); } @@ -170,7 +169,7 @@ cancel_timer: * propose an ACK be sent, locking the call structure */ void rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason, - __be32 serial, bool immediate) + u32 serial, bool immediate) { s8 prior = rxrpc_ack_priority[ack_reason]; @@ -193,7 +192,7 @@ static void rxrpc_set_resend(struct rxrpc_call *call, u8 resend, if (resend & 1) { _debug("SET RESEND"); - set_bit(RXRPC_CALL_RESEND, &call->events); + set_bit(RXRPC_CALL_EV_RESEND, &call->events); } if (resend & 2) { @@ -203,7 +202,7 @@ static void rxrpc_set_resend(struct rxrpc_call *call, u8 resend, } else { _debug("KILL RESEND TIMER"); del_timer_sync(&call->resend_timer); - clear_bit(RXRPC_CALL_RESEND_TIMER, &call->events); + clear_bit(RXRPC_CALL_EV_RESEND_TIMER, &call->events); clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags); } read_unlock_bh(&call->state_lock); @@ -214,8 +213,8 @@ static void rxrpc_set_resend(struct rxrpc_call *call, u8 resend, */ static void rxrpc_resend(struct rxrpc_call *call) { + struct rxrpc_wire_header *whdr; struct rxrpc_skb_priv *sp; - struct rxrpc_header *hdr; struct sk_buff *txb; unsigned long *p_txb, resend_at; bool stop; @@ -247,14 +246,13 @@ static void rxrpc_resend(struct rxrpc_call *call) sp->need_resend = false; /* each Tx packet has a new serial number */ - sp->hdr.serial = - htonl(atomic_inc_return(&call->conn->serial)); + sp->hdr.serial = atomic_inc_return(&call->conn->serial); - hdr = (struct rxrpc_header *) txb->head; - hdr->serial = sp->hdr.serial; + whdr = (struct rxrpc_wire_header *)txb->head; + whdr->serial = htonl(sp->hdr.serial); _proto("Tx DATA %%%u { #%d }", - ntohl(sp->hdr.serial), ntohl(sp->hdr.seq)); + sp->hdr.serial, sp->hdr.seq); if (rxrpc_send_packet(call->conn->trans, txb) < 0) { stop = true; sp->resend_at = jiffies + 3; @@ -428,7 +426,7 @@ static void rxrpc_rotate_tx_window(struct rxrpc_call *call, u32 hard) int tail = call->acks_tail, old_tail; int win = CIRC_CNT(call->acks_head, tail, call->acks_winsz); - _enter("{%u,%u},%u", call->acks_hard, win, hard); + kenter("{%u,%u},%u", call->acks_hard, win, hard); ASSERTCMP(hard - call->acks_hard, <=, win); @@ -478,11 +476,11 @@ static int rxrpc_drain_rx_oos_queue(struct rxrpc_call *call) sp = rxrpc_skb(skb); _debug("drain OOS packet %d [%d]", - ntohl(sp->hdr.seq), call->rx_first_oos); + sp->hdr.seq, call->rx_first_oos); - if (ntohl(sp->hdr.seq) != call->rx_first_oos) { + if (sp->hdr.seq != call->rx_first_oos) { skb_queue_head(&call->rx_oos_queue, skb); - call->rx_first_oos = ntohl(rxrpc_skb(skb)->hdr.seq); + call->rx_first_oos = rxrpc_skb(skb)->hdr.seq; _debug("requeue %p {%u}", skb, call->rx_first_oos); } else { skb->mark = RXRPC_SKB_MARK_DATA; @@ -496,8 +494,7 @@ static int rxrpc_drain_rx_oos_queue(struct rxrpc_call *call) /* find out what the next packet is */ skb = skb_peek(&call->rx_oos_queue); if (skb) - call->rx_first_oos = - ntohl(rxrpc_skb(skb)->hdr.seq); + call->rx_first_oos = rxrpc_skb(skb)->hdr.seq; else call->rx_first_oos = 0; _debug("peek %p {%u}", skb, call->rx_first_oos); @@ -522,7 +519,7 @@ static void rxrpc_insert_oos_packet(struct rxrpc_call *call, u32 seq; sp = rxrpc_skb(skb); - seq = ntohl(sp->hdr.seq); + seq = sp->hdr.seq; _enter(",,{%u}", seq); skb->destructor = rxrpc_packet_destructor; @@ -535,9 +532,8 @@ static void rxrpc_insert_oos_packet(struct rxrpc_call *call, skb_queue_walk(&call->rx_oos_queue, p) { psp = rxrpc_skb(p); - if (ntohl(psp->hdr.seq) > seq) { - _debug("insert oos #%u before #%u", - seq, ntohl(psp->hdr.seq)); + if (psp->hdr.seq > seq) { + _debug("insert oos #%u before #%u", seq, psp->hdr.seq); skb_insert(p, skb, &call->rx_oos_queue); goto inserted; } @@ -555,7 +551,7 @@ inserted: if (call->state < RXRPC_CALL_COMPLETE && call->rx_data_post == call->rx_first_oos) { _debug("drain rx oos now"); - set_bit(RXRPC_CALL_DRAIN_RX_OOS, &call->events); + set_bit(RXRPC_CALL_EV_DRAIN_RX_OOS, &call->events); } read_unlock(&call->state_lock); @@ -586,7 +582,7 @@ static void rxrpc_zap_tx_window(struct rxrpc_call *call) skb = (struct sk_buff *) _skb; sp = rxrpc_skb(skb); - _debug("+++ clear Tx %u", ntohl(sp->hdr.seq)); + _debug("+++ clear Tx %u", sp->hdr.seq); rxrpc_free_skb(skb); } @@ -657,8 +653,7 @@ process_further: /* data packets that wind up here have been received out of * order, need security processing or are jumbo packets */ case RXRPC_PACKET_TYPE_DATA: - _proto("OOSQ DATA %%%u { #%u }", - ntohl(sp->hdr.serial), ntohl(sp->hdr.seq)); + _proto("OOSQ DATA %%%u { #%u }", sp->hdr.serial, sp->hdr.seq); /* secured packets must be verified and possibly decrypted */ if (rxrpc_verify_packet(call, skb, _abort_code) < 0) @@ -676,7 +671,7 @@ process_further: if (!skb_pull(skb, sizeof(ack))) BUG(); - latest = ntohl(sp->hdr.serial); + latest = sp->hdr.serial; hard = ntohl(ack.firstPacket); tx = atomic_read(&call->sequence); @@ -793,7 +788,7 @@ all_acked: del_timer_sync(&call->resend_timer); clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags); - clear_bit(RXRPC_CALL_RESEND_TIMER, &call->events); + clear_bit(RXRPC_CALL_EV_RESEND_TIMER, &call->events); if (call->acks_window) rxrpc_zap_tx_window(call); @@ -881,16 +876,17 @@ void rxrpc_process_call(struct work_struct *work) { struct rxrpc_call *call = container_of(work, struct rxrpc_call, processor); + struct rxrpc_wire_header whdr; struct rxrpc_ackpacket ack; struct rxrpc_ackinfo ackinfo; - struct rxrpc_header hdr; struct msghdr msg; struct kvec iov[5]; + enum rxrpc_call_event genbit; unsigned long bits; __be32 data, pad; size_t len; - int genbit, loop, nbit, ioc, ret, mtu; - u32 abort_code = RX_PROTOCOL_ERROR; + int loop, nbit, ioc, ret, mtu; + u32 serial, abort_code = RX_PROTOCOL_ERROR; u8 *acks = NULL; //printk("\n--------------------\n"); @@ -911,33 +907,33 @@ void rxrpc_process_call(struct work_struct *work) msg.msg_controllen = 0; msg.msg_flags = 0; - hdr.epoch = call->conn->epoch; - hdr.cid = call->cid; - hdr.callNumber = call->call_id; - hdr.seq = 0; - hdr.type = RXRPC_PACKET_TYPE_ACK; - hdr.flags = call->conn->out_clientflag; - hdr.userStatus = 0; - hdr.securityIndex = call->conn->security_ix; - hdr._rsvd = 0; - hdr.serviceId = call->conn->service_id; + whdr.epoch = htonl(call->conn->epoch); + whdr.cid = htonl(call->cid); + whdr.callNumber = htonl(call->call_id); + whdr.seq = 0; + whdr.type = RXRPC_PACKET_TYPE_ACK; + whdr.flags = call->conn->out_clientflag; + whdr.userStatus = 0; + whdr.securityIndex = call->conn->security_ix; + whdr._rsvd = 0; + whdr.serviceId = htons(call->service_id); memset(iov, 0, sizeof(iov)); - iov[0].iov_base = &hdr; - iov[0].iov_len = sizeof(hdr); + iov[0].iov_base = &whdr; + iov[0].iov_len = sizeof(whdr); /* deal with events of a final nature */ - if (test_bit(RXRPC_CALL_RELEASE, &call->events)) { + if (test_bit(RXRPC_CALL_EV_RELEASE, &call->events)) { rxrpc_release_call(call); - clear_bit(RXRPC_CALL_RELEASE, &call->events); + clear_bit(RXRPC_CALL_EV_RELEASE, &call->events); } - if (test_bit(RXRPC_CALL_RCVD_ERROR, &call->events)) { + if (test_bit(RXRPC_CALL_EV_RCVD_ERROR, &call->events)) { int error; - clear_bit(RXRPC_CALL_CONN_ABORT, &call->events); - clear_bit(RXRPC_CALL_REJECT_BUSY, &call->events); - clear_bit(RXRPC_CALL_ABORT, &call->events); + clear_bit(RXRPC_CALL_EV_CONN_ABORT, &call->events); + clear_bit(RXRPC_CALL_EV_REJECT_BUSY, &call->events); + clear_bit(RXRPC_CALL_EV_ABORT, &call->events); error = call->conn->trans->peer->net_error; _debug("post net error %d", error); @@ -945,47 +941,47 @@ void rxrpc_process_call(struct work_struct *work) if (rxrpc_post_message(call, RXRPC_SKB_MARK_NET_ERROR, error, true) < 0) goto no_mem; - clear_bit(RXRPC_CALL_RCVD_ERROR, &call->events); + clear_bit(RXRPC_CALL_EV_RCVD_ERROR, &call->events); goto kill_ACKs; } - if (test_bit(RXRPC_CALL_CONN_ABORT, &call->events)) { + if (test_bit(RXRPC_CALL_EV_CONN_ABORT, &call->events)) { ASSERTCMP(call->state, >, RXRPC_CALL_COMPLETE); - clear_bit(RXRPC_CALL_REJECT_BUSY, &call->events); - clear_bit(RXRPC_CALL_ABORT, &call->events); + clear_bit(RXRPC_CALL_EV_REJECT_BUSY, &call->events); + clear_bit(RXRPC_CALL_EV_ABORT, &call->events); _debug("post conn abort"); if (rxrpc_post_message(call, RXRPC_SKB_MARK_LOCAL_ERROR, call->conn->error, true) < 0) goto no_mem; - clear_bit(RXRPC_CALL_CONN_ABORT, &call->events); + clear_bit(RXRPC_CALL_EV_CONN_ABORT, &call->events); goto kill_ACKs; } - if (test_bit(RXRPC_CALL_REJECT_BUSY, &call->events)) { - hdr.type = RXRPC_PACKET_TYPE_BUSY; - genbit = RXRPC_CALL_REJECT_BUSY; + if (test_bit(RXRPC_CALL_EV_REJECT_BUSY, &call->events)) { + whdr.type = RXRPC_PACKET_TYPE_BUSY; + genbit = RXRPC_CALL_EV_REJECT_BUSY; goto send_message; } - if (test_bit(RXRPC_CALL_ABORT, &call->events)) { + if (test_bit(RXRPC_CALL_EV_ABORT, &call->events)) { ASSERTCMP(call->state, >, RXRPC_CALL_COMPLETE); if (rxrpc_post_message(call, RXRPC_SKB_MARK_LOCAL_ERROR, ECONNABORTED, true) < 0) goto no_mem; - hdr.type = RXRPC_PACKET_TYPE_ABORT; + whdr.type = RXRPC_PACKET_TYPE_ABORT; data = htonl(call->abort_code); iov[1].iov_base = &data; iov[1].iov_len = sizeof(data); - genbit = RXRPC_CALL_ABORT; + genbit = RXRPC_CALL_EV_ABORT; goto send_message; } - if (test_bit(RXRPC_CALL_ACK_FINAL, &call->events)) { - genbit = RXRPC_CALL_ACK_FINAL; + if (test_bit(RXRPC_CALL_EV_ACK_FINAL, &call->events)) { + genbit = RXRPC_CALL_EV_ACK_FINAL; ack.bufferSpace = htons(8); ack.maxSkew = 0; @@ -995,9 +991,9 @@ void rxrpc_process_call(struct work_struct *work) call->ackr_reason = 0; spin_lock_bh(&call->lock); - ack.serial = call->ackr_serial; - ack.previousPacket = call->ackr_prev_seq; - ack.firstPacket = htonl(call->rx_data_eaten + 1); + ack.serial = htonl(call->ackr_serial); + ack.previousPacket = htonl(call->ackr_prev_seq); + ack.firstPacket = htonl(call->rx_data_eaten + 1); spin_unlock_bh(&call->lock); pad = 0; @@ -1011,12 +1007,12 @@ void rxrpc_process_call(struct work_struct *work) goto send_ACK; } - if (call->events & ((1 << RXRPC_CALL_RCVD_BUSY) | - (1 << RXRPC_CALL_RCVD_ABORT)) + if (call->events & ((1 << RXRPC_CALL_EV_RCVD_BUSY) | + (1 << RXRPC_CALL_EV_RCVD_ABORT)) ) { u32 mark; - if (test_bit(RXRPC_CALL_RCVD_ABORT, &call->events)) + if (test_bit(RXRPC_CALL_EV_RCVD_ABORT, &call->events)) mark = RXRPC_SKB_MARK_REMOTE_ABORT; else mark = RXRPC_SKB_MARK_BUSY; @@ -1026,22 +1022,22 @@ void rxrpc_process_call(struct work_struct *work) if (rxrpc_post_message(call, mark, ECONNABORTED, true) < 0) goto no_mem; - clear_bit(RXRPC_CALL_RCVD_BUSY, &call->events); - clear_bit(RXRPC_CALL_RCVD_ABORT, &call->events); + clear_bit(RXRPC_CALL_EV_RCVD_BUSY, &call->events); + clear_bit(RXRPC_CALL_EV_RCVD_ABORT, &call->events); goto kill_ACKs; } - if (test_and_clear_bit(RXRPC_CALL_RCVD_ACKALL, &call->events)) { + if (test_and_clear_bit(RXRPC_CALL_EV_RCVD_ACKALL, &call->events)) { _debug("do implicit ackall"); rxrpc_clear_tx_window(call); } - if (test_bit(RXRPC_CALL_LIFE_TIMER, &call->events)) { + if (test_bit(RXRPC_CALL_EV_LIFE_TIMER, &call->events)) { write_lock_bh(&call->state_lock); if (call->state <= RXRPC_CALL_COMPLETE) { call->state = RXRPC_CALL_LOCALLY_ABORTED; call->abort_code = RX_CALL_TIMEOUT; - set_bit(RXRPC_CALL_ABORT, &call->events); + set_bit(RXRPC_CALL_EV_ABORT, &call->events); } write_unlock_bh(&call->state_lock); @@ -1050,7 +1046,7 @@ void rxrpc_process_call(struct work_struct *work) ETIME, true) < 0) goto no_mem; - clear_bit(RXRPC_CALL_LIFE_TIMER, &call->events); + clear_bit(RXRPC_CALL_EV_LIFE_TIMER, &call->events); goto kill_ACKs; } @@ -1071,13 +1067,13 @@ void rxrpc_process_call(struct work_struct *work) } /* handle resending */ - if (test_and_clear_bit(RXRPC_CALL_RESEND_TIMER, &call->events)) + if (test_and_clear_bit(RXRPC_CALL_EV_RESEND_TIMER, &call->events)) rxrpc_resend_timer(call); - if (test_and_clear_bit(RXRPC_CALL_RESEND, &call->events)) + if (test_and_clear_bit(RXRPC_CALL_EV_RESEND, &call->events)) rxrpc_resend(call); /* consider sending an ordinary ACK */ - if (test_bit(RXRPC_CALL_ACK, &call->events)) { + if (test_bit(RXRPC_CALL_EV_ACK, &call->events)) { _debug("send ACK: window: %d - %d { %lx }", call->rx_data_eaten, call->ackr_win_top, call->ackr_window[0]); @@ -1085,11 +1081,11 @@ void rxrpc_process_call(struct work_struct *work) if (call->state > RXRPC_CALL_SERVER_ACK_REQUEST && call->ackr_reason != RXRPC_ACK_PING_RESPONSE) { /* ACK by sending reply DATA packet in this state */ - clear_bit(RXRPC_CALL_ACK, &call->events); + clear_bit(RXRPC_CALL_EV_ACK, &call->events); goto maybe_reschedule; } - genbit = RXRPC_CALL_ACK; + genbit = RXRPC_CALL_EV_ACK; acks = kzalloc(call->ackr_win_top - call->rx_data_eaten, GFP_NOFS); @@ -1099,13 +1095,11 @@ void rxrpc_process_call(struct work_struct *work) //hdr.flags = RXRPC_SLOW_START_OK; ack.bufferSpace = htons(8); ack.maxSkew = 0; - ack.serial = 0; - ack.reason = 0; spin_lock_bh(&call->lock); - ack.reason = call->ackr_reason; - ack.serial = call->ackr_serial; - ack.previousPacket = call->ackr_prev_seq; + ack.reason = call->ackr_reason; + ack.serial = htonl(call->ackr_serial); + ack.previousPacket = htonl(call->ackr_prev_seq); ack.firstPacket = htonl(call->rx_data_eaten + 1); ack.nAcks = 0; @@ -1152,7 +1146,7 @@ void rxrpc_process_call(struct work_struct *work) /* handle completion of security negotiations on an incoming * connection */ - if (test_and_clear_bit(RXRPC_CALL_SECURED, &call->events)) { + if (test_and_clear_bit(RXRPC_CALL_EV_SECURED, &call->events)) { _debug("secured"); spin_lock_bh(&call->lock); @@ -1160,7 +1154,7 @@ void rxrpc_process_call(struct work_struct *work) _debug("securing"); write_lock(&call->conn->lock); if (!test_bit(RXRPC_CALL_RELEASED, &call->flags) && - !test_bit(RXRPC_CALL_RELEASE, &call->events)) { + !test_bit(RXRPC_CALL_EV_RELEASE, &call->events)) { _debug("not released"); call->state = RXRPC_CALL_SERVER_ACCEPTING; list_move_tail(&call->accept_link, @@ -1169,39 +1163,39 @@ void rxrpc_process_call(struct work_struct *work) write_unlock(&call->conn->lock); read_lock(&call->state_lock); if (call->state < RXRPC_CALL_COMPLETE) - set_bit(RXRPC_CALL_POST_ACCEPT, &call->events); + set_bit(RXRPC_CALL_EV_POST_ACCEPT, &call->events); read_unlock(&call->state_lock); } spin_unlock_bh(&call->lock); - if (!test_bit(RXRPC_CALL_POST_ACCEPT, &call->events)) + if (!test_bit(RXRPC_CALL_EV_POST_ACCEPT, &call->events)) goto maybe_reschedule; } /* post a notification of an acceptable connection to the app */ - if (test_bit(RXRPC_CALL_POST_ACCEPT, &call->events)) { + if (test_bit(RXRPC_CALL_EV_POST_ACCEPT, &call->events)) { _debug("post accept"); if (rxrpc_post_message(call, RXRPC_SKB_MARK_NEW_CALL, 0, false) < 0) goto no_mem; - clear_bit(RXRPC_CALL_POST_ACCEPT, &call->events); + clear_bit(RXRPC_CALL_EV_POST_ACCEPT, &call->events); goto maybe_reschedule; } /* handle incoming call acceptance */ - if (test_and_clear_bit(RXRPC_CALL_ACCEPTED, &call->events)) { + if (test_and_clear_bit(RXRPC_CALL_EV_ACCEPTED, &call->events)) { _debug("accepted"); ASSERTCMP(call->rx_data_post, ==, 0); call->rx_data_post = 1; read_lock_bh(&call->state_lock); if (call->state < RXRPC_CALL_COMPLETE) - set_bit(RXRPC_CALL_DRAIN_RX_OOS, &call->events); + set_bit(RXRPC_CALL_EV_DRAIN_RX_OOS, &call->events); read_unlock_bh(&call->state_lock); } /* drain the out of sequence received packet queue into the packet Rx * queue */ - if (test_and_clear_bit(RXRPC_CALL_DRAIN_RX_OOS, &call->events)) { + if (test_and_clear_bit(RXRPC_CALL_EV_DRAIN_RX_OOS, &call->events)) { while (call->rx_data_post == call->rx_first_oos) if (rxrpc_drain_rx_oos_queue(call) < 0) break; @@ -1224,9 +1218,10 @@ send_ACK: ackinfo.rxMTU = htonl(rxrpc_rx_mtu); ackinfo.jumbo_max = htonl(rxrpc_rx_jumbo_max); - hdr.serial = htonl(atomic_inc_return(&call->conn->serial)); + serial = atomic_inc_return(&call->conn->serial); + whdr.serial = htonl(serial); _proto("Tx ACK %%%u { m=%hu f=#%u p=#%u s=%%%u r=%s n=%u }", - ntohl(hdr.serial), + serial, ntohs(ack.maxSkew), ntohl(ack.firstPacket), ntohl(ack.previousPacket), @@ -1242,8 +1237,9 @@ send_ACK: send_message: _debug("send message"); - hdr.serial = htonl(atomic_inc_return(&call->conn->serial)); - _proto("Tx %s %%%u", rxrpc_pkts[hdr.type], ntohl(hdr.serial)); + serial = atomic_inc_return(&call->conn->serial); + whdr.serial = htonl(serial); + _proto("Tx %s %%%u", rxrpc_pkts[whdr.type], serial); send_message_2: len = iov[0].iov_len; @@ -1280,12 +1276,12 @@ send_message_2: } switch (genbit) { - case RXRPC_CALL_ABORT: + case RXRPC_CALL_EV_ABORT: clear_bit(genbit, &call->events); - clear_bit(RXRPC_CALL_RCVD_ABORT, &call->events); + clear_bit(RXRPC_CALL_EV_RCVD_ABORT, &call->events); goto kill_ACKs; - case RXRPC_CALL_ACK_FINAL: + case RXRPC_CALL_EV_ACK_FINAL: write_lock_bh(&call->state_lock); if (call->state == RXRPC_CALL_CLIENT_FINAL_ACK) call->state = RXRPC_CALL_COMPLETE; @@ -1310,9 +1306,9 @@ send_message_2: kill_ACKs: del_timer_sync(&call->ack_timer); - if (test_and_clear_bit(RXRPC_CALL_ACK_FINAL, &call->events)) + if (test_and_clear_bit(RXRPC_CALL_EV_ACK_FINAL, &call->events)) rxrpc_put_call(call); - clear_bit(RXRPC_CALL_ACK, &call->events); + clear_bit(RXRPC_CALL_EV_ACK, &call->events); maybe_reschedule: if (call->events || !skb_queue_empty(&call->rx_queue)) { @@ -1326,12 +1322,11 @@ maybe_reschedule: if (call->state >= RXRPC_CALL_COMPLETE && !list_empty(&call->accept_link)) { _debug("X unlinking once-pending call %p { e=%lx f=%lx c=%x }", - call, call->events, call->flags, - ntohl(call->conn->cid)); + call, call->events, call->flags, call->conn->cid); read_lock_bh(&call->state_lock); if (!test_bit(RXRPC_CALL_RELEASED, &call->flags) && - !test_and_set_bit(RXRPC_CALL_RELEASE, &call->events)) + !test_and_set_bit(RXRPC_CALL_EV_RELEASE, &call->events)) rxrpc_queue_call(call); read_unlock_bh(&call->state_lock); } @@ -1345,7 +1340,7 @@ error: * this means there's a race between clearing the flag and setting the * work pending bit and the work item being processed again */ if (call->events && !work_pending(&call->processor)) { - _debug("jumpstart %x", ntohl(call->conn->cid)); + _debug("jumpstart %x", call->conn->cid); rxrpc_queue_call(call); } diff --git a/net/rxrpc/ar-call.c b/net/rxrpc/ar-call.c index a9e05db0f..7c8d300ad 100644 --- a/net/rxrpc/ar-call.c +++ b/net/rxrpc/ar-call.c @@ -21,14 +21,14 @@ /* * Maximum lifetime of a call (in jiffies). */ -unsigned rxrpc_max_call_lifetime = 60 * HZ; +unsigned int rxrpc_max_call_lifetime = 60 * HZ; /* * Time till dead call expires after last use (in jiffies). */ -unsigned rxrpc_dead_call_expiry = 2 * HZ; +unsigned int rxrpc_dead_call_expiry = 2 * HZ; -const char *const rxrpc_call_states[] = { +const char *const rxrpc_call_states[NR__RXRPC_CALL_STATES] = { [RXRPC_CALL_CLIENT_SEND_REQUEST] = "ClSndReq", [RXRPC_CALL_CLIENT_AWAIT_REPLY] = "ClAwtRpl", [RXRPC_CALL_CLIENT_RECV_REPLY] = "ClRcvRpl", @@ -64,11 +64,11 @@ static DEFINE_HASHTABLE(rxrpc_call_hash, 10); * Hash function for rxrpc_call_hash */ static unsigned long rxrpc_call_hashfunc( - u8 clientflag, - __be32 cid, - __be32 call_id, - __be32 epoch, - __be16 service_id, + u8 in_clientflag, + u32 cid, + u32 call_id, + u32 epoch, + u16 service_id, sa_family_t proto, void *localptr, unsigned int addr_size, @@ -77,7 +77,6 @@ static unsigned long rxrpc_call_hashfunc( const u16 *p; unsigned int i; unsigned long key; - u32 hcid = ntohl(cid); _enter(""); @@ -85,12 +84,12 @@ static unsigned long rxrpc_call_hashfunc( /* We just want to add up the __be32 values, so forcing the * cast should be okay. */ - key += (__force u32)epoch; - key += (__force u16)service_id; - key += (__force u32)call_id; - key += (hcid & RXRPC_CIDMASK) >> RXRPC_CIDSHIFT; - key += hcid & RXRPC_CHANNELMASK; - key += clientflag; + key += epoch; + key += service_id; + key += call_id; + key += (cid & RXRPC_CIDMASK) >> RXRPC_CIDSHIFT; + key += cid & RXRPC_CHANNELMASK; + key += in_clientflag; key += proto; /* Step through the peer address in 16-bit portions for speed */ for (i = 0, p = (const u16 *)peer_addr; i < addr_size >> 1; i++, p++) @@ -148,19 +147,16 @@ static void rxrpc_call_hash_del(struct rxrpc_call *call) * isn't there. */ struct rxrpc_call *rxrpc_find_call_hash( - u8 clientflag, - __be32 cid, - __be32 call_id, - __be32 epoch, - __be16 service_id, + struct rxrpc_host_header *hdr, void *localptr, sa_family_t proto, - const u8 *peer_addr) + const void *peer_addr) { unsigned long key; unsigned int addr_size = 0; struct rxrpc_call *call = NULL; struct rxrpc_call *ret = NULL; + u8 in_clientflag = hdr->flags & RXRPC_CLIENT_INITIATED; _enter(""); switch (proto) { @@ -174,20 +170,21 @@ struct rxrpc_call *rxrpc_find_call_hash( break; } - key = rxrpc_call_hashfunc(clientflag, cid, call_id, epoch, - service_id, proto, localptr, addr_size, + key = rxrpc_call_hashfunc(in_clientflag, hdr->cid, hdr->callNumber, + hdr->epoch, hdr->serviceId, + proto, localptr, addr_size, peer_addr); hash_for_each_possible_rcu(rxrpc_call_hash, call, hash_node, key) { if (call->hash_key == key && - call->call_id == call_id && - call->cid == cid && - call->in_clientflag == clientflag && - call->service_id == service_id && + call->call_id == hdr->callNumber && + call->cid == hdr->cid && + call->in_clientflag == in_clientflag && + call->service_id == hdr->serviceId && call->proto == proto && call->local == localptr && memcmp(call->peer_ip.ipv6_addr, peer_addr, - addr_size) == 0 && - call->epoch == epoch) { + addr_size) == 0 && + call->epoch == hdr->epoch) { ret = call; break; } @@ -414,12 +411,12 @@ found_extant_second: */ struct rxrpc_call *rxrpc_incoming_call(struct rxrpc_sock *rx, struct rxrpc_connection *conn, - struct rxrpc_header *hdr, + struct rxrpc_host_header *hdr, gfp_t gfp) { struct rxrpc_call *call, *candidate; struct rb_node **p, *parent; - __be32 call_id; + u32 call_id; _enter(",%d,,%x", conn->debug_id, gfp); @@ -433,7 +430,7 @@ struct rxrpc_call *rxrpc_incoming_call(struct rxrpc_sock *rx, candidate->conn = conn; candidate->cid = hdr->cid; candidate->call_id = hdr->callNumber; - candidate->channel = ntohl(hdr->cid) & RXRPC_CHANNELMASK; + candidate->channel = hdr->cid & RXRPC_CHANNELMASK; candidate->rx_data_post = 0; candidate->state = RXRPC_CALL_SERVER_ACCEPTING; if (conn->security_ix > 0) @@ -452,7 +449,7 @@ struct rxrpc_call *rxrpc_incoming_call(struct rxrpc_sock *rx, read_lock(&call->state_lock); switch (call->state) { case RXRPC_CALL_LOCALLY_ABORTED: - if (!test_and_set_bit(RXRPC_CALL_ABORT, &call->events)) + if (!test_and_set_bit(RXRPC_CALL_EV_ABORT, &call->events)) rxrpc_queue_call(call); case RXRPC_CALL_REMOTELY_ABORTED: read_unlock(&call->state_lock); @@ -492,9 +489,9 @@ struct rxrpc_call *rxrpc_incoming_call(struct rxrpc_sock *rx, /* The tree is sorted in order of the __be32 value without * turning it into host order. */ - if ((__force u32)call_id < (__force u32)call->call_id) + if (call_id < call->call_id) p = &(*p)->rb_left; - else if ((__force u32)call_id > (__force u32)call->call_id) + else if (call_id > call->call_id) p = &(*p)->rb_right; else goto old_call; @@ -686,7 +683,7 @@ void rxrpc_release_call(struct rxrpc_call *call) _debug("+++ ABORTING STATE %d +++\n", call->state); call->state = RXRPC_CALL_LOCALLY_ABORTED; call->abort_code = RX_CALL_DEAD; - set_bit(RXRPC_CALL_ABORT, &call->events); + set_bit(RXRPC_CALL_EV_ABORT, &call->events); rxrpc_queue_call(call); } write_unlock(&call->state_lock); @@ -714,8 +711,7 @@ void rxrpc_release_call(struct rxrpc_call *call) _debug("- zap %s %%%u #%u", rxrpc_pkts[sp->hdr.type], - ntohl(sp->hdr.serial), - ntohl(sp->hdr.seq)); + sp->hdr.serial, sp->hdr.seq); rxrpc_free_skb(skb); spin_lock_bh(&call->lock); } @@ -763,10 +759,10 @@ static void rxrpc_mark_call_released(struct rxrpc_call *call) _debug("abort call %p", call); call->state = RXRPC_CALL_LOCALLY_ABORTED; call->abort_code = RX_CALL_DEAD; - if (!test_and_set_bit(RXRPC_CALL_ABORT, &call->events)) + if (!test_and_set_bit(RXRPC_CALL_EV_ABORT, &call->events)) sched = true; } - if (!test_and_set_bit(RXRPC_CALL_RELEASE, &call->events)) + if (!test_and_set_bit(RXRPC_CALL_EV_RELEASE, &call->events)) sched = true; if (sched) rxrpc_queue_call(call); @@ -873,9 +869,9 @@ static void rxrpc_cleanup_call(struct rxrpc_call *call) unsigned long _skb; _skb = call->acks_window[call->acks_tail] & ~1; - sp = rxrpc_skb((struct sk_buff *) _skb); - _debug("+++ clear Tx %u", ntohl(sp->hdr.seq)); - rxrpc_free_skb((struct sk_buff *) _skb); + sp = rxrpc_skb((struct sk_buff *)_skb); + _debug("+++ clear Tx %u", sp->hdr.seq); + rxrpc_free_skb((struct sk_buff *)_skb); call->acks_tail = (call->acks_tail + 1) & (call->acks_winsz - 1); } @@ -975,7 +971,7 @@ static void rxrpc_call_life_expired(unsigned long _call) _enter("{%d}", call->debug_id); read_lock_bh(&call->state_lock); if (call->state < RXRPC_CALL_COMPLETE) { - set_bit(RXRPC_CALL_LIFE_TIMER, &call->events); + set_bit(RXRPC_CALL_EV_LIFE_TIMER, &call->events); rxrpc_queue_call(call); } read_unlock_bh(&call->state_lock); @@ -995,7 +991,7 @@ static void rxrpc_resend_time_expired(unsigned long _call) return; clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags); - if (!test_and_set_bit(RXRPC_CALL_RESEND_TIMER, &call->events)) + if (!test_and_set_bit(RXRPC_CALL_EV_RESEND_TIMER, &call->events)) rxrpc_queue_call(call); } @@ -1013,7 +1009,7 @@ static void rxrpc_ack_time_expired(unsigned long _call) read_lock_bh(&call->state_lock); if (call->state < RXRPC_CALL_COMPLETE && - !test_and_set_bit(RXRPC_CALL_ACK, &call->events)) + !test_and_set_bit(RXRPC_CALL_EV_ACK, &call->events)) rxrpc_queue_call(call); read_unlock_bh(&call->state_lock); } diff --git a/net/rxrpc/ar-connection.c b/net/rxrpc/ar-connection.c index 6c71ed1ca..9942da1ed 100644 --- a/net/rxrpc/ar-connection.c +++ b/net/rxrpc/ar-connection.c @@ -21,7 +21,7 @@ /* * Time till a connection expires after last use (in seconds). */ -unsigned rxrpc_connection_expiry = 10 * 60; +unsigned int rxrpc_connection_expiry = 10 * 60; static void rxrpc_connection_reaper(struct work_struct *work); @@ -57,10 +57,10 @@ static struct rxrpc_conn_bundle *rxrpc_alloc_bundle(gfp_t gfp) */ static inline int rxrpc_cmp_bundle(const struct rxrpc_conn_bundle *bundle, - struct key *key, __be16 service_id) + struct key *key, u16 service_id) { return (bundle->service_id - service_id) ?: - ((unsigned long) bundle->key - (unsigned long) key); + ((unsigned long)bundle->key - (unsigned long)key); } /* @@ -69,14 +69,14 @@ int rxrpc_cmp_bundle(const struct rxrpc_conn_bundle *bundle, struct rxrpc_conn_bundle *rxrpc_get_bundle(struct rxrpc_sock *rx, struct rxrpc_transport *trans, struct key *key, - __be16 service_id, + u16 service_id, gfp_t gfp) { struct rxrpc_conn_bundle *bundle, *candidate; struct rb_node *p, *parent, **pp; _enter("%p{%x},%x,%hx,", - rx, key_serial(key), trans->debug_id, ntohs(service_id)); + rx, key_serial(key), trans->debug_id, service_id); if (rx->trans == trans && rx->bundle) { atomic_inc(&rx->bundle->usage); @@ -213,7 +213,7 @@ static struct rxrpc_connection *rxrpc_alloc_connection(gfp_t gfp) conn->debug_id = atomic_inc_return(&rxrpc_debug_id); conn->avail_calls = RXRPC_MAXCALLS; conn->size_align = 4; - conn->header_size = sizeof(struct rxrpc_header); + conn->header_size = sizeof(struct rxrpc_wire_header); } _leave(" = %p{%d}", conn, conn ? conn->debug_id : 0); @@ -230,7 +230,7 @@ static void rxrpc_assign_connection_id(struct rxrpc_connection *conn) struct rxrpc_connection *xconn; struct rb_node *parent, **p; __be32 epoch; - u32 real_conn_id; + u32 cid; _enter(""); @@ -241,7 +241,7 @@ static void rxrpc_assign_connection_id(struct rxrpc_connection *conn) conn->trans->conn_idcounter += RXRPC_CID_INC; if (conn->trans->conn_idcounter < RXRPC_CID_INC) conn->trans->conn_idcounter = RXRPC_CID_INC; - real_conn_id = conn->trans->conn_idcounter; + cid = conn->trans->conn_idcounter; attempt_insertion: parent = NULL; @@ -255,9 +255,9 @@ attempt_insertion: p = &(*p)->rb_left; else if (epoch > xconn->epoch) p = &(*p)->rb_right; - else if (real_conn_id < xconn->real_conn_id) + else if (cid < xconn->cid) p = &(*p)->rb_left; - else if (real_conn_id > xconn->real_conn_id) + else if (cid > xconn->cid) p = &(*p)->rb_right; else goto id_exists; @@ -268,20 +268,19 @@ attempt_insertion: rb_link_node(&conn->node, parent, p); rb_insert_color(&conn->node, &conn->trans->client_conns); - conn->real_conn_id = real_conn_id; - conn->cid = htonl(real_conn_id); + conn->cid = cid; write_unlock_bh(&conn->trans->conn_lock); - _leave(" [CONNID %x CID %x]", real_conn_id, ntohl(conn->cid)); + _leave(" [CID %x]", cid); return; /* we found a connection with the proposed ID - walk the tree from that * point looking for the next unused ID */ id_exists: for (;;) { - real_conn_id += RXRPC_CID_INC; - if (real_conn_id < RXRPC_CID_INC) { - real_conn_id = RXRPC_CID_INC; - conn->trans->conn_idcounter = real_conn_id; + cid += RXRPC_CID_INC; + if (cid < RXRPC_CID_INC) { + cid = RXRPC_CID_INC; + conn->trans->conn_idcounter = cid; goto attempt_insertion; } @@ -291,7 +290,7 @@ id_exists: xconn = rb_entry(parent, struct rxrpc_connection, node); if (epoch < xconn->epoch || - real_conn_id < xconn->real_conn_id) + cid < xconn->cid) goto attempt_insertion; } } @@ -334,7 +333,7 @@ static void rxrpc_add_call_ID_to_conn(struct rxrpc_connection *conn, */ static int rxrpc_connect_exclusive(struct rxrpc_sock *rx, struct rxrpc_transport *trans, - __be16 service_id, + u16 service_id, struct rxrpc_call *call, gfp_t gfp) { @@ -404,11 +403,11 @@ found_channel: conn->channels[chan] = call; call->conn = conn; call->channel = chan; - call->cid = conn->cid | htonl(chan); - call->call_id = htonl(++conn->call_counter); + call->cid = conn->cid | chan; + call->call_id = ++conn->call_counter; _net("CONNECT client on conn %d chan %d as call %x", - conn->debug_id, chan, ntohl(call->call_id)); + conn->debug_id, chan, call->call_id); spin_unlock(&trans->client_lock); @@ -593,11 +592,11 @@ found_channel: conn->channels[chan] = call; call->conn = conn; call->channel = chan; - call->cid = conn->cid | htonl(chan); - call->call_id = htonl(++conn->call_counter); + call->cid = conn->cid | chan; + call->call_id = ++conn->call_counter; _net("CONNECT client on conn %d chan %d as call %x", - conn->debug_id, chan, ntohl(call->call_id)); + conn->debug_id, chan, call->call_id); ASSERTCMP(conn->avail_calls, <, RXRPC_MAXCALLS); spin_unlock(&trans->client_lock); @@ -620,21 +619,21 @@ interrupted: */ struct rxrpc_connection * rxrpc_incoming_connection(struct rxrpc_transport *trans, - struct rxrpc_header *hdr, + struct rxrpc_host_header *hdr, gfp_t gfp) { struct rxrpc_connection *conn, *candidate = NULL; struct rb_node *p, **pp; const char *new = "old"; __be32 epoch; - u32 conn_id; + u32 cid; _enter(""); ASSERT(hdr->flags & RXRPC_CLIENT_INITIATED); epoch = hdr->epoch; - conn_id = ntohl(hdr->cid) & RXRPC_CIDMASK; + cid = hdr->cid & RXRPC_CIDMASK; /* search the connection list first */ read_lock_bh(&trans->conn_lock); @@ -643,15 +642,15 @@ rxrpc_incoming_connection(struct rxrpc_transport *trans, while (p) { conn = rb_entry(p, struct rxrpc_connection, node); - _debug("maybe %x", conn->real_conn_id); + _debug("maybe %x", conn->cid); if (epoch < conn->epoch) p = p->rb_left; else if (epoch > conn->epoch) p = p->rb_right; - else if (conn_id < conn->real_conn_id) + else if (cid < conn->cid) p = p->rb_left; - else if (conn_id > conn->real_conn_id) + else if (cid > conn->cid) p = p->rb_right; else goto found_extant_connection; @@ -668,12 +667,11 @@ rxrpc_incoming_connection(struct rxrpc_transport *trans, candidate->trans = trans; candidate->epoch = hdr->epoch; - candidate->cid = hdr->cid & cpu_to_be32(RXRPC_CIDMASK); + candidate->cid = hdr->cid & RXRPC_CIDMASK; candidate->service_id = hdr->serviceId; candidate->security_ix = hdr->securityIndex; candidate->in_clientflag = RXRPC_CLIENT_INITIATED; candidate->out_clientflag = 0; - candidate->real_conn_id = conn_id; candidate->state = RXRPC_CONN_SERVER; if (candidate->service_id) candidate->state = RXRPC_CONN_SERVER_UNSECURED; @@ -690,9 +688,9 @@ rxrpc_incoming_connection(struct rxrpc_transport *trans, pp = &(*pp)->rb_left; else if (epoch > conn->epoch) pp = &(*pp)->rb_right; - else if (conn_id < conn->real_conn_id) + else if (cid < conn->cid) pp = &(*pp)->rb_left; - else if (conn_id > conn->real_conn_id) + else if (cid > conn->cid) pp = &(*pp)->rb_right; else goto found_extant_second; @@ -714,7 +712,7 @@ rxrpc_incoming_connection(struct rxrpc_transport *trans, new = "new"; success: - _net("CONNECTION %s %d {%x}", new, conn->debug_id, conn->real_conn_id); + _net("CONNECTION %s %d {%x}", new, conn->debug_id, conn->cid); _leave(" = %p {u=%d}", conn, atomic_read(&conn->usage)); return conn; @@ -751,18 +749,17 @@ security_mismatch: * packet */ struct rxrpc_connection *rxrpc_find_connection(struct rxrpc_transport *trans, - struct rxrpc_header *hdr) + struct rxrpc_host_header *hdr) { struct rxrpc_connection *conn; struct rb_node *p; - __be32 epoch; - u32 conn_id; + u32 epoch, cid; - _enter(",{%x,%x}", ntohl(hdr->cid), hdr->flags); + _enter(",{%x,%x}", hdr->cid, hdr->flags); read_lock_bh(&trans->conn_lock); - conn_id = ntohl(hdr->cid) & RXRPC_CIDMASK; + cid = hdr->cid & RXRPC_CIDMASK; epoch = hdr->epoch; if (hdr->flags & RXRPC_CLIENT_INITIATED) @@ -773,15 +770,15 @@ struct rxrpc_connection *rxrpc_find_connection(struct rxrpc_transport *trans, while (p) { conn = rb_entry(p, struct rxrpc_connection, node); - _debug("maybe %x", conn->real_conn_id); + _debug("maybe %x", conn->cid); if (epoch < conn->epoch) p = p->rb_left; else if (epoch > conn->epoch) p = p->rb_right; - else if (conn_id < conn->real_conn_id) + else if (cid < conn->cid) p = p->rb_left; - else if (conn_id > conn->real_conn_id) + else if (cid > conn->cid) p = p->rb_right; else goto found; diff --git a/net/rxrpc/ar-connevent.c b/net/rxrpc/ar-connevent.c index e7ed43a54..1bdaaed8c 100644 --- a/net/rxrpc/ar-connevent.c +++ b/net/rxrpc/ar-connevent.c @@ -42,9 +42,9 @@ static void rxrpc_abort_calls(struct rxrpc_connection *conn, int state, call->state = state; call->abort_code = abort_code; if (state == RXRPC_CALL_LOCALLY_ABORTED) - set_bit(RXRPC_CALL_CONN_ABORT, &call->events); + set_bit(RXRPC_CALL_EV_CONN_ABORT, &call->events); else - set_bit(RXRPC_CALL_RCVD_ABORT, &call->events); + set_bit(RXRPC_CALL_EV_RCVD_ABORT, &call->events); rxrpc_queue_call(call); } write_unlock(&call->state_lock); @@ -60,11 +60,12 @@ static void rxrpc_abort_calls(struct rxrpc_connection *conn, int state, static int rxrpc_abort_connection(struct rxrpc_connection *conn, u32 error, u32 abort_code) { - struct rxrpc_header hdr; + struct rxrpc_wire_header whdr; struct msghdr msg; struct kvec iov[2]; __be32 word; size_t len; + u32 serial; int ret; _enter("%d,,%u,%u", conn->debug_id, error, abort_code); @@ -89,28 +90,29 @@ static int rxrpc_abort_connection(struct rxrpc_connection *conn, msg.msg_controllen = 0; msg.msg_flags = 0; - hdr.epoch = conn->epoch; - hdr.cid = conn->cid; - hdr.callNumber = 0; - hdr.seq = 0; - hdr.type = RXRPC_PACKET_TYPE_ABORT; - hdr.flags = conn->out_clientflag; - hdr.userStatus = 0; - hdr.securityIndex = conn->security_ix; - hdr._rsvd = 0; - hdr.serviceId = conn->service_id; + whdr.epoch = htonl(conn->epoch); + whdr.cid = htonl(conn->cid); + whdr.callNumber = 0; + whdr.seq = 0; + whdr.type = RXRPC_PACKET_TYPE_ABORT; + whdr.flags = conn->out_clientflag; + whdr.userStatus = 0; + whdr.securityIndex = conn->security_ix; + whdr._rsvd = 0; + whdr.serviceId = htons(conn->service_id); word = htonl(abort_code); - iov[0].iov_base = &hdr; - iov[0].iov_len = sizeof(hdr); + iov[0].iov_base = &whdr; + iov[0].iov_len = sizeof(whdr); iov[1].iov_base = &word; iov[1].iov_len = sizeof(word); len = iov[0].iov_len + iov[1].iov_len; - hdr.serial = htonl(atomic_inc_return(&conn->serial)); - _proto("Tx CONN ABORT %%%u { %d }", ntohl(hdr.serial), abort_code); + serial = atomic_inc_return(&conn->serial); + whdr.serial = htonl(serial); + _proto("Tx CONN ABORT %%%u { %d }", serial, abort_code); ret = kernel_sendmsg(conn->trans->local->socket, &msg, iov, 2, len); if (ret < 0) { @@ -132,7 +134,7 @@ static void rxrpc_call_is_secure(struct rxrpc_call *call) if (call) { read_lock(&call->state_lock); if (call->state < RXRPC_CALL_COMPLETE && - !test_and_set_bit(RXRPC_CALL_SECURED, &call->events)) + !test_and_set_bit(RXRPC_CALL_EV_SECURED, &call->events)) rxrpc_queue_call(call); read_unlock(&call->state_lock); } @@ -146,8 +148,8 @@ static int rxrpc_process_event(struct rxrpc_connection *conn, u32 *_abort_code) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - __be32 tmp; - u32 serial; + __be32 wtmp; + u32 abort_code; int loop, ret; if (conn->state >= RXRPC_CONN_REMOTELY_ABORTED) { @@ -155,19 +157,18 @@ static int rxrpc_process_event(struct rxrpc_connection *conn, return -ECONNABORTED; } - serial = ntohl(sp->hdr.serial); - - _enter("{%d},{%u,%%%u},", conn->debug_id, sp->hdr.type, serial); + _enter("{%d},{%u,%%%u},", conn->debug_id, sp->hdr.type, sp->hdr.serial); switch (sp->hdr.type) { case RXRPC_PACKET_TYPE_ABORT: - if (skb_copy_bits(skb, 0, &tmp, sizeof(tmp)) < 0) + if (skb_copy_bits(skb, 0, &wtmp, sizeof(wtmp)) < 0) return -EPROTO; - _proto("Rx ABORT %%%u { ac=%d }", serial, ntohl(tmp)); + abort_code = ntohl(wtmp); + _proto("Rx ABORT %%%u { ac=%d }", sp->hdr.serial, abort_code); conn->state = RXRPC_CONN_REMOTELY_ABORTED; rxrpc_abort_calls(conn, RXRPC_CALL_REMOTELY_ABORTED, - ntohl(tmp)); + abort_code); return -ECONNABORTED; case RXRPC_PACKET_TYPE_CHALLENGE: @@ -335,7 +336,7 @@ void rxrpc_reject_packets(struct work_struct *work) struct sockaddr_in sin; } sa; struct rxrpc_skb_priv *sp; - struct rxrpc_header hdr; + struct rxrpc_wire_header whdr; struct rxrpc_local *local; struct sk_buff *skb; struct msghdr msg; @@ -348,11 +349,11 @@ void rxrpc_reject_packets(struct work_struct *work) _enter("%d", local->debug_id); - iov[0].iov_base = &hdr; - iov[0].iov_len = sizeof(hdr); + iov[0].iov_base = &whdr; + iov[0].iov_len = sizeof(whdr); iov[1].iov_base = &code; iov[1].iov_len = sizeof(code); - size = sizeof(hdr) + sizeof(code); + size = sizeof(whdr) + sizeof(code); msg.msg_name = &sa; msg.msg_control = NULL; @@ -370,8 +371,8 @@ void rxrpc_reject_packets(struct work_struct *work) break; } - memset(&hdr, 0, sizeof(hdr)); - hdr.type = RXRPC_PACKET_TYPE_ABORT; + memset(&whdr, 0, sizeof(whdr)); + whdr.type = RXRPC_PACKET_TYPE_ABORT; while ((skb = skb_dequeue(&local->reject_queue))) { sp = rxrpc_skb(skb); @@ -381,13 +382,13 @@ void rxrpc_reject_packets(struct work_struct *work) sa.sin.sin_addr.s_addr = ip_hdr(skb)->saddr; code = htonl(skb->priority); - hdr.epoch = sp->hdr.epoch; - hdr.cid = sp->hdr.cid; - hdr.callNumber = sp->hdr.callNumber; - hdr.serviceId = sp->hdr.serviceId; - hdr.flags = sp->hdr.flags; - hdr.flags ^= RXRPC_CLIENT_INITIATED; - hdr.flags &= RXRPC_CLIENT_INITIATED; + whdr.epoch = htonl(sp->hdr.epoch); + whdr.cid = htonl(sp->hdr.cid); + whdr.callNumber = htonl(sp->hdr.callNumber); + whdr.serviceId = htons(sp->hdr.serviceId); + whdr.flags = sp->hdr.flags; + whdr.flags ^= RXRPC_CLIENT_INITIATED; + whdr.flags &= RXRPC_CLIENT_INITIATED; kernel_sendmsg(local->socket, &msg, iov, 2, size); break; diff --git a/net/rxrpc/ar-error.c b/net/rxrpc/ar-error.c index 0610efa83..3e82d6f03 100644 --- a/net/rxrpc/ar-error.c +++ b/net/rxrpc/ar-error.c @@ -115,7 +115,6 @@ void rxrpc_UDP_error_report(struct sock *sk) /* pass the transport ref to error_handler to release */ skb_queue_tail(&trans->error_queue, skb); rxrpc_queue_work(&trans->error_handler); - _leave(""); } @@ -152,28 +151,18 @@ void rxrpc_UDP_error_handler(struct work_struct *work) switch (ee->ee_code) { case ICMP_NET_UNREACH: _net("Rx Received ICMP Network Unreachable"); - err = ENETUNREACH; break; case ICMP_HOST_UNREACH: _net("Rx Received ICMP Host Unreachable"); - err = EHOSTUNREACH; break; case ICMP_PORT_UNREACH: _net("Rx Received ICMP Port Unreachable"); - err = ECONNREFUSED; - break; - case ICMP_FRAG_NEEDED: - _net("Rx Received ICMP Fragmentation Needed (%d)", - ee->ee_info); - err = 0; /* dealt with elsewhere */ break; case ICMP_NET_UNKNOWN: _net("Rx Received ICMP Unknown Network"); - err = ENETUNREACH; break; case ICMP_HOST_UNKNOWN: _net("Rx Received ICMP Unknown Host"); - err = EHOSTUNREACH; break; default: _net("Rx Received ICMP DestUnreach code=%u", @@ -222,7 +211,7 @@ void rxrpc_UDP_error_handler(struct work_struct *work) if (call->state != RXRPC_CALL_COMPLETE && call->state < RXRPC_CALL_NETWORK_ERROR) { call->state = RXRPC_CALL_NETWORK_ERROR; - set_bit(RXRPC_CALL_RCVD_ERROR, &call->events); + set_bit(RXRPC_CALL_EV_RCVD_ERROR, &call->events); rxrpc_queue_call(call); } write_unlock(&call->state_lock); diff --git a/net/rxrpc/ar-input.c b/net/rxrpc/ar-input.c index 4505a691d..63ed75c40 100644 --- a/net/rxrpc/ar-input.c +++ b/net/rxrpc/ar-input.c @@ -231,7 +231,7 @@ static int rxrpc_fast_process_data(struct rxrpc_call *call, _debug("drain rx oos now"); read_lock(&call->state_lock); if (call->state < RXRPC_CALL_COMPLETE && - !test_and_set_bit(RXRPC_CALL_DRAIN_RX_OOS, &call->events)) + !test_and_set_bit(RXRPC_CALL_EV_DRAIN_RX_OOS, &call->events)) rxrpc_queue_call(call); read_unlock(&call->state_lock); } @@ -287,12 +287,12 @@ static void rxrpc_assume_implicit_ackall(struct rxrpc_call *call, u32 serial) call->acks_latest = serial; _debug("implicit ACKALL %%%u", call->acks_latest); - set_bit(RXRPC_CALL_RCVD_ACKALL, &call->events); + set_bit(RXRPC_CALL_EV_RCVD_ACKALL, &call->events); write_unlock_bh(&call->state_lock); if (try_to_del_timer_sync(&call->resend_timer) >= 0) { - clear_bit(RXRPC_CALL_RESEND_TIMER, &call->events); - clear_bit(RXRPC_CALL_RESEND, &call->events); + clear_bit(RXRPC_CALL_EV_RESEND_TIMER, &call->events); + clear_bit(RXRPC_CALL_EV_RESEND, &call->events); clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags); } break; @@ -310,8 +310,8 @@ static void rxrpc_assume_implicit_ackall(struct rxrpc_call *call, u32 serial) void rxrpc_fast_process_packet(struct rxrpc_call *call, struct sk_buff *skb) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - __be32 _abort_code; - u32 serial, hi_serial, seq, abort_code; + __be32 wtmp; + u32 hi_serial, abort_code; _enter("%p,%p", call, skb); @@ -330,16 +330,15 @@ void rxrpc_fast_process_packet(struct rxrpc_call *call, struct sk_buff *skb) /* track the latest serial number on this connection for ACK packet * information */ - serial = ntohl(sp->hdr.serial); hi_serial = atomic_read(&call->conn->hi_serial); - while (serial > hi_serial) + while (sp->hdr.serial > hi_serial) hi_serial = atomic_cmpxchg(&call->conn->hi_serial, hi_serial, - serial); + sp->hdr.serial); /* request ACK generation for any ACK or DATA packet that requests * it */ if (sp->hdr.flags & RXRPC_REQUEST_ACK) { - _proto("ACK Requested on %%%u", serial); + _proto("ACK Requested on %%%u", sp->hdr.serial); rxrpc_propose_ACK(call, RXRPC_ACK_REQUESTED, sp->hdr.serial, false); } @@ -347,24 +346,23 @@ void rxrpc_fast_process_packet(struct rxrpc_call *call, struct sk_buff *skb) case RXRPC_PACKET_TYPE_ABORT: _debug("abort"); - if (skb_copy_bits(skb, 0, &_abort_code, - sizeof(_abort_code)) < 0) + if (skb_copy_bits(skb, 0, &wtmp, sizeof(wtmp)) < 0) goto protocol_error; - abort_code = ntohl(_abort_code); - _proto("Rx ABORT %%%u { %x }", serial, abort_code); + abort_code = ntohl(wtmp); + _proto("Rx ABORT %%%u { %x }", sp->hdr.serial, abort_code); write_lock_bh(&call->state_lock); if (call->state < RXRPC_CALL_COMPLETE) { call->state = RXRPC_CALL_REMOTELY_ABORTED; call->abort_code = abort_code; - set_bit(RXRPC_CALL_RCVD_ABORT, &call->events); + set_bit(RXRPC_CALL_EV_RCVD_ABORT, &call->events); rxrpc_queue_call(call); } goto free_packet_unlock; case RXRPC_PACKET_TYPE_BUSY: - _proto("Rx BUSY %%%u", serial); + _proto("Rx BUSY %%%u", sp->hdr.serial); if (call->conn->out_clientflag) goto protocol_error; @@ -373,7 +371,7 @@ void rxrpc_fast_process_packet(struct rxrpc_call *call, struct sk_buff *skb) switch (call->state) { case RXRPC_CALL_CLIENT_SEND_REQUEST: call->state = RXRPC_CALL_SERVER_BUSY; - set_bit(RXRPC_CALL_RCVD_BUSY, &call->events); + set_bit(RXRPC_CALL_EV_RCVD_BUSY, &call->events); rxrpc_queue_call(call); case RXRPC_CALL_SERVER_BUSY: goto free_packet_unlock; @@ -382,15 +380,13 @@ void rxrpc_fast_process_packet(struct rxrpc_call *call, struct sk_buff *skb) } default: - _proto("Rx %s %%%u", rxrpc_pkts[sp->hdr.type], serial); + _proto("Rx %s %%%u", rxrpc_pkts[sp->hdr.type], sp->hdr.serial); goto protocol_error; case RXRPC_PACKET_TYPE_DATA: - seq = ntohl(sp->hdr.seq); + _proto("Rx DATA %%%u { #%u }", sp->hdr.serial, sp->hdr.seq); - _proto("Rx DATA %%%u { #%u }", serial, seq); - - if (seq == 0) + if (sp->hdr.seq == 0) goto protocol_error; call->ackr_prev_seq = sp->hdr.seq; @@ -398,9 +394,9 @@ void rxrpc_fast_process_packet(struct rxrpc_call *call, struct sk_buff *skb) /* received data implicitly ACKs all of the request packets we * sent when we're acting as a client */ if (call->state == RXRPC_CALL_CLIENT_AWAIT_REPLY) - rxrpc_assume_implicit_ackall(call, serial); + rxrpc_assume_implicit_ackall(call, sp->hdr.serial); - switch (rxrpc_fast_process_data(call, skb, seq)) { + switch (rxrpc_fast_process_data(call, skb, sp->hdr.seq)) { case 0: skb = NULL; goto done; @@ -433,7 +429,7 @@ protocol_error_locked: if (call->state <= RXRPC_CALL_COMPLETE) { call->state = RXRPC_CALL_LOCALLY_ABORTED; call->abort_code = RX_PROTOCOL_ERROR; - set_bit(RXRPC_CALL_ABORT, &call->events); + set_bit(RXRPC_CALL_EV_ABORT, &call->events); rxrpc_queue_call(call); } free_packet_unlock: @@ -481,12 +477,12 @@ static void rxrpc_process_jumbo_packet(struct rxrpc_call *call, if (!pskb_pull(jumbo, sizeof(jhdr))) BUG(); - sp->hdr.seq = htonl(ntohl(sp->hdr.seq) + 1); - sp->hdr.serial = htonl(ntohl(sp->hdr.serial) + 1); + sp->hdr.seq += 1; + sp->hdr.serial += 1; sp->hdr.flags = jhdr.flags; sp->hdr._rsvd = jhdr._rsvd; - _proto("Rx DATA Jumbo %%%u", ntohl(sp->hdr.serial) - 1); + _proto("Rx DATA Jumbo %%%u", sp->hdr.serial - 1); rxrpc_fast_process_packet(call, part); part = NULL; @@ -505,7 +501,7 @@ protocol_error: if (call->state <= RXRPC_CALL_COMPLETE) { call->state = RXRPC_CALL_LOCALLY_ABORTED; call->abort_code = RX_PROTOCOL_ERROR; - set_bit(RXRPC_CALL_ABORT, &call->events); + set_bit(RXRPC_CALL_EV_ABORT, &call->events); rxrpc_queue_call(call); } write_unlock_bh(&call->state_lock); @@ -530,7 +526,7 @@ static void rxrpc_post_packet_to_call(struct rxrpc_call *call, read_lock(&call->state_lock); switch (call->state) { case RXRPC_CALL_LOCALLY_ABORTED: - if (!test_and_set_bit(RXRPC_CALL_ABORT, &call->events)) { + if (!test_and_set_bit(RXRPC_CALL_EV_ABORT, &call->events)) { rxrpc_queue_call(call); goto free_unlock; } @@ -546,7 +542,7 @@ static void rxrpc_post_packet_to_call(struct rxrpc_call *call, /* resend last packet of a completed call */ _debug("final ack again"); rxrpc_get_call(call); - set_bit(RXRPC_CALL_ACK_FINAL, &call->events); + set_bit(RXRPC_CALL_EV_ACK_FINAL, &call->events); rxrpc_queue_call(call); goto free_unlock; default: @@ -607,6 +603,35 @@ static void rxrpc_post_packet_to_local(struct rxrpc_local *local, rxrpc_queue_work(&local->event_processor); } +/* + * Extract the wire header from a packet and translate the byte order. + */ +static noinline +int rxrpc_extract_header(struct rxrpc_skb_priv *sp, struct sk_buff *skb) +{ + struct rxrpc_wire_header whdr; + + /* dig out the RxRPC connection details */ + if (skb_copy_bits(skb, sizeof(struct udphdr), &whdr, sizeof(whdr)) < 0) + return -EBADMSG; + if (!pskb_pull(skb, sizeof(struct udphdr) + sizeof(whdr))) + BUG(); + + memset(sp, 0, sizeof(*sp)); + sp->hdr.epoch = ntohl(whdr.epoch); + sp->hdr.cid = ntohl(whdr.cid); + sp->hdr.callNumber = ntohl(whdr.callNumber); + sp->hdr.seq = ntohl(whdr.seq); + sp->hdr.serial = ntohl(whdr.serial); + sp->hdr.flags = whdr.flags; + sp->hdr.type = whdr.type; + sp->hdr.userStatus = whdr.userStatus; + sp->hdr.securityIndex = whdr.securityIndex; + sp->hdr._rsvd = ntohs(whdr._rsvd); + sp->hdr.serviceId = ntohs(whdr.serviceId); + return 0; +} + static struct rxrpc_connection *rxrpc_conn_from_local(struct rxrpc_local *local, struct sk_buff *skb, struct rxrpc_skb_priv *sp) @@ -686,29 +711,25 @@ void rxrpc_data_ready(struct sock *sk) UDP_INC_STATS_BH(&init_net, UDP_MIB_INDATAGRAMS, 0); - /* the socket buffer we have is owned by UDP, with UDP's data all over - * it, but we really want our own */ + /* The socket buffer we have is owned by UDP, with UDP's data all over + * it, but we really want our own data there. + */ skb_orphan(skb); sp = rxrpc_skb(skb); - memset(sp, 0, sizeof(*sp)); _net("Rx UDP packet from %08x:%04hu", ntohl(ip_hdr(skb)->saddr), ntohs(udp_hdr(skb)->source)); /* dig out the RxRPC connection details */ - if (skb_copy_bits(skb, sizeof(struct udphdr), &sp->hdr, - sizeof(sp->hdr)) < 0) + if (rxrpc_extract_header(sp, skb) < 0) goto bad_message; - if (!pskb_pull(skb, sizeof(struct udphdr) + sizeof(sp->hdr))) - BUG(); _net("Rx RxRPC %s ep=%x call=%x:%x", sp->hdr.flags & RXRPC_CLIENT_INITIATED ? "ToServer" : "ToClient", - ntohl(sp->hdr.epoch), - ntohl(sp->hdr.cid), - ntohl(sp->hdr.callNumber)); + sp->hdr.epoch, sp->hdr.cid, sp->hdr.callNumber); - if (sp->hdr.type == 0 || sp->hdr.type >= RXRPC_N_PACKET_TYPES) { + if (sp->hdr.type >= RXRPC_N_PACKET_TYPES || + !((RXRPC_SUPPORTED_PACKET_TYPES >> sp->hdr.type) & 1)) { _proto("Rx Bad Packet Type %u", sp->hdr.type); goto bad_message; } @@ -737,14 +758,9 @@ void rxrpc_data_ready(struct sock *sk) rxrpc_put_connection(conn); } else { struct rxrpc_call *call; - u8 in_clientflag = 0; - - if (sp->hdr.flags & RXRPC_CLIENT_INITIATED) - in_clientflag = RXRPC_CLIENT_INITIATED; - call = rxrpc_find_call_hash(in_clientflag, sp->hdr.cid, - sp->hdr.callNumber, sp->hdr.epoch, - sp->hdr.serviceId, local, AF_INET, - (u8 *)&ip_hdr(skb)->saddr); + + call = rxrpc_find_call_hash(&sp->hdr, local, + AF_INET, &ip_hdr(skb)->saddr); if (call) rxrpc_post_packet_to_call(call, skb); else @@ -759,7 +775,7 @@ cant_route_call: _debug("can't route call"); if (sp->hdr.flags & RXRPC_CLIENT_INITIATED && sp->hdr.type == RXRPC_PACKET_TYPE_DATA) { - if (sp->hdr.seq == cpu_to_be32(1)) { + if (sp->hdr.seq == 1) { _debug("first packet"); skb_queue_tail(&local->accept_queue, skb); rxrpc_queue_work(&local->acceptor); diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 2934a73a5..cd6cdbe87 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -16,7 +16,7 @@ BUG_ON(atomic_read((X)) >> (sizeof(atomic_t) - 2) == \ (POISON_FREE << 8 | POISON_FREE)) #else -#define CHECK_SLAB_OKAY(X) do {} while(0) +#define CHECK_SLAB_OKAY(X) do {} while (0) #endif #define FCRYPT_BSIZE 8 @@ -70,12 +70,31 @@ struct rxrpc_sock { #define RXRPC_SECURITY_MAX RXRPC_SECURITY_ENCRYPT struct sockaddr_rxrpc srx; /* local address */ sa_family_t proto; /* protocol created with */ - __be16 service_id; /* service ID of local/remote service */ }; #define rxrpc_sk(__sk) container_of((__sk), struct rxrpc_sock, sk) /* + * CPU-byteorder normalised Rx packet header. + */ +struct rxrpc_host_header { + u32 epoch; /* client boot timestamp */ + u32 cid; /* connection and channel ID */ + u32 callNumber; /* call ID (0 for connection-level packets) */ + u32 seq; /* sequence number of pkt in call stream */ + u32 serial; /* serial number of pkt sent to network */ + u8 type; /* packet type */ + u8 flags; /* packet flags */ + u8 userStatus; /* app-layer defined status */ + u8 securityIndex; /* security protocol ID */ + union { + u16 _rsvd; /* reserved */ + u16 cksum; /* kerberos security checksum */ + }; + u16 serviceId; /* service ID */ +} __packed; + +/* * RxRPC socket buffer private variables * - max 48 bytes (struct sk_buff::cb) */ @@ -89,7 +108,7 @@ struct rxrpc_skb_priv { bool need_resend; /* T if needs resending */ }; - struct rxrpc_header hdr; /* RxRPC packet header from this packet */ + struct rxrpc_host_header hdr; /* RxRPC packet header from this packet */ }; #define rxrpc_skb(__skb) ((struct rxrpc_skb_priv *) &(__skb)->cb) @@ -230,7 +249,7 @@ struct rxrpc_conn_bundle { atomic_t usage; int debug_id; /* debug ID for printks */ unsigned short num_conns; /* number of connections in this bundle */ - __be16 service_id; /* service ID */ + u16 service_id; /* Service ID for this bundle */ u8 security_ix; /* security type */ }; @@ -252,7 +271,7 @@ struct rxrpc_connection { struct rxrpc_security *security; /* applied security module */ struct key *key; /* security for this connection (client) */ struct key *server_key; /* security for this service */ - struct crypto_blkcipher *cipher; /* encryption handle */ + struct crypto_skcipher *cipher; /* encryption handle */ struct rxrpc_crypt csum_iv; /* packet checksum base */ unsigned long events; #define RXRPC_CONN_CHALLENGE 0 /* send challenge packet */ @@ -260,7 +279,6 @@ struct rxrpc_connection { rwlock_t lock; /* access lock */ spinlock_t state_lock; /* state-change lock */ atomic_t usage; - u32 real_conn_id; /* connection ID (host-endian) */ enum { /* current state of connection */ RXRPC_CONN_UNUSED, /* - connection not yet attempted */ RXRPC_CONN_CLIENT, /* - client connection */ @@ -282,17 +300,76 @@ struct rxrpc_connection { u8 security_size; /* security header size */ u32 security_level; /* security level negotiated */ u32 security_nonce; /* response re-use preventer */ - - /* the following are all in net order */ - __be32 epoch; /* epoch of this connection */ - __be32 cid; /* connection ID */ - __be16 service_id; /* service ID */ + u32 epoch; /* epoch of this connection */ + u32 cid; /* connection ID */ + u16 service_id; /* service ID for this connection */ u8 security_ix; /* security type */ u8 in_clientflag; /* RXRPC_CLIENT_INITIATED if we are server */ u8 out_clientflag; /* RXRPC_CLIENT_INITIATED if we are client */ }; /* + * Flags in call->flags. + */ +enum rxrpc_call_flag { + RXRPC_CALL_RELEASED, /* call has been released - no more message to userspace */ + RXRPC_CALL_TERMINAL_MSG, /* call has given the socket its final message */ + RXRPC_CALL_RCVD_LAST, /* all packets received */ + RXRPC_CALL_RUN_RTIMER, /* Tx resend timer started */ + RXRPC_CALL_TX_SOFT_ACK, /* sent some soft ACKs */ + RXRPC_CALL_PROC_BUSY, /* the processor is busy */ + RXRPC_CALL_INIT_ACCEPT, /* acceptance was initiated */ + RXRPC_CALL_HAS_USERID, /* has a user ID attached */ + RXRPC_CALL_EXPECT_OOS, /* expect out of sequence packets */ +}; + +/* + * Events that can be raised on a call. + */ +enum rxrpc_call_event { + RXRPC_CALL_EV_RCVD_ACKALL, /* ACKALL or reply received */ + RXRPC_CALL_EV_RCVD_BUSY, /* busy packet received */ + RXRPC_CALL_EV_RCVD_ABORT, /* abort packet received */ + RXRPC_CALL_EV_RCVD_ERROR, /* network error received */ + RXRPC_CALL_EV_ACK_FINAL, /* need to generate final ACK (and release call) */ + RXRPC_CALL_EV_ACK, /* need to generate ACK */ + RXRPC_CALL_EV_REJECT_BUSY, /* need to generate busy message */ + RXRPC_CALL_EV_ABORT, /* need to generate abort */ + RXRPC_CALL_EV_CONN_ABORT, /* local connection abort generated */ + RXRPC_CALL_EV_RESEND_TIMER, /* Tx resend timer expired */ + RXRPC_CALL_EV_RESEND, /* Tx resend required */ + RXRPC_CALL_EV_DRAIN_RX_OOS, /* drain the Rx out of sequence queue */ + RXRPC_CALL_EV_LIFE_TIMER, /* call's lifetimer ran out */ + RXRPC_CALL_EV_ACCEPTED, /* incoming call accepted by userspace app */ + RXRPC_CALL_EV_SECURED, /* incoming call's connection is now secure */ + RXRPC_CALL_EV_POST_ACCEPT, /* need to post an "accept?" message to the app */ + RXRPC_CALL_EV_RELEASE, /* need to release the call's resources */ +}; + +/* + * The states that a call can be in. + */ +enum rxrpc_call_state { + RXRPC_CALL_CLIENT_SEND_REQUEST, /* - client sending request phase */ + RXRPC_CALL_CLIENT_AWAIT_REPLY, /* - client awaiting reply */ + RXRPC_CALL_CLIENT_RECV_REPLY, /* - client receiving reply phase */ + RXRPC_CALL_CLIENT_FINAL_ACK, /* - client sending final ACK phase */ + RXRPC_CALL_SERVER_SECURING, /* - server securing request connection */ + RXRPC_CALL_SERVER_ACCEPTING, /* - server accepting request */ + RXRPC_CALL_SERVER_RECV_REQUEST, /* - server receiving request */ + RXRPC_CALL_SERVER_ACK_REQUEST, /* - server pending ACK of request */ + RXRPC_CALL_SERVER_SEND_REPLY, /* - server sending reply */ + RXRPC_CALL_SERVER_AWAIT_ACK, /* - server awaiting final ACK */ + RXRPC_CALL_COMPLETE, /* - call completed */ + RXRPC_CALL_SERVER_BUSY, /* - call rejected by busy server */ + RXRPC_CALL_REMOTELY_ABORTED, /* - call aborted by peer */ + RXRPC_CALL_LOCALLY_ABORTED, /* - call aborted locally on error or close */ + RXRPC_CALL_NETWORK_ERROR, /* - call terminated by network error */ + RXRPC_CALL_DEAD, /* - call is dead */ + NR__RXRPC_CALL_STATES +}; + +/* * RxRPC call definition * - matched by { connection, call_id } */ @@ -317,57 +394,13 @@ struct rxrpc_call { unsigned long user_call_ID; /* user-defined call ID */ unsigned long creation_jif; /* time of call creation */ unsigned long flags; -#define RXRPC_CALL_RELEASED 0 /* call has been released - no more message to userspace */ -#define RXRPC_CALL_TERMINAL_MSG 1 /* call has given the socket its final message */ -#define RXRPC_CALL_RCVD_LAST 2 /* all packets received */ -#define RXRPC_CALL_RUN_RTIMER 3 /* Tx resend timer started */ -#define RXRPC_CALL_TX_SOFT_ACK 4 /* sent some soft ACKs */ -#define RXRPC_CALL_PROC_BUSY 5 /* the processor is busy */ -#define RXRPC_CALL_INIT_ACCEPT 6 /* acceptance was initiated */ -#define RXRPC_CALL_HAS_USERID 7 /* has a user ID attached */ -#define RXRPC_CALL_EXPECT_OOS 8 /* expect out of sequence packets */ unsigned long events; -#define RXRPC_CALL_RCVD_ACKALL 0 /* ACKALL or reply received */ -#define RXRPC_CALL_RCVD_BUSY 1 /* busy packet received */ -#define RXRPC_CALL_RCVD_ABORT 2 /* abort packet received */ -#define RXRPC_CALL_RCVD_ERROR 3 /* network error received */ -#define RXRPC_CALL_ACK_FINAL 4 /* need to generate final ACK (and release call) */ -#define RXRPC_CALL_ACK 5 /* need to generate ACK */ -#define RXRPC_CALL_REJECT_BUSY 6 /* need to generate busy message */ -#define RXRPC_CALL_ABORT 7 /* need to generate abort */ -#define RXRPC_CALL_CONN_ABORT 8 /* local connection abort generated */ -#define RXRPC_CALL_RESEND_TIMER 9 /* Tx resend timer expired */ -#define RXRPC_CALL_RESEND 10 /* Tx resend required */ -#define RXRPC_CALL_DRAIN_RX_OOS 11 /* drain the Rx out of sequence queue */ -#define RXRPC_CALL_LIFE_TIMER 12 /* call's lifetimer ran out */ -#define RXRPC_CALL_ACCEPTED 13 /* incoming call accepted by userspace app */ -#define RXRPC_CALL_SECURED 14 /* incoming call's connection is now secure */ -#define RXRPC_CALL_POST_ACCEPT 15 /* need to post an "accept?" message to the app */ -#define RXRPC_CALL_RELEASE 16 /* need to release the call's resources */ - spinlock_t lock; rwlock_t state_lock; /* lock for state transition */ atomic_t usage; atomic_t sequence; /* Tx data packet sequence counter */ u32 abort_code; /* local/remote abort code */ - enum { /* current state of call */ - RXRPC_CALL_CLIENT_SEND_REQUEST, /* - client sending request phase */ - RXRPC_CALL_CLIENT_AWAIT_REPLY, /* - client awaiting reply */ - RXRPC_CALL_CLIENT_RECV_REPLY, /* - client receiving reply phase */ - RXRPC_CALL_CLIENT_FINAL_ACK, /* - client sending final ACK phase */ - RXRPC_CALL_SERVER_SECURING, /* - server securing request connection */ - RXRPC_CALL_SERVER_ACCEPTING, /* - server accepting request */ - RXRPC_CALL_SERVER_RECV_REQUEST, /* - server receiving request */ - RXRPC_CALL_SERVER_ACK_REQUEST, /* - server pending ACK of request */ - RXRPC_CALL_SERVER_SEND_REPLY, /* - server sending reply */ - RXRPC_CALL_SERVER_AWAIT_ACK, /* - server awaiting final ACK */ - RXRPC_CALL_COMPLETE, /* - call completed */ - RXRPC_CALL_SERVER_BUSY, /* - call rejected by busy server */ - RXRPC_CALL_REMOTELY_ABORTED, /* - call aborted by peer */ - RXRPC_CALL_LOCALLY_ABORTED, /* - call aborted locally on error or close */ - RXRPC_CALL_NETWORK_ERROR, /* - call terminated by network error */ - RXRPC_CALL_DEAD, /* - call is dead */ - } state; + enum rxrpc_call_state state : 8; /* current state of call */ int debug_id; /* debug ID for printks */ u8 channel; /* connection channel occupied by this call */ @@ -389,9 +422,9 @@ struct rxrpc_call { rxrpc_seq_t rx_data_eaten; /* last data seq ID consumed by recvmsg */ rxrpc_seq_t rx_first_oos; /* first packet in rx_oos_queue (or 0) */ rxrpc_seq_t ackr_win_top; /* top of ACK window (rx_data_eaten is bottom) */ - rxrpc_seq_net_t ackr_prev_seq; /* previous sequence number received */ + rxrpc_seq_t ackr_prev_seq; /* previous sequence number received */ u8 ackr_reason; /* reason to ACK */ - __be32 ackr_serial; /* serial of packet being ACK'd */ + rxrpc_serial_t ackr_serial; /* serial of packet being ACK'd */ atomic_t ackr_not_idle; /* number of packets in Rx queue */ /* received packet records, 1 bit per record */ @@ -403,11 +436,10 @@ struct rxrpc_call { u8 in_clientflag; /* Copy of conn->in_clientflag for hashing */ struct rxrpc_local *local; /* Local endpoint. Used for hashing. */ sa_family_t proto; /* Frame protocol */ - /* the following should all be in net order */ - __be32 cid; /* connection ID + channel index */ - __be32 call_id; /* call ID on connection */ - __be32 epoch; /* epoch of this connection */ - __be16 service_id; /* service ID */ + u32 call_id; /* call ID on connection */ + u32 cid; /* connection ID plus channel index */ + u32 epoch; /* epoch of this connection */ + u16 service_id; /* service ID */ union { /* Peer IP address for hashing */ __be32 ipv4_addr; __u8 ipv6_addr[16]; /* Anticipates eventual IPv6 support */ @@ -423,7 +455,7 @@ static inline void rxrpc_abort_call(struct rxrpc_call *call, u32 abort_code) if (call->state < RXRPC_CALL_COMPLETE) { call->abort_code = abort_code; call->state = RXRPC_CALL_LOCALLY_ABORTED; - set_bit(RXRPC_CALL_ABORT, &call->events); + set_bit(RXRPC_CALL_EV_ABORT, &call->events); } write_unlock_bh(&call->state_lock); } @@ -432,7 +464,7 @@ static inline void rxrpc_abort_call(struct rxrpc_call *call, u32 abort_code) * af_rxrpc.c */ extern atomic_t rxrpc_n_skbs; -extern __be32 rxrpc_epoch; +extern u32 rxrpc_epoch; extern atomic_t rxrpc_debug_id; extern struct workqueue_struct *rxrpc_workqueue; @@ -446,35 +478,35 @@ int rxrpc_reject_call(struct rxrpc_sock *); /* * ar-ack.c */ -extern unsigned rxrpc_requested_ack_delay; -extern unsigned rxrpc_soft_ack_delay; -extern unsigned rxrpc_idle_ack_delay; -extern unsigned rxrpc_rx_window_size; -extern unsigned rxrpc_rx_mtu; -extern unsigned rxrpc_rx_jumbo_max; +extern unsigned int rxrpc_requested_ack_delay; +extern unsigned int rxrpc_soft_ack_delay; +extern unsigned int rxrpc_idle_ack_delay; +extern unsigned int rxrpc_rx_window_size; +extern unsigned int rxrpc_rx_mtu; +extern unsigned int rxrpc_rx_jumbo_max; -void __rxrpc_propose_ACK(struct rxrpc_call *, u8, __be32, bool); -void rxrpc_propose_ACK(struct rxrpc_call *, u8, __be32, bool); +void __rxrpc_propose_ACK(struct rxrpc_call *, u8, u32, bool); +void rxrpc_propose_ACK(struct rxrpc_call *, u8, u32, bool); void rxrpc_process_call(struct work_struct *); /* * ar-call.c */ -extern unsigned rxrpc_max_call_lifetime; -extern unsigned rxrpc_dead_call_expiry; +extern unsigned int rxrpc_max_call_lifetime; +extern unsigned int rxrpc_dead_call_expiry; extern struct kmem_cache *rxrpc_call_jar; extern struct list_head rxrpc_calls; extern rwlock_t rxrpc_call_lock; -struct rxrpc_call *rxrpc_find_call_hash(u8, __be32, __be32, __be32, - __be16, void *, sa_family_t, const u8 *); +struct rxrpc_call *rxrpc_find_call_hash(struct rxrpc_host_header *, + void *, sa_family_t, const void *); struct rxrpc_call *rxrpc_get_client_call(struct rxrpc_sock *, struct rxrpc_transport *, struct rxrpc_conn_bundle *, unsigned long, int, gfp_t); struct rxrpc_call *rxrpc_incoming_call(struct rxrpc_sock *, struct rxrpc_connection *, - struct rxrpc_header *, gfp_t); + struct rxrpc_host_header *, gfp_t); struct rxrpc_call *rxrpc_find_server_call(struct rxrpc_sock *, unsigned long); void rxrpc_release_call(struct rxrpc_call *); void rxrpc_release_calls_on_socket(struct rxrpc_sock *); @@ -484,22 +516,22 @@ void __exit rxrpc_destroy_all_calls(void); /* * ar-connection.c */ -extern unsigned rxrpc_connection_expiry; +extern unsigned int rxrpc_connection_expiry; extern struct list_head rxrpc_connections; extern rwlock_t rxrpc_connection_lock; struct rxrpc_conn_bundle *rxrpc_get_bundle(struct rxrpc_sock *, struct rxrpc_transport *, - struct key *, __be16, gfp_t); + struct key *, u16, gfp_t); void rxrpc_put_bundle(struct rxrpc_transport *, struct rxrpc_conn_bundle *); int rxrpc_connect_call(struct rxrpc_sock *, struct rxrpc_transport *, struct rxrpc_conn_bundle *, struct rxrpc_call *, gfp_t); void rxrpc_put_connection(struct rxrpc_connection *); void __exit rxrpc_destroy_all_connections(void); struct rxrpc_connection *rxrpc_find_connection(struct rxrpc_transport *, - struct rxrpc_header *); + struct rxrpc_host_header *); extern struct rxrpc_connection * -rxrpc_incoming_connection(struct rxrpc_transport *, struct rxrpc_header *, +rxrpc_incoming_connection(struct rxrpc_transport *, struct rxrpc_host_header *, gfp_t); /* @@ -547,7 +579,7 @@ int rxrpc_get_server_data_key(struct rxrpc_connection *, const void *, time_t, /* * ar-output.c */ -extern unsigned rxrpc_resend_timeout; +extern unsigned int rxrpc_resend_timeout; int rxrpc_send_packet(struct rxrpc_transport *, struct sk_buff *); int rxrpc_client_sendmsg(struct rxrpc_sock *, struct rxrpc_transport *, @@ -595,7 +627,7 @@ void rxrpc_packet_destructor(struct sk_buff *); /* * ar-transport.c */ -extern unsigned rxrpc_transport_expiry; +extern unsigned int rxrpc_transport_expiry; struct rxrpc_transport *rxrpc_get_transport(struct rxrpc_local *, struct rxrpc_peer *, gfp_t); @@ -694,7 +726,7 @@ do { \ printk(KERN_ERR "RxRPC: Assertion failed\n"); \ BUG(); \ } \ -} while(0) +} while (0) #define ASSERTCMP(X, OP, Y) \ do { \ @@ -707,7 +739,7 @@ do { \ (unsigned long)(X), (unsigned long)(Y)); \ BUG(); \ } \ -} while(0) +} while (0) #define ASSERTIF(C, X) \ do { \ @@ -716,7 +748,7 @@ do { \ printk(KERN_ERR "RxRPC: Assertion failed\n"); \ BUG(); \ } \ -} while(0) +} while (0) #define ASSERTIFCMP(C, X, OP, Y) \ do { \ @@ -729,25 +761,25 @@ do { \ (unsigned long)(X), (unsigned long)(Y)); \ BUG(); \ } \ -} while(0) +} while (0) #else #define ASSERT(X) \ do { \ -} while(0) +} while (0) #define ASSERTCMP(X, OP, Y) \ do { \ -} while(0) +} while (0) #define ASSERTIF(C, X) \ do { \ -} while(0) +} while (0) #define ASSERTIFCMP(C, X, OP, Y) \ do { \ -} while(0) +} while (0) #endif /* __KDEBUGALL */ @@ -804,9 +836,9 @@ do { \ CHECK_SLAB_OKAY(&(CALL)->usage); \ if (atomic_inc_return(&(CALL)->usage) == 1) \ BUG(); \ -} while(0) +} while (0) #define rxrpc_put_call(CALL) \ do { \ __rxrpc_put_call(CALL); \ -} while(0) +} while (0) diff --git a/net/rxrpc/ar-key.c b/net/rxrpc/ar-key.c index 3f6571651..3fb492eed 100644 --- a/net/rxrpc/ar-key.c +++ b/net/rxrpc/ar-key.c @@ -12,11 +12,11 @@ * "afs@CAMBRIDGE.REDHAT.COM> */ +#include <crypto/skcipher.h> #include <linux/module.h> #include <linux/net.h> #include <linux/skbuff.h> #include <linux/key-type.h> -#include <linux/crypto.h> #include <linux/ctype.h> #include <linux/slab.h> #include <net/sock.h> @@ -824,7 +824,7 @@ static void rxrpc_free_preparse(struct key_preparsed_payload *prep) */ static int rxrpc_preparse_s(struct key_preparsed_payload *prep) { - struct crypto_blkcipher *ci; + struct crypto_skcipher *ci; _enter("%zu", prep->datalen); @@ -833,13 +833,13 @@ static int rxrpc_preparse_s(struct key_preparsed_payload *prep) memcpy(&prep->payload.data[2], prep->data, 8); - ci = crypto_alloc_blkcipher("pcbc(des)", 0, CRYPTO_ALG_ASYNC); + ci = crypto_alloc_skcipher("pcbc(des)", 0, CRYPTO_ALG_ASYNC); if (IS_ERR(ci)) { _leave(" = %ld", PTR_ERR(ci)); return PTR_ERR(ci); } - if (crypto_blkcipher_setkey(ci, prep->data, 8) < 0) + if (crypto_skcipher_setkey(ci, prep->data, 8) < 0) BUG(); prep->payload.data[0] = ci; @@ -853,7 +853,7 @@ static int rxrpc_preparse_s(struct key_preparsed_payload *prep) static void rxrpc_free_preparse_s(struct key_preparsed_payload *prep) { if (prep->payload.data[0]) - crypto_free_blkcipher(prep->payload.data[0]); + crypto_free_skcipher(prep->payload.data[0]); } /* @@ -870,7 +870,7 @@ static void rxrpc_destroy(struct key *key) static void rxrpc_destroy_s(struct key *key) { if (key->payload.data[0]) { - crypto_free_blkcipher(key->payload.data[0]); + crypto_free_skcipher(key->payload.data[0]); key->payload.data[0] = NULL; } } diff --git a/net/rxrpc/ar-local.c b/net/rxrpc/ar-local.c index 78483b460..4e1e6db00 100644 --- a/net/rxrpc/ar-local.c +++ b/net/rxrpc/ar-local.c @@ -323,9 +323,11 @@ void __exit rxrpc_destroy_all_locals(void) * Reply to a version request */ static void rxrpc_send_version_request(struct rxrpc_local *local, - struct rxrpc_header *hdr, + struct rxrpc_host_header *hdr, struct sk_buff *skb) { + struct rxrpc_wire_header whdr; + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); struct sockaddr_in sin; struct msghdr msg; struct kvec iov[2]; @@ -344,15 +346,20 @@ static void rxrpc_send_version_request(struct rxrpc_local *local, msg.msg_controllen = 0; msg.msg_flags = 0; - hdr->seq = 0; - hdr->serial = 0; - hdr->type = RXRPC_PACKET_TYPE_VERSION; - hdr->flags = RXRPC_LAST_PACKET | (~hdr->flags & RXRPC_CLIENT_INITIATED); - hdr->userStatus = 0; - hdr->_rsvd = 0; - - iov[0].iov_base = hdr; - iov[0].iov_len = sizeof(*hdr); + whdr.epoch = htonl(sp->hdr.epoch); + whdr.cid = htonl(sp->hdr.cid); + whdr.callNumber = htonl(sp->hdr.callNumber); + whdr.seq = 0; + whdr.serial = 0; + whdr.type = RXRPC_PACKET_TYPE_VERSION; + whdr.flags = RXRPC_LAST_PACKET | (~hdr->flags & RXRPC_CLIENT_INITIATED); + whdr.userStatus = 0; + whdr.securityIndex = 0; + whdr._rsvd = 0; + whdr.serviceId = htons(sp->hdr.serviceId); + + iov[0].iov_base = &whdr; + iov[0].iov_len = sizeof(whdr); iov[1].iov_base = (char *)rxrpc_version_string; iov[1].iov_len = sizeof(rxrpc_version_string); @@ -383,7 +390,7 @@ static void rxrpc_process_local_events(struct work_struct *work) while ((skb = skb_dequeue(&local->event_queue))) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - kdebug("{%d},{%u}", local->debug_id, sp->hdr.type); + _debug("{%d},{%u}", local->debug_id, sp->hdr.type); switch (sp->hdr.type) { case RXRPC_PACKET_TYPE_VERSION: diff --git a/net/rxrpc/ar-output.c b/net/rxrpc/ar-output.c index 14c4e12c4..d36fb6e1a 100644 --- a/net/rxrpc/ar-output.c +++ b/net/rxrpc/ar-output.c @@ -21,7 +21,7 @@ /* * Time till packet resend (in jiffies). */ -unsigned rxrpc_resend_timeout = 4 * HZ; +unsigned int rxrpc_resend_timeout = 4 * HZ; static int rxrpc_send_data(struct rxrpc_sock *rx, struct rxrpc_call *call, @@ -111,11 +111,11 @@ static void rxrpc_send_abort(struct rxrpc_call *call, u32 abort_code) if (call->state <= RXRPC_CALL_COMPLETE) { call->state = RXRPC_CALL_LOCALLY_ABORTED; call->abort_code = abort_code; - set_bit(RXRPC_CALL_ABORT, &call->events); + set_bit(RXRPC_CALL_EV_ABORT, &call->events); del_timer_sync(&call->resend_timer); del_timer_sync(&call->ack_timer); - clear_bit(RXRPC_CALL_RESEND_TIMER, &call->events); - clear_bit(RXRPC_CALL_ACK, &call->events); + clear_bit(RXRPC_CALL_EV_RESEND_TIMER, &call->events); + clear_bit(RXRPC_CALL_EV_ACK, &call->events); clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags); rxrpc_queue_call(call); } @@ -136,7 +136,7 @@ int rxrpc_client_sendmsg(struct rxrpc_sock *rx, struct rxrpc_transport *trans, struct rxrpc_call *call; unsigned long user_call_ID = 0; struct key *key; - __be16 service_id; + u16 service_id; u32 abort_code = 0; int ret; @@ -151,11 +151,11 @@ int rxrpc_client_sendmsg(struct rxrpc_sock *rx, struct rxrpc_transport *trans, bundle = NULL; if (trans) { - service_id = rx->service_id; + service_id = rx->srx.srx_service; if (msg->msg_name) { DECLARE_SOCKADDR(struct sockaddr_rxrpc *, srx, msg->msg_name); - service_id = htons(srx->srx_service); + service_id = srx->srx_service; } key = rx->key; if (key && !rx->key->payload.data[0]) @@ -348,7 +348,7 @@ int rxrpc_send_packet(struct rxrpc_transport *trans, struct sk_buff *skb) /* send the packet with the don't fragment bit set if we currently * think it's small enough */ - if (skb->len - sizeof(struct rxrpc_header) < trans->peer->maxdata) { + if (skb->len - sizeof(struct rxrpc_wire_header) < trans->peer->maxdata) { down_read(&trans->local->defrag_sem); /* send the packet by UDP * - returns -EMSGSIZE if UDP would have to fragment the packet @@ -401,7 +401,8 @@ static int rxrpc_wait_for_tx_window(struct rxrpc_sock *rx, int ret; _enter(",{%d},%ld", - CIRC_SPACE(call->acks_head, call->acks_tail, call->acks_winsz), + CIRC_SPACE(call->acks_head, ACCESS_ONCE(call->acks_tail), + call->acks_winsz), *timeo); add_wait_queue(&call->tx_waitq, &myself); @@ -409,7 +410,7 @@ static int rxrpc_wait_for_tx_window(struct rxrpc_sock *rx, for (;;) { set_current_state(TASK_INTERRUPTIBLE); ret = 0; - if (CIRC_SPACE(call->acks_head, call->acks_tail, + if (CIRC_SPACE(call->acks_head, ACCESS_ONCE(call->acks_tail), call->acks_winsz) > 0) break; if (signal_pending(current)) { @@ -437,7 +438,7 @@ static inline void rxrpc_instant_resend(struct rxrpc_call *call) if (try_to_del_timer_sync(&call->resend_timer) >= 0) { clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags); if (call->state < RXRPC_CALL_COMPLETE && - !test_and_set_bit(RXRPC_CALL_RESEND_TIMER, &call->events)) + !test_and_set_bit(RXRPC_CALL_EV_RESEND_TIMER, &call->events)) rxrpc_queue_call(call); } read_unlock_bh(&call->state_lock); @@ -480,8 +481,7 @@ static void rxrpc_queue_packet(struct rxrpc_call *call, struct sk_buff *skb, write_unlock_bh(&call->state_lock); } - _proto("Tx DATA %%%u { #%u }", - ntohl(sp->hdr.serial), ntohl(sp->hdr.seq)); + _proto("Tx DATA %%%u { #%u }", sp->hdr.serial, sp->hdr.seq); sp->need_resend = false; sp->resend_at = jiffies + rxrpc_resend_timeout; @@ -513,6 +513,29 @@ static void rxrpc_queue_packet(struct rxrpc_call *call, struct sk_buff *skb, } /* + * Convert a host-endian header into a network-endian header. + */ +static void rxrpc_insert_header(struct sk_buff *skb) +{ + struct rxrpc_wire_header whdr; + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + + whdr.epoch = htonl(sp->hdr.epoch); + whdr.cid = htonl(sp->hdr.cid); + whdr.callNumber = htonl(sp->hdr.callNumber); + whdr.seq = htonl(sp->hdr.seq); + whdr.serial = htonl(sp->hdr.serial); + whdr.type = sp->hdr.type; + whdr.flags = sp->hdr.flags; + whdr.userStatus = sp->hdr.userStatus; + whdr.securityIndex = sp->hdr.securityIndex; + whdr._rsvd = htons(sp->hdr._rsvd); + whdr.serviceId = htons(sp->hdr.serviceId); + + memcpy(skb->head, &whdr, sizeof(whdr)); +} + +/* * send data through a socket * - must be called in process context * - caller holds the socket locked @@ -548,7 +571,8 @@ static int rxrpc_send_data(struct rxrpc_sock *rx, _debug("alloc"); - if (CIRC_SPACE(call->acks_head, call->acks_tail, + if (CIRC_SPACE(call->acks_head, + ACCESS_ONCE(call->acks_tail), call->acks_winsz) <= 0) { ret = -EAGAIN; if (msg->msg_flags & MSG_DONTWAIT) @@ -650,22 +674,22 @@ static int rxrpc_send_data(struct rxrpc_sock *rx, seq = atomic_inc_return(&call->sequence); - sp->hdr.epoch = conn->epoch; - sp->hdr.cid = call->cid; + sp->hdr.epoch = conn->epoch; + sp->hdr.cid = call->cid; sp->hdr.callNumber = call->call_id; - sp->hdr.seq = htonl(seq); - sp->hdr.serial = - htonl(atomic_inc_return(&conn->serial)); - sp->hdr.type = RXRPC_PACKET_TYPE_DATA; + sp->hdr.seq = seq; + sp->hdr.serial = atomic_inc_return(&conn->serial); + sp->hdr.type = RXRPC_PACKET_TYPE_DATA; sp->hdr.userStatus = 0; sp->hdr.securityIndex = conn->security_ix; - sp->hdr._rsvd = 0; - sp->hdr.serviceId = conn->service_id; + sp->hdr._rsvd = 0; + sp->hdr.serviceId = call->service_id; sp->hdr.flags = conn->out_clientflag; if (msg_data_left(msg) == 0 && !more) sp->hdr.flags |= RXRPC_LAST_PACKET; - else if (CIRC_SPACE(call->acks_head, call->acks_tail, + else if (CIRC_SPACE(call->acks_head, + ACCESS_ONCE(call->acks_tail), call->acks_winsz) > 1) sp->hdr.flags |= RXRPC_MORE_PACKETS; if (more && seq & 1) @@ -673,12 +697,11 @@ static int rxrpc_send_data(struct rxrpc_sock *rx, ret = rxrpc_secure_packet( call, skb, skb->mark, - skb->head + sizeof(struct rxrpc_header)); + skb->head + sizeof(struct rxrpc_wire_header)); if (ret < 0) goto out; - memcpy(skb->head, &sp->hdr, - sizeof(struct rxrpc_header)); + rxrpc_insert_header(skb); rxrpc_queue_packet(call, skb, !msg_data_left(msg) && !more); skb = NULL; } diff --git a/net/rxrpc/ar-peer.c b/net/rxrpc/ar-peer.c index bebaa4348..dc089b197 100644 --- a/net/rxrpc/ar-peer.c +++ b/net/rxrpc/ar-peer.c @@ -92,7 +92,7 @@ static struct rxrpc_peer *rxrpc_alloc_peer(struct sockaddr_rxrpc *srx, BUG(); } - peer->hdrsize += sizeof(struct rxrpc_header); + peer->hdrsize += sizeof(struct rxrpc_wire_header); peer->maxdata = peer->mtu - peer->hdrsize; } diff --git a/net/rxrpc/ar-proc.c b/net/rxrpc/ar-proc.c index 38047f713..525b2ba5a 100644 --- a/net/rxrpc/ar-proc.c +++ b/net/rxrpc/ar-proc.c @@ -74,9 +74,9 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v) " %-8.8s %08x %lx\n", lbuff, rbuff, - ntohs(call->conn->service_id), - ntohl(call->conn->cid), - ntohl(call->call_id), + call->conn->service_id, + call->cid, + call->call_id, call->conn->in_clientflag ? "Svc" : "Clt", atomic_read(&call->usage), rxrpc_call_states[call->state], @@ -157,8 +157,8 @@ static int rxrpc_connection_seq_show(struct seq_file *seq, void *v) " %s %08x %08x %08x\n", lbuff, rbuff, - ntohs(conn->service_id), - ntohl(conn->cid), + conn->service_id, + conn->cid, conn->call_counter, conn->in_clientflag ? "Svc" : "Clt", atomic_read(&conn->usage), diff --git a/net/rxrpc/ar-recvmsg.c b/net/rxrpc/ar-recvmsg.c index b92beded7..64facba24 100644 --- a/net/rxrpc/ar-recvmsg.c +++ b/net/rxrpc/ar-recvmsg.c @@ -33,7 +33,7 @@ void rxrpc_remove_user_ID(struct rxrpc_sock *rx, struct rxrpc_call *call) read_lock_bh(&call->state_lock); if (!test_bit(RXRPC_CALL_RELEASED, &call->flags) && - !test_and_set_bit(RXRPC_CALL_RELEASE, &call->events)) + !test_and_set_bit(RXRPC_CALL_EV_RELEASE, &call->events)) rxrpc_queue_call(call); read_unlock_bh(&call->state_lock); } @@ -158,7 +158,7 @@ int rxrpc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, goto receive_non_data_message; _debug("recvmsg DATA #%u { %d, %d }", - ntohl(sp->hdr.seq), skb->len, sp->offset); + sp->hdr.seq, skb->len, sp->offset); if (!continue_call) { /* only set the control data once per recvmsg() */ @@ -169,11 +169,11 @@ int rxrpc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, ASSERT(test_bit(RXRPC_CALL_HAS_USERID, &call->flags)); } - ASSERTCMP(ntohl(sp->hdr.seq), >=, call->rx_data_recv); - ASSERTCMP(ntohl(sp->hdr.seq), <=, call->rx_data_recv + 1); - call->rx_data_recv = ntohl(sp->hdr.seq); + ASSERTCMP(sp->hdr.seq, >=, call->rx_data_recv); + ASSERTCMP(sp->hdr.seq, <=, call->rx_data_recv + 1); + call->rx_data_recv = sp->hdr.seq; - ASSERTCMP(ntohl(sp->hdr.seq), >, call->rx_data_eaten); + ASSERTCMP(sp->hdr.seq, >, call->rx_data_eaten); offset = sp->offset; copy = skb->len - offset; @@ -364,11 +364,11 @@ void rxrpc_kernel_data_delivered(struct sk_buff *skb) struct rxrpc_skb_priv *sp = rxrpc_skb(skb); struct rxrpc_call *call = sp->call; - ASSERTCMP(ntohl(sp->hdr.seq), >=, call->rx_data_recv); - ASSERTCMP(ntohl(sp->hdr.seq), <=, call->rx_data_recv + 1); - call->rx_data_recv = ntohl(sp->hdr.seq); + ASSERTCMP(sp->hdr.seq, >=, call->rx_data_recv); + ASSERTCMP(sp->hdr.seq, <=, call->rx_data_recv + 1); + call->rx_data_recv = sp->hdr.seq; - ASSERTCMP(ntohl(sp->hdr.seq), >, call->rx_data_eaten); + ASSERTCMP(sp->hdr.seq, >, call->rx_data_eaten); rxrpc_free_skb(skb); } diff --git a/net/rxrpc/ar-security.c b/net/rxrpc/ar-security.c index 8334474eb..ceff6394a 100644 --- a/net/rxrpc/ar-security.c +++ b/net/rxrpc/ar-security.c @@ -167,11 +167,11 @@ int rxrpc_init_server_conn_security(struct rxrpc_connection *conn) struct rxrpc_sock *rx; struct key *key; key_ref_t kref; - char kdesc[5+1+3+1]; + char kdesc[5 + 1 + 3 + 1]; _enter(""); - sprintf(kdesc, "%u:%u", ntohs(conn->service_id), conn->security_ix); + sprintf(kdesc, "%u:%u", conn->service_id, conn->security_ix); sec = rxrpc_security_lookup(conn->security_ix); if (!sec) { @@ -182,7 +182,7 @@ int rxrpc_init_server_conn_security(struct rxrpc_connection *conn) /* find the service */ read_lock_bh(&local->services_lock); list_for_each_entry(rx, &local->services, listen_link) { - if (rx->service_id == conn->service_id) + if (rx->srx.srx_service == conn->service_id) goto found_service; } diff --git a/net/rxrpc/ar-skbuff.c b/net/rxrpc/ar-skbuff.c index 4cfab49e3..62a267472 100644 --- a/net/rxrpc/ar-skbuff.c +++ b/net/rxrpc/ar-skbuff.c @@ -34,7 +34,7 @@ static void rxrpc_request_final_ACK(struct rxrpc_call *call) /* get an extra ref on the call for the final-ACK generator to * release */ rxrpc_get_call(call); - set_bit(RXRPC_CALL_ACK_FINAL, &call->events); + set_bit(RXRPC_CALL_EV_ACK_FINAL, &call->events); if (try_to_del_timer_sync(&call->ack_timer) >= 0) rxrpc_queue_call(call); break; @@ -59,7 +59,7 @@ static void rxrpc_hard_ACK_data(struct rxrpc_call *call, spin_lock_bh(&call->lock); - _debug("hard ACK #%u", ntohl(sp->hdr.seq)); + _debug("hard ACK #%u", sp->hdr.seq); for (loop = 0; loop < RXRPC_ACKR_WINDOW_ASZ; loop++) { call->ackr_window[loop] >>= 1; @@ -67,7 +67,7 @@ static void rxrpc_hard_ACK_data(struct rxrpc_call *call, call->ackr_window[loop + 1] << (BITS_PER_LONG - 1); } - seq = ntohl(sp->hdr.seq); + seq = sp->hdr.seq; ASSERTCMP(seq, ==, call->rx_data_eaten + 1); call->rx_data_eaten = seq; @@ -133,5 +133,4 @@ void rxrpc_kernel_free_skb(struct sk_buff *skb) { rxrpc_free_skb(skb); } - EXPORT_SYMBOL(rxrpc_kernel_free_skb); diff --git a/net/rxrpc/ar-transport.c b/net/rxrpc/ar-transport.c index 9946467f1..66a1a5676 100644 --- a/net/rxrpc/ar-transport.c +++ b/net/rxrpc/ar-transport.c @@ -20,7 +20,7 @@ /* * Time after last use at which transport record is cleaned up. */ -unsigned rxrpc_transport_expiry = 3600 * 24; +unsigned int rxrpc_transport_expiry = 3600 * 24; static void rxrpc_transport_reaper(struct work_struct *work); @@ -51,6 +51,7 @@ static struct rxrpc_transport *rxrpc_alloc_transport(struct rxrpc_local *local, spin_lock_init(&trans->client_lock); rwlock_init(&trans->conn_lock); atomic_set(&trans->usage, 1); + trans->conn_idcounter = peer->srx.srx_service << 16; trans->debug_id = atomic_inc_return(&rxrpc_debug_id); if (peer->srx.transport.family == AF_INET) { diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c index d7a9ab5a9..f0aeb8163 100644 --- a/net/rxrpc/rxkad.c +++ b/net/rxrpc/rxkad.c @@ -9,11 +9,11 @@ * 2 of the License, or (at your option) any later version. */ +#include <crypto/skcipher.h> #include <linux/module.h> #include <linux/net.h> #include <linux/skbuff.h> #include <linux/udp.h> -#include <linux/crypto.h> #include <linux/scatterlist.h> #include <linux/ctype.h> #include <linux/slab.h> @@ -53,7 +53,7 @@ MODULE_LICENSE("GPL"); * alloc routine, but since we have it to hand, we use it to decrypt RESPONSE * packets */ -static struct crypto_blkcipher *rxkad_ci; +static struct crypto_skcipher *rxkad_ci; static DEFINE_MUTEX(rxkad_ci_mutex); /* @@ -61,7 +61,7 @@ static DEFINE_MUTEX(rxkad_ci_mutex); */ static int rxkad_init_connection_security(struct rxrpc_connection *conn) { - struct crypto_blkcipher *ci; + struct crypto_skcipher *ci; struct rxrpc_key_token *token; int ret; @@ -70,15 +70,15 @@ static int rxkad_init_connection_security(struct rxrpc_connection *conn) token = conn->key->payload.data[0]; conn->security_ix = token->security_index; - ci = crypto_alloc_blkcipher("pcbc(fcrypt)", 0, CRYPTO_ALG_ASYNC); + ci = crypto_alloc_skcipher("pcbc(fcrypt)", 0, CRYPTO_ALG_ASYNC); if (IS_ERR(ci)) { _debug("no cipher"); ret = PTR_ERR(ci); goto error; } - if (crypto_blkcipher_setkey(ci, token->kad->session_key, - sizeof(token->kad->session_key)) < 0) + if (crypto_skcipher_setkey(ci, token->kad->session_key, + sizeof(token->kad->session_key)) < 0) BUG(); switch (conn->security_level) { @@ -113,7 +113,7 @@ error: static void rxkad_prime_packet_security(struct rxrpc_connection *conn) { struct rxrpc_key_token *token; - struct blkcipher_desc desc; + SKCIPHER_REQUEST_ON_STACK(req, conn->cipher); struct scatterlist sg[2]; struct rxrpc_crypt iv; struct { @@ -128,21 +128,23 @@ static void rxkad_prime_packet_security(struct rxrpc_connection *conn) token = conn->key->payload.data[0]; memcpy(&iv, token->kad->session_key, sizeof(iv)); - desc.tfm = conn->cipher; - desc.info = iv.x; - desc.flags = 0; - - tmpbuf.x[0] = conn->epoch; - tmpbuf.x[1] = conn->cid; + tmpbuf.x[0] = htonl(conn->epoch); + tmpbuf.x[1] = htonl(conn->cid); tmpbuf.x[2] = 0; tmpbuf.x[3] = htonl(conn->security_ix); sg_init_one(&sg[0], &tmpbuf, sizeof(tmpbuf)); sg_init_one(&sg[1], &tmpbuf, sizeof(tmpbuf)); - crypto_blkcipher_encrypt_iv(&desc, &sg[0], &sg[1], sizeof(tmpbuf)); + + skcipher_request_set_tfm(req, conn->cipher); + skcipher_request_set_callback(req, 0, NULL, NULL); + skcipher_request_set_crypt(req, &sg[1], &sg[0], sizeof(tmpbuf), iv.x); + + crypto_skcipher_encrypt(req); + skcipher_request_zero(req); memcpy(&conn->csum_iv, &tmpbuf.x[2], sizeof(conn->csum_iv)); - ASSERTCMP(conn->csum_iv.n[0], ==, tmpbuf.x[2]); + ASSERTCMP((u32 __force)conn->csum_iv.n[0], ==, (u32 __force)tmpbuf.x[2]); _leave(""); } @@ -156,7 +158,7 @@ static int rxkad_secure_packet_auth(const struct rxrpc_call *call, void *sechdr) { struct rxrpc_skb_priv *sp; - struct blkcipher_desc desc; + SKCIPHER_REQUEST_ON_STACK(req, call->conn->cipher); struct rxrpc_crypt iv; struct scatterlist sg[2]; struct { @@ -169,21 +171,24 @@ static int rxkad_secure_packet_auth(const struct rxrpc_call *call, _enter(""); - check = ntohl(sp->hdr.seq ^ sp->hdr.callNumber); - data_size |= (u32) check << 16; + check = sp->hdr.seq ^ sp->hdr.callNumber; + data_size |= (u32)check << 16; tmpbuf.hdr.data_size = htonl(data_size); memcpy(&tmpbuf.first, sechdr + 4, sizeof(tmpbuf.first)); /* start the encryption afresh */ memset(&iv, 0, sizeof(iv)); - desc.tfm = call->conn->cipher; - desc.info = iv.x; - desc.flags = 0; sg_init_one(&sg[0], &tmpbuf, sizeof(tmpbuf)); sg_init_one(&sg[1], &tmpbuf, sizeof(tmpbuf)); - crypto_blkcipher_encrypt_iv(&desc, &sg[0], &sg[1], sizeof(tmpbuf)); + + skcipher_request_set_tfm(req, call->conn->cipher); + skcipher_request_set_callback(req, 0, NULL, NULL); + skcipher_request_set_crypt(req, &sg[1], &sg[0], sizeof(tmpbuf), iv.x); + + crypto_skcipher_encrypt(req); + skcipher_request_zero(req); memcpy(sechdr, &tmpbuf, sizeof(tmpbuf)); @@ -195,81 +200,91 @@ static int rxkad_secure_packet_auth(const struct rxrpc_call *call, * wholly encrypt a packet (level 2 security) */ static int rxkad_secure_packet_encrypt(const struct rxrpc_call *call, - struct sk_buff *skb, - u32 data_size, - void *sechdr) + struct sk_buff *skb, + u32 data_size, + void *sechdr) { const struct rxrpc_key_token *token; struct rxkad_level2_hdr rxkhdr __attribute__((aligned(8))); /* must be all on one page */ struct rxrpc_skb_priv *sp; - struct blkcipher_desc desc; + SKCIPHER_REQUEST_ON_STACK(req, call->conn->cipher); struct rxrpc_crypt iv; struct scatterlist sg[16]; struct sk_buff *trailer; unsigned int len; u16 check; int nsg; + int err; sp = rxrpc_skb(skb); _enter(""); - check = ntohl(sp->hdr.seq ^ sp->hdr.callNumber); + check = sp->hdr.seq ^ sp->hdr.callNumber; - rxkhdr.data_size = htonl(data_size | (u32) check << 16); + rxkhdr.data_size = htonl(data_size | (u32)check << 16); rxkhdr.checksum = 0; /* encrypt from the session key */ token = call->conn->key->payload.data[0]; memcpy(&iv, token->kad->session_key, sizeof(iv)); - desc.tfm = call->conn->cipher; - desc.info = iv.x; - desc.flags = 0; sg_init_one(&sg[0], sechdr, sizeof(rxkhdr)); sg_init_one(&sg[1], &rxkhdr, sizeof(rxkhdr)); - crypto_blkcipher_encrypt_iv(&desc, &sg[0], &sg[1], sizeof(rxkhdr)); + + skcipher_request_set_tfm(req, call->conn->cipher); + skcipher_request_set_callback(req, 0, NULL, NULL); + skcipher_request_set_crypt(req, &sg[1], &sg[0], sizeof(rxkhdr), iv.x); + + crypto_skcipher_encrypt(req); /* we want to encrypt the skbuff in-place */ nsg = skb_cow_data(skb, 0, &trailer); + err = -ENOMEM; if (nsg < 0 || nsg > 16) - return -ENOMEM; + goto out; len = data_size + call->conn->size_align - 1; len &= ~(call->conn->size_align - 1); sg_init_table(sg, nsg); skb_to_sgvec(skb, sg, 0, len); - crypto_blkcipher_encrypt_iv(&desc, sg, sg, len); + + skcipher_request_set_crypt(req, sg, sg, len, iv.x); + + crypto_skcipher_encrypt(req); _leave(" = 0"); - return 0; + err = 0; + +out: + skcipher_request_zero(req); + return err; } /* * checksum an RxRPC packet header */ static int rxkad_secure_packet(const struct rxrpc_call *call, - struct sk_buff *skb, - size_t data_size, - void *sechdr) + struct sk_buff *skb, + size_t data_size, + void *sechdr) { struct rxrpc_skb_priv *sp; - struct blkcipher_desc desc; + SKCIPHER_REQUEST_ON_STACK(req, call->conn->cipher); struct rxrpc_crypt iv; struct scatterlist sg[2]; struct { __be32 x[2]; } tmpbuf __attribute__((aligned(8))); /* must all be in same page */ - __be32 x; - u32 y; + u32 x, y; int ret; sp = rxrpc_skb(skb); _enter("{%d{%x}},{#%u},%zu,", - call->debug_id, key_serial(call->conn->key), ntohl(sp->hdr.seq), + call->debug_id, key_serial(call->conn->key), sp->hdr.seq, data_size); if (!call->conn->cipher) @@ -281,25 +296,28 @@ static int rxkad_secure_packet(const struct rxrpc_call *call, /* continue encrypting from where we left off */ memcpy(&iv, call->conn->csum_iv.x, sizeof(iv)); - desc.tfm = call->conn->cipher; - desc.info = iv.x; - desc.flags = 0; /* calculate the security checksum */ - x = htonl(call->channel << (32 - RXRPC_CIDSHIFT)); - x |= sp->hdr.seq & cpu_to_be32(0x3fffffff); - tmpbuf.x[0] = sp->hdr.callNumber; - tmpbuf.x[1] = x; + x = call->channel << (32 - RXRPC_CIDSHIFT); + x |= sp->hdr.seq & 0x3fffffff; + tmpbuf.x[0] = htonl(sp->hdr.callNumber); + tmpbuf.x[1] = htonl(x); sg_init_one(&sg[0], &tmpbuf, sizeof(tmpbuf)); sg_init_one(&sg[1], &tmpbuf, sizeof(tmpbuf)); - crypto_blkcipher_encrypt_iv(&desc, &sg[0], &sg[1], sizeof(tmpbuf)); + + skcipher_request_set_tfm(req, call->conn->cipher); + skcipher_request_set_callback(req, 0, NULL, NULL); + skcipher_request_set_crypt(req, &sg[1], &sg[0], sizeof(tmpbuf), iv.x); + + crypto_skcipher_encrypt(req); + skcipher_request_zero(req); y = ntohl(tmpbuf.x[1]); y = (y >> 16) & 0xffff; if (y == 0) y = 1; /* zero checksums are not permitted */ - sp->hdr.cksum = htons(y); + sp->hdr.cksum = y; switch (call->conn->security_level) { case RXRPC_SECURITY_PLAIN: @@ -330,7 +348,7 @@ static int rxkad_verify_packet_auth(const struct rxrpc_call *call, { struct rxkad_level1_hdr sechdr; struct rxrpc_skb_priv *sp; - struct blkcipher_desc desc; + SKCIPHER_REQUEST_ON_STACK(req, call->conn->cipher); struct rxrpc_crypt iv; struct scatterlist sg[16]; struct sk_buff *trailer; @@ -352,11 +370,13 @@ static int rxkad_verify_packet_auth(const struct rxrpc_call *call, /* start the decryption afresh */ memset(&iv, 0, sizeof(iv)); - desc.tfm = call->conn->cipher; - desc.info = iv.x; - desc.flags = 0; - crypto_blkcipher_decrypt_iv(&desc, sg, sg, 8); + skcipher_request_set_tfm(req, call->conn->cipher); + skcipher_request_set_callback(req, 0, NULL, NULL); + skcipher_request_set_crypt(req, sg, sg, 8, iv.x); + + crypto_skcipher_decrypt(req); + skcipher_request_zero(req); /* remove the decrypted packet length */ if (skb_copy_bits(skb, 0, &sechdr, sizeof(sechdr)) < 0) @@ -368,7 +388,7 @@ static int rxkad_verify_packet_auth(const struct rxrpc_call *call, data_size = buf & 0xffff; check = buf >> 16; - check ^= ntohl(sp->hdr.seq ^ sp->hdr.callNumber); + check ^= sp->hdr.seq ^ sp->hdr.callNumber; check &= 0xffff; if (check != 0) { *_abort_code = RXKADSEALEDINCON; @@ -405,7 +425,7 @@ static int rxkad_verify_packet_encrypt(const struct rxrpc_call *call, const struct rxrpc_key_token *token; struct rxkad_level2_hdr sechdr; struct rxrpc_skb_priv *sp; - struct blkcipher_desc desc; + SKCIPHER_REQUEST_ON_STACK(req, call->conn->cipher); struct rxrpc_crypt iv; struct scatterlist _sg[4], *sg; struct sk_buff *trailer; @@ -435,11 +455,13 @@ static int rxkad_verify_packet_encrypt(const struct rxrpc_call *call, /* decrypt from the session key */ token = call->conn->key->payload.data[0]; memcpy(&iv, token->kad->session_key, sizeof(iv)); - desc.tfm = call->conn->cipher; - desc.info = iv.x; - desc.flags = 0; - crypto_blkcipher_decrypt_iv(&desc, sg, sg, skb->len); + skcipher_request_set_tfm(req, call->conn->cipher); + skcipher_request_set_callback(req, 0, NULL, NULL); + skcipher_request_set_crypt(req, sg, sg, skb->len, iv.x); + + crypto_skcipher_decrypt(req); + skcipher_request_zero(req); if (sg != _sg) kfree(sg); @@ -453,7 +475,7 @@ static int rxkad_verify_packet_encrypt(const struct rxrpc_call *call, data_size = buf & 0xffff; check = buf >> 16; - check ^= ntohl(sp->hdr.seq ^ sp->hdr.callNumber); + check ^= sp->hdr.seq ^ sp->hdr.callNumber; check &= 0xffff; if (check != 0) { *_abort_code = RXKADSEALEDINCON; @@ -487,23 +509,21 @@ static int rxkad_verify_packet(const struct rxrpc_call *call, struct sk_buff *skb, u32 *_abort_code) { - struct blkcipher_desc desc; + SKCIPHER_REQUEST_ON_STACK(req, call->conn->cipher); struct rxrpc_skb_priv *sp; struct rxrpc_crypt iv; struct scatterlist sg[2]; struct { __be32 x[2]; } tmpbuf __attribute__((aligned(8))); /* must all be in same page */ - __be32 x; - __be16 cksum; - u32 y; + u16 cksum; + u32 x, y; int ret; sp = rxrpc_skb(skb); _enter("{%d{%x}},{#%u}", - call->debug_id, key_serial(call->conn->key), - ntohl(sp->hdr.seq)); + call->debug_id, key_serial(call->conn->key), sp->hdr.seq); if (!call->conn->cipher) return 0; @@ -516,26 +536,28 @@ static int rxkad_verify_packet(const struct rxrpc_call *call, /* continue encrypting from where we left off */ memcpy(&iv, call->conn->csum_iv.x, sizeof(iv)); - desc.tfm = call->conn->cipher; - desc.info = iv.x; - desc.flags = 0; /* validate the security checksum */ - x = htonl(call->channel << (32 - RXRPC_CIDSHIFT)); - x |= sp->hdr.seq & cpu_to_be32(0x3fffffff); - tmpbuf.x[0] = call->call_id; - tmpbuf.x[1] = x; + x = call->channel << (32 - RXRPC_CIDSHIFT); + x |= sp->hdr.seq & 0x3fffffff; + tmpbuf.x[0] = htonl(call->call_id); + tmpbuf.x[1] = htonl(x); sg_init_one(&sg[0], &tmpbuf, sizeof(tmpbuf)); sg_init_one(&sg[1], &tmpbuf, sizeof(tmpbuf)); - crypto_blkcipher_encrypt_iv(&desc, &sg[0], &sg[1], sizeof(tmpbuf)); + + skcipher_request_set_tfm(req, call->conn->cipher); + skcipher_request_set_callback(req, 0, NULL, NULL); + skcipher_request_set_crypt(req, &sg[1], &sg[0], sizeof(tmpbuf), iv.x); + + crypto_skcipher_encrypt(req); + skcipher_request_zero(req); y = ntohl(tmpbuf.x[1]); - y = (y >> 16) & 0xffff; - if (y == 0) - y = 1; /* zero checksums are not permitted */ + cksum = (y >> 16) & 0xffff; + if (cksum == 0) + cksum = 1; /* zero checksums are not permitted */ - cksum = htons(y); if (sp->hdr.cksum != cksum) { *_abort_code = RXKADSEALEDINCON; _leave(" = -EPROTO [csum failed]"); @@ -567,10 +589,11 @@ static int rxkad_verify_packet(const struct rxrpc_call *call, static int rxkad_issue_challenge(struct rxrpc_connection *conn) { struct rxkad_challenge challenge; - struct rxrpc_header hdr; + struct rxrpc_wire_header whdr; struct msghdr msg; struct kvec iov[2]; size_t len; + u32 serial; int ret; _enter("{%d,%x}", conn->debug_id, key_serial(conn->key)); @@ -592,26 +615,27 @@ static int rxkad_issue_challenge(struct rxrpc_connection *conn) msg.msg_controllen = 0; msg.msg_flags = 0; - hdr.epoch = conn->epoch; - hdr.cid = conn->cid; - hdr.callNumber = 0; - hdr.seq = 0; - hdr.type = RXRPC_PACKET_TYPE_CHALLENGE; - hdr.flags = conn->out_clientflag; - hdr.userStatus = 0; - hdr.securityIndex = conn->security_ix; - hdr._rsvd = 0; - hdr.serviceId = conn->service_id; - - iov[0].iov_base = &hdr; - iov[0].iov_len = sizeof(hdr); + whdr.epoch = htonl(conn->epoch); + whdr.cid = htonl(conn->cid); + whdr.callNumber = 0; + whdr.seq = 0; + whdr.type = RXRPC_PACKET_TYPE_CHALLENGE; + whdr.flags = conn->out_clientflag; + whdr.userStatus = 0; + whdr.securityIndex = conn->security_ix; + whdr._rsvd = 0; + whdr.serviceId = htons(conn->service_id); + + iov[0].iov_base = &whdr; + iov[0].iov_len = sizeof(whdr); iov[1].iov_base = &challenge; iov[1].iov_len = sizeof(challenge); len = iov[0].iov_len + iov[1].iov_len; - hdr.serial = htonl(atomic_inc_return(&conn->serial)); - _proto("Tx CHALLENGE %%%u", ntohl(hdr.serial)); + serial = atomic_inc_return(&conn->serial); + whdr.serial = htonl(serial); + _proto("Tx CHALLENGE %%%u", serial); ret = kernel_sendmsg(conn->trans->local->socket, &msg, iov, 2, len); if (ret < 0) { @@ -627,13 +651,15 @@ static int rxkad_issue_challenge(struct rxrpc_connection *conn) * send a Kerberos security response */ static int rxkad_send_response(struct rxrpc_connection *conn, - struct rxrpc_header *hdr, + struct rxrpc_host_header *hdr, struct rxkad_response *resp, const struct rxkad_key *s2) { + struct rxrpc_wire_header whdr; struct msghdr msg; struct kvec iov[3]; size_t len; + u32 serial; int ret; _enter(""); @@ -644,24 +670,26 @@ static int rxkad_send_response(struct rxrpc_connection *conn, msg.msg_controllen = 0; msg.msg_flags = 0; - hdr->epoch = conn->epoch; - hdr->seq = 0; - hdr->type = RXRPC_PACKET_TYPE_RESPONSE; - hdr->flags = conn->out_clientflag; - hdr->userStatus = 0; - hdr->_rsvd = 0; + memset(&whdr, 0, sizeof(whdr)); + whdr.epoch = htonl(hdr->epoch); + whdr.cid = htonl(hdr->cid); + whdr.type = RXRPC_PACKET_TYPE_RESPONSE; + whdr.flags = conn->out_clientflag; + whdr.securityIndex = hdr->securityIndex; + whdr.serviceId = htons(hdr->serviceId); - iov[0].iov_base = hdr; - iov[0].iov_len = sizeof(*hdr); + iov[0].iov_base = &whdr; + iov[0].iov_len = sizeof(whdr); iov[1].iov_base = resp; iov[1].iov_len = sizeof(*resp); - iov[2].iov_base = (void *) s2->ticket; + iov[2].iov_base = (void *)s2->ticket; iov[2].iov_len = s2->ticket_len; len = iov[0].iov_len + iov[1].iov_len + iov[2].iov_len; - hdr->serial = htonl(atomic_inc_return(&conn->serial)); - _proto("Tx RESPONSE %%%u", ntohl(hdr->serial)); + serial = atomic_inc_return(&conn->serial); + whdr.serial = htonl(serial); + _proto("Tx RESPONSE %%%u", serial); ret = kernel_sendmsg(conn->trans->local->socket, &msg, iov, 3, len); if (ret < 0) { @@ -718,18 +746,21 @@ static void rxkad_encrypt_response(struct rxrpc_connection *conn, struct rxkad_response *resp, const struct rxkad_key *s2) { - struct blkcipher_desc desc; + SKCIPHER_REQUEST_ON_STACK(req, conn->cipher); struct rxrpc_crypt iv; struct scatterlist sg[2]; /* continue encrypting from where we left off */ memcpy(&iv, s2->session_key, sizeof(iv)); - desc.tfm = conn->cipher; - desc.info = iv.x; - desc.flags = 0; rxkad_sg_set_buf2(sg, &resp->encrypted, sizeof(resp->encrypted)); - crypto_blkcipher_encrypt_iv(&desc, sg, sg, sizeof(resp->encrypted)); + + skcipher_request_set_tfm(req, conn->cipher); + skcipher_request_set_callback(req, 0, NULL, NULL); + skcipher_request_set_crypt(req, sg, sg, sizeof(resp->encrypted), iv.x); + + crypto_skcipher_encrypt(req); + skcipher_request_zero(req); } /* @@ -770,7 +801,7 @@ static int rxkad_respond_to_challenge(struct rxrpc_connection *conn, min_level = ntohl(challenge.min_level); _proto("Rx CHALLENGE %%%u { v=%u n=%u ml=%u }", - ntohl(sp->hdr.serial), version, nonce, min_level); + sp->hdr.serial, version, nonce, min_level); abort_code = RXKADINCONSISTENCY; if (version != RXKAD_VERSION) @@ -785,22 +816,23 @@ static int rxkad_respond_to_challenge(struct rxrpc_connection *conn, /* build the response packet */ memset(&resp, 0, sizeof(resp)); - resp.version = RXKAD_VERSION; - resp.encrypted.epoch = conn->epoch; - resp.encrypted.cid = conn->cid; - resp.encrypted.securityIndex = htonl(conn->security_ix); + resp.version = htonl(RXKAD_VERSION); + resp.encrypted.epoch = htonl(conn->epoch); + resp.encrypted.cid = htonl(conn->cid); + resp.encrypted.securityIndex = htonl(conn->security_ix); + resp.encrypted.inc_nonce = htonl(nonce + 1); + resp.encrypted.level = htonl(conn->security_level); + resp.kvno = htonl(token->kad->kvno); + resp.ticket_len = htonl(token->kad->ticket_len); + resp.encrypted.call_id[0] = - (conn->channels[0] ? conn->channels[0]->call_id : 0); + htonl(conn->channels[0] ? conn->channels[0]->call_id : 0); resp.encrypted.call_id[1] = - (conn->channels[1] ? conn->channels[1]->call_id : 0); + htonl(conn->channels[1] ? conn->channels[1]->call_id : 0); resp.encrypted.call_id[2] = - (conn->channels[2] ? conn->channels[2]->call_id : 0); + htonl(conn->channels[2] ? conn->channels[2]->call_id : 0); resp.encrypted.call_id[3] = - (conn->channels[3] ? conn->channels[3]->call_id : 0); - resp.encrypted.inc_nonce = htonl(nonce + 1); - resp.encrypted.level = htonl(conn->security_level); - resp.kvno = htonl(token->kad->kvno); - resp.ticket_len = htonl(token->kad->ticket_len); + htonl(conn->channels[3] ? conn->channels[3]->call_id : 0); /* calculate the response checksum and then do the encryption */ rxkad_calc_response_checksum(&resp); @@ -822,7 +854,7 @@ static int rxkad_decrypt_ticket(struct rxrpc_connection *conn, time_t *_expiry, u32 *_abort_code) { - struct blkcipher_desc desc; + struct skcipher_request *req; struct rxrpc_crypt iv, key; struct scatterlist sg[1]; struct in_addr addr; @@ -853,12 +885,21 @@ static int rxkad_decrypt_ticket(struct rxrpc_connection *conn, memcpy(&iv, &conn->server_key->payload.data[2], sizeof(iv)); - desc.tfm = conn->server_key->payload.data[0]; - desc.info = iv.x; - desc.flags = 0; + req = skcipher_request_alloc(conn->server_key->payload.data[0], + GFP_NOFS); + if (!req) { + *_abort_code = RXKADNOAUTH; + ret = -ENOMEM; + goto error; + } sg_init_one(&sg[0], ticket, ticket_len); - crypto_blkcipher_decrypt_iv(&desc, sg, sg, ticket_len); + + skcipher_request_set_callback(req, 0, NULL, NULL); + skcipher_request_set_crypt(req, sg, sg, ticket_len, iv.x); + + crypto_skcipher_decrypt(req); + skcipher_request_free(req); p = ticket; end = p + ticket_len; @@ -966,7 +1007,7 @@ static void rxkad_decrypt_response(struct rxrpc_connection *conn, struct rxkad_response *resp, const struct rxrpc_crypt *session_key) { - struct blkcipher_desc desc; + SKCIPHER_REQUEST_ON_STACK(req, rxkad_ci); struct scatterlist sg[2]; struct rxrpc_crypt iv; @@ -976,17 +1017,21 @@ static void rxkad_decrypt_response(struct rxrpc_connection *conn, ASSERT(rxkad_ci != NULL); mutex_lock(&rxkad_ci_mutex); - if (crypto_blkcipher_setkey(rxkad_ci, session_key->x, - sizeof(*session_key)) < 0) + if (crypto_skcipher_setkey(rxkad_ci, session_key->x, + sizeof(*session_key)) < 0) BUG(); memcpy(&iv, session_key, sizeof(iv)); - desc.tfm = rxkad_ci; - desc.info = iv.x; - desc.flags = 0; rxkad_sg_set_buf2(sg, &resp->encrypted, sizeof(resp->encrypted)); - crypto_blkcipher_decrypt_iv(&desc, sg, sg, sizeof(resp->encrypted)); + + skcipher_request_set_tfm(req, rxkad_ci); + skcipher_request_set_callback(req, 0, NULL, NULL); + skcipher_request_set_crypt(req, sg, sg, sizeof(resp->encrypted), iv.x); + + crypto_skcipher_decrypt(req); + skcipher_request_zero(req); + mutex_unlock(&rxkad_ci_mutex); _leave(""); @@ -1022,7 +1067,7 @@ static int rxkad_verify_response(struct rxrpc_connection *conn, kvno = ntohl(response.kvno); sp = rxrpc_skb(skb); _proto("Rx RESPONSE %%%u { v=%u kv=%u tl=%u }", - ntohl(sp->hdr.serial), version, kvno, ticket_len); + sp->hdr.serial, version, kvno, ticket_len); abort_code = RXKADINCONSISTENCY; if (version != RXKAD_VERSION) @@ -1058,9 +1103,9 @@ static int rxkad_verify_response(struct rxrpc_connection *conn, rxkad_decrypt_response(conn, &response, &session_key); abort_code = RXKADSEALEDINCON; - if (response.encrypted.epoch != conn->epoch) + if (ntohl(response.encrypted.epoch) != conn->epoch) goto protocol_error_free; - if (response.encrypted.cid != conn->cid) + if (ntohl(response.encrypted.cid) != conn->cid) goto protocol_error_free; if (ntohl(response.encrypted.securityIndex) != conn->security_ix) goto protocol_error_free; @@ -1077,7 +1122,7 @@ static int rxkad_verify_response(struct rxrpc_connection *conn, goto protocol_error_free; abort_code = RXKADOUTOFSEQUENCE; - if (response.encrypted.inc_nonce != htonl(conn->security_nonce + 1)) + if (ntohl(response.encrypted.inc_nonce) != conn->security_nonce + 1) goto protocol_error_free; abort_code = RXKADLEVELFAIL; @@ -1115,7 +1160,7 @@ static void rxkad_clear(struct rxrpc_connection *conn) _enter(""); if (conn->cipher) - crypto_free_blkcipher(conn->cipher); + crypto_free_skcipher(conn->cipher); } /* @@ -1141,7 +1186,7 @@ static __init int rxkad_init(void) /* pin the cipher we need so that the crypto layer doesn't invoke * keventd to go get it */ - rxkad_ci = crypto_alloc_blkcipher("pcbc(fcrypt)", 0, CRYPTO_ALG_ASYNC); + rxkad_ci = crypto_alloc_skcipher("pcbc(fcrypt)", 0, CRYPTO_ALG_ASYNC); if (IS_ERR(rxkad_ci)) return PTR_ERR(rxkad_ci); @@ -1155,7 +1200,7 @@ static __exit void rxkad_exit(void) _enter(""); rxrpc_unregister_security(&rxkad); - crypto_free_blkcipher(rxkad_ci); + crypto_free_skcipher(rxkad_ci); } module_exit(rxkad_exit); diff --git a/net/rxrpc/sysctl.c b/net/rxrpc/sysctl.c index 50a98a910..d20ed575a 100644 --- a/net/rxrpc/sysctl.c +++ b/net/rxrpc/sysctl.c @@ -15,11 +15,11 @@ #include "ar-internal.h" static struct ctl_table_header *rxrpc_sysctl_reg_table; -static const unsigned zero = 0; -static const unsigned one = 1; -static const unsigned four = 4; -static const unsigned n_65535 = 65535; -static const unsigned n_max_acks = RXRPC_MAXACKS; +static const unsigned int zero = 0; +static const unsigned int one = 1; +static const unsigned int four = 4; +static const unsigned int n_65535 = 65535; +static const unsigned int n_max_acks = RXRPC_MAXACKS; /* * RxRPC operating parameters. @@ -32,7 +32,7 @@ static struct ctl_table rxrpc_sysctl_table[] = { { .procname = "req_ack_delay", .data = &rxrpc_requested_ack_delay, - .maxlen = sizeof(unsigned), + .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_ms_jiffies, .extra1 = (void *)&zero, @@ -40,7 +40,7 @@ static struct ctl_table rxrpc_sysctl_table[] = { { .procname = "soft_ack_delay", .data = &rxrpc_soft_ack_delay, - .maxlen = sizeof(unsigned), + .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_ms_jiffies, .extra1 = (void *)&one, @@ -48,7 +48,7 @@ static struct ctl_table rxrpc_sysctl_table[] = { { .procname = "idle_ack_delay", .data = &rxrpc_idle_ack_delay, - .maxlen = sizeof(unsigned), + .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_ms_jiffies, .extra1 = (void *)&one, @@ -56,7 +56,7 @@ static struct ctl_table rxrpc_sysctl_table[] = { { .procname = "resend_timeout", .data = &rxrpc_resend_timeout, - .maxlen = sizeof(unsigned), + .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_ms_jiffies, .extra1 = (void *)&one, @@ -66,7 +66,7 @@ static struct ctl_table rxrpc_sysctl_table[] = { { .procname = "max_call_lifetime", .data = &rxrpc_max_call_lifetime, - .maxlen = sizeof(unsigned), + .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, .extra1 = (void *)&one, @@ -74,7 +74,7 @@ static struct ctl_table rxrpc_sysctl_table[] = { { .procname = "dead_call_expiry", .data = &rxrpc_dead_call_expiry, - .maxlen = sizeof(unsigned), + .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, .extra1 = (void *)&one, @@ -84,7 +84,7 @@ static struct ctl_table rxrpc_sysctl_table[] = { { .procname = "connection_expiry", .data = &rxrpc_connection_expiry, - .maxlen = sizeof(unsigned), + .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = (void *)&one, @@ -92,7 +92,7 @@ static struct ctl_table rxrpc_sysctl_table[] = { { .procname = "transport_expiry", .data = &rxrpc_transport_expiry, - .maxlen = sizeof(unsigned), + .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = (void *)&one, @@ -102,7 +102,7 @@ static struct ctl_table rxrpc_sysctl_table[] = { { .procname = "rx_window_size", .data = &rxrpc_rx_window_size, - .maxlen = sizeof(unsigned), + .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = (void *)&one, @@ -111,16 +111,16 @@ static struct ctl_table rxrpc_sysctl_table[] = { { .procname = "rx_mtu", .data = &rxrpc_rx_mtu, - .maxlen = sizeof(unsigned), + .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = (void *)&one, - .extra1 = (void *)&n_65535, + .extra2 = (void *)&n_65535, }, { .procname = "rx_jumbo_max", .data = &rxrpc_rx_jumbo_max, - .maxlen = sizeof(unsigned), + .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = (void *)&one, diff --git a/net/sched/Kconfig b/net/sched/Kconfig index 82830824f..b148302bb 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -739,6 +739,28 @@ config NET_ACT_CONNMARK To compile this code as a module, choose M here: the module will be called act_connmark. +config NET_ACT_IFE + tristate "Inter-FE action based on IETF ForCES InterFE LFB" + depends on NET_CLS_ACT + ---help--- + Say Y here to allow for sourcing and terminating metadata + For details refer to netdev01 paper: + "Distributing Linux Traffic Control Classifier-Action Subsystem" + Authors: Jamal Hadi Salim and Damascene M. Joachimpillai + + To compile this code as a module, choose M here: the + module will be called act_ife. + +config NET_IFE_SKBMARK + tristate "Support to encoding decoding skb mark on IFE action" + depends on NET_ACT_IFE + ---help--- + +config NET_IFE_SKBPRIO + tristate "Support to encoding decoding skb prio on IFE action" + depends on NET_ACT_IFE + ---help--- + config NET_CLS_IND bool "Incoming device classification" depends on NET_CLS_U32 || NET_CLS_FW diff --git a/net/sched/Makefile b/net/sched/Makefile index 690c1689e..84bddb373 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile @@ -19,6 +19,9 @@ obj-$(CONFIG_NET_ACT_CSUM) += act_csum.o obj-$(CONFIG_NET_ACT_VLAN) += act_vlan.o obj-$(CONFIG_NET_ACT_BPF) += act_bpf.o obj-$(CONFIG_NET_ACT_CONNMARK) += act_connmark.o +obj-$(CONFIG_NET_ACT_IFE) += act_ife.o +obj-$(CONFIG_NET_IFE_SKBMARK) += act_meta_mark.o +obj-$(CONFIG_NET_IFE_SKBPRIO) += act_meta_skbprio.o obj-$(CONFIG_NET_SCH_FIFO) += sch_fifo.o obj-$(CONFIG_NET_SCH_CBQ) += sch_cbq.o obj-$(CONFIG_NET_SCH_HTB) += sch_htb.o diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 06e7c4a37..96066665e 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -36,10 +36,9 @@ static void free_tcf(struct rcu_head *head) kfree(p); } -static void tcf_hash_destroy(struct tc_action *a) +static void tcf_hash_destroy(struct tcf_hashinfo *hinfo, struct tc_action *a) { struct tcf_common *p = a->priv; - struct tcf_hashinfo *hinfo = a->ops->hinfo; spin_lock_bh(&hinfo->lock); hlist_del(&p->tcfc_head); @@ -68,8 +67,8 @@ int __tcf_hash_release(struct tc_action *a, bool bind, bool strict) if (p->tcfc_bindcnt <= 0 && p->tcfc_refcnt <= 0) { if (a->ops->cleanup) a->ops->cleanup(a, bind); - tcf_hash_destroy(a); - ret = 1; + tcf_hash_destroy(a->hinfo, a); + ret = ACT_P_DELETED; } } @@ -77,10 +76,9 @@ int __tcf_hash_release(struct tc_action *a, bool bind, bool strict) } EXPORT_SYMBOL(__tcf_hash_release); -static int tcf_dump_walker(struct sk_buff *skb, struct netlink_callback *cb, - struct tc_action *a) +static int tcf_dump_walker(struct tcf_hashinfo *hinfo, struct sk_buff *skb, + struct netlink_callback *cb, struct tc_action *a) { - struct tcf_hashinfo *hinfo = a->ops->hinfo; struct hlist_head *head; struct tcf_common *p; int err = 0, index = -1, i = 0, s_i = 0, n_i = 0; @@ -126,9 +124,9 @@ nla_put_failure: goto done; } -static int tcf_del_walker(struct sk_buff *skb, struct tc_action *a) +static int tcf_del_walker(struct tcf_hashinfo *hinfo, struct sk_buff *skb, + struct tc_action *a) { - struct tcf_hashinfo *hinfo = a->ops->hinfo; struct hlist_head *head; struct hlist_node *n; struct tcf_common *p; @@ -163,18 +161,24 @@ nla_put_failure: return ret; } -static int tcf_generic_walker(struct sk_buff *skb, struct netlink_callback *cb, - int type, struct tc_action *a) +int tcf_generic_walker(struct tc_action_net *tn, struct sk_buff *skb, + struct netlink_callback *cb, int type, + struct tc_action *a) { + struct tcf_hashinfo *hinfo = tn->hinfo; + + a->hinfo = hinfo; + if (type == RTM_DELACTION) { - return tcf_del_walker(skb, a); + return tcf_del_walker(hinfo, skb, a); } else if (type == RTM_GETACTION) { - return tcf_dump_walker(skb, cb, a); + return tcf_dump_walker(hinfo, skb, cb, a); } else { WARN(1, "tcf_generic_walker: unknown action %d\n", type); return -EINVAL; } } +EXPORT_SYMBOL(tcf_generic_walker); static struct tcf_common *tcf_hash_lookup(u32 index, struct tcf_hashinfo *hinfo) { @@ -191,8 +195,9 @@ static struct tcf_common *tcf_hash_lookup(u32 index, struct tcf_hashinfo *hinfo) return p; } -u32 tcf_hash_new_index(struct tcf_hashinfo *hinfo) +u32 tcf_hash_new_index(struct tc_action_net *tn) { + struct tcf_hashinfo *hinfo = tn->hinfo; u32 val = hinfo->index; do { @@ -205,28 +210,31 @@ u32 tcf_hash_new_index(struct tcf_hashinfo *hinfo) } EXPORT_SYMBOL(tcf_hash_new_index); -int tcf_hash_search(struct tc_action *a, u32 index) +int tcf_hash_search(struct tc_action_net *tn, struct tc_action *a, u32 index) { - struct tcf_hashinfo *hinfo = a->ops->hinfo; + struct tcf_hashinfo *hinfo = tn->hinfo; struct tcf_common *p = tcf_hash_lookup(index, hinfo); if (p) { a->priv = p; + a->hinfo = hinfo; return 1; } return 0; } EXPORT_SYMBOL(tcf_hash_search); -int tcf_hash_check(u32 index, struct tc_action *a, int bind) +int tcf_hash_check(struct tc_action_net *tn, u32 index, struct tc_action *a, + int bind) { - struct tcf_hashinfo *hinfo = a->ops->hinfo; + struct tcf_hashinfo *hinfo = tn->hinfo; struct tcf_common *p = NULL; if (index && (p = tcf_hash_lookup(index, hinfo)) != NULL) { if (bind) p->tcfc_bindcnt++; p->tcfc_refcnt++; a->priv = p; + a->hinfo = hinfo; return 1; } return 0; @@ -243,11 +251,11 @@ void tcf_hash_cleanup(struct tc_action *a, struct nlattr *est) } EXPORT_SYMBOL(tcf_hash_cleanup); -int tcf_hash_create(u32 index, struct nlattr *est, struct tc_action *a, - int size, int bind, bool cpustats) +int tcf_hash_create(struct tc_action_net *tn, u32 index, struct nlattr *est, + struct tc_action *a, int size, int bind, bool cpustats) { - struct tcf_hashinfo *hinfo = a->ops->hinfo; struct tcf_common *p = kzalloc(size, GFP_KERNEL); + struct tcf_hashinfo *hinfo = tn->hinfo; int err = -ENOMEM; if (unlikely(!p)) @@ -272,7 +280,7 @@ err2: } spin_lock_init(&p->tcfc_lock); INIT_HLIST_NODE(&p->tcfc_head); - p->tcfc_index = index ? index : tcf_hash_new_index(hinfo); + p->tcfc_index = index ? index : tcf_hash_new_index(tn); p->tcfc_tm.install = jiffies; p->tcfc_tm.lastuse = jiffies; if (est) { @@ -286,14 +294,15 @@ err2: } a->priv = (void *) p; + a->hinfo = hinfo; return 0; } EXPORT_SYMBOL(tcf_hash_create); -void tcf_hash_insert(struct tc_action *a) +void tcf_hash_insert(struct tc_action_net *tn, struct tc_action *a) { struct tcf_common *p = a->priv; - struct tcf_hashinfo *hinfo = a->ops->hinfo; + struct tcf_hashinfo *hinfo = tn->hinfo; unsigned int h = tcf_hash(p->tcfc_index, hinfo->hmask); spin_lock_bh(&hinfo->lock); @@ -302,59 +311,78 @@ void tcf_hash_insert(struct tc_action *a) } EXPORT_SYMBOL(tcf_hash_insert); +void tcf_hashinfo_destroy(const struct tc_action_ops *ops, + struct tcf_hashinfo *hinfo) +{ + struct tc_action a = { + .ops = ops, + .hinfo = hinfo, + }; + int i; + + for (i = 0; i < hinfo->hmask + 1; i++) { + struct tcf_common *p; + struct hlist_node *n; + + hlist_for_each_entry_safe(p, n, &hinfo->htab[i], tcfc_head) { + int ret; + + a.priv = p; + ret = __tcf_hash_release(&a, false, true); + if (ret == ACT_P_DELETED) + module_put(ops->owner); + else if (ret < 0) + return; + } + } + kfree(hinfo->htab); +} +EXPORT_SYMBOL(tcf_hashinfo_destroy); + static LIST_HEAD(act_base); static DEFINE_RWLOCK(act_mod_lock); -int tcf_register_action(struct tc_action_ops *act, unsigned int mask) +int tcf_register_action(struct tc_action_ops *act, + struct pernet_operations *ops) { struct tc_action_ops *a; - int err; + int ret; - /* Must supply act, dump and init */ - if (!act->act || !act->dump || !act->init) + if (!act->act || !act->dump || !act->init || !act->walk || !act->lookup) return -EINVAL; - /* Supply defaults */ - if (!act->lookup) - act->lookup = tcf_hash_search; - if (!act->walk) - act->walk = tcf_generic_walker; - - act->hinfo = kmalloc(sizeof(struct tcf_hashinfo), GFP_KERNEL); - if (!act->hinfo) - return -ENOMEM; - err = tcf_hashinfo_init(act->hinfo, mask); - if (err) { - kfree(act->hinfo); - return err; - } - write_lock(&act_mod_lock); list_for_each_entry(a, &act_base, head) { if (act->type == a->type || (strcmp(act->kind, a->kind) == 0)) { write_unlock(&act_mod_lock); - tcf_hashinfo_destroy(act->hinfo); - kfree(act->hinfo); return -EEXIST; } } list_add_tail(&act->head, &act_base); write_unlock(&act_mod_lock); + + ret = register_pernet_subsys(ops); + if (ret) { + tcf_unregister_action(act, ops); + return ret; + } + return 0; } EXPORT_SYMBOL(tcf_register_action); -int tcf_unregister_action(struct tc_action_ops *act) +int tcf_unregister_action(struct tc_action_ops *act, + struct pernet_operations *ops) { struct tc_action_ops *a; int err = -ENOENT; + unregister_pernet_subsys(ops); + write_lock(&act_mod_lock); list_for_each_entry(a, &act_base, head) { if (a == act) { list_del(&act->head); - tcf_hashinfo_destroy(act->hinfo); - kfree(act->hinfo); err = 0; break; } @@ -721,8 +749,8 @@ static struct tc_action *create_a(int i) return act; } -static struct tc_action * -tcf_action_get_1(struct nlattr *nla, struct nlmsghdr *n, u32 portid) +static struct tc_action *tcf_action_get_1(struct net *net, struct nlattr *nla, + struct nlmsghdr *n, u32 portid) { struct nlattr *tb[TCA_ACT_MAX + 1]; struct tc_action *a; @@ -749,7 +777,7 @@ tcf_action_get_1(struct nlattr *nla, struct nlmsghdr *n, u32 portid) if (a->ops == NULL) /* could happen in batch of actions */ goto err_free; err = -ENOENT; - if (a->ops->lookup(a, index) == 0) + if (a->ops->lookup(net, a, index) == 0) goto err_mod; module_put(a->ops->owner); @@ -819,7 +847,7 @@ static int tca_action_flush(struct net *net, struct nlattr *nla, if (nest == NULL) goto out_module_put; - err = a.ops->walk(skb, &dcb, RTM_DELACTION, &a); + err = a.ops->walk(net, skb, &dcb, RTM_DELACTION, &a); if (err < 0) goto out_module_put; if (err == 0) @@ -897,7 +925,7 @@ tca_action_gd(struct net *net, struct nlattr *nla, struct nlmsghdr *n, } for (i = 1; i <= TCA_ACT_MAX_PRIO && tb[i]; i++) { - act = tcf_action_get_1(tb[i], n, portid); + act = tcf_action_get_1(net, tb[i], n, portid); if (IS_ERR(act)) { ret = PTR_ERR(act); goto err; @@ -1044,6 +1072,7 @@ find_dump_kind(const struct nlmsghdr *n) static int tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb) { + struct net *net = sock_net(skb->sk); struct nlmsghdr *nlh; unsigned char *b = skb_tail_pointer(skb); struct nlattr *nest; @@ -1078,7 +1107,7 @@ tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb) if (nest == NULL) goto out_module_put; - ret = a_o->walk(skb, cb, RTM_GETACTION, &a); + ret = a_o->walk(net, skb, cb, RTM_GETACTION, &a); if (ret < 0) goto out_module_put; diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index 0bc6f912f..8c9f1f045 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -33,6 +33,8 @@ struct tcf_bpf_cfg { bool is_ebpf; }; +static int bpf_net_id; + static int tcf_bpf(struct sk_buff *skb, const struct tc_action *act, struct tcf_result *res) { @@ -275,6 +277,7 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action *act, int replace, int bind) { + struct tc_action_net *tn = net_generic(net, bpf_net_id); struct nlattr *tb[TCA_ACT_BPF_MAX + 1]; struct tcf_bpf_cfg cfg, old; struct tc_act_bpf *parm; @@ -294,8 +297,8 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla, parm = nla_data(tb[TCA_ACT_BPF_PARMS]); - if (!tcf_hash_check(parm->index, act, bind)) { - ret = tcf_hash_create(parm->index, est, act, + if (!tcf_hash_check(tn, parm->index, act, bind)) { + ret = tcf_hash_create(tn, parm->index, est, act, sizeof(*prog), bind, true); if (ret < 0) return ret; @@ -344,7 +347,7 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla, rcu_assign_pointer(prog->filter, cfg.filter); if (res == ACT_P_CREATED) { - tcf_hash_insert(act); + tcf_hash_insert(tn, act); } else { /* make sure the program being replaced is no longer executing */ synchronize_rcu(); @@ -367,6 +370,22 @@ static void tcf_bpf_cleanup(struct tc_action *act, int bind) tcf_bpf_cfg_cleanup(&tmp); } +static int tcf_bpf_walker(struct net *net, struct sk_buff *skb, + struct netlink_callback *cb, int type, + struct tc_action *a) +{ + struct tc_action_net *tn = net_generic(net, bpf_net_id); + + return tcf_generic_walker(tn, skb, cb, type, a); +} + +static int tcf_bpf_search(struct net *net, struct tc_action *a, u32 index) +{ + struct tc_action_net *tn = net_generic(net, bpf_net_id); + + return tcf_hash_search(tn, a, index); +} + static struct tc_action_ops act_bpf_ops __read_mostly = { .kind = "bpf", .type = TCA_ACT_BPF, @@ -375,16 +394,39 @@ static struct tc_action_ops act_bpf_ops __read_mostly = { .dump = tcf_bpf_dump, .cleanup = tcf_bpf_cleanup, .init = tcf_bpf_init, + .walk = tcf_bpf_walker, + .lookup = tcf_bpf_search, +}; + +static __net_init int bpf_init_net(struct net *net) +{ + struct tc_action_net *tn = net_generic(net, bpf_net_id); + + return tc_action_net_init(tn, &act_bpf_ops, BPF_TAB_MASK); +} + +static void __net_exit bpf_exit_net(struct net *net) +{ + struct tc_action_net *tn = net_generic(net, bpf_net_id); + + tc_action_net_exit(tn); +} + +static struct pernet_operations bpf_net_ops = { + .init = bpf_init_net, + .exit = bpf_exit_net, + .id = &bpf_net_id, + .size = sizeof(struct tc_action_net), }; static int __init bpf_init_module(void) { - return tcf_register_action(&act_bpf_ops, BPF_TAB_MASK); + return tcf_register_action(&act_bpf_ops, &bpf_net_ops); } static void __exit bpf_cleanup_module(void) { - tcf_unregister_action(&act_bpf_ops); + tcf_unregister_action(&act_bpf_ops, &bpf_net_ops); } module_init(bpf_init_module); diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c index bb41699c6..c0ed93ce2 100644 --- a/net/sched/act_connmark.c +++ b/net/sched/act_connmark.c @@ -30,6 +30,8 @@ #define CONNMARK_TAB_MASK 3 +static int connmark_net_id; + static int tcf_connmark(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { @@ -97,6 +99,7 @@ static int tcf_connmark_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action *a, int ovr, int bind) { + struct tc_action_net *tn = net_generic(net, connmark_net_id); struct nlattr *tb[TCA_CONNMARK_MAX + 1]; struct tcf_connmark_info *ci; struct tc_connmark *parm; @@ -111,9 +114,9 @@ static int tcf_connmark_init(struct net *net, struct nlattr *nla, parm = nla_data(tb[TCA_CONNMARK_PARMS]); - if (!tcf_hash_check(parm->index, a, bind)) { - ret = tcf_hash_create(parm->index, est, a, sizeof(*ci), - bind, false); + if (!tcf_hash_check(tn, parm->index, a, bind)) { + ret = tcf_hash_create(tn, parm->index, est, a, + sizeof(*ci), bind, false); if (ret) return ret; @@ -122,7 +125,7 @@ static int tcf_connmark_init(struct net *net, struct nlattr *nla, ci->net = net; ci->zone = parm->zone; - tcf_hash_insert(a); + tcf_hash_insert(tn, a); ret = ACT_P_CREATED; } else { ci = to_connmark(a); @@ -169,6 +172,22 @@ nla_put_failure: return -1; } +static int tcf_connmark_walker(struct net *net, struct sk_buff *skb, + struct netlink_callback *cb, int type, + struct tc_action *a) +{ + struct tc_action_net *tn = net_generic(net, connmark_net_id); + + return tcf_generic_walker(tn, skb, cb, type, a); +} + +static int tcf_connmark_search(struct net *net, struct tc_action *a, u32 index) +{ + struct tc_action_net *tn = net_generic(net, connmark_net_id); + + return tcf_hash_search(tn, a, index); +} + static struct tc_action_ops act_connmark_ops = { .kind = "connmark", .type = TCA_ACT_CONNMARK, @@ -176,16 +195,39 @@ static struct tc_action_ops act_connmark_ops = { .act = tcf_connmark, .dump = tcf_connmark_dump, .init = tcf_connmark_init, + .walk = tcf_connmark_walker, + .lookup = tcf_connmark_search, +}; + +static __net_init int connmark_init_net(struct net *net) +{ + struct tc_action_net *tn = net_generic(net, connmark_net_id); + + return tc_action_net_init(tn, &act_connmark_ops, CONNMARK_TAB_MASK); +} + +static void __net_exit connmark_exit_net(struct net *net) +{ + struct tc_action_net *tn = net_generic(net, connmark_net_id); + + tc_action_net_exit(tn); +} + +static struct pernet_operations connmark_net_ops = { + .init = connmark_init_net, + .exit = connmark_exit_net, + .id = &connmark_net_id, + .size = sizeof(struct tc_action_net), }; static int __init connmark_init_module(void) { - return tcf_register_action(&act_connmark_ops, CONNMARK_TAB_MASK); + return tcf_register_action(&act_connmark_ops, &connmark_net_ops); } static void __exit connmark_cleanup_module(void) { - tcf_unregister_action(&act_connmark_ops); + tcf_unregister_action(&act_connmark_ops, &connmark_net_ops); } module_init(connmark_init_module); diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index b07c535ba..d22426cde 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -42,9 +42,13 @@ static const struct nla_policy csum_policy[TCA_CSUM_MAX + 1] = { [TCA_CSUM_PARMS] = { .len = sizeof(struct tc_csum), }, }; -static int tcf_csum_init(struct net *n, struct nlattr *nla, struct nlattr *est, - struct tc_action *a, int ovr, int bind) +static int csum_net_id; + +static int tcf_csum_init(struct net *net, struct nlattr *nla, + struct nlattr *est, struct tc_action *a, int ovr, + int bind) { + struct tc_action_net *tn = net_generic(net, csum_net_id); struct nlattr *tb[TCA_CSUM_MAX + 1]; struct tc_csum *parm; struct tcf_csum *p; @@ -61,9 +65,9 @@ static int tcf_csum_init(struct net *n, struct nlattr *nla, struct nlattr *est, return -EINVAL; parm = nla_data(tb[TCA_CSUM_PARMS]); - if (!tcf_hash_check(parm->index, a, bind)) { - ret = tcf_hash_create(parm->index, est, a, sizeof(*p), - bind, false); + if (!tcf_hash_check(tn, parm->index, a, bind)) { + ret = tcf_hash_create(tn, parm->index, est, a, + sizeof(*p), bind, false); if (ret) return ret; ret = ACT_P_CREATED; @@ -82,7 +86,7 @@ static int tcf_csum_init(struct net *n, struct nlattr *nla, struct nlattr *est, spin_unlock_bh(&p->tcf_lock); if (ret == ACT_P_CREATED) - tcf_hash_insert(a); + tcf_hash_insert(tn, a); return ret; } @@ -105,9 +109,7 @@ static void *tcf_csum_skb_nextlayer(struct sk_buff *skb, int hl = ihl + jhl; if (!pskb_may_pull(skb, ipl + ntkoff) || (ipl < hl) || - (skb_cloned(skb) && - !skb_clone_writable(skb, hl + ntkoff) && - pskb_expand_head(skb, 0, 0, GFP_ATOMIC))) + skb_try_make_writable(skb, hl + ntkoff)) return NULL; else return (void *)(skb_network_header(skb) + ihl); @@ -365,9 +367,7 @@ static int tcf_csum_ipv4(struct sk_buff *skb, u32 update_flags) } if (update_flags & TCA_CSUM_UPDATE_FLAG_IPV4HDR) { - if (skb_cloned(skb) && - !skb_clone_writable(skb, sizeof(*iph) + ntkoff) && - pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) + if (skb_try_make_writable(skb, sizeof(*iph) + ntkoff)) goto fail; ip_send_check(ip_hdr(skb)); @@ -559,6 +559,22 @@ nla_put_failure: return -1; } +static int tcf_csum_walker(struct net *net, struct sk_buff *skb, + struct netlink_callback *cb, int type, + struct tc_action *a) +{ + struct tc_action_net *tn = net_generic(net, csum_net_id); + + return tcf_generic_walker(tn, skb, cb, type, a); +} + +static int tcf_csum_search(struct net *net, struct tc_action *a, u32 index) +{ + struct tc_action_net *tn = net_generic(net, csum_net_id); + + return tcf_hash_search(tn, a, index); +} + static struct tc_action_ops act_csum_ops = { .kind = "csum", .type = TCA_ACT_CSUM, @@ -566,6 +582,29 @@ static struct tc_action_ops act_csum_ops = { .act = tcf_csum, .dump = tcf_csum_dump, .init = tcf_csum_init, + .walk = tcf_csum_walker, + .lookup = tcf_csum_search, +}; + +static __net_init int csum_init_net(struct net *net) +{ + struct tc_action_net *tn = net_generic(net, csum_net_id); + + return tc_action_net_init(tn, &act_csum_ops, CSUM_TAB_MASK); +} + +static void __net_exit csum_exit_net(struct net *net) +{ + struct tc_action_net *tn = net_generic(net, csum_net_id); + + tc_action_net_exit(tn); +} + +static struct pernet_operations csum_net_ops = { + .init = csum_init_net, + .exit = csum_exit_net, + .id = &csum_net_id, + .size = sizeof(struct tc_action_net), }; MODULE_DESCRIPTION("Checksum updating actions"); @@ -573,12 +612,12 @@ MODULE_LICENSE("GPL"); static int __init csum_init_module(void) { - return tcf_register_action(&act_csum_ops, CSUM_TAB_MASK); + return tcf_register_action(&act_csum_ops, &csum_net_ops); } static void __exit csum_cleanup_module(void) { - tcf_unregister_action(&act_csum_ops); + tcf_unregister_action(&act_csum_ops, &csum_net_ops); } module_init(csum_init_module); diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c index 5c1b05170..887fc1f20 100644 --- a/net/sched/act_gact.c +++ b/net/sched/act_gact.c @@ -25,6 +25,8 @@ #define GACT_TAB_MASK 15 +static int gact_net_id; + #ifdef CONFIG_GACT_PROB static int gact_net_rand(struct tcf_gact *gact) { @@ -57,6 +59,7 @@ static int tcf_gact_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action *a, int ovr, int bind) { + struct tc_action_net *tn = net_generic(net, gact_net_id); struct nlattr *tb[TCA_GACT_MAX + 1]; struct tc_gact *parm; struct tcf_gact *gact; @@ -88,9 +91,9 @@ static int tcf_gact_init(struct net *net, struct nlattr *nla, } #endif - if (!tcf_hash_check(parm->index, a, bind)) { - ret = tcf_hash_create(parm->index, est, a, sizeof(*gact), - bind, true); + if (!tcf_hash_check(tn, parm->index, a, bind)) { + ret = tcf_hash_create(tn, parm->index, est, a, + sizeof(*gact), bind, true); if (ret) return ret; ret = ACT_P_CREATED; @@ -118,7 +121,7 @@ static int tcf_gact_init(struct net *net, struct nlattr *nla, } #endif if (ret == ACT_P_CREATED) - tcf_hash_insert(a); + tcf_hash_insert(tn, a); return ret; } @@ -183,6 +186,22 @@ nla_put_failure: return -1; } +static int tcf_gact_walker(struct net *net, struct sk_buff *skb, + struct netlink_callback *cb, int type, + struct tc_action *a) +{ + struct tc_action_net *tn = net_generic(net, gact_net_id); + + return tcf_generic_walker(tn, skb, cb, type, a); +} + +static int tcf_gact_search(struct net *net, struct tc_action *a, u32 index) +{ + struct tc_action_net *tn = net_generic(net, gact_net_id); + + return tcf_hash_search(tn, a, index); +} + static struct tc_action_ops act_gact_ops = { .kind = "gact", .type = TCA_ACT_GACT, @@ -190,6 +209,29 @@ static struct tc_action_ops act_gact_ops = { .act = tcf_gact, .dump = tcf_gact_dump, .init = tcf_gact_init, + .walk = tcf_gact_walker, + .lookup = tcf_gact_search, +}; + +static __net_init int gact_init_net(struct net *net) +{ + struct tc_action_net *tn = net_generic(net, gact_net_id); + + return tc_action_net_init(tn, &act_gact_ops, GACT_TAB_MASK); +} + +static void __net_exit gact_exit_net(struct net *net) +{ + struct tc_action_net *tn = net_generic(net, gact_net_id); + + tc_action_net_exit(tn); +} + +static struct pernet_operations gact_net_ops = { + .init = gact_init_net, + .exit = gact_exit_net, + .id = &gact_net_id, + .size = sizeof(struct tc_action_net), }; MODULE_AUTHOR("Jamal Hadi Salim(2002-4)"); @@ -203,12 +245,13 @@ static int __init gact_init_module(void) #else pr_info("GACT probability NOT on\n"); #endif - return tcf_register_action(&act_gact_ops, GACT_TAB_MASK); + + return tcf_register_action(&act_gact_ops, &gact_net_ops); } static void __exit gact_cleanup_module(void) { - tcf_unregister_action(&act_gact_ops); + tcf_unregister_action(&act_gact_ops, &gact_net_ops); } module_init(gact_init_module); diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c new file mode 100644 index 000000000..343d011aa --- /dev/null +++ b/net/sched/act_ife.c @@ -0,0 +1,876 @@ +/* + * net/sched/ife.c Inter-FE action based on ForCES WG InterFE LFB + * + * Refer to: + * draft-ietf-forces-interfelfb-03 + * and + * netdev01 paper: + * "Distributing Linux Traffic Control Classifier-Action + * Subsystem" + * Authors: Jamal Hadi Salim and Damascene M. Joachimpillai + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * copyright Jamal Hadi Salim (2015) + * +*/ + +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/string.h> +#include <linux/errno.h> +#include <linux/skbuff.h> +#include <linux/rtnetlink.h> +#include <linux/module.h> +#include <linux/init.h> +#include <net/net_namespace.h> +#include <net/netlink.h> +#include <net/pkt_sched.h> +#include <uapi/linux/tc_act/tc_ife.h> +#include <net/tc_act/tc_ife.h> +#include <linux/etherdevice.h> + +#define IFE_TAB_MASK 15 + +static int ife_net_id; +static int max_metacnt = IFE_META_MAX + 1; + +static const struct nla_policy ife_policy[TCA_IFE_MAX + 1] = { + [TCA_IFE_PARMS] = { .len = sizeof(struct tc_ife)}, + [TCA_IFE_DMAC] = { .len = ETH_ALEN}, + [TCA_IFE_SMAC] = { .len = ETH_ALEN}, + [TCA_IFE_TYPE] = { .type = NLA_U16}, +}; + +/* Caller takes care of presenting data in network order +*/ +int ife_tlv_meta_encode(void *skbdata, u16 attrtype, u16 dlen, const void *dval) +{ + u32 *tlv = (u32 *)(skbdata); + u16 totlen = nla_total_size(dlen); /*alignment + hdr */ + char *dptr = (char *)tlv + NLA_HDRLEN; + u32 htlv = attrtype << 16 | totlen; + + *tlv = htonl(htlv); + memset(dptr, 0, totlen - NLA_HDRLEN); + memcpy(dptr, dval, dlen); + + return totlen; +} +EXPORT_SYMBOL_GPL(ife_tlv_meta_encode); + +int ife_get_meta_u32(struct sk_buff *skb, struct tcf_meta_info *mi) +{ + if (mi->metaval) + return nla_put_u32(skb, mi->metaid, *(u32 *)mi->metaval); + else + return nla_put(skb, mi->metaid, 0, NULL); +} +EXPORT_SYMBOL_GPL(ife_get_meta_u32); + +int ife_check_meta_u32(u32 metaval, struct tcf_meta_info *mi) +{ + if (metaval || mi->metaval) + return 8; /* T+L+V == 2+2+4 */ + + return 0; +} +EXPORT_SYMBOL_GPL(ife_check_meta_u32); + +int ife_encode_meta_u32(u32 metaval, void *skbdata, struct tcf_meta_info *mi) +{ + u32 edata = metaval; + + if (mi->metaval) + edata = *(u32 *)mi->metaval; + else if (metaval) + edata = metaval; + + if (!edata) /* will not encode */ + return 0; + + edata = htonl(edata); + return ife_tlv_meta_encode(skbdata, mi->metaid, 4, &edata); +} +EXPORT_SYMBOL_GPL(ife_encode_meta_u32); + +int ife_get_meta_u16(struct sk_buff *skb, struct tcf_meta_info *mi) +{ + if (mi->metaval) + return nla_put_u16(skb, mi->metaid, *(u16 *)mi->metaval); + else + return nla_put(skb, mi->metaid, 0, NULL); +} +EXPORT_SYMBOL_GPL(ife_get_meta_u16); + +int ife_alloc_meta_u32(struct tcf_meta_info *mi, void *metaval) +{ + mi->metaval = kmemdup(metaval, sizeof(u32), GFP_KERNEL); + if (!mi->metaval) + return -ENOMEM; + + return 0; +} +EXPORT_SYMBOL_GPL(ife_alloc_meta_u32); + +int ife_alloc_meta_u16(struct tcf_meta_info *mi, void *metaval) +{ + mi->metaval = kmemdup(metaval, sizeof(u16), GFP_KERNEL); + if (!mi->metaval) + return -ENOMEM; + + return 0; +} +EXPORT_SYMBOL_GPL(ife_alloc_meta_u16); + +void ife_release_meta_gen(struct tcf_meta_info *mi) +{ + kfree(mi->metaval); +} +EXPORT_SYMBOL_GPL(ife_release_meta_gen); + +int ife_validate_meta_u32(void *val, int len) +{ + if (len == 4) + return 0; + + return -EINVAL; +} +EXPORT_SYMBOL_GPL(ife_validate_meta_u32); + +int ife_validate_meta_u16(void *val, int len) +{ + /* length will include padding */ + if (len == NLA_ALIGN(2)) + return 0; + + return -EINVAL; +} +EXPORT_SYMBOL_GPL(ife_validate_meta_u16); + +static LIST_HEAD(ifeoplist); +static DEFINE_RWLOCK(ife_mod_lock); + +static struct tcf_meta_ops *find_ife_oplist(u16 metaid) +{ + struct tcf_meta_ops *o; + + read_lock(&ife_mod_lock); + list_for_each_entry(o, &ifeoplist, list) { + if (o->metaid == metaid) { + if (!try_module_get(o->owner)) + o = NULL; + read_unlock(&ife_mod_lock); + return o; + } + } + read_unlock(&ife_mod_lock); + + return NULL; +} + +int register_ife_op(struct tcf_meta_ops *mops) +{ + struct tcf_meta_ops *m; + + if (!mops->metaid || !mops->metatype || !mops->name || + !mops->check_presence || !mops->encode || !mops->decode || + !mops->get || !mops->alloc) + return -EINVAL; + + write_lock(&ife_mod_lock); + + list_for_each_entry(m, &ifeoplist, list) { + if (m->metaid == mops->metaid || + (strcmp(mops->name, m->name) == 0)) { + write_unlock(&ife_mod_lock); + return -EEXIST; + } + } + + if (!mops->release) + mops->release = ife_release_meta_gen; + + list_add_tail(&mops->list, &ifeoplist); + write_unlock(&ife_mod_lock); + return 0; +} +EXPORT_SYMBOL_GPL(unregister_ife_op); + +int unregister_ife_op(struct tcf_meta_ops *mops) +{ + struct tcf_meta_ops *m; + int err = -ENOENT; + + write_lock(&ife_mod_lock); + list_for_each_entry(m, &ifeoplist, list) { + if (m->metaid == mops->metaid) { + list_del(&mops->list); + err = 0; + break; + } + } + write_unlock(&ife_mod_lock); + + return err; +} +EXPORT_SYMBOL_GPL(register_ife_op); + +static int ife_validate_metatype(struct tcf_meta_ops *ops, void *val, int len) +{ + int ret = 0; + /* XXX: unfortunately cant use nla_policy at this point + * because a length of 0 is valid in the case of + * "allow". "use" semantics do enforce for proper + * length and i couldve use nla_policy but it makes it hard + * to use it just for that.. + */ + if (ops->validate) + return ops->validate(val, len); + + if (ops->metatype == NLA_U32) + ret = ife_validate_meta_u32(val, len); + else if (ops->metatype == NLA_U16) + ret = ife_validate_meta_u16(val, len); + + return ret; +} + +/* called when adding new meta information + * under ife->tcf_lock +*/ +static int load_metaops_and_vet(struct tcf_ife_info *ife, u32 metaid, + void *val, int len) +{ + struct tcf_meta_ops *ops = find_ife_oplist(metaid); + int ret = 0; + + if (!ops) { + ret = -ENOENT; +#ifdef CONFIG_MODULES + spin_unlock_bh(&ife->tcf_lock); + rtnl_unlock(); + request_module("ifemeta%u", metaid); + rtnl_lock(); + spin_lock_bh(&ife->tcf_lock); + ops = find_ife_oplist(metaid); +#endif + } + + if (ops) { + ret = 0; + if (len) + ret = ife_validate_metatype(ops, val, len); + + module_put(ops->owner); + } + + return ret; +} + +/* called when adding new meta information + * under ife->tcf_lock +*/ +static int add_metainfo(struct tcf_ife_info *ife, u32 metaid, void *metaval, + int len) +{ + struct tcf_meta_info *mi = NULL; + struct tcf_meta_ops *ops = find_ife_oplist(metaid); + int ret = 0; + + if (!ops) + return -ENOENT; + + mi = kzalloc(sizeof(*mi), GFP_KERNEL); + if (!mi) { + /*put back what find_ife_oplist took */ + module_put(ops->owner); + return -ENOMEM; + } + + mi->metaid = metaid; + mi->ops = ops; + if (len > 0) { + ret = ops->alloc(mi, metaval); + if (ret != 0) { + kfree(mi); + module_put(ops->owner); + return ret; + } + } + + list_add_tail(&mi->metalist, &ife->metalist); + + return ret; +} + +static int use_all_metadata(struct tcf_ife_info *ife) +{ + struct tcf_meta_ops *o; + int rc = 0; + int installed = 0; + + list_for_each_entry(o, &ifeoplist, list) { + rc = add_metainfo(ife, o->metaid, NULL, 0); + if (rc == 0) + installed += 1; + } + + if (installed) + return 0; + else + return -EINVAL; +} + +static int dump_metalist(struct sk_buff *skb, struct tcf_ife_info *ife) +{ + struct tcf_meta_info *e; + struct nlattr *nest; + unsigned char *b = skb_tail_pointer(skb); + int total_encoded = 0; + + /*can only happen on decode */ + if (list_empty(&ife->metalist)) + return 0; + + nest = nla_nest_start(skb, TCA_IFE_METALST); + if (!nest) + goto out_nlmsg_trim; + + list_for_each_entry(e, &ife->metalist, metalist) { + if (!e->ops->get(skb, e)) + total_encoded += 1; + } + + if (!total_encoded) + goto out_nlmsg_trim; + + nla_nest_end(skb, nest); + + return 0; + +out_nlmsg_trim: + nlmsg_trim(skb, b); + return -1; +} + +/* under ife->tcf_lock */ +static void _tcf_ife_cleanup(struct tc_action *a, int bind) +{ + struct tcf_ife_info *ife = a->priv; + struct tcf_meta_info *e, *n; + + list_for_each_entry_safe(e, n, &ife->metalist, metalist) { + module_put(e->ops->owner); + list_del(&e->metalist); + if (e->metaval) { + if (e->ops->release) + e->ops->release(e); + else + kfree(e->metaval); + } + kfree(e); + } +} + +static void tcf_ife_cleanup(struct tc_action *a, int bind) +{ + struct tcf_ife_info *ife = a->priv; + + spin_lock_bh(&ife->tcf_lock); + _tcf_ife_cleanup(a, bind); + spin_unlock_bh(&ife->tcf_lock); +} + +/* under ife->tcf_lock */ +static int populate_metalist(struct tcf_ife_info *ife, struct nlattr **tb) +{ + int len = 0; + int rc = 0; + int i = 0; + void *val; + + for (i = 1; i < max_metacnt; i++) { + if (tb[i]) { + val = nla_data(tb[i]); + len = nla_len(tb[i]); + + rc = load_metaops_and_vet(ife, i, val, len); + if (rc != 0) + return rc; + + rc = add_metainfo(ife, i, val, len); + if (rc) + return rc; + } + } + + return rc; +} + +static int tcf_ife_init(struct net *net, struct nlattr *nla, + struct nlattr *est, struct tc_action *a, + int ovr, int bind) +{ + struct tc_action_net *tn = net_generic(net, ife_net_id); + struct nlattr *tb[TCA_IFE_MAX + 1]; + struct nlattr *tb2[IFE_META_MAX + 1]; + struct tcf_ife_info *ife; + struct tc_ife *parm; + u16 ife_type = 0; + u8 *daddr = NULL; + u8 *saddr = NULL; + int ret = 0, exists = 0; + int err; + + err = nla_parse_nested(tb, TCA_IFE_MAX, nla, ife_policy); + if (err < 0) + return err; + + if (!tb[TCA_IFE_PARMS]) + return -EINVAL; + + parm = nla_data(tb[TCA_IFE_PARMS]); + + exists = tcf_hash_check(tn, parm->index, a, bind); + if (exists && bind) + return 0; + + if (parm->flags & IFE_ENCODE) { + /* Until we get issued the ethertype, we cant have + * a default.. + **/ + if (!tb[TCA_IFE_TYPE]) { + if (exists) + tcf_hash_release(a, bind); + pr_info("You MUST pass etherype for encoding\n"); + return -EINVAL; + } + } + + if (!exists) { + ret = tcf_hash_create(tn, parm->index, est, a, sizeof(*ife), + bind, false); + if (ret) + return ret; + ret = ACT_P_CREATED; + } else { + tcf_hash_release(a, bind); + if (!ovr) + return -EEXIST; + } + + ife = to_ife(a); + ife->flags = parm->flags; + + if (parm->flags & IFE_ENCODE) { + ife_type = nla_get_u16(tb[TCA_IFE_TYPE]); + if (tb[TCA_IFE_DMAC]) + daddr = nla_data(tb[TCA_IFE_DMAC]); + if (tb[TCA_IFE_SMAC]) + saddr = nla_data(tb[TCA_IFE_SMAC]); + } + + spin_lock_bh(&ife->tcf_lock); + ife->tcf_action = parm->action; + + if (parm->flags & IFE_ENCODE) { + if (daddr) + ether_addr_copy(ife->eth_dst, daddr); + else + eth_zero_addr(ife->eth_dst); + + if (saddr) + ether_addr_copy(ife->eth_src, saddr); + else + eth_zero_addr(ife->eth_src); + + ife->eth_type = ife_type; + } + + if (ret == ACT_P_CREATED) + INIT_LIST_HEAD(&ife->metalist); + + if (tb[TCA_IFE_METALST]) { + err = nla_parse_nested(tb2, IFE_META_MAX, tb[TCA_IFE_METALST], + NULL); + if (err) { +metadata_parse_err: + if (exists) + tcf_hash_release(a, bind); + if (ret == ACT_P_CREATED) + _tcf_ife_cleanup(a, bind); + + spin_unlock_bh(&ife->tcf_lock); + return err; + } + + err = populate_metalist(ife, tb2); + if (err) + goto metadata_parse_err; + + } else { + /* if no passed metadata allow list or passed allow-all + * then here we process by adding as many supported metadatum + * as we can. You better have at least one else we are + * going to bail out + */ + err = use_all_metadata(ife); + if (err) { + if (ret == ACT_P_CREATED) + _tcf_ife_cleanup(a, bind); + + spin_unlock_bh(&ife->tcf_lock); + return err; + } + } + + spin_unlock_bh(&ife->tcf_lock); + + if (ret == ACT_P_CREATED) + tcf_hash_insert(tn, a); + + return ret; +} + +static int tcf_ife_dump(struct sk_buff *skb, struct tc_action *a, int bind, + int ref) +{ + unsigned char *b = skb_tail_pointer(skb); + struct tcf_ife_info *ife = a->priv; + struct tc_ife opt = { + .index = ife->tcf_index, + .refcnt = ife->tcf_refcnt - ref, + .bindcnt = ife->tcf_bindcnt - bind, + .action = ife->tcf_action, + .flags = ife->flags, + }; + struct tcf_t t; + + if (nla_put(skb, TCA_IFE_PARMS, sizeof(opt), &opt)) + goto nla_put_failure; + + t.install = jiffies_to_clock_t(jiffies - ife->tcf_tm.install); + t.lastuse = jiffies_to_clock_t(jiffies - ife->tcf_tm.lastuse); + t.expires = jiffies_to_clock_t(ife->tcf_tm.expires); + if (nla_put(skb, TCA_IFE_TM, sizeof(t), &t)) + goto nla_put_failure; + + if (!is_zero_ether_addr(ife->eth_dst)) { + if (nla_put(skb, TCA_IFE_DMAC, ETH_ALEN, ife->eth_dst)) + goto nla_put_failure; + } + + if (!is_zero_ether_addr(ife->eth_src)) { + if (nla_put(skb, TCA_IFE_SMAC, ETH_ALEN, ife->eth_src)) + goto nla_put_failure; + } + + if (nla_put(skb, TCA_IFE_TYPE, 2, &ife->eth_type)) + goto nla_put_failure; + + if (dump_metalist(skb, ife)) { + /*ignore failure to dump metalist */ + pr_info("Failed to dump metalist\n"); + } + + return skb->len; + +nla_put_failure: + nlmsg_trim(skb, b); + return -1; +} + +int find_decode_metaid(struct sk_buff *skb, struct tcf_ife_info *ife, + u16 metaid, u16 mlen, void *mdata) +{ + struct tcf_meta_info *e; + + /* XXX: use hash to speed up */ + list_for_each_entry(e, &ife->metalist, metalist) { + if (metaid == e->metaid) { + if (e->ops) { + /* We check for decode presence already */ + return e->ops->decode(skb, mdata, mlen); + } + } + } + + return 0; +} + +struct ifeheadr { + __be16 metalen; + u8 tlv_data[]; +}; + +struct meta_tlvhdr { + __be16 type; + __be16 len; +}; + +static int tcf_ife_decode(struct sk_buff *skb, const struct tc_action *a, + struct tcf_result *res) +{ + struct tcf_ife_info *ife = a->priv; + int action = ife->tcf_action; + struct ifeheadr *ifehdr = (struct ifeheadr *)skb->data; + u16 ifehdrln = ifehdr->metalen; + struct meta_tlvhdr *tlv = (struct meta_tlvhdr *)(ifehdr->tlv_data); + + spin_lock(&ife->tcf_lock); + bstats_update(&ife->tcf_bstats, skb); + ife->tcf_tm.lastuse = jiffies; + spin_unlock(&ife->tcf_lock); + + ifehdrln = ntohs(ifehdrln); + if (unlikely(!pskb_may_pull(skb, ifehdrln))) { + spin_lock(&ife->tcf_lock); + ife->tcf_qstats.drops++; + spin_unlock(&ife->tcf_lock); + return TC_ACT_SHOT; + } + + skb_set_mac_header(skb, ifehdrln); + __skb_pull(skb, ifehdrln); + skb->protocol = eth_type_trans(skb, skb->dev); + ifehdrln -= IFE_METAHDRLEN; + + while (ifehdrln > 0) { + u8 *tlvdata = (u8 *)tlv; + u16 mtype = tlv->type; + u16 mlen = tlv->len; + + mtype = ntohs(mtype); + mlen = ntohs(mlen); + + if (find_decode_metaid(skb, ife, mtype, (mlen - 4), + (void *)(tlvdata + 4))) { + /* abuse overlimits to count when we receive metadata + * but dont have an ops for it + */ + pr_info_ratelimited("Unknown metaid %d alnlen %d\n", + mtype, mlen); + ife->tcf_qstats.overlimits++; + } + + tlvdata += mlen; + ifehdrln -= mlen; + tlv = (struct meta_tlvhdr *)tlvdata; + } + + skb_reset_network_header(skb); + return action; +} + +/*XXX: check if we can do this at install time instead of current + * send data path +**/ +static int ife_get_sz(struct sk_buff *skb, struct tcf_ife_info *ife) +{ + struct tcf_meta_info *e, *n; + int tot_run_sz = 0, run_sz = 0; + + list_for_each_entry_safe(e, n, &ife->metalist, metalist) { + if (e->ops->check_presence) { + run_sz = e->ops->check_presence(skb, e); + tot_run_sz += run_sz; + } + } + + return tot_run_sz; +} + +static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a, + struct tcf_result *res) +{ + struct tcf_ife_info *ife = a->priv; + int action = ife->tcf_action; + struct ethhdr *oethh; /* outer ether header */ + struct ethhdr *iethh; /* inner eth header */ + struct tcf_meta_info *e; + /* + OUTERHDR:TOTMETALEN:{TLVHDR:Metadatum:TLVHDR..}:ORIGDATA + where ORIGDATA = original ethernet header ... + */ + u16 metalen = ife_get_sz(skb, ife); + int hdrm = metalen + skb->dev->hard_header_len + IFE_METAHDRLEN; + unsigned int skboff = skb->dev->hard_header_len; + u32 at = G_TC_AT(skb->tc_verd); + int new_len = skb->len + hdrm; + bool exceed_mtu = false; + int err; + + if (at & AT_EGRESS) { + if (new_len > skb->dev->mtu) + exceed_mtu = true; + } + + spin_lock(&ife->tcf_lock); + bstats_update(&ife->tcf_bstats, skb); + ife->tcf_tm.lastuse = jiffies; + + if (!metalen) { /* no metadata to send */ + /* abuse overlimits to count when we allow packet + * with no metadata + */ + ife->tcf_qstats.overlimits++; + spin_unlock(&ife->tcf_lock); + return action; + } + /* could be stupid policy setup or mtu config + * so lets be conservative.. */ + if ((action == TC_ACT_SHOT) || exceed_mtu) { + ife->tcf_qstats.drops++; + spin_unlock(&ife->tcf_lock); + return TC_ACT_SHOT; + } + + iethh = eth_hdr(skb); + + err = skb_cow_head(skb, hdrm); + if (unlikely(err)) { + ife->tcf_qstats.drops++; + spin_unlock(&ife->tcf_lock); + return TC_ACT_SHOT; + } + + if (!(at & AT_EGRESS)) + skb_push(skb, skb->dev->hard_header_len); + + __skb_push(skb, hdrm); + memcpy(skb->data, iethh, skb->mac_len); + skb_reset_mac_header(skb); + oethh = eth_hdr(skb); + + /*total metadata length */ + metalen += IFE_METAHDRLEN; + metalen = htons(metalen); + memcpy((skb->data + skboff), &metalen, IFE_METAHDRLEN); + skboff += IFE_METAHDRLEN; + + /* XXX: we dont have a clever way of telling encode to + * not repeat some of the computations that are done by + * ops->presence_check... + */ + list_for_each_entry(e, &ife->metalist, metalist) { + if (e->ops->encode) { + err = e->ops->encode(skb, (void *)(skb->data + skboff), + e); + } + if (err < 0) { + /* too corrupt to keep around if overwritten */ + ife->tcf_qstats.drops++; + spin_unlock(&ife->tcf_lock); + return TC_ACT_SHOT; + } + skboff += err; + } + + if (!is_zero_ether_addr(ife->eth_src)) + ether_addr_copy(oethh->h_source, ife->eth_src); + else + ether_addr_copy(oethh->h_source, iethh->h_source); + if (!is_zero_ether_addr(ife->eth_dst)) + ether_addr_copy(oethh->h_dest, ife->eth_dst); + else + ether_addr_copy(oethh->h_dest, iethh->h_dest); + oethh->h_proto = htons(ife->eth_type); + + if (!(at & AT_EGRESS)) + skb_pull(skb, skb->dev->hard_header_len); + + spin_unlock(&ife->tcf_lock); + + return action; +} + +static int tcf_ife_act(struct sk_buff *skb, const struct tc_action *a, + struct tcf_result *res) +{ + struct tcf_ife_info *ife = a->priv; + + if (ife->flags & IFE_ENCODE) + return tcf_ife_encode(skb, a, res); + + if (!(ife->flags & IFE_ENCODE)) + return tcf_ife_decode(skb, a, res); + + pr_info_ratelimited("unknown failure(policy neither de/encode\n"); + spin_lock(&ife->tcf_lock); + bstats_update(&ife->tcf_bstats, skb); + ife->tcf_tm.lastuse = jiffies; + ife->tcf_qstats.drops++; + spin_unlock(&ife->tcf_lock); + + return TC_ACT_SHOT; +} + +static int tcf_ife_walker(struct net *net, struct sk_buff *skb, + struct netlink_callback *cb, int type, + struct tc_action *a) +{ + struct tc_action_net *tn = net_generic(net, ife_net_id); + + return tcf_generic_walker(tn, skb, cb, type, a); +} + +static int tcf_ife_search(struct net *net, struct tc_action *a, u32 index) +{ + struct tc_action_net *tn = net_generic(net, ife_net_id); + + return tcf_hash_search(tn, a, index); +} + +static struct tc_action_ops act_ife_ops = { + .kind = "ife", + .type = TCA_ACT_IFE, + .owner = THIS_MODULE, + .act = tcf_ife_act, + .dump = tcf_ife_dump, + .cleanup = tcf_ife_cleanup, + .init = tcf_ife_init, + .walk = tcf_ife_walker, + .lookup = tcf_ife_search, +}; + +static __net_init int ife_init_net(struct net *net) +{ + struct tc_action_net *tn = net_generic(net, ife_net_id); + + return tc_action_net_init(tn, &act_ife_ops, IFE_TAB_MASK); +} + +static void __net_exit ife_exit_net(struct net *net) +{ + struct tc_action_net *tn = net_generic(net, ife_net_id); + + tc_action_net_exit(tn); +} + +static struct pernet_operations ife_net_ops = { + .init = ife_init_net, + .exit = ife_exit_net, + .id = &ife_net_id, + .size = sizeof(struct tc_action_net), +}; + +static int __init ife_init_module(void) +{ + return tcf_register_action(&act_ife_ops, &ife_net_ops); +} + +static void __exit ife_cleanup_module(void) +{ + tcf_unregister_action(&act_ife_ops, &ife_net_ops); +} + +module_init(ife_init_module); +module_exit(ife_cleanup_module); + +MODULE_AUTHOR("Jamal Hadi Salim(2015)"); +MODULE_DESCRIPTION("Inter-FE LFB action"); +MODULE_LICENSE("GPL"); diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c index 6b70399ab..8b5270008 100644 --- a/net/sched/act_ipt.c +++ b/net/sched/act_ipt.c @@ -30,6 +30,10 @@ #define IPT_TAB_MASK 15 +static int ipt_net_id; + +static int xt_net_id; + static int ipt_init_target(struct xt_entry_target *t, char *table, unsigned int hook) { struct xt_tgchk_param par; @@ -84,14 +88,15 @@ static const struct nla_policy ipt_policy[TCA_IPT_MAX + 1] = { [TCA_IPT_TARG] = { .len = sizeof(struct xt_entry_target) }, }; -static int tcf_ipt_init(struct net *net, struct nlattr *nla, struct nlattr *est, - struct tc_action *a, int ovr, int bind) +static int __tcf_ipt_init(struct tc_action_net *tn, struct nlattr *nla, + struct nlattr *est, struct tc_action *a, int ovr, + int bind) { struct nlattr *tb[TCA_IPT_MAX + 1]; struct tcf_ipt *ipt; struct xt_entry_target *td, *t; char *tname; - int ret = 0, err; + int ret = 0, err, exists = 0; u32 hook = 0; u32 index = 0; @@ -102,20 +107,26 @@ static int tcf_ipt_init(struct net *net, struct nlattr *nla, struct nlattr *est, if (err < 0) return err; - if (tb[TCA_IPT_HOOK] == NULL) - return -EINVAL; - if (tb[TCA_IPT_TARG] == NULL) + if (tb[TCA_IPT_INDEX] != NULL) + index = nla_get_u32(tb[TCA_IPT_INDEX]); + + exists = tcf_hash_check(tn, index, a, bind); + if (exists && bind) + return 0; + + if (tb[TCA_IPT_HOOK] == NULL || tb[TCA_IPT_TARG] == NULL) { + if (exists) + tcf_hash_release(a, bind); return -EINVAL; + } td = (struct xt_entry_target *)nla_data(tb[TCA_IPT_TARG]); if (nla_len(tb[TCA_IPT_TARG]) < td->u.target_size) return -EINVAL; - if (tb[TCA_IPT_INDEX] != NULL) - index = nla_get_u32(tb[TCA_IPT_INDEX]); - - if (!tcf_hash_check(index, a, bind) ) { - ret = tcf_hash_create(index, est, a, sizeof(*ipt), bind, false); + if (!tcf_hash_check(tn, index, a, bind)) { + ret = tcf_hash_create(tn, index, est, a, sizeof(*ipt), bind, + false); if (ret) return ret; ret = ACT_P_CREATED; @@ -158,7 +169,7 @@ static int tcf_ipt_init(struct net *net, struct nlattr *nla, struct nlattr *est, ipt->tcfi_hook = hook; spin_unlock_bh(&ipt->tcf_lock); if (ret == ACT_P_CREATED) - tcf_hash_insert(a); + tcf_hash_insert(tn, a); return ret; err3: @@ -171,6 +182,24 @@ err1: return err; } +static int tcf_ipt_init(struct net *net, struct nlattr *nla, + struct nlattr *est, struct tc_action *a, int ovr, + int bind) +{ + struct tc_action_net *tn = net_generic(net, ipt_net_id); + + return __tcf_ipt_init(tn, nla, est, a, ovr, bind); +} + +static int tcf_xt_init(struct net *net, struct nlattr *nla, + struct nlattr *est, struct tc_action *a, int ovr, + int bind) +{ + struct tc_action_net *tn = net_generic(net, xt_net_id); + + return __tcf_ipt_init(tn, nla, est, a, ovr, bind); +} + static int tcf_ipt(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { @@ -262,6 +291,22 @@ nla_put_failure: return -1; } +static int tcf_ipt_walker(struct net *net, struct sk_buff *skb, + struct netlink_callback *cb, int type, + struct tc_action *a) +{ + struct tc_action_net *tn = net_generic(net, ipt_net_id); + + return tcf_generic_walker(tn, skb, cb, type, a); +} + +static int tcf_ipt_search(struct net *net, struct tc_action *a, u32 index) +{ + struct tc_action_net *tn = net_generic(net, ipt_net_id); + + return tcf_hash_search(tn, a, index); +} + static struct tc_action_ops act_ipt_ops = { .kind = "ipt", .type = TCA_ACT_IPT, @@ -270,8 +315,47 @@ static struct tc_action_ops act_ipt_ops = { .dump = tcf_ipt_dump, .cleanup = tcf_ipt_release, .init = tcf_ipt_init, + .walk = tcf_ipt_walker, + .lookup = tcf_ipt_search, +}; + +static __net_init int ipt_init_net(struct net *net) +{ + struct tc_action_net *tn = net_generic(net, ipt_net_id); + + return tc_action_net_init(tn, &act_ipt_ops, IPT_TAB_MASK); +} + +static void __net_exit ipt_exit_net(struct net *net) +{ + struct tc_action_net *tn = net_generic(net, ipt_net_id); + + tc_action_net_exit(tn); +} + +static struct pernet_operations ipt_net_ops = { + .init = ipt_init_net, + .exit = ipt_exit_net, + .id = &ipt_net_id, + .size = sizeof(struct tc_action_net), }; +static int tcf_xt_walker(struct net *net, struct sk_buff *skb, + struct netlink_callback *cb, int type, + struct tc_action *a) +{ + struct tc_action_net *tn = net_generic(net, xt_net_id); + + return tcf_generic_walker(tn, skb, cb, type, a); +} + +static int tcf_xt_search(struct net *net, struct tc_action *a, u32 index) +{ + struct tc_action_net *tn = net_generic(net, xt_net_id); + + return tcf_hash_search(tn, a, index); +} + static struct tc_action_ops act_xt_ops = { .kind = "xt", .type = TCA_ACT_XT, @@ -279,7 +363,30 @@ static struct tc_action_ops act_xt_ops = { .act = tcf_ipt, .dump = tcf_ipt_dump, .cleanup = tcf_ipt_release, - .init = tcf_ipt_init, + .init = tcf_xt_init, + .walk = tcf_xt_walker, + .lookup = tcf_xt_search, +}; + +static __net_init int xt_init_net(struct net *net) +{ + struct tc_action_net *tn = net_generic(net, xt_net_id); + + return tc_action_net_init(tn, &act_xt_ops, IPT_TAB_MASK); +} + +static void __net_exit xt_exit_net(struct net *net) +{ + struct tc_action_net *tn = net_generic(net, xt_net_id); + + tc_action_net_exit(tn); +} + +static struct pernet_operations xt_net_ops = { + .init = xt_init_net, + .exit = xt_exit_net, + .id = &xt_net_id, + .size = sizeof(struct tc_action_net), }; MODULE_AUTHOR("Jamal Hadi Salim(2002-13)"); @@ -291,12 +398,13 @@ static int __init ipt_init_module(void) { int ret1, ret2; - ret1 = tcf_register_action(&act_xt_ops, IPT_TAB_MASK); + ret1 = tcf_register_action(&act_xt_ops, &xt_net_ops); if (ret1 < 0) - printk("Failed to load xt action\n"); - ret2 = tcf_register_action(&act_ipt_ops, IPT_TAB_MASK); + pr_err("Failed to load xt action\n"); + + ret2 = tcf_register_action(&act_ipt_ops, &ipt_net_ops); if (ret2 < 0) - printk("Failed to load ipt action\n"); + pr_err("Failed to load ipt action\n"); if (ret1 < 0 && ret2 < 0) { return ret1; @@ -306,8 +414,8 @@ static int __init ipt_init_module(void) static void __exit ipt_cleanup_module(void) { - tcf_unregister_action(&act_xt_ops); - tcf_unregister_action(&act_ipt_ops); + tcf_unregister_action(&act_ipt_ops, &ipt_net_ops); + tcf_unregister_action(&act_xt_ops, &xt_net_ops); } module_init(ipt_init_module); diff --git a/net/sched/act_meta_mark.c b/net/sched/act_meta_mark.c new file mode 100644 index 000000000..82892170c --- /dev/null +++ b/net/sched/act_meta_mark.c @@ -0,0 +1,79 @@ +/* + * net/sched/act_meta_mark.c IFE skb->mark metadata module + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * copyright Jamal Hadi Salim (2015) + * +*/ + +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/string.h> +#include <linux/errno.h> +#include <linux/skbuff.h> +#include <linux/rtnetlink.h> +#include <linux/module.h> +#include <linux/init.h> +#include <net/netlink.h> +#include <net/pkt_sched.h> +#include <uapi/linux/tc_act/tc_ife.h> +#include <net/tc_act/tc_ife.h> +#include <linux/rtnetlink.h> + +static int skbmark_encode(struct sk_buff *skb, void *skbdata, + struct tcf_meta_info *e) +{ + u32 ifemark = skb->mark; + + return ife_encode_meta_u32(ifemark, skbdata, e); +} + +static int skbmark_decode(struct sk_buff *skb, void *data, u16 len) +{ + u32 ifemark = *(u32 *)data; + + skb->mark = ntohl(ifemark); + return 0; +} + +static int skbmark_check(struct sk_buff *skb, struct tcf_meta_info *e) +{ + return ife_check_meta_u32(skb->mark, e); +} + +static struct tcf_meta_ops ife_skbmark_ops = { + .metaid = IFE_META_SKBMARK, + .metatype = NLA_U32, + .name = "skbmark", + .synopsis = "skb mark 32 bit metadata", + .check_presence = skbmark_check, + .encode = skbmark_encode, + .decode = skbmark_decode, + .get = ife_get_meta_u32, + .alloc = ife_alloc_meta_u32, + .release = ife_release_meta_gen, + .validate = ife_validate_meta_u32, + .owner = THIS_MODULE, +}; + +static int __init ifemark_init_module(void) +{ + return register_ife_op(&ife_skbmark_ops); +} + +static void __exit ifemark_cleanup_module(void) +{ + unregister_ife_op(&ife_skbmark_ops); +} + +module_init(ifemark_init_module); +module_exit(ifemark_cleanup_module); + +MODULE_AUTHOR("Jamal Hadi Salim(2015)"); +MODULE_DESCRIPTION("Inter-FE skb mark metadata module"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_IFE_META(IFE_META_SKBMARK); diff --git a/net/sched/act_meta_skbprio.c b/net/sched/act_meta_skbprio.c new file mode 100644 index 000000000..26bf4d860 --- /dev/null +++ b/net/sched/act_meta_skbprio.c @@ -0,0 +1,76 @@ +/* + * net/sched/act_meta_prio.c IFE skb->priority metadata module + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * copyright Jamal Hadi Salim (2015) + * +*/ + +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/string.h> +#include <linux/errno.h> +#include <linux/skbuff.h> +#include <linux/rtnetlink.h> +#include <linux/module.h> +#include <linux/init.h> +#include <net/netlink.h> +#include <net/pkt_sched.h> +#include <uapi/linux/tc_act/tc_ife.h> +#include <net/tc_act/tc_ife.h> + +static int skbprio_check(struct sk_buff *skb, struct tcf_meta_info *e) +{ + return ife_check_meta_u32(skb->priority, e); +} + +static int skbprio_encode(struct sk_buff *skb, void *skbdata, + struct tcf_meta_info *e) +{ + u32 ifeprio = skb->priority; /* avoid having to cast skb->priority*/ + + return ife_encode_meta_u32(ifeprio, skbdata, e); +} + +static int skbprio_decode(struct sk_buff *skb, void *data, u16 len) +{ + u32 ifeprio = *(u32 *)data; + + skb->priority = ntohl(ifeprio); + return 0; +} + +static struct tcf_meta_ops ife_prio_ops = { + .metaid = IFE_META_PRIO, + .metatype = NLA_U32, + .name = "skbprio", + .synopsis = "skb prio metadata", + .check_presence = skbprio_check, + .encode = skbprio_encode, + .decode = skbprio_decode, + .get = ife_get_meta_u32, + .alloc = ife_alloc_meta_u32, + .owner = THIS_MODULE, +}; + +static int __init ifeprio_init_module(void) +{ + return register_ife_op(&ife_prio_ops); +} + +static void __exit ifeprio_cleanup_module(void) +{ + unregister_ife_op(&ife_prio_ops); +} + +module_init(ifeprio_init_module); +module_exit(ifeprio_cleanup_module); + +MODULE_AUTHOR("Jamal Hadi Salim(2015)"); +MODULE_DESCRIPTION("Inter-FE skb prio metadata action"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_IFE_META(IFE_META_PRIO); diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 32fcdecdb..8f3948dd3 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -50,15 +50,18 @@ static const struct nla_policy mirred_policy[TCA_MIRRED_MAX + 1] = { [TCA_MIRRED_PARMS] = { .len = sizeof(struct tc_mirred) }, }; +static int mirred_net_id; + static int tcf_mirred_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action *a, int ovr, int bind) { + struct tc_action_net *tn = net_generic(net, mirred_net_id); struct nlattr *tb[TCA_MIRRED_MAX + 1]; struct tc_mirred *parm; struct tcf_mirred *m; struct net_device *dev; - int ret, ok_push = 0; + int ret, ok_push = 0, exists = 0; if (nla == NULL) return -EINVAL; @@ -68,17 +71,27 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, if (tb[TCA_MIRRED_PARMS] == NULL) return -EINVAL; parm = nla_data(tb[TCA_MIRRED_PARMS]); + + exists = tcf_hash_check(tn, parm->index, a, bind); + if (exists && bind) + return 0; + switch (parm->eaction) { case TCA_EGRESS_MIRROR: case TCA_EGRESS_REDIR: break; default: + if (exists) + tcf_hash_release(a, bind); return -EINVAL; } if (parm->ifindex) { dev = __dev_get_by_index(net, parm->ifindex); - if (dev == NULL) + if (dev == NULL) { + if (exists) + tcf_hash_release(a, bind); return -ENODEV; + } switch (dev->type) { case ARPHRD_TUNNEL: case ARPHRD_TUNNEL6: @@ -96,18 +109,15 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, dev = NULL; } - if (!tcf_hash_check(parm->index, a, bind)) { + if (!exists) { if (dev == NULL) return -EINVAL; - ret = tcf_hash_create(parm->index, est, a, sizeof(*m), - bind, true); + ret = tcf_hash_create(tn, parm->index, est, a, + sizeof(*m), bind, true); if (ret) return ret; ret = ACT_P_CREATED; } else { - if (bind) - return 0; - tcf_hash_release(a, bind); if (!ovr) return -EEXIST; @@ -130,7 +140,7 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, spin_lock_bh(&mirred_list_lock); list_add(&m->tcfm_list, &mirred_list); spin_unlock_bh(&mirred_list_lock); - tcf_hash_insert(a); + tcf_hash_insert(tn, a); } return ret; @@ -179,7 +189,6 @@ static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a, skb2->skb_iif = skb->dev->ifindex; skb2->dev = dev; - skb_sender_cpu_clear(skb2); err = dev_queue_xmit(skb2); if (err) { @@ -221,6 +230,22 @@ nla_put_failure: return -1; } +static int tcf_mirred_walker(struct net *net, struct sk_buff *skb, + struct netlink_callback *cb, int type, + struct tc_action *a) +{ + struct tc_action_net *tn = net_generic(net, mirred_net_id); + + return tcf_generic_walker(tn, skb, cb, type, a); +} + +static int tcf_mirred_search(struct net *net, struct tc_action *a, u32 index) +{ + struct tc_action_net *tn = net_generic(net, mirred_net_id); + + return tcf_hash_search(tn, a, index); +} + static int mirred_device_event(struct notifier_block *unused, unsigned long event, void *ptr) { @@ -257,6 +282,29 @@ static struct tc_action_ops act_mirred_ops = { .dump = tcf_mirred_dump, .cleanup = tcf_mirred_release, .init = tcf_mirred_init, + .walk = tcf_mirred_walker, + .lookup = tcf_mirred_search, +}; + +static __net_init int mirred_init_net(struct net *net) +{ + struct tc_action_net *tn = net_generic(net, mirred_net_id); + + return tc_action_net_init(tn, &act_mirred_ops, MIRRED_TAB_MASK); +} + +static void __net_exit mirred_exit_net(struct net *net) +{ + struct tc_action_net *tn = net_generic(net, mirred_net_id); + + tc_action_net_exit(tn); +} + +static struct pernet_operations mirred_net_ops = { + .init = mirred_init_net, + .exit = mirred_exit_net, + .id = &mirred_net_id, + .size = sizeof(struct tc_action_net), }; MODULE_AUTHOR("Jamal Hadi Salim(2002)"); @@ -270,12 +318,12 @@ static int __init mirred_init_module(void) return err; pr_info("Mirror/redirect action on\n"); - return tcf_register_action(&act_mirred_ops, MIRRED_TAB_MASK); + return tcf_register_action(&act_mirred_ops, &mirred_net_ops); } static void __exit mirred_cleanup_module(void) { - tcf_unregister_action(&act_mirred_ops); + tcf_unregister_action(&act_mirred_ops, &mirred_net_ops); unregister_netdevice_notifier(&mirred_device_notifier); } diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c index b7c4ead8b..0f65cdfbf 100644 --- a/net/sched/act_nat.c +++ b/net/sched/act_nat.c @@ -31,6 +31,8 @@ #define NAT_TAB_MASK 15 +static int nat_net_id; + static const struct nla_policy nat_policy[TCA_NAT_MAX + 1] = { [TCA_NAT_PARMS] = { .len = sizeof(struct tc_nat) }, }; @@ -38,6 +40,7 @@ static const struct nla_policy nat_policy[TCA_NAT_MAX + 1] = { static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action *a, int ovr, int bind) { + struct tc_action_net *tn = net_generic(net, nat_net_id); struct nlattr *tb[TCA_NAT_MAX + 1]; struct tc_nat *parm; int ret = 0, err; @@ -54,9 +57,9 @@ static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est, return -EINVAL; parm = nla_data(tb[TCA_NAT_PARMS]); - if (!tcf_hash_check(parm->index, a, bind)) { - ret = tcf_hash_create(parm->index, est, a, sizeof(*p), - bind, false); + if (!tcf_hash_check(tn, parm->index, a, bind)) { + ret = tcf_hash_create(tn, parm->index, est, a, + sizeof(*p), bind, false); if (ret) return ret; ret = ACT_P_CREATED; @@ -79,7 +82,7 @@ static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est, spin_unlock_bh(&p->tcf_lock); if (ret == ACT_P_CREATED) - tcf_hash_insert(a); + tcf_hash_insert(tn, a); return ret; } @@ -126,9 +129,7 @@ static int tcf_nat(struct sk_buff *skb, const struct tc_action *a, addr = iph->daddr; if (!((old_addr ^ addr) & mask)) { - if (skb_cloned(skb) && - !skb_clone_writable(skb, sizeof(*iph) + noff) && - pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) + if (skb_try_make_writable(skb, sizeof(*iph) + noff)) goto drop; new_addr &= mask; @@ -156,9 +157,7 @@ static int tcf_nat(struct sk_buff *skb, const struct tc_action *a, struct tcphdr *tcph; if (!pskb_may_pull(skb, ihl + sizeof(*tcph) + noff) || - (skb_cloned(skb) && - !skb_clone_writable(skb, ihl + sizeof(*tcph) + noff) && - pskb_expand_head(skb, 0, 0, GFP_ATOMIC))) + skb_try_make_writable(skb, ihl + sizeof(*tcph) + noff)) goto drop; tcph = (void *)(skb_network_header(skb) + ihl); @@ -171,9 +170,7 @@ static int tcf_nat(struct sk_buff *skb, const struct tc_action *a, struct udphdr *udph; if (!pskb_may_pull(skb, ihl + sizeof(*udph) + noff) || - (skb_cloned(skb) && - !skb_clone_writable(skb, ihl + sizeof(*udph) + noff) && - pskb_expand_head(skb, 0, 0, GFP_ATOMIC))) + skb_try_make_writable(skb, ihl + sizeof(*udph) + noff)) goto drop; udph = (void *)(skb_network_header(skb) + ihl); @@ -213,10 +210,8 @@ static int tcf_nat(struct sk_buff *skb, const struct tc_action *a, if ((old_addr ^ addr) & mask) break; - if (skb_cloned(skb) && - !skb_clone_writable(skb, ihl + sizeof(*icmph) + - sizeof(*iph) + noff) && - pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) + if (skb_try_make_writable(skb, ihl + sizeof(*icmph) + + sizeof(*iph) + noff)) goto drop; icmph = (void *)(skb_network_header(skb) + ihl); @@ -282,6 +277,22 @@ nla_put_failure: return -1; } +static int tcf_nat_walker(struct net *net, struct sk_buff *skb, + struct netlink_callback *cb, int type, + struct tc_action *a) +{ + struct tc_action_net *tn = net_generic(net, nat_net_id); + + return tcf_generic_walker(tn, skb, cb, type, a); +} + +static int tcf_nat_search(struct net *net, struct tc_action *a, u32 index) +{ + struct tc_action_net *tn = net_generic(net, nat_net_id); + + return tcf_hash_search(tn, a, index); +} + static struct tc_action_ops act_nat_ops = { .kind = "nat", .type = TCA_ACT_NAT, @@ -289,6 +300,29 @@ static struct tc_action_ops act_nat_ops = { .act = tcf_nat, .dump = tcf_nat_dump, .init = tcf_nat_init, + .walk = tcf_nat_walker, + .lookup = tcf_nat_search, +}; + +static __net_init int nat_init_net(struct net *net) +{ + struct tc_action_net *tn = net_generic(net, nat_net_id); + + return tc_action_net_init(tn, &act_nat_ops, NAT_TAB_MASK); +} + +static void __net_exit nat_exit_net(struct net *net) +{ + struct tc_action_net *tn = net_generic(net, nat_net_id); + + tc_action_net_exit(tn); +} + +static struct pernet_operations nat_net_ops = { + .init = nat_init_net, + .exit = nat_exit_net, + .id = &nat_net_id, + .size = sizeof(struct tc_action_net), }; MODULE_DESCRIPTION("Stateless NAT actions"); @@ -296,12 +330,12 @@ MODULE_LICENSE("GPL"); static int __init nat_init_module(void) { - return tcf_register_action(&act_nat_ops, NAT_TAB_MASK); + return tcf_register_action(&act_nat_ops, &nat_net_ops); } static void __exit nat_cleanup_module(void) { - tcf_unregister_action(&act_nat_ops); + tcf_unregister_action(&act_nat_ops, &nat_net_ops); } module_init(nat_init_module); diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index e38a7701f..429c3ab65 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -25,6 +25,8 @@ #define PEDIT_TAB_MASK 15 +static int pedit_net_id; + static const struct nla_policy pedit_policy[TCA_PEDIT_MAX + 1] = { [TCA_PEDIT_PARMS] = { .len = sizeof(struct tc_pedit) }, }; @@ -33,6 +35,7 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action *a, int ovr, int bind) { + struct tc_action_net *tn = net_generic(net, pedit_net_id); struct nlattr *tb[TCA_PEDIT_MAX + 1]; struct tc_pedit *parm; int ret = 0, err; @@ -54,11 +57,11 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, if (nla_len(tb[TCA_PEDIT_PARMS]) < sizeof(*parm) + ksize) return -EINVAL; - if (!tcf_hash_check(parm->index, a, bind)) { + if (!tcf_hash_check(tn, parm->index, a, bind)) { if (!parm->nkeys) return -EINVAL; - ret = tcf_hash_create(parm->index, est, a, sizeof(*p), - bind, false); + ret = tcf_hash_create(tn, parm->index, est, a, + sizeof(*p), bind, false); if (ret) return ret; p = to_pedit(a); @@ -93,7 +96,7 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, memcpy(p->tcfp_keys, parm->keys, ksize); spin_unlock_bh(&p->tcf_lock); if (ret == ACT_P_CREATED) - tcf_hash_insert(a); + tcf_hash_insert(tn, a); return ret; } @@ -211,6 +214,22 @@ nla_put_failure: return -1; } +static int tcf_pedit_walker(struct net *net, struct sk_buff *skb, + struct netlink_callback *cb, int type, + struct tc_action *a) +{ + struct tc_action_net *tn = net_generic(net, pedit_net_id); + + return tcf_generic_walker(tn, skb, cb, type, a); +} + +static int tcf_pedit_search(struct net *net, struct tc_action *a, u32 index) +{ + struct tc_action_net *tn = net_generic(net, pedit_net_id); + + return tcf_hash_search(tn, a, index); +} + static struct tc_action_ops act_pedit_ops = { .kind = "pedit", .type = TCA_ACT_PEDIT, @@ -219,6 +238,29 @@ static struct tc_action_ops act_pedit_ops = { .dump = tcf_pedit_dump, .cleanup = tcf_pedit_cleanup, .init = tcf_pedit_init, + .walk = tcf_pedit_walker, + .lookup = tcf_pedit_search, +}; + +static __net_init int pedit_init_net(struct net *net) +{ + struct tc_action_net *tn = net_generic(net, pedit_net_id); + + return tc_action_net_init(tn, &act_pedit_ops, PEDIT_TAB_MASK); +} + +static void __net_exit pedit_exit_net(struct net *net) +{ + struct tc_action_net *tn = net_generic(net, pedit_net_id); + + tc_action_net_exit(tn); +} + +static struct pernet_operations pedit_net_ops = { + .init = pedit_init_net, + .exit = pedit_exit_net, + .id = &pedit_net_id, + .size = sizeof(struct tc_action_net), }; MODULE_AUTHOR("Jamal Hadi Salim(2002-4)"); @@ -227,12 +269,12 @@ MODULE_LICENSE("GPL"); static int __init pedit_init_module(void) { - return tcf_register_action(&act_pedit_ops, PEDIT_TAB_MASK); + return tcf_register_action(&act_pedit_ops, &pedit_net_ops); } static void __exit pedit_cleanup_module(void) { - tcf_unregister_action(&act_pedit_ops); + tcf_unregister_action(&act_pedit_ops, &pedit_net_ops); } module_init(pedit_init_module); diff --git a/net/sched/act_police.c b/net/sched/act_police.c index 9a1c42a43..330f14e30 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -55,10 +55,14 @@ struct tc_police_compat { /* Each policer is serialized by its individual spinlock */ -static int tcf_act_police_walker(struct sk_buff *skb, struct netlink_callback *cb, - int type, struct tc_action *a) +static int police_net_id; + +static int tcf_act_police_walker(struct net *net, struct sk_buff *skb, + struct netlink_callback *cb, int type, + struct tc_action *a) { - struct tcf_hashinfo *hinfo = a->ops->hinfo; + struct tc_action_net *tn = net_generic(net, police_net_id); + struct tcf_hashinfo *hinfo = tn->hinfo; struct hlist_head *head; struct tcf_common *p; int err = 0, index = -1, i = 0, s_i = 0, n_i = 0; @@ -121,7 +125,8 @@ static int tcf_act_police_locate(struct net *net, struct nlattr *nla, struct tc_police *parm; struct tcf_police *police; struct qdisc_rate_table *R_tab = NULL, *P_tab = NULL; - struct tcf_hashinfo *hinfo = a->ops->hinfo; + struct tc_action_net *tn = net_generic(net, police_net_id); + struct tcf_hashinfo *hinfo = tn->hinfo; int size; if (nla == NULL) @@ -139,7 +144,7 @@ static int tcf_act_police_locate(struct net *net, struct nlattr *nla, parm = nla_data(tb[TCA_POLICE_TBF]); if (parm->index) { - if (tcf_hash_search(a, parm->index)) { + if (tcf_hash_search(tn, a, parm->index)) { police = to_police(a->priv); if (bind) { police->tcf_bindcnt += 1; @@ -233,7 +238,7 @@ override: police->tcfp_t_c = ktime_get_ns(); police->tcf_index = parm->index ? parm->index : - tcf_hash_new_index(hinfo); + tcf_hash_new_index(tn); h = tcf_hash(police->tcf_index, POL_TAB_MASK); spin_lock_bh(&hinfo->lock); hlist_add_head(&police->tcf_head, &hinfo->htab[h]); @@ -342,6 +347,13 @@ nla_put_failure: return -1; } +static int tcf_police_search(struct net *net, struct tc_action *a, u32 index) +{ + struct tc_action_net *tn = net_generic(net, police_net_id); + + return tcf_hash_search(tn, a, index); +} + MODULE_AUTHOR("Alexey Kuznetsov"); MODULE_DESCRIPTION("Policing actions"); MODULE_LICENSE("GPL"); @@ -353,19 +365,41 @@ static struct tc_action_ops act_police_ops = { .act = tcf_act_police, .dump = tcf_act_police_dump, .init = tcf_act_police_locate, - .walk = tcf_act_police_walker + .walk = tcf_act_police_walker, + .lookup = tcf_police_search, +}; + +static __net_init int police_init_net(struct net *net) +{ + struct tc_action_net *tn = net_generic(net, police_net_id); + + return tc_action_net_init(tn, &act_police_ops, POL_TAB_MASK); +} + +static void __net_exit police_exit_net(struct net *net) +{ + struct tc_action_net *tn = net_generic(net, police_net_id); + + tc_action_net_exit(tn); +} + +static struct pernet_operations police_net_ops = { + .init = police_init_net, + .exit = police_exit_net, + .id = &police_net_id, + .size = sizeof(struct tc_action_net), }; static int __init police_init_module(void) { - return tcf_register_action(&act_police_ops, POL_TAB_MASK); + return tcf_register_action(&act_police_ops, &police_net_ops); } static void __exit police_cleanup_module(void) { - tcf_unregister_action(&act_police_ops); + tcf_unregister_action(&act_police_ops, &police_net_ops); } module_init(police_init_module); diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c index d6b708d6a..3a33fb648 100644 --- a/net/sched/act_simple.c +++ b/net/sched/act_simple.c @@ -26,6 +26,8 @@ #define SIMP_TAB_MASK 7 +static int simp_net_id; + #define SIMP_MAX_DATA 32 static int tcf_simp(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) @@ -80,11 +82,12 @@ static int tcf_simp_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action *a, int ovr, int bind) { + struct tc_action_net *tn = net_generic(net, simp_net_id); struct nlattr *tb[TCA_DEF_MAX + 1]; struct tc_defact *parm; struct tcf_defact *d; char *defdata; - int ret = 0, err; + int ret = 0, err, exists = 0; if (nla == NULL) return -EINVAL; @@ -96,15 +99,23 @@ static int tcf_simp_init(struct net *net, struct nlattr *nla, if (tb[TCA_DEF_PARMS] == NULL) return -EINVAL; - if (tb[TCA_DEF_DATA] == NULL) - return -EINVAL; parm = nla_data(tb[TCA_DEF_PARMS]); + exists = tcf_hash_check(tn, parm->index, a, bind); + if (exists && bind) + return 0; + + if (tb[TCA_DEF_DATA] == NULL) { + if (exists) + tcf_hash_release(a, bind); + return -EINVAL; + } + defdata = nla_data(tb[TCA_DEF_DATA]); - if (!tcf_hash_check(parm->index, a, bind)) { - ret = tcf_hash_create(parm->index, est, a, sizeof(*d), - bind, false); + if (!exists) { + ret = tcf_hash_create(tn, parm->index, est, a, + sizeof(*d), bind, false); if (ret) return ret; @@ -119,8 +130,6 @@ static int tcf_simp_init(struct net *net, struct nlattr *nla, } else { d = to_defact(a); - if (bind) - return 0; tcf_hash_release(a, bind); if (!ovr) return -EEXIST; @@ -129,7 +138,7 @@ static int tcf_simp_init(struct net *net, struct nlattr *nla, } if (ret == ACT_P_CREATED) - tcf_hash_insert(a); + tcf_hash_insert(tn, a); return ret; } @@ -161,6 +170,22 @@ nla_put_failure: return -1; } +static int tcf_simp_walker(struct net *net, struct sk_buff *skb, + struct netlink_callback *cb, int type, + struct tc_action *a) +{ + struct tc_action_net *tn = net_generic(net, simp_net_id); + + return tcf_generic_walker(tn, skb, cb, type, a); +} + +static int tcf_simp_search(struct net *net, struct tc_action *a, u32 index) +{ + struct tc_action_net *tn = net_generic(net, simp_net_id); + + return tcf_hash_search(tn, a, index); +} + static struct tc_action_ops act_simp_ops = { .kind = "simple", .type = TCA_ACT_SIMP, @@ -169,6 +194,29 @@ static struct tc_action_ops act_simp_ops = { .dump = tcf_simp_dump, .cleanup = tcf_simp_release, .init = tcf_simp_init, + .walk = tcf_simp_walker, + .lookup = tcf_simp_search, +}; + +static __net_init int simp_init_net(struct net *net) +{ + struct tc_action_net *tn = net_generic(net, simp_net_id); + + return tc_action_net_init(tn, &act_simp_ops, SIMP_TAB_MASK); +} + +static void __net_exit simp_exit_net(struct net *net) +{ + struct tc_action_net *tn = net_generic(net, simp_net_id); + + tc_action_net_exit(tn); +} + +static struct pernet_operations simp_net_ops = { + .init = simp_init_net, + .exit = simp_exit_net, + .id = &simp_net_id, + .size = sizeof(struct tc_action_net), }; MODULE_AUTHOR("Jamal Hadi Salim(2005)"); @@ -177,8 +225,7 @@ MODULE_LICENSE("GPL"); static int __init simp_init_module(void) { - int ret; - ret = tcf_register_action(&act_simp_ops, SIMP_TAB_MASK); + int ret = tcf_register_action(&act_simp_ops, &simp_net_ops); if (!ret) pr_info("Simple TC action Loaded\n"); return ret; @@ -186,7 +233,7 @@ static int __init simp_init_module(void) static void __exit simp_cleanup_module(void) { - tcf_unregister_action(&act_simp_ops); + tcf_unregister_action(&act_simp_ops, &simp_net_ops); } module_init(simp_init_module); diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index 6751b5f8c..69da5a8f0 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -29,6 +29,8 @@ #define SKBEDIT_TAB_MASK 15 +static int skbedit_net_id; + static int tcf_skbedit(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { @@ -61,12 +63,13 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action *a, int ovr, int bind) { + struct tc_action_net *tn = net_generic(net, skbedit_net_id); struct nlattr *tb[TCA_SKBEDIT_MAX + 1]; struct tc_skbedit *parm; struct tcf_skbedit *d; u32 flags = 0, *priority = NULL, *mark = NULL; u16 *queue_mapping = NULL; - int ret = 0, err; + int ret = 0, err, exists = 0; if (nla == NULL) return -EINVAL; @@ -93,14 +96,20 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla, mark = nla_data(tb[TCA_SKBEDIT_MARK]); } - if (!flags) - return -EINVAL; - parm = nla_data(tb[TCA_SKBEDIT_PARMS]); - if (!tcf_hash_check(parm->index, a, bind)) { - ret = tcf_hash_create(parm->index, est, a, sizeof(*d), - bind, false); + exists = tcf_hash_check(tn, parm->index, a, bind); + if (exists && bind) + return 0; + + if (!flags) { + tcf_hash_release(a, bind); + return -EINVAL; + } + + if (!exists) { + ret = tcf_hash_create(tn, parm->index, est, a, + sizeof(*d), bind, false); if (ret) return ret; @@ -108,8 +117,6 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla, ret = ACT_P_CREATED; } else { d = to_skbedit(a); - if (bind) - return 0; tcf_hash_release(a, bind); if (!ovr) return -EEXIST; @@ -130,7 +137,7 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla, spin_unlock_bh(&d->tcf_lock); if (ret == ACT_P_CREATED) - tcf_hash_insert(a); + tcf_hash_insert(tn, a); return ret; } @@ -173,6 +180,22 @@ nla_put_failure: return -1; } +static int tcf_skbedit_walker(struct net *net, struct sk_buff *skb, + struct netlink_callback *cb, int type, + struct tc_action *a) +{ + struct tc_action_net *tn = net_generic(net, skbedit_net_id); + + return tcf_generic_walker(tn, skb, cb, type, a); +} + +static int tcf_skbedit_search(struct net *net, struct tc_action *a, u32 index) +{ + struct tc_action_net *tn = net_generic(net, skbedit_net_id); + + return tcf_hash_search(tn, a, index); +} + static struct tc_action_ops act_skbedit_ops = { .kind = "skbedit", .type = TCA_ACT_SKBEDIT, @@ -180,6 +203,29 @@ static struct tc_action_ops act_skbedit_ops = { .act = tcf_skbedit, .dump = tcf_skbedit_dump, .init = tcf_skbedit_init, + .walk = tcf_skbedit_walker, + .lookup = tcf_skbedit_search, +}; + +static __net_init int skbedit_init_net(struct net *net) +{ + struct tc_action_net *tn = net_generic(net, skbedit_net_id); + + return tc_action_net_init(tn, &act_skbedit_ops, SKBEDIT_TAB_MASK); +} + +static void __net_exit skbedit_exit_net(struct net *net) +{ + struct tc_action_net *tn = net_generic(net, skbedit_net_id); + + tc_action_net_exit(tn); +} + +static struct pernet_operations skbedit_net_ops = { + .init = skbedit_init_net, + .exit = skbedit_exit_net, + .id = &skbedit_net_id, + .size = sizeof(struct tc_action_net), }; MODULE_AUTHOR("Alexander Duyck, <alexander.h.duyck@intel.com>"); @@ -188,12 +234,12 @@ MODULE_LICENSE("GPL"); static int __init skbedit_init_module(void) { - return tcf_register_action(&act_skbedit_ops, SKBEDIT_TAB_MASK); + return tcf_register_action(&act_skbedit_ops, &skbedit_net_ops); } static void __exit skbedit_cleanup_module(void) { - tcf_unregister_action(&act_skbedit_ops); + tcf_unregister_action(&act_skbedit_ops, &skbedit_net_ops); } module_init(skbedit_init_module); diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c index 796785e0b..c45f926da 100644 --- a/net/sched/act_vlan.c +++ b/net/sched/act_vlan.c @@ -21,6 +21,8 @@ #define VLAN_TAB_MASK 15 +static int vlan_net_id; + static int tcf_vlan(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { @@ -68,13 +70,14 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action *a, int ovr, int bind) { + struct tc_action_net *tn = net_generic(net, vlan_net_id); struct nlattr *tb[TCA_VLAN_MAX + 1]; struct tc_vlan *parm; struct tcf_vlan *v; int action; __be16 push_vid = 0; __be16 push_proto = 0; - int ret = 0; + int ret = 0, exists = 0; int err; if (!nla) @@ -87,15 +90,25 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, if (!tb[TCA_VLAN_PARMS]) return -EINVAL; parm = nla_data(tb[TCA_VLAN_PARMS]); + exists = tcf_hash_check(tn, parm->index, a, bind); + if (exists && bind) + return 0; + switch (parm->v_action) { case TCA_VLAN_ACT_POP: break; case TCA_VLAN_ACT_PUSH: - if (!tb[TCA_VLAN_PUSH_VLAN_ID]) + if (!tb[TCA_VLAN_PUSH_VLAN_ID]) { + if (exists) + tcf_hash_release(a, bind); return -EINVAL; + } push_vid = nla_get_u16(tb[TCA_VLAN_PUSH_VLAN_ID]); - if (push_vid >= VLAN_VID_MASK) + if (push_vid >= VLAN_VID_MASK) { + if (exists) + tcf_hash_release(a, bind); return -ERANGE; + } if (tb[TCA_VLAN_PUSH_VLAN_PROTOCOL]) { push_proto = nla_get_be16(tb[TCA_VLAN_PUSH_VLAN_PROTOCOL]); @@ -111,20 +124,20 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, } break; default: + if (exists) + tcf_hash_release(a, bind); return -EINVAL; } action = parm->v_action; - if (!tcf_hash_check(parm->index, a, bind)) { - ret = tcf_hash_create(parm->index, est, a, sizeof(*v), - bind, false); + if (!exists) { + ret = tcf_hash_create(tn, parm->index, est, a, + sizeof(*v), bind, false); if (ret) return ret; ret = ACT_P_CREATED; } else { - if (bind) - return 0; tcf_hash_release(a, bind); if (!ovr) return -EEXIST; @@ -143,7 +156,7 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, spin_unlock_bh(&v->tcf_lock); if (ret == ACT_P_CREATED) - tcf_hash_insert(a); + tcf_hash_insert(tn, a); return ret; } @@ -181,6 +194,22 @@ nla_put_failure: return -1; } +static int tcf_vlan_walker(struct net *net, struct sk_buff *skb, + struct netlink_callback *cb, int type, + struct tc_action *a) +{ + struct tc_action_net *tn = net_generic(net, vlan_net_id); + + return tcf_generic_walker(tn, skb, cb, type, a); +} + +static int tcf_vlan_search(struct net *net, struct tc_action *a, u32 index) +{ + struct tc_action_net *tn = net_generic(net, vlan_net_id); + + return tcf_hash_search(tn, a, index); +} + static struct tc_action_ops act_vlan_ops = { .kind = "vlan", .type = TCA_ACT_VLAN, @@ -188,16 +217,39 @@ static struct tc_action_ops act_vlan_ops = { .act = tcf_vlan, .dump = tcf_vlan_dump, .init = tcf_vlan_init, + .walk = tcf_vlan_walker, + .lookup = tcf_vlan_search, +}; + +static __net_init int vlan_init_net(struct net *net) +{ + struct tc_action_net *tn = net_generic(net, vlan_net_id); + + return tc_action_net_init(tn, &act_vlan_ops, VLAN_TAB_MASK); +} + +static void __net_exit vlan_exit_net(struct net *net) +{ + struct tc_action_net *tn = net_generic(net, vlan_net_id); + + tc_action_net_exit(tn); +} + +static struct pernet_operations vlan_net_ops = { + .init = vlan_init_net, + .exit = vlan_exit_net, + .id = &vlan_net_id, + .size = sizeof(struct tc_action_net), }; static int __init vlan_init_module(void) { - return tcf_register_action(&act_vlan_ops, VLAN_TAB_MASK); + return tcf_register_action(&act_vlan_ops, &vlan_net_ops); } static void __exit vlan_cleanup_module(void) { - tcf_unregister_action(&act_vlan_ops); + tcf_unregister_action(&act_vlan_ops, &vlan_net_ops); } module_init(vlan_init_module); diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index 8dc84300e..425fe6a0e 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -103,8 +103,9 @@ static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp, } if (prog->exts_integrated) { - res->class = prog->res.class; - res->classid = qdisc_skb_cb(skb)->tc_classid; + res->class = 0; + res->classid = TC_H_MAJ(prog->res.classid) | + qdisc_skb_cb(skb)->tc_classid; ret = cls_bpf_exec_opcode(filter_res); if (ret == TC_ACT_UNSPEC) @@ -114,10 +115,12 @@ static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp, if (filter_res == 0) continue; - - *res = prog->res; - if (filter_res != -1) + if (filter_res != -1) { + res->class = 0; res->classid = filter_res; + } else { + *res = prog->res; + } ret = tcf_exts_exec(skb, &prog->exts, res); if (ret < 0) diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 95b021243..2181ffc76 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -165,6 +165,51 @@ static void fl_destroy_filter(struct rcu_head *head) kfree(f); } +static void fl_hw_destroy_filter(struct tcf_proto *tp, unsigned long cookie) +{ + struct net_device *dev = tp->q->dev_queue->dev; + struct tc_cls_flower_offload offload = {0}; + struct tc_to_netdev tc; + + if (!tc_should_offload(dev, 0)) + return; + + offload.command = TC_CLSFLOWER_DESTROY; + offload.cookie = cookie; + + tc.type = TC_SETUP_CLSFLOWER; + tc.cls_flower = &offload; + + dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol, &tc); +} + +static void fl_hw_replace_filter(struct tcf_proto *tp, + struct flow_dissector *dissector, + struct fl_flow_key *mask, + struct fl_flow_key *key, + struct tcf_exts *actions, + unsigned long cookie, u32 flags) +{ + struct net_device *dev = tp->q->dev_queue->dev; + struct tc_cls_flower_offload offload = {0}; + struct tc_to_netdev tc; + + if (!tc_should_offload(dev, flags)) + return; + + offload.command = TC_CLSFLOWER_REPLACE; + offload.cookie = cookie; + offload.dissector = dissector; + offload.mask = mask; + offload.key = key; + offload.exts = actions; + + tc.type = TC_SETUP_CLSFLOWER; + tc.cls_flower = &offload; + + dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol, &tc); +} + static bool fl_destroy(struct tcf_proto *tp, bool force) { struct cls_fl_head *head = rtnl_dereference(tp->root); @@ -174,6 +219,7 @@ static bool fl_destroy(struct tcf_proto *tp, bool force) return false; list_for_each_entry_safe(f, next, &head->filters, list) { + fl_hw_destroy_filter(tp, (unsigned long)f); list_del_rcu(&f->list); call_rcu(&f->rcu, fl_destroy_filter); } @@ -459,6 +505,7 @@ static int fl_change(struct net *net, struct sk_buff *in_skb, struct cls_fl_filter *fnew; struct nlattr *tb[TCA_FLOWER_MAX + 1]; struct fl_flow_mask mask = {}; + u32 flags = 0; int err; if (!tca[TCA_OPTIONS]) @@ -486,6 +533,9 @@ static int fl_change(struct net *net, struct sk_buff *in_skb, } fnew->handle = handle; + if (tb[TCA_FLOWER_FLAGS]) + flags = nla_get_u32(tb[TCA_FLOWER_FLAGS]); + err = fl_set_parms(net, tp, fnew, &mask, base, tb, tca[TCA_RATE], ovr); if (err) goto errout; @@ -498,9 +548,20 @@ static int fl_change(struct net *net, struct sk_buff *in_skb, head->ht_params); if (err) goto errout; - if (fold) + + fl_hw_replace_filter(tp, + &head->dissector, + &mask.key, + &fnew->key, + &fnew->exts, + (unsigned long)fnew, + flags); + + if (fold) { rhashtable_remove_fast(&head->ht, &fold->ht_node, head->ht_params); + fl_hw_destroy_filter(tp, (unsigned long)fold); + } *arg = (unsigned long) fnew; @@ -527,6 +588,7 @@ static int fl_delete(struct tcf_proto *tp, unsigned long arg) rhashtable_remove_fast(&head->ht, &f->ht_node, head->ht_params); list_del_rcu(&f->list); + fl_hw_destroy_filter(tp, (unsigned long)f); tcf_unbind_filter(tp, &f->res); call_rcu(&f->rcu, fl_destroy_filter); return 0; diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index 4fbb67430..563cdad76 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -43,6 +43,7 @@ #include <net/netlink.h> #include <net/act_api.h> #include <net/pkt_cls.h> +#include <linux/netdevice.h> struct tc_u_knode { struct tc_u_knode __rcu *next; @@ -58,6 +59,7 @@ struct tc_u_knode { #ifdef CONFIG_CLS_U32_PERF struct tc_u32_pcnt __percpu *pf; #endif + u32 flags; #ifdef CONFIG_CLS_U32_MARK u32 val; u32 mask; @@ -424,6 +426,97 @@ static int u32_delete_key(struct tcf_proto *tp, struct tc_u_knode *key) return 0; } +static void u32_remove_hw_knode(struct tcf_proto *tp, u32 handle) +{ + struct net_device *dev = tp->q->dev_queue->dev; + struct tc_cls_u32_offload u32_offload = {0}; + struct tc_to_netdev offload; + + offload.type = TC_SETUP_CLSU32; + offload.cls_u32 = &u32_offload; + + if (tc_should_offload(dev, 0)) { + offload.cls_u32->command = TC_CLSU32_DELETE_KNODE; + offload.cls_u32->knode.handle = handle; + dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, + tp->protocol, &offload); + } +} + +static void u32_replace_hw_hnode(struct tcf_proto *tp, + struct tc_u_hnode *h, + u32 flags) +{ + struct net_device *dev = tp->q->dev_queue->dev; + struct tc_cls_u32_offload u32_offload = {0}; + struct tc_to_netdev offload; + + offload.type = TC_SETUP_CLSU32; + offload.cls_u32 = &u32_offload; + + if (tc_should_offload(dev, flags)) { + offload.cls_u32->command = TC_CLSU32_NEW_HNODE; + offload.cls_u32->hnode.divisor = h->divisor; + offload.cls_u32->hnode.handle = h->handle; + offload.cls_u32->hnode.prio = h->prio; + + dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, + tp->protocol, &offload); + } +} + +static void u32_clear_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h) +{ + struct net_device *dev = tp->q->dev_queue->dev; + struct tc_cls_u32_offload u32_offload = {0}; + struct tc_to_netdev offload; + + offload.type = TC_SETUP_CLSU32; + offload.cls_u32 = &u32_offload; + + if (tc_should_offload(dev, 0)) { + offload.cls_u32->command = TC_CLSU32_DELETE_HNODE; + offload.cls_u32->hnode.divisor = h->divisor; + offload.cls_u32->hnode.handle = h->handle; + offload.cls_u32->hnode.prio = h->prio; + + dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, + tp->protocol, &offload); + } +} + +static void u32_replace_hw_knode(struct tcf_proto *tp, + struct tc_u_knode *n, + u32 flags) +{ + struct net_device *dev = tp->q->dev_queue->dev; + struct tc_cls_u32_offload u32_offload = {0}; + struct tc_to_netdev offload; + + offload.type = TC_SETUP_CLSU32; + offload.cls_u32 = &u32_offload; + + if (tc_should_offload(dev, flags)) { + offload.cls_u32->command = TC_CLSU32_REPLACE_KNODE; + offload.cls_u32->knode.handle = n->handle; + offload.cls_u32->knode.fshift = n->fshift; +#ifdef CONFIG_CLS_U32_MARK + offload.cls_u32->knode.val = n->val; + offload.cls_u32->knode.mask = n->mask; +#else + offload.cls_u32->knode.val = 0; + offload.cls_u32->knode.mask = 0; +#endif + offload.cls_u32->knode.sel = &n->sel; + offload.cls_u32->knode.exts = &n->exts; + if (n->ht_down) + offload.cls_u32->knode.link_handle = n->ht_down->handle; + + dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, + tp->protocol, &offload); + } +} + static void u32_clear_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht) { struct tc_u_knode *n; @@ -434,6 +527,7 @@ static void u32_clear_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht) RCU_INIT_POINTER(ht->ht[h], rtnl_dereference(n->next)); tcf_unbind_filter(tp, &n->res); + u32_remove_hw_knode(tp, n->handle); call_rcu(&n->rcu, u32_delete_key_freepf_rcu); } } @@ -454,6 +548,7 @@ static int u32_destroy_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht) phn; hn = &phn->next, phn = rtnl_dereference(*hn)) { if (phn == ht) { + u32_clear_hw_hnode(tp, ht); RCU_INIT_POINTER(*hn, ht->next); kfree_rcu(ht, rcu); return 0; @@ -540,8 +635,10 @@ static int u32_delete(struct tcf_proto *tp, unsigned long arg) if (ht == NULL) return 0; - if (TC_U32_KEY(ht->handle)) + if (TC_U32_KEY(ht->handle)) { + u32_remove_hw_knode(tp, ht->handle); return u32_delete_key(tp, (struct tc_u_knode *)ht); + } if (root_ht == ht) return -EINVAL; @@ -587,6 +684,7 @@ static const struct nla_policy u32_policy[TCA_U32_MAX + 1] = { [TCA_U32_SEL] = { .len = sizeof(struct tc_u32_sel) }, [TCA_U32_INDEV] = { .type = NLA_STRING, .len = IFNAMSIZ }, [TCA_U32_MARK] = { .len = sizeof(struct tc_u32_mark) }, + [TCA_U32_FLAGS] = { .type = NLA_U32 }, }; static int u32_set_parms(struct net *net, struct tcf_proto *tp, @@ -694,6 +792,7 @@ static struct tc_u_knode *u32_init_knode(struct tcf_proto *tp, #endif new->fshift = n->fshift; new->res = n->res; + new->flags = n->flags; RCU_INIT_POINTER(new->ht_down, n->ht_down); /* bump reference count as long as we hold pointer to structure */ @@ -733,7 +832,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, struct tc_u32_sel *s; struct nlattr *opt = tca[TCA_OPTIONS]; struct nlattr *tb[TCA_U32_MAX + 1]; - u32 htid; + u32 htid, flags = 0; int err; #ifdef CONFIG_CLS_U32_PERF size_t size; @@ -746,6 +845,9 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, if (err < 0) return err; + if (tb[TCA_U32_FLAGS]) + flags = nla_get_u32(tb[TCA_U32_FLAGS]); + n = (struct tc_u_knode *)*arg; if (n) { struct tc_u_knode *new; @@ -753,6 +855,9 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, if (TC_U32_KEY(n->handle) == 0) return -EINVAL; + if (n->flags != flags) + return -EINVAL; + new = u32_init_knode(tp, n); if (!new) return -ENOMEM; @@ -769,6 +874,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, u32_replace_knode(tp, tp_c, new); tcf_unbind_filter(tp, &n->res); call_rcu(&n->rcu, u32_delete_key_rcu); + u32_replace_hw_knode(tp, new, flags); return 0; } @@ -795,6 +901,8 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, RCU_INIT_POINTER(ht->next, tp_c->hlist); rcu_assign_pointer(tp_c->hlist, ht); *arg = (unsigned long)ht; + + u32_replace_hw_hnode(tp, ht, flags); return 0; } @@ -845,6 +953,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, RCU_INIT_POINTER(n->ht_up, ht); n->handle = handle; n->fshift = s->hmask ? ffs(ntohl(s->hmask)) - 1 : 0; + n->flags = flags; tcf_exts_init(&n->exts, TCA_U32_ACT, TCA_U32_POLICE); n->tp = tp; @@ -877,7 +986,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, RCU_INIT_POINTER(n->next, pins); rcu_assign_pointer(*ins, n); - + u32_replace_hw_knode(tp, n, flags); *arg = (unsigned long)n; return 0; } @@ -982,6 +1091,9 @@ static int u32_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, nla_put_u32(skb, TCA_U32_LINK, ht_down->handle)) goto nla_put_failure; + if (n->flags && nla_put_u32(skb, TCA_U32_FLAGS, n->flags)) + goto nla_put_failure; + #ifdef CONFIG_CLS_U32_MARK if ((n->val || n->mask)) { struct tc_u32_mark mark = {.val = n->val, diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index af1acf009..3b180ff72 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -744,14 +744,15 @@ static u32 qdisc_alloc_handle(struct net_device *dev) return 0; } -void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n) +void qdisc_tree_reduce_backlog(struct Qdisc *sch, unsigned int n, + unsigned int len) { const struct Qdisc_class_ops *cops; unsigned long cl; u32 parentid; int drops; - if (n == 0) + if (n == 0 && len == 0) return; drops = max_t(int, n, 0); rcu_read_lock(); @@ -774,11 +775,12 @@ void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n) cops->put(sch, cl); } sch->q.qlen -= n; + sch->qstats.backlog -= len; __qdisc_qstats_drop(sch, drops); } rcu_read_unlock(); } -EXPORT_SYMBOL(qdisc_tree_decrease_qlen); +EXPORT_SYMBOL(qdisc_tree_reduce_backlog); static void notify_and_destroy(struct net *net, struct sk_buff *skb, struct nlmsghdr *n, u32 clid, @@ -1841,7 +1843,7 @@ reclassify: return err; } - return -1; + return TC_ACT_UNSPEC; /* signal: continue lookup */ #ifdef CONFIG_NET_CLS_ACT reset: if (unlikely(limit++ >= MAX_REC_LOOP)) { diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c index c538d9e4a..baafddf22 100644 --- a/net/sched/sch_cbq.c +++ b/net/sched/sch_cbq.c @@ -1624,13 +1624,8 @@ static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, new->reshape_fail = cbq_reshape_fail; #endif } - sch_tree_lock(sch); - *old = cl->q; - cl->q = new; - qdisc_tree_decrease_qlen(*old, (*old)->q.qlen); - qdisc_reset(*old); - sch_tree_unlock(sch); + *old = qdisc_replace(sch, new, &cl->q); return 0; } @@ -1914,7 +1909,7 @@ static int cbq_delete(struct Qdisc *sch, unsigned long arg) { struct cbq_sched_data *q = qdisc_priv(sch); struct cbq_class *cl = (struct cbq_class *)arg; - unsigned int qlen; + unsigned int qlen, backlog; if (cl->filters || cl->children || cl == &q->link) return -EBUSY; @@ -1922,8 +1917,9 @@ static int cbq_delete(struct Qdisc *sch, unsigned long arg) sch_tree_lock(sch); qlen = cl->q->q.qlen; + backlog = cl->q->qstats.backlog; qdisc_reset(cl->q); - qdisc_tree_decrease_qlen(cl->q, qlen); + qdisc_tree_reduce_backlog(cl->q, qlen, backlog); if (cl->next_alive) cbq_deactivate_class(cl); diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c index 5ffb8b833..0a08c860e 100644 --- a/net/sched/sch_choke.c +++ b/net/sched/sch_choke.c @@ -128,8 +128,8 @@ static void choke_drop_by_idx(struct Qdisc *sch, unsigned int idx) choke_zap_tail_holes(q); qdisc_qstats_backlog_dec(sch, skb); + qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb)); qdisc_drop(skb, sch); - qdisc_tree_decrease_qlen(sch, 1); --sch->q.qlen; } @@ -456,6 +456,7 @@ static int choke_change(struct Qdisc *sch, struct nlattr *opt) old = q->tab; if (old) { unsigned int oqlen = sch->q.qlen, tail = 0; + unsigned dropped = 0; while (q->head != q->tail) { struct sk_buff *skb = q->tab[q->head]; @@ -467,11 +468,12 @@ static int choke_change(struct Qdisc *sch, struct nlattr *opt) ntab[tail++] = skb; continue; } + dropped += qdisc_pkt_len(skb); qdisc_qstats_backlog_dec(sch, skb); --sch->q.qlen; qdisc_drop(skb, sch); } - qdisc_tree_decrease_qlen(sch, oqlen - sch->q.qlen); + qdisc_tree_reduce_backlog(sch, oqlen - sch->q.qlen, dropped); q->head = 0; q->tail = tail; } diff --git a/net/sched/sch_codel.c b/net/sched/sch_codel.c index 535007d5f..9b7e2980e 100644 --- a/net/sched/sch_codel.c +++ b/net/sched/sch_codel.c @@ -79,12 +79,13 @@ static struct sk_buff *codel_qdisc_dequeue(struct Qdisc *sch) skb = codel_dequeue(sch, &q->params, &q->vars, &q->stats, dequeue); - /* We cant call qdisc_tree_decrease_qlen() if our qlen is 0, + /* We cant call qdisc_tree_reduce_backlog() if our qlen is 0, * or HTB crashes. Defer it for next round. */ if (q->stats.drop_count && sch->q.qlen) { - qdisc_tree_decrease_qlen(sch, q->stats.drop_count); + qdisc_tree_reduce_backlog(sch, q->stats.drop_count, q->stats.drop_len); q->stats.drop_count = 0; + q->stats.drop_len = 0; } if (skb) qdisc_bstats_update(sch, skb); @@ -116,7 +117,7 @@ static int codel_change(struct Qdisc *sch, struct nlattr *opt) { struct codel_sched_data *q = qdisc_priv(sch); struct nlattr *tb[TCA_CODEL_MAX + 1]; - unsigned int qlen; + unsigned int qlen, dropped = 0; int err; if (!opt) @@ -156,10 +157,11 @@ static int codel_change(struct Qdisc *sch, struct nlattr *opt) while (sch->q.qlen > sch->limit) { struct sk_buff *skb = __skb_dequeue(&sch->q); + dropped += qdisc_pkt_len(skb); qdisc_qstats_backlog_dec(sch, skb); qdisc_drop(skb, sch); } - qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen); + qdisc_tree_reduce_backlog(sch, qlen - sch->q.qlen, dropped); sch_tree_unlock(sch); return 0; diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c index a1cd77824..a63e879e8 100644 --- a/net/sched/sch_drr.c +++ b/net/sched/sch_drr.c @@ -53,9 +53,10 @@ static struct drr_class *drr_find_class(struct Qdisc *sch, u32 classid) static void drr_purge_queue(struct drr_class *cl) { unsigned int len = cl->qdisc->q.qlen; + unsigned int backlog = cl->qdisc->qstats.backlog; qdisc_reset(cl->qdisc); - qdisc_tree_decrease_qlen(cl->qdisc, len); + qdisc_tree_reduce_backlog(cl->qdisc, len, backlog); } static const struct nla_policy drr_policy[TCA_DRR_MAX + 1] = { @@ -226,11 +227,7 @@ static int drr_graft_class(struct Qdisc *sch, unsigned long arg, new = &noop_qdisc; } - sch_tree_lock(sch); - drr_purge_queue(cl); - *old = cl->qdisc; - cl->qdisc = new; - sch_tree_unlock(sch); + *old = qdisc_replace(sch, new, &cl->qdisc); return 0; } diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c index f357f34d0..34b4ddaca 100644 --- a/net/sched/sch_dsmark.c +++ b/net/sched/sch_dsmark.c @@ -73,13 +73,7 @@ static int dsmark_graft(struct Qdisc *sch, unsigned long arg, new = &noop_qdisc; } - sch_tree_lock(sch); - *old = p->q; - p->q = new; - qdisc_tree_decrease_qlen(*old, (*old)->q.qlen); - qdisc_reset(*old); - sch_tree_unlock(sch); - + *old = qdisc_replace(sch, new, &p->q); return 0; } @@ -264,6 +258,7 @@ static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch) return err; } + qdisc_qstats_backlog_inc(sch, skb); sch->q.qlen++; return NET_XMIT_SUCCESS; @@ -281,11 +276,12 @@ static struct sk_buff *dsmark_dequeue(struct Qdisc *sch) pr_debug("%s(sch %p,[qdisc %p])\n", __func__, sch, p); - skb = p->q->ops->dequeue(p->q); + skb = qdisc_dequeue_peeked(p->q); if (skb == NULL) return NULL; qdisc_bstats_update(sch, skb); + qdisc_qstats_backlog_dec(sch, skb); sch->q.qlen--; index = skb->tc_index & (p->indices - 1); @@ -401,6 +397,7 @@ static void dsmark_reset(struct Qdisc *sch) pr_debug("%s(sch %p,[qdisc %p])\n", __func__, sch, p); qdisc_reset(p->q); + sch->qstats.backlog = 0; sch->q.qlen = 0; } diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index 109b23227..3c6a47d66 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -662,6 +662,7 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt) struct fq_sched_data *q = qdisc_priv(sch); struct nlattr *tb[TCA_FQ_MAX + 1]; int err, drop_count = 0; + unsigned drop_len = 0; u32 fq_log; if (!opt) @@ -736,10 +737,11 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt) if (!skb) break; + drop_len += qdisc_pkt_len(skb); kfree_skb(skb); drop_count++; } - qdisc_tree_decrease_qlen(sch, drop_count); + qdisc_tree_reduce_backlog(sch, drop_count, drop_len); sch_tree_unlock(sch); return err; diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index 4c834e93d..d3fc8f9dd 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -175,7 +175,7 @@ static unsigned int fq_codel_qdisc_drop(struct Qdisc *sch) static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch) { struct fq_codel_sched_data *q = qdisc_priv(sch); - unsigned int idx; + unsigned int idx, prev_backlog; struct fq_codel_flow *flow; int uninitialized_var(ret); @@ -203,6 +203,7 @@ static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch) if (++sch->q.qlen <= sch->limit) return NET_XMIT_SUCCESS; + prev_backlog = sch->qstats.backlog; q->drop_overlimit++; /* Return Congestion Notification only if we dropped a packet * from this flow. @@ -211,7 +212,7 @@ static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch) return NET_XMIT_CN; /* As we dropped a packet, better let upper stack know this */ - qdisc_tree_decrease_qlen(sch, 1); + qdisc_tree_reduce_backlog(sch, 1, prev_backlog - sch->qstats.backlog); return NET_XMIT_SUCCESS; } @@ -241,6 +242,7 @@ static struct sk_buff *fq_codel_dequeue(struct Qdisc *sch) struct fq_codel_flow *flow; struct list_head *head; u32 prev_drop_count, prev_ecn_mark; + unsigned int prev_backlog; begin: head = &q->new_flows; @@ -259,6 +261,7 @@ begin: prev_drop_count = q->cstats.drop_count; prev_ecn_mark = q->cstats.ecn_mark; + prev_backlog = sch->qstats.backlog; skb = codel_dequeue(sch, &q->cparams, &flow->cvars, &q->cstats, dequeue); @@ -276,12 +279,14 @@ begin: } qdisc_bstats_update(sch, skb); flow->deficit -= qdisc_pkt_len(skb); - /* We cant call qdisc_tree_decrease_qlen() if our qlen is 0, + /* We cant call qdisc_tree_reduce_backlog() if our qlen is 0, * or HTB crashes. Defer it for next round. */ if (q->cstats.drop_count && sch->q.qlen) { - qdisc_tree_decrease_qlen(sch, q->cstats.drop_count); + qdisc_tree_reduce_backlog(sch, q->cstats.drop_count, + q->cstats.drop_len); q->cstats.drop_count = 0; + q->cstats.drop_len = 0; } return skb; } @@ -372,11 +377,13 @@ static int fq_codel_change(struct Qdisc *sch, struct nlattr *opt) while (sch->q.qlen > sch->limit) { struct sk_buff *skb = fq_codel_dequeue(sch); + q->cstats.drop_len += qdisc_pkt_len(skb); kfree_skb(skb); q->cstats.drop_count++; } - qdisc_tree_decrease_qlen(sch, q->cstats.drop_count); + qdisc_tree_reduce_backlog(sch, q->cstats.drop_count, q->cstats.drop_len); q->cstats.drop_count = 0; + q->cstats.drop_len = 0; sch_tree_unlock(sch); return 0; diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 16bc83b28..80742edea 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -159,12 +159,15 @@ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q, if (validate) skb = validate_xmit_skb_list(skb, dev); - if (skb) { + if (likely(skb)) { HARD_TX_LOCK(dev, txq, smp_processor_id()); if (!netif_xmit_frozen_or_stopped(txq)) skb = dev_hard_start_xmit(skb, dev, txq, &ret); HARD_TX_UNLOCK(dev, txq); + } else { + spin_lock(root_lock); + return qdisc_qlen(q); } spin_lock(root_lock); @@ -567,6 +570,7 @@ struct Qdisc_ops pfifo_fast_ops __read_mostly = { .dump = pfifo_fast_dump, .owner = THIS_MODULE, }; +EXPORT_SYMBOL(pfifo_fast_ops); static struct lock_class_key qdisc_tx_busylock; diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index b7ebe2c87..d783d7cc3 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -895,9 +895,10 @@ static void hfsc_purge_queue(struct Qdisc *sch, struct hfsc_class *cl) { unsigned int len = cl->qdisc->q.qlen; + unsigned int backlog = cl->qdisc->qstats.backlog; qdisc_reset(cl->qdisc); - qdisc_tree_decrease_qlen(cl->qdisc, len); + qdisc_tree_reduce_backlog(cl->qdisc, len, backlog); } static void @@ -1215,11 +1216,7 @@ hfsc_graft_class(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, new = &noop_qdisc; } - sch_tree_lock(sch); - hfsc_purge_queue(sch, cl); - *old = cl->qdisc; - cl->qdisc = new; - sch_tree_unlock(sch); + *old = qdisc_replace(sch, new, &cl->qdisc); return 0; } diff --git a/net/sched/sch_hhf.c b/net/sched/sch_hhf.c index 86b04e31e..13d6f83ec 100644 --- a/net/sched/sch_hhf.c +++ b/net/sched/sch_hhf.c @@ -382,6 +382,7 @@ static int hhf_enqueue(struct sk_buff *skb, struct Qdisc *sch) struct hhf_sched_data *q = qdisc_priv(sch); enum wdrr_bucket_idx idx; struct wdrr_bucket *bucket; + unsigned int prev_backlog; idx = hhf_classify(skb, sch); @@ -409,6 +410,7 @@ static int hhf_enqueue(struct sk_buff *skb, struct Qdisc *sch) if (++sch->q.qlen <= sch->limit) return NET_XMIT_SUCCESS; + prev_backlog = sch->qstats.backlog; q->drop_overlimit++; /* Return Congestion Notification only if we dropped a packet from this * bucket. @@ -417,7 +419,7 @@ static int hhf_enqueue(struct sk_buff *skb, struct Qdisc *sch) return NET_XMIT_CN; /* As we dropped a packet, better let upper stack know this. */ - qdisc_tree_decrease_qlen(sch, 1); + qdisc_tree_reduce_backlog(sch, 1, prev_backlog - sch->qstats.backlog); return NET_XMIT_SUCCESS; } @@ -527,7 +529,7 @@ static int hhf_change(struct Qdisc *sch, struct nlattr *opt) { struct hhf_sched_data *q = qdisc_priv(sch); struct nlattr *tb[TCA_HHF_MAX + 1]; - unsigned int qlen; + unsigned int qlen, prev_backlog; int err; u64 non_hh_quantum; u32 new_quantum = q->quantum; @@ -577,12 +579,14 @@ static int hhf_change(struct Qdisc *sch, struct nlattr *opt) } qlen = sch->q.qlen; + prev_backlog = sch->qstats.backlog; while (sch->q.qlen > sch->limit) { struct sk_buff *skb = hhf_dequeue(sch); kfree_skb(skb); } - qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen); + qdisc_tree_reduce_backlog(sch, qlen - sch->q.qlen, + prev_backlog - sch->qstats.backlog); sch_tree_unlock(sch); return 0; diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 15ccd7f8f..87b02ed3d 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -600,6 +600,7 @@ static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch) htb_activate(q, cl); } + qdisc_qstats_backlog_inc(sch, skb); sch->q.qlen++; return NET_XMIT_SUCCESS; } @@ -889,6 +890,7 @@ static struct sk_buff *htb_dequeue(struct Qdisc *sch) ok: qdisc_bstats_update(sch, skb); qdisc_unthrottled(sch); + qdisc_qstats_backlog_dec(sch, skb); sch->q.qlen--; return skb; } @@ -955,6 +957,7 @@ static unsigned int htb_drop(struct Qdisc *sch) unsigned int len; if (cl->un.leaf.q->ops->drop && (len = cl->un.leaf.q->ops->drop(cl->un.leaf.q))) { + sch->qstats.backlog -= len; sch->q.qlen--; if (!cl->un.leaf.q->q.qlen) htb_deactivate(q, cl); @@ -984,12 +987,12 @@ static void htb_reset(struct Qdisc *sch) } cl->prio_activity = 0; cl->cmode = HTB_CAN_SEND; - } } qdisc_watchdog_cancel(&q->watchdog); __skb_queue_purge(&q->direct_queue); sch->q.qlen = 0; + sch->qstats.backlog = 0; memset(q->hlevel, 0, sizeof(q->hlevel)); memset(q->row_mask, 0, sizeof(q->row_mask)); for (i = 0; i < TC_HTB_NUMPRIO; i++) @@ -1163,14 +1166,7 @@ static int htb_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, cl->common.classid)) == NULL) return -ENOBUFS; - sch_tree_lock(sch); - *old = cl->un.leaf.q; - cl->un.leaf.q = new; - if (*old != NULL) { - qdisc_tree_decrease_qlen(*old, (*old)->q.qlen); - qdisc_reset(*old); - } - sch_tree_unlock(sch); + *old = qdisc_replace(sch, new, &cl->un.leaf.q); return 0; } @@ -1272,7 +1268,6 @@ static int htb_delete(struct Qdisc *sch, unsigned long arg) { struct htb_sched *q = qdisc_priv(sch); struct htb_class *cl = (struct htb_class *)arg; - unsigned int qlen; struct Qdisc *new_q = NULL; int last_child = 0; @@ -1292,9 +1287,11 @@ static int htb_delete(struct Qdisc *sch, unsigned long arg) sch_tree_lock(sch); if (!cl->level) { - qlen = cl->un.leaf.q->q.qlen; + unsigned int qlen = cl->un.leaf.q->q.qlen; + unsigned int backlog = cl->un.leaf.q->qstats.backlog; + qdisc_reset(cl->un.leaf.q); - qdisc_tree_decrease_qlen(cl->un.leaf.q, qlen); + qdisc_tree_reduce_backlog(cl->un.leaf.q, qlen, backlog); } /* delete from hash and active; remainder in destroy_class */ @@ -1428,10 +1425,11 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, sch_tree_lock(sch); if (parent && !parent->level) { unsigned int qlen = parent->un.leaf.q->q.qlen; + unsigned int backlog = parent->un.leaf.q->qstats.backlog; /* turn parent into inner node */ qdisc_reset(parent->un.leaf.q); - qdisc_tree_decrease_qlen(parent->un.leaf.q, qlen); + qdisc_tree_reduce_backlog(parent->un.leaf.q, qlen, backlog); qdisc_destroy(parent->un.leaf.q); if (parent->prio_activity) htb_deactivate(q, parent); diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c index 3e82f047c..56a77b878 100644 --- a/net/sched/sch_mq.c +++ b/net/sched/sch_mq.c @@ -57,7 +57,7 @@ static int mq_init(struct Qdisc *sch, struct nlattr *opt) for (ntx = 0; ntx < dev->num_tx_queues; ntx++) { dev_queue = netdev_get_tx_queue(dev, ntx); - qdisc = qdisc_create_dflt(dev_queue, default_qdisc_ops, + qdisc = qdisc_create_dflt(dev_queue, get_default_qdisc_ops(dev, ntx), TC_H_MAKE(TC_H_MAJ(sch->handle), TC_H_MIN(ntx + 1))); if (qdisc == NULL) diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c index ad70ecf57..b8002ce3d 100644 --- a/net/sched/sch_mqprio.c +++ b/net/sched/sch_mqprio.c @@ -28,6 +28,7 @@ static void mqprio_destroy(struct Qdisc *sch) { struct net_device *dev = qdisc_dev(sch); struct mqprio_sched *priv = qdisc_priv(sch); + struct tc_to_netdev tc = {.type = TC_SETUP_MQPRIO}; unsigned int ntx; if (priv->qdiscs) { @@ -39,7 +40,7 @@ static void mqprio_destroy(struct Qdisc *sch) } if (priv->hw_owned && dev->netdev_ops->ndo_setup_tc) - dev->netdev_ops->ndo_setup_tc(dev, 0); + dev->netdev_ops->ndo_setup_tc(dev, sch->handle, 0, &tc); else netdev_set_num_tc(dev, 0); } @@ -124,7 +125,8 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt) for (i = 0; i < dev->num_tx_queues; i++) { dev_queue = netdev_get_tx_queue(dev, i); - qdisc = qdisc_create_dflt(dev_queue, default_qdisc_ops, + qdisc = qdisc_create_dflt(dev_queue, + get_default_qdisc_ops(dev, i), TC_H_MAKE(TC_H_MAJ(sch->handle), TC_H_MIN(i + 1))); if (qdisc == NULL) { @@ -140,8 +142,11 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt) * supplied and verified mapping */ if (qopt->hw) { + struct tc_to_netdev tc = {.type = TC_SETUP_MQPRIO, + { .tc = qopt->num_tc }}; + priv->hw_owned = 1; - err = dev->netdev_ops->ndo_setup_tc(dev, qopt->num_tc); + err = dev->netdev_ops->ndo_setup_tc(dev, sch->handle, 0, &tc); if (err) goto err; } else { diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c index 4e904ca0a..bcdd54bb1 100644 --- a/net/sched/sch_multiq.c +++ b/net/sched/sch_multiq.c @@ -218,7 +218,8 @@ static int multiq_tune(struct Qdisc *sch, struct nlattr *opt) if (q->queues[i] != &noop_qdisc) { struct Qdisc *child = q->queues[i]; q->queues[i] = &noop_qdisc; - qdisc_tree_decrease_qlen(child, child->q.qlen); + qdisc_tree_reduce_backlog(child, child->q.qlen, + child->qstats.backlog); qdisc_destroy(child); } } @@ -238,8 +239,9 @@ static int multiq_tune(struct Qdisc *sch, struct nlattr *opt) q->queues[i] = child; if (old != &noop_qdisc) { - qdisc_tree_decrease_qlen(old, - old->q.qlen); + qdisc_tree_reduce_backlog(old, + old->q.qlen, + old->qstats.backlog); qdisc_destroy(old); } sch_tree_unlock(sch); @@ -303,13 +305,7 @@ static int multiq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, if (new == NULL) new = &noop_qdisc; - sch_tree_lock(sch); - *old = q->queues[band]; - q->queues[band] = new; - qdisc_tree_decrease_qlen(*old, (*old)->q.qlen); - qdisc_reset(*old); - sch_tree_unlock(sch); - + *old = qdisc_replace(sch, new, &q->queues[band]); return 0; } diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 5abd1d9de..4befe97a9 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -395,6 +395,25 @@ static void tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch) sch->q.qlen++; } +/* netem can't properly corrupt a megapacket (like we get from GSO), so instead + * when we statistically choose to corrupt one, we instead segment it, returning + * the first packet to be corrupted, and re-enqueue the remaining frames + */ +static struct sk_buff *netem_segment(struct sk_buff *skb, struct Qdisc *sch) +{ + struct sk_buff *segs; + netdev_features_t features = netif_skb_features(skb); + + segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK); + + if (IS_ERR_OR_NULL(segs)) { + qdisc_reshape_fail(skb, sch); + return NULL; + } + consume_skb(skb); + return segs; +} + /* * Insert one skb into qdisc. * Note: parent depends on return value to account for queue length. @@ -407,7 +426,11 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch) /* We don't fill cb now as skb_unshare() may invalidate it */ struct netem_skb_cb *cb; struct sk_buff *skb2; + struct sk_buff *segs = NULL; + unsigned int len = 0, last_len, prev_len = qdisc_pkt_len(skb); + int nb = 0; int count = 1; + int rc = NET_XMIT_SUCCESS; /* Random duplication */ if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor)) @@ -453,10 +476,23 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch) * do it now in software before we mangle it. */ if (q->corrupt && q->corrupt >= get_crandom(&q->corrupt_cor)) { + if (skb_is_gso(skb)) { + segs = netem_segment(skb, sch); + if (!segs) + return NET_XMIT_DROP; + } else { + segs = skb; + } + + skb = segs; + segs = segs->next; + if (!(skb = skb_unshare(skb, GFP_ATOMIC)) || (skb->ip_summed == CHECKSUM_PARTIAL && - skb_checksum_help(skb))) - return qdisc_drop(skb, sch); + skb_checksum_help(skb))) { + rc = qdisc_drop(skb, sch); + goto finish_segs; + } skb->data[prandom_u32() % skb_headlen(skb)] ^= 1<<(prandom_u32() % 8); @@ -516,6 +552,27 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch) sch->qstats.requeues++; } +finish_segs: + if (segs) { + while (segs) { + skb2 = segs->next; + segs->next = NULL; + qdisc_skb_cb(segs)->pkt_len = segs->len; + last_len = segs->len; + rc = qdisc_enqueue(segs, sch); + if (rc != NET_XMIT_SUCCESS) { + if (net_xmit_drop_count(rc)) + qdisc_qstats_drop(sch); + } else { + nb++; + len += last_len; + } + segs = skb2; + } + sch->q.qlen += nb; + if (nb > 1) + qdisc_tree_reduce_backlog(sch, 1 - nb, prev_len - len); + } return NET_XMIT_SUCCESS; } @@ -598,7 +655,8 @@ deliver: if (unlikely(err != NET_XMIT_SUCCESS)) { if (net_xmit_drop_count(err)) { qdisc_qstats_drop(sch); - qdisc_tree_decrease_qlen(sch, 1); + qdisc_tree_reduce_backlog(sch, 1, + qdisc_pkt_len(skb)); } } goto tfifo_dequeue; @@ -1037,15 +1095,7 @@ static int netem_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, { struct netem_sched_data *q = qdisc_priv(sch); - sch_tree_lock(sch); - *old = q->qdisc; - q->qdisc = new; - if (*old) { - qdisc_tree_decrease_qlen(*old, (*old)->q.qlen); - qdisc_reset(*old); - } - sch_tree_unlock(sch); - + *old = qdisc_replace(sch, new, &q->qdisc); return 0; } diff --git a/net/sched/sch_pie.c b/net/sched/sch_pie.c index b783a446d..71ae3b962 100644 --- a/net/sched/sch_pie.c +++ b/net/sched/sch_pie.c @@ -183,7 +183,7 @@ static int pie_change(struct Qdisc *sch, struct nlattr *opt) { struct pie_sched_data *q = qdisc_priv(sch); struct nlattr *tb[TCA_PIE_MAX + 1]; - unsigned int qlen; + unsigned int qlen, dropped = 0; int err; if (!opt) @@ -232,10 +232,11 @@ static int pie_change(struct Qdisc *sch, struct nlattr *opt) while (sch->q.qlen > sch->limit) { struct sk_buff *skb = __skb_dequeue(&sch->q); + dropped += qdisc_pkt_len(skb); qdisc_qstats_backlog_dec(sch, skb); qdisc_drop(skb, sch); } - qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen); + qdisc_tree_reduce_backlog(sch, qlen - sch->q.qlen, dropped); sch_tree_unlock(sch); return 0; diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c index ba6487f27..fee1b1550 100644 --- a/net/sched/sch_prio.c +++ b/net/sched/sch_prio.c @@ -191,7 +191,7 @@ static int prio_tune(struct Qdisc *sch, struct nlattr *opt) struct Qdisc *child = q->queues[i]; q->queues[i] = &noop_qdisc; if (child != &noop_qdisc) { - qdisc_tree_decrease_qlen(child, child->q.qlen); + qdisc_tree_reduce_backlog(child, child->q.qlen, child->qstats.backlog); qdisc_destroy(child); } } @@ -210,8 +210,9 @@ static int prio_tune(struct Qdisc *sch, struct nlattr *opt) q->queues[i] = child; if (old != &noop_qdisc) { - qdisc_tree_decrease_qlen(old, - old->q.qlen); + qdisc_tree_reduce_backlog(old, + old->q.qlen, + old->qstats.backlog); qdisc_destroy(old); } sch_tree_unlock(sch); @@ -268,13 +269,7 @@ static int prio_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, if (new == NULL) new = &noop_qdisc; - sch_tree_lock(sch); - *old = q->queues[band]; - q->queues[band] = new; - qdisc_tree_decrease_qlen(*old, (*old)->q.qlen); - qdisc_reset(*old); - sch_tree_unlock(sch); - + *old = qdisc_replace(sch, new, &q->queues[band]); return 0; } diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index 3dc3a6e56..8d2d8d953 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -220,9 +220,10 @@ static struct qfq_class *qfq_find_class(struct Qdisc *sch, u32 classid) static void qfq_purge_queue(struct qfq_class *cl) { unsigned int len = cl->qdisc->q.qlen; + unsigned int backlog = cl->qdisc->qstats.backlog; qdisc_reset(cl->qdisc); - qdisc_tree_decrease_qlen(cl->qdisc, len); + qdisc_tree_reduce_backlog(cl->qdisc, len, backlog); } static const struct nla_policy qfq_policy[TCA_QFQ_MAX + 1] = { @@ -617,11 +618,7 @@ static int qfq_graft_class(struct Qdisc *sch, unsigned long arg, new = &noop_qdisc; } - sch_tree_lock(sch); - qfq_purge_queue(cl); - *old = cl->qdisc; - cl->qdisc = new; - sch_tree_unlock(sch); + *old = qdisc_replace(sch, new, &cl->qdisc); return 0; } diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c index 6c0534cc7..8c0508c0e 100644 --- a/net/sched/sch_red.c +++ b/net/sched/sch_red.c @@ -210,7 +210,8 @@ static int red_change(struct Qdisc *sch, struct nlattr *opt) q->flags = ctl->flags; q->limit = ctl->limit; if (child) { - qdisc_tree_decrease_qlen(q->qdisc, q->qdisc->q.qlen); + qdisc_tree_reduce_backlog(q->qdisc, q->qdisc->q.qlen, + q->qdisc->qstats.backlog); qdisc_destroy(q->qdisc); q->qdisc = child; } @@ -313,12 +314,7 @@ static int red_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, if (new == NULL) new = &noop_qdisc; - sch_tree_lock(sch); - *old = q->qdisc; - q->qdisc = new; - qdisc_tree_decrease_qlen(*old, (*old)->q.qlen); - qdisc_reset(*old); - sch_tree_unlock(sch); + *old = qdisc_replace(sch, new, &q->qdisc); return 0; } diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c index 5bbb6332e..c69611640 100644 --- a/net/sched/sch_sfb.c +++ b/net/sched/sch_sfb.c @@ -510,7 +510,8 @@ static int sfb_change(struct Qdisc *sch, struct nlattr *opt) sch_tree_lock(sch); - qdisc_tree_decrease_qlen(q->qdisc, q->qdisc->q.qlen); + qdisc_tree_reduce_backlog(q->qdisc, q->qdisc->q.qlen, + q->qdisc->qstats.backlog); qdisc_destroy(q->qdisc); q->qdisc = child; @@ -606,12 +607,7 @@ static int sfb_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, if (new == NULL) new = &noop_qdisc; - sch_tree_lock(sch); - *old = q->qdisc; - q->qdisc = new; - qdisc_tree_decrease_qlen(*old, (*old)->q.qlen); - qdisc_reset(*old); - sch_tree_unlock(sch); + *old = qdisc_replace(sch, new, &q->qdisc); return 0; } diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index 3abab534e..498f0a2cb 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -346,7 +346,7 @@ static int sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch) { struct sfq_sched_data *q = qdisc_priv(sch); - unsigned int hash; + unsigned int hash, dropped; sfq_index x, qlen; struct sfq_slot *slot; int uninitialized_var(ret); @@ -461,7 +461,7 @@ enqueue: return NET_XMIT_SUCCESS; qlen = slot->qlen; - sfq_drop(sch); + dropped = sfq_drop(sch); /* Return Congestion Notification only if we dropped a packet * from this flow. */ @@ -469,7 +469,7 @@ enqueue: return NET_XMIT_CN; /* As we dropped a packet, better let upper stack know this */ - qdisc_tree_decrease_qlen(sch, 1); + qdisc_tree_reduce_backlog(sch, 1, dropped); return NET_XMIT_SUCCESS; } @@ -537,6 +537,7 @@ static void sfq_rehash(struct Qdisc *sch) struct sfq_slot *slot; struct sk_buff_head list; int dropped = 0; + unsigned int drop_len = 0; __skb_queue_head_init(&list); @@ -565,6 +566,7 @@ static void sfq_rehash(struct Qdisc *sch) if (x >= SFQ_MAX_FLOWS) { drop: qdisc_qstats_backlog_dec(sch, skb); + drop_len += qdisc_pkt_len(skb); kfree_skb(skb); dropped++; continue; @@ -594,7 +596,7 @@ drop: } } sch->q.qlen -= dropped; - qdisc_tree_decrease_qlen(sch, dropped); + qdisc_tree_reduce_backlog(sch, dropped, drop_len); } static void sfq_perturbation(unsigned long arg) @@ -618,7 +620,7 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt) struct sfq_sched_data *q = qdisc_priv(sch); struct tc_sfq_qopt *ctl = nla_data(opt); struct tc_sfq_qopt_v1 *ctl_v1 = NULL; - unsigned int qlen; + unsigned int qlen, dropped = 0; struct red_parms *p = NULL; if (opt->nla_len < nla_attr_size(sizeof(*ctl))) @@ -667,8 +669,8 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt) qlen = sch->q.qlen; while (sch->q.qlen > q->limit) - sfq_drop(sch); - qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen); + dropped += sfq_drop(sch); + qdisc_tree_reduce_backlog(sch, qlen - sch->q.qlen, dropped); del_timer(&q->perturb_timer); if (q->perturb_period) { diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c index a4afde14e..c2fbde742 100644 --- a/net/sched/sch_tbf.c +++ b/net/sched/sch_tbf.c @@ -160,6 +160,7 @@ static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch) struct tbf_sched_data *q = qdisc_priv(sch); struct sk_buff *segs, *nskb; netdev_features_t features = netif_skb_features(skb); + unsigned int len = 0, prev_len = qdisc_pkt_len(skb); int ret, nb; segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK); @@ -172,6 +173,7 @@ static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch) nskb = segs->next; segs->next = NULL; qdisc_skb_cb(segs)->pkt_len = segs->len; + len += segs->len; ret = qdisc_enqueue(segs, q->qdisc); if (ret != NET_XMIT_SUCCESS) { if (net_xmit_drop_count(ret)) @@ -183,7 +185,7 @@ static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch) } sch->q.qlen += nb; if (nb > 1) - qdisc_tree_decrease_qlen(sch, 1 - nb); + qdisc_tree_reduce_backlog(sch, 1 - nb, prev_len - len); consume_skb(skb); return nb > 0 ? NET_XMIT_SUCCESS : NET_XMIT_DROP; } @@ -399,7 +401,8 @@ static int tbf_change(struct Qdisc *sch, struct nlattr *opt) sch_tree_lock(sch); if (child) { - qdisc_tree_decrease_qlen(q->qdisc, q->qdisc->q.qlen); + qdisc_tree_reduce_backlog(q->qdisc, q->qdisc->q.qlen, + q->qdisc->qstats.backlog); qdisc_destroy(q->qdisc); q->qdisc = child; } @@ -502,13 +505,7 @@ static int tbf_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, if (new == NULL) new = &noop_qdisc; - sch_tree_lock(sch); - *old = q->qdisc; - q->qdisc = new; - qdisc_tree_decrease_qlen(*old, (*old)->q.qlen); - qdisc_reset(*old); - sch_tree_unlock(sch); - + *old = qdisc_replace(sch, new, &q->qdisc); return 0; } diff --git a/net/sctp/associola.c b/net/sctp/associola.c index 2bf8ec92d..e1849f371 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -1263,7 +1263,7 @@ static struct sctp_transport *sctp_trans_elect_best(struct sctp_transport *curr, if (score_curr > score_best) return curr; else if (score_curr == score_best) - return sctp_trans_elect_tie(curr, best); + return sctp_trans_elect_tie(best, curr); else return best; } @@ -1406,7 +1406,8 @@ void sctp_assoc_sync_pmtu(struct sock *sk, struct sctp_association *asoc) list_for_each_entry(t, &asoc->peer.transport_addr_list, transports) { if (t->pmtu_pending && t->dst) { - sctp_transport_update_pmtu(sk, t, dst_mtu(t->dst)); + sctp_transport_update_pmtu(sk, t, + WORD_TRUNC(dst_mtu(t->dst))); t->pmtu_pending = 0; } if (!pmtu || (t->pathmtu < pmtu)) @@ -1493,7 +1494,7 @@ void sctp_assoc_rwnd_increase(struct sctp_association *asoc, unsigned int len) asoc->peer.sack_needed = 0; - sctp_outq_tail(&asoc->outqueue, sack); + sctp_outq_tail(&asoc->outqueue, sack, GFP_ATOMIC); /* Stop the SACK timer. */ timer = &asoc->timers[SCTP_EVENT_TIMEOUT_SACK]; diff --git a/net/sctp/auth.c b/net/sctp/auth.c index 1543e39f4..912eb1685 100644 --- a/net/sctp/auth.c +++ b/net/sctp/auth.c @@ -27,9 +27,9 @@ * Vlad Yasevich <vladislav.yasevich@hp.com> */ +#include <crypto/hash.h> #include <linux/slab.h> #include <linux/types.h> -#include <linux/crypto.h> #include <linux/scatterlist.h> #include <net/sctp/sctp.h> #include <net/sctp/auth.h> @@ -448,7 +448,7 @@ struct sctp_shared_key *sctp_auth_get_shkey( */ int sctp_auth_init_hmacs(struct sctp_endpoint *ep, gfp_t gfp) { - struct crypto_hash *tfm = NULL; + struct crypto_shash *tfm = NULL; __u16 id; /* If AUTH extension is disabled, we are done */ @@ -462,9 +462,8 @@ int sctp_auth_init_hmacs(struct sctp_endpoint *ep, gfp_t gfp) return 0; /* Allocated the array of pointers to transorms */ - ep->auth_hmacs = kzalloc( - sizeof(struct crypto_hash *) * SCTP_AUTH_NUM_HMACS, - gfp); + ep->auth_hmacs = kzalloc(sizeof(struct crypto_shash *) * + SCTP_AUTH_NUM_HMACS, gfp); if (!ep->auth_hmacs) return -ENOMEM; @@ -483,8 +482,7 @@ int sctp_auth_init_hmacs(struct sctp_endpoint *ep, gfp_t gfp) continue; /* Allocate the ID */ - tfm = crypto_alloc_hash(sctp_hmac_list[id].hmac_name, 0, - CRYPTO_ALG_ASYNC); + tfm = crypto_alloc_shash(sctp_hmac_list[id].hmac_name, 0, 0); if (IS_ERR(tfm)) goto out_err; @@ -500,7 +498,7 @@ out_err: } /* Destroy the hmac tfm array */ -void sctp_auth_destroy_hmacs(struct crypto_hash *auth_hmacs[]) +void sctp_auth_destroy_hmacs(struct crypto_shash *auth_hmacs[]) { int i; @@ -508,8 +506,7 @@ void sctp_auth_destroy_hmacs(struct crypto_hash *auth_hmacs[]) return; for (i = 0; i < SCTP_AUTH_NUM_HMACS; i++) { - if (auth_hmacs[i]) - crypto_free_hash(auth_hmacs[i]); + crypto_free_shash(auth_hmacs[i]); } kfree(auth_hmacs); } @@ -709,8 +706,7 @@ void sctp_auth_calculate_hmac(const struct sctp_association *asoc, struct sctp_auth_chunk *auth, gfp_t gfp) { - struct scatterlist sg; - struct hash_desc desc; + struct crypto_shash *tfm; struct sctp_auth_bytes *asoc_key; __u16 key_id, hmac_id; __u8 *digest; @@ -742,16 +738,22 @@ void sctp_auth_calculate_hmac(const struct sctp_association *asoc, /* set up scatter list */ end = skb_tail_pointer(skb); - sg_init_one(&sg, auth, end - (unsigned char *)auth); - desc.tfm = asoc->ep->auth_hmacs[hmac_id]; - desc.flags = 0; + tfm = asoc->ep->auth_hmacs[hmac_id]; digest = auth->auth_hdr.hmac; - if (crypto_hash_setkey(desc.tfm, &asoc_key->data[0], asoc_key->len)) + if (crypto_shash_setkey(tfm, &asoc_key->data[0], asoc_key->len)) goto free; - crypto_hash_digest(&desc, &sg, sg.length, digest); + { + SHASH_DESC_ON_STACK(desc, tfm); + + desc->tfm = tfm; + desc->flags = 0; + crypto_shash_digest(desc, (u8 *)auth, + end - (unsigned char *)auth, digest); + shash_desc_zero(desc); + } free: if (free_key) diff --git a/net/sctp/bind_addr.c b/net/sctp/bind_addr.c index 871cdf956..401c60750 100644 --- a/net/sctp/bind_addr.c +++ b/net/sctp/bind_addr.c @@ -111,7 +111,8 @@ int sctp_bind_addr_dup(struct sctp_bind_addr *dest, dest->port = src->port; list_for_each_entry(addr, &src->address_list, list) { - error = sctp_add_bind_addr(dest, &addr->a, 1, gfp); + error = sctp_add_bind_addr(dest, &addr->a, sizeof(addr->a), + 1, gfp); if (error < 0) break; } @@ -150,7 +151,7 @@ void sctp_bind_addr_free(struct sctp_bind_addr *bp) /* Add an address to the bind address list in the SCTP_bind_addr structure. */ int sctp_add_bind_addr(struct sctp_bind_addr *bp, union sctp_addr *new, - __u8 addr_state, gfp_t gfp) + int new_size, __u8 addr_state, gfp_t gfp) { struct sctp_sockaddr_entry *addr; @@ -159,7 +160,7 @@ int sctp_add_bind_addr(struct sctp_bind_addr *bp, union sctp_addr *new, if (!addr) return -ENOMEM; - memcpy(&addr->a, new, sizeof(*new)); + memcpy(&addr->a, new, min_t(size_t, sizeof(*new), new_size)); /* Fix up the port if it has not yet been set. * Both v4 and v6 have the port at the same offset. @@ -291,7 +292,8 @@ int sctp_raw_to_bind_addrs(struct sctp_bind_addr *bp, __u8 *raw_addr_list, } af->from_addr_param(&addr, rawaddr, htons(port), 0); - retval = sctp_add_bind_addr(bp, &addr, SCTP_ADDR_SRC, gfp); + retval = sctp_add_bind_addr(bp, &addr, sizeof(addr), + SCTP_ADDR_SRC, gfp); if (retval) { /* Can't finish building the list, clean up. */ sctp_bind_addr_clean(bp); @@ -453,8 +455,8 @@ static int sctp_copy_one_addr(struct net *net, struct sctp_bind_addr *dest, (((AF_INET6 == addr->sa.sa_family) && (flags & SCTP_ADDR6_ALLOWED) && (flags & SCTP_ADDR6_PEERSUPP)))) - error = sctp_add_bind_addr(dest, addr, SCTP_ADDR_SRC, - gfp); + error = sctp_add_bind_addr(dest, addr, sizeof(*addr), + SCTP_ADDR_SRC, gfp); } return error; diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c index a3380917f..958ef5f33 100644 --- a/net/sctp/chunk.c +++ b/net/sctp/chunk.c @@ -70,19 +70,6 @@ static struct sctp_datamsg *sctp_datamsg_new(gfp_t gfp) return msg; } -void sctp_datamsg_free(struct sctp_datamsg *msg) -{ - struct sctp_chunk *chunk; - - /* This doesn't have to be a _safe vairant because - * sctp_chunk_free() only drops the refs. - */ - list_for_each_entry(chunk, &msg->chunks, frag_list) - sctp_chunk_free(chunk); - - sctp_datamsg_put(msg); -} - /* Final destructruction of datamsg memory. */ static void sctp_datamsg_destroy(struct sctp_datamsg *msg) { @@ -273,7 +260,8 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc, frag |= SCTP_DATA_SACK_IMM; } - chunk = sctp_make_datafrag_empty(asoc, sinfo, len, frag, 0); + chunk = sctp_make_datafrag_empty(asoc, sinfo, len, frag, + 0, GFP_KERNEL); if (!chunk) { err = -ENOMEM; @@ -309,7 +297,8 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc, (sinfo->sinfo_flags & SCTP_SACK_IMMEDIATELY)) frag |= SCTP_DATA_SACK_IMM; - chunk = sctp_make_datafrag_empty(asoc, sinfo, over, frag, 0); + chunk = sctp_make_datafrag_empty(asoc, sinfo, over, frag, + 0, GFP_KERNEL); if (!chunk) { err = -ENOMEM; diff --git a/net/sctp/endpointola.c b/net/sctp/endpointola.c index 2522a6175..9d494e35e 100644 --- a/net/sctp/endpointola.c +++ b/net/sctp/endpointola.c @@ -42,7 +42,6 @@ #include <linux/slab.h> #include <linux/in.h> #include <linux/random.h> /* get_random_bytes() */ -#include <linux/crypto.h> #include <net/sock.h> #include <net/ipv6.h> #include <net/sctp/sctp.h> diff --git a/net/sctp/input.c b/net/sctp/input.c index 49d2cc751..00b844536 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -221,7 +221,7 @@ int sctp_rcv(struct sk_buff *skb) goto discard_release; /* Create an SCTP packet structure. */ - chunk = sctp_chunkify(skb, asoc, sk); + chunk = sctp_chunkify(skb, asoc, sk, GFP_ATOMIC); if (!chunk) goto discard_release; SCTP_INPUT_CB(skb)->chunk = chunk; @@ -606,7 +606,8 @@ void sctp_v4_err(struct sk_buff *skb, __u32 info) /* PMTU discovery (RFC1191) */ if (ICMP_FRAG_NEEDED == code) { - sctp_icmp_frag_needed(sk, asoc, transport, info); + sctp_icmp_frag_needed(sk, asoc, transport, + WORD_TRUNC(info)); goto out_unlock; } else { if (ICMP_PROT_UNREACH == code) { @@ -937,7 +938,6 @@ static struct sctp_association *__sctp_lookup_association( struct sctp_transport *t; struct sctp_association *asoc = NULL; - rcu_read_lock(); t = sctp_addrs_lookup_transport(net, local, peer); if (!t || !sctp_transport_hold(t)) goto out; @@ -949,7 +949,6 @@ static struct sctp_association *__sctp_lookup_association( sctp_transport_put(t); out: - rcu_read_unlock(); return asoc; } @@ -962,7 +961,9 @@ struct sctp_association *sctp_lookup_association(struct net *net, { struct sctp_association *asoc; + rcu_read_lock(); asoc = __sctp_lookup_association(net, laddr, paddr, transportp); + rcu_read_unlock(); return asoc; } diff --git a/net/sctp/output.c b/net/sctp/output.c index 9d610eddd..9844fe573 100644 --- a/net/sctp/output.c +++ b/net/sctp/output.c @@ -153,7 +153,7 @@ void sctp_packet_free(struct sctp_packet *packet) */ sctp_xmit_t sctp_packet_transmit_chunk(struct sctp_packet *packet, struct sctp_chunk *chunk, - int one_packet) + int one_packet, gfp_t gfp) { sctp_xmit_t retval; int error = 0; @@ -163,7 +163,7 @@ sctp_xmit_t sctp_packet_transmit_chunk(struct sctp_packet *packet, switch ((retval = (sctp_packet_append_chunk(packet, chunk)))) { case SCTP_XMIT_PMTU_FULL: if (!packet->has_cookie_echo) { - error = sctp_packet_transmit(packet); + error = sctp_packet_transmit(packet, gfp); if (error < 0) chunk->skb->sk->sk_err = -error; @@ -376,7 +376,7 @@ static void sctp_packet_set_owner_w(struct sk_buff *skb, struct sock *sk) * * The return value is a normal kernel error return value. */ -int sctp_packet_transmit(struct sctp_packet *packet) +int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp) { struct sctp_transport *tp = packet->transport; struct sctp_association *asoc = tp->asoc; @@ -401,7 +401,7 @@ int sctp_packet_transmit(struct sctp_packet *packet) sk = chunk->skb->sk; /* Allocate the new skb. */ - nskb = alloc_skb(packet->size + MAX_HEADER, GFP_ATOMIC); + nskb = alloc_skb(packet->size + MAX_HEADER, gfp); if (!nskb) goto nomem; @@ -523,8 +523,8 @@ int sctp_packet_transmit(struct sctp_packet *packet) */ if (auth) sctp_auth_calculate_hmac(asoc, nskb, - (struct sctp_auth_chunk *)auth, - GFP_ATOMIC); + (struct sctp_auth_chunk *)auth, + gfp); /* 2) Calculate the Adler-32 checksum of the whole packet, * including the SCTP common header and all the @@ -705,7 +705,8 @@ static sctp_xmit_t sctp_packet_can_append_data(struct sctp_packet *packet, /* Check whether this chunk and all the rest of pending data will fit * or delay in hopes of bundling a full sized packet. */ - if (chunk->skb->len + q->out_qlen >= transport->pathmtu - packet->overhead) + if (chunk->skb->len + q->out_qlen > + transport->pathmtu - packet->overhead - sizeof(sctp_data_chunk_t) - 4) /* Enough data queued to fill a packet */ return SCTP_XMIT_OK; diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c index c0380cfb1..084718f9b 100644 --- a/net/sctp/outqueue.c +++ b/net/sctp/outqueue.c @@ -68,7 +68,7 @@ static void sctp_mark_missing(struct sctp_outq *q, static void sctp_generate_fwdtsn(struct sctp_outq *q, __u32 sack_ctsn); -static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout); +static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp); /* Add data to the front of the queue. */ static inline void sctp_outq_head_data(struct sctp_outq *q, @@ -285,7 +285,7 @@ void sctp_outq_free(struct sctp_outq *q) } /* Put a new chunk in an sctp_outq. */ -int sctp_outq_tail(struct sctp_outq *q, struct sctp_chunk *chunk) +int sctp_outq_tail(struct sctp_outq *q, struct sctp_chunk *chunk, gfp_t gfp) { struct net *net = sock_net(q->asoc->base.sk); int error = 0; @@ -341,7 +341,7 @@ int sctp_outq_tail(struct sctp_outq *q, struct sctp_chunk *chunk) return error; if (!q->cork) - error = sctp_outq_flush(q, 0); + error = sctp_outq_flush(q, 0, gfp); return error; } @@ -510,7 +510,7 @@ void sctp_retransmit(struct sctp_outq *q, struct sctp_transport *transport, * will be flushed at the end. */ if (reason != SCTP_RTXR_FAST_RTX) - error = sctp_outq_flush(q, /* rtx_timeout */ 1); + error = sctp_outq_flush(q, /* rtx_timeout */ 1, GFP_ATOMIC); if (error) q->asoc->base.sk->sk_err = -error; @@ -601,12 +601,12 @@ redo: * control chunks are already freed so there * is nothing we can do. */ - sctp_packet_transmit(pkt); + sctp_packet_transmit(pkt, GFP_ATOMIC); goto redo; } /* Send this packet. */ - error = sctp_packet_transmit(pkt); + error = sctp_packet_transmit(pkt, GFP_ATOMIC); /* If we are retransmitting, we should only * send a single packet. @@ -622,7 +622,7 @@ redo: case SCTP_XMIT_RWND_FULL: /* Send this packet. */ - error = sctp_packet_transmit(pkt); + error = sctp_packet_transmit(pkt, GFP_ATOMIC); /* Stop sending DATA as there is no more room * at the receiver. @@ -632,7 +632,7 @@ redo: case SCTP_XMIT_DELAY: /* Send this packet. */ - error = sctp_packet_transmit(pkt); + error = sctp_packet_transmit(pkt, GFP_ATOMIC); /* Stop sending DATA because of nagle delay. */ done = 1; @@ -685,12 +685,12 @@ redo: } /* Cork the outqueue so queued chunks are really queued. */ -int sctp_outq_uncork(struct sctp_outq *q) +int sctp_outq_uncork(struct sctp_outq *q, gfp_t gfp) { if (q->cork) q->cork = 0; - return sctp_outq_flush(q, 0); + return sctp_outq_flush(q, 0, gfp); } @@ -703,7 +703,7 @@ int sctp_outq_uncork(struct sctp_outq *q) * locking concerns must be made. Today we use the sock lock to protect * this function. */ -static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout) +static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp) { struct sctp_packet *packet; struct sctp_packet singleton; @@ -825,7 +825,7 @@ static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout) sctp_packet_init(&singleton, transport, sport, dport); sctp_packet_config(&singleton, vtag, 0); sctp_packet_append_chunk(&singleton, chunk); - error = sctp_packet_transmit(&singleton); + error = sctp_packet_transmit(&singleton, gfp); if (error < 0) return error; break; @@ -856,7 +856,7 @@ static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout) case SCTP_CID_ASCONF: case SCTP_CID_FWD_TSN: status = sctp_packet_transmit_chunk(packet, chunk, - one_packet); + one_packet, gfp); if (status != SCTP_XMIT_OK) { /* put the chunk back */ list_add(&chunk->list, &q->control_chunk_list); @@ -866,8 +866,10 @@ static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout) * sender MUST assure that at least one T3-rtx * timer is running. */ - if (chunk->chunk_hdr->type == SCTP_CID_FWD_TSN) - sctp_transport_reset_timers(transport); + if (chunk->chunk_hdr->type == SCTP_CID_FWD_TSN) { + sctp_transport_reset_t3_rtx(transport); + transport->last_time_sent = jiffies; + } } break; @@ -924,8 +926,10 @@ static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout) error = sctp_outq_flush_rtx(q, packet, rtx_timeout, &start_timer); - if (start_timer) - sctp_transport_reset_timers(transport); + if (start_timer) { + sctp_transport_reset_t3_rtx(transport); + transport->last_time_sent = jiffies; + } /* This can happen on COOKIE-ECHO resend. Only * one chunk can get bundled with a COOKIE-ECHO. @@ -978,8 +982,12 @@ static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout) (new_transport->state == SCTP_UNCONFIRMED) || (new_transport->state == SCTP_PF))) new_transport = asoc->peer.active_path; - if (new_transport->state == SCTP_UNCONFIRMED) + if (new_transport->state == SCTP_UNCONFIRMED) { + WARN_ONCE(1, "Atempt to send packet on unconfirmed path."); + sctp_chunk_fail(chunk, 0); + sctp_chunk_free(chunk); continue; + } /* Change packets if necessary. */ if (new_transport != transport) { @@ -1011,7 +1019,7 @@ static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout) atomic_read(&chunk->skb->users) : -1); /* Add the chunk to the packet. */ - status = sctp_packet_transmit_chunk(packet, chunk, 0); + status = sctp_packet_transmit_chunk(packet, chunk, 0, gfp); switch (status) { case SCTP_XMIT_PMTU_FULL: @@ -1058,7 +1066,8 @@ static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout) list_add_tail(&chunk->transmitted_list, &transport->transmitted); - sctp_transport_reset_timers(transport); + sctp_transport_reset_t3_rtx(transport); + transport->last_time_sent = jiffies; /* Only let one DATA chunk get bundled with a * COOKIE-ECHO chunk. @@ -1088,7 +1097,7 @@ sctp_flush_out: send_ready); packet = &t->packet; if (!sctp_packet_empty(packet)) - error = sctp_packet_transmit(packet); + error = sctp_packet_transmit(packet, gfp); /* Clear the burst limited state, if any */ sctp_transport_burst_reset(t); diff --git a/net/sctp/probe.c b/net/sctp/probe.c index 5e68b94ee..6cc2152e0 100644 --- a/net/sctp/probe.c +++ b/net/sctp/probe.c @@ -65,7 +65,7 @@ static struct { struct kfifo fifo; spinlock_t lock; wait_queue_head_t wait; - struct timespec tstart; + struct timespec64 tstart; } sctpw; static __printf(1, 2) void printl(const char *fmt, ...) @@ -85,7 +85,7 @@ static __printf(1, 2) void printl(const char *fmt, ...) static int sctpprobe_open(struct inode *inode, struct file *file) { kfifo_reset(&sctpw.fifo); - getnstimeofday(&sctpw.tstart); + ktime_get_ts64(&sctpw.tstart); return 0; } @@ -138,7 +138,7 @@ static sctp_disposition_t jsctp_sf_eat_sack(struct net *net, struct sk_buff *skb = chunk->skb; struct sctp_transport *sp; static __u32 lcwnd = 0; - struct timespec now; + struct timespec64 now; sp = asoc->peer.primary_path; @@ -149,8 +149,8 @@ static sctp_disposition_t jsctp_sf_eat_sack(struct net *net, (full || sp->cwnd != lcwnd)) { lcwnd = sp->cwnd; - getnstimeofday(&now); - now = timespec_sub(now, sctpw.tstart); + ktime_get_ts64(&now); + now = timespec64_sub(now, sctpw.tstart); printl("%lu.%06lu ", (unsigned long) now.tv_sec, (unsigned long) now.tv_nsec / NSEC_PER_USEC); diff --git a/net/sctp/proc.c b/net/sctp/proc.c index 963dffcc2..5cfac8d5d 100644 --- a/net/sctp/proc.c +++ b/net/sctp/proc.c @@ -161,7 +161,6 @@ static void sctp_seq_dump_remote_addrs(struct seq_file *seq, struct sctp_associa struct sctp_af *af; primary = &assoc->peer.primary_addr; - rcu_read_lock(); list_for_each_entry_rcu(transport, &assoc->peer.transport_addr_list, transports) { addr = &transport->ipaddr; @@ -172,7 +171,6 @@ static void sctp_seq_dump_remote_addrs(struct seq_file *seq, struct sctp_associa } af->seq_dump_addr(seq, addr); } - rcu_read_unlock(); } static void *sctp_eps_seq_start(struct seq_file *seq, loff_t *pos) diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 1099e99a5..d3d50daa2 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -216,6 +216,7 @@ int sctp_copy_local_addr_list(struct net *net, struct sctp_bind_addr *bp, (copy_flags & SCTP_ADDR6_ALLOWED) && (copy_flags & SCTP_ADDR6_PEERSUPP)))) { error = sctp_add_bind_addr(bp, &addr->a, + sizeof(addr->a), SCTP_ADDR_SRC, GFP_ATOMIC); if (error) goto end_copy; diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index 5d6a03fad..56f364d8f 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -45,6 +45,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <crypto/hash.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/ip.h> @@ -52,7 +53,6 @@ #include <linux/net.h> #include <linux/inet.h> #include <linux/scatterlist.h> -#include <linux/crypto.h> #include <linux/slab.h> #include <net/sock.h> @@ -62,11 +62,13 @@ #include <net/sctp/sm.h> static struct sctp_chunk *sctp_make_control(const struct sctp_association *asoc, - __u8 type, __u8 flags, int paylen); + __u8 type, __u8 flags, int paylen, + gfp_t gfp); static struct sctp_chunk *sctp_make_data(const struct sctp_association *asoc, - __u8 flags, int paylen); + __u8 flags, int paylen, gfp_t gfp); static struct sctp_chunk *_sctp_make_chunk(const struct sctp_association *asoc, - __u8 type, __u8 flags, int paylen); + __u8 type, __u8 flags, int paylen, + gfp_t gfp); static sctp_cookie_param_t *sctp_pack_cookie(const struct sctp_endpoint *ep, const struct sctp_association *asoc, const struct sctp_chunk *init_chunk, @@ -318,7 +320,7 @@ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc, * PLEASE DO NOT FIXME [This version does not support Host Name.] */ - retval = sctp_make_control(asoc, SCTP_CID_INIT, 0, chunksize); + retval = sctp_make_control(asoc, SCTP_CID_INIT, 0, chunksize, gfp); if (!retval) goto nodata; @@ -465,7 +467,7 @@ struct sctp_chunk *sctp_make_init_ack(const struct sctp_association *asoc, num_ext); /* Now allocate and fill out the chunk. */ - retval = sctp_make_control(asoc, SCTP_CID_INIT_ACK, 0, chunksize); + retval = sctp_make_control(asoc, SCTP_CID_INIT_ACK, 0, chunksize, gfp); if (!retval) goto nomem_chunk; @@ -570,7 +572,8 @@ struct sctp_chunk *sctp_make_cookie_echo(const struct sctp_association *asoc, cookie_len = asoc->peer.cookie_len; /* Build a cookie echo chunk. */ - retval = sctp_make_control(asoc, SCTP_CID_COOKIE_ECHO, 0, cookie_len); + retval = sctp_make_control(asoc, SCTP_CID_COOKIE_ECHO, 0, + cookie_len, GFP_ATOMIC); if (!retval) goto nodata; retval->subh.cookie_hdr = @@ -615,7 +618,7 @@ struct sctp_chunk *sctp_make_cookie_ack(const struct sctp_association *asoc, { struct sctp_chunk *retval; - retval = sctp_make_control(asoc, SCTP_CID_COOKIE_ACK, 0, 0); + retval = sctp_make_control(asoc, SCTP_CID_COOKIE_ACK, 0, 0, GFP_ATOMIC); /* RFC 2960 6.4 Multi-homed SCTP Endpoints * @@ -664,7 +667,7 @@ struct sctp_chunk *sctp_make_cwr(const struct sctp_association *asoc, cwr.lowest_tsn = htonl(lowest_tsn); retval = sctp_make_control(asoc, SCTP_CID_ECN_CWR, 0, - sizeof(sctp_cwrhdr_t)); + sizeof(sctp_cwrhdr_t), GFP_ATOMIC); if (!retval) goto nodata; @@ -698,7 +701,7 @@ struct sctp_chunk *sctp_make_ecne(const struct sctp_association *asoc, ecne.lowest_tsn = htonl(lowest_tsn); retval = sctp_make_control(asoc, SCTP_CID_ECN_ECNE, 0, - sizeof(sctp_ecnehdr_t)); + sizeof(sctp_ecnehdr_t), GFP_ATOMIC); if (!retval) goto nodata; retval->subh.ecne_hdr = @@ -713,7 +716,8 @@ nodata: */ struct sctp_chunk *sctp_make_datafrag_empty(struct sctp_association *asoc, const struct sctp_sndrcvinfo *sinfo, - int data_len, __u8 flags, __u16 ssn) + int data_len, __u8 flags, __u16 ssn, + gfp_t gfp) { struct sctp_chunk *retval; struct sctp_datahdr dp; @@ -734,7 +738,7 @@ struct sctp_chunk *sctp_make_datafrag_empty(struct sctp_association *asoc, dp.ssn = htons(ssn); chunk_len = sizeof(dp) + data_len; - retval = sctp_make_data(asoc, flags, chunk_len); + retval = sctp_make_data(asoc, flags, chunk_len, gfp); if (!retval) goto nodata; @@ -781,7 +785,7 @@ struct sctp_chunk *sctp_make_sack(const struct sctp_association *asoc) + sizeof(__u32) * num_dup_tsns; /* Create the chunk. */ - retval = sctp_make_control(asoc, SCTP_CID_SACK, 0, len); + retval = sctp_make_control(asoc, SCTP_CID_SACK, 0, len, GFP_ATOMIC); if (!retval) goto nodata; @@ -861,7 +865,7 @@ struct sctp_chunk *sctp_make_shutdown(const struct sctp_association *asoc, shut.cum_tsn_ack = htonl(ctsn); retval = sctp_make_control(asoc, SCTP_CID_SHUTDOWN, 0, - sizeof(sctp_shutdownhdr_t)); + sizeof(sctp_shutdownhdr_t), GFP_ATOMIC); if (!retval) goto nodata; @@ -879,7 +883,8 @@ struct sctp_chunk *sctp_make_shutdown_ack(const struct sctp_association *asoc, { struct sctp_chunk *retval; - retval = sctp_make_control(asoc, SCTP_CID_SHUTDOWN_ACK, 0, 0); + retval = sctp_make_control(asoc, SCTP_CID_SHUTDOWN_ACK, 0, 0, + GFP_ATOMIC); /* RFC 2960 6.4 Multi-homed SCTP Endpoints * @@ -908,7 +913,8 @@ struct sctp_chunk *sctp_make_shutdown_complete( */ flags |= asoc ? 0 : SCTP_CHUNK_FLAG_T; - retval = sctp_make_control(asoc, SCTP_CID_SHUTDOWN_COMPLETE, flags, 0); + retval = sctp_make_control(asoc, SCTP_CID_SHUTDOWN_COMPLETE, flags, + 0, GFP_ATOMIC); /* RFC 2960 6.4 Multi-homed SCTP Endpoints * @@ -947,7 +953,8 @@ struct sctp_chunk *sctp_make_abort(const struct sctp_association *asoc, flags = SCTP_CHUNK_FLAG_T; } - retval = sctp_make_control(asoc, SCTP_CID_ABORT, flags, hint); + retval = sctp_make_control(asoc, SCTP_CID_ABORT, flags, hint, + GFP_ATOMIC); /* RFC 2960 6.4 Multi-homed SCTP Endpoints * @@ -1139,7 +1146,8 @@ struct sctp_chunk *sctp_make_heartbeat(const struct sctp_association *asoc, struct sctp_chunk *retval; sctp_sender_hb_info_t hbinfo; - retval = sctp_make_control(asoc, SCTP_CID_HEARTBEAT, 0, sizeof(hbinfo)); + retval = sctp_make_control(asoc, SCTP_CID_HEARTBEAT, 0, + sizeof(hbinfo), GFP_ATOMIC); if (!retval) goto nodata; @@ -1167,7 +1175,8 @@ struct sctp_chunk *sctp_make_heartbeat_ack(const struct sctp_association *asoc, { struct sctp_chunk *retval; - retval = sctp_make_control(asoc, SCTP_CID_HEARTBEAT_ACK, 0, paylen); + retval = sctp_make_control(asoc, SCTP_CID_HEARTBEAT_ACK, 0, paylen, + GFP_ATOMIC); if (!retval) goto nodata; @@ -1200,7 +1209,7 @@ static struct sctp_chunk *sctp_make_op_error_space( struct sctp_chunk *retval; retval = sctp_make_control(asoc, SCTP_CID_ERROR, 0, - sizeof(sctp_errhdr_t) + size); + sizeof(sctp_errhdr_t) + size, GFP_ATOMIC); if (!retval) goto nodata; @@ -1271,7 +1280,8 @@ struct sctp_chunk *sctp_make_auth(const struct sctp_association *asoc) return NULL; retval = sctp_make_control(asoc, SCTP_CID_AUTH, 0, - hmac_desc->hmac_len + sizeof(sctp_authhdr_t)); + hmac_desc->hmac_len + sizeof(sctp_authhdr_t), + GFP_ATOMIC); if (!retval) return NULL; @@ -1309,11 +1319,11 @@ struct sctp_chunk *sctp_make_auth(const struct sctp_association *asoc) */ struct sctp_chunk *sctp_chunkify(struct sk_buff *skb, const struct sctp_association *asoc, - struct sock *sk) + struct sock *sk, gfp_t gfp) { struct sctp_chunk *retval; - retval = kmem_cache_zalloc(sctp_chunk_cachep, GFP_ATOMIC); + retval = kmem_cache_zalloc(sctp_chunk_cachep, gfp); if (!retval) goto nodata; @@ -1361,7 +1371,8 @@ const union sctp_addr *sctp_source(const struct sctp_chunk *chunk) * arguments, reserving enough space for a 'paylen' byte payload. */ static struct sctp_chunk *_sctp_make_chunk(const struct sctp_association *asoc, - __u8 type, __u8 flags, int paylen) + __u8 type, __u8 flags, int paylen, + gfp_t gfp) { struct sctp_chunk *retval; sctp_chunkhdr_t *chunk_hdr; @@ -1369,8 +1380,7 @@ static struct sctp_chunk *_sctp_make_chunk(const struct sctp_association *asoc, struct sock *sk; /* No need to allocate LL here, as this is only a chunk. */ - skb = alloc_skb(WORD_ROUND(sizeof(sctp_chunkhdr_t) + paylen), - GFP_ATOMIC); + skb = alloc_skb(WORD_ROUND(sizeof(sctp_chunkhdr_t) + paylen), gfp); if (!skb) goto nodata; @@ -1381,7 +1391,7 @@ static struct sctp_chunk *_sctp_make_chunk(const struct sctp_association *asoc, chunk_hdr->length = htons(sizeof(sctp_chunkhdr_t)); sk = asoc ? asoc->base.sk : NULL; - retval = sctp_chunkify(skb, asoc, sk); + retval = sctp_chunkify(skb, asoc, sk, gfp); if (!retval) { kfree_skb(skb); goto nodata; @@ -1400,16 +1410,18 @@ nodata: } static struct sctp_chunk *sctp_make_data(const struct sctp_association *asoc, - __u8 flags, int paylen) + __u8 flags, int paylen, gfp_t gfp) { - return _sctp_make_chunk(asoc, SCTP_CID_DATA, flags, paylen); + return _sctp_make_chunk(asoc, SCTP_CID_DATA, flags, paylen, gfp); } static struct sctp_chunk *sctp_make_control(const struct sctp_association *asoc, - __u8 type, __u8 flags, int paylen) + __u8 type, __u8 flags, int paylen, + gfp_t gfp) { - struct sctp_chunk *chunk = _sctp_make_chunk(asoc, type, flags, paylen); + struct sctp_chunk *chunk; + chunk = _sctp_make_chunk(asoc, type, flags, paylen, gfp); if (chunk) sctp_control_set_owner_w(chunk); @@ -1606,7 +1618,6 @@ static sctp_cookie_param_t *sctp_pack_cookie(const struct sctp_endpoint *ep, { sctp_cookie_param_t *retval; struct sctp_signed_cookie *cookie; - struct scatterlist sg; int headersize, bodysize; /* Header size is static data prior to the actual cookie, including @@ -1663,16 +1674,19 @@ static sctp_cookie_param_t *sctp_pack_cookie(const struct sctp_endpoint *ep, ntohs(init_chunk->chunk_hdr->length), raw_addrs, addrs_len); if (sctp_sk(ep->base.sk)->hmac) { - struct hash_desc desc; + SHASH_DESC_ON_STACK(desc, sctp_sk(ep->base.sk)->hmac); + int err; /* Sign the message. */ - sg_init_one(&sg, &cookie->c, bodysize); - desc.tfm = sctp_sk(ep->base.sk)->hmac; - desc.flags = 0; - - if (crypto_hash_setkey(desc.tfm, ep->secret_key, - sizeof(ep->secret_key)) || - crypto_hash_digest(&desc, &sg, bodysize, cookie->signature)) + desc->tfm = sctp_sk(ep->base.sk)->hmac; + desc->flags = 0; + + err = crypto_shash_setkey(desc->tfm, ep->secret_key, + sizeof(ep->secret_key)) ?: + crypto_shash_digest(desc, (u8 *)&cookie->c, bodysize, + cookie->signature); + shash_desc_zero(desc); + if (err) goto free_cookie; } @@ -1697,12 +1711,10 @@ struct sctp_association *sctp_unpack_cookie( struct sctp_cookie *bear_cookie; int headersize, bodysize, fixed_size; __u8 *digest = ep->digest; - struct scatterlist sg; unsigned int len; sctp_scope_t scope; struct sk_buff *skb = chunk->skb; ktime_t kt; - struct hash_desc desc; /* Header size is static data prior to the actual cookie, including * any padding. @@ -1733,16 +1745,23 @@ struct sctp_association *sctp_unpack_cookie( goto no_hmac; /* Check the signature. */ - sg_init_one(&sg, bear_cookie, bodysize); - desc.tfm = sctp_sk(ep->base.sk)->hmac; - desc.flags = 0; - - memset(digest, 0x00, SCTP_SIGNATURE_SIZE); - if (crypto_hash_setkey(desc.tfm, ep->secret_key, - sizeof(ep->secret_key)) || - crypto_hash_digest(&desc, &sg, bodysize, digest)) { - *error = -SCTP_IERROR_NOMEM; - goto fail; + { + SHASH_DESC_ON_STACK(desc, sctp_sk(ep->base.sk)->hmac); + int err; + + desc->tfm = sctp_sk(ep->base.sk)->hmac; + desc->flags = 0; + + err = crypto_shash_setkey(desc->tfm, ep->secret_key, + sizeof(ep->secret_key)) ?: + crypto_shash_digest(desc, (u8 *)bear_cookie, bodysize, + digest); + shash_desc_zero(desc); + + if (err) { + *error = -SCTP_IERROR_NOMEM; + goto fail; + } } if (memcmp(digest, cookie->signature, SCTP_SIGNATURE_SIZE)) { @@ -1830,7 +1849,8 @@ no_hmac: /* Also, add the destination address. */ if (list_empty(&retval->base.bind_addr.address_list)) { sctp_add_bind_addr(&retval->base.bind_addr, &chunk->dest, - SCTP_ADDR_SRC, GFP_ATOMIC); + sizeof(chunk->dest), SCTP_ADDR_SRC, + GFP_ATOMIC); } retval->next_tsn = retval->c.initial_tsn; @@ -2756,7 +2776,8 @@ static struct sctp_chunk *sctp_make_asconf(struct sctp_association *asoc, length += addrlen; /* Create the chunk. */ - retval = sctp_make_control(asoc, SCTP_CID_ASCONF, 0, length); + retval = sctp_make_control(asoc, SCTP_CID_ASCONF, 0, length, + GFP_ATOMIC); if (!retval) return NULL; @@ -2940,7 +2961,8 @@ static struct sctp_chunk *sctp_make_asconf_ack(const struct sctp_association *as int length = sizeof(asconf) + vparam_len; /* Create the chunk. */ - retval = sctp_make_control(asoc, SCTP_CID_ASCONF_ACK, 0, length); + retval = sctp_make_control(asoc, SCTP_CID_ASCONF_ACK, 0, length, + GFP_ATOMIC); if (!retval) return NULL; @@ -3058,8 +3080,7 @@ static __be16 sctp_process_asconf_param(struct sctp_association *asoc, return SCTP_ERROR_RSRC_LOW; /* Start the heartbeat timer. */ - if (!mod_timer(&peer->hb_timer, sctp_transport_timeout(peer))) - sctp_transport_hold(peer); + sctp_transport_reset_hb_timer(peer); asoc->new_transport = peer; break; case SCTP_PARAM_DEL_IP: @@ -3500,7 +3521,7 @@ struct sctp_chunk *sctp_make_fwdtsn(const struct sctp_association *asoc, hint = (nstreams + 1) * sizeof(__u32); - retval = sctp_make_control(asoc, SCTP_CID_FWD_TSN, 0, hint); + retval = sctp_make_control(asoc, SCTP_CID_FWD_TSN, 0, hint, GFP_ATOMIC); if (!retval) return NULL; diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c index b5327bb77..41b081a64 100644 --- a/net/sctp/sm_sideeffect.c +++ b/net/sctp/sm_sideeffect.c @@ -69,8 +69,6 @@ static int sctp_side_effects(sctp_event_t event_type, sctp_subtype_t subtype, sctp_cmd_seq_t *commands, gfp_t gfp); -static void sctp_cmd_hb_timer_update(sctp_cmd_seq_t *cmds, - struct sctp_transport *t); /******************************************************************** * Helper functions ********************************************************************/ @@ -215,10 +213,14 @@ static int sctp_gen_sack(struct sctp_association *asoc, int force, sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART, SCTP_TO(SCTP_EVENT_TIMEOUT_SACK)); } else { + __u32 old_a_rwnd = asoc->a_rwnd; + asoc->a_rwnd = asoc->rwnd; sack = sctp_make_sack(asoc); - if (!sack) + if (!sack) { + asoc->a_rwnd = old_a_rwnd; goto nomem; + } asoc->peer.sack_needed = 0; asoc->peer.sack_cnt = 0; @@ -363,6 +365,7 @@ void sctp_generate_heartbeat_event(unsigned long data) struct sctp_association *asoc = transport->asoc; struct sock *sk = asoc->base.sk; struct net *net = sock_net(sk); + u32 elapsed, timeout; bh_lock_sock(sk); if (sock_owned_by_user(sk)) { @@ -374,6 +377,16 @@ void sctp_generate_heartbeat_event(unsigned long data) goto out_unlock; } + /* Check if we should still send the heartbeat or reschedule */ + elapsed = jiffies - transport->last_time_sent; + timeout = sctp_transport_timeout(transport); + if (elapsed < timeout) { + elapsed = timeout - elapsed; + if (!mod_timer(&transport->hb_timer, jiffies + elapsed)) + sctp_transport_hold(transport); + goto out_unlock; + } + error = sctp_do_sm(net, SCTP_EVENT_T_TIMEOUT, SCTP_ST_TIMEOUT(SCTP_EVENT_TIMEOUT_HEARTBEAT), asoc->state, asoc->ep, asoc, @@ -503,7 +516,7 @@ static void sctp_do_8_2_transport_strike(sctp_cmd_seq_t *commands, 0); /* Update the hb timer to resend a heartbeat every rto */ - sctp_cmd_hb_timer_update(commands, transport); + sctp_transport_reset_hb_timer(transport); } if (transport->state != SCTP_INACTIVE && @@ -630,11 +643,8 @@ static void sctp_cmd_hb_timers_start(sctp_cmd_seq_t *cmds, * hold a reference on the transport to make sure none of * the needed data structures go away. */ - list_for_each_entry(t, &asoc->peer.transport_addr_list, transports) { - - if (!mod_timer(&t->hb_timer, sctp_transport_timeout(t))) - sctp_transport_hold(t); - } + list_for_each_entry(t, &asoc->peer.transport_addr_list, transports) + sctp_transport_reset_hb_timer(t); } static void sctp_cmd_hb_timers_stop(sctp_cmd_seq_t *cmds, @@ -665,15 +675,6 @@ static void sctp_cmd_t3_rtx_timers_stop(sctp_cmd_seq_t *cmds, } -/* Helper function to update the heartbeat timer. */ -static void sctp_cmd_hb_timer_update(sctp_cmd_seq_t *cmds, - struct sctp_transport *t) -{ - /* Update the heartbeat timer. */ - if (!mod_timer(&t->hb_timer, sctp_transport_timeout(t))) - sctp_transport_hold(t); -} - /* Helper function to handle the reception of an HEARTBEAT ACK. */ static void sctp_cmd_transport_on(sctp_cmd_seq_t *cmds, struct sctp_association *asoc, @@ -738,8 +739,7 @@ static void sctp_cmd_transport_on(sctp_cmd_seq_t *cmds, sctp_transport_update_rto(t, (jiffies - hbinfo->sent_at)); /* Update the heartbeat timer. */ - if (!mod_timer(&t->hb_timer, sctp_transport_timeout(t))) - sctp_transport_hold(t); + sctp_transport_reset_hb_timer(t); if (was_unconfirmed && asoc->peer.transport_count == 1) sctp_transport_immediate_rtx(t); @@ -1019,13 +1019,13 @@ static void sctp_cmd_t1_timer_update(struct sctp_association *asoc, * encouraged for small fragments. */ static int sctp_cmd_send_msg(struct sctp_association *asoc, - struct sctp_datamsg *msg) + struct sctp_datamsg *msg, gfp_t gfp) { struct sctp_chunk *chunk; int error = 0; list_for_each_entry(chunk, &msg->chunks, frag_list) { - error = sctp_outq_tail(&asoc->outqueue, chunk); + error = sctp_outq_tail(&asoc->outqueue, chunk, gfp); if (error) break; } @@ -1249,7 +1249,7 @@ static int sctp_cmd_interpreter(sctp_event_t event_type, case SCTP_CMD_NEW_ASOC: /* Register a new association. */ if (local_cork) { - sctp_outq_uncork(&asoc->outqueue); + sctp_outq_uncork(&asoc->outqueue, gfp); local_cork = 0; } @@ -1269,7 +1269,7 @@ static int sctp_cmd_interpreter(sctp_event_t event_type, case SCTP_CMD_DELETE_TCB: if (local_cork) { - sctp_outq_uncork(&asoc->outqueue); + sctp_outq_uncork(&asoc->outqueue, gfp); local_cork = 0; } /* Delete the current association. */ @@ -1423,13 +1423,14 @@ static int sctp_cmd_interpreter(sctp_event_t event_type, local_cork = 1; } /* Send a chunk to our peer. */ - error = sctp_outq_tail(&asoc->outqueue, cmd->obj.chunk); + error = sctp_outq_tail(&asoc->outqueue, cmd->obj.chunk, + gfp); break; case SCTP_CMD_SEND_PKT: /* Send a full packet to our peer. */ packet = cmd->obj.packet; - sctp_packet_transmit(packet); + sctp_packet_transmit(packet, gfp); sctp_ootb_pkt_free(packet); break; @@ -1609,7 +1610,7 @@ static int sctp_cmd_interpreter(sctp_event_t event_type, case SCTP_CMD_HB_TIMER_UPDATE: t = cmd->obj.transport; - sctp_cmd_hb_timer_update(commands, t); + sctp_transport_reset_hb_timer(t); break; case SCTP_CMD_HB_TIMERS_STOP: @@ -1639,7 +1640,7 @@ static int sctp_cmd_interpreter(sctp_event_t event_type, */ chunk->pdiscard = 1; if (asoc) { - sctp_outq_uncork(&asoc->outqueue); + sctp_outq_uncork(&asoc->outqueue, gfp); local_cork = 0; } break; @@ -1677,7 +1678,7 @@ static int sctp_cmd_interpreter(sctp_event_t event_type, case SCTP_CMD_FORCE_PRIM_RETRAN: t = asoc->peer.retran_path; asoc->peer.retran_path = asoc->peer.primary_path; - error = sctp_outq_uncork(&asoc->outqueue); + error = sctp_outq_uncork(&asoc->outqueue, gfp); local_cork = 0; asoc->peer.retran_path = t; break; @@ -1704,7 +1705,7 @@ static int sctp_cmd_interpreter(sctp_event_t event_type, sctp_outq_cork(&asoc->outqueue); local_cork = 1; } - error = sctp_cmd_send_msg(asoc, cmd->obj.msg); + error = sctp_cmd_send_msg(asoc, cmd->obj.msg, gfp); break; case SCTP_CMD_SEND_NEXT_ASCONF: sctp_cmd_send_asconf(asoc); @@ -1734,9 +1735,9 @@ out: */ if (asoc && SCTP_EVENT_T_CHUNK == event_type && chunk) { if (chunk->end_of_packet || chunk->singleton) - error = sctp_outq_uncork(&asoc->outqueue); + error = sctp_outq_uncork(&asoc->outqueue, gfp); } else if (local_cork) - error = sctp_outq_uncork(&asoc->outqueue); + error = sctp_outq_uncork(&asoc->outqueue, gfp); return error; nomem: error = -ENOMEM; diff --git a/net/sctp/socket.c b/net/sctp/socket.c index e878da094..878d28eda 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -52,6 +52,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <crypto/hash.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/wait.h> @@ -61,7 +62,6 @@ #include <linux/fcntl.h> #include <linux/poll.h> #include <linux/init.h> -#include <linux/crypto.h> #include <linux/slab.h> #include <linux/file.h> #include <linux/compat.h> @@ -386,7 +386,8 @@ static int sctp_do_bind(struct sock *sk, union sctp_addr *addr, int len) /* Add the address to the bind address list. * Use GFP_ATOMIC since BHs will be disabled. */ - ret = sctp_add_bind_addr(bp, addr, SCTP_ADDR_SRC, GFP_ATOMIC); + ret = sctp_add_bind_addr(bp, addr, af->sockaddr_len, + SCTP_ADDR_SRC, GFP_ATOMIC); /* Copy back into socket for getsockname() use. */ if (!ret) { @@ -577,6 +578,7 @@ static int sctp_send_asconf_add_ip(struct sock *sk, af = sctp_get_af_specific(addr->v4.sin_family); memcpy(&saveaddr, addr, af->sockaddr_len); retval = sctp_add_bind_addr(bp, &saveaddr, + sizeof(saveaddr), SCTP_ADDR_NEW, GFP_ATOMIC); addr_buf += af->sockaddr_len; } @@ -1389,7 +1391,7 @@ static int sctp_getsockopt_connectx3(struct sock *sk, int len, int err = 0; #ifdef CONFIG_COMPAT - if (is_compat_task()) { + if (in_compat_syscall()) { struct compat_sctp_getaddrs_old param32; if (len < sizeof(param32)) @@ -4160,7 +4162,7 @@ static void sctp_destruct_sock(struct sock *sk) struct sctp_sock *sp = sctp_sk(sk); /* Free up the HMAC transform. */ - crypto_free_hash(sp->hmac); + crypto_free_shash(sp->hmac); inet_sock_destruct(sk); } @@ -6106,9 +6108,10 @@ static int sctp_getsockopt(struct sock *sk, int level, int optname, return retval; } -static void sctp_hash(struct sock *sk) +static int sctp_hash(struct sock *sk) { /* STUB */ + return 0; } static void sctp_unhash(struct sock *sk) @@ -6304,13 +6307,13 @@ static int sctp_listen_start(struct sock *sk, int backlog) { struct sctp_sock *sp = sctp_sk(sk); struct sctp_endpoint *ep = sp->ep; - struct crypto_hash *tfm = NULL; + struct crypto_shash *tfm = NULL; char alg[32]; /* Allocate HMAC for generating cookie. */ if (!sp->hmac && sp->sctp_hmac_alg) { sprintf(alg, "hmac(%s)", sp->sctp_hmac_alg); - tfm = crypto_alloc_hash(alg, 0, CRYPTO_ALG_ASYNC); + tfm = crypto_alloc_shash(alg, 0, 0); if (IS_ERR(tfm)) { net_info_ratelimited("failed to load transform for %s: %ld\n", sp->sctp_hmac_alg, PTR_ERR(tfm)); @@ -7253,14 +7256,12 @@ static void sctp_sock_migrate(struct sock *oldsk, struct sock *newsk, /* Hook this new socket in to the bind_hash list. */ head = &sctp_port_hashtable[sctp_phashfn(sock_net(oldsk), inet_sk(oldsk)->inet_num)]; - local_bh_disable(); - spin_lock(&head->lock); + spin_lock_bh(&head->lock); pp = sctp_sk(oldsk)->bind_hash; sk_add_bind_node(newsk, &pp->owner); sctp_sk(newsk)->bind_hash = pp; inet_sk(newsk)->inet_num = inet_sk(oldsk)->inet_num; - spin_unlock(&head->lock); - local_bh_enable(); + spin_unlock_bh(&head->lock); /* Copy the bind_addr list from the original endpoint to the new * endpoint so that we can handle restarts properly diff --git a/net/sctp/transport.c b/net/sctp/transport.c index a431c1404..81b86678b 100644 --- a/net/sctp/transport.c +++ b/net/sctp/transport.c @@ -72,7 +72,7 @@ static struct sctp_transport *sctp_transport_init(struct net *net, */ peer->rto = msecs_to_jiffies(net->sctp.rto_initial); - peer->last_time_heard = ktime_get(); + peer->last_time_heard = ktime_set(0, 0); peer->last_time_ecne_reduced = jiffies; peer->param_flags = SPP_HB_DISABLE | @@ -183,7 +183,7 @@ static void sctp_transport_destroy(struct sctp_transport *transport) /* Start T3_rtx timer if it is not already running and update the heartbeat * timer. This routine is called every time a DATA chunk is sent. */ -void sctp_transport_reset_timers(struct sctp_transport *transport) +void sctp_transport_reset_t3_rtx(struct sctp_transport *transport) { /* RFC 2960 6.3.2 Retransmission Timer Rules * @@ -197,11 +197,18 @@ void sctp_transport_reset_timers(struct sctp_transport *transport) if (!mod_timer(&transport->T3_rtx_timer, jiffies + transport->rto)) sctp_transport_hold(transport); +} + +void sctp_transport_reset_hb_timer(struct sctp_transport *transport) +{ + unsigned long expires; /* When a data chunk is sent, reset the heartbeat interval. */ - if (!mod_timer(&transport->hb_timer, - sctp_transport_timeout(transport))) - sctp_transport_hold(transport); + expires = jiffies + sctp_transport_timeout(transport); + if (time_before(transport->hb_timer.expires, expires) && + !mod_timer(&transport->hb_timer, + expires + prandom_u32_max(transport->rto))) + sctp_transport_hold(transport); } /* This transport has been assigned to an association. @@ -226,7 +233,7 @@ void sctp_transport_pmtu(struct sctp_transport *transport, struct sock *sk) } if (transport->dst) { - transport->pathmtu = dst_mtu(transport->dst); + transport->pathmtu = WORD_TRUNC(dst_mtu(transport->dst)); } else transport->pathmtu = SCTP_DEFAULT_MAXSEGMENT; } @@ -280,7 +287,7 @@ void sctp_transport_route(struct sctp_transport *transport, return; } if (transport->dst) { - transport->pathmtu = dst_mtu(transport->dst); + transport->pathmtu = WORD_TRUNC(dst_mtu(transport->dst)); /* Initialize sk->sk_rcv_saddr, if the transport is the * association's active path for getsockname(). @@ -595,13 +602,13 @@ void sctp_transport_burst_reset(struct sctp_transport *t) unsigned long sctp_transport_timeout(struct sctp_transport *trans) { /* RTO + timer slack +/- 50% of RTO */ - unsigned long timeout = (trans->rto >> 1) + prandom_u32_max(trans->rto); + unsigned long timeout = trans->rto >> 1; if (trans->state != SCTP_UNCONFIRMED && trans->state != SCTP_PF) timeout += trans->hbinterval; - return timeout + jiffies; + return timeout; } /* Reset transport variables to their initial values */ diff --git a/net/socket.c b/net/socket.c index db13ae893..5f77a8e93 100644 --- a/net/socket.c +++ b/net/socket.c @@ -533,7 +533,7 @@ static const struct inode_operations sockfs_inode_ops = { * NULL is returned. */ -static struct socket *sock_alloc(void) +struct socket *sock_alloc(void) { struct inode *inode; struct socket *sock; @@ -554,6 +554,7 @@ static struct socket *sock_alloc(void) this_cpu_add(sockets_in_use, 1); return sock; } +EXPORT_SYMBOL(sock_alloc); /** * sock_release - close a socket @@ -1106,12 +1107,8 @@ int __sock_create(struct net *net, int family, int type, int protocol, deadlock in module load. */ if (family == PF_INET && type == SOCK_PACKET) { - static int warned; - if (!warned) { - warned = 1; - pr_info("%s uses obsolete (PF_INET,SOCK_PACKET)\n", - current->comm); - } + pr_info_once("%s uses obsolete (PF_INET,SOCK_PACKET)\n", + current->comm); family = PF_PACKET; } @@ -1874,7 +1871,8 @@ static int copy_msghdr_from_user(struct msghdr *kmsg, static int ___sys_sendmsg(struct socket *sock, struct user_msghdr __user *msg, struct msghdr *msg_sys, unsigned int flags, - struct used_address *used_address) + struct used_address *used_address, + unsigned int allowed_msghdr_flags) { struct compat_msghdr __user *msg_compat = (struct compat_msghdr __user *)msg; @@ -1900,6 +1898,7 @@ static int ___sys_sendmsg(struct socket *sock, struct user_msghdr __user *msg, if (msg_sys->msg_controllen > INT_MAX) goto out_freeiov; + flags |= (msg_sys->msg_flags & allowed_msghdr_flags); ctl_len = msg_sys->msg_controllen; if ((MSG_CMSG_COMPAT & flags) && ctl_len) { err = @@ -1978,7 +1977,7 @@ long __sys_sendmsg(int fd, struct user_msghdr __user *msg, unsigned flags) if (!sock) goto out; - err = ___sys_sendmsg(sock, msg, &msg_sys, flags, NULL); + err = ___sys_sendmsg(sock, msg, &msg_sys, flags, NULL, 0); fput_light(sock->file, fput_needed); out: @@ -2005,6 +2004,7 @@ int __sys_sendmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen, struct compat_mmsghdr __user *compat_entry; struct msghdr msg_sys; struct used_address used_address; + unsigned int oflags = flags; if (vlen > UIO_MAXIOV) vlen = UIO_MAXIOV; @@ -2019,11 +2019,15 @@ int __sys_sendmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen, entry = mmsg; compat_entry = (struct compat_mmsghdr __user *)mmsg; err = 0; + flags |= MSG_BATCH; while (datagrams < vlen) { + if (datagrams == vlen - 1) + flags = oflags; + if (MSG_CMSG_COMPAT & flags) { err = ___sys_sendmsg(sock, (struct user_msghdr __user *)compat_entry, - &msg_sys, flags, &used_address); + &msg_sys, flags, &used_address, MSG_EOR); if (err < 0) break; err = __put_user(err, &compat_entry->msg_len); @@ -2031,7 +2035,7 @@ int __sys_sendmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen, } else { err = ___sys_sendmsg(sock, (struct user_msghdr __user *)entry, - &msg_sys, flags, &used_address); + &msg_sys, flags, &used_address, MSG_EOR); if (err < 0) break; err = put_user(err, &entry->msg_len); diff --git a/net/sunrpc/Makefile b/net/sunrpc/Makefile index b512fbd9d..ea7ffa12e 100644 --- a/net/sunrpc/Makefile +++ b/net/sunrpc/Makefile @@ -12,7 +12,8 @@ sunrpc-y := clnt.o xprt.o socklib.o xprtsock.o sched.o \ svc.o svcsock.o svcauth.o svcauth_unix.o \ addr.o rpcb_clnt.o timer.o xdr.o \ sunrpc_syms.o cache.o rpc_pipe.o \ - svc_xprt.o + svc_xprt.o \ + xprtmultipath.o sunrpc-$(CONFIG_SUNRPC_DEBUG) += debugfs.o sunrpc-$(CONFIG_SUNRPC_BACKCHANNEL) += backchannel_rqst.o sunrpc-$(CONFIG_PROC_FS) += stats.o diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index cabf586f4..15612ffa8 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -1181,12 +1181,12 @@ static struct rpc_auth * gss_create(struct rpc_auth_create_args *args, struct rpc_clnt *clnt) { struct gss_auth *gss_auth; - struct rpc_xprt *xprt = rcu_access_pointer(clnt->cl_xprt); + struct rpc_xprt_switch *xps = rcu_access_pointer(clnt->cl_xpi.xpi_xpswitch); while (clnt != clnt->cl_parent) { struct rpc_clnt *parent = clnt->cl_parent; /* Find the original parent for this transport */ - if (rcu_access_pointer(parent->cl_xprt) != xprt) + if (rcu_access_pointer(parent->cl_xpi.xpi_xpswitch) != xps) break; clnt = parent; } @@ -1728,8 +1728,8 @@ alloc_enc_pages(struct rpc_rqst *rqstp) return 0; } - first = snd_buf->page_base >> PAGE_CACHE_SHIFT; - last = (snd_buf->page_base + snd_buf->page_len - 1) >> PAGE_CACHE_SHIFT; + first = snd_buf->page_base >> PAGE_SHIFT; + last = (snd_buf->page_base + snd_buf->page_len - 1) >> PAGE_SHIFT; rqstp->rq_enc_pages_num = last - first + 1 + 1; rqstp->rq_enc_pages = kmalloc(rqstp->rq_enc_pages_num * sizeof(struct page *), @@ -1775,10 +1775,10 @@ gss_wrap_req_priv(struct rpc_cred *cred, struct gss_cl_ctx *ctx, status = alloc_enc_pages(rqstp); if (status) return status; - first = snd_buf->page_base >> PAGE_CACHE_SHIFT; + first = snd_buf->page_base >> PAGE_SHIFT; inpages = snd_buf->pages + first; snd_buf->pages = rqstp->rq_enc_pages; - snd_buf->page_base -= first << PAGE_CACHE_SHIFT; + snd_buf->page_base -= first << PAGE_SHIFT; /* * Give the tail its own page, in case we need extra space in the * head when wrapping: diff --git a/net/sunrpc/auth_gss/gss_krb5_crypto.c b/net/sunrpc/auth_gss/gss_krb5_crypto.c index fee3c15a4..244245bcb 100644 --- a/net/sunrpc/auth_gss/gss_krb5_crypto.c +++ b/net/sunrpc/auth_gss/gss_krb5_crypto.c @@ -34,11 +34,12 @@ * WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. */ +#include <crypto/hash.h> +#include <crypto/skcipher.h> #include <linux/err.h> #include <linux/types.h> #include <linux/mm.h> #include <linux/scatterlist.h> -#include <linux/crypto.h> #include <linux/highmem.h> #include <linux/pagemap.h> #include <linux/random.h> @@ -51,7 +52,7 @@ u32 krb5_encrypt( - struct crypto_blkcipher *tfm, + struct crypto_skcipher *tfm, void * iv, void * in, void * out, @@ -60,24 +61,29 @@ krb5_encrypt( u32 ret = -EINVAL; struct scatterlist sg[1]; u8 local_iv[GSS_KRB5_MAX_BLOCKSIZE] = {0}; - struct blkcipher_desc desc = { .tfm = tfm, .info = local_iv }; + SKCIPHER_REQUEST_ON_STACK(req, tfm); - if (length % crypto_blkcipher_blocksize(tfm) != 0) + if (length % crypto_skcipher_blocksize(tfm) != 0) goto out; - if (crypto_blkcipher_ivsize(tfm) > GSS_KRB5_MAX_BLOCKSIZE) { + if (crypto_skcipher_ivsize(tfm) > GSS_KRB5_MAX_BLOCKSIZE) { dprintk("RPC: gss_k5encrypt: tfm iv size too large %d\n", - crypto_blkcipher_ivsize(tfm)); + crypto_skcipher_ivsize(tfm)); goto out; } if (iv) - memcpy(local_iv, iv, crypto_blkcipher_ivsize(tfm)); + memcpy(local_iv, iv, crypto_skcipher_ivsize(tfm)); memcpy(out, in, length); sg_init_one(sg, out, length); - ret = crypto_blkcipher_encrypt_iv(&desc, sg, sg, length); + skcipher_request_set_tfm(req, tfm); + skcipher_request_set_callback(req, 0, NULL, NULL); + skcipher_request_set_crypt(req, sg, sg, length, local_iv); + + ret = crypto_skcipher_encrypt(req); + skcipher_request_zero(req); out: dprintk("RPC: krb5_encrypt returns %d\n", ret); return ret; @@ -85,7 +91,7 @@ out: u32 krb5_decrypt( - struct crypto_blkcipher *tfm, + struct crypto_skcipher *tfm, void * iv, void * in, void * out, @@ -94,23 +100,28 @@ krb5_decrypt( u32 ret = -EINVAL; struct scatterlist sg[1]; u8 local_iv[GSS_KRB5_MAX_BLOCKSIZE] = {0}; - struct blkcipher_desc desc = { .tfm = tfm, .info = local_iv }; + SKCIPHER_REQUEST_ON_STACK(req, tfm); - if (length % crypto_blkcipher_blocksize(tfm) != 0) + if (length % crypto_skcipher_blocksize(tfm) != 0) goto out; - if (crypto_blkcipher_ivsize(tfm) > GSS_KRB5_MAX_BLOCKSIZE) { + if (crypto_skcipher_ivsize(tfm) > GSS_KRB5_MAX_BLOCKSIZE) { dprintk("RPC: gss_k5decrypt: tfm iv size too large %d\n", - crypto_blkcipher_ivsize(tfm)); + crypto_skcipher_ivsize(tfm)); goto out; } if (iv) - memcpy(local_iv,iv, crypto_blkcipher_ivsize(tfm)); + memcpy(local_iv,iv, crypto_skcipher_ivsize(tfm)); memcpy(out, in, length); sg_init_one(sg, out, length); - ret = crypto_blkcipher_decrypt_iv(&desc, sg, sg, length); + skcipher_request_set_tfm(req, tfm); + skcipher_request_set_callback(req, 0, NULL, NULL); + skcipher_request_set_crypt(req, sg, sg, length, local_iv); + + ret = crypto_skcipher_decrypt(req); + skcipher_request_zero(req); out: dprintk("RPC: gss_k5decrypt returns %d\n",ret); return ret; @@ -119,9 +130,11 @@ out: static int checksummer(struct scatterlist *sg, void *data) { - struct hash_desc *desc = data; + struct ahash_request *req = data; + + ahash_request_set_crypt(req, sg, NULL, sg->length); - return crypto_hash_update(desc, sg, sg->length); + return crypto_ahash_update(req); } static int @@ -152,13 +165,13 @@ make_checksum_hmac_md5(struct krb5_ctx *kctx, char *header, int hdrlen, struct xdr_buf *body, int body_offset, u8 *cksumkey, unsigned int usage, struct xdr_netobj *cksumout) { - struct hash_desc desc; struct scatterlist sg[1]; int err; u8 checksumdata[GSS_KRB5_MAX_CKSUM_LEN]; u8 rc4salt[4]; - struct crypto_hash *md5; - struct crypto_hash *hmac_md5; + struct crypto_ahash *md5; + struct crypto_ahash *hmac_md5; + struct ahash_request *req; if (cksumkey == NULL) return GSS_S_FAILURE; @@ -174,61 +187,79 @@ make_checksum_hmac_md5(struct krb5_ctx *kctx, char *header, int hdrlen, return GSS_S_FAILURE; } - md5 = crypto_alloc_hash("md5", 0, CRYPTO_ALG_ASYNC); + md5 = crypto_alloc_ahash("md5", 0, CRYPTO_ALG_ASYNC); if (IS_ERR(md5)) return GSS_S_FAILURE; - hmac_md5 = crypto_alloc_hash(kctx->gk5e->cksum_name, 0, - CRYPTO_ALG_ASYNC); + hmac_md5 = crypto_alloc_ahash(kctx->gk5e->cksum_name, 0, + CRYPTO_ALG_ASYNC); if (IS_ERR(hmac_md5)) { - crypto_free_hash(md5); + crypto_free_ahash(md5); + return GSS_S_FAILURE; + } + + req = ahash_request_alloc(md5, GFP_KERNEL); + if (!req) { + crypto_free_ahash(hmac_md5); + crypto_free_ahash(md5); return GSS_S_FAILURE; } - desc.tfm = md5; - desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP; + ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP, NULL, NULL); - err = crypto_hash_init(&desc); + err = crypto_ahash_init(req); if (err) goto out; sg_init_one(sg, rc4salt, 4); - err = crypto_hash_update(&desc, sg, 4); + ahash_request_set_crypt(req, sg, NULL, 4); + err = crypto_ahash_update(req); if (err) goto out; sg_init_one(sg, header, hdrlen); - err = crypto_hash_update(&desc, sg, hdrlen); + ahash_request_set_crypt(req, sg, NULL, hdrlen); + err = crypto_ahash_update(req); if (err) goto out; err = xdr_process_buf(body, body_offset, body->len - body_offset, - checksummer, &desc); + checksummer, req); if (err) goto out; - err = crypto_hash_final(&desc, checksumdata); + ahash_request_set_crypt(req, NULL, checksumdata, 0); + err = crypto_ahash_final(req); if (err) goto out; - desc.tfm = hmac_md5; - desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP; + ahash_request_free(req); + req = ahash_request_alloc(hmac_md5, GFP_KERNEL); + if (!req) { + crypto_free_ahash(hmac_md5); + crypto_free_ahash(md5); + return GSS_S_FAILURE; + } + + ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP, NULL, NULL); - err = crypto_hash_init(&desc); + err = crypto_ahash_init(req); if (err) goto out; - err = crypto_hash_setkey(hmac_md5, cksumkey, kctx->gk5e->keylength); + err = crypto_ahash_setkey(hmac_md5, cksumkey, kctx->gk5e->keylength); if (err) goto out; - sg_init_one(sg, checksumdata, crypto_hash_digestsize(md5)); - err = crypto_hash_digest(&desc, sg, crypto_hash_digestsize(md5), - checksumdata); + sg_init_one(sg, checksumdata, crypto_ahash_digestsize(md5)); + ahash_request_set_crypt(req, sg, checksumdata, + crypto_ahash_digestsize(md5)); + err = crypto_ahash_digest(req); if (err) goto out; memcpy(cksumout->data, checksumdata, kctx->gk5e->cksumlength); cksumout->len = kctx->gk5e->cksumlength; out: - crypto_free_hash(md5); - crypto_free_hash(hmac_md5); + ahash_request_free(req); + crypto_free_ahash(md5); + crypto_free_ahash(hmac_md5); return err ? GSS_S_FAILURE : 0; } @@ -242,7 +273,8 @@ make_checksum(struct krb5_ctx *kctx, char *header, int hdrlen, struct xdr_buf *body, int body_offset, u8 *cksumkey, unsigned int usage, struct xdr_netobj *cksumout) { - struct hash_desc desc; + struct crypto_ahash *tfm; + struct ahash_request *req; struct scatterlist sg[1]; int err; u8 checksumdata[GSS_KRB5_MAX_CKSUM_LEN]; @@ -259,32 +291,41 @@ make_checksum(struct krb5_ctx *kctx, char *header, int hdrlen, return GSS_S_FAILURE; } - desc.tfm = crypto_alloc_hash(kctx->gk5e->cksum_name, 0, CRYPTO_ALG_ASYNC); - if (IS_ERR(desc.tfm)) + tfm = crypto_alloc_ahash(kctx->gk5e->cksum_name, 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(tfm)) return GSS_S_FAILURE; - desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP; - checksumlen = crypto_hash_digestsize(desc.tfm); + req = ahash_request_alloc(tfm, GFP_KERNEL); + if (!req) { + crypto_free_ahash(tfm); + return GSS_S_FAILURE; + } + + ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP, NULL, NULL); + + checksumlen = crypto_ahash_digestsize(tfm); if (cksumkey != NULL) { - err = crypto_hash_setkey(desc.tfm, cksumkey, - kctx->gk5e->keylength); + err = crypto_ahash_setkey(tfm, cksumkey, + kctx->gk5e->keylength); if (err) goto out; } - err = crypto_hash_init(&desc); + err = crypto_ahash_init(req); if (err) goto out; sg_init_one(sg, header, hdrlen); - err = crypto_hash_update(&desc, sg, hdrlen); + ahash_request_set_crypt(req, sg, NULL, hdrlen); + err = crypto_ahash_update(req); if (err) goto out; err = xdr_process_buf(body, body_offset, body->len - body_offset, - checksummer, &desc); + checksummer, req); if (err) goto out; - err = crypto_hash_final(&desc, checksumdata); + ahash_request_set_crypt(req, NULL, checksumdata, 0); + err = crypto_ahash_final(req); if (err) goto out; @@ -307,7 +348,8 @@ make_checksum(struct krb5_ctx *kctx, char *header, int hdrlen, } cksumout->len = kctx->gk5e->cksumlength; out: - crypto_free_hash(desc.tfm); + ahash_request_free(req); + crypto_free_ahash(tfm); return err ? GSS_S_FAILURE : 0; } @@ -323,7 +365,8 @@ make_checksum_v2(struct krb5_ctx *kctx, char *header, int hdrlen, struct xdr_buf *body, int body_offset, u8 *cksumkey, unsigned int usage, struct xdr_netobj *cksumout) { - struct hash_desc desc; + struct crypto_ahash *tfm; + struct ahash_request *req; struct scatterlist sg[1]; int err; u8 checksumdata[GSS_KRB5_MAX_CKSUM_LEN]; @@ -340,31 +383,39 @@ make_checksum_v2(struct krb5_ctx *kctx, char *header, int hdrlen, return GSS_S_FAILURE; } - desc.tfm = crypto_alloc_hash(kctx->gk5e->cksum_name, 0, - CRYPTO_ALG_ASYNC); - if (IS_ERR(desc.tfm)) + tfm = crypto_alloc_ahash(kctx->gk5e->cksum_name, 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(tfm)) return GSS_S_FAILURE; - checksumlen = crypto_hash_digestsize(desc.tfm); - desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP; + checksumlen = crypto_ahash_digestsize(tfm); + + req = ahash_request_alloc(tfm, GFP_KERNEL); + if (!req) { + crypto_free_ahash(tfm); + return GSS_S_FAILURE; + } - err = crypto_hash_setkey(desc.tfm, cksumkey, kctx->gk5e->keylength); + ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP, NULL, NULL); + + err = crypto_ahash_setkey(tfm, cksumkey, kctx->gk5e->keylength); if (err) goto out; - err = crypto_hash_init(&desc); + err = crypto_ahash_init(req); if (err) goto out; err = xdr_process_buf(body, body_offset, body->len - body_offset, - checksummer, &desc); + checksummer, req); if (err) goto out; if (header != NULL) { sg_init_one(sg, header, hdrlen); - err = crypto_hash_update(&desc, sg, hdrlen); + ahash_request_set_crypt(req, sg, NULL, hdrlen); + err = crypto_ahash_update(req); if (err) goto out; } - err = crypto_hash_final(&desc, checksumdata); + ahash_request_set_crypt(req, NULL, checksumdata, 0); + err = crypto_ahash_final(req); if (err) goto out; @@ -381,13 +432,14 @@ make_checksum_v2(struct krb5_ctx *kctx, char *header, int hdrlen, break; } out: - crypto_free_hash(desc.tfm); + ahash_request_free(req); + crypto_free_ahash(tfm); return err ? GSS_S_FAILURE : 0; } struct encryptor_desc { u8 iv[GSS_KRB5_MAX_BLOCKSIZE]; - struct blkcipher_desc desc; + struct skcipher_request *req; int pos; struct xdr_buf *outbuf; struct page **pages; @@ -402,6 +454,7 @@ encryptor(struct scatterlist *sg, void *data) { struct encryptor_desc *desc = data; struct xdr_buf *outbuf = desc->outbuf; + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(desc->req); struct page *in_page; int thislen = desc->fraglen + sg->length; int fraglen, ret; @@ -414,7 +467,7 @@ encryptor(struct scatterlist *sg, void *data) page_pos = desc->pos - outbuf->head[0].iov_len; if (page_pos >= 0 && page_pos < outbuf->page_len) { /* pages are not in place: */ - int i = (page_pos + outbuf->page_base) >> PAGE_CACHE_SHIFT; + int i = (page_pos + outbuf->page_base) >> PAGE_SHIFT; in_page = desc->pages[i]; } else { in_page = sg_page(sg); @@ -427,7 +480,7 @@ encryptor(struct scatterlist *sg, void *data) desc->fraglen += sg->length; desc->pos += sg->length; - fraglen = thislen & (crypto_blkcipher_blocksize(desc->desc.tfm) - 1); + fraglen = thislen & (crypto_skcipher_blocksize(tfm) - 1); thislen -= fraglen; if (thislen == 0) @@ -436,8 +489,10 @@ encryptor(struct scatterlist *sg, void *data) sg_mark_end(&desc->infrags[desc->fragno - 1]); sg_mark_end(&desc->outfrags[desc->fragno - 1]); - ret = crypto_blkcipher_encrypt_iv(&desc->desc, desc->outfrags, - desc->infrags, thislen); + skcipher_request_set_crypt(desc->req, desc->infrags, desc->outfrags, + thislen, desc->iv); + + ret = crypto_skcipher_encrypt(desc->req); if (ret) return ret; @@ -459,18 +514,20 @@ encryptor(struct scatterlist *sg, void *data) } int -gss_encrypt_xdr_buf(struct crypto_blkcipher *tfm, struct xdr_buf *buf, +gss_encrypt_xdr_buf(struct crypto_skcipher *tfm, struct xdr_buf *buf, int offset, struct page **pages) { int ret; struct encryptor_desc desc; + SKCIPHER_REQUEST_ON_STACK(req, tfm); + + BUG_ON((buf->len - offset) % crypto_skcipher_blocksize(tfm) != 0); - BUG_ON((buf->len - offset) % crypto_blkcipher_blocksize(tfm) != 0); + skcipher_request_set_tfm(req, tfm); + skcipher_request_set_callback(req, 0, NULL, NULL); memset(desc.iv, 0, sizeof(desc.iv)); - desc.desc.tfm = tfm; - desc.desc.info = desc.iv; - desc.desc.flags = 0; + desc.req = req; desc.pos = offset; desc.outbuf = buf; desc.pages = pages; @@ -481,12 +538,13 @@ gss_encrypt_xdr_buf(struct crypto_blkcipher *tfm, struct xdr_buf *buf, sg_init_table(desc.outfrags, 4); ret = xdr_process_buf(buf, offset, buf->len - offset, encryptor, &desc); + skcipher_request_zero(req); return ret; } struct decryptor_desc { u8 iv[GSS_KRB5_MAX_BLOCKSIZE]; - struct blkcipher_desc desc; + struct skcipher_request *req; struct scatterlist frags[4]; int fragno; int fraglen; @@ -497,6 +555,7 @@ decryptor(struct scatterlist *sg, void *data) { struct decryptor_desc *desc = data; int thislen = desc->fraglen + sg->length; + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(desc->req); int fraglen, ret; /* Worst case is 4 fragments: head, end of page 1, start @@ -507,7 +566,7 @@ decryptor(struct scatterlist *sg, void *data) desc->fragno++; desc->fraglen += sg->length; - fraglen = thislen & (crypto_blkcipher_blocksize(desc->desc.tfm) - 1); + fraglen = thislen & (crypto_skcipher_blocksize(tfm) - 1); thislen -= fraglen; if (thislen == 0) @@ -515,8 +574,10 @@ decryptor(struct scatterlist *sg, void *data) sg_mark_end(&desc->frags[desc->fragno - 1]); - ret = crypto_blkcipher_decrypt_iv(&desc->desc, desc->frags, - desc->frags, thislen); + skcipher_request_set_crypt(desc->req, desc->frags, desc->frags, + thislen, desc->iv); + + ret = crypto_skcipher_decrypt(desc->req); if (ret) return ret; @@ -535,24 +596,29 @@ decryptor(struct scatterlist *sg, void *data) } int -gss_decrypt_xdr_buf(struct crypto_blkcipher *tfm, struct xdr_buf *buf, +gss_decrypt_xdr_buf(struct crypto_skcipher *tfm, struct xdr_buf *buf, int offset) { + int ret; struct decryptor_desc desc; + SKCIPHER_REQUEST_ON_STACK(req, tfm); /* XXXJBF: */ - BUG_ON((buf->len - offset) % crypto_blkcipher_blocksize(tfm) != 0); + BUG_ON((buf->len - offset) % crypto_skcipher_blocksize(tfm) != 0); + + skcipher_request_set_tfm(req, tfm); + skcipher_request_set_callback(req, 0, NULL, NULL); memset(desc.iv, 0, sizeof(desc.iv)); - desc.desc.tfm = tfm; - desc.desc.info = desc.iv; - desc.desc.flags = 0; + desc.req = req; desc.fragno = 0; desc.fraglen = 0; sg_init_table(desc.frags, 4); - return xdr_process_buf(buf, offset, buf->len - offset, decryptor, &desc); + ret = xdr_process_buf(buf, offset, buf->len - offset, decryptor, &desc); + skcipher_request_zero(req); + return ret; } /* @@ -594,12 +660,12 @@ xdr_extend_head(struct xdr_buf *buf, unsigned int base, unsigned int shiftlen) } static u32 -gss_krb5_cts_crypt(struct crypto_blkcipher *cipher, struct xdr_buf *buf, +gss_krb5_cts_crypt(struct crypto_skcipher *cipher, struct xdr_buf *buf, u32 offset, u8 *iv, struct page **pages, int encrypt) { u32 ret; struct scatterlist sg[1]; - struct blkcipher_desc desc = { .tfm = cipher, .info = iv }; + SKCIPHER_REQUEST_ON_STACK(req, cipher); u8 data[GSS_KRB5_MAX_BLOCKSIZE * 2]; struct page **save_pages; u32 len = buf->len - offset; @@ -625,10 +691,16 @@ gss_krb5_cts_crypt(struct crypto_blkcipher *cipher, struct xdr_buf *buf, sg_init_one(sg, data, len); + skcipher_request_set_tfm(req, cipher); + skcipher_request_set_callback(req, 0, NULL, NULL); + skcipher_request_set_crypt(req, sg, sg, len, iv); + if (encrypt) - ret = crypto_blkcipher_encrypt_iv(&desc, sg, sg, len); + ret = crypto_skcipher_encrypt(req); else - ret = crypto_blkcipher_decrypt_iv(&desc, sg, sg, len); + ret = crypto_skcipher_decrypt(req); + + skcipher_request_zero(req); if (ret) goto out; @@ -647,7 +719,7 @@ gss_krb5_aes_encrypt(struct krb5_ctx *kctx, u32 offset, struct xdr_netobj hmac; u8 *cksumkey; u8 *ecptr; - struct crypto_blkcipher *cipher, *aux_cipher; + struct crypto_skcipher *cipher, *aux_cipher; int blocksize; struct page **save_pages; int nblocks, nbytes; @@ -666,7 +738,7 @@ gss_krb5_aes_encrypt(struct krb5_ctx *kctx, u32 offset, cksumkey = kctx->acceptor_integ; usage = KG_USAGE_ACCEPTOR_SEAL; } - blocksize = crypto_blkcipher_blocksize(cipher); + blocksize = crypto_skcipher_blocksize(cipher); /* hide the gss token header and insert the confounder */ offset += GSS_KRB5_TOK_HDR_LEN; @@ -719,20 +791,24 @@ gss_krb5_aes_encrypt(struct krb5_ctx *kctx, u32 offset, memset(desc.iv, 0, sizeof(desc.iv)); if (cbcbytes) { + SKCIPHER_REQUEST_ON_STACK(req, aux_cipher); + desc.pos = offset + GSS_KRB5_TOK_HDR_LEN; desc.fragno = 0; desc.fraglen = 0; desc.pages = pages; desc.outbuf = buf; - desc.desc.info = desc.iv; - desc.desc.flags = 0; - desc.desc.tfm = aux_cipher; + desc.req = req; + + skcipher_request_set_tfm(req, aux_cipher); + skcipher_request_set_callback(req, 0, NULL, NULL); sg_init_table(desc.infrags, 4); sg_init_table(desc.outfrags, 4); err = xdr_process_buf(buf, offset + GSS_KRB5_TOK_HDR_LEN, cbcbytes, encryptor, &desc); + skcipher_request_zero(req); if (err) goto out_err; } @@ -763,7 +839,7 @@ gss_krb5_aes_decrypt(struct krb5_ctx *kctx, u32 offset, struct xdr_buf *buf, struct xdr_buf subbuf; u32 ret = 0; u8 *cksum_key; - struct crypto_blkcipher *cipher, *aux_cipher; + struct crypto_skcipher *cipher, *aux_cipher; struct xdr_netobj our_hmac_obj; u8 our_hmac[GSS_KRB5_MAX_CKSUM_LEN]; u8 pkt_hmac[GSS_KRB5_MAX_CKSUM_LEN]; @@ -782,7 +858,7 @@ gss_krb5_aes_decrypt(struct krb5_ctx *kctx, u32 offset, struct xdr_buf *buf, cksum_key = kctx->initiator_integ; usage = KG_USAGE_INITIATOR_SEAL; } - blocksize = crypto_blkcipher_blocksize(cipher); + blocksize = crypto_skcipher_blocksize(cipher); /* create a segment skipping the header and leaving out the checksum */ @@ -799,15 +875,19 @@ gss_krb5_aes_decrypt(struct krb5_ctx *kctx, u32 offset, struct xdr_buf *buf, memset(desc.iv, 0, sizeof(desc.iv)); if (cbcbytes) { + SKCIPHER_REQUEST_ON_STACK(req, aux_cipher); + desc.fragno = 0; desc.fraglen = 0; - desc.desc.info = desc.iv; - desc.desc.flags = 0; - desc.desc.tfm = aux_cipher; + desc.req = req; + + skcipher_request_set_tfm(req, aux_cipher); + skcipher_request_set_callback(req, 0, NULL, NULL); sg_init_table(desc.frags, 4); ret = xdr_process_buf(&subbuf, 0, cbcbytes, decryptor, &desc); + skcipher_request_zero(req); if (ret) goto out_err; } @@ -850,61 +930,63 @@ out_err: * Set the key of the given cipher. */ int -krb5_rc4_setup_seq_key(struct krb5_ctx *kctx, struct crypto_blkcipher *cipher, +krb5_rc4_setup_seq_key(struct krb5_ctx *kctx, struct crypto_skcipher *cipher, unsigned char *cksum) { - struct crypto_hash *hmac; - struct hash_desc desc; - struct scatterlist sg[1]; + struct crypto_shash *hmac; + struct shash_desc *desc; u8 Kseq[GSS_KRB5_MAX_KEYLEN]; u32 zeroconstant = 0; int err; dprintk("%s: entered\n", __func__); - hmac = crypto_alloc_hash(kctx->gk5e->cksum_name, 0, CRYPTO_ALG_ASYNC); + hmac = crypto_alloc_shash(kctx->gk5e->cksum_name, 0, 0); if (IS_ERR(hmac)) { dprintk("%s: error %ld, allocating hash '%s'\n", __func__, PTR_ERR(hmac), kctx->gk5e->cksum_name); return PTR_ERR(hmac); } - desc.tfm = hmac; - desc.flags = 0; + desc = kmalloc(sizeof(*desc) + crypto_shash_descsize(hmac), + GFP_KERNEL); + if (!desc) { + dprintk("%s: failed to allocate shash descriptor for '%s'\n", + __func__, kctx->gk5e->cksum_name); + crypto_free_shash(hmac); + return -ENOMEM; + } - err = crypto_hash_init(&desc); - if (err) - goto out_err; + desc->tfm = hmac; + desc->flags = 0; /* Compute intermediate Kseq from session key */ - err = crypto_hash_setkey(hmac, kctx->Ksess, kctx->gk5e->keylength); + err = crypto_shash_setkey(hmac, kctx->Ksess, kctx->gk5e->keylength); if (err) goto out_err; - sg_init_one(sg, &zeroconstant, 4); - err = crypto_hash_digest(&desc, sg, 4, Kseq); + err = crypto_shash_digest(desc, (u8 *)&zeroconstant, 4, Kseq); if (err) goto out_err; /* Compute final Kseq from the checksum and intermediate Kseq */ - err = crypto_hash_setkey(hmac, Kseq, kctx->gk5e->keylength); + err = crypto_shash_setkey(hmac, Kseq, kctx->gk5e->keylength); if (err) goto out_err; - sg_set_buf(sg, cksum, 8); - - err = crypto_hash_digest(&desc, sg, 8, Kseq); + err = crypto_shash_digest(desc, cksum, 8, Kseq); if (err) goto out_err; - err = crypto_blkcipher_setkey(cipher, Kseq, kctx->gk5e->keylength); + err = crypto_skcipher_setkey(cipher, Kseq, kctx->gk5e->keylength); if (err) goto out_err; err = 0; out_err: - crypto_free_hash(hmac); + kzfree(desc); + crypto_free_shash(hmac); dprintk("%s: returning %d\n", __func__, err); return err; } @@ -914,12 +996,11 @@ out_err: * Set the key of cipher kctx->enc. */ int -krb5_rc4_setup_enc_key(struct krb5_ctx *kctx, struct crypto_blkcipher *cipher, +krb5_rc4_setup_enc_key(struct krb5_ctx *kctx, struct crypto_skcipher *cipher, s32 seqnum) { - struct crypto_hash *hmac; - struct hash_desc desc; - struct scatterlist sg[1]; + struct crypto_shash *hmac; + struct shash_desc *desc; u8 Kcrypt[GSS_KRB5_MAX_KEYLEN]; u8 zeroconstant[4] = {0}; u8 seqnumarray[4]; @@ -927,35 +1008,39 @@ krb5_rc4_setup_enc_key(struct krb5_ctx *kctx, struct crypto_blkcipher *cipher, dprintk("%s: entered, seqnum %u\n", __func__, seqnum); - hmac = crypto_alloc_hash(kctx->gk5e->cksum_name, 0, CRYPTO_ALG_ASYNC); + hmac = crypto_alloc_shash(kctx->gk5e->cksum_name, 0, 0); if (IS_ERR(hmac)) { dprintk("%s: error %ld, allocating hash '%s'\n", __func__, PTR_ERR(hmac), kctx->gk5e->cksum_name); return PTR_ERR(hmac); } - desc.tfm = hmac; - desc.flags = 0; + desc = kmalloc(sizeof(*desc) + crypto_shash_descsize(hmac), + GFP_KERNEL); + if (!desc) { + dprintk("%s: failed to allocate shash descriptor for '%s'\n", + __func__, kctx->gk5e->cksum_name); + crypto_free_shash(hmac); + return -ENOMEM; + } - err = crypto_hash_init(&desc); - if (err) - goto out_err; + desc->tfm = hmac; + desc->flags = 0; /* Compute intermediate Kcrypt from session key */ for (i = 0; i < kctx->gk5e->keylength; i++) Kcrypt[i] = kctx->Ksess[i] ^ 0xf0; - err = crypto_hash_setkey(hmac, Kcrypt, kctx->gk5e->keylength); + err = crypto_shash_setkey(hmac, Kcrypt, kctx->gk5e->keylength); if (err) goto out_err; - sg_init_one(sg, zeroconstant, 4); - err = crypto_hash_digest(&desc, sg, 4, Kcrypt); + err = crypto_shash_digest(desc, zeroconstant, 4, Kcrypt); if (err) goto out_err; /* Compute final Kcrypt from the seqnum and intermediate Kcrypt */ - err = crypto_hash_setkey(hmac, Kcrypt, kctx->gk5e->keylength); + err = crypto_shash_setkey(hmac, Kcrypt, kctx->gk5e->keylength); if (err) goto out_err; @@ -964,20 +1049,19 @@ krb5_rc4_setup_enc_key(struct krb5_ctx *kctx, struct crypto_blkcipher *cipher, seqnumarray[2] = (unsigned char) ((seqnum >> 8) & 0xff); seqnumarray[3] = (unsigned char) ((seqnum >> 0) & 0xff); - sg_set_buf(sg, seqnumarray, 4); - - err = crypto_hash_digest(&desc, sg, 4, Kcrypt); + err = crypto_shash_digest(desc, seqnumarray, 4, Kcrypt); if (err) goto out_err; - err = crypto_blkcipher_setkey(cipher, Kcrypt, kctx->gk5e->keylength); + err = crypto_skcipher_setkey(cipher, Kcrypt, kctx->gk5e->keylength); if (err) goto out_err; err = 0; out_err: - crypto_free_hash(hmac); + kzfree(desc); + crypto_free_shash(hmac); dprintk("%s: returning %d\n", __func__, err); return err; } diff --git a/net/sunrpc/auth_gss/gss_krb5_keys.c b/net/sunrpc/auth_gss/gss_krb5_keys.c index 234fa8d0f..870133146 100644 --- a/net/sunrpc/auth_gss/gss_krb5_keys.c +++ b/net/sunrpc/auth_gss/gss_krb5_keys.c @@ -54,9 +54,9 @@ * WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. */ +#include <crypto/skcipher.h> #include <linux/err.h> #include <linux/types.h> -#include <linux/crypto.h> #include <linux/sunrpc/gss_krb5.h> #include <linux/sunrpc/xdr.h> #include <linux/lcm.h> @@ -147,7 +147,7 @@ u32 krb5_derive_key(const struct gss_krb5_enctype *gk5e, size_t blocksize, keybytes, keylength, n; unsigned char *inblockdata, *outblockdata, *rawkey; struct xdr_netobj inblock, outblock; - struct crypto_blkcipher *cipher; + struct crypto_skcipher *cipher; u32 ret = EINVAL; blocksize = gk5e->blocksize; @@ -157,11 +157,11 @@ u32 krb5_derive_key(const struct gss_krb5_enctype *gk5e, if ((inkey->len != keylength) || (outkey->len != keylength)) goto err_return; - cipher = crypto_alloc_blkcipher(gk5e->encrypt_name, 0, - CRYPTO_ALG_ASYNC); + cipher = crypto_alloc_skcipher(gk5e->encrypt_name, 0, + CRYPTO_ALG_ASYNC); if (IS_ERR(cipher)) goto err_return; - if (crypto_blkcipher_setkey(cipher, inkey->data, inkey->len)) + if (crypto_skcipher_setkey(cipher, inkey->data, inkey->len)) goto err_return; /* allocate and set up buffers */ @@ -238,7 +238,7 @@ err_free_in: memset(inblockdata, 0, blocksize); kfree(inblockdata); err_free_cipher: - crypto_free_blkcipher(cipher); + crypto_free_skcipher(cipher); err_return: return ret; } diff --git a/net/sunrpc/auth_gss/gss_krb5_mech.c b/net/sunrpc/auth_gss/gss_krb5_mech.c index 28db442a0..65427492b 100644 --- a/net/sunrpc/auth_gss/gss_krb5_mech.c +++ b/net/sunrpc/auth_gss/gss_krb5_mech.c @@ -34,6 +34,8 @@ * */ +#include <crypto/hash.h> +#include <crypto/skcipher.h> #include <linux/err.h> #include <linux/module.h> #include <linux/init.h> @@ -42,7 +44,6 @@ #include <linux/sunrpc/auth.h> #include <linux/sunrpc/gss_krb5.h> #include <linux/sunrpc/xdr.h> -#include <linux/crypto.h> #include <linux/sunrpc/gss_krb5_enctypes.h> #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) @@ -217,7 +218,7 @@ simple_get_netobj(const void *p, const void *end, struct xdr_netobj *res) static inline const void * get_key(const void *p, const void *end, - struct krb5_ctx *ctx, struct crypto_blkcipher **res) + struct krb5_ctx *ctx, struct crypto_skcipher **res) { struct xdr_netobj key; int alg; @@ -245,7 +246,7 @@ get_key(const void *p, const void *end, if (IS_ERR(p)) goto out_err; - *res = crypto_alloc_blkcipher(ctx->gk5e->encrypt_name, 0, + *res = crypto_alloc_skcipher(ctx->gk5e->encrypt_name, 0, CRYPTO_ALG_ASYNC); if (IS_ERR(*res)) { printk(KERN_WARNING "gss_kerberos_mech: unable to initialize " @@ -253,7 +254,7 @@ get_key(const void *p, const void *end, *res = NULL; goto out_err_free_key; } - if (crypto_blkcipher_setkey(*res, key.data, key.len)) { + if (crypto_skcipher_setkey(*res, key.data, key.len)) { printk(KERN_WARNING "gss_kerberos_mech: error setting key for " "crypto algorithm %s\n", ctx->gk5e->encrypt_name); goto out_err_free_tfm; @@ -263,7 +264,7 @@ get_key(const void *p, const void *end, return p; out_err_free_tfm: - crypto_free_blkcipher(*res); + crypto_free_skcipher(*res); out_err_free_key: kfree(key.data); p = ERR_PTR(-EINVAL); @@ -335,30 +336,30 @@ gss_import_v1_context(const void *p, const void *end, struct krb5_ctx *ctx) return 0; out_err_free_key2: - crypto_free_blkcipher(ctx->seq); + crypto_free_skcipher(ctx->seq); out_err_free_key1: - crypto_free_blkcipher(ctx->enc); + crypto_free_skcipher(ctx->enc); out_err_free_mech: kfree(ctx->mech_used.data); out_err: return PTR_ERR(p); } -static struct crypto_blkcipher * +static struct crypto_skcipher * context_v2_alloc_cipher(struct krb5_ctx *ctx, const char *cname, u8 *key) { - struct crypto_blkcipher *cp; + struct crypto_skcipher *cp; - cp = crypto_alloc_blkcipher(cname, 0, CRYPTO_ALG_ASYNC); + cp = crypto_alloc_skcipher(cname, 0, CRYPTO_ALG_ASYNC); if (IS_ERR(cp)) { dprintk("gss_kerberos_mech: unable to initialize " "crypto algorithm %s\n", cname); return NULL; } - if (crypto_blkcipher_setkey(cp, key, ctx->gk5e->keylength)) { + if (crypto_skcipher_setkey(cp, key, ctx->gk5e->keylength)) { dprintk("gss_kerberos_mech: error setting key for " "crypto algorithm %s\n", cname); - crypto_free_blkcipher(cp); + crypto_free_skcipher(cp); return NULL; } return cp; @@ -412,9 +413,9 @@ context_derive_keys_des3(struct krb5_ctx *ctx, gfp_t gfp_mask) return 0; out_free_enc: - crypto_free_blkcipher(ctx->enc); + crypto_free_skcipher(ctx->enc); out_free_seq: - crypto_free_blkcipher(ctx->seq); + crypto_free_skcipher(ctx->seq); out_err: return -EINVAL; } @@ -427,18 +428,17 @@ out_err: static int context_derive_keys_rc4(struct krb5_ctx *ctx) { - struct crypto_hash *hmac; + struct crypto_shash *hmac; char sigkeyconstant[] = "signaturekey"; int slen = strlen(sigkeyconstant) + 1; /* include null terminator */ - struct hash_desc desc; - struct scatterlist sg[1]; + struct shash_desc *desc; int err; dprintk("RPC: %s: entered\n", __func__); /* * derive cksum (aka Ksign) key */ - hmac = crypto_alloc_hash(ctx->gk5e->cksum_name, 0, CRYPTO_ALG_ASYNC); + hmac = crypto_alloc_shash(ctx->gk5e->cksum_name, 0, 0); if (IS_ERR(hmac)) { dprintk("%s: error %ld allocating hash '%s'\n", __func__, PTR_ERR(hmac), ctx->gk5e->cksum_name); @@ -446,37 +446,41 @@ context_derive_keys_rc4(struct krb5_ctx *ctx) goto out_err; } - err = crypto_hash_setkey(hmac, ctx->Ksess, ctx->gk5e->keylength); + err = crypto_shash_setkey(hmac, ctx->Ksess, ctx->gk5e->keylength); if (err) goto out_err_free_hmac; - sg_init_table(sg, 1); - sg_set_buf(sg, sigkeyconstant, slen); - desc.tfm = hmac; - desc.flags = 0; - - err = crypto_hash_init(&desc); - if (err) + desc = kmalloc(sizeof(*desc) + crypto_shash_descsize(hmac), + GFP_KERNEL); + if (!desc) { + dprintk("%s: failed to allocate hash descriptor for '%s'\n", + __func__, ctx->gk5e->cksum_name); + err = -ENOMEM; goto out_err_free_hmac; + } + + desc->tfm = hmac; + desc->flags = 0; - err = crypto_hash_digest(&desc, sg, slen, ctx->cksum); + err = crypto_shash_digest(desc, sigkeyconstant, slen, ctx->cksum); + kzfree(desc); if (err) goto out_err_free_hmac; /* - * allocate hash, and blkciphers for data and seqnum encryption + * allocate hash, and skciphers for data and seqnum encryption */ - ctx->enc = crypto_alloc_blkcipher(ctx->gk5e->encrypt_name, 0, - CRYPTO_ALG_ASYNC); + ctx->enc = crypto_alloc_skcipher(ctx->gk5e->encrypt_name, 0, + CRYPTO_ALG_ASYNC); if (IS_ERR(ctx->enc)) { err = PTR_ERR(ctx->enc); goto out_err_free_hmac; } - ctx->seq = crypto_alloc_blkcipher(ctx->gk5e->encrypt_name, 0, - CRYPTO_ALG_ASYNC); + ctx->seq = crypto_alloc_skcipher(ctx->gk5e->encrypt_name, 0, + CRYPTO_ALG_ASYNC); if (IS_ERR(ctx->seq)) { - crypto_free_blkcipher(ctx->enc); + crypto_free_skcipher(ctx->enc); err = PTR_ERR(ctx->seq); goto out_err_free_hmac; } @@ -486,7 +490,7 @@ context_derive_keys_rc4(struct krb5_ctx *ctx) err = 0; out_err_free_hmac: - crypto_free_hash(hmac); + crypto_free_shash(hmac); out_err: dprintk("RPC: %s: returning %d\n", __func__, err); return err; @@ -588,7 +592,7 @@ context_derive_keys_new(struct krb5_ctx *ctx, gfp_t gfp_mask) context_v2_alloc_cipher(ctx, "cbc(aes)", ctx->acceptor_seal); if (ctx->acceptor_enc_aux == NULL) { - crypto_free_blkcipher(ctx->initiator_enc_aux); + crypto_free_skcipher(ctx->initiator_enc_aux); goto out_free_acceptor_enc; } } @@ -596,9 +600,9 @@ context_derive_keys_new(struct krb5_ctx *ctx, gfp_t gfp_mask) return 0; out_free_acceptor_enc: - crypto_free_blkcipher(ctx->acceptor_enc); + crypto_free_skcipher(ctx->acceptor_enc); out_free_initiator_enc: - crypto_free_blkcipher(ctx->initiator_enc); + crypto_free_skcipher(ctx->initiator_enc); out_err: return -EINVAL; } @@ -710,12 +714,12 @@ static void gss_delete_sec_context_kerberos(void *internal_ctx) { struct krb5_ctx *kctx = internal_ctx; - crypto_free_blkcipher(kctx->seq); - crypto_free_blkcipher(kctx->enc); - crypto_free_blkcipher(kctx->acceptor_enc); - crypto_free_blkcipher(kctx->initiator_enc); - crypto_free_blkcipher(kctx->acceptor_enc_aux); - crypto_free_blkcipher(kctx->initiator_enc_aux); + crypto_free_skcipher(kctx->seq); + crypto_free_skcipher(kctx->enc); + crypto_free_skcipher(kctx->acceptor_enc); + crypto_free_skcipher(kctx->initiator_enc); + crypto_free_skcipher(kctx->acceptor_enc_aux); + crypto_free_skcipher(kctx->initiator_enc_aux); kfree(kctx->mech_used.data); kfree(kctx); } diff --git a/net/sunrpc/auth_gss/gss_krb5_seqnum.c b/net/sunrpc/auth_gss/gss_krb5_seqnum.c index 20d55c793..c8b9082f4 100644 --- a/net/sunrpc/auth_gss/gss_krb5_seqnum.c +++ b/net/sunrpc/auth_gss/gss_krb5_seqnum.c @@ -31,9 +31,9 @@ * PERFORMANCE OF THIS SOFTWARE. */ +#include <crypto/skcipher.h> #include <linux/types.h> #include <linux/sunrpc/gss_krb5.h> -#include <linux/crypto.h> #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) # define RPCDBG_FACILITY RPCDBG_AUTH @@ -43,13 +43,13 @@ static s32 krb5_make_rc4_seq_num(struct krb5_ctx *kctx, int direction, s32 seqnum, unsigned char *cksum, unsigned char *buf) { - struct crypto_blkcipher *cipher; + struct crypto_skcipher *cipher; unsigned char plain[8]; s32 code; dprintk("RPC: %s:\n", __func__); - cipher = crypto_alloc_blkcipher(kctx->gk5e->encrypt_name, 0, - CRYPTO_ALG_ASYNC); + cipher = crypto_alloc_skcipher(kctx->gk5e->encrypt_name, 0, + CRYPTO_ALG_ASYNC); if (IS_ERR(cipher)) return PTR_ERR(cipher); @@ -68,12 +68,12 @@ krb5_make_rc4_seq_num(struct krb5_ctx *kctx, int direction, s32 seqnum, code = krb5_encrypt(cipher, cksum, plain, buf, 8); out: - crypto_free_blkcipher(cipher); + crypto_free_skcipher(cipher); return code; } s32 krb5_make_seq_num(struct krb5_ctx *kctx, - struct crypto_blkcipher *key, + struct crypto_skcipher *key, int direction, u32 seqnum, unsigned char *cksum, unsigned char *buf) @@ -101,13 +101,13 @@ static s32 krb5_get_rc4_seq_num(struct krb5_ctx *kctx, unsigned char *cksum, unsigned char *buf, int *direction, s32 *seqnum) { - struct crypto_blkcipher *cipher; + struct crypto_skcipher *cipher; unsigned char plain[8]; s32 code; dprintk("RPC: %s:\n", __func__); - cipher = crypto_alloc_blkcipher(kctx->gk5e->encrypt_name, 0, - CRYPTO_ALG_ASYNC); + cipher = crypto_alloc_skcipher(kctx->gk5e->encrypt_name, 0, + CRYPTO_ALG_ASYNC); if (IS_ERR(cipher)) return PTR_ERR(cipher); @@ -130,7 +130,7 @@ krb5_get_rc4_seq_num(struct krb5_ctx *kctx, unsigned char *cksum, *seqnum = ((plain[0] << 24) | (plain[1] << 16) | (plain[2] << 8) | (plain[3])); out: - crypto_free_blkcipher(cipher); + crypto_free_skcipher(cipher); return code; } @@ -142,7 +142,7 @@ krb5_get_seq_num(struct krb5_ctx *kctx, { s32 code; unsigned char plain[8]; - struct crypto_blkcipher *key = kctx->seq; + struct crypto_skcipher *key = kctx->seq; dprintk("RPC: krb5_get_seq_num:\n"); diff --git a/net/sunrpc/auth_gss/gss_krb5_wrap.c b/net/sunrpc/auth_gss/gss_krb5_wrap.c index ca7e92a32..a737c2da0 100644 --- a/net/sunrpc/auth_gss/gss_krb5_wrap.c +++ b/net/sunrpc/auth_gss/gss_krb5_wrap.c @@ -28,12 +28,12 @@ * SUCH DAMAGES. */ +#include <crypto/skcipher.h> #include <linux/types.h> #include <linux/jiffies.h> #include <linux/sunrpc/gss_krb5.h> #include <linux/random.h> #include <linux/pagemap.h> -#include <linux/crypto.h> #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) # define RPCDBG_FACILITY RPCDBG_AUTH @@ -79,9 +79,9 @@ gss_krb5_remove_padding(struct xdr_buf *buf, int blocksize) len -= buf->head[0].iov_len; if (len <= buf->page_len) { unsigned int last = (buf->page_base + len - 1) - >>PAGE_CACHE_SHIFT; + >>PAGE_SHIFT; unsigned int offset = (buf->page_base + len - 1) - & (PAGE_CACHE_SIZE - 1); + & (PAGE_SIZE - 1); ptr = kmap_atomic(buf->pages[last]); pad = *(ptr + offset); kunmap_atomic(ptr); @@ -174,7 +174,7 @@ gss_wrap_kerberos_v1(struct krb5_ctx *kctx, int offset, now = get_seconds(); - blocksize = crypto_blkcipher_blocksize(kctx->enc); + blocksize = crypto_skcipher_blocksize(kctx->enc); gss_krb5_add_padding(buf, offset, blocksize); BUG_ON((buf->len - offset) % blocksize); plainlen = conflen + buf->len - offset; @@ -239,10 +239,10 @@ gss_wrap_kerberos_v1(struct krb5_ctx *kctx, int offset, return GSS_S_FAILURE; if (kctx->enctype == ENCTYPE_ARCFOUR_HMAC) { - struct crypto_blkcipher *cipher; + struct crypto_skcipher *cipher; int err; - cipher = crypto_alloc_blkcipher(kctx->gk5e->encrypt_name, 0, - CRYPTO_ALG_ASYNC); + cipher = crypto_alloc_skcipher(kctx->gk5e->encrypt_name, 0, + CRYPTO_ALG_ASYNC); if (IS_ERR(cipher)) return GSS_S_FAILURE; @@ -250,7 +250,7 @@ gss_wrap_kerberos_v1(struct krb5_ctx *kctx, int offset, err = gss_encrypt_xdr_buf(cipher, buf, offset + headlen - conflen, pages); - crypto_free_blkcipher(cipher); + crypto_free_skcipher(cipher); if (err) return GSS_S_FAILURE; } else { @@ -327,18 +327,18 @@ gss_unwrap_kerberos_v1(struct krb5_ctx *kctx, int offset, struct xdr_buf *buf) return GSS_S_BAD_SIG; if (kctx->enctype == ENCTYPE_ARCFOUR_HMAC) { - struct crypto_blkcipher *cipher; + struct crypto_skcipher *cipher; int err; - cipher = crypto_alloc_blkcipher(kctx->gk5e->encrypt_name, 0, - CRYPTO_ALG_ASYNC); + cipher = crypto_alloc_skcipher(kctx->gk5e->encrypt_name, 0, + CRYPTO_ALG_ASYNC); if (IS_ERR(cipher)) return GSS_S_FAILURE; krb5_rc4_setup_enc_key(kctx, cipher, seqnum); err = gss_decrypt_xdr_buf(cipher, buf, crypt_offset); - crypto_free_blkcipher(cipher); + crypto_free_skcipher(cipher); if (err) return GSS_S_DEFECTIVE_TOKEN; } else { @@ -371,7 +371,7 @@ gss_unwrap_kerberos_v1(struct krb5_ctx *kctx, int offset, struct xdr_buf *buf) /* Copy the data back to the right position. XXX: Would probably be * better to copy and encrypt at the same time. */ - blocksize = crypto_blkcipher_blocksize(kctx->enc); + blocksize = crypto_skcipher_blocksize(kctx->enc); data_start = ptr + (GSS_KRB5_TOK_HDR_LEN + kctx->gk5e->cksumlength) + conflen; orig_start = buf->head[0].iov_base + offset; @@ -473,7 +473,7 @@ gss_wrap_kerberos_v2(struct krb5_ctx *kctx, u32 offset, *ptr++ = 0xff; be16ptr = (__be16 *)ptr; - blocksize = crypto_blkcipher_blocksize(kctx->acceptor_enc); + blocksize = crypto_skcipher_blocksize(kctx->acceptor_enc); *be16ptr++ = 0; /* "inner" token header always uses 0 for RRC */ *be16ptr++ = 0; diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index 1095be9c8..4605dc73d 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -857,8 +857,8 @@ unwrap_integ_data(struct svc_rqst *rqstp, struct xdr_buf *buf, u32 seq, struct g goto out; if (svc_getnl(&buf->head[0]) != seq) goto out; - /* trim off the mic at the end before returning */ - xdr_buf_trim(buf, mic.len + 4); + /* trim off the mic and padding at the end before returning */ + xdr_buf_trim(buf, round_up_to_quad(mic.len) + 4); stat = 0; out: kfree(mic.data); diff --git a/net/sunrpc/auth_null.c b/net/sunrpc/auth_null.c index c2a2b584a..8d9eb4d5d 100644 --- a/net/sunrpc/auth_null.c +++ b/net/sunrpc/auth_null.c @@ -113,8 +113,8 @@ const struct rpc_authops authnull_ops = { static struct rpc_auth null_auth = { - .au_cslack = 4, - .au_rslack = 2, + .au_cslack = NUL_CALLSLACK, + .au_rslack = NUL_REPLYSLACK, .au_ops = &authnull_ops, .au_flavor = RPC_AUTH_NULL, .au_count = ATOMIC_INIT(0), diff --git a/net/sunrpc/auth_unix.c b/net/sunrpc/auth_unix.c index 548240dd1..0d3dd364c 100644 --- a/net/sunrpc/auth_unix.c +++ b/net/sunrpc/auth_unix.c @@ -23,8 +23,6 @@ struct unx_cred { }; #define uc_uid uc_base.cr_uid -#define UNX_WRITESLACK (21 + XDR_QUADLEN(UNX_MAXNODENAME)) - #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) # define RPCDBG_FACILITY RPCDBG_AUTH #endif @@ -228,8 +226,8 @@ const struct rpc_authops authunix_ops = { static struct rpc_auth unix_auth = { - .au_cslack = UNX_WRITESLACK, - .au_rslack = 2, /* assume AUTH_NULL verf */ + .au_cslack = UNX_CALLSLACK, + .au_rslack = NUL_REPLYSLACK, .au_ops = &authunix_ops, .au_flavor = RPC_AUTH_UNIX, .au_count = ATOMIC_INIT(0), diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index 008c25d1b..553bf95f7 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -881,7 +881,7 @@ static ssize_t cache_downcall(struct address_space *mapping, char *kaddr; ssize_t ret = -ENOMEM; - if (count >= PAGE_CACHE_SIZE) + if (count >= PAGE_SIZE) goto out_slow; page = find_or_create_page(mapping, 0, GFP_KERNEL); @@ -892,7 +892,7 @@ static ssize_t cache_downcall(struct address_space *mapping, ret = cache_do_downcall(kaddr, buf, count, cd); kunmap(page); unlock_page(page); - page_cache_release(page); + put_page(page); return ret; out_slow: return cache_slow_downcall(buf, count, cd); diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index b7f21044f..7e0c9bf22 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -354,6 +354,7 @@ static void rpc_free_clid(struct rpc_clnt *clnt) } static struct rpc_clnt * rpc_new_client(const struct rpc_create_args *args, + struct rpc_xprt_switch *xps, struct rpc_xprt *xprt, struct rpc_clnt *parent) { @@ -411,6 +412,8 @@ static struct rpc_clnt * rpc_new_client(const struct rpc_create_args *args, } rpc_clnt_set_transport(clnt, xprt, timeout); + xprt_iter_init(&clnt->cl_xpi, xps); + xprt_switch_put(xps); clnt->cl_rtt = &clnt->cl_rtt_default; rpc_init_rtt(&clnt->cl_rtt_default, clnt->cl_timeout->to_initval); @@ -438,6 +441,7 @@ out_no_clid: out_err: rpciod_down(); out_no_rpciod: + xprt_switch_put(xps); xprt_put(xprt); return ERR_PTR(err); } @@ -446,8 +450,13 @@ struct rpc_clnt *rpc_create_xprt(struct rpc_create_args *args, struct rpc_xprt *xprt) { struct rpc_clnt *clnt = NULL; + struct rpc_xprt_switch *xps; - clnt = rpc_new_client(args, xprt, NULL); + xps = xprt_switch_alloc(xprt, GFP_KERNEL); + if (xps == NULL) + return ERR_PTR(-ENOMEM); + + clnt = rpc_new_client(args, xps, xprt, NULL); if (IS_ERR(clnt)) return clnt; @@ -564,6 +573,7 @@ EXPORT_SYMBOL_GPL(rpc_create); static struct rpc_clnt *__rpc_clone_client(struct rpc_create_args *args, struct rpc_clnt *clnt) { + struct rpc_xprt_switch *xps; struct rpc_xprt *xprt; struct rpc_clnt *new; int err; @@ -571,13 +581,17 @@ static struct rpc_clnt *__rpc_clone_client(struct rpc_create_args *args, err = -ENOMEM; rcu_read_lock(); xprt = xprt_get(rcu_dereference(clnt->cl_xprt)); + xps = xprt_switch_get(rcu_dereference(clnt->cl_xpi.xpi_xpswitch)); rcu_read_unlock(); - if (xprt == NULL) + if (xprt == NULL || xps == NULL) { + xprt_put(xprt); + xprt_switch_put(xps); goto out_err; + } args->servername = xprt->servername; args->nodename = clnt->cl_nodename; - new = rpc_new_client(args, xprt, clnt); + new = rpc_new_client(args, xps, xprt, clnt); if (IS_ERR(new)) { err = PTR_ERR(new); goto out_err; @@ -657,6 +671,7 @@ int rpc_switch_client_transport(struct rpc_clnt *clnt, { const struct rpc_timeout *old_timeo; rpc_authflavor_t pseudoflavor; + struct rpc_xprt_switch *xps, *oldxps; struct rpc_xprt *xprt, *old; struct rpc_clnt *parent; int err; @@ -668,10 +683,17 @@ int rpc_switch_client_transport(struct rpc_clnt *clnt, return PTR_ERR(xprt); } + xps = xprt_switch_alloc(xprt, GFP_KERNEL); + if (xps == NULL) { + xprt_put(xprt); + return -ENOMEM; + } + pseudoflavor = clnt->cl_auth->au_flavor; old_timeo = clnt->cl_timeout; old = rpc_clnt_set_transport(clnt, xprt, timeout); + oldxps = xprt_iter_xchg_switch(&clnt->cl_xpi, xps); rpc_unregister_client(clnt); __rpc_clnt_remove_pipedir(clnt); @@ -697,20 +719,74 @@ int rpc_switch_client_transport(struct rpc_clnt *clnt, synchronize_rcu(); if (parent != clnt) rpc_release_client(parent); + xprt_switch_put(oldxps); xprt_put(old); dprintk("RPC: replaced xprt for clnt %p\n", clnt); return 0; out_revert: + xps = xprt_iter_xchg_switch(&clnt->cl_xpi, oldxps); rpc_clnt_set_transport(clnt, old, old_timeo); clnt->cl_parent = parent; rpc_client_register(clnt, pseudoflavor, NULL); + xprt_switch_put(xps); xprt_put(xprt); dprintk("RPC: failed to switch xprt for clnt %p\n", clnt); return err; } EXPORT_SYMBOL_GPL(rpc_switch_client_transport); +static +int rpc_clnt_xprt_iter_init(struct rpc_clnt *clnt, struct rpc_xprt_iter *xpi) +{ + struct rpc_xprt_switch *xps; + + rcu_read_lock(); + xps = xprt_switch_get(rcu_dereference(clnt->cl_xpi.xpi_xpswitch)); + rcu_read_unlock(); + if (xps == NULL) + return -EAGAIN; + xprt_iter_init_listall(xpi, xps); + xprt_switch_put(xps); + return 0; +} + +/** + * rpc_clnt_iterate_for_each_xprt - Apply a function to all transports + * @clnt: pointer to client + * @fn: function to apply + * @data: void pointer to function data + * + * Iterates through the list of RPC transports currently attached to the + * client and applies the function fn(clnt, xprt, data). + * + * On error, the iteration stops, and the function returns the error value. + */ +int rpc_clnt_iterate_for_each_xprt(struct rpc_clnt *clnt, + int (*fn)(struct rpc_clnt *, struct rpc_xprt *, void *), + void *data) +{ + struct rpc_xprt_iter xpi; + int ret; + + ret = rpc_clnt_xprt_iter_init(clnt, &xpi); + if (ret) + return ret; + for (;;) { + struct rpc_xprt *xprt = xprt_iter_get_next(&xpi); + + if (!xprt) + break; + ret = fn(clnt, xprt, data); + xprt_put(xprt); + if (ret < 0) + break; + } + xprt_iter_destroy(&xpi); + return ret; +} +EXPORT_SYMBOL_GPL(rpc_clnt_iterate_for_each_xprt); + /* * Kill all tasks for the given client. * XXX: kill their descendants as well? @@ -783,6 +859,7 @@ rpc_free_client(struct rpc_clnt *clnt) rpc_free_iostats(clnt->cl_metrics); clnt->cl_metrics = NULL; xprt_put(rcu_dereference_raw(clnt->cl_xprt)); + xprt_iter_destroy(&clnt->cl_xpi); rpciod_down(); rpc_free_clid(clnt); kfree(clnt); @@ -868,6 +945,7 @@ EXPORT_SYMBOL_GPL(rpc_bind_new_program); void rpc_task_release_client(struct rpc_task *task) { struct rpc_clnt *clnt = task->tk_client; + struct rpc_xprt *xprt = task->tk_xprt; if (clnt != NULL) { /* Remove from client task list */ @@ -878,13 +956,22 @@ void rpc_task_release_client(struct rpc_task *task) rpc_release_client(clnt); } + + if (xprt != NULL) { + task->tk_xprt = NULL; + + xprt_put(xprt); + } } static void rpc_task_set_client(struct rpc_task *task, struct rpc_clnt *clnt) { + if (clnt != NULL) { rpc_task_release_client(task); + if (task->tk_xprt == NULL) + task->tk_xprt = xprt_iter_get_next(&clnt->cl_xpi); task->tk_client = clnt; atomic_inc(&clnt->cl_count); if (clnt->cl_softrtry) @@ -900,14 +987,6 @@ void rpc_task_set_client(struct rpc_task *task, struct rpc_clnt *clnt) } } -void rpc_task_reset_client(struct rpc_task *task, struct rpc_clnt *clnt) -{ - rpc_task_release_client(task); - rpc_task_set_client(task, clnt); -} -EXPORT_SYMBOL_GPL(rpc_task_reset_client); - - static void rpc_task_set_rpc_message(struct rpc_task *task, const struct rpc_message *msg) { @@ -2104,11 +2183,9 @@ call_timeout(struct rpc_task *task) } if (RPC_IS_SOFT(task)) { if (clnt->cl_chatty) { - rcu_read_lock(); printk(KERN_NOTICE "%s: server %s not responding, timed out\n", clnt->cl_program->name, - rcu_dereference(clnt->cl_xprt)->servername); - rcu_read_unlock(); + task->tk_xprt->servername); } if (task->tk_flags & RPC_TASK_TIMEOUT) rpc_exit(task, -ETIMEDOUT); @@ -2120,11 +2197,9 @@ call_timeout(struct rpc_task *task) if (!(task->tk_flags & RPC_CALL_MAJORSEEN)) { task->tk_flags |= RPC_CALL_MAJORSEEN; if (clnt->cl_chatty) { - rcu_read_lock(); printk(KERN_NOTICE "%s: server %s not responding, still trying\n", clnt->cl_program->name, - rcu_dereference(clnt->cl_xprt)->servername); - rcu_read_unlock(); + task->tk_xprt->servername); } } rpc_force_rebind(clnt); @@ -2154,11 +2229,9 @@ call_decode(struct rpc_task *task) if (task->tk_flags & RPC_CALL_MAJORSEEN) { if (clnt->cl_chatty) { - rcu_read_lock(); printk(KERN_NOTICE "%s: server %s OK\n", clnt->cl_program->name, - rcu_dereference(clnt->cl_xprt)->servername); - rcu_read_unlock(); + task->tk_xprt->servername); } task->tk_flags &= ~RPC_CALL_MAJORSEEN; } @@ -2312,11 +2385,9 @@ rpc_verify_header(struct rpc_task *task) task->tk_action = call_bind; goto out_retry; case RPC_AUTH_TOOWEAK: - rcu_read_lock(); printk(KERN_NOTICE "RPC: server %s requires stronger " "authentication.\n", - rcu_dereference(clnt->cl_xprt)->servername); - rcu_read_unlock(); + task->tk_xprt->servername); break; default: dprintk("RPC: %5u %s: unknown auth error: %x\n", @@ -2341,27 +2412,27 @@ rpc_verify_header(struct rpc_task *task) case RPC_SUCCESS: return p; case RPC_PROG_UNAVAIL: - dprintk_rcu("RPC: %5u %s: program %u is unsupported " + dprintk("RPC: %5u %s: program %u is unsupported " "by server %s\n", task->tk_pid, __func__, (unsigned int)clnt->cl_prog, - rcu_dereference(clnt->cl_xprt)->servername); + task->tk_xprt->servername); error = -EPFNOSUPPORT; goto out_err; case RPC_PROG_MISMATCH: - dprintk_rcu("RPC: %5u %s: program %u, version %u unsupported " + dprintk("RPC: %5u %s: program %u, version %u unsupported " "by server %s\n", task->tk_pid, __func__, (unsigned int)clnt->cl_prog, (unsigned int)clnt->cl_vers, - rcu_dereference(clnt->cl_xprt)->servername); + task->tk_xprt->servername); error = -EPROTONOSUPPORT; goto out_err; case RPC_PROC_UNAVAIL: - dprintk_rcu("RPC: %5u %s: proc %s unsupported by program %u, " + dprintk("RPC: %5u %s: proc %s unsupported by program %u, " "version %u on server %s\n", task->tk_pid, __func__, rpc_proc_name(task), clnt->cl_prog, clnt->cl_vers, - rcu_dereference(clnt->cl_xprt)->servername); + task->tk_xprt->servername); error = -EOPNOTSUPP; goto out_err; case RPC_GARBAGE_ARGS: @@ -2421,7 +2492,10 @@ static int rpc_ping(struct rpc_clnt *clnt) return err; } -struct rpc_task *rpc_call_null(struct rpc_clnt *clnt, struct rpc_cred *cred, int flags) +static +struct rpc_task *rpc_call_null_helper(struct rpc_clnt *clnt, + struct rpc_xprt *xprt, struct rpc_cred *cred, int flags, + const struct rpc_call_ops *ops, void *data) { struct rpc_message msg = { .rpc_proc = &rpcproc_null, @@ -2429,14 +2503,140 @@ struct rpc_task *rpc_call_null(struct rpc_clnt *clnt, struct rpc_cred *cred, int }; struct rpc_task_setup task_setup_data = { .rpc_client = clnt, + .rpc_xprt = xprt, .rpc_message = &msg, - .callback_ops = &rpc_default_ops, + .callback_ops = (ops != NULL) ? ops : &rpc_default_ops, + .callback_data = data, .flags = flags, }; + return rpc_run_task(&task_setup_data); } + +struct rpc_task *rpc_call_null(struct rpc_clnt *clnt, struct rpc_cred *cred, int flags) +{ + return rpc_call_null_helper(clnt, NULL, cred, flags, NULL, NULL); +} EXPORT_SYMBOL_GPL(rpc_call_null); +struct rpc_cb_add_xprt_calldata { + struct rpc_xprt_switch *xps; + struct rpc_xprt *xprt; +}; + +static void rpc_cb_add_xprt_done(struct rpc_task *task, void *calldata) +{ + struct rpc_cb_add_xprt_calldata *data = calldata; + + if (task->tk_status == 0) + rpc_xprt_switch_add_xprt(data->xps, data->xprt); +} + +static void rpc_cb_add_xprt_release(void *calldata) +{ + struct rpc_cb_add_xprt_calldata *data = calldata; + + xprt_put(data->xprt); + xprt_switch_put(data->xps); + kfree(data); +} + +const static struct rpc_call_ops rpc_cb_add_xprt_call_ops = { + .rpc_call_done = rpc_cb_add_xprt_done, + .rpc_release = rpc_cb_add_xprt_release, +}; + +/** + * rpc_clnt_test_and_add_xprt - Test and add a new transport to a rpc_clnt + * @clnt: pointer to struct rpc_clnt + * @xps: pointer to struct rpc_xprt_switch, + * @xprt: pointer struct rpc_xprt + * @dummy: unused + */ +int rpc_clnt_test_and_add_xprt(struct rpc_clnt *clnt, + struct rpc_xprt_switch *xps, struct rpc_xprt *xprt, + void *dummy) +{ + struct rpc_cb_add_xprt_calldata *data; + struct rpc_cred *cred; + struct rpc_task *task; + + data = kmalloc(sizeof(*data), GFP_NOFS); + if (!data) + return -ENOMEM; + data->xps = xprt_switch_get(xps); + data->xprt = xprt_get(xprt); + + cred = authnull_ops.lookup_cred(NULL, NULL, 0); + task = rpc_call_null_helper(clnt, xprt, cred, + RPC_TASK_SOFT|RPC_TASK_SOFTCONN|RPC_TASK_ASYNC, + &rpc_cb_add_xprt_call_ops, data); + put_rpccred(cred); + if (IS_ERR(task)) + return PTR_ERR(task); + rpc_put_task(task); + return 1; +} +EXPORT_SYMBOL_GPL(rpc_clnt_test_and_add_xprt); + +/** + * rpc_clnt_add_xprt - Add a new transport to a rpc_clnt + * @clnt: pointer to struct rpc_clnt + * @xprtargs: pointer to struct xprt_create + * @setup: callback to test and/or set up the connection + * @data: pointer to setup function data + * + * Creates a new transport using the parameters set in args and + * adds it to clnt. + * If ping is set, then test that connectivity succeeds before + * adding the new transport. + * + */ +int rpc_clnt_add_xprt(struct rpc_clnt *clnt, + struct xprt_create *xprtargs, + int (*setup)(struct rpc_clnt *, + struct rpc_xprt_switch *, + struct rpc_xprt *, + void *), + void *data) +{ + struct rpc_xprt_switch *xps; + struct rpc_xprt *xprt; + unsigned char resvport; + int ret = 0; + + rcu_read_lock(); + xps = xprt_switch_get(rcu_dereference(clnt->cl_xpi.xpi_xpswitch)); + xprt = xprt_iter_xprt(&clnt->cl_xpi); + if (xps == NULL || xprt == NULL) { + rcu_read_unlock(); + return -EAGAIN; + } + resvport = xprt->resvport; + rcu_read_unlock(); + + xprt = xprt_create_transport(xprtargs); + if (IS_ERR(xprt)) { + ret = PTR_ERR(xprt); + goto out_put_switch; + } + xprt->resvport = resvport; + + rpc_xprt_switch_set_roundrobin(xps); + if (setup) { + ret = setup(clnt, xps, xprt, data); + if (ret != 0) + goto out_put_xprt; + } + rpc_xprt_switch_add_xprt(xps, xprt); +out_put_xprt: + xprt_put(xprt); +out_put_switch: + xprt_switch_put(xps); + return ret; +} +EXPORT_SYMBOL_GPL(rpc_clnt_add_xprt); + #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) static void rpc_show_header(void) { @@ -2483,57 +2683,39 @@ void rpc_show_tasks(struct net *net) #endif #if IS_ENABLED(CONFIG_SUNRPC_SWAP) +static int +rpc_clnt_swap_activate_callback(struct rpc_clnt *clnt, + struct rpc_xprt *xprt, + void *dummy) +{ + return xprt_enable_swap(xprt); +} + int rpc_clnt_swap_activate(struct rpc_clnt *clnt) { - int ret = 0; - struct rpc_xprt *xprt; - - if (atomic_inc_return(&clnt->cl_swapper) == 1) { -retry: - rcu_read_lock(); - xprt = xprt_get(rcu_dereference(clnt->cl_xprt)); - rcu_read_unlock(); - if (!xprt) { - /* - * If we didn't get a reference, then we likely are - * racing with a migration event. Wait for a grace - * period and try again. - */ - synchronize_rcu(); - goto retry; - } - - ret = xprt_enable_swap(xprt); - xprt_put(xprt); - } - return ret; + if (atomic_inc_return(&clnt->cl_swapper) == 1) + return rpc_clnt_iterate_for_each_xprt(clnt, + rpc_clnt_swap_activate_callback, NULL); + return 0; } EXPORT_SYMBOL_GPL(rpc_clnt_swap_activate); +static int +rpc_clnt_swap_deactivate_callback(struct rpc_clnt *clnt, + struct rpc_xprt *xprt, + void *dummy) +{ + xprt_disable_swap(xprt); + return 0; +} + void rpc_clnt_swap_deactivate(struct rpc_clnt *clnt) { - struct rpc_xprt *xprt; - - if (atomic_dec_if_positive(&clnt->cl_swapper) == 0) { -retry: - rcu_read_lock(); - xprt = xprt_get(rcu_dereference(clnt->cl_xprt)); - rcu_read_unlock(); - if (!xprt) { - /* - * If we didn't get a reference, then we likely are - * racing with a migration event. Wait for a grace - * period and try again. - */ - synchronize_rcu(); - goto retry; - } - - xprt_disable_swap(xprt); - xprt_put(xprt); - } + if (atomic_dec_if_positive(&clnt->cl_swapper) == 0) + rpc_clnt_iterate_for_each_xprt(clnt, + rpc_clnt_swap_deactivate_callback, NULL); } EXPORT_SYMBOL_GPL(rpc_clnt_swap_deactivate); #endif /* CONFIG_SUNRPC_SWAP */ diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index 31789ef3e..fc48eca21 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -1390,8 +1390,8 @@ rpc_fill_super(struct super_block *sb, void *data, int silent) struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); int err; - sb->s_blocksize = PAGE_CACHE_SIZE; - sb->s_blocksize_bits = PAGE_CACHE_SHIFT; + sb->s_blocksize = PAGE_SIZE; + sb->s_blocksize_bits = PAGE_SHIFT; sb->s_magic = RPCAUTH_GSSMAGIC; sb->s_op = &s_ops; sb->s_d_op = &simple_dentry_operations; diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c index cf5770d8f..5b3060359 100644 --- a/net/sunrpc/rpcb_clnt.c +++ b/net/sunrpc/rpcb_clnt.c @@ -648,10 +648,10 @@ static struct rpc_task *rpcb_call_async(struct rpc_clnt *rpcb_clnt, struct rpcbi static struct rpc_clnt *rpcb_find_transport_owner(struct rpc_clnt *clnt) { struct rpc_clnt *parent = clnt->cl_parent; - struct rpc_xprt *xprt = rcu_dereference(clnt->cl_xprt); + struct rpc_xprt_switch *xps = rcu_access_pointer(clnt->cl_xpi.xpi_xpswitch); while (parent != clnt) { - if (rcu_dereference(parent->cl_xprt) != xprt) + if (rcu_access_pointer(parent->cl_xpi.xpi_xpswitch) != xps) break; if (clnt->cl_autobind) break; @@ -683,11 +683,9 @@ void rpcb_getport_async(struct rpc_task *task) int status; rcu_read_lock(); - do { - clnt = rpcb_find_transport_owner(task->tk_client); - xprt = xprt_get(rcu_dereference(clnt->cl_xprt)); - } while (xprt == NULL); + clnt = rpcb_find_transport_owner(task->tk_client); rcu_read_unlock(); + xprt = xprt_get(task->tk_xprt); dprintk("RPC: %5u %s(%s, %u, %u, %d)\n", task->tk_pid, __func__, diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 73ad57a59..fcfd48d26 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -909,6 +909,8 @@ static void rpc_init_task(struct rpc_task *task, const struct rpc_task_setup *ta /* Initialize workqueue for async tasks */ task->tk_workqueue = task_setup_data->workqueue; + task->tk_xprt = xprt_get(task_setup_data->rpc_xprt); + if (task->tk_ops->rpc_call_prepare != NULL) task->tk_action = rpc_prepare_task; diff --git a/net/sunrpc/socklib.c b/net/sunrpc/socklib.c index 2df87f78e..de70c7802 100644 --- a/net/sunrpc/socklib.c +++ b/net/sunrpc/socklib.c @@ -96,8 +96,8 @@ ssize_t xdr_partial_copy_from_skb(struct xdr_buf *xdr, unsigned int base, struct if (base || xdr->page_base) { pglen -= base; base += xdr->page_base; - ppage += base >> PAGE_CACHE_SHIFT; - base &= ~PAGE_CACHE_MASK; + ppage += base >> PAGE_SHIFT; + base &= ~PAGE_MASK; } do { char *kaddr; @@ -113,7 +113,7 @@ ssize_t xdr_partial_copy_from_skb(struct xdr_buf *xdr, unsigned int base, struct } } - len = PAGE_CACHE_SIZE; + len = PAGE_SIZE; kaddr = kmap_atomic(*ppage); if (base) { len -= base; diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index 4439ac4c1..6bdb38652 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -164,7 +164,7 @@ EXPORT_SYMBOL_GPL(xdr_inline_pages); * Note: the addresses pgto_base and pgfrom_base are both calculated in * the same way: * if a memory area starts at byte 'base' in page 'pages[i]', - * then its address is given as (i << PAGE_CACHE_SHIFT) + base + * then its address is given as (i << PAGE_SHIFT) + base * Also note: pgfrom_base must be < pgto_base, but the memory areas * they point to may overlap. */ @@ -181,20 +181,20 @@ _shift_data_right_pages(struct page **pages, size_t pgto_base, pgto_base += len; pgfrom_base += len; - pgto = pages + (pgto_base >> PAGE_CACHE_SHIFT); - pgfrom = pages + (pgfrom_base >> PAGE_CACHE_SHIFT); + pgto = pages + (pgto_base >> PAGE_SHIFT); + pgfrom = pages + (pgfrom_base >> PAGE_SHIFT); - pgto_base &= ~PAGE_CACHE_MASK; - pgfrom_base &= ~PAGE_CACHE_MASK; + pgto_base &= ~PAGE_MASK; + pgfrom_base &= ~PAGE_MASK; do { /* Are any pointers crossing a page boundary? */ if (pgto_base == 0) { - pgto_base = PAGE_CACHE_SIZE; + pgto_base = PAGE_SIZE; pgto--; } if (pgfrom_base == 0) { - pgfrom_base = PAGE_CACHE_SIZE; + pgfrom_base = PAGE_SIZE; pgfrom--; } @@ -236,11 +236,11 @@ _copy_to_pages(struct page **pages, size_t pgbase, const char *p, size_t len) char *vto; size_t copy; - pgto = pages + (pgbase >> PAGE_CACHE_SHIFT); - pgbase &= ~PAGE_CACHE_MASK; + pgto = pages + (pgbase >> PAGE_SHIFT); + pgbase &= ~PAGE_MASK; for (;;) { - copy = PAGE_CACHE_SIZE - pgbase; + copy = PAGE_SIZE - pgbase; if (copy > len) copy = len; @@ -253,7 +253,7 @@ _copy_to_pages(struct page **pages, size_t pgbase, const char *p, size_t len) break; pgbase += copy; - if (pgbase == PAGE_CACHE_SIZE) { + if (pgbase == PAGE_SIZE) { flush_dcache_page(*pgto); pgbase = 0; pgto++; @@ -280,11 +280,11 @@ _copy_from_pages(char *p, struct page **pages, size_t pgbase, size_t len) char *vfrom; size_t copy; - pgfrom = pages + (pgbase >> PAGE_CACHE_SHIFT); - pgbase &= ~PAGE_CACHE_MASK; + pgfrom = pages + (pgbase >> PAGE_SHIFT); + pgbase &= ~PAGE_MASK; do { - copy = PAGE_CACHE_SIZE - pgbase; + copy = PAGE_SIZE - pgbase; if (copy > len) copy = len; @@ -293,7 +293,7 @@ _copy_from_pages(char *p, struct page **pages, size_t pgbase, size_t len) kunmap_atomic(vfrom); pgbase += copy; - if (pgbase == PAGE_CACHE_SIZE) { + if (pgbase == PAGE_SIZE) { pgbase = 0; pgfrom++; } @@ -1038,8 +1038,8 @@ xdr_buf_subsegment(struct xdr_buf *buf, struct xdr_buf *subbuf, if (base < buf->page_len) { subbuf->page_len = min(buf->page_len - base, len); base += buf->page_base; - subbuf->page_base = base & ~PAGE_CACHE_MASK; - subbuf->pages = &buf->pages[base >> PAGE_CACHE_SHIFT]; + subbuf->page_base = base & ~PAGE_MASK; + subbuf->pages = &buf->pages[base >> PAGE_SHIFT]; len -= subbuf->page_len; base = 0; } else { @@ -1297,9 +1297,9 @@ xdr_xcode_array2(struct xdr_buf *buf, unsigned int base, todo -= avail_here; base += buf->page_base; - ppages = buf->pages + (base >> PAGE_CACHE_SHIFT); - base &= ~PAGE_CACHE_MASK; - avail_page = min_t(unsigned int, PAGE_CACHE_SIZE - base, + ppages = buf->pages + (base >> PAGE_SHIFT); + base &= ~PAGE_MASK; + avail_page = min_t(unsigned int, PAGE_SIZE - base, avail_here); c = kmap(*ppages) + base; @@ -1383,7 +1383,7 @@ xdr_xcode_array2(struct xdr_buf *buf, unsigned int base, } avail_page = min(avail_here, - (unsigned int) PAGE_CACHE_SIZE); + (unsigned int) PAGE_SIZE); } base = buf->page_len; /* align to start of tail */ } @@ -1479,9 +1479,9 @@ xdr_process_buf(struct xdr_buf *buf, unsigned int offset, unsigned int len, if (page_len > len) page_len = len; len -= page_len; - page_offset = (offset + buf->page_base) & (PAGE_CACHE_SIZE - 1); - i = (offset + buf->page_base) >> PAGE_CACHE_SHIFT; - thislen = PAGE_CACHE_SIZE - page_offset; + page_offset = (offset + buf->page_base) & (PAGE_SIZE - 1); + i = (offset + buf->page_base) >> PAGE_SHIFT; + thislen = PAGE_SIZE - page_offset; do { if (thislen > page_len) thislen = page_len; @@ -1492,7 +1492,7 @@ xdr_process_buf(struct xdr_buf *buf, unsigned int offset, unsigned int len, page_len -= thislen; i++; page_offset = 0; - thislen = PAGE_CACHE_SIZE; + thislen = PAGE_SIZE; } while (page_len != 0); offset = 0; } diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 37edea6fa..216a13857 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -48,6 +48,7 @@ #include <linux/sunrpc/clnt.h> #include <linux/sunrpc/metrics.h> #include <linux/sunrpc/bc_xprt.h> +#include <linux/rcupdate.h> #include <trace/events/sunrpc.h> @@ -1166,7 +1167,7 @@ void xprt_free(struct rpc_xprt *xprt) { put_net(xprt->xprt_net); xprt_free_all_slots(xprt); - kfree(xprt); + kfree_rcu(xprt, rcu); } EXPORT_SYMBOL_GPL(xprt_free); @@ -1180,7 +1181,7 @@ EXPORT_SYMBOL_GPL(xprt_free); */ void xprt_reserve(struct rpc_task *task) { - struct rpc_xprt *xprt; + struct rpc_xprt *xprt = task->tk_xprt; task->tk_status = 0; if (task->tk_rqstp != NULL) @@ -1188,11 +1189,8 @@ void xprt_reserve(struct rpc_task *task) task->tk_timeout = 0; task->tk_status = -EAGAIN; - rcu_read_lock(); - xprt = rcu_dereference(task->tk_client->cl_xprt); if (!xprt_throttle_congested(xprt, task)) xprt->ops->alloc_slot(xprt, task); - rcu_read_unlock(); } /** @@ -1206,7 +1204,7 @@ void xprt_reserve(struct rpc_task *task) */ void xprt_retry_reserve(struct rpc_task *task) { - struct rpc_xprt *xprt; + struct rpc_xprt *xprt = task->tk_xprt; task->tk_status = 0; if (task->tk_rqstp != NULL) @@ -1214,10 +1212,7 @@ void xprt_retry_reserve(struct rpc_task *task) task->tk_timeout = 0; task->tk_status = -EAGAIN; - rcu_read_lock(); - xprt = rcu_dereference(task->tk_client->cl_xprt); xprt->ops->alloc_slot(xprt, task); - rcu_read_unlock(); } static inline __be32 xprt_alloc_xid(struct rpc_xprt *xprt) @@ -1264,11 +1259,9 @@ void xprt_release(struct rpc_task *task) if (req == NULL) { if (task->tk_client) { - rcu_read_lock(); - xprt = rcu_dereference(task->tk_client->cl_xprt); + xprt = task->tk_xprt; if (xprt->snd_task == task) xprt_release_write(xprt, task); - rcu_read_unlock(); } return; } @@ -1307,7 +1300,7 @@ void xprt_release(struct rpc_task *task) static void xprt_init(struct rpc_xprt *xprt, struct net *net) { - atomic_set(&xprt->count, 1); + kref_init(&xprt->kref); spin_lock_init(&xprt->transport_lock); spin_lock_init(&xprt->reserve_lock); @@ -1318,6 +1311,7 @@ static void xprt_init(struct rpc_xprt *xprt, struct net *net) spin_lock_init(&xprt->bc_pa_lock); INIT_LIST_HEAD(&xprt->bc_pa_list); #endif /* CONFIG_SUNRPC_BACKCHANNEL */ + INIT_LIST_HEAD(&xprt->xprt_switch); xprt->last_used = jiffies; xprt->cwnd = RPC_INITCWND; @@ -1415,6 +1409,24 @@ static void xprt_destroy(struct rpc_xprt *xprt) xprt->ops->destroy(xprt); } +static void xprt_destroy_kref(struct kref *kref) +{ + xprt_destroy(container_of(kref, struct rpc_xprt, kref)); +} + +/** + * xprt_get - return a reference to an RPC transport. + * @xprt: pointer to the transport + * + */ +struct rpc_xprt *xprt_get(struct rpc_xprt *xprt) +{ + if (xprt != NULL && kref_get_unless_zero(&xprt->kref)) + return xprt; + return NULL; +} +EXPORT_SYMBOL_GPL(xprt_get); + /** * xprt_put - release a reference to an RPC transport. * @xprt: pointer to the transport @@ -1422,7 +1434,7 @@ static void xprt_destroy(struct rpc_xprt *xprt) */ void xprt_put(struct rpc_xprt *xprt) { - if (atomic_dec_and_test(&xprt->count)) - xprt_destroy(xprt); + if (xprt != NULL) + kref_put(&xprt->kref, xprt_destroy_kref); } EXPORT_SYMBOL_GPL(xprt_put); diff --git a/net/sunrpc/xprtmultipath.c b/net/sunrpc/xprtmultipath.c new file mode 100644 index 000000000..e7fd76975 --- /dev/null +++ b/net/sunrpc/xprtmultipath.c @@ -0,0 +1,475 @@ +/* + * Multipath support for RPC + * + * Copyright (c) 2015, 2016, Primary Data, Inc. All rights reserved. + * + * Trond Myklebust <trond.myklebust@primarydata.com> + * + */ +#include <linux/types.h> +#include <linux/kref.h> +#include <linux/list.h> +#include <linux/rcupdate.h> +#include <linux/rculist.h> +#include <linux/slab.h> +#include <asm/cmpxchg.h> +#include <linux/spinlock.h> +#include <linux/sunrpc/xprt.h> +#include <linux/sunrpc/xprtmultipath.h> + +typedef struct rpc_xprt *(*xprt_switch_find_xprt_t)(struct list_head *head, + const struct rpc_xprt *cur); + +static const struct rpc_xprt_iter_ops rpc_xprt_iter_singular; +static const struct rpc_xprt_iter_ops rpc_xprt_iter_roundrobin; +static const struct rpc_xprt_iter_ops rpc_xprt_iter_listall; + +static void xprt_switch_add_xprt_locked(struct rpc_xprt_switch *xps, + struct rpc_xprt *xprt) +{ + if (unlikely(xprt_get(xprt) == NULL)) + return; + list_add_tail_rcu(&xprt->xprt_switch, &xps->xps_xprt_list); + smp_wmb(); + if (xps->xps_nxprts == 0) + xps->xps_net = xprt->xprt_net; + xps->xps_nxprts++; +} + +/** + * rpc_xprt_switch_add_xprt - Add a new rpc_xprt to an rpc_xprt_switch + * @xps: pointer to struct rpc_xprt_switch + * @xprt: pointer to struct rpc_xprt + * + * Adds xprt to the end of the list of struct rpc_xprt in xps. + */ +void rpc_xprt_switch_add_xprt(struct rpc_xprt_switch *xps, + struct rpc_xprt *xprt) +{ + if (xprt == NULL) + return; + spin_lock(&xps->xps_lock); + if (xps->xps_net == xprt->xprt_net || xps->xps_net == NULL) + xprt_switch_add_xprt_locked(xps, xprt); + spin_unlock(&xps->xps_lock); +} + +static void xprt_switch_remove_xprt_locked(struct rpc_xprt_switch *xps, + struct rpc_xprt *xprt) +{ + if (unlikely(xprt == NULL)) + return; + xps->xps_nxprts--; + if (xps->xps_nxprts == 0) + xps->xps_net = NULL; + smp_wmb(); + list_del_rcu(&xprt->xprt_switch); +} + +/** + * rpc_xprt_switch_remove_xprt - Removes an rpc_xprt from a rpc_xprt_switch + * @xps: pointer to struct rpc_xprt_switch + * @xprt: pointer to struct rpc_xprt + * + * Removes xprt from the list of struct rpc_xprt in xps. + */ +void rpc_xprt_switch_remove_xprt(struct rpc_xprt_switch *xps, + struct rpc_xprt *xprt) +{ + spin_lock(&xps->xps_lock); + xprt_switch_remove_xprt_locked(xps, xprt); + spin_unlock(&xps->xps_lock); + xprt_put(xprt); +} + +/** + * xprt_switch_alloc - Allocate a new struct rpc_xprt_switch + * @xprt: pointer to struct rpc_xprt + * @gfp_flags: allocation flags + * + * On success, returns an initialised struct rpc_xprt_switch, containing + * the entry xprt. Returns NULL on failure. + */ +struct rpc_xprt_switch *xprt_switch_alloc(struct rpc_xprt *xprt, + gfp_t gfp_flags) +{ + struct rpc_xprt_switch *xps; + + xps = kmalloc(sizeof(*xps), gfp_flags); + if (xps != NULL) { + spin_lock_init(&xps->xps_lock); + kref_init(&xps->xps_kref); + xps->xps_nxprts = 0; + INIT_LIST_HEAD(&xps->xps_xprt_list); + xps->xps_iter_ops = &rpc_xprt_iter_singular; + xprt_switch_add_xprt_locked(xps, xprt); + } + + return xps; +} + +static void xprt_switch_free_entries(struct rpc_xprt_switch *xps) +{ + spin_lock(&xps->xps_lock); + while (!list_empty(&xps->xps_xprt_list)) { + struct rpc_xprt *xprt; + + xprt = list_first_entry(&xps->xps_xprt_list, + struct rpc_xprt, xprt_switch); + xprt_switch_remove_xprt_locked(xps, xprt); + spin_unlock(&xps->xps_lock); + xprt_put(xprt); + spin_lock(&xps->xps_lock); + } + spin_unlock(&xps->xps_lock); +} + +static void xprt_switch_free(struct kref *kref) +{ + struct rpc_xprt_switch *xps = container_of(kref, + struct rpc_xprt_switch, xps_kref); + + xprt_switch_free_entries(xps); + kfree_rcu(xps, xps_rcu); +} + +/** + * xprt_switch_get - Return a reference to a rpc_xprt_switch + * @xps: pointer to struct rpc_xprt_switch + * + * Returns a reference to xps unless the refcount is already zero. + */ +struct rpc_xprt_switch *xprt_switch_get(struct rpc_xprt_switch *xps) +{ + if (xps != NULL && kref_get_unless_zero(&xps->xps_kref)) + return xps; + return NULL; +} + +/** + * xprt_switch_put - Release a reference to a rpc_xprt_switch + * @xps: pointer to struct rpc_xprt_switch + * + * Release the reference to xps, and free it once the refcount is zero. + */ +void xprt_switch_put(struct rpc_xprt_switch *xps) +{ + if (xps != NULL) + kref_put(&xps->xps_kref, xprt_switch_free); +} + +/** + * rpc_xprt_switch_set_roundrobin - Set a round-robin policy on rpc_xprt_switch + * @xps: pointer to struct rpc_xprt_switch + * + * Sets a round-robin default policy for iterators acting on xps. + */ +void rpc_xprt_switch_set_roundrobin(struct rpc_xprt_switch *xps) +{ + if (READ_ONCE(xps->xps_iter_ops) != &rpc_xprt_iter_roundrobin) + WRITE_ONCE(xps->xps_iter_ops, &rpc_xprt_iter_roundrobin); +} + +static +const struct rpc_xprt_iter_ops *xprt_iter_ops(const struct rpc_xprt_iter *xpi) +{ + if (xpi->xpi_ops != NULL) + return xpi->xpi_ops; + return rcu_dereference(xpi->xpi_xpswitch)->xps_iter_ops; +} + +static +void xprt_iter_no_rewind(struct rpc_xprt_iter *xpi) +{ +} + +static +void xprt_iter_default_rewind(struct rpc_xprt_iter *xpi) +{ + WRITE_ONCE(xpi->xpi_cursor, NULL); +} + +static +struct rpc_xprt *xprt_switch_find_first_entry(struct list_head *head) +{ + return list_first_or_null_rcu(head, struct rpc_xprt, xprt_switch); +} + +static +struct rpc_xprt *xprt_iter_first_entry(struct rpc_xprt_iter *xpi) +{ + struct rpc_xprt_switch *xps = rcu_dereference(xpi->xpi_xpswitch); + + if (xps == NULL) + return NULL; + return xprt_switch_find_first_entry(&xps->xps_xprt_list); +} + +static +struct rpc_xprt *xprt_switch_find_current_entry(struct list_head *head, + const struct rpc_xprt *cur) +{ + struct rpc_xprt *pos; + + list_for_each_entry_rcu(pos, head, xprt_switch) { + if (cur == pos) + return pos; + } + return NULL; +} + +static +struct rpc_xprt *xprt_iter_current_entry(struct rpc_xprt_iter *xpi) +{ + struct rpc_xprt_switch *xps = rcu_dereference(xpi->xpi_xpswitch); + struct list_head *head; + + if (xps == NULL) + return NULL; + head = &xps->xps_xprt_list; + if (xpi->xpi_cursor == NULL || xps->xps_nxprts < 2) + return xprt_switch_find_first_entry(head); + return xprt_switch_find_current_entry(head, xpi->xpi_cursor); +} + +static +struct rpc_xprt *xprt_switch_find_next_entry(struct list_head *head, + const struct rpc_xprt *cur) +{ + struct rpc_xprt *pos, *prev = NULL; + + list_for_each_entry_rcu(pos, head, xprt_switch) { + if (cur == prev) + return pos; + prev = pos; + } + return NULL; +} + +static +struct rpc_xprt *xprt_switch_set_next_cursor(struct list_head *head, + struct rpc_xprt **cursor, + xprt_switch_find_xprt_t find_next) +{ + struct rpc_xprt *cur, *pos, *old; + + cur = READ_ONCE(*cursor); + for (;;) { + old = cur; + pos = find_next(head, old); + if (pos == NULL) + break; + cur = cmpxchg_relaxed(cursor, old, pos); + if (cur == old) + break; + } + return pos; +} + +static +struct rpc_xprt *xprt_iter_next_entry_multiple(struct rpc_xprt_iter *xpi, + xprt_switch_find_xprt_t find_next) +{ + struct rpc_xprt_switch *xps = rcu_dereference(xpi->xpi_xpswitch); + struct list_head *head; + + if (xps == NULL) + return NULL; + head = &xps->xps_xprt_list; + if (xps->xps_nxprts < 2) + return xprt_switch_find_first_entry(head); + return xprt_switch_set_next_cursor(head, &xpi->xpi_cursor, find_next); +} + +static +struct rpc_xprt *xprt_switch_find_next_entry_roundrobin(struct list_head *head, + const struct rpc_xprt *cur) +{ + struct rpc_xprt *ret; + + ret = xprt_switch_find_next_entry(head, cur); + if (ret != NULL) + return ret; + return xprt_switch_find_first_entry(head); +} + +static +struct rpc_xprt *xprt_iter_next_entry_roundrobin(struct rpc_xprt_iter *xpi) +{ + return xprt_iter_next_entry_multiple(xpi, + xprt_switch_find_next_entry_roundrobin); +} + +static +struct rpc_xprt *xprt_iter_next_entry_all(struct rpc_xprt_iter *xpi) +{ + return xprt_iter_next_entry_multiple(xpi, xprt_switch_find_next_entry); +} + +/* + * xprt_iter_rewind - Resets the xprt iterator + * @xpi: pointer to rpc_xprt_iter + * + * Resets xpi to ensure that it points to the first entry in the list + * of transports. + */ +static +void xprt_iter_rewind(struct rpc_xprt_iter *xpi) +{ + rcu_read_lock(); + xprt_iter_ops(xpi)->xpi_rewind(xpi); + rcu_read_unlock(); +} + +static void __xprt_iter_init(struct rpc_xprt_iter *xpi, + struct rpc_xprt_switch *xps, + const struct rpc_xprt_iter_ops *ops) +{ + rcu_assign_pointer(xpi->xpi_xpswitch, xprt_switch_get(xps)); + xpi->xpi_cursor = NULL; + xpi->xpi_ops = ops; +} + +/** + * xprt_iter_init - Initialise an xprt iterator + * @xpi: pointer to rpc_xprt_iter + * @xps: pointer to rpc_xprt_switch + * + * Initialises the iterator to use the default iterator ops + * as set in xps. This function is mainly intended for internal + * use in the rpc_client. + */ +void xprt_iter_init(struct rpc_xprt_iter *xpi, + struct rpc_xprt_switch *xps) +{ + __xprt_iter_init(xpi, xps, NULL); +} + +/** + * xprt_iter_init_listall - Initialise an xprt iterator + * @xpi: pointer to rpc_xprt_iter + * @xps: pointer to rpc_xprt_switch + * + * Initialises the iterator to iterate once through the entire list + * of entries in xps. + */ +void xprt_iter_init_listall(struct rpc_xprt_iter *xpi, + struct rpc_xprt_switch *xps) +{ + __xprt_iter_init(xpi, xps, &rpc_xprt_iter_listall); +} + +/** + * xprt_iter_xchg_switch - Atomically swap out the rpc_xprt_switch + * @xpi: pointer to rpc_xprt_iter + * @xps: pointer to a new rpc_xprt_switch or NULL + * + * Swaps out the existing xpi->xpi_xpswitch with a new value. + */ +struct rpc_xprt_switch *xprt_iter_xchg_switch(struct rpc_xprt_iter *xpi, + struct rpc_xprt_switch *newswitch) +{ + struct rpc_xprt_switch __rcu *oldswitch; + + /* Atomically swap out the old xpswitch */ + oldswitch = xchg(&xpi->xpi_xpswitch, RCU_INITIALIZER(newswitch)); + if (newswitch != NULL) + xprt_iter_rewind(xpi); + return rcu_dereference_protected(oldswitch, true); +} + +/** + * xprt_iter_destroy - Destroys the xprt iterator + * @xpi pointer to rpc_xprt_iter + */ +void xprt_iter_destroy(struct rpc_xprt_iter *xpi) +{ + xprt_switch_put(xprt_iter_xchg_switch(xpi, NULL)); +} + +/** + * xprt_iter_xprt - Returns the rpc_xprt pointed to by the cursor + * @xpi: pointer to rpc_xprt_iter + * + * Returns a pointer to the struct rpc_xprt that is currently + * pointed to by the cursor. + * Caller must be holding rcu_read_lock(). + */ +struct rpc_xprt *xprt_iter_xprt(struct rpc_xprt_iter *xpi) +{ + WARN_ON_ONCE(!rcu_read_lock_held()); + return xprt_iter_ops(xpi)->xpi_xprt(xpi); +} + +static +struct rpc_xprt *xprt_iter_get_helper(struct rpc_xprt_iter *xpi, + struct rpc_xprt *(*fn)(struct rpc_xprt_iter *)) +{ + struct rpc_xprt *ret; + + do { + ret = fn(xpi); + if (ret == NULL) + break; + ret = xprt_get(ret); + } while (ret == NULL); + return ret; +} + +/** + * xprt_iter_get_xprt - Returns the rpc_xprt pointed to by the cursor + * @xpi: pointer to rpc_xprt_iter + * + * Returns a reference to the struct rpc_xprt that is currently + * pointed to by the cursor. + */ +struct rpc_xprt *xprt_iter_get_xprt(struct rpc_xprt_iter *xpi) +{ + struct rpc_xprt *xprt; + + rcu_read_lock(); + xprt = xprt_iter_get_helper(xpi, xprt_iter_ops(xpi)->xpi_xprt); + rcu_read_unlock(); + return xprt; +} + +/** + * xprt_iter_get_next - Returns the next rpc_xprt following the cursor + * @xpi: pointer to rpc_xprt_iter + * + * Returns a reference to the struct rpc_xprt that immediately follows the + * entry pointed to by the cursor. + */ +struct rpc_xprt *xprt_iter_get_next(struct rpc_xprt_iter *xpi) +{ + struct rpc_xprt *xprt; + + rcu_read_lock(); + xprt = xprt_iter_get_helper(xpi, xprt_iter_ops(xpi)->xpi_next); + rcu_read_unlock(); + return xprt; +} + +/* Policy for always returning the first entry in the rpc_xprt_switch */ +static +const struct rpc_xprt_iter_ops rpc_xprt_iter_singular = { + .xpi_rewind = xprt_iter_no_rewind, + .xpi_xprt = xprt_iter_first_entry, + .xpi_next = xprt_iter_first_entry, +}; + +/* Policy for round-robin iteration of entries in the rpc_xprt_switch */ +static +const struct rpc_xprt_iter_ops rpc_xprt_iter_roundrobin = { + .xpi_rewind = xprt_iter_default_rewind, + .xpi_xprt = xprt_iter_current_entry, + .xpi_next = xprt_iter_next_entry_roundrobin, +}; + +/* Policy for once-through iteration of entries in the rpc_xprt_switch */ +static +const struct rpc_xprt_iter_ops rpc_xprt_iter_listall = { + .xpi_rewind = xprt_iter_default_rewind, + .xpi_xprt = xprt_iter_current_entry, + .xpi_next = xprt_iter_next_entry_all, +}; diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c index c14f3a4bf..b289e1065 100644 --- a/net/sunrpc/xprtrdma/fmr_ops.c +++ b/net/sunrpc/xprtrdma/fmr_ops.c @@ -80,13 +80,13 @@ fmr_op_init(struct rpcrdma_xprt *r_xprt) if (!r) goto out; - r->r.fmr.physaddrs = kmalloc(RPCRDMA_MAX_FMR_SGES * - sizeof(u64), GFP_KERNEL); - if (!r->r.fmr.physaddrs) + r->fmr.physaddrs = kmalloc(RPCRDMA_MAX_FMR_SGES * + sizeof(u64), GFP_KERNEL); + if (!r->fmr.physaddrs) goto out_free; - r->r.fmr.fmr = ib_alloc_fmr(pd, mr_access_flags, &fmr_attr); - if (IS_ERR(r->r.fmr.fmr)) + r->fmr.fmr = ib_alloc_fmr(pd, mr_access_flags, &fmr_attr); + if (IS_ERR(r->fmr.fmr)) goto out_fmr_err; list_add(&r->mw_list, &buf->rb_mws); @@ -95,9 +95,9 @@ fmr_op_init(struct rpcrdma_xprt *r_xprt) return 0; out_fmr_err: - rc = PTR_ERR(r->r.fmr.fmr); + rc = PTR_ERR(r->fmr.fmr); dprintk("RPC: %s: ib_alloc_fmr status %i\n", __func__, rc); - kfree(r->r.fmr.physaddrs); + kfree(r->fmr.physaddrs); out_free: kfree(r); out: @@ -109,7 +109,7 @@ __fmr_unmap(struct rpcrdma_mw *r) { LIST_HEAD(l); - list_add(&r->r.fmr.fmr->list, &l); + list_add(&r->fmr.fmr->list, &l); return ib_unmap_fmr(&l); } @@ -148,7 +148,7 @@ fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, nsegs = RPCRDMA_MAX_FMR_SGES; for (i = 0; i < nsegs;) { rpcrdma_map_one(device, seg, direction); - mw->r.fmr.physaddrs[i] = seg->mr_dma; + mw->fmr.physaddrs[i] = seg->mr_dma; len += seg->mr_len; ++seg; ++i; @@ -158,13 +158,13 @@ fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, break; } - rc = ib_map_phys_fmr(mw->r.fmr.fmr, mw->r.fmr.physaddrs, + rc = ib_map_phys_fmr(mw->fmr.fmr, mw->fmr.physaddrs, i, seg1->mr_dma); if (rc) goto out_maperr; seg1->rl_mw = mw; - seg1->mr_rkey = mw->r.fmr.fmr->rkey; + seg1->mr_rkey = mw->fmr.fmr->rkey; seg1->mr_base = seg1->mr_dma + pageoff; seg1->mr_nsegs = i; seg1->mr_len = len; @@ -219,7 +219,7 @@ fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) seg = &req->rl_segments[i]; mw = seg->rl_mw; - list_add(&mw->r.fmr.fmr->list, &unmap_list); + list_add(&mw->fmr.fmr->list, &unmap_list); i += seg->mr_nsegs; } @@ -281,9 +281,9 @@ fmr_op_destroy(struct rpcrdma_buffer *buf) while (!list_empty(&buf->rb_all)) { r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all); list_del(&r->mw_all); - kfree(r->r.fmr.physaddrs); + kfree(r->fmr.physaddrs); - rc = ib_dealloc_fmr(r->r.fmr.fmr); + rc = ib_dealloc_fmr(r->fmr.fmr); if (rc) dprintk("RPC: %s: ib_dealloc_fmr failed %i\n", __func__, rc); diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index e16567389..c250924a9 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -109,20 +109,20 @@ static void __frwr_recovery_worker(struct work_struct *work) { struct rpcrdma_mw *r = container_of(work, struct rpcrdma_mw, - r.frmr.fr_work); - struct rpcrdma_xprt *r_xprt = r->r.frmr.fr_xprt; + frmr.fr_work); + struct rpcrdma_xprt *r_xprt = r->frmr.fr_xprt; unsigned int depth = r_xprt->rx_ia.ri_max_frmr_depth; struct ib_pd *pd = r_xprt->rx_ia.ri_pd; - if (ib_dereg_mr(r->r.frmr.fr_mr)) + if (ib_dereg_mr(r->frmr.fr_mr)) goto out_fail; - r->r.frmr.fr_mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, depth); - if (IS_ERR(r->r.frmr.fr_mr)) + r->frmr.fr_mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, depth); + if (IS_ERR(r->frmr.fr_mr)) goto out_fail; dprintk("RPC: %s: recovered FRMR %p\n", __func__, r); - r->r.frmr.fr_state = FRMR_IS_INVALID; + r->frmr.fr_state = FRMR_IS_INVALID; rpcrdma_put_mw(r_xprt, r); return; @@ -137,15 +137,15 @@ out_fail: static void __frwr_queue_recovery(struct rpcrdma_mw *r) { - INIT_WORK(&r->r.frmr.fr_work, __frwr_recovery_worker); - queue_work(frwr_recovery_wq, &r->r.frmr.fr_work); + INIT_WORK(&r->frmr.fr_work, __frwr_recovery_worker); + queue_work(frwr_recovery_wq, &r->frmr.fr_work); } static int __frwr_init(struct rpcrdma_mw *r, struct ib_pd *pd, struct ib_device *device, unsigned int depth) { - struct rpcrdma_frmr *f = &r->r.frmr; + struct rpcrdma_frmr *f = &r->frmr; int rc; f->fr_mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, depth); @@ -158,6 +158,8 @@ __frwr_init(struct rpcrdma_mw *r, struct ib_pd *pd, struct ib_device *device, sg_init_table(f->sg, depth); + init_completion(&f->fr_linv_done); + return 0; out_mr_err: @@ -179,11 +181,11 @@ __frwr_release(struct rpcrdma_mw *r) { int rc; - rc = ib_dereg_mr(r->r.frmr.fr_mr); + rc = ib_dereg_mr(r->frmr.fr_mr); if (rc) dprintk("RPC: %s: ib_dereg_mr status %i\n", __func__, rc); - kfree(r->r.frmr.sg); + kfree(r->frmr.sg); } static int @@ -244,39 +246,76 @@ frwr_op_maxpages(struct rpcrdma_xprt *r_xprt) rpcrdma_max_segments(r_xprt) * ia->ri_max_frmr_depth); } -/* If FAST_REG or LOCAL_INV failed, indicate the frmr needs - * to be reset. +static void +__frwr_sendcompletion_flush(struct ib_wc *wc, struct rpcrdma_frmr *frmr, + const char *wr) +{ + frmr->fr_state = FRMR_IS_STALE; + if (wc->status != IB_WC_WR_FLUSH_ERR) + pr_err("rpcrdma: %s: %s (%u/0x%x)\n", + wr, ib_wc_status_msg(wc->status), + wc->status, wc->vendor_err); +} + +/** + * frwr_wc_fastreg - Invoked by RDMA provider for each polled FastReg WC + * @cq: completion queue (ignored) + * @wc: completed WR * - * WARNING: Only wr_id and status are reliable at this point */ static void -__frwr_sendcompletion_flush(struct ib_wc *wc, struct rpcrdma_mw *r) +frwr_wc_fastreg(struct ib_cq *cq, struct ib_wc *wc) { - if (likely(wc->status == IB_WC_SUCCESS)) - return; - - /* WARNING: Only wr_id and status are reliable at this point */ - r = (struct rpcrdma_mw *)(unsigned long)wc->wr_id; - if (wc->status == IB_WC_WR_FLUSH_ERR) - dprintk("RPC: %s: frmr %p flushed\n", __func__, r); - else - pr_warn("RPC: %s: frmr %p error, status %s (%d)\n", - __func__, r, ib_wc_status_msg(wc->status), wc->status); + struct rpcrdma_frmr *frmr; + struct ib_cqe *cqe; - r->r.frmr.fr_state = FRMR_IS_STALE; + /* WARNING: Only wr_cqe and status are reliable at this point */ + if (wc->status != IB_WC_SUCCESS) { + cqe = wc->wr_cqe; + frmr = container_of(cqe, struct rpcrdma_frmr, fr_cqe); + __frwr_sendcompletion_flush(wc, frmr, "fastreg"); + } } +/** + * frwr_wc_localinv - Invoked by RDMA provider for each polled LocalInv WC + * @cq: completion queue (ignored) + * @wc: completed WR + * + */ static void -frwr_sendcompletion(struct ib_wc *wc) +frwr_wc_localinv(struct ib_cq *cq, struct ib_wc *wc) { - struct rpcrdma_mw *r = (struct rpcrdma_mw *)(unsigned long)wc->wr_id; - struct rpcrdma_frmr *f = &r->r.frmr; + struct rpcrdma_frmr *frmr; + struct ib_cqe *cqe; - if (unlikely(wc->status != IB_WC_SUCCESS)) - __frwr_sendcompletion_flush(wc, r); + /* WARNING: Only wr_cqe and status are reliable at this point */ + if (wc->status != IB_WC_SUCCESS) { + cqe = wc->wr_cqe; + frmr = container_of(cqe, struct rpcrdma_frmr, fr_cqe); + __frwr_sendcompletion_flush(wc, frmr, "localinv"); + } +} - if (f->fr_waiter) - complete(&f->fr_linv_done); +/** + * frwr_wc_localinv - Invoked by RDMA provider for each polled LocalInv WC + * @cq: completion queue (ignored) + * @wc: completed WR + * + * Awaken anyone waiting for an MR to finish being fenced. + */ +static void +frwr_wc_localinv_wake(struct ib_cq *cq, struct ib_wc *wc) +{ + struct rpcrdma_frmr *frmr; + struct ib_cqe *cqe; + + /* WARNING: Only wr_cqe and status are reliable at this point */ + cqe = wc->wr_cqe; + frmr = container_of(cqe, struct rpcrdma_frmr, fr_cqe); + if (wc->status != IB_WC_SUCCESS) + __frwr_sendcompletion_flush(wc, frmr, "localinv"); + complete_all(&frmr->fr_linv_done); } static int @@ -313,8 +352,7 @@ frwr_op_init(struct rpcrdma_xprt *r_xprt) list_add(&r->mw_list, &buf->rb_mws); list_add(&r->mw_all, &buf->rb_all); - r->mw_sendcompletion = frwr_sendcompletion; - r->r.frmr.fr_xprt = r_xprt; + r->frmr.fr_xprt = r_xprt; } return 0; @@ -347,10 +385,9 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, mw = rpcrdma_get_mw(r_xprt); if (!mw) return -ENOMEM; - } while (mw->r.frmr.fr_state != FRMR_IS_INVALID); - frmr = &mw->r.frmr; + } while (mw->frmr.fr_state != FRMR_IS_INVALID); + frmr = &mw->frmr; frmr->fr_state = FRMR_IS_VALID; - frmr->fr_waiter = false; mr = frmr->fr_mr; reg_wr = &frmr->fr_regwr; @@ -400,7 +437,8 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, reg_wr->wr.next = NULL; reg_wr->wr.opcode = IB_WR_REG_MR; - reg_wr->wr.wr_id = (uintptr_t)mw; + frmr->fr_cqe.done = frwr_wc_fastreg; + reg_wr->wr.wr_cqe = &frmr->fr_cqe; reg_wr->wr.num_sge = 0; reg_wr->wr.send_flags = 0; reg_wr->mr = mr; @@ -434,15 +472,15 @@ static struct ib_send_wr * __frwr_prepare_linv_wr(struct rpcrdma_mr_seg *seg) { struct rpcrdma_mw *mw = seg->rl_mw; - struct rpcrdma_frmr *f = &mw->r.frmr; + struct rpcrdma_frmr *f = &mw->frmr; struct ib_send_wr *invalidate_wr; - f->fr_waiter = false; f->fr_state = FRMR_IS_INVALID; invalidate_wr = &f->fr_invwr; memset(invalidate_wr, 0, sizeof(*invalidate_wr)); - invalidate_wr->wr_id = (unsigned long)(void *)mw; + f->fr_cqe.done = frwr_wc_localinv; + invalidate_wr->wr_cqe = &f->fr_cqe; invalidate_wr->opcode = IB_WR_LOCAL_INV; invalidate_wr->ex.invalidate_rkey = f->fr_mr->rkey; @@ -455,7 +493,7 @@ __frwr_dma_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, { struct ib_device *device = r_xprt->rx_ia.ri_device; struct rpcrdma_mw *mw = seg->rl_mw; - struct rpcrdma_frmr *f = &mw->r.frmr; + struct rpcrdma_frmr *f = &mw->frmr; seg->rl_mw = NULL; @@ -504,15 +542,15 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) i += seg->mr_nsegs; } - f = &seg->rl_mw->r.frmr; + f = &seg->rl_mw->frmr; /* Strong send queue ordering guarantees that when the * last WR in the chain completes, all WRs in the chain * are complete. */ f->fr_invwr.send_flags = IB_SEND_SIGNALED; - f->fr_waiter = true; - init_completion(&f->fr_linv_done); + f->fr_cqe.done = frwr_wc_localinv_wake; + reinit_completion(&f->fr_linv_done); INIT_CQCOUNT(&r_xprt->rx_ep); /* Transport disconnect drains the receive CQ before it @@ -520,14 +558,18 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) * unless ri_id->qp is a valid pointer. */ rc = ib_post_send(ia->ri_id->qp, invalidate_wrs, &bad_wr); - if (rc) + if (rc) { pr_warn("%s: ib_post_send failed %i\n", __func__, rc); + rdma_disconnect(ia->ri_id); + goto unmap; + } wait_for_completion(&f->fr_linv_done); /* ORDER: Now DMA unmap all of the req's MRs, and return * them to the free MW list. */ +unmap: for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) { seg = &req->rl_segments[i]; @@ -549,7 +591,7 @@ frwr_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg) struct rpcrdma_mr_seg *seg1 = seg; struct rpcrdma_ia *ia = &r_xprt->rx_ia; struct rpcrdma_mw *mw = seg1->rl_mw; - struct rpcrdma_frmr *frmr = &mw->r.frmr; + struct rpcrdma_frmr *frmr = &mw->frmr; struct ib_send_wr *invalidate_wr, *bad_wr; int rc, nsegs = seg->mr_nsegs; @@ -557,10 +599,11 @@ frwr_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg) seg1->rl_mw = NULL; frmr->fr_state = FRMR_IS_INVALID; - invalidate_wr = &mw->r.frmr.fr_invwr; + invalidate_wr = &mw->frmr.fr_invwr; memset(invalidate_wr, 0, sizeof(*invalidate_wr)); - invalidate_wr->wr_id = (uintptr_t)mw; + frmr->fr_cqe.done = frwr_wc_localinv; + invalidate_wr->wr_cqe = &frmr->fr_cqe; invalidate_wr->opcode = IB_WR_LOCAL_INV; invalidate_wr->ex.invalidate_rkey = frmr->fr_mr->rkey; DECR_CQCOUNT(&r_xprt->rx_ep); diff --git a/net/sunrpc/xprtrdma/physical_ops.c b/net/sunrpc/xprtrdma/physical_ops.c index dbb302ecf..481b9b6f4 100644 --- a/net/sunrpc/xprtrdma/physical_ops.c +++ b/net/sunrpc/xprtrdma/physical_ops.c @@ -68,7 +68,6 @@ physical_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, rpcrdma_map_one(ia->ri_device, seg, rpcrdma_data_dir(writing)); seg->mr_rkey = ia->ri_dma_mr->rkey; seg->mr_base = seg->mr_dma; - seg->mr_nsegs = 1; return 1; } diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 0f28f2d74..888823bb6 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -132,6 +132,33 @@ rpcrdma_tail_pullup(struct xdr_buf *buf) return tlen; } +/* Split "vec" on page boundaries into segments. FMR registers pages, + * not a byte range. Other modes coalesce these segments into a single + * MR when they can. + */ +static int +rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg, + int n, int nsegs) +{ + size_t page_offset; + u32 remaining; + char *base; + + base = vec->iov_base; + page_offset = offset_in_page(base); + remaining = vec->iov_len; + while (remaining && n < nsegs) { + seg[n].mr_page = NULL; + seg[n].mr_offset = base; + seg[n].mr_len = min_t(u32, PAGE_SIZE - page_offset, remaining); + remaining -= seg[n].mr_len; + base += seg[n].mr_len; + ++n; + page_offset = 0; + } + return n; +} + /* * Chunk assembly from upper layer xdr_buf. * @@ -150,11 +177,10 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos, int page_base; struct page **ppages; - if (pos == 0 && xdrbuf->head[0].iov_len) { - seg[n].mr_page = NULL; - seg[n].mr_offset = xdrbuf->head[0].iov_base; - seg[n].mr_len = xdrbuf->head[0].iov_len; - ++n; + if (pos == 0) { + n = rpcrdma_convert_kvec(&xdrbuf->head[0], seg, n, nsegs); + if (n == nsegs) + return -EIO; } len = xdrbuf->page_len; @@ -192,13 +218,9 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos, * xdr pad bytes, saving the server an RDMA operation. */ if (xdrbuf->tail[0].iov_len < 4 && xprt_rdma_pad_optimize) return n; + n = rpcrdma_convert_kvec(&xdrbuf->tail[0], seg, n, nsegs); if (n == nsegs) - /* Tail remains, but we're out of segments */ return -EIO; - seg[n].mr_page = NULL; - seg[n].mr_offset = xdrbuf->tail[0].iov_base; - seg[n].mr_len = xdrbuf->tail[0].iov_len; - ++n; } return n; @@ -773,20 +795,17 @@ rpcrdma_reply_handler(struct rpcrdma_rep *rep) struct rpcrdma_xprt *r_xprt = rep->rr_rxprt; struct rpc_xprt *xprt = &r_xprt->rx_xprt; __be32 *iptr; - int rdmalen, status; + int rdmalen, status, rmerr; unsigned long cwnd; - u32 credits; dprintk("RPC: %s: incoming rep %p\n", __func__, rep); if (rep->rr_len == RPCRDMA_BAD_LEN) goto out_badstatus; - if (rep->rr_len < RPCRDMA_HDRLEN_MIN) + if (rep->rr_len < RPCRDMA_HDRLEN_ERR) goto out_shortreply; headerp = rdmab_to_msg(rep->rr_rdmabuf); - if (headerp->rm_vers != rpcrdma_version) - goto out_badversion; #if defined(CONFIG_SUNRPC_BACKCHANNEL) if (rpcrdma_is_bcall(headerp)) goto out_bcall; @@ -809,15 +828,16 @@ rpcrdma_reply_handler(struct rpcrdma_rep *rep) */ list_del_init(&rqst->rq_list); spin_unlock_bh(&xprt->transport_lock); - dprintk("RPC: %s: reply 0x%p completes request 0x%p\n" - " RPC request 0x%p xid 0x%08x\n", - __func__, rep, req, rqst, - be32_to_cpu(headerp->rm_xid)); + dprintk("RPC: %s: reply %p completes request %p (xid 0x%08x)\n", + __func__, rep, req, be32_to_cpu(headerp->rm_xid)); /* from here on, the reply is no longer an orphan */ req->rl_reply = rep; xprt->reestablish_timeout = 0; + if (headerp->rm_vers != rpcrdma_version) + goto out_badversion; + /* check for expected message types */ /* The order of some of these tests is important. */ switch (headerp->rm_type) { @@ -878,6 +898,9 @@ rpcrdma_reply_handler(struct rpcrdma_rep *rep) status = rdmalen; break; + case rdma_error: + goto out_rdmaerr; + badheader: default: dprintk("%s: invalid rpcrdma reply header (type %d):" @@ -893,6 +916,7 @@ badheader: break; } +out: /* Invalidate and flush the data payloads before waking the * waiting application. This guarantees the memory region is * properly fenced from the server before the application @@ -903,15 +927,9 @@ badheader: if (req->rl_nchunks) r_xprt->rx_ia.ri_ops->ro_unmap_sync(r_xprt, req); - credits = be32_to_cpu(headerp->rm_credit); - if (credits == 0) - credits = 1; /* don't deadlock */ - else if (credits > r_xprt->rx_buf.rb_max_requests) - credits = r_xprt->rx_buf.rb_max_requests; - spin_lock_bh(&xprt->transport_lock); cwnd = xprt->cwnd; - xprt->cwnd = credits << RPC_CWNDSHIFT; + xprt->cwnd = atomic_read(&r_xprt->rx_buf.rb_credits) << RPC_CWNDSHIFT; if (xprt->cwnd > cwnd) xprt_release_rqst_cong(rqst->rq_task); @@ -935,13 +953,43 @@ out_bcall: return; #endif -out_shortreply: - dprintk("RPC: %s: short/invalid reply\n", __func__); - goto repost; - +/* If the incoming reply terminated a pending RPC, the next + * RPC call will post a replacement receive buffer as it is + * being marshaled. + */ out_badversion: dprintk("RPC: %s: invalid version %d\n", __func__, be32_to_cpu(headerp->rm_vers)); + status = -EIO; + r_xprt->rx_stats.bad_reply_count++; + goto out; + +out_rdmaerr: + rmerr = be32_to_cpu(headerp->rm_body.rm_error.rm_err); + switch (rmerr) { + case ERR_VERS: + pr_err("%s: server reports header version error (%u-%u)\n", + __func__, + be32_to_cpu(headerp->rm_body.rm_error.rm_vers_low), + be32_to_cpu(headerp->rm_body.rm_error.rm_vers_high)); + break; + case ERR_CHUNK: + pr_err("%s: server reports header decoding error\n", + __func__); + break; + default: + pr_err("%s: server reports unknown error %d\n", + __func__, rmerr); + } + status = -EREMOTEIO; + r_xprt->rx_stats.bad_reply_count++; + goto out; + +/* If no pending RPC transaction was matched, post a replacement + * receive buffer before returning. + */ +out_shortreply: + dprintk("RPC: %s: short/invalid reply\n", __func__); goto repost; out_nomatch: diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c index 65a7c232a..a2a7519b0 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c +++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c @@ -107,26 +107,18 @@ static int svc_rdma_bc_sendto(struct svcxprt_rdma *rdma, int ret; vec = svc_rdma_get_req_map(rdma); - ret = svc_rdma_map_xdr(rdma, sndbuf, vec); + ret = svc_rdma_map_xdr(rdma, sndbuf, vec, false); if (ret) goto out_err; - /* Post a recv buffer to handle the reply for this request. */ - ret = svc_rdma_post_recv(rdma, GFP_NOIO); - if (ret) { - pr_err("svcrdma: Failed to post bc receive buffer, err=%d.\n", - ret); - pr_err("svcrdma: closing transport %p.\n", rdma); - set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags); - ret = -ENOTCONN; + ret = svc_rdma_repost_recv(rdma, GFP_NOIO); + if (ret) goto out_err; - } ctxt = svc_rdma_get_context(rdma); ctxt->pages[0] = virt_to_page(rqst->rq_buffer); ctxt->count = 1; - ctxt->wr_op = IB_WR_SEND; ctxt->direction = DMA_TO_DEVICE; ctxt->sge[0].lkey = rdma->sc_pd->local_dma_lkey; ctxt->sge[0].length = sndbuf->len; @@ -140,7 +132,8 @@ static int svc_rdma_bc_sendto(struct svcxprt_rdma *rdma, atomic_inc(&rdma->sc_dma_used); memset(&send_wr, 0, sizeof(send_wr)); - send_wr.wr_id = (unsigned long)ctxt; + ctxt->cqe.done = svc_rdma_wc_send; + send_wr.wr_cqe = &ctxt->cqe; send_wr.sg_list = ctxt->sge; send_wr.num_sge = 1; send_wr.opcode = IB_WR_SEND; diff --git a/net/sunrpc/xprtrdma/svc_rdma_marshal.c b/net/sunrpc/xprtrdma/svc_rdma_marshal.c index e2fca7617..765bca47c 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_marshal.c +++ b/net/sunrpc/xprtrdma/svc_rdma_marshal.c @@ -145,29 +145,44 @@ static __be32 *decode_reply_array(__be32 *va, __be32 *vaend) return (__be32 *)&ary->wc_array[nchunks]; } -int svc_rdma_xdr_decode_req(struct rpcrdma_msg **rdma_req, - struct svc_rqst *rqstp) +int svc_rdma_xdr_decode_req(struct rpcrdma_msg *rmsgp, struct svc_rqst *rqstp) { - struct rpcrdma_msg *rmsgp = NULL; __be32 *va, *vaend; + unsigned int len; u32 hdr_len; - rmsgp = (struct rpcrdma_msg *)rqstp->rq_arg.head[0].iov_base; - /* Verify that there's enough bytes for header + something */ - if (rqstp->rq_arg.len <= RPCRDMA_HDRLEN_MIN) { + if (rqstp->rq_arg.len <= RPCRDMA_HDRLEN_ERR) { dprintk("svcrdma: header too short = %d\n", rqstp->rq_arg.len); return -EINVAL; } - if (rmsgp->rm_vers != rpcrdma_version) - return -ENOSYS; - - /* Pull in the extra for the padded case and bump our pointer */ - if (rmsgp->rm_type == rdma_msgp) { - int hdrlen; + if (rmsgp->rm_vers != rpcrdma_version) { + dprintk("%s: bad version %u\n", __func__, + be32_to_cpu(rmsgp->rm_vers)); + return -EPROTONOSUPPORT; + } + switch (be32_to_cpu(rmsgp->rm_type)) { + case RDMA_MSG: + case RDMA_NOMSG: + break; + + case RDMA_DONE: + /* Just drop it */ + dprintk("svcrdma: dropping RDMA_DONE message\n"); + return 0; + + case RDMA_ERROR: + /* Possible if this is a backchannel reply. + * XXX: We should cancel this XID, though. + */ + dprintk("svcrdma: dropping RDMA_ERROR message\n"); + return 0; + + case RDMA_MSGP: + /* Pull in the extra for the padded case, bump our pointer */ rmsgp->rm_body.rm_padded.rm_align = be32_to_cpu(rmsgp->rm_body.rm_padded.rm_align); rmsgp->rm_body.rm_padded.rm_thresh = @@ -175,11 +190,15 @@ int svc_rdma_xdr_decode_req(struct rpcrdma_msg **rdma_req, va = &rmsgp->rm_body.rm_padded.rm_pempty[4]; rqstp->rq_arg.head[0].iov_base = va; - hdrlen = (u32)((unsigned long)va - (unsigned long)rmsgp); - rqstp->rq_arg.head[0].iov_len -= hdrlen; - if (hdrlen > rqstp->rq_arg.len) + len = (u32)((unsigned long)va - (unsigned long)rmsgp); + rqstp->rq_arg.head[0].iov_len -= len; + if (len > rqstp->rq_arg.len) return -EINVAL; - return hdrlen; + return len; + default: + dprintk("svcrdma: bad rdma procedure (%u)\n", + be32_to_cpu(rmsgp->rm_type)); + return -EINVAL; } /* The chunk list may contain either a read chunk list or a write @@ -188,20 +207,25 @@ int svc_rdma_xdr_decode_req(struct rpcrdma_msg **rdma_req, va = &rmsgp->rm_body.rm_chunks[0]; vaend = (__be32 *)((unsigned long)rmsgp + rqstp->rq_arg.len); va = decode_read_list(va, vaend); - if (!va) + if (!va) { + dprintk("svcrdma: failed to decode read list\n"); return -EINVAL; + } va = decode_write_list(va, vaend); - if (!va) + if (!va) { + dprintk("svcrdma: failed to decode write list\n"); return -EINVAL; + } va = decode_reply_array(va, vaend); - if (!va) + if (!va) { + dprintk("svcrdma: failed to decode reply chunk\n"); return -EINVAL; + } rqstp->rq_arg.head[0].iov_base = va; hdr_len = (unsigned long)va - (unsigned long)rmsgp; rqstp->rq_arg.head[0].iov_len -= hdr_len; - *rdma_req = rmsgp; return hdr_len; } diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index c8b8a8b41..3b24a646e 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -180,9 +180,9 @@ int rdma_read_chunk_lcl(struct svcxprt_rdma *xprt, clear_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags); memset(&read_wr, 0, sizeof(read_wr)); - read_wr.wr.wr_id = (unsigned long)ctxt; + ctxt->cqe.done = svc_rdma_wc_read; + read_wr.wr.wr_cqe = &ctxt->cqe; read_wr.wr.opcode = IB_WR_RDMA_READ; - ctxt->wr_op = read_wr.wr.opcode; read_wr.wr.send_flags = IB_SEND_SIGNALED; read_wr.rkey = rs_handle; read_wr.remote_addr = rs_offset; @@ -299,8 +299,9 @@ int rdma_read_chunk_frmr(struct svcxprt_rdma *xprt, ctxt->read_hdr = head; /* Prepare REG WR */ + ctxt->reg_cqe.done = svc_rdma_wc_reg; + reg_wr.wr.wr_cqe = &ctxt->reg_cqe; reg_wr.wr.opcode = IB_WR_REG_MR; - reg_wr.wr.wr_id = 0; reg_wr.wr.send_flags = IB_SEND_SIGNALED; reg_wr.wr.num_sge = 0; reg_wr.mr = frmr->mr; @@ -310,6 +311,8 @@ int rdma_read_chunk_frmr(struct svcxprt_rdma *xprt, /* Prepare RDMA_READ */ memset(&read_wr, 0, sizeof(read_wr)); + ctxt->cqe.done = svc_rdma_wc_read; + read_wr.wr.wr_cqe = &ctxt->cqe; read_wr.wr.send_flags = IB_SEND_SIGNALED; read_wr.rkey = rs_handle; read_wr.remote_addr = rs_offset; @@ -317,19 +320,18 @@ int rdma_read_chunk_frmr(struct svcxprt_rdma *xprt, read_wr.wr.num_sge = 1; if (xprt->sc_dev_caps & SVCRDMA_DEVCAP_READ_W_INV) { read_wr.wr.opcode = IB_WR_RDMA_READ_WITH_INV; - read_wr.wr.wr_id = (unsigned long)ctxt; read_wr.wr.ex.invalidate_rkey = ctxt->frmr->mr->lkey; } else { read_wr.wr.opcode = IB_WR_RDMA_READ; read_wr.wr.next = &inv_wr; /* Prepare invalidate */ memset(&inv_wr, 0, sizeof(inv_wr)); - inv_wr.wr_id = (unsigned long)ctxt; + ctxt->inv_cqe.done = svc_rdma_wc_inv; + inv_wr.wr_cqe = &ctxt->inv_cqe; inv_wr.opcode = IB_WR_LOCAL_INV; inv_wr.send_flags = IB_SEND_SIGNALED | IB_SEND_FENCE; inv_wr.ex.invalidate_rkey = frmr->mr->lkey; } - ctxt->wr_op = read_wr.wr.opcode; /* Post the chain */ ret = svc_rdma_send(xprt, ®_wr.wr); @@ -612,7 +614,6 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) struct svc_rdma_op_ctxt *ctxt = NULL; struct rpcrdma_msg *rmsgp; int ret = 0; - int len; dprintk("svcrdma: rqstp=%p\n", rqstp); @@ -642,8 +643,7 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) * transport list */ if (test_bit(XPT_CLOSE, &xprt->xpt_flags)) - goto close_out; - + goto defer; goto out; } dprintk("svcrdma: processing ctxt=%p on xprt=%p, rqstp=%p, status=%d\n", @@ -654,15 +654,13 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) rdma_build_arg_xdr(rqstp, ctxt, ctxt->byte_len); /* Decode the RDMA header. */ - len = svc_rdma_xdr_decode_req(&rmsgp, rqstp); - rqstp->rq_xprt_hlen = len; - - /* If the request is invalid, reply with an error */ - if (len < 0) { - if (len == -ENOSYS) - svc_rdma_send_error(rdma_xprt, rmsgp, ERR_VERS); - goto close_out; - } + rmsgp = (struct rpcrdma_msg *)rqstp->rq_arg.head[0].iov_base; + ret = svc_rdma_xdr_decode_req(rmsgp, rqstp); + if (ret < 0) + goto out_err; + if (ret == 0) + goto out_drop; + rqstp->rq_xprt_hlen = ret; if (svc_rdma_is_backchannel_reply(xprt, rmsgp)) { ret = svc_rdma_handle_bc_reply(xprt->xpt_bc_xprt, rmsgp, @@ -698,26 +696,16 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) svc_xprt_copy_addrs(rqstp, xprt); return ret; - close_out: - if (ctxt) - svc_rdma_put_context(ctxt, 1); - dprintk("svcrdma: transport %p is closing\n", xprt); - /* - * Set the close bit and enqueue it. svc_recv will see the - * close bit and call svc_xprt_delete - */ - set_bit(XPT_CLOSE, &xprt->xpt_flags); +out_err: + svc_rdma_send_error(rdma_xprt, rmsgp, ret); + svc_rdma_put_context(ctxt, 0); + return 0; + defer: return 0; +out_drop: + svc_rdma_put_context(ctxt, 1); repost: - ret = svc_rdma_post_recv(rdma_xprt, GFP_KERNEL); - if (ret) { - pr_err("svcrdma: could not post a receive buffer, err=%d.\n", - ret); - pr_err("svcrdma: closing transport %p.\n", rdma_xprt); - set_bit(XPT_CLOSE, &rdma_xprt->sc_xprt.xpt_flags); - ret = -ENOTCONN; - } - return ret; + return svc_rdma_repost_recv(rdma_xprt, GFP_KERNEL); } diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index df57f3ce6..4f1b1c4f4 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -50,9 +50,15 @@ #define RPCDBG_FACILITY RPCDBG_SVCXPRT +static u32 xdr_padsize(u32 len) +{ + return (len & 3) ? (4 - (len & 3)) : 0; +} + int svc_rdma_map_xdr(struct svcxprt_rdma *xprt, struct xdr_buf *xdr, - struct svc_rdma_req_map *vec) + struct svc_rdma_req_map *vec, + bool write_chunk_present) { int sge_no; u32 sge_bytes; @@ -92,9 +98,20 @@ int svc_rdma_map_xdr(struct svcxprt_rdma *xprt, /* Tail SGE */ if (xdr->tail[0].iov_len) { - vec->sge[sge_no].iov_base = xdr->tail[0].iov_base; - vec->sge[sge_no].iov_len = xdr->tail[0].iov_len; - sge_no++; + unsigned char *base = xdr->tail[0].iov_base; + size_t len = xdr->tail[0].iov_len; + u32 xdr_pad = xdr_padsize(xdr->page_len); + + if (write_chunk_present && xdr_pad) { + base += xdr_pad; + len -= xdr_pad; + } + + if (len) { + vec->sge[sge_no].iov_base = base; + vec->sge[sge_no].iov_len = len; + sge_no++; + } } dprintk("svcrdma: %s: sge_no %d page_no %d " @@ -166,10 +183,10 @@ svc_rdma_get_write_array(struct rpcrdma_msg *rmsgp) * reply array is present */ static struct rpcrdma_write_array * -svc_rdma_get_reply_array(struct rpcrdma_msg *rmsgp) +svc_rdma_get_reply_array(struct rpcrdma_msg *rmsgp, + struct rpcrdma_write_array *wr_ary) { struct rpcrdma_read_chunk *rch; - struct rpcrdma_write_array *wr_ary; struct rpcrdma_write_array *rp_ary; /* XXX: Need to fix when reply chunk may occur with read list @@ -191,7 +208,6 @@ svc_rdma_get_reply_array(struct rpcrdma_msg *rmsgp) goto found_it; } - wr_ary = svc_rdma_get_write_array(rmsgp); if (wr_ary) { int chunk = be32_to_cpu(wr_ary->wc_nchunks); @@ -281,8 +297,8 @@ static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp, /* Prepare WRITE WR */ memset(&write_wr, 0, sizeof write_wr); - ctxt->wr_op = IB_WR_RDMA_WRITE; - write_wr.wr.wr_id = (unsigned long)ctxt; + ctxt->cqe.done = svc_rdma_wc_write; + write_wr.wr.wr_cqe = &ctxt->cqe; write_wr.wr.sg_list = &sge[0]; write_wr.wr.num_sge = sge_no; write_wr.wr.opcode = IB_WR_RDMA_WRITE; @@ -298,41 +314,37 @@ static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp, err: svc_rdma_unmap_dma(ctxt); svc_rdma_put_context(ctxt, 0); - /* Fatal error, close transport */ return -EIO; } +noinline static int send_write_chunks(struct svcxprt_rdma *xprt, - struct rpcrdma_msg *rdma_argp, + struct rpcrdma_write_array *wr_ary, struct rpcrdma_msg *rdma_resp, struct svc_rqst *rqstp, struct svc_rdma_req_map *vec) { - u32 xfer_len = rqstp->rq_res.page_len + rqstp->rq_res.tail[0].iov_len; + u32 xfer_len = rqstp->rq_res.page_len; int write_len; u32 xdr_off; int chunk_off; int chunk_no; int nchunks; - struct rpcrdma_write_array *arg_ary; struct rpcrdma_write_array *res_ary; int ret; - arg_ary = svc_rdma_get_write_array(rdma_argp); - if (!arg_ary) - return 0; res_ary = (struct rpcrdma_write_array *) &rdma_resp->rm_body.rm_chunks[1]; /* Write chunks start at the pagelist */ - nchunks = be32_to_cpu(arg_ary->wc_nchunks); + nchunks = be32_to_cpu(wr_ary->wc_nchunks); for (xdr_off = rqstp->rq_res.head[0].iov_len, chunk_no = 0; xfer_len && chunk_no < nchunks; chunk_no++) { struct rpcrdma_segment *arg_ch; u64 rs_offset; - arg_ch = &arg_ary->wc_array[chunk_no].wc_target; + arg_ch = &wr_ary->wc_array[chunk_no].wc_target; write_len = min(xfer_len, be32_to_cpu(arg_ch->rs_length)); /* Prepare the response chunk given the length actually @@ -350,11 +362,8 @@ static int send_write_chunks(struct svcxprt_rdma *xprt, xdr_off, write_len, vec); - if (ret <= 0) { - dprintk("svcrdma: RDMA_WRITE failed, ret=%d\n", - ret); - return -EIO; - } + if (ret <= 0) + goto out_err; chunk_off += ret; xdr_off += ret; xfer_len -= ret; @@ -364,11 +373,16 @@ static int send_write_chunks(struct svcxprt_rdma *xprt, /* Update the req with the number of chunks actually used */ svc_rdma_xdr_encode_write_list(rdma_resp, chunk_no); - return rqstp->rq_res.page_len + rqstp->rq_res.tail[0].iov_len; + return rqstp->rq_res.page_len; + +out_err: + pr_err("svcrdma: failed to send write chunks, rc=%d\n", ret); + return -EIO; } +noinline static int send_reply_chunks(struct svcxprt_rdma *xprt, - struct rpcrdma_msg *rdma_argp, + struct rpcrdma_write_array *rp_ary, struct rpcrdma_msg *rdma_resp, struct svc_rqst *rqstp, struct svc_rdma_req_map *vec) @@ -380,25 +394,21 @@ static int send_reply_chunks(struct svcxprt_rdma *xprt, int chunk_off; int nchunks; struct rpcrdma_segment *ch; - struct rpcrdma_write_array *arg_ary; struct rpcrdma_write_array *res_ary; int ret; - arg_ary = svc_rdma_get_reply_array(rdma_argp); - if (!arg_ary) - return 0; /* XXX: need to fix when reply lists occur with read-list and or * write-list */ res_ary = (struct rpcrdma_write_array *) &rdma_resp->rm_body.rm_chunks[2]; /* xdr offset starts at RPC message */ - nchunks = be32_to_cpu(arg_ary->wc_nchunks); + nchunks = be32_to_cpu(rp_ary->wc_nchunks); for (xdr_off = 0, chunk_no = 0; xfer_len && chunk_no < nchunks; chunk_no++) { u64 rs_offset; - ch = &arg_ary->wc_array[chunk_no].wc_target; + ch = &rp_ary->wc_array[chunk_no].wc_target; write_len = min(xfer_len, be32_to_cpu(ch->rs_length)); /* Prepare the reply chunk given the length actually @@ -415,11 +425,8 @@ static int send_reply_chunks(struct svcxprt_rdma *xprt, xdr_off, write_len, vec); - if (ret <= 0) { - dprintk("svcrdma: RDMA_WRITE failed, ret=%d\n", - ret); - return -EIO; - } + if (ret <= 0) + goto out_err; chunk_off += ret; xdr_off += ret; xfer_len -= ret; @@ -430,6 +437,10 @@ static int send_reply_chunks(struct svcxprt_rdma *xprt, svc_rdma_xdr_encode_reply_array(res_ary, chunk_no); return rqstp->rq_res.len; + +out_err: + pr_err("svcrdma: failed to send reply chunks, rc=%d\n", ret); + return -EIO; } /* This function prepares the portion of the RPCRDMA message to be @@ -464,13 +475,8 @@ static int send_reply(struct svcxprt_rdma *rdma, int pages; int ret; - /* Post a recv buffer to handle another request. */ - ret = svc_rdma_post_recv(rdma, GFP_KERNEL); + ret = svc_rdma_repost_recv(rdma, GFP_KERNEL); if (ret) { - printk(KERN_INFO - "svcrdma: could not post a receive buffer, err=%d." - "Closing transport %p.\n", ret, rdma); - set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags); svc_rdma_put_context(ctxt, 0); return -ENOTCONN; } @@ -543,8 +549,8 @@ static int send_reply(struct svcxprt_rdma *rdma, goto err; } memset(&send_wr, 0, sizeof send_wr); - ctxt->wr_op = IB_WR_SEND; - send_wr.wr_id = (unsigned long)ctxt; + ctxt->cqe.done = svc_rdma_wc_send; + send_wr.wr_cqe = &ctxt->cqe; send_wr.sg_list = ctxt->sge; send_wr.num_sge = sge_no; send_wr.opcode = IB_WR_SEND; @@ -559,6 +565,7 @@ static int send_reply(struct svcxprt_rdma *rdma, err: svc_rdma_unmap_dma(ctxt); svc_rdma_put_context(ctxt, 1); + pr_err("svcrdma: failed to send reply, rc=%d\n", ret); return -EIO; } @@ -573,7 +580,7 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) container_of(xprt, struct svcxprt_rdma, sc_xprt); struct rpcrdma_msg *rdma_argp; struct rpcrdma_msg *rdma_resp; - struct rpcrdma_write_array *reply_ary; + struct rpcrdma_write_array *wr_ary, *rp_ary; enum rpcrdma_proc reply_type; int ret; int inline_bytes; @@ -587,12 +594,14 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) * places this at the start of page 0. */ rdma_argp = page_address(rqstp->rq_pages[0]); + wr_ary = svc_rdma_get_write_array(rdma_argp); + rp_ary = svc_rdma_get_reply_array(rdma_argp, wr_ary); /* Build an req vec for the XDR */ ctxt = svc_rdma_get_context(rdma); ctxt->direction = DMA_TO_DEVICE; vec = svc_rdma_get_req_map(rdma); - ret = svc_rdma_map_xdr(rdma, &rqstp->rq_res, vec); + ret = svc_rdma_map_xdr(rdma, &rqstp->rq_res, vec, wr_ary != NULL); if (ret) goto err0; inline_bytes = rqstp->rq_res.len; @@ -603,8 +612,7 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) if (!res_page) goto err0; rdma_resp = page_address(res_page); - reply_ary = svc_rdma_get_reply_array(rdma_argp); - if (reply_ary) + if (rp_ary) reply_type = RDMA_NOMSG; else reply_type = RDMA_MSG; @@ -612,27 +620,26 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) rdma_resp, reply_type); /* Send any write-chunk data and build resp write-list */ - ret = send_write_chunks(rdma, rdma_argp, rdma_resp, - rqstp, vec); - if (ret < 0) { - printk(KERN_ERR "svcrdma: failed to send write chunks, rc=%d\n", - ret); - goto err1; + if (wr_ary) { + ret = send_write_chunks(rdma, wr_ary, rdma_resp, rqstp, vec); + if (ret < 0) + goto err1; + inline_bytes -= ret + xdr_padsize(ret); } - inline_bytes -= ret; /* Send any reply-list data and update resp reply-list */ - ret = send_reply_chunks(rdma, rdma_argp, rdma_resp, - rqstp, vec); - if (ret < 0) { - printk(KERN_ERR "svcrdma: failed to send reply chunks, rc=%d\n", - ret); - goto err1; + if (rp_ary) { + ret = send_reply_chunks(rdma, rp_ary, rdma_resp, rqstp, vec); + if (ret < 0) + goto err1; + inline_bytes -= ret; } - inline_bytes -= ret; ret = send_reply(rdma, rqstp, res_page, rdma_resp, ctxt, vec, inline_bytes); + if (ret < 0) + goto err1; + svc_rdma_put_req_map(rdma, vec); dprintk("svcrdma: send_reply returns %d\n", ret); return ret; @@ -642,5 +649,68 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) err0: svc_rdma_put_req_map(rdma, vec); svc_rdma_put_context(ctxt, 0); - return ret; + set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags); + return -ENOTCONN; +} + +void svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp, + int status) +{ + struct ib_send_wr err_wr; + struct page *p; + struct svc_rdma_op_ctxt *ctxt; + enum rpcrdma_errcode err; + __be32 *va; + int length; + int ret; + + ret = svc_rdma_repost_recv(xprt, GFP_KERNEL); + if (ret) + return; + + p = alloc_page(GFP_KERNEL); + if (!p) + return; + va = page_address(p); + + /* XDR encode an error reply */ + err = ERR_CHUNK; + if (status == -EPROTONOSUPPORT) + err = ERR_VERS; + length = svc_rdma_xdr_encode_error(xprt, rmsgp, err, va); + + ctxt = svc_rdma_get_context(xprt); + ctxt->direction = DMA_TO_DEVICE; + ctxt->count = 1; + ctxt->pages[0] = p; + + /* Prepare SGE for local address */ + ctxt->sge[0].lkey = xprt->sc_pd->local_dma_lkey; + ctxt->sge[0].length = length; + ctxt->sge[0].addr = ib_dma_map_page(xprt->sc_cm_id->device, + p, 0, length, DMA_TO_DEVICE); + if (ib_dma_mapping_error(xprt->sc_cm_id->device, ctxt->sge[0].addr)) { + dprintk("svcrdma: Error mapping buffer for protocol error\n"); + svc_rdma_put_context(ctxt, 1); + return; + } + atomic_inc(&xprt->sc_dma_used); + + /* Prepare SEND WR */ + memset(&err_wr, 0, sizeof(err_wr)); + ctxt->cqe.done = svc_rdma_wc_send; + err_wr.wr_cqe = &ctxt->cqe; + err_wr.sg_list = ctxt->sge; + err_wr.num_sge = 1; + err_wr.opcode = IB_WR_SEND; + err_wr.send_flags = IB_SEND_SIGNALED; + + /* Post It */ + ret = svc_rdma_send(xprt, &err_wr); + if (ret) { + dprintk("svcrdma: Error %d posting send for protocol error\n", + ret); + svc_rdma_unmap_dma(ctxt); + svc_rdma_put_context(ctxt, 1); + } } diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 5763825d0..90668969d 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -63,17 +63,10 @@ static struct svc_xprt *svc_rdma_create(struct svc_serv *serv, int flags); static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt); static void svc_rdma_release_rqst(struct svc_rqst *); -static void dto_tasklet_func(unsigned long data); static void svc_rdma_detach(struct svc_xprt *xprt); static void svc_rdma_free(struct svc_xprt *xprt); static int svc_rdma_has_wspace(struct svc_xprt *xprt); static int svc_rdma_secure_port(struct svc_rqst *); -static void rq_cq_reap(struct svcxprt_rdma *xprt); -static void sq_cq_reap(struct svcxprt_rdma *xprt); - -static DECLARE_TASKLET(dto_tasklet, dto_tasklet_func, 0UL); -static DEFINE_SPINLOCK(dto_lock); -static LIST_HEAD(dto_xprt_q); static struct svc_xprt_ops svc_rdma_ops = { .xpo_create = svc_rdma_create, @@ -352,15 +345,6 @@ static void svc_rdma_destroy_maps(struct svcxprt_rdma *xprt) } } -/* ib_cq event handler */ -static void cq_event_handler(struct ib_event *event, void *context) -{ - struct svc_xprt *xprt = context; - dprintk("svcrdma: received CQ event %s (%d), context=%p\n", - ib_event_msg(event->event), event->event, context); - set_bit(XPT_CLOSE, &xprt->xpt_flags); -} - /* QP event handler */ static void qp_event_handler(struct ib_event *event, void *context) { @@ -392,251 +376,171 @@ static void qp_event_handler(struct ib_event *event, void *context) } } -/* - * Data Transfer Operation Tasklet +/** + * svc_rdma_wc_receive - Invoked by RDMA provider for each polled Receive WC + * @cq: completion queue + * @wc: completed WR * - * Walks a list of transports with I/O pending, removing entries as - * they are added to the server's I/O pending list. Two bits indicate - * if SQ, RQ, or both have I/O pending. The dto_lock is an irqsave - * spinlock that serializes access to the transport list with the RQ - * and SQ interrupt handlers. */ -static void dto_tasklet_func(unsigned long data) +static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc) { - struct svcxprt_rdma *xprt; - unsigned long flags; + struct svcxprt_rdma *xprt = cq->cq_context; + struct ib_cqe *cqe = wc->wr_cqe; + struct svc_rdma_op_ctxt *ctxt; - spin_lock_irqsave(&dto_lock, flags); - while (!list_empty(&dto_xprt_q)) { - xprt = list_entry(dto_xprt_q.next, - struct svcxprt_rdma, sc_dto_q); - list_del_init(&xprt->sc_dto_q); - spin_unlock_irqrestore(&dto_lock, flags); + /* WARNING: Only wc->wr_cqe and wc->status are reliable */ + ctxt = container_of(cqe, struct svc_rdma_op_ctxt, cqe); + ctxt->wc_status = wc->status; + svc_rdma_unmap_dma(ctxt); - rq_cq_reap(xprt); - sq_cq_reap(xprt); + if (wc->status != IB_WC_SUCCESS) + goto flushed; - svc_xprt_put(&xprt->sc_xprt); - spin_lock_irqsave(&dto_lock, flags); - } - spin_unlock_irqrestore(&dto_lock, flags); + /* All wc fields are now known to be valid */ + ctxt->byte_len = wc->byte_len; + spin_lock(&xprt->sc_rq_dto_lock); + list_add_tail(&ctxt->dto_q, &xprt->sc_rq_dto_q); + spin_unlock(&xprt->sc_rq_dto_lock); + + set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags); + if (test_bit(RDMAXPRT_CONN_PENDING, &xprt->sc_flags)) + goto out; + svc_xprt_enqueue(&xprt->sc_xprt); + goto out; + +flushed: + if (wc->status != IB_WC_WR_FLUSH_ERR) + pr_warn("svcrdma: receive: %s (%u/0x%x)\n", + ib_wc_status_msg(wc->status), + wc->status, wc->vendor_err); + set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags); + svc_rdma_put_context(ctxt, 1); + +out: + svc_xprt_put(&xprt->sc_xprt); } -/* - * Receive Queue Completion Handler - * - * Since an RQ completion handler is called on interrupt context, we - * need to defer the handling of the I/O to a tasklet - */ -static void rq_comp_handler(struct ib_cq *cq, void *cq_context) +static void svc_rdma_send_wc_common(struct svcxprt_rdma *xprt, + struct ib_wc *wc, + const char *opname) { - struct svcxprt_rdma *xprt = cq_context; - unsigned long flags; - - /* Guard against unconditional flush call for destroyed QP */ - if (atomic_read(&xprt->sc_xprt.xpt_ref.refcount)==0) - return; + if (wc->status != IB_WC_SUCCESS) + goto err; - /* - * Set the bit regardless of whether or not it's on the list - * because it may be on the list already due to an SQ - * completion. - */ - set_bit(RDMAXPRT_RQ_PENDING, &xprt->sc_flags); +out: + atomic_dec(&xprt->sc_sq_count); + wake_up(&xprt->sc_send_wait); + return; + +err: + set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags); + if (wc->status != IB_WC_WR_FLUSH_ERR) + pr_err("svcrdma: %s: %s (%u/0x%x)\n", + opname, ib_wc_status_msg(wc->status), + wc->status, wc->vendor_err); + goto out; +} - /* - * If this transport is not already on the DTO transport queue, - * add it - */ - spin_lock_irqsave(&dto_lock, flags); - if (list_empty(&xprt->sc_dto_q)) { - svc_xprt_get(&xprt->sc_xprt); - list_add_tail(&xprt->sc_dto_q, &dto_xprt_q); - } - spin_unlock_irqrestore(&dto_lock, flags); +static void svc_rdma_send_wc_common_put(struct ib_cq *cq, struct ib_wc *wc, + const char *opname) +{ + struct svcxprt_rdma *xprt = cq->cq_context; - /* Tasklet does all the work to avoid irqsave locks. */ - tasklet_schedule(&dto_tasklet); + svc_rdma_send_wc_common(xprt, wc, opname); + svc_xprt_put(&xprt->sc_xprt); } -/* - * rq_cq_reap - Process the RQ CQ. - * - * Take all completing WC off the CQE and enqueue the associated DTO - * context on the dto_q for the transport. +/** + * svc_rdma_wc_send - Invoked by RDMA provider for each polled Send WC + * @cq: completion queue + * @wc: completed WR * - * Note that caller must hold a transport reference. */ -static void rq_cq_reap(struct svcxprt_rdma *xprt) +void svc_rdma_wc_send(struct ib_cq *cq, struct ib_wc *wc) { - int ret; - struct ib_wc wc; - struct svc_rdma_op_ctxt *ctxt = NULL; + struct ib_cqe *cqe = wc->wr_cqe; + struct svc_rdma_op_ctxt *ctxt; - if (!test_and_clear_bit(RDMAXPRT_RQ_PENDING, &xprt->sc_flags)) - return; + svc_rdma_send_wc_common_put(cq, wc, "send"); - ib_req_notify_cq(xprt->sc_rq_cq, IB_CQ_NEXT_COMP); - atomic_inc(&rdma_stat_rq_poll); + ctxt = container_of(cqe, struct svc_rdma_op_ctxt, cqe); + svc_rdma_unmap_dma(ctxt); + svc_rdma_put_context(ctxt, 1); +} - while ((ret = ib_poll_cq(xprt->sc_rq_cq, 1, &wc)) > 0) { - ctxt = (struct svc_rdma_op_ctxt *)(unsigned long)wc.wr_id; - ctxt->wc_status = wc.status; - ctxt->byte_len = wc.byte_len; - svc_rdma_unmap_dma(ctxt); - if (wc.status != IB_WC_SUCCESS) { - /* Close the transport */ - dprintk("svcrdma: transport closing putting ctxt %p\n", ctxt); - set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags); - svc_rdma_put_context(ctxt, 1); - svc_xprt_put(&xprt->sc_xprt); - continue; - } - spin_lock_bh(&xprt->sc_rq_dto_lock); - list_add_tail(&ctxt->dto_q, &xprt->sc_rq_dto_q); - spin_unlock_bh(&xprt->sc_rq_dto_lock); - svc_xprt_put(&xprt->sc_xprt); - } +/** + * svc_rdma_wc_write - Invoked by RDMA provider for each polled Write WC + * @cq: completion queue + * @wc: completed WR + * + */ +void svc_rdma_wc_write(struct ib_cq *cq, struct ib_wc *wc) +{ + struct ib_cqe *cqe = wc->wr_cqe; + struct svc_rdma_op_ctxt *ctxt; - if (ctxt) - atomic_inc(&rdma_stat_rq_prod); + svc_rdma_send_wc_common_put(cq, wc, "write"); - set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags); - /* - * If data arrived before established event, - * don't enqueue. This defers RPC I/O until the - * RDMA connection is complete. - */ - if (!test_bit(RDMAXPRT_CONN_PENDING, &xprt->sc_flags)) - svc_xprt_enqueue(&xprt->sc_xprt); + ctxt = container_of(cqe, struct svc_rdma_op_ctxt, cqe); + svc_rdma_unmap_dma(ctxt); + svc_rdma_put_context(ctxt, 0); } -/* - * Process a completion context +/** + * svc_rdma_wc_reg - Invoked by RDMA provider for each polled FASTREG WC + * @cq: completion queue + * @wc: completed WR + * */ -static void process_context(struct svcxprt_rdma *xprt, - struct svc_rdma_op_ctxt *ctxt) +void svc_rdma_wc_reg(struct ib_cq *cq, struct ib_wc *wc) { - struct svc_rdma_op_ctxt *read_hdr; - int free_pages = 0; - - svc_rdma_unmap_dma(ctxt); + svc_rdma_send_wc_common_put(cq, wc, "fastreg"); +} - switch (ctxt->wr_op) { - case IB_WR_SEND: - free_pages = 1; - break; +/** + * svc_rdma_wc_read - Invoked by RDMA provider for each polled Read WC + * @cq: completion queue + * @wc: completed WR + * + */ +void svc_rdma_wc_read(struct ib_cq *cq, struct ib_wc *wc) +{ + struct svcxprt_rdma *xprt = cq->cq_context; + struct ib_cqe *cqe = wc->wr_cqe; + struct svc_rdma_op_ctxt *ctxt; - case IB_WR_RDMA_WRITE: - break; + svc_rdma_send_wc_common(xprt, wc, "read"); - case IB_WR_RDMA_READ: - case IB_WR_RDMA_READ_WITH_INV: - svc_rdma_put_frmr(xprt, ctxt->frmr); + ctxt = container_of(cqe, struct svc_rdma_op_ctxt, cqe); + svc_rdma_unmap_dma(ctxt); + svc_rdma_put_frmr(xprt, ctxt->frmr); - if (!test_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags)) - break; + if (test_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags)) { + struct svc_rdma_op_ctxt *read_hdr; read_hdr = ctxt->read_hdr; - svc_rdma_put_context(ctxt, 0); - - spin_lock_bh(&xprt->sc_rq_dto_lock); - set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags); + spin_lock(&xprt->sc_rq_dto_lock); list_add_tail(&read_hdr->dto_q, &xprt->sc_read_complete_q); - spin_unlock_bh(&xprt->sc_rq_dto_lock); - svc_xprt_enqueue(&xprt->sc_xprt); - return; + spin_unlock(&xprt->sc_rq_dto_lock); - default: - dprintk("svcrdma: unexpected completion opcode=%d\n", - ctxt->wr_op); - break; + set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags); + svc_xprt_enqueue(&xprt->sc_xprt); } - svc_rdma_put_context(ctxt, free_pages); + svc_rdma_put_context(ctxt, 0); + svc_xprt_put(&xprt->sc_xprt); } -/* - * Send Queue Completion Handler - potentially called on interrupt context. +/** + * svc_rdma_wc_inv - Invoked by RDMA provider for each polled LOCAL_INV WC + * @cq: completion queue + * @wc: completed WR * - * Note that caller must hold a transport reference. */ -static void sq_cq_reap(struct svcxprt_rdma *xprt) -{ - struct svc_rdma_op_ctxt *ctxt = NULL; - struct ib_wc wc_a[6]; - struct ib_wc *wc; - struct ib_cq *cq = xprt->sc_sq_cq; - int ret; - - memset(wc_a, 0, sizeof(wc_a)); - - if (!test_and_clear_bit(RDMAXPRT_SQ_PENDING, &xprt->sc_flags)) - return; - - ib_req_notify_cq(xprt->sc_sq_cq, IB_CQ_NEXT_COMP); - atomic_inc(&rdma_stat_sq_poll); - while ((ret = ib_poll_cq(cq, ARRAY_SIZE(wc_a), wc_a)) > 0) { - int i; - - for (i = 0; i < ret; i++) { - wc = &wc_a[i]; - if (wc->status != IB_WC_SUCCESS) { - dprintk("svcrdma: sq wc err status %s (%d)\n", - ib_wc_status_msg(wc->status), - wc->status); - - /* Close the transport */ - set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags); - } - - /* Decrement used SQ WR count */ - atomic_dec(&xprt->sc_sq_count); - wake_up(&xprt->sc_send_wait); - - ctxt = (struct svc_rdma_op_ctxt *) - (unsigned long)wc->wr_id; - if (ctxt) - process_context(xprt, ctxt); - - svc_xprt_put(&xprt->sc_xprt); - } - } - - if (ctxt) - atomic_inc(&rdma_stat_sq_prod); -} - -static void sq_comp_handler(struct ib_cq *cq, void *cq_context) +void svc_rdma_wc_inv(struct ib_cq *cq, struct ib_wc *wc) { - struct svcxprt_rdma *xprt = cq_context; - unsigned long flags; - - /* Guard against unconditional flush call for destroyed QP */ - if (atomic_read(&xprt->sc_xprt.xpt_ref.refcount)==0) - return; - - /* - * Set the bit regardless of whether or not it's on the list - * because it may be on the list already due to an RQ - * completion. - */ - set_bit(RDMAXPRT_SQ_PENDING, &xprt->sc_flags); - - /* - * If this transport is not already on the DTO transport queue, - * add it - */ - spin_lock_irqsave(&dto_lock, flags); - if (list_empty(&xprt->sc_dto_q)) { - svc_xprt_get(&xprt->sc_xprt); - list_add_tail(&xprt->sc_dto_q, &dto_xprt_q); - } - spin_unlock_irqrestore(&dto_lock, flags); - - /* Tasklet does all the work to avoid irqsave locks. */ - tasklet_schedule(&dto_tasklet); + svc_rdma_send_wc_common_put(cq, wc, "localInv"); } static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv, @@ -681,6 +585,7 @@ int svc_rdma_post_recv(struct svcxprt_rdma *xprt, gfp_t flags) ctxt = svc_rdma_get_context(xprt); buflen = 0; ctxt->direction = DMA_FROM_DEVICE; + ctxt->cqe.done = svc_rdma_wc_receive; for (sge_no = 0; buflen < xprt->sc_max_req_size; sge_no++) { if (sge_no >= xprt->sc_max_sge) { pr_err("svcrdma: Too many sges (%d)\n", sge_no); @@ -705,7 +610,7 @@ int svc_rdma_post_recv(struct svcxprt_rdma *xprt, gfp_t flags) recv_wr.next = NULL; recv_wr.sg_list = &ctxt->sge[0]; recv_wr.num_sge = ctxt->count; - recv_wr.wr_id = (u64)(unsigned long)ctxt; + recv_wr.wr_cqe = &ctxt->cqe; svc_xprt_get(&xprt->sc_xprt); ret = ib_post_recv(xprt->sc_qp, &recv_wr, &bad_recv_wr); @@ -722,6 +627,21 @@ int svc_rdma_post_recv(struct svcxprt_rdma *xprt, gfp_t flags) return -ENOMEM; } +int svc_rdma_repost_recv(struct svcxprt_rdma *xprt, gfp_t flags) +{ + int ret = 0; + + ret = svc_rdma_post_recv(xprt, flags); + if (ret) { + pr_err("svcrdma: could not post a receive buffer, err=%d.\n", + ret); + pr_err("svcrdma: closing transport %p.\n", xprt); + set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags); + ret = -ENOTCONN; + } + return ret; +} + /* * This function handles the CONNECT_REQUEST event on a listening * endpoint. It is passed the cma_id for the _new_ connection. The context in @@ -1011,7 +931,6 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) struct svcxprt_rdma *listen_rdma; struct svcxprt_rdma *newxprt = NULL; struct rdma_conn_param conn_param; - struct ib_cq_init_attr cq_attr = {}; struct ib_qp_init_attr qp_attr; struct ib_device *dev; unsigned int i; @@ -1069,22 +988,14 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) dprintk("svcrdma: error creating PD for connect request\n"); goto errout; } - cq_attr.cqe = newxprt->sc_sq_depth; - newxprt->sc_sq_cq = ib_create_cq(dev, - sq_comp_handler, - cq_event_handler, - newxprt, - &cq_attr); + newxprt->sc_sq_cq = ib_alloc_cq(dev, newxprt, newxprt->sc_sq_depth, + 0, IB_POLL_SOFTIRQ); if (IS_ERR(newxprt->sc_sq_cq)) { dprintk("svcrdma: error creating SQ CQ for connect request\n"); goto errout; } - cq_attr.cqe = newxprt->sc_rq_depth; - newxprt->sc_rq_cq = ib_create_cq(dev, - rq_comp_handler, - cq_event_handler, - newxprt, - &cq_attr); + newxprt->sc_rq_cq = ib_alloc_cq(dev, newxprt, newxprt->sc_rq_depth, + 0, IB_POLL_SOFTIRQ); if (IS_ERR(newxprt->sc_rq_cq)) { dprintk("svcrdma: error creating RQ CQ for connect request\n"); goto errout; @@ -1173,13 +1084,6 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) /* Swap out the handler */ newxprt->sc_cm_id->event_handler = rdma_cma_handler; - /* - * Arm the CQs for the SQ and RQ before accepting so we can't - * miss the first message - */ - ib_req_notify_cq(newxprt->sc_sq_cq, IB_CQ_NEXT_COMP); - ib_req_notify_cq(newxprt->sc_rq_cq, IB_CQ_NEXT_COMP); - /* Accept Connection */ set_bit(RDMAXPRT_CONN_PENDING, &newxprt->sc_flags); memset(&conn_param, 0, sizeof conn_param); @@ -1319,10 +1223,10 @@ static void __svc_rdma_free(struct work_struct *work) ib_destroy_qp(rdma->sc_qp); if (rdma->sc_sq_cq && !IS_ERR(rdma->sc_sq_cq)) - ib_destroy_cq(rdma->sc_sq_cq); + ib_free_cq(rdma->sc_sq_cq); if (rdma->sc_rq_cq && !IS_ERR(rdma->sc_rq_cq)) - ib_destroy_cq(rdma->sc_rq_cq); + ib_free_cq(rdma->sc_rq_cq); if (rdma->sc_pd && !IS_ERR(rdma->sc_pd)) ib_dealloc_pd(rdma->sc_pd); @@ -1383,9 +1287,6 @@ int svc_rdma_send(struct svcxprt_rdma *xprt, struct ib_send_wr *wr) spin_unlock_bh(&xprt->sc_lock); atomic_inc(&rdma_stat_sq_starve); - /* See if we can opportunistically reap SQ WR to make room */ - sq_cq_reap(xprt); - /* Wait until SQ WR available if SQ still full */ wait_event(xprt->sc_send_wait, atomic_read(&xprt->sc_sq_count) < @@ -1418,57 +1319,3 @@ int svc_rdma_send(struct svcxprt_rdma *xprt, struct ib_send_wr *wr) } return ret; } - -void svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp, - enum rpcrdma_errcode err) -{ - struct ib_send_wr err_wr; - struct page *p; - struct svc_rdma_op_ctxt *ctxt; - __be32 *va; - int length; - int ret; - - p = alloc_page(GFP_KERNEL); - if (!p) - return; - va = page_address(p); - - /* XDR encode error */ - length = svc_rdma_xdr_encode_error(xprt, rmsgp, err, va); - - ctxt = svc_rdma_get_context(xprt); - ctxt->direction = DMA_FROM_DEVICE; - ctxt->count = 1; - ctxt->pages[0] = p; - - /* Prepare SGE for local address */ - ctxt->sge[0].addr = ib_dma_map_page(xprt->sc_cm_id->device, - p, 0, length, DMA_FROM_DEVICE); - if (ib_dma_mapping_error(xprt->sc_cm_id->device, ctxt->sge[0].addr)) { - put_page(p); - svc_rdma_put_context(ctxt, 1); - return; - } - atomic_inc(&xprt->sc_dma_used); - ctxt->sge[0].lkey = xprt->sc_pd->local_dma_lkey; - ctxt->sge[0].length = length; - - /* Prepare SEND WR */ - memset(&err_wr, 0, sizeof err_wr); - ctxt->wr_op = IB_WR_SEND; - err_wr.wr_id = (unsigned long)ctxt; - err_wr.sg_list = ctxt->sge; - err_wr.num_sge = 1; - err_wr.opcode = IB_WR_SEND; - err_wr.send_flags = IB_SEND_SIGNALED; - - /* Post It */ - ret = svc_rdma_send(xprt, &err_wr); - if (ret) { - dprintk("svcrdma: Error %d posting send for protocol error\n", - ret); - svc_rdma_unmap_dma(ctxt); - svc_rdma_put_context(ctxt, 1); - } -} diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 878f1bfb1..f5ed9f982 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -112,89 +112,65 @@ rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context) } } +/** + * rpcrdma_wc_send - Invoked by RDMA provider for each polled Send WC + * @cq: completion queue (ignored) + * @wc: completed WR + * + */ static void -rpcrdma_cq_async_error_upcall(struct ib_event *event, void *context) +rpcrdma_wc_send(struct ib_cq *cq, struct ib_wc *wc) { - struct rpcrdma_ep *ep = context; - - pr_err("RPC: %s: %s on device %s ep %p\n", - __func__, ib_event_msg(event->event), - event->device->name, context); - if (ep->rep_connected == 1) { - ep->rep_connected = -EIO; - rpcrdma_conn_func(ep); - wake_up_all(&ep->rep_connect_wait); - } + /* WARNING: Only wr_cqe and status are reliable at this point */ + if (wc->status != IB_WC_SUCCESS && wc->status != IB_WC_WR_FLUSH_ERR) + pr_err("rpcrdma: Send: %s (%u/0x%x)\n", + ib_wc_status_msg(wc->status), + wc->status, wc->vendor_err); } static void -rpcrdma_sendcq_process_wc(struct ib_wc *wc) +rpcrdma_receive_worker(struct work_struct *work) { - /* WARNING: Only wr_id and status are reliable at this point */ - if (wc->wr_id == RPCRDMA_IGNORE_COMPLETION) { - if (wc->status != IB_WC_SUCCESS && - wc->status != IB_WC_WR_FLUSH_ERR) - pr_err("RPC: %s: SEND: %s\n", - __func__, ib_wc_status_msg(wc->status)); - } else { - struct rpcrdma_mw *r; + struct rpcrdma_rep *rep = + container_of(work, struct rpcrdma_rep, rr_work); - r = (struct rpcrdma_mw *)(unsigned long)wc->wr_id; - r->mw_sendcompletion(wc); - } + rpcrdma_reply_handler(rep); } -/* The common case is a single send completion is waiting. By - * passing two WC entries to ib_poll_cq, a return code of 1 - * means there is exactly one WC waiting and no more. We don't - * have to invoke ib_poll_cq again to know that the CQ has been - * properly drained. +/* Perform basic sanity checking to avoid using garbage + * to update the credit grant value. */ static void -rpcrdma_sendcq_poll(struct ib_cq *cq) +rpcrdma_update_granted_credits(struct rpcrdma_rep *rep) { - struct ib_wc *pos, wcs[2]; - int count, rc; + struct rpcrdma_msg *rmsgp = rdmab_to_msg(rep->rr_rdmabuf); + struct rpcrdma_buffer *buffer = &rep->rr_rxprt->rx_buf; + u32 credits; - do { - pos = wcs; + if (rep->rr_len < RPCRDMA_HDRLEN_ERR) + return; - rc = ib_poll_cq(cq, ARRAY_SIZE(wcs), pos); - if (rc < 0) - break; + credits = be32_to_cpu(rmsgp->rm_credit); + if (credits == 0) + credits = 1; /* don't deadlock */ + else if (credits > buffer->rb_max_requests) + credits = buffer->rb_max_requests; - count = rc; - while (count-- > 0) - rpcrdma_sendcq_process_wc(pos++); - } while (rc == ARRAY_SIZE(wcs)); - return; + atomic_set(&buffer->rb_credits, credits); } -/* Handle provider send completion upcalls. +/** + * rpcrdma_receive_wc - Invoked by RDMA provider for each polled Receive WC + * @cq: completion queue (ignored) + * @wc: completed WR + * */ static void -rpcrdma_sendcq_upcall(struct ib_cq *cq, void *cq_context) +rpcrdma_receive_wc(struct ib_cq *cq, struct ib_wc *wc) { - do { - rpcrdma_sendcq_poll(cq); - } while (ib_req_notify_cq(cq, IB_CQ_NEXT_COMP | - IB_CQ_REPORT_MISSED_EVENTS) > 0); -} - -static void -rpcrdma_receive_worker(struct work_struct *work) -{ - struct rpcrdma_rep *rep = - container_of(work, struct rpcrdma_rep, rr_work); - - rpcrdma_reply_handler(rep); -} - -static void -rpcrdma_recvcq_process_wc(struct ib_wc *wc) -{ - struct rpcrdma_rep *rep = - (struct rpcrdma_rep *)(unsigned long)wc->wr_id; + struct ib_cqe *cqe = wc->wr_cqe; + struct rpcrdma_rep *rep = container_of(cqe, struct rpcrdma_rep, + rr_cqe); /* WARNING: Only wr_id and status are reliable at this point */ if (wc->status != IB_WC_SUCCESS) @@ -211,7 +187,8 @@ rpcrdma_recvcq_process_wc(struct ib_wc *wc) ib_dma_sync_single_for_cpu(rep->rr_device, rdmab_addr(rep->rr_rdmabuf), rep->rr_len, DMA_FROM_DEVICE); - prefetch(rdmab_to_msg(rep->rr_rdmabuf)); + + rpcrdma_update_granted_credits(rep); out_schedule: queue_work(rpcrdma_receive_wq, &rep->rr_work); @@ -219,57 +196,20 @@ out_schedule: out_fail: if (wc->status != IB_WC_WR_FLUSH_ERR) - pr_err("RPC: %s: rep %p: %s\n", - __func__, rep, ib_wc_status_msg(wc->status)); + pr_err("rpcrdma: Recv: %s (%u/0x%x)\n", + ib_wc_status_msg(wc->status), + wc->status, wc->vendor_err); rep->rr_len = RPCRDMA_BAD_LEN; goto out_schedule; } -/* The wc array is on stack: automatic memory is always CPU-local. - * - * struct ib_wc is 64 bytes, making the poll array potentially - * large. But this is at the bottom of the call chain. Further - * substantial work is done in another thread. - */ -static void -rpcrdma_recvcq_poll(struct ib_cq *cq) -{ - struct ib_wc *pos, wcs[4]; - int count, rc; - - do { - pos = wcs; - - rc = ib_poll_cq(cq, ARRAY_SIZE(wcs), pos); - if (rc < 0) - break; - - count = rc; - while (count-- > 0) - rpcrdma_recvcq_process_wc(pos++); - } while (rc == ARRAY_SIZE(wcs)); -} - -/* Handle provider receive completion upcalls. - */ -static void -rpcrdma_recvcq_upcall(struct ib_cq *cq, void *cq_context) -{ - do { - rpcrdma_recvcq_poll(cq); - } while (ib_req_notify_cq(cq, IB_CQ_NEXT_COMP | - IB_CQ_REPORT_MISSED_EVENTS) > 0); -} - static void rpcrdma_flush_cqs(struct rpcrdma_ep *ep) { struct ib_wc wc; while (ib_poll_cq(ep->rep_attr.recv_cq, 1, &wc) > 0) - rpcrdma_recvcq_process_wc(&wc); - while (ib_poll_cq(ep->rep_attr.send_cq, 1, &wc) > 0) - rpcrdma_sendcq_process_wc(&wc); + rpcrdma_receive_wc(NULL, &wc); } static int @@ -330,6 +270,7 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event) connected: dprintk("RPC: %s: %sconnected\n", __func__, connstate > 0 ? "" : "dis"); + atomic_set(&xprt->rx_buf.rb_credits, 1); ep->rep_connected = connstate; rpcrdma_conn_func(ep); wake_up_all(&ep->rep_connect_wait); @@ -560,9 +501,8 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, struct rpcrdma_create_data_internal *cdata) { struct ib_cq *sendcq, *recvcq; - struct ib_cq_init_attr cq_attr = {}; unsigned int max_qp_wr; - int rc, err; + int rc; if (ia->ri_device->attrs.max_sge < RPCRDMA_MAX_IOVS) { dprintk("RPC: %s: insufficient sge's available\n", @@ -614,9 +554,9 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, init_waitqueue_head(&ep->rep_connect_wait); INIT_DELAYED_WORK(&ep->rep_connect_worker, rpcrdma_connect_worker); - cq_attr.cqe = ep->rep_attr.cap.max_send_wr + 1; - sendcq = ib_create_cq(ia->ri_device, rpcrdma_sendcq_upcall, - rpcrdma_cq_async_error_upcall, NULL, &cq_attr); + sendcq = ib_alloc_cq(ia->ri_device, NULL, + ep->rep_attr.cap.max_send_wr + 1, + 0, IB_POLL_SOFTIRQ); if (IS_ERR(sendcq)) { rc = PTR_ERR(sendcq); dprintk("RPC: %s: failed to create send CQ: %i\n", @@ -624,16 +564,9 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, goto out1; } - rc = ib_req_notify_cq(sendcq, IB_CQ_NEXT_COMP); - if (rc) { - dprintk("RPC: %s: ib_req_notify_cq failed: %i\n", - __func__, rc); - goto out2; - } - - cq_attr.cqe = ep->rep_attr.cap.max_recv_wr + 1; - recvcq = ib_create_cq(ia->ri_device, rpcrdma_recvcq_upcall, - rpcrdma_cq_async_error_upcall, NULL, &cq_attr); + recvcq = ib_alloc_cq(ia->ri_device, NULL, + ep->rep_attr.cap.max_recv_wr + 1, + 0, IB_POLL_SOFTIRQ); if (IS_ERR(recvcq)) { rc = PTR_ERR(recvcq); dprintk("RPC: %s: failed to create recv CQ: %i\n", @@ -641,14 +574,6 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, goto out2; } - rc = ib_req_notify_cq(recvcq, IB_CQ_NEXT_COMP); - if (rc) { - dprintk("RPC: %s: ib_req_notify_cq failed: %i\n", - __func__, rc); - ib_destroy_cq(recvcq); - goto out2; - } - ep->rep_attr.send_cq = sendcq; ep->rep_attr.recv_cq = recvcq; @@ -673,10 +598,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, return 0; out2: - err = ib_destroy_cq(sendcq); - if (err) - dprintk("RPC: %s: ib_destroy_cq returned %i\n", - __func__, err); + ib_free_cq(sendcq); out1: if (ia->ri_dma_mr) ib_dereg_mr(ia->ri_dma_mr); @@ -711,15 +633,8 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) ia->ri_id->qp = NULL; } - rc = ib_destroy_cq(ep->rep_attr.recv_cq); - if (rc) - dprintk("RPC: %s: ib_destroy_cq returned %i\n", - __func__, rc); - - rc = ib_destroy_cq(ep->rep_attr.send_cq); - if (rc) - dprintk("RPC: %s: ib_destroy_cq returned %i\n", - __func__, rc); + ib_free_cq(ep->rep_attr.recv_cq); + ib_free_cq(ep->rep_attr.send_cq); if (ia->ri_dma_mr) { rc = ib_dereg_mr(ia->ri_dma_mr); @@ -898,6 +813,7 @@ rpcrdma_create_req(struct rpcrdma_xprt *r_xprt) spin_lock(&buffer->rb_reqslock); list_add(&req->rl_all, &buffer->rb_allreqs); spin_unlock(&buffer->rb_reqslock); + req->rl_cqe.done = rpcrdma_wc_send; req->rl_buffer = &r_xprt->rx_buf; return req; } @@ -923,6 +839,7 @@ rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt) } rep->rr_device = ia->ri_device; + rep->rr_cqe.done = rpcrdma_receive_wc; rep->rr_rxprt = r_xprt; INIT_WORK(&rep->rr_work, rpcrdma_receive_worker); return rep; @@ -943,6 +860,7 @@ rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) buf->rb_max_requests = r_xprt->rx_data.max_requests; buf->rb_bc_srv_max_requests = 0; spin_lock_init(&buf->rb_lock); + atomic_set(&buf->rb_credits, 1); rc = ia->ri_ops->ro_init(r_xprt); if (rc) @@ -1259,7 +1177,7 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia, } send_wr.next = NULL; - send_wr.wr_id = RPCRDMA_IGNORE_COMPLETION; + send_wr.wr_cqe = &req->rl_cqe; send_wr.sg_list = iov; send_wr.num_sge = req->rl_niovs; send_wr.opcode = IB_WR_SEND; @@ -1297,7 +1215,7 @@ rpcrdma_ep_post_recv(struct rpcrdma_ia *ia, int rc; recv_wr.next = NULL; - recv_wr.wr_id = (u64) (unsigned long) rep; + recv_wr.wr_cqe = &rep->rr_cqe; recv_wr.sg_list = &rep->rr_rdmabuf->rg_iov; recv_wr.num_sge = 1; diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 38fe11b09..2ebc743cb 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -95,10 +95,6 @@ struct rpcrdma_ep { #define INIT_CQCOUNT(ep) atomic_set(&(ep)->rep_cqcount, (ep)->rep_cqinit) #define DECR_CQCOUNT(ep) atomic_sub_return(1, &(ep)->rep_cqcount) -/* Force completion handler to ignore the signal - */ -#define RPCRDMA_IGNORE_COMPLETION (0ULL) - /* Pre-allocate extra Work Requests for handling backward receives * and sends. This is a fixed value because the Work Queues are * allocated when the forward channel is set up. @@ -171,6 +167,7 @@ rdmab_to_msg(struct rpcrdma_regbuf *rb) struct rpcrdma_buffer; struct rpcrdma_rep { + struct ib_cqe rr_cqe; unsigned int rr_len; struct ib_device *rr_device; struct rpcrdma_xprt *rr_rxprt; @@ -204,11 +201,11 @@ struct rpcrdma_frmr { struct scatterlist *sg; int sg_nents; struct ib_mr *fr_mr; + struct ib_cqe fr_cqe; enum rpcrdma_frmr_state fr_state; + struct completion fr_linv_done; struct work_struct fr_work; struct rpcrdma_xprt *fr_xprt; - bool fr_waiter; - struct completion fr_linv_done;; union { struct ib_reg_wr fr_regwr; struct ib_send_wr fr_invwr; @@ -224,8 +221,7 @@ struct rpcrdma_mw { union { struct rpcrdma_fmr fmr; struct rpcrdma_frmr frmr; - } r; - void (*mw_sendcompletion)(struct ib_wc *); + }; struct list_head mw_list; struct list_head mw_all; }; @@ -281,6 +277,7 @@ struct rpcrdma_req { struct rpcrdma_regbuf *rl_sendbuf; struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS]; + struct ib_cqe rl_cqe; struct list_head rl_all; bool rl_backchannel; }; @@ -311,6 +308,7 @@ struct rpcrdma_buffer { struct list_head rb_send_bufs; struct list_head rb_recv_bufs; u32 rb_max_requests; + atomic_t rb_credits; /* most recent credit grant */ u32 rb_bc_srv_max_requests; spinlock_t rb_reqslock; /* protect rb_allreqs */ diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index fde2138b8..65e759569 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -1844,9 +1844,7 @@ static int xs_bind(struct sock_xprt *transport, struct socket *sock) */ static void xs_local_rpcbind(struct rpc_task *task) { - rcu_read_lock(); - xprt_set_bound(rcu_dereference(task->tk_client->cl_xprt)); - rcu_read_unlock(); + xprt_set_bound(task->tk_xprt); } static void xs_local_set_port(struct rpc_xprt *xprt, unsigned short port) diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c index 8b5833c1f..b7e01d88b 100644 --- a/net/switchdev/switchdev.c +++ b/net/switchdev/switchdev.c @@ -305,6 +305,8 @@ static void switchdev_port_attr_set_deferred(struct net_device *dev, if (err && err != -EOPNOTSUPP) netdev_err(dev, "failed (err=%d) to set attribute (id=%d)\n", err, attr->id); + if (attr->complete) + attr->complete(dev, err, attr->complete_priv); } static int switchdev_port_attr_set_defer(struct net_device *dev, @@ -434,6 +436,8 @@ static void switchdev_port_obj_add_deferred(struct net_device *dev, if (err && err != -EOPNOTSUPP) netdev_err(dev, "failed (err=%d) to add object (id=%d)\n", err, obj->id); + if (obj->complete) + obj->complete(dev, err, obj->complete_priv); } static int switchdev_port_obj_add_defer(struct net_device *dev, @@ -502,6 +506,8 @@ static void switchdev_port_obj_del_deferred(struct net_device *dev, if (err && err != -EOPNOTSUPP) netdev_err(dev, "failed (err=%d) to del object (id=%d)\n", err, obj->id); + if (obj->complete) + obj->complete(dev, err, obj->complete_priv); } static int switchdev_port_obj_del_defer(struct net_device *dev, @@ -1079,7 +1085,7 @@ nla_put_failure: * @filter_dev: filter device * @idx: * - * Delete FDB entry from switch device. + * Dump FDB entries from switch device. */ int switchdev_port_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, struct net_device *dev, diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c index e40110836..ae469b37d 100644 --- a/net/tipc/bcast.c +++ b/net/tipc/bcast.c @@ -412,11 +412,6 @@ enomem: return -ENOMEM; } -void tipc_bcast_reinit(struct net *net) -{ - tipc_link_reinit(tipc_bc_sndlink(net), tipc_own_addr(net)); -} - void tipc_bcast_stop(struct net *net) { struct tipc_net *tn = net_generic(net, tipc_net_id); diff --git a/net/tipc/bcast.h b/net/tipc/bcast.h index 1944c6c00..d5e79b376 100644 --- a/net/tipc/bcast.h +++ b/net/tipc/bcast.h @@ -46,7 +46,6 @@ struct tipc_node_map; extern const char tipc_bclink_name[]; int tipc_bcast_init(struct net *net); -void tipc_bcast_reinit(struct net *net); void tipc_bcast_stop(struct net *net); void tipc_bcast_add_peer(struct net *net, struct tipc_link *l, struct sk_buff_head *xmitq); diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c index 802ffad32..27a540621 100644 --- a/net/tipc/bearer.c +++ b/net/tipc/bearer.c @@ -40,6 +40,7 @@ #include "link.h" #include "discover.h" #include "bcast.h" +#include "netlink.h" #define MAX_ADDR_STR 60 @@ -54,23 +55,6 @@ static struct tipc_media * const media_info_array[] = { NULL }; -static const struct nla_policy -tipc_nl_bearer_policy[TIPC_NLA_BEARER_MAX + 1] = { - [TIPC_NLA_BEARER_UNSPEC] = { .type = NLA_UNSPEC }, - [TIPC_NLA_BEARER_NAME] = { - .type = NLA_STRING, - .len = TIPC_MAX_BEARER_NAME - }, - [TIPC_NLA_BEARER_PROP] = { .type = NLA_NESTED }, - [TIPC_NLA_BEARER_DOMAIN] = { .type = NLA_U32 } -}; - -static const struct nla_policy tipc_nl_media_policy[TIPC_NLA_MEDIA_MAX + 1] = { - [TIPC_NLA_MEDIA_UNSPEC] = { .type = NLA_UNSPEC }, - [TIPC_NLA_MEDIA_NAME] = { .type = NLA_STRING }, - [TIPC_NLA_MEDIA_PROP] = { .type = NLA_NESTED } -}; - static void bearer_disable(struct net *net, struct tipc_bearer *b); /** diff --git a/net/tipc/core.c b/net/tipc/core.c index 03a842870..e2bdb07a4 100644 --- a/net/tipc/core.c +++ b/net/tipc/core.c @@ -69,6 +69,7 @@ static int __net_init tipc_init_net(struct net *net) if (err) goto out_nametbl; + INIT_LIST_HEAD(&tn->dist_queue); err = tipc_topsrv_start(net); if (err) goto out_subscr; diff --git a/net/tipc/core.h b/net/tipc/core.h index 5504d6350..eff58dc53 100644 --- a/net/tipc/core.h +++ b/net/tipc/core.h @@ -103,6 +103,9 @@ struct tipc_net { spinlock_t nametbl_lock; struct name_table *nametbl; + /* Name dist queue */ + struct list_head dist_queue; + /* Topology subscription server */ struct tipc_server *topsrv; atomic_t subscription_count; diff --git a/net/tipc/link.c b/net/tipc/link.c index 347cdc99e..7d2bb3e70 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -1,7 +1,7 @@ /* * net/tipc/link.c: TIPC link code * - * Copyright (c) 1996-2007, 2012-2015, Ericsson AB + * Copyright (c) 1996-2007, 2012-2016, Ericsson AB * Copyright (c) 2004-2007, 2010-2013, Wind River Systems * All rights reserved. * @@ -123,11 +123,11 @@ struct tipc_stats { struct tipc_link { u32 addr; char name[TIPC_MAX_LINK_NAME]; - struct tipc_media_addr *media_addr; struct net *net; /* Management and link supervision data */ u32 peer_session; + u32 session; u32 peer_bearer_id; u32 bearer_id; u32 tolerance; @@ -137,11 +137,7 @@ struct tipc_link { u16 peer_caps; bool active; u32 silent_intv_cnt; - struct { - unchar hdr[INT_H_SIZE]; - unchar body[TIPC_MAX_IF_NAME]; - } proto_msg; - struct tipc_msg *pmsg; + char if_name[TIPC_MAX_IF_NAME]; u32 priority; char net_plane; @@ -196,14 +192,6 @@ struct tipc_link { static const char *link_co_err = "Link tunneling error, "; static const char *link_rst_msg = "Resetting link "; -/* Properties valid for media, bearar and link */ -static const struct nla_policy tipc_nl_prop_policy[TIPC_NLA_PROP_MAX + 1] = { - [TIPC_NLA_PROP_UNSPEC] = { .type = NLA_UNSPEC }, - [TIPC_NLA_PROP_PRIO] = { .type = NLA_U32 }, - [TIPC_NLA_PROP_TOL] = { .type = NLA_U32 }, - [TIPC_NLA_PROP_WIN] = { .type = NLA_U32 } -}; - /* Send states for broadcast NACKs */ enum { @@ -216,10 +204,11 @@ enum { * Interval between NACKs when packets arrive out of order */ #define TIPC_NACK_INTV (TIPC_MIN_LINK_WIN * 2) -/* - * Out-of-range value for link session numbers + +/* Wildcard value for link session numbers. When it is known that + * peer endpoint is down, any session number must be accepted. */ -#define WILDCARD_SESSION 0x10000 +#define ANY_SESSION 0x10000 /* Link FSM states: */ @@ -399,16 +388,6 @@ char *tipc_link_name(struct tipc_link *l) return l->name; } -static u32 link_own_addr(struct tipc_link *l) -{ - return msg_prevnode(l->pmsg); -} - -void tipc_link_reinit(struct tipc_link *l, u32 addr) -{ - msg_set_prevnode(l->pmsg, addr); -} - /** * tipc_link_create - create a new link * @n: pointer to associated node @@ -442,29 +421,22 @@ bool tipc_link_create(struct net *net, char *if_name, int bearer_id, struct tipc_link **link) { struct tipc_link *l; - struct tipc_msg *hdr; l = kzalloc(sizeof(*l), GFP_ATOMIC); if (!l) return false; *link = l; - l->pmsg = (struct tipc_msg *)&l->proto_msg; - hdr = l->pmsg; - tipc_msg_init(ownnode, hdr, LINK_PROTOCOL, RESET_MSG, INT_H_SIZE, peer); - msg_set_size(hdr, sizeof(l->proto_msg)); - msg_set_session(hdr, session); - msg_set_bearer_id(hdr, l->bearer_id); + l->session = session; /* Note: peer i/f name is completed by reset/activate message */ sprintf(l->name, "%u.%u.%u:%s-%u.%u.%u:unknown", tipc_zone(ownnode), tipc_cluster(ownnode), tipc_node(ownnode), if_name, tipc_zone(peer), tipc_cluster(peer), tipc_node(peer)); - strcpy((char *)msg_data(hdr), if_name); - + strcpy(l->if_name, if_name); l->addr = peer; l->peer_caps = peer_caps; l->net = net; - l->peer_session = WILDCARD_SESSION; + l->peer_session = ANY_SESSION; l->bearer_id = bearer_id; l->tolerance = tolerance; l->net_plane = net_plane; @@ -791,7 +763,7 @@ static int link_schedule_user(struct tipc_link *link, struct sk_buff_head *list) struct tipc_msg *msg = buf_msg(skb_peek(list)); int imp = msg_importance(msg); u32 oport = msg_origport(msg); - u32 addr = link_own_addr(link); + u32 addr = tipc_own_addr(link->net); struct sk_buff *skb; /* This really cannot happen... */ @@ -840,16 +812,9 @@ void link_prepare_wakeup(struct tipc_link *l) void tipc_link_reset(struct tipc_link *l) { - /* Link is down, accept any session */ - l->peer_session = WILDCARD_SESSION; - - /* If peer is up, it only accepts an incremented session number */ - msg_set_session(l->pmsg, msg_session(l->pmsg) + 1); - - /* Prepare for renewed mtu size negotiation */ + l->peer_session = ANY_SESSION; + l->session++; l->mtu = l->advertised_mtu; - - /* Clean up all queues and counters: */ __skb_queue_purge(&l->transmq); __skb_queue_purge(&l->deferdq); skb_queue_splice_init(&l->wakeupq, l->inputq); @@ -904,8 +869,10 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list, if (unlikely(l->backlog[i].len >= l->backlog[i].limit)) return link_schedule_user(l, list); } - if (unlikely(msg_size(hdr) > mtu)) + if (unlikely(msg_size(hdr) > mtu)) { + skb_queue_purge(list); return -EMSGSIZE; + } /* Prepare each packet for sending, and add to relevant queue: */ while (skb_queue_len(list)) { @@ -917,8 +884,10 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list, if (likely(skb_queue_len(transmq) < maxwin)) { _skb = skb_clone(skb, GFP_ATOMIC); - if (!_skb) + if (!_skb) { + skb_queue_purge(list); return -ENOBUFS; + } __skb_dequeue(list); __skb_queue_tail(transmq, skb); __skb_queue_tail(xmitq, _skb); @@ -1153,7 +1122,7 @@ int tipc_link_build_ack_msg(struct tipc_link *l, struct sk_buff_head *xmitq) /* Broadcast ACK must be sent via a unicast link => defer to caller */ if (link_is_bc_rcvlink(l)) { - if (((l->rcv_nxt ^ link_own_addr(l)) & 0xf) != 0xf) + if (((l->rcv_nxt ^ tipc_own_addr(l->net)) & 0xf) != 0xf) return 0; l->rcv_unacked = 0; return TIPC_LINK_SND_BC_ACK; @@ -1261,39 +1230,34 @@ drop: return rc; } -/* - * Send protocol message to the other endpoint. - */ -static void tipc_link_proto_xmit(struct tipc_link *l, u32 msg_typ, - int probe_msg, u32 gap, u32 tolerance, - u32 priority) -{ - struct sk_buff *skb = NULL; - struct sk_buff_head xmitq; - - __skb_queue_head_init(&xmitq); - tipc_link_build_proto_msg(l, msg_typ, probe_msg, gap, - tolerance, priority, &xmitq); - skb = __skb_dequeue(&xmitq); - if (!skb) - return; - tipc_bearer_xmit_skb(l->net, l->bearer_id, skb, l->media_addr); - l->rcv_unacked = 0; -} - static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe, u16 rcvgap, int tolerance, int priority, struct sk_buff_head *xmitq) { - struct sk_buff *skb = NULL; - struct tipc_msg *hdr = l->pmsg; + struct sk_buff *skb; + struct tipc_msg *hdr; + struct sk_buff_head *dfq = &l->deferdq; bool node_up = link_is_up(l->bc_rcvlink); /* Don't send protocol message during reset or link failover */ if (tipc_link_is_blocked(l)) return; - msg_set_type(hdr, mtyp); + if (!tipc_link_is_up(l) && (mtyp == STATE_MSG)) + return; + + if (!skb_queue_empty(dfq)) + rcvgap = buf_seqno(skb_peek(dfq)) - l->rcv_nxt; + + skb = tipc_msg_create(LINK_PROTOCOL, mtyp, INT_H_SIZE, + TIPC_MAX_IF_NAME, l->addr, + tipc_own_addr(l->net), 0, 0, 0); + if (!skb) + return; + + hdr = buf_msg(skb); + msg_set_session(hdr, l->session); + msg_set_bearer_id(hdr, l->bearer_id); msg_set_net_plane(hdr, l->net_plane); msg_set_next_sent(hdr, l->snd_nxt); msg_set_ack(hdr, l->rcv_nxt - 1); @@ -1303,36 +1267,23 @@ static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe, msg_set_linkprio(hdr, priority); msg_set_redundant_link(hdr, node_up); msg_set_seq_gap(hdr, 0); - - /* Compatibility: created msg must not be in sequence with pkt flow */ msg_set_seqno(hdr, l->snd_nxt + U16_MAX / 2); if (mtyp == STATE_MSG) { - if (!tipc_link_is_up(l)) - return; - - /* Override rcvgap if there are packets in deferred queue */ - if (!skb_queue_empty(&l->deferdq)) - rcvgap = buf_seqno(skb_peek(&l->deferdq)) - l->rcv_nxt; - if (rcvgap) { - msg_set_seq_gap(hdr, rcvgap); - l->stats.sent_nacks++; - } + msg_set_seq_gap(hdr, rcvgap); + msg_set_size(hdr, INT_H_SIZE); msg_set_probe(hdr, probe); - if (probe) - l->stats.sent_probes++; l->stats.sent_states++; l->rcv_unacked = 0; } else { /* RESET_MSG or ACTIVATE_MSG */ msg_set_max_pkt(hdr, l->advertised_mtu); - msg_set_ack(hdr, l->rcv_nxt - 1); - msg_set_next_sent(hdr, 1); + strcpy(msg_data(hdr), l->if_name); } - skb = tipc_buf_acquire(msg_size(hdr)); - if (!skb) - return; - skb_copy_to_linear_data(skb, hdr, msg_size(hdr)); + if (probe) + l->stats.sent_probes++; + if (rcvgap) + l->stats.sent_nacks++; skb->priority = TC_PRIO_CONTROL; __skb_queue_tail(xmitq, skb); } @@ -1357,7 +1308,7 @@ void tipc_link_tnl_prepare(struct tipc_link *l, struct tipc_link *tnl, /* At least one packet required for safe algorithm => add dummy */ skb = tipc_msg_create(TIPC_LOW_IMPORTANCE, TIPC_DIRECT_MSG, - BASIC_H_SIZE, 0, l->addr, link_own_addr(l), + BASIC_H_SIZE, 0, l->addr, tipc_own_addr(l->net), 0, 0, TIPC_ERR_NO_PORT); if (!skb) { pr_warn("%sunable to create tunnel packet\n", link_co_err); @@ -1368,7 +1319,7 @@ void tipc_link_tnl_prepare(struct tipc_link *l, struct tipc_link *tnl, __skb_queue_purge(&tmpxq); /* Initialize reusable tunnel packet header */ - tipc_msg_init(link_own_addr(l), &tnlhdr, TUNNEL_PROTOCOL, + tipc_msg_init(tipc_own_addr(l->net), &tnlhdr, TUNNEL_PROTOCOL, mtyp, INT_H_SIZE, l->addr); pktcnt = skb_queue_len(&l->transmq) + skb_queue_len(&l->backlogq); msg_set_msgcnt(&tnlhdr, pktcnt); @@ -1427,7 +1378,7 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, if (tipc_link_is_blocked(l) || !xmitq) goto exit; - if (link_own_addr(l) > msg_prevnode(hdr)) + if (tipc_own_addr(l->net) > msg_prevnode(hdr)) l->net_plane = msg_net_plane(hdr); switch (mtyp) { @@ -1435,7 +1386,7 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, /* Ignore duplicate RESET with old session number */ if ((less_eq(msg_session(hdr), l->peer_session)) && - (l->peer_session != WILDCARD_SESSION)) + (l->peer_session != ANY_SESSION)) break; /* fall thru' */ @@ -1479,6 +1430,12 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, if (in_range(peers_tol, TIPC_MIN_LINK_TOL, TIPC_MAX_LINK_TOL)) l->tolerance = peers_tol; + if (peers_prio && in_range(peers_prio, TIPC_MIN_LINK_PRI, + TIPC_MAX_LINK_PRI)) { + l->priority = peers_prio; + rc = tipc_link_fsm_evt(l, LINK_FAILURE_EVT); + } + l->silent_intv_cnt = 0; l->stats.recv_states++; if (msg_probe(hdr)) @@ -1526,7 +1483,7 @@ static bool tipc_link_build_bc_proto_msg(struct tipc_link *l, bool bcast, u16 gap_to = peers_snd_nxt - 1; skb = tipc_msg_create(BCAST_PROTOCOL, STATE_MSG, INT_H_SIZE, - 0, l->addr, link_own_addr(l), 0, 0, 0); + 0, l->addr, tipc_own_addr(l->net), 0, 0, 0); if (!skb) return false; hdr = buf_msg(skb); @@ -1681,7 +1638,7 @@ int tipc_link_bc_nack_rcv(struct tipc_link *l, struct sk_buff *skb, if (mtyp != STATE_MSG) return 0; - if (dnode == link_own_addr(l)) { + if (dnode == tipc_own_addr(l->net)) { tipc_link_bc_ack_rcv(l, acked, xmitq); rc = tipc_link_retrans(l->bc_sndlink, from, to, xmitq); l->stats.recv_nacks++; @@ -2023,16 +1980,18 @@ msg_full: return -EMSGSIZE; } -void tipc_link_set_tolerance(struct tipc_link *l, u32 tol) +void tipc_link_set_tolerance(struct tipc_link *l, u32 tol, + struct sk_buff_head *xmitq) { l->tolerance = tol; - tipc_link_proto_xmit(l, STATE_MSG, 0, 0, tol, 0); + tipc_link_build_proto_msg(l, STATE_MSG, 0, 0, tol, 0, xmitq); } -void tipc_link_set_prio(struct tipc_link *l, u32 prio) +void tipc_link_set_prio(struct tipc_link *l, u32 prio, + struct sk_buff_head *xmitq) { l->priority = prio; - tipc_link_proto_xmit(l, STATE_MSG, 0, 0, 0, prio); + tipc_link_build_proto_msg(l, STATE_MSG, 0, 0, 0, prio, xmitq); } void tipc_link_set_abort_limit(struct tipc_link *l, u32 limit) diff --git a/net/tipc/link.h b/net/tipc/link.h index b2ae0f427..6a94175ee 100644 --- a/net/tipc/link.h +++ b/net/tipc/link.h @@ -86,7 +86,6 @@ bool tipc_link_bc_create(struct net *net, u32 ownnode, u32 peer, struct sk_buff_head *namedq, struct tipc_link *bc_sndlink, struct tipc_link **link); -void tipc_link_reinit(struct tipc_link *l, u32 addr); void tipc_link_tnl_prepare(struct tipc_link *l, struct tipc_link *tnl, int mtyp, struct sk_buff_head *xmitq); void tipc_link_build_reset_msg(struct tipc_link *l, struct sk_buff_head *xmitq); @@ -112,8 +111,10 @@ char tipc_link_plane(struct tipc_link *l); int tipc_link_prio(struct tipc_link *l); int tipc_link_window(struct tipc_link *l); unsigned long tipc_link_tolerance(struct tipc_link *l); -void tipc_link_set_tolerance(struct tipc_link *l, u32 tol); -void tipc_link_set_prio(struct tipc_link *l, u32 prio); +void tipc_link_set_tolerance(struct tipc_link *l, u32 tol, + struct sk_buff_head *xmitq); +void tipc_link_set_prio(struct tipc_link *l, u32 prio, + struct sk_buff_head *xmitq); void tipc_link_set_abort_limit(struct tipc_link *l, u32 limit); void tipc_link_set_queue_limits(struct tipc_link *l, u32 window); int __tipc_nl_add_link(struct net *net, struct tipc_nl_msg *msg, diff --git a/net/tipc/name_distr.c b/net/tipc/name_distr.c index ebe9d0ff6..6b626a64b 100644 --- a/net/tipc/name_distr.c +++ b/net/tipc/name_distr.c @@ -40,11 +40,6 @@ int sysctl_tipc_named_timeout __read_mostly = 2000; -/** - * struct tipc_dist_queue - queue holding deferred name table updates - */ -static struct list_head tipc_dist_queue = LIST_HEAD_INIT(tipc_dist_queue); - struct distr_queue_item { struct distr_item i; u32 dtype; @@ -229,12 +224,31 @@ static void tipc_publ_purge(struct net *net, struct publication *publ, u32 addr) kfree_rcu(p, rcu); } +/** + * tipc_dist_queue_purge - remove deferred updates from a node that went down + */ +static void tipc_dist_queue_purge(struct net *net, u32 addr) +{ + struct tipc_net *tn = net_generic(net, tipc_net_id); + struct distr_queue_item *e, *tmp; + + spin_lock_bh(&tn->nametbl_lock); + list_for_each_entry_safe(e, tmp, &tn->dist_queue, next) { + if (e->node != addr) + continue; + list_del(&e->next); + kfree(e); + } + spin_unlock_bh(&tn->nametbl_lock); +} + void tipc_publ_notify(struct net *net, struct list_head *nsub_list, u32 addr) { struct publication *publ, *tmp; list_for_each_entry_safe(publ, tmp, nsub_list, nodesub_list) tipc_publ_purge(net, publ, addr); + tipc_dist_queue_purge(net, addr); } /** @@ -279,9 +293,11 @@ static bool tipc_update_nametbl(struct net *net, struct distr_item *i, * tipc_named_add_backlog - add a failed name table update to the backlog * */ -static void tipc_named_add_backlog(struct distr_item *i, u32 type, u32 node) +static void tipc_named_add_backlog(struct net *net, struct distr_item *i, + u32 type, u32 node) { struct distr_queue_item *e; + struct tipc_net *tn = net_generic(net, tipc_net_id); unsigned long now = get_jiffies_64(); e = kzalloc(sizeof(*e), GFP_ATOMIC); @@ -291,7 +307,7 @@ static void tipc_named_add_backlog(struct distr_item *i, u32 type, u32 node) e->node = node; e->expires = now + msecs_to_jiffies(sysctl_tipc_named_timeout); memcpy(e, i, sizeof(*i)); - list_add_tail(&e->next, &tipc_dist_queue); + list_add_tail(&e->next, &tn->dist_queue); } /** @@ -301,10 +317,11 @@ static void tipc_named_add_backlog(struct distr_item *i, u32 type, u32 node) void tipc_named_process_backlog(struct net *net) { struct distr_queue_item *e, *tmp; + struct tipc_net *tn = net_generic(net, tipc_net_id); char addr[16]; unsigned long now = get_jiffies_64(); - list_for_each_entry_safe(e, tmp, &tipc_dist_queue, next) { + list_for_each_entry_safe(e, tmp, &tn->dist_queue, next) { if (time_after(e->expires, now)) { if (!tipc_update_nametbl(net, &e->i, e->node, e->dtype)) continue; @@ -344,7 +361,7 @@ void tipc_named_rcv(struct net *net, struct sk_buff_head *inputq) node = msg_orignode(msg); while (count--) { if (!tipc_update_nametbl(net, item, node, mtype)) - tipc_named_add_backlog(item, mtype, node); + tipc_named_add_backlog(net, item, mtype, node); item++; } kfree_skb(skb); diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c index 91fce7029..e190460fe 100644 --- a/net/tipc/name_table.c +++ b/net/tipc/name_table.c @@ -47,12 +47,6 @@ #define TIPC_NAMETBL_SIZE 1024 /* must be a power of 2 */ -static const struct nla_policy -tipc_nl_name_table_policy[TIPC_NLA_NAME_TABLE_MAX + 1] = { - [TIPC_NLA_NAME_TABLE_UNSPEC] = { .type = NLA_UNSPEC }, - [TIPC_NLA_NAME_TABLE_PUBL] = { .type = NLA_NESTED } -}; - /** * struct name_info - name sequence publication info * @node_list: circular list of publications made by own node @@ -418,6 +412,9 @@ static void tipc_nameseq_subscribe(struct name_seq *nseq, struct tipc_subscription *s) { struct sub_seq *sseq = nseq->sseqs; + struct tipc_name_seq ns; + + tipc_subscrp_convert_seq(&s->evt.s.seq, s->swap, &ns); list_add(&s->nameseq_list, &nseq->subscriptions); @@ -425,7 +422,7 @@ static void tipc_nameseq_subscribe(struct name_seq *nseq, return; while (sseq != &nseq->sseqs[nseq->first_free]) { - if (tipc_subscrp_check_overlap(s, sseq->lower, sseq->upper)) { + if (tipc_subscrp_check_overlap(&ns, sseq->lower, sseq->upper)) { struct publication *crs; struct name_info *info = sseq->info; int must_report = 1; @@ -722,9 +719,10 @@ int tipc_nametbl_withdraw(struct net *net, u32 type, u32 lower, u32 ref, void tipc_nametbl_subscribe(struct tipc_subscription *s) { struct tipc_net *tn = net_generic(s->net, tipc_net_id); - u32 type = s->seq.type; + u32 type = tipc_subscrp_convert_seq_type(s->evt.s.seq.type, s->swap); int index = hash(type); struct name_seq *seq; + struct tipc_name_seq ns; spin_lock_bh(&tn->nametbl_lock); seq = nametbl_find_seq(s->net, type); @@ -735,8 +733,9 @@ void tipc_nametbl_subscribe(struct tipc_subscription *s) tipc_nameseq_subscribe(seq, s); spin_unlock_bh(&seq->lock); } else { + tipc_subscrp_convert_seq(&s->evt.s.seq, s->swap, &ns); pr_warn("Failed to create subscription for {%u,%u,%u}\n", - s->seq.type, s->seq.lower, s->seq.upper); + ns.type, ns.lower, ns.upper); } spin_unlock_bh(&tn->nametbl_lock); } @@ -748,9 +747,10 @@ void tipc_nametbl_unsubscribe(struct tipc_subscription *s) { struct tipc_net *tn = net_generic(s->net, tipc_net_id); struct name_seq *seq; + u32 type = tipc_subscrp_convert_seq_type(s->evt.s.seq.type, s->swap); spin_lock_bh(&tn->nametbl_lock); - seq = nametbl_find_seq(s->net, s->seq.type); + seq = nametbl_find_seq(s->net, type); if (seq != NULL) { spin_lock_bh(&seq->lock); list_del_init(&s->nameseq_list); diff --git a/net/tipc/net.c b/net/tipc/net.c index 77bf9113c..28bf4feeb 100644 --- a/net/tipc/net.c +++ b/net/tipc/net.c @@ -41,11 +41,7 @@ #include "socket.h" #include "node.h" #include "bcast.h" - -static const struct nla_policy tipc_nl_net_policy[TIPC_NLA_NET_MAX + 1] = { - [TIPC_NLA_NET_UNSPEC] = { .type = NLA_UNSPEC }, - [TIPC_NLA_NET_ID] = { .type = NLA_U32 } -}; +#include "netlink.h" /* * The TIPC locking policy is designed to ensure a very fine locking @@ -116,7 +112,6 @@ int tipc_net_start(struct net *net, u32 addr) tn->own_addr = addr; tipc_named_reinit(net); tipc_sk_reinit(net); - tipc_bcast_reinit(net); tipc_nametbl_publish(net, TIPC_CFG_SRV, tn->own_addr, tn->own_addr, TIPC_ZONE_SCOPE, 0, tn->own_addr); diff --git a/net/tipc/netlink.c b/net/tipc/netlink.c index 8975b0135..56935df21 100644 --- a/net/tipc/netlink.c +++ b/net/tipc/netlink.c @@ -55,6 +55,75 @@ static const struct nla_policy tipc_nl_policy[TIPC_NLA_MAX + 1] = { [TIPC_NLA_NAME_TABLE] = { .type = NLA_NESTED, } }; +const struct nla_policy +tipc_nl_name_table_policy[TIPC_NLA_NAME_TABLE_MAX + 1] = { + [TIPC_NLA_NAME_TABLE_UNSPEC] = { .type = NLA_UNSPEC }, + [TIPC_NLA_NAME_TABLE_PUBL] = { .type = NLA_NESTED } +}; + +const struct nla_policy tipc_nl_sock_policy[TIPC_NLA_SOCK_MAX + 1] = { + [TIPC_NLA_SOCK_UNSPEC] = { .type = NLA_UNSPEC }, + [TIPC_NLA_SOCK_ADDR] = { .type = NLA_U32 }, + [TIPC_NLA_SOCK_REF] = { .type = NLA_U32 }, + [TIPC_NLA_SOCK_CON] = { .type = NLA_NESTED }, + [TIPC_NLA_SOCK_HAS_PUBL] = { .type = NLA_FLAG } +}; + +const struct nla_policy tipc_nl_net_policy[TIPC_NLA_NET_MAX + 1] = { + [TIPC_NLA_NET_UNSPEC] = { .type = NLA_UNSPEC }, + [TIPC_NLA_NET_ID] = { .type = NLA_U32 } +}; + +const struct nla_policy tipc_nl_link_policy[TIPC_NLA_LINK_MAX + 1] = { + [TIPC_NLA_LINK_UNSPEC] = { .type = NLA_UNSPEC }, + [TIPC_NLA_LINK_NAME] = { .type = NLA_STRING, + .len = TIPC_MAX_LINK_NAME }, + [TIPC_NLA_LINK_MTU] = { .type = NLA_U32 }, + [TIPC_NLA_LINK_BROADCAST] = { .type = NLA_FLAG }, + [TIPC_NLA_LINK_UP] = { .type = NLA_FLAG }, + [TIPC_NLA_LINK_ACTIVE] = { .type = NLA_FLAG }, + [TIPC_NLA_LINK_PROP] = { .type = NLA_NESTED }, + [TIPC_NLA_LINK_STATS] = { .type = NLA_NESTED }, + [TIPC_NLA_LINK_RX] = { .type = NLA_U32 }, + [TIPC_NLA_LINK_TX] = { .type = NLA_U32 } +}; + +const struct nla_policy tipc_nl_node_policy[TIPC_NLA_NODE_MAX + 1] = { + [TIPC_NLA_NODE_UNSPEC] = { .type = NLA_UNSPEC }, + [TIPC_NLA_NODE_ADDR] = { .type = NLA_U32 }, + [TIPC_NLA_NODE_UP] = { .type = NLA_FLAG } +}; + +/* Properties valid for media, bearer and link */ +const struct nla_policy tipc_nl_prop_policy[TIPC_NLA_PROP_MAX + 1] = { + [TIPC_NLA_PROP_UNSPEC] = { .type = NLA_UNSPEC }, + [TIPC_NLA_PROP_PRIO] = { .type = NLA_U32 }, + [TIPC_NLA_PROP_TOL] = { .type = NLA_U32 }, + [TIPC_NLA_PROP_WIN] = { .type = NLA_U32 } +}; + +const struct nla_policy tipc_nl_bearer_policy[TIPC_NLA_BEARER_MAX + 1] = { + [TIPC_NLA_BEARER_UNSPEC] = { .type = NLA_UNSPEC }, + [TIPC_NLA_BEARER_NAME] = { .type = NLA_STRING, + .len = TIPC_MAX_BEARER_NAME }, + [TIPC_NLA_BEARER_PROP] = { .type = NLA_NESTED }, + [TIPC_NLA_BEARER_DOMAIN] = { .type = NLA_U32 } +}; + +const struct nla_policy tipc_nl_media_policy[TIPC_NLA_MEDIA_MAX + 1] = { + [TIPC_NLA_MEDIA_UNSPEC] = { .type = NLA_UNSPEC }, + [TIPC_NLA_MEDIA_NAME] = { .type = NLA_STRING }, + [TIPC_NLA_MEDIA_PROP] = { .type = NLA_NESTED } +}; + +const struct nla_policy tipc_nl_udp_policy[TIPC_NLA_UDP_MAX + 1] = { + [TIPC_NLA_UDP_UNSPEC] = {.type = NLA_UNSPEC}, + [TIPC_NLA_UDP_LOCAL] = {.type = NLA_BINARY, + .len = sizeof(struct sockaddr_storage)}, + [TIPC_NLA_UDP_REMOTE] = {.type = NLA_BINARY, + .len = sizeof(struct sockaddr_storage)}, +}; + /* Users of the legacy API (tipc-config) can't handle that we add operations, * so we have a separate genl handling for the new API. */ diff --git a/net/tipc/netlink.h b/net/tipc/netlink.h index 08a1db67b..ed1dbcb4a 100644 --- a/net/tipc/netlink.h +++ b/net/tipc/netlink.h @@ -35,6 +35,7 @@ #ifndef _TIPC_NETLINK_H #define _TIPC_NETLINK_H +#include <net/netlink.h> extern struct genl_family tipc_genl_family; int tipc_nlmsg_parse(const struct nlmsghdr *nlh, struct nlattr ***buf); @@ -45,6 +46,16 @@ struct tipc_nl_msg { u32 seq; }; +extern const struct nla_policy tipc_nl_name_table_policy[]; +extern const struct nla_policy tipc_nl_sock_policy[]; +extern const struct nla_policy tipc_nl_net_policy[]; +extern const struct nla_policy tipc_nl_link_policy[]; +extern const struct nla_policy tipc_nl_node_policy[]; +extern const struct nla_policy tipc_nl_prop_policy[]; +extern const struct nla_policy tipc_nl_bearer_policy[]; +extern const struct nla_policy tipc_nl_media_policy[]; +extern const struct nla_policy tipc_nl_udp_policy[]; + int tipc_netlink_start(void); int tipc_netlink_compat_start(void); void tipc_netlink_stop(void); diff --git a/net/tipc/netlink_compat.c b/net/tipc/netlink_compat.c index 2c016fdef..d7d050f44 100644 --- a/net/tipc/netlink_compat.c +++ b/net/tipc/netlink_compat.c @@ -1104,8 +1104,8 @@ static int tipc_nl_compat_recv(struct sk_buff *skb, struct genl_info *info) req_nlh = (struct nlmsghdr *)skb->data; msg.req = nlmsg_data(req_nlh) + GENL_HDRLEN + TIPC_GENL_HDRLEN; msg.cmd = req_userhdr->cmd; - msg.dst_sk = info->dst_sk; msg.net = genl_info_net(info); + msg.dst_sk = skb->sk; if ((msg.cmd & 0xC000) && (!netlink_net_capable(skb, CAP_NET_ADMIN))) { msg.rep = tipc_get_err_tlv(TIPC_CFG_NOT_NET_ADMIN); diff --git a/net/tipc/node.c b/net/tipc/node.c index 9d7a16fc5..9aaa1bc56 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -41,6 +41,7 @@ #include "socket.h" #include "bcast.h" #include "discover.h" +#include "netlink.h" #define INVALID_NODE_SIG 0x10000 @@ -164,28 +165,6 @@ struct tipc_sock_conn { struct list_head list; }; -static const struct nla_policy tipc_nl_link_policy[TIPC_NLA_LINK_MAX + 1] = { - [TIPC_NLA_LINK_UNSPEC] = { .type = NLA_UNSPEC }, - [TIPC_NLA_LINK_NAME] = { - .type = NLA_STRING, - .len = TIPC_MAX_LINK_NAME - }, - [TIPC_NLA_LINK_MTU] = { .type = NLA_U32 }, - [TIPC_NLA_LINK_BROADCAST] = { .type = NLA_FLAG }, - [TIPC_NLA_LINK_UP] = { .type = NLA_FLAG }, - [TIPC_NLA_LINK_ACTIVE] = { .type = NLA_FLAG }, - [TIPC_NLA_LINK_PROP] = { .type = NLA_NESTED }, - [TIPC_NLA_LINK_STATS] = { .type = NLA_NESTED }, - [TIPC_NLA_LINK_RX] = { .type = NLA_U32 }, - [TIPC_NLA_LINK_TX] = { .type = NLA_U32 } -}; - -static const struct nla_policy tipc_nl_node_policy[TIPC_NLA_NODE_MAX + 1] = { - [TIPC_NLA_NODE_UNSPEC] = { .type = NLA_UNSPEC }, - [TIPC_NLA_NODE_ADDR] = { .type = NLA_U32 }, - [TIPC_NLA_NODE_UP] = { .type = NLA_FLAG } -}; - static struct tipc_link *node_active_link(struct tipc_node *n, int sel) { int bearer_id = n->active_links[sel & 1]; @@ -225,9 +204,10 @@ static unsigned int tipc_hashfn(u32 addr) static void tipc_node_kref_release(struct kref *kref) { - struct tipc_node *node = container_of(kref, struct tipc_node, kref); + struct tipc_node *n = container_of(kref, struct tipc_node, kref); - tipc_node_delete(node); + kfree(n->bc_entry.link); + kfree_rcu(n, rcu); } static void tipc_node_put(struct tipc_node *node) @@ -245,23 +225,23 @@ static void tipc_node_get(struct tipc_node *node) */ static struct tipc_node *tipc_node_find(struct net *net, u32 addr) { - struct tipc_net *tn = net_generic(net, tipc_net_id); + struct tipc_net *tn = tipc_net(net); struct tipc_node *node; + unsigned int thash = tipc_hashfn(addr); if (unlikely(!in_own_cluster_exact(net, addr))) return NULL; rcu_read_lock(); - hlist_for_each_entry_rcu(node, &tn->node_htable[tipc_hashfn(addr)], - hash) { - if (node->addr == addr) { - tipc_node_get(node); - rcu_read_unlock(); - return node; - } + hlist_for_each_entry_rcu(node, &tn->node_htable[thash], hash) { + if (node->addr != addr) + continue; + if (!kref_get_unless_zero(&node->kref)) + node = NULL; + break; } rcu_read_unlock(); - return NULL; + return node; } static void tipc_node_read_lock(struct tipc_node *n) @@ -395,21 +375,20 @@ static void tipc_node_delete(struct tipc_node *node) { list_del_rcu(&node->list); hlist_del_rcu(&node->hash); - kfree(node->bc_entry.link); - kfree_rcu(node, rcu); + tipc_node_put(node); + + del_timer_sync(&node->timer); + tipc_node_put(node); } void tipc_node_stop(struct net *net) { - struct tipc_net *tn = net_generic(net, tipc_net_id); + struct tipc_net *tn = tipc_net(net); struct tipc_node *node, *t_node; spin_lock_bh(&tn->node_list_lock); - list_for_each_entry_safe(node, t_node, &tn->node_list, list) { - if (del_timer(&node->timer)) - tipc_node_put(node); - tipc_node_put(node); - } + list_for_each_entry_safe(node, t_node, &tn->node_list, list) + tipc_node_delete(node); spin_unlock_bh(&tn->node_list_lock); } @@ -530,9 +509,7 @@ static void tipc_node_timeout(unsigned long data) if (rc & TIPC_LINK_DOWN_EVT) tipc_node_link_down(n, bearer_id, false); } - if (!mod_timer(&n->timer, jiffies + n->keepalive_intv)) - tipc_node_get(n); - tipc_node_put(n); + mod_timer(&n->timer, jiffies + n->keepalive_intv); } /** @@ -845,7 +822,7 @@ void tipc_node_check_dest(struct net *net, u32 onode, memcpy(&le->maddr, maddr, sizeof(*maddr)); exit: tipc_node_write_unlock(n); - if (reset && !tipc_link_is_reset(l)) + if (reset && l && !tipc_link_is_reset(l)) tipc_node_link_down(n, b->identity, false); tipc_node_put(n); } @@ -1166,7 +1143,7 @@ msg_full: * @dnode: address of destination node * @selector: a number used for deterministic link selection * Consumes the buffer chain, except when returning -ELINKCONG - * Returns 0 if success, otherwise errno: -ELINKCONG,-EHOSTUNREACH,-EMSGSIZE + * Returns 0 if success, otherwise: -ELINKCONG,-EHOSTUNREACH,-EMSGSIZE,-ENOBUF */ int tipc_node_xmit(struct net *net, struct sk_buff_head *list, u32 dnode, int selector) @@ -1174,33 +1151,43 @@ int tipc_node_xmit(struct net *net, struct sk_buff_head *list, struct tipc_link_entry *le = NULL; struct tipc_node *n; struct sk_buff_head xmitq; - int bearer_id = -1; - int rc = -EHOSTUNREACH; + int bearer_id; + int rc; + + if (in_own_node(net, dnode)) { + tipc_sk_rcv(net, list); + return 0; + } - __skb_queue_head_init(&xmitq); n = tipc_node_find(net, dnode); - if (likely(n)) { - tipc_node_read_lock(n); - bearer_id = n->active_links[selector & 1]; - if (bearer_id >= 0) { - le = &n->links[bearer_id]; - spin_lock_bh(&le->lock); - rc = tipc_link_xmit(le->link, list, &xmitq); - spin_unlock_bh(&le->lock); - } + if (unlikely(!n)) { + skb_queue_purge(list); + return -EHOSTUNREACH; + } + + tipc_node_read_lock(n); + bearer_id = n->active_links[selector & 1]; + if (unlikely(bearer_id == INVALID_BEARER_ID)) { tipc_node_read_unlock(n); - if (likely(!rc)) - tipc_bearer_xmit(net, bearer_id, &xmitq, &le->maddr); - else if (rc == -ENOBUFS) - tipc_node_link_down(n, bearer_id, false); tipc_node_put(n); - return rc; + skb_queue_purge(list); + return -EHOSTUNREACH; } - if (likely(in_own_node(net, dnode))) { - tipc_sk_rcv(net, list); - return 0; - } + __skb_queue_head_init(&xmitq); + le = &n->links[bearer_id]; + spin_lock_bh(&le->lock); + rc = tipc_link_xmit(le->link, list, &xmitq); + spin_unlock_bh(&le->lock); + tipc_node_read_unlock(n); + + if (likely(rc == 0)) + tipc_bearer_xmit(net, bearer_id, &xmitq, &le->maddr); + else if (rc == -ENOBUFS) + tipc_node_link_down(n, bearer_id, false); + + tipc_node_put(n); + return rc; } @@ -1457,6 +1444,7 @@ void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b) int bearer_id = b->identity; struct tipc_link_entry *le; u16 bc_ack = msg_bcast_ack(hdr); + u32 self = tipc_own_addr(net); int rc = 0; __skb_queue_head_init(&xmitq); @@ -1473,6 +1461,10 @@ void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b) return tipc_node_bc_rcv(net, skb, bearer_id); } + /* Discard unicast link messages destined for another node */ + if (unlikely(!msg_short(hdr) && (msg_destnode(hdr) != self))) + goto discard; + /* Locate neighboring node that sent packet */ n = tipc_node_find(net, msg_prevnode(hdr)); if (unlikely(!n)) @@ -1637,9 +1629,12 @@ int tipc_nl_node_set_link(struct sk_buff *skb, struct genl_info *info) char *name; struct tipc_link *link; struct tipc_node *node; + struct sk_buff_head xmitq; struct nlattr *attrs[TIPC_NLA_LINK_MAX + 1]; struct net *net = sock_net(skb->sk); + __skb_queue_head_init(&xmitq); + if (!info->attrs[TIPC_NLA_LINK]) return -EINVAL; @@ -1683,13 +1678,13 @@ int tipc_nl_node_set_link(struct sk_buff *skb, struct genl_info *info) u32 tol; tol = nla_get_u32(props[TIPC_NLA_PROP_TOL]); - tipc_link_set_tolerance(link, tol); + tipc_link_set_tolerance(link, tol, &xmitq); } if (props[TIPC_NLA_PROP_PRIO]) { u32 prio; prio = nla_get_u32(props[TIPC_NLA_PROP_PRIO]); - tipc_link_set_prio(link, prio); + tipc_link_set_prio(link, prio, &xmitq); } if (props[TIPC_NLA_PROP_WIN]) { u32 win; @@ -1701,7 +1696,7 @@ int tipc_nl_node_set_link(struct sk_buff *skb, struct genl_info *info) out: tipc_node_read_unlock(node); - + tipc_bearer_xmit(net, bearer_id, &xmitq, &node->links[bearer_id].maddr); return res; } diff --git a/net/tipc/server.c b/net/tipc/server.c index 922e04a43..2446bfbaa 100644 --- a/net/tipc/server.c +++ b/net/tipc/server.c @@ -571,13 +571,13 @@ static void tipc_work_stop(struct tipc_server *s) static int tipc_work_start(struct tipc_server *s) { - s->rcv_wq = alloc_workqueue("tipc_rcv", WQ_UNBOUND, 1); + s->rcv_wq = alloc_ordered_workqueue("tipc_rcv", 0); if (!s->rcv_wq) { pr_err("can't start tipc receive workqueue\n"); return -ENOMEM; } - s->send_wq = alloc_workqueue("tipc_send", WQ_UNBOUND, 1); + s->send_wq = alloc_ordered_workqueue("tipc_send", 0); if (!s->send_wq) { pr_err("can't start tipc send workqueue\n"); destroy_workqueue(s->rcv_wq); diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 4d420bb27..3eeb50a27 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -42,6 +42,7 @@ #include "name_distr.h" #include "socket.h" #include "bcast.h" +#include "netlink.h" #define SS_LISTENING -1 /* socket is listening */ #define SS_READY -2 /* socket is connectionless */ @@ -126,14 +127,6 @@ static const struct proto_ops stream_ops; static const struct proto_ops msg_ops; static struct proto tipc_proto; -static const struct nla_policy tipc_nl_sock_policy[TIPC_NLA_SOCK_MAX + 1] = { - [TIPC_NLA_SOCK_UNSPEC] = { .type = NLA_UNSPEC }, - [TIPC_NLA_SOCK_ADDR] = { .type = NLA_U32 }, - [TIPC_NLA_SOCK_REF] = { .type = NLA_U32 }, - [TIPC_NLA_SOCK_CON] = { .type = NLA_NESTED }, - [TIPC_NLA_SOCK_HAS_PUBL] = { .type = NLA_FLAG } -}; - static const struct rhashtable_params tsk_rht_params; /* diff --git a/net/tipc/subscr.c b/net/tipc/subscr.c index f9ff73a8d..e6cb386fb 100644 --- a/net/tipc/subscr.c +++ b/net/tipc/subscr.c @@ -92,25 +92,42 @@ static void tipc_subscrp_send_event(struct tipc_subscription *sub, * * Returns 1 if there is overlap, otherwise 0. */ -int tipc_subscrp_check_overlap(struct tipc_subscription *sub, u32 found_lower, +int tipc_subscrp_check_overlap(struct tipc_name_seq *seq, u32 found_lower, u32 found_upper) { - if (found_lower < sub->seq.lower) - found_lower = sub->seq.lower; - if (found_upper > sub->seq.upper) - found_upper = sub->seq.upper; + if (found_lower < seq->lower) + found_lower = seq->lower; + if (found_upper > seq->upper) + found_upper = seq->upper; if (found_lower > found_upper) return 0; return 1; } +u32 tipc_subscrp_convert_seq_type(u32 type, int swap) +{ + return htohl(type, swap); +} + +void tipc_subscrp_convert_seq(struct tipc_name_seq *in, int swap, + struct tipc_name_seq *out) +{ + out->type = htohl(in->type, swap); + out->lower = htohl(in->lower, swap); + out->upper = htohl(in->upper, swap); +} + void tipc_subscrp_report_overlap(struct tipc_subscription *sub, u32 found_lower, u32 found_upper, u32 event, u32 port_ref, u32 node, int must) { - if (!tipc_subscrp_check_overlap(sub, found_lower, found_upper)) + struct tipc_name_seq seq; + + tipc_subscrp_convert_seq(&sub->evt.s.seq, sub->swap, &seq); + if (!tipc_subscrp_check_overlap(&seq, found_lower, found_upper)) return; - if (!must && !(sub->filter & TIPC_SUB_PORTS)) + if (!must && + !(htohl(sub->evt.s.filter, sub->swap) & TIPC_SUB_PORTS)) return; tipc_subscrp_send_event(sub, found_lower, found_upper, event, port_ref, @@ -171,12 +188,14 @@ static struct tipc_subscriber *tipc_subscrb_create(int conid) static void tipc_subscrb_delete(struct tipc_subscriber *subscriber) { struct tipc_subscription *sub, *temp; + u32 timeout; spin_lock_bh(&subscriber->lock); /* Destroy any existing subscriptions for subscriber */ list_for_each_entry_safe(sub, temp, &subscriber->subscrp_list, subscrp_list) { - if (del_timer(&sub->timer)) { + timeout = htohl(sub->evt.s.timeout, sub->swap); + if ((timeout == TIPC_WAIT_FOREVER) || del_timer(&sub->timer)) { tipc_subscrp_delete(sub); tipc_subscrb_put(subscriber); } @@ -200,13 +219,16 @@ static void tipc_subscrp_cancel(struct tipc_subscr *s, struct tipc_subscriber *subscriber) { struct tipc_subscription *sub, *temp; + u32 timeout; spin_lock_bh(&subscriber->lock); /* Find first matching subscription, exit if not found */ list_for_each_entry_safe(sub, temp, &subscriber->subscrp_list, subscrp_list) { if (!memcmp(s, &sub->evt.s, sizeof(struct tipc_subscr))) { - if (del_timer(&sub->timer)) { + timeout = htohl(sub->evt.s.timeout, sub->swap); + if ((timeout == TIPC_WAIT_FOREVER) || + del_timer(&sub->timer)) { tipc_subscrp_delete(sub); tipc_subscrb_put(subscriber); } @@ -216,66 +238,67 @@ static void tipc_subscrp_cancel(struct tipc_subscr *s, spin_unlock_bh(&subscriber->lock); } -static int tipc_subscrp_create(struct net *net, struct tipc_subscr *s, - struct tipc_subscriber *subscriber, - struct tipc_subscription **sub_p) +static struct tipc_subscription *tipc_subscrp_create(struct net *net, + struct tipc_subscr *s, + int swap) { struct tipc_net *tn = net_generic(net, tipc_net_id); struct tipc_subscription *sub; - int swap; - - /* Determine subscriber's endianness */ - swap = !(s->filter & (TIPC_SUB_PORTS | TIPC_SUB_SERVICE)); - - /* Detect & process a subscription cancellation request */ - if (s->filter & htohl(TIPC_SUB_CANCEL, swap)) { - s->filter &= ~htohl(TIPC_SUB_CANCEL, swap); - tipc_subscrp_cancel(s, subscriber); - return 0; - } + u32 filter = htohl(s->filter, swap); /* Refuse subscription if global limit exceeded */ if (atomic_read(&tn->subscription_count) >= TIPC_MAX_SUBSCRIPTIONS) { pr_warn("Subscription rejected, limit reached (%u)\n", TIPC_MAX_SUBSCRIPTIONS); - return -EINVAL; + return NULL; } /* Allocate subscription object */ sub = kmalloc(sizeof(*sub), GFP_ATOMIC); if (!sub) { pr_warn("Subscription rejected, no memory\n"); - return -ENOMEM; + return NULL; } /* Initialize subscription object */ sub->net = net; - sub->seq.type = htohl(s->seq.type, swap); - sub->seq.lower = htohl(s->seq.lower, swap); - sub->seq.upper = htohl(s->seq.upper, swap); - sub->timeout = msecs_to_jiffies(htohl(s->timeout, swap)); - sub->filter = htohl(s->filter, swap); - if ((!(sub->filter & TIPC_SUB_PORTS) == - !(sub->filter & TIPC_SUB_SERVICE)) || - (sub->seq.lower > sub->seq.upper)) { + if (((filter & TIPC_SUB_PORTS) && (filter & TIPC_SUB_SERVICE)) || + (htohl(s->seq.lower, swap) > htohl(s->seq.upper, swap))) { pr_warn("Subscription rejected, illegal request\n"); kfree(sub); - return -EINVAL; + return NULL; } - spin_lock_bh(&subscriber->lock); - list_add(&sub->subscrp_list, &subscriber->subscrp_list); - spin_unlock_bh(&subscriber->lock); - sub->subscriber = subscriber; + sub->swap = swap; memcpy(&sub->evt.s, s, sizeof(*s)); atomic_inc(&tn->subscription_count); + return sub; +} + +static void tipc_subscrp_subscribe(struct net *net, struct tipc_subscr *s, + struct tipc_subscriber *subscriber, int swap) +{ + struct tipc_net *tn = net_generic(net, tipc_net_id); + struct tipc_subscription *sub = NULL; + u32 timeout; + + sub = tipc_subscrp_create(net, s, swap); + if (!sub) + return tipc_conn_terminate(tn->topsrv, subscriber->conid); + + spin_lock_bh(&subscriber->lock); + list_add(&sub->subscrp_list, &subscriber->subscrp_list); + tipc_subscrb_get(subscriber); + sub->subscriber = subscriber; + tipc_nametbl_subscribe(sub); + spin_unlock_bh(&subscriber->lock); + + timeout = htohl(sub->evt.s.timeout, swap); + if (timeout == TIPC_WAIT_FOREVER) + return; + setup_timer(&sub->timer, tipc_subscrp_timeout, (unsigned long)sub); - if (sub->timeout != TIPC_WAIT_FOREVER) - sub->timeout += jiffies; - if (!mod_timer(&sub->timer, sub->timeout)) - tipc_subscrb_get(subscriber); - *sub_p = sub; - return 0; + mod_timer(&sub->timer, jiffies + msecs_to_jiffies(timeout)); } /* Handle one termination request for the subscriber */ @@ -289,15 +312,22 @@ static void tipc_subscrb_rcv_cb(struct net *net, int conid, struct sockaddr_tipc *addr, void *usr_data, void *buf, size_t len) { - struct tipc_subscriber *subscrb = usr_data; - struct tipc_subscription *sub = NULL; - struct tipc_net *tn = net_generic(net, tipc_net_id); + struct tipc_subscriber *subscriber = usr_data; + struct tipc_subscr *s = (struct tipc_subscr *)buf; + int swap; - if (tipc_subscrp_create(net, (struct tipc_subscr *)buf, subscrb, &sub)) - return tipc_conn_terminate(tn->topsrv, subscrb->conid); + /* Determine subscriber's endianness */ + swap = !(s->filter & (TIPC_SUB_PORTS | TIPC_SUB_SERVICE | + TIPC_SUB_CANCEL)); + + /* Detect & process a subscription cancellation request */ + if (s->filter & htohl(TIPC_SUB_CANCEL, swap)) { + s->filter &= ~htohl(TIPC_SUB_CANCEL, swap); + return tipc_subscrp_cancel(s, subscriber); + } - if (sub) - tipc_nametbl_subscribe(sub); + if (s) + tipc_subscrp_subscribe(net, s, subscriber, swap); } /* Handle one request to establish a new subscriber */ diff --git a/net/tipc/subscr.h b/net/tipc/subscr.h index 92ee18cc5..be6010308 100644 --- a/net/tipc/subscr.h +++ b/net/tipc/subscr.h @@ -50,21 +50,15 @@ struct tipc_subscriber; * @subscriber: pointer to its subscriber * @seq: name sequence associated with subscription * @net: point to network namespace - * @timeout: duration of subscription (in ms) - * @filter: event filtering to be done for subscription * @timer: timer governing subscription duration (optional) * @nameseq_list: adjacent subscriptions in name sequence's subscription list * @subscrp_list: adjacent subscriptions in subscriber's subscription list - * @server_ref: object reference of server port associated with subscription * @swap: indicates if subscriber uses opposite endianness in its messages * @evt: template for events generated by subscription */ struct tipc_subscription { struct tipc_subscriber *subscriber; - struct tipc_name_seq seq; struct net *net; - unsigned long timeout; - u32 filter; struct timer_list timer; struct list_head nameseq_list; struct list_head subscrp_list; @@ -72,11 +66,14 @@ struct tipc_subscription { struct tipc_event evt; }; -int tipc_subscrp_check_overlap(struct tipc_subscription *sub, u32 found_lower, +int tipc_subscrp_check_overlap(struct tipc_name_seq *seq, u32 found_lower, u32 found_upper); void tipc_subscrp_report_overlap(struct tipc_subscription *sub, u32 found_lower, u32 found_upper, u32 event, u32 port_ref, u32 node, int must); +void tipc_subscrp_convert_seq(struct tipc_name_seq *in, int swap, + struct tipc_name_seq *out); +u32 tipc_subscrp_convert_seq_type(u32 type, int swap); int tipc_topsrv_start(struct net *net); void tipc_topsrv_stop(struct net *net); diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c index d63a911e7..c9cf2be36 100644 --- a/net/tipc/udp_media.c +++ b/net/tipc/udp_media.c @@ -48,19 +48,12 @@ #include <linux/tipc_netlink.h> #include "core.h" #include "bearer.h" +#include "netlink.h" /* IANA assigned UDP port */ #define UDP_PORT_DEFAULT 6118 -#define UDP_MIN_HEADROOM 28 - -static const struct nla_policy tipc_nl_udp_policy[TIPC_NLA_UDP_MAX + 1] = { - [TIPC_NLA_UDP_UNSPEC] = {.type = NLA_UNSPEC}, - [TIPC_NLA_UDP_LOCAL] = {.type = NLA_BINARY, - .len = sizeof(struct sockaddr_storage)}, - [TIPC_NLA_UDP_REMOTE] = {.type = NLA_BINARY, - .len = sizeof(struct sockaddr_storage)}, -}; +#define UDP_MIN_HEADROOM 48 /** * struct udp_media_addr - IP/UDP addressing information @@ -181,6 +174,8 @@ static int tipc_udp_send_msg(struct net *net, struct sk_buff *skb, err = PTR_ERR(rt); goto tx_error; } + + skb->dev = rt->dst.dev; ttl = ip4_dst_hoplimit(&rt->dst); udp_tunnel_xmit_skb(rt, ub->ubsock->sk, skb, src->ipv4.s_addr, dst->ipv4.s_addr, 0, ttl, 0, src->udp_port, @@ -201,7 +196,7 @@ static int tipc_udp_send_msg(struct net *net, struct sk_buff *skb, ttl = ip6_dst_hoplimit(ndst); err = udp_tunnel6_xmit_skb(ndst, ub->ubsock->sk, skb, ndst->dev, &src->ipv6, - &dst->ipv6, 0, ttl, src->udp_port, + &dst->ipv6, 0, ttl, 0, src->udp_port, dst->udp_port, false); #endif } @@ -274,7 +269,7 @@ static int parse_options(struct nlattr *attrs[], struct udp_bearer *ub, struct udp_media_addr *remote) { struct nlattr *opts[TIPC_NLA_UDP_MAX + 1]; - struct sockaddr_storage *sa_local, *sa_remote; + struct sockaddr_storage sa_local, sa_remote; if (!attrs[TIPC_NLA_BEARER_UDP_OPTS]) goto err; @@ -283,41 +278,48 @@ static int parse_options(struct nlattr *attrs[], struct udp_bearer *ub, tipc_nl_udp_policy)) goto err; if (opts[TIPC_NLA_UDP_LOCAL] && opts[TIPC_NLA_UDP_REMOTE]) { - sa_local = nla_data(opts[TIPC_NLA_UDP_LOCAL]); - sa_remote = nla_data(opts[TIPC_NLA_UDP_REMOTE]); + nla_memcpy(&sa_local, opts[TIPC_NLA_UDP_LOCAL], + sizeof(sa_local)); + nla_memcpy(&sa_remote, opts[TIPC_NLA_UDP_REMOTE], + sizeof(sa_remote)); } else { err: pr_err("Invalid UDP bearer configuration"); return -EINVAL; } - if ((sa_local->ss_family & sa_remote->ss_family) == AF_INET) { + if ((sa_local.ss_family & sa_remote.ss_family) == AF_INET) { struct sockaddr_in *ip4; - ip4 = (struct sockaddr_in *)sa_local; + ip4 = (struct sockaddr_in *)&sa_local; local->proto = htons(ETH_P_IP); local->udp_port = ip4->sin_port; local->ipv4.s_addr = ip4->sin_addr.s_addr; - ip4 = (struct sockaddr_in *)sa_remote; + ip4 = (struct sockaddr_in *)&sa_remote; remote->proto = htons(ETH_P_IP); remote->udp_port = ip4->sin_port; remote->ipv4.s_addr = ip4->sin_addr.s_addr; return 0; #if IS_ENABLED(CONFIG_IPV6) - } else if ((sa_local->ss_family & sa_remote->ss_family) == AF_INET6) { + } else if ((sa_local.ss_family & sa_remote.ss_family) == AF_INET6) { + int atype; struct sockaddr_in6 *ip6; - ip6 = (struct sockaddr_in6 *)sa_local; + ip6 = (struct sockaddr_in6 *)&sa_local; + atype = ipv6_addr_type(&ip6->sin6_addr); + if (__ipv6_addr_needs_scope_id(atype) && !ip6->sin6_scope_id) + return -EINVAL; + local->proto = htons(ETH_P_IPV6); local->udp_port = ip6->sin6_port; - local->ipv6 = ip6->sin6_addr; + memcpy(&local->ipv6, &ip6->sin6_addr, sizeof(struct in6_addr)); ub->ifindex = ip6->sin6_scope_id; - ip6 = (struct sockaddr_in6 *)sa_remote; + ip6 = (struct sockaddr_in6 *)&sa_remote; remote->proto = htons(ETH_P_IPV6); remote->udp_port = ip6->sin6_port; - remote->ipv6 = ip6->sin6_addr; + memcpy(&remote->ipv6, &ip6->sin6_addr, sizeof(struct in6_addr)); return 0; #endif } diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index f75f847e6..8269da73e 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1534,7 +1534,6 @@ static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb) { int i; unsigned char max_level = 0; - int unix_sock_count = 0; if (too_many_unix_fds(current)) return -ETOOMANYREFS; @@ -1542,11 +1541,9 @@ static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb) for (i = scm->fp->count - 1; i >= 0; i--) { struct sock *sk = unix_get_socket(scm->fp->fp[i]); - if (sk) { - unix_sock_count++; + if (sk) max_level = max(max_level, unix_sk(sk)->recursion_level); - } } if (unlikely(max_level > MAX_RECURSION_LEVEL)) return -ETOOMANYREFS; diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index bbe65dcb9..b5f1221f4 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -1209,10 +1209,14 @@ static int vsock_stream_connect(struct socket *sock, struct sockaddr *addr, if (signal_pending(current)) { err = sock_intr_errno(timeout); - goto out_wait_error; + sk->sk_state = SS_UNCONNECTED; + sock->state = SS_UNCONNECTED; + goto out_wait; } else if (timeout == 0) { err = -ETIMEDOUT; - goto out_wait_error; + sk->sk_state = SS_UNCONNECTED; + sock->state = SS_UNCONNECTED; + goto out_wait; } prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); @@ -1220,20 +1224,17 @@ static int vsock_stream_connect(struct socket *sock, struct sockaddr *addr, if (sk->sk_err) { err = -sk->sk_err; - goto out_wait_error; - } else + sk->sk_state = SS_UNCONNECTED; + sock->state = SS_UNCONNECTED; + } else { err = 0; + } out_wait: finish_wait(sk_sleep(sk), &wait); out: release_sock(sk); return err; - -out_wait_error: - sk->sk_state = SS_UNCONNECTED; - sock->state = SS_UNCONNECTED; - goto out_wait; } static int vsock_accept(struct socket *sock, struct socket *newsock, int flags) @@ -1270,18 +1271,20 @@ static int vsock_accept(struct socket *sock, struct socket *newsock, int flags) listener->sk_err == 0) { release_sock(listener); timeout = schedule_timeout(timeout); + finish_wait(sk_sleep(listener), &wait); lock_sock(listener); if (signal_pending(current)) { err = sock_intr_errno(timeout); - goto out_wait; + goto out; } else if (timeout == 0) { err = -EAGAIN; - goto out_wait; + goto out; } prepare_to_wait(sk_sleep(listener), &wait, TASK_INTERRUPTIBLE); } + finish_wait(sk_sleep(listener), &wait); if (listener->sk_err) err = -listener->sk_err; @@ -1301,19 +1304,15 @@ static int vsock_accept(struct socket *sock, struct socket *newsock, int flags) */ if (err) { vconnected->rejected = true; - release_sock(connected); - sock_put(connected); - goto out_wait; + } else { + newsock->state = SS_CONNECTED; + sock_graft(connected, newsock); } - newsock->state = SS_CONNECTED; - sock_graft(connected, newsock); release_sock(connected); sock_put(connected); } -out_wait: - finish_wait(sk_sleep(listener), &wait); out: release_sock(listener); return err; @@ -1557,9 +1556,11 @@ static int vsock_stream_sendmsg(struct socket *sock, struct msghdr *msg, if (err < 0) goto out; + while (total_written < len) { ssize_t written; + prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); while (vsock_stream_has_space(vsk) == 0 && sk->sk_err == 0 && !(sk->sk_shutdown & SEND_SHUTDOWN) && @@ -1568,27 +1569,33 @@ static int vsock_stream_sendmsg(struct socket *sock, struct msghdr *msg, /* Don't wait for non-blocking sockets. */ if (timeout == 0) { err = -EAGAIN; - goto out_wait; + finish_wait(sk_sleep(sk), &wait); + goto out_err; } err = transport->notify_send_pre_block(vsk, &send_data); - if (err < 0) - goto out_wait; + if (err < 0) { + finish_wait(sk_sleep(sk), &wait); + goto out_err; + } release_sock(sk); - prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); timeout = schedule_timeout(timeout); - finish_wait(sk_sleep(sk), &wait); lock_sock(sk); if (signal_pending(current)) { err = sock_intr_errno(timeout); - goto out_wait; + finish_wait(sk_sleep(sk), &wait); + goto out_err; } else if (timeout == 0) { err = -EAGAIN; - goto out_wait; + finish_wait(sk_sleep(sk), &wait); + goto out_err; } + prepare_to_wait(sk_sleep(sk), &wait, + TASK_INTERRUPTIBLE); } + finish_wait(sk_sleep(sk), &wait); /* These checks occur both as part of and after the loop * conditional since we need to check before and after @@ -1596,16 +1603,16 @@ static int vsock_stream_sendmsg(struct socket *sock, struct msghdr *msg, */ if (sk->sk_err) { err = -sk->sk_err; - goto out_wait; + goto out_err; } else if ((sk->sk_shutdown & SEND_SHUTDOWN) || (vsk->peer_shutdown & RCV_SHUTDOWN)) { err = -EPIPE; - goto out_wait; + goto out_err; } err = transport->notify_send_pre_enqueue(vsk, &send_data); if (err < 0) - goto out_wait; + goto out_err; /* Note that enqueue will only write as many bytes as are free * in the produce queue, so we don't need to ensure len is @@ -1618,7 +1625,7 @@ static int vsock_stream_sendmsg(struct socket *sock, struct msghdr *msg, len - total_written); if (written < 0) { err = -ENOMEM; - goto out_wait; + goto out_err; } total_written += written; @@ -1626,11 +1633,11 @@ static int vsock_stream_sendmsg(struct socket *sock, struct msghdr *msg, err = transport->notify_send_post_enqueue( vsk, written, &send_data); if (err < 0) - goto out_wait; + goto out_err; } -out_wait: +out_err: if (total_written > 0) err = total_written; out: @@ -1715,18 +1722,59 @@ vsock_stream_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, while (1) { - s64 ready = vsock_stream_has_data(vsk); + s64 ready; - if (ready < 0) { - /* Invalid queue pair content. XXX This should be - * changed to a connection reset in a later change. - */ + prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); + ready = vsock_stream_has_data(vsk); - err = -ENOMEM; - goto out; - } else if (ready > 0) { + if (ready == 0) { + if (sk->sk_err != 0 || + (sk->sk_shutdown & RCV_SHUTDOWN) || + (vsk->peer_shutdown & SEND_SHUTDOWN)) { + finish_wait(sk_sleep(sk), &wait); + break; + } + /* Don't wait for non-blocking sockets. */ + if (timeout == 0) { + err = -EAGAIN; + finish_wait(sk_sleep(sk), &wait); + break; + } + + err = transport->notify_recv_pre_block( + vsk, target, &recv_data); + if (err < 0) { + finish_wait(sk_sleep(sk), &wait); + break; + } + release_sock(sk); + timeout = schedule_timeout(timeout); + lock_sock(sk); + + if (signal_pending(current)) { + err = sock_intr_errno(timeout); + finish_wait(sk_sleep(sk), &wait); + break; + } else if (timeout == 0) { + err = -EAGAIN; + finish_wait(sk_sleep(sk), &wait); + break; + } + } else { ssize_t read; + finish_wait(sk_sleep(sk), &wait); + + if (ready < 0) { + /* Invalid queue pair content. XXX This should + * be changed to a connection reset in a later + * change. + */ + + err = -ENOMEM; + goto out; + } + err = transport->notify_recv_pre_dequeue( vsk, target, &recv_data); if (err < 0) @@ -1752,35 +1800,6 @@ vsock_stream_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, break; target -= read; - } else { - if (sk->sk_err != 0 || (sk->sk_shutdown & RCV_SHUTDOWN) - || (vsk->peer_shutdown & SEND_SHUTDOWN)) { - break; - } - /* Don't wait for non-blocking sockets. */ - if (timeout == 0) { - err = -EAGAIN; - break; - } - - err = transport->notify_recv_pre_block( - vsk, target, &recv_data); - if (err < 0) - break; - - release_sock(sk); - prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); - timeout = schedule_timeout(timeout); - finish_wait(sk_sleep(sk), &wait); - lock_sock(sk); - - if (signal_pending(current)) { - err = sock_intr_errno(timeout); - break; - } else if (timeout == 0) { - err = -EAGAIN; - break; - } } } @@ -1789,27 +1808,8 @@ vsock_stream_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, else if (sk->sk_shutdown & RCV_SHUTDOWN) err = 0; - if (copied > 0) { - /* We only do these additional bookkeeping/notification steps - * if we actually copied something out of the queue pair - * instead of just peeking ahead. - */ - - if (!(flags & MSG_PEEK)) { - /* If the other side has shutdown for sending and there - * is nothing more to read, then modify the socket - * state. - */ - if (vsk->peer_shutdown & SEND_SHUTDOWN) { - if (vsock_stream_has_data(vsk) <= 0) { - sk->sk_state = SS_UNCONNECTED; - sock_set_flag(sk, SOCK_DONE); - sk->sk_state_change(sk); - } - } - } + if (copied > 0) err = copied; - } out: release_sock(sk); diff --git a/net/vmw_vsock/vmci_transport.c b/net/vmw_vsock/vmci_transport.c index 0a369bb44..56214736f 100644 --- a/net/vmw_vsock/vmci_transport.c +++ b/net/vmw_vsock/vmci_transport.c @@ -842,7 +842,7 @@ static void vmci_transport_peer_detach_cb(u32 sub_id, * qp_handle. */ if (vmci_handle_is_invalid(e_payload->handle) || - vmci_handle_is_equal(trans->qp_handle, e_payload->handle)) + !vmci_handle_is_equal(trans->qp_handle, e_payload->handle)) return; /* We don't ask for delayed CBs when we subscribe to this event (we @@ -1735,11 +1735,8 @@ static int vmci_transport_dgram_dequeue(struct vsock_sock *vsk, /* Retrieve the head sk_buff from the socket's receive queue. */ err = 0; skb = skb_recv_datagram(&vsk->sk, flags, noblock, &err); - if (err) - return err; - if (!skb) - return -EAGAIN; + return err; dg = (struct vmci_datagram *)skb->data; if (!dg) @@ -2154,7 +2151,7 @@ module_exit(vmci_transport_exit); MODULE_AUTHOR("VMware, Inc."); MODULE_DESCRIPTION("VMCI transport for Virtual Sockets"); -MODULE_VERSION("1.0.2.0-k"); +MODULE_VERSION("1.0.4.0-k"); MODULE_LICENSE("GPL v2"); MODULE_ALIAS("vmware_vsock"); MODULE_ALIAS_NETPROTO(PF_VSOCK); diff --git a/net/wireless/Kconfig b/net/wireless/Kconfig index da72ed32f..6c606120a 100644 --- a/net/wireless/Kconfig +++ b/net/wireless/Kconfig @@ -50,8 +50,8 @@ config CFG80211_DEVELOPER_WARNINGS default n help This option enables some additional warnings that help - cfg80211 developers and driver developers, but that can - trigger due to races with userspace. + cfg80211 developers and driver developers, but beware that + they can also trigger due to races with userspace. For example, when a driver reports that it was disconnected from the AP, but the user disconnects manually at the same @@ -61,19 +61,6 @@ config CFG80211_DEVELOPER_WARNINGS on it (or mac80211). -config CFG80211_REG_DEBUG - bool "cfg80211 regulatory debugging" - depends on CFG80211 - default n - ---help--- - You can enable this if you want to debug regulatory changes. - For more information on cfg80211 regulatory refer to the wireless - wiki: - - http://wireless.kernel.org/en/developers/Regulatory - - If unsure, say N. - config CFG80211_CERTIFICATION_ONUS bool "cfg80211 certification onus" depends on CFG80211 && EXPERT @@ -123,7 +110,7 @@ config CFG80211_REG_RELAX_NO_IR interface which associated to an AP which userspace assumes or confirms to be an authorized master, i.e., with radar detection support and DFS capabilities. However, note that in order to not create daisy chain - scenarios, this relaxation is not allowed in cases that the BSS client + scenarios, this relaxation is not allowed in cases where the BSS client is associated to P2P GO and in addition the P2P GO instantiated on a channel due to this relaxation should not allow connection from non P2P clients. @@ -148,7 +135,7 @@ config CFG80211_DEBUGFS depends on CFG80211 depends on DEBUG_FS ---help--- - You can enable this if you want to debugfs entries for cfg80211. + You can enable this if you want debugfs entries for cfg80211. If unsure, say N. @@ -159,7 +146,7 @@ config CFG80211_INTERNAL_REGDB ---help--- This option generates an internal data structure representing the wireless regulatory rules described in net/wireless/db.txt - and includes code to query that database. This is an alternative + and includes code to query that database. This is an alternative to using CRDA for defining regulatory rules for the kernel. Using this option requires some parsing of the db.txt at build time, @@ -172,7 +159,7 @@ config CFG80211_INTERNAL_REGDB http://wireless.kernel.org/en/developers/Regulatory - Most distributions have a CRDA package. So if unsure, say N. + Most distributions have a CRDA package. So if unsure, say N. config CFG80211_CRDA_SUPPORT bool "support CRDA" if CFG80211_INTERNAL_REGDB diff --git a/net/wireless/core.c b/net/wireless/core.c index 8f0bac7e0..9f1c4aa85 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -352,6 +352,16 @@ struct wiphy *wiphy_new_nm(const struct cfg80211_ops *ops, int sizeof_priv, WARN_ON(ops->add_station && !ops->del_station); WARN_ON(ops->add_mpath && !ops->del_mpath); WARN_ON(ops->join_mesh && !ops->leave_mesh); + WARN_ON(ops->start_p2p_device && !ops->stop_p2p_device); + WARN_ON(ops->start_ap && !ops->stop_ap); + WARN_ON(ops->join_ocb && !ops->leave_ocb); + WARN_ON(ops->suspend && !ops->resume); + WARN_ON(ops->sched_scan_start && !ops->sched_scan_stop); + WARN_ON(ops->remain_on_channel && !ops->cancel_remain_on_channel); + WARN_ON(ops->tdls_channel_switch && !ops->tdls_cancel_channel_switch); + WARN_ON(ops->add_tx_ts && !ops->del_tx_ts); + WARN_ON(ops->set_tx_power && !ops->get_tx_power); + WARN_ON(ops->set_antenna && !ops->get_antenna); alloc_size = sizeof(*rdev) + sizeof_priv; diff --git a/net/wireless/lib80211_crypt_tkip.c b/net/wireless/lib80211_crypt_tkip.c index 3cd819539..71447cf86 100644 --- a/net/wireless/lib80211_crypt_tkip.c +++ b/net/wireless/lib80211_crypt_tkip.c @@ -29,7 +29,8 @@ #include <linux/ieee80211.h> #include <net/iw_handler.h> -#include <linux/crypto.h> +#include <crypto/hash.h> +#include <crypto/skcipher.h> #include <linux/crc32.h> #include <net/lib80211.h> @@ -63,10 +64,10 @@ struct lib80211_tkip_data { int key_idx; - struct crypto_blkcipher *rx_tfm_arc4; - struct crypto_hash *rx_tfm_michael; - struct crypto_blkcipher *tx_tfm_arc4; - struct crypto_hash *tx_tfm_michael; + struct crypto_skcipher *rx_tfm_arc4; + struct crypto_ahash *rx_tfm_michael; + struct crypto_skcipher *tx_tfm_arc4; + struct crypto_ahash *tx_tfm_michael; /* scratch buffers for virt_to_page() (crypto API) */ u8 rx_hdr[16], tx_hdr[16]; @@ -98,29 +99,29 @@ static void *lib80211_tkip_init(int key_idx) priv->key_idx = key_idx; - priv->tx_tfm_arc4 = crypto_alloc_blkcipher("ecb(arc4)", 0, - CRYPTO_ALG_ASYNC); + priv->tx_tfm_arc4 = crypto_alloc_skcipher("ecb(arc4)", 0, + CRYPTO_ALG_ASYNC); if (IS_ERR(priv->tx_tfm_arc4)) { priv->tx_tfm_arc4 = NULL; goto fail; } - priv->tx_tfm_michael = crypto_alloc_hash("michael_mic", 0, - CRYPTO_ALG_ASYNC); + priv->tx_tfm_michael = crypto_alloc_ahash("michael_mic", 0, + CRYPTO_ALG_ASYNC); if (IS_ERR(priv->tx_tfm_michael)) { priv->tx_tfm_michael = NULL; goto fail; } - priv->rx_tfm_arc4 = crypto_alloc_blkcipher("ecb(arc4)", 0, - CRYPTO_ALG_ASYNC); + priv->rx_tfm_arc4 = crypto_alloc_skcipher("ecb(arc4)", 0, + CRYPTO_ALG_ASYNC); if (IS_ERR(priv->rx_tfm_arc4)) { priv->rx_tfm_arc4 = NULL; goto fail; } - priv->rx_tfm_michael = crypto_alloc_hash("michael_mic", 0, - CRYPTO_ALG_ASYNC); + priv->rx_tfm_michael = crypto_alloc_ahash("michael_mic", 0, + CRYPTO_ALG_ASYNC); if (IS_ERR(priv->rx_tfm_michael)) { priv->rx_tfm_michael = NULL; goto fail; @@ -130,14 +131,10 @@ static void *lib80211_tkip_init(int key_idx) fail: if (priv) { - if (priv->tx_tfm_michael) - crypto_free_hash(priv->tx_tfm_michael); - if (priv->tx_tfm_arc4) - crypto_free_blkcipher(priv->tx_tfm_arc4); - if (priv->rx_tfm_michael) - crypto_free_hash(priv->rx_tfm_michael); - if (priv->rx_tfm_arc4) - crypto_free_blkcipher(priv->rx_tfm_arc4); + crypto_free_ahash(priv->tx_tfm_michael); + crypto_free_skcipher(priv->tx_tfm_arc4); + crypto_free_ahash(priv->rx_tfm_michael); + crypto_free_skcipher(priv->rx_tfm_arc4); kfree(priv); } @@ -148,14 +145,10 @@ static void lib80211_tkip_deinit(void *priv) { struct lib80211_tkip_data *_priv = priv; if (_priv) { - if (_priv->tx_tfm_michael) - crypto_free_hash(_priv->tx_tfm_michael); - if (_priv->tx_tfm_arc4) - crypto_free_blkcipher(_priv->tx_tfm_arc4); - if (_priv->rx_tfm_michael) - crypto_free_hash(_priv->rx_tfm_michael); - if (_priv->rx_tfm_arc4) - crypto_free_blkcipher(_priv->rx_tfm_arc4); + crypto_free_ahash(_priv->tx_tfm_michael); + crypto_free_skcipher(_priv->tx_tfm_arc4); + crypto_free_ahash(_priv->rx_tfm_michael); + crypto_free_skcipher(_priv->rx_tfm_arc4); } kfree(priv); } @@ -353,11 +346,12 @@ static int lib80211_tkip_hdr(struct sk_buff *skb, int hdr_len, static int lib80211_tkip_encrypt(struct sk_buff *skb, int hdr_len, void *priv) { struct lib80211_tkip_data *tkey = priv; - struct blkcipher_desc desc = { .tfm = tkey->tx_tfm_arc4 }; + SKCIPHER_REQUEST_ON_STACK(req, tkey->tx_tfm_arc4); int len; u8 rc4key[16], *pos, *icv; u32 crc; struct scatterlist sg; + int err; if (tkey->flags & IEEE80211_CRYPTO_TKIP_COUNTERMEASURES) { struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data; @@ -382,9 +376,14 @@ static int lib80211_tkip_encrypt(struct sk_buff *skb, int hdr_len, void *priv) icv[2] = crc >> 16; icv[3] = crc >> 24; - crypto_blkcipher_setkey(tkey->tx_tfm_arc4, rc4key, 16); + crypto_skcipher_setkey(tkey->tx_tfm_arc4, rc4key, 16); sg_init_one(&sg, pos, len + 4); - return crypto_blkcipher_encrypt(&desc, &sg, &sg, len + 4); + skcipher_request_set_tfm(req, tkey->tx_tfm_arc4); + skcipher_request_set_callback(req, 0, NULL, NULL); + skcipher_request_set_crypt(req, &sg, &sg, len + 4, NULL); + err = crypto_skcipher_encrypt(req); + skcipher_request_zero(req); + return err; } /* @@ -403,7 +402,7 @@ static inline int tkip_replay_check(u32 iv32_n, u16 iv16_n, static int lib80211_tkip_decrypt(struct sk_buff *skb, int hdr_len, void *priv) { struct lib80211_tkip_data *tkey = priv; - struct blkcipher_desc desc = { .tfm = tkey->rx_tfm_arc4 }; + SKCIPHER_REQUEST_ON_STACK(req, tkey->rx_tfm_arc4); u8 rc4key[16]; u8 keyidx, *pos; u32 iv32; @@ -413,6 +412,7 @@ static int lib80211_tkip_decrypt(struct sk_buff *skb, int hdr_len, void *priv) u32 crc; struct scatterlist sg; int plen; + int err; hdr = (struct ieee80211_hdr *)skb->data; @@ -465,9 +465,14 @@ static int lib80211_tkip_decrypt(struct sk_buff *skb, int hdr_len, void *priv) plen = skb->len - hdr_len - 12; - crypto_blkcipher_setkey(tkey->rx_tfm_arc4, rc4key, 16); + crypto_skcipher_setkey(tkey->rx_tfm_arc4, rc4key, 16); sg_init_one(&sg, pos, plen + 4); - if (crypto_blkcipher_decrypt(&desc, &sg, &sg, plen + 4)) { + skcipher_request_set_tfm(req, tkey->rx_tfm_arc4); + skcipher_request_set_callback(req, 0, NULL, NULL); + skcipher_request_set_crypt(req, &sg, &sg, plen + 4, NULL); + err = crypto_skcipher_decrypt(req); + skcipher_request_zero(req); + if (err) { net_dbg_ratelimited("TKIP: failed to decrypt received packet from %pM\n", hdr->addr2); return -7; @@ -505,11 +510,12 @@ static int lib80211_tkip_decrypt(struct sk_buff *skb, int hdr_len, void *priv) return keyidx; } -static int michael_mic(struct crypto_hash *tfm_michael, u8 * key, u8 * hdr, +static int michael_mic(struct crypto_ahash *tfm_michael, u8 * key, u8 * hdr, u8 * data, size_t data_len, u8 * mic) { - struct hash_desc desc; + AHASH_REQUEST_ON_STACK(req, tfm_michael); struct scatterlist sg[2]; + int err; if (tfm_michael == NULL) { pr_warn("%s(): tfm_michael == NULL\n", __func__); @@ -519,12 +525,15 @@ static int michael_mic(struct crypto_hash *tfm_michael, u8 * key, u8 * hdr, sg_set_buf(&sg[0], hdr, 16); sg_set_buf(&sg[1], data, data_len); - if (crypto_hash_setkey(tfm_michael, key, 8)) + if (crypto_ahash_setkey(tfm_michael, key, 8)) return -1; - desc.tfm = tfm_michael; - desc.flags = 0; - return crypto_hash_digest(&desc, sg, data_len + 16, mic); + ahash_request_set_tfm(req, tfm_michael); + ahash_request_set_callback(req, 0, NULL, NULL); + ahash_request_set_crypt(req, sg, mic, data_len + 16); + err = crypto_ahash_digest(req); + ahash_request_zero(req); + return err; } static void michael_mic_hdr(struct sk_buff *skb, u8 * hdr) @@ -645,10 +654,10 @@ static int lib80211_tkip_set_key(void *key, int len, u8 * seq, void *priv) { struct lib80211_tkip_data *tkey = priv; int keyidx; - struct crypto_hash *tfm = tkey->tx_tfm_michael; - struct crypto_blkcipher *tfm2 = tkey->tx_tfm_arc4; - struct crypto_hash *tfm3 = tkey->rx_tfm_michael; - struct crypto_blkcipher *tfm4 = tkey->rx_tfm_arc4; + struct crypto_ahash *tfm = tkey->tx_tfm_michael; + struct crypto_skcipher *tfm2 = tkey->tx_tfm_arc4; + struct crypto_ahash *tfm3 = tkey->rx_tfm_michael; + struct crypto_skcipher *tfm4 = tkey->rx_tfm_arc4; keyidx = tkey->key_idx; memset(tkey, 0, sizeof(*tkey)); diff --git a/net/wireless/lib80211_crypt_wep.c b/net/wireless/lib80211_crypt_wep.c index 1c292e4ea..d05f58b0f 100644 --- a/net/wireless/lib80211_crypt_wep.c +++ b/net/wireless/lib80211_crypt_wep.c @@ -22,7 +22,7 @@ #include <net/lib80211.h> -#include <linux/crypto.h> +#include <crypto/skcipher.h> #include <linux/crc32.h> MODULE_AUTHOR("Jouni Malinen"); @@ -35,8 +35,8 @@ struct lib80211_wep_data { u8 key[WEP_KEY_LEN + 1]; u8 key_len; u8 key_idx; - struct crypto_blkcipher *tx_tfm; - struct crypto_blkcipher *rx_tfm; + struct crypto_skcipher *tx_tfm; + struct crypto_skcipher *rx_tfm; }; static void *lib80211_wep_init(int keyidx) @@ -48,13 +48,13 @@ static void *lib80211_wep_init(int keyidx) goto fail; priv->key_idx = keyidx; - priv->tx_tfm = crypto_alloc_blkcipher("ecb(arc4)", 0, CRYPTO_ALG_ASYNC); + priv->tx_tfm = crypto_alloc_skcipher("ecb(arc4)", 0, CRYPTO_ALG_ASYNC); if (IS_ERR(priv->tx_tfm)) { priv->tx_tfm = NULL; goto fail; } - priv->rx_tfm = crypto_alloc_blkcipher("ecb(arc4)", 0, CRYPTO_ALG_ASYNC); + priv->rx_tfm = crypto_alloc_skcipher("ecb(arc4)", 0, CRYPTO_ALG_ASYNC); if (IS_ERR(priv->rx_tfm)) { priv->rx_tfm = NULL; goto fail; @@ -66,10 +66,8 @@ static void *lib80211_wep_init(int keyidx) fail: if (priv) { - if (priv->tx_tfm) - crypto_free_blkcipher(priv->tx_tfm); - if (priv->rx_tfm) - crypto_free_blkcipher(priv->rx_tfm); + crypto_free_skcipher(priv->tx_tfm); + crypto_free_skcipher(priv->rx_tfm); kfree(priv); } return NULL; @@ -79,10 +77,8 @@ static void lib80211_wep_deinit(void *priv) { struct lib80211_wep_data *_priv = priv; if (_priv) { - if (_priv->tx_tfm) - crypto_free_blkcipher(_priv->tx_tfm); - if (_priv->rx_tfm) - crypto_free_blkcipher(_priv->rx_tfm); + crypto_free_skcipher(_priv->tx_tfm); + crypto_free_skcipher(_priv->rx_tfm); } kfree(priv); } @@ -133,11 +129,12 @@ static int lib80211_wep_build_iv(struct sk_buff *skb, int hdr_len, static int lib80211_wep_encrypt(struct sk_buff *skb, int hdr_len, void *priv) { struct lib80211_wep_data *wep = priv; - struct blkcipher_desc desc = { .tfm = wep->tx_tfm }; + SKCIPHER_REQUEST_ON_STACK(req, wep->tx_tfm); u32 crc, klen, len; u8 *pos, *icv; struct scatterlist sg; u8 key[WEP_KEY_LEN + 3]; + int err; /* other checks are in lib80211_wep_build_iv */ if (skb_tailroom(skb) < 4) @@ -165,9 +162,14 @@ static int lib80211_wep_encrypt(struct sk_buff *skb, int hdr_len, void *priv) icv[2] = crc >> 16; icv[3] = crc >> 24; - crypto_blkcipher_setkey(wep->tx_tfm, key, klen); + crypto_skcipher_setkey(wep->tx_tfm, key, klen); sg_init_one(&sg, pos, len + 4); - return crypto_blkcipher_encrypt(&desc, &sg, &sg, len + 4); + skcipher_request_set_tfm(req, wep->tx_tfm); + skcipher_request_set_callback(req, 0, NULL, NULL); + skcipher_request_set_crypt(req, &sg, &sg, len + 4, NULL); + err = crypto_skcipher_encrypt(req); + skcipher_request_zero(req); + return err; } /* Perform WEP decryption on given buffer. Buffer includes whole WEP part of @@ -180,11 +182,12 @@ static int lib80211_wep_encrypt(struct sk_buff *skb, int hdr_len, void *priv) static int lib80211_wep_decrypt(struct sk_buff *skb, int hdr_len, void *priv) { struct lib80211_wep_data *wep = priv; - struct blkcipher_desc desc = { .tfm = wep->rx_tfm }; + SKCIPHER_REQUEST_ON_STACK(req, wep->rx_tfm); u32 crc, klen, plen; u8 key[WEP_KEY_LEN + 3]; u8 keyidx, *pos, icv[4]; struct scatterlist sg; + int err; if (skb->len < hdr_len + 8) return -1; @@ -205,9 +208,14 @@ static int lib80211_wep_decrypt(struct sk_buff *skb, int hdr_len, void *priv) /* Apply RC4 to data and compute CRC32 over decrypted data */ plen = skb->len - hdr_len - 8; - crypto_blkcipher_setkey(wep->rx_tfm, key, klen); + crypto_skcipher_setkey(wep->rx_tfm, key, klen); sg_init_one(&sg, pos, plen + 4); - if (crypto_blkcipher_decrypt(&desc, &sg, &sg, plen + 4)) + skcipher_request_set_tfm(req, wep->rx_tfm); + skcipher_request_set_callback(req, 0, NULL, NULL); + skcipher_request_set_crypt(req, &sg, &sg, plen + 4, NULL); + err = crypto_skcipher_decrypt(req); + skcipher_request_zero(req); + if (err) return -7; crc = ~crc32_le(~0, pos, plen); diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c index fb44fa3bf..ff328250b 100644 --- a/net/wireless/mlme.c +++ b/net/wireless/mlme.c @@ -711,7 +711,7 @@ EXPORT_SYMBOL(cfg80211_rx_mgmt); void cfg80211_dfs_channels_update_work(struct work_struct *work) { - struct delayed_work *delayed_work; + struct delayed_work *delayed_work = to_delayed_work(work); struct cfg80211_registered_device *rdev; struct cfg80211_chan_def chandef; struct ieee80211_supported_band *sband; @@ -721,7 +721,6 @@ void cfg80211_dfs_channels_update_work(struct work_struct *work) unsigned long timeout, next_time = 0; int bandid, i; - delayed_work = container_of(work, struct delayed_work, work); rdev = container_of(delayed_work, struct cfg80211_registered_device, dfs_update_channels_wk); wiphy = &rdev->wiphy; diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index ab62d305b..056a73078 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -3,7 +3,7 @@ * * Copyright 2006-2010 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH - * Copyright 2015 Intel Deutschland GmbH + * Copyright 2015-2016 Intel Deutschland GmbH */ #include <linux/if.h> @@ -401,6 +401,7 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_NETNS_FD] = { .type = NLA_U32 }, [NL80211_ATTR_SCHED_SCAN_DELAY] = { .type = NLA_U32 }, [NL80211_ATTR_REG_INDOOR] = { .type = NLA_FLAG }, + [NL80211_ATTR_PBSS] = { .type = NLA_FLAG }, }; /* policy for the key attributes */ @@ -3461,6 +3462,10 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info) return PTR_ERR(params.acl); } + params.pbss = nla_get_flag(info->attrs[NL80211_ATTR_PBSS]); + if (params.pbss && !rdev->wiphy.bands[IEEE80211_BAND_60GHZ]) + return -EOPNOTSUPP; + wdev_lock(wdev); err = rdev_start_ap(rdev, dev, ¶ms); if (!err) { @@ -7281,9 +7286,11 @@ static int nl80211_associate(struct sk_buff *skb, struct genl_info *info) } if (nla_get_flag(info->attrs[NL80211_ATTR_USE_RRM])) { - if (!(rdev->wiphy.features & - NL80211_FEATURE_DS_PARAM_SET_IE_IN_PROBES) || - !(rdev->wiphy.features & NL80211_FEATURE_QUIET)) + if (!((rdev->wiphy.features & + NL80211_FEATURE_DS_PARAM_SET_IE_IN_PROBES) && + (rdev->wiphy.features & NL80211_FEATURE_QUIET)) && + !wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_RRM)) return -EINVAL; req.flags |= ASSOC_REQ_USE_RRM; } @@ -7971,15 +7978,23 @@ static int nl80211_connect(struct sk_buff *skb, struct genl_info *info) } if (nla_get_flag(info->attrs[NL80211_ATTR_USE_RRM])) { - if (!(rdev->wiphy.features & - NL80211_FEATURE_DS_PARAM_SET_IE_IN_PROBES) || - !(rdev->wiphy.features & NL80211_FEATURE_QUIET)) { + if (!((rdev->wiphy.features & + NL80211_FEATURE_DS_PARAM_SET_IE_IN_PROBES) && + (rdev->wiphy.features & NL80211_FEATURE_QUIET)) && + !wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_RRM)) { kzfree(connkeys); return -EINVAL; } connect.flags |= ASSOC_REQ_USE_RRM; } + connect.pbss = nla_get_flag(info->attrs[NL80211_ATTR_PBSS]); + if (connect.pbss && !rdev->wiphy.bands[IEEE80211_BAND_60GHZ]) { + kzfree(connkeys); + return -EOPNOTSUPP; + } + wdev_lock(dev->ieee80211_ptr); err = cfg80211_connect(rdev, dev, &connect, connkeys, NULL); wdev_unlock(dev->ieee80211_ptr); diff --git a/net/wireless/radiotap.c b/net/wireless/radiotap.c index 722da6164..6582d155e 100644 --- a/net/wireless/radiotap.c +++ b/net/wireless/radiotap.c @@ -43,6 +43,7 @@ static const struct radiotap_align_size rtap_namespace_sizes[] = { [IEEE80211_RADIOTAP_DATA_RETRIES] = { .align = 1, .size = 1, }, [IEEE80211_RADIOTAP_MCS] = { .align = 1, .size = 3, }, [IEEE80211_RADIOTAP_AMPDU_STATUS] = { .align = 4, .size = 8, }, + [IEEE80211_RADIOTAP_VHT] = { .align = 2, .size = 12, }, /* * add more here as they are defined in radiotap.h */ diff --git a/net/wireless/reg.c b/net/wireless/reg.c index 547ceecc0..c5fb317ee 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -60,13 +60,6 @@ #include "regdb.h" #include "nl80211.h" -#ifdef CONFIG_CFG80211_REG_DEBUG -#define REG_DBG_PRINT(format, args...) \ - printk(KERN_DEBUG pr_fmt(format), ##args) -#else -#define REG_DBG_PRINT(args...) -#endif - /* * Grace period we give before making sure all current interfaces reside on * channels allowed by the current regulatory domain. @@ -178,12 +171,10 @@ enum nl80211_dfs_regions reg_get_dfs_region(struct wiphy *wiphy) if (wiphy_regd->dfs_region == regd->dfs_region) goto out; - REG_DBG_PRINT("%s: device specific dfs_region " - "(%s) disagrees with cfg80211's " - "central dfs_region (%s)\n", - dev_name(&wiphy->dev), - reg_dfs_region_str(wiphy_regd->dfs_region), - reg_dfs_region_str(regd->dfs_region)); + pr_debug("%s: device specific dfs_region (%s) disagrees with cfg80211's central dfs_region (%s)\n", + dev_name(&wiphy->dev), + reg_dfs_region_str(wiphy_regd->dfs_region), + reg_dfs_region_str(regd->dfs_region)); out: return regd->dfs_region; @@ -543,7 +534,7 @@ static DECLARE_DELAYED_WORK(crda_timeout, crda_timeout_work); static void crda_timeout_work(struct work_struct *work) { - REG_DBG_PRINT("Timeout while waiting for CRDA to reply, restoring regulatory settings\n"); + pr_debug("Timeout while waiting for CRDA to reply, restoring regulatory settings\n"); rtnl_lock(); reg_crda_timeouts++; restore_regulatory_settings(true); @@ -585,7 +576,7 @@ static int call_crda(const char *alpha2) if (!is_world_regdom((char *) alpha2)) pr_debug("Calling CRDA for country: %c%c\n", - alpha2[0], alpha2[1]); + alpha2[0], alpha2[1]); else pr_debug("Calling CRDA to update world regulatory domain\n"); @@ -1132,42 +1123,6 @@ const char *reg_initiator_name(enum nl80211_reg_initiator initiator) } EXPORT_SYMBOL(reg_initiator_name); -static void chan_reg_rule_print_dbg(const struct ieee80211_regdomain *regd, - struct ieee80211_channel *chan, - const struct ieee80211_reg_rule *reg_rule) -{ -#ifdef CONFIG_CFG80211_REG_DEBUG - const struct ieee80211_power_rule *power_rule; - const struct ieee80211_freq_range *freq_range; - char max_antenna_gain[32], bw[32]; - - power_rule = ®_rule->power_rule; - freq_range = ®_rule->freq_range; - - if (!power_rule->max_antenna_gain) - snprintf(max_antenna_gain, sizeof(max_antenna_gain), "N/A"); - else - snprintf(max_antenna_gain, sizeof(max_antenna_gain), "%d mBi", - power_rule->max_antenna_gain); - - if (reg_rule->flags & NL80211_RRF_AUTO_BW) - snprintf(bw, sizeof(bw), "%d KHz, %d KHz AUTO", - freq_range->max_bandwidth_khz, - reg_get_max_bandwidth(regd, reg_rule)); - else - snprintf(bw, sizeof(bw), "%d KHz", - freq_range->max_bandwidth_khz); - - REG_DBG_PRINT("Updating information on frequency %d MHz with regulatory rule:\n", - chan->center_freq); - - REG_DBG_PRINT("(%d KHz - %d KHz @ %s), (%s, %d mBm)\n", - freq_range->start_freq_khz, freq_range->end_freq_khz, - bw, max_antenna_gain, - power_rule->max_eirp); -#endif -} - static uint32_t reg_rule_to_chan_bw_flags(const struct ieee80211_regdomain *regd, const struct ieee80211_reg_rule *reg_rule, const struct ieee80211_channel *chan) @@ -1242,20 +1197,19 @@ static void handle_channel(struct wiphy *wiphy, if (lr->initiator == NL80211_REGDOM_SET_BY_DRIVER && request_wiphy && request_wiphy == wiphy && request_wiphy->regulatory_flags & REGULATORY_STRICT_REG) { - REG_DBG_PRINT("Disabling freq %d MHz for good\n", - chan->center_freq); + pr_debug("Disabling freq %d MHz for good\n", + chan->center_freq); chan->orig_flags |= IEEE80211_CHAN_DISABLED; chan->flags = chan->orig_flags; } else { - REG_DBG_PRINT("Disabling freq %d MHz\n", - chan->center_freq); + pr_debug("Disabling freq %d MHz\n", + chan->center_freq); chan->flags |= IEEE80211_CHAN_DISABLED; } return; } regd = reg_get_regdomain(wiphy); - chan_reg_rule_print_dbg(regd, chan, reg_rule); power_rule = ®_rule->power_rule; bw_flags = reg_rule_to_chan_bw_flags(regd, reg_rule, chan); @@ -1393,18 +1347,15 @@ static bool ignore_reg_update(struct wiphy *wiphy, return true; if (!lr) { - REG_DBG_PRINT("Ignoring regulatory request set by %s " - "since last_request is not set\n", - reg_initiator_name(initiator)); + pr_debug("Ignoring regulatory request set by %s since last_request is not set\n", + reg_initiator_name(initiator)); return true; } if (initiator == NL80211_REGDOM_SET_BY_CORE && wiphy->regulatory_flags & REGULATORY_CUSTOM_REG) { - REG_DBG_PRINT("Ignoring regulatory request set by %s " - "since the driver uses its own custom " - "regulatory domain\n", - reg_initiator_name(initiator)); + pr_debug("Ignoring regulatory request set by %s since the driver uses its own custom regulatory domain\n", + reg_initiator_name(initiator)); return true; } @@ -1415,10 +1366,8 @@ static bool ignore_reg_update(struct wiphy *wiphy, if (wiphy_strict_alpha2_regd(wiphy) && !wiphy->regd && initiator != NL80211_REGDOM_SET_BY_COUNTRY_IE && !is_world_regdom(lr->alpha2)) { - REG_DBG_PRINT("Ignoring regulatory request set by %s " - "since the driver requires its own regulatory " - "domain to be set first\n", - reg_initiator_name(initiator)); + pr_debug("Ignoring regulatory request set by %s since the driver requires its own regulatory domain to be set first\n", + reg_initiator_name(initiator)); return true; } @@ -1699,7 +1648,7 @@ static void reg_check_chans_work(struct work_struct *work) { struct cfg80211_registered_device *rdev; - REG_DBG_PRINT("Verifying active interfaces after reg change\n"); + pr_debug("Verifying active interfaces after reg change\n"); rtnl_lock(); list_for_each_entry(rdev, &cfg80211_rdev_list, list) @@ -1781,8 +1730,8 @@ static void handle_channel_custom(struct wiphy *wiphy, } if (IS_ERR(reg_rule)) { - REG_DBG_PRINT("Disabling freq %d MHz as custom regd has no rule that fits it\n", - chan->center_freq); + pr_debug("Disabling freq %d MHz as custom regd has no rule that fits it\n", + chan->center_freq); if (wiphy->regulatory_flags & REGULATORY_WIPHY_SELF_MANAGED) { chan->flags |= IEEE80211_CHAN_DISABLED; } else { @@ -1792,8 +1741,6 @@ static void handle_channel_custom(struct wiphy *wiphy, return; } - chan_reg_rule_print_dbg(regd, chan, reg_rule); - power_rule = ®_rule->power_rule; bw_flags = reg_rule_to_chan_bw_flags(regd, reg_rule, chan); @@ -2524,7 +2471,7 @@ static void restore_alpha2(char *alpha2, bool reset_user) if (is_user_regdom_saved()) { /* Unless we're asked to ignore it and reset it */ if (reset_user) { - REG_DBG_PRINT("Restoring regulatory settings including user preference\n"); + pr_debug("Restoring regulatory settings including user preference\n"); user_alpha2[0] = '9'; user_alpha2[1] = '7'; @@ -2534,24 +2481,24 @@ static void restore_alpha2(char *alpha2, bool reset_user) * back as they were for a full restore. */ if (!is_world_regdom(ieee80211_regdom)) { - REG_DBG_PRINT("Keeping preference on module parameter ieee80211_regdom: %c%c\n", - ieee80211_regdom[0], ieee80211_regdom[1]); + pr_debug("Keeping preference on module parameter ieee80211_regdom: %c%c\n", + ieee80211_regdom[0], ieee80211_regdom[1]); alpha2[0] = ieee80211_regdom[0]; alpha2[1] = ieee80211_regdom[1]; } } else { - REG_DBG_PRINT("Restoring regulatory settings while preserving user preference for: %c%c\n", - user_alpha2[0], user_alpha2[1]); + pr_debug("Restoring regulatory settings while preserving user preference for: %c%c\n", + user_alpha2[0], user_alpha2[1]); alpha2[0] = user_alpha2[0]; alpha2[1] = user_alpha2[1]; } } else if (!is_world_regdom(ieee80211_regdom)) { - REG_DBG_PRINT("Keeping preference on module parameter ieee80211_regdom: %c%c\n", - ieee80211_regdom[0], ieee80211_regdom[1]); + pr_debug("Keeping preference on module parameter ieee80211_regdom: %c%c\n", + ieee80211_regdom[0], ieee80211_regdom[1]); alpha2[0] = ieee80211_regdom[0]; alpha2[1] = ieee80211_regdom[1]; } else - REG_DBG_PRINT("Restoring regulatory settings\n"); + pr_debug("Restoring regulatory settings\n"); } static void restore_custom_reg_settings(struct wiphy *wiphy) @@ -2663,14 +2610,14 @@ static void restore_regulatory_settings(bool reset_user) list_splice_tail_init(&tmp_reg_req_list, ®_requests_list); spin_unlock(®_requests_lock); - REG_DBG_PRINT("Kicking the queue\n"); + pr_debug("Kicking the queue\n"); schedule_work(®_work); } void regulatory_hint_disconnect(void) { - REG_DBG_PRINT("All devices are disconnected, going to restore regulatory settings\n"); + pr_debug("All devices are disconnected, going to restore regulatory settings\n"); restore_regulatory_settings(false); } @@ -2718,10 +2665,10 @@ int regulatory_hint_found_beacon(struct wiphy *wiphy, if (!reg_beacon) return -ENOMEM; - REG_DBG_PRINT("Found new beacon on frequency: %d MHz (Ch %d) on %s\n", - beacon_chan->center_freq, - ieee80211_frequency_to_channel(beacon_chan->center_freq), - wiphy_name(wiphy)); + pr_debug("Found new beacon on frequency: %d MHz (Ch %d) on %s\n", + beacon_chan->center_freq, + ieee80211_frequency_to_channel(beacon_chan->center_freq), + wiphy_name(wiphy)); memcpy(®_beacon->chan, beacon_chan, sizeof(struct ieee80211_channel)); @@ -2800,8 +2747,7 @@ bool reg_supported_dfs_region(enum nl80211_dfs_regions dfs_region) case NL80211_DFS_JP: return true; default: - REG_DBG_PRINT("Ignoring uknown DFS master region: %d\n", - dfs_region); + pr_debug("Ignoring uknown DFS master region: %d\n", dfs_region); return false; } } diff --git a/net/wireless/sme.c b/net/wireless/sme.c index d49ed7666..544558171 100644 --- a/net/wireless/sme.c +++ b/net/wireless/sme.c @@ -264,7 +264,7 @@ static struct cfg80211_bss *cfg80211_get_conn_bss(struct wireless_dev *wdev) wdev->conn->params.bssid, wdev->conn->params.ssid, wdev->conn->params.ssid_len, - IEEE80211_BSS_TYPE_ESS, + wdev->conn_bss_type, IEEE80211_PRIVACY(wdev->conn->params.privacy)); if (!bss) return NULL; @@ -687,7 +687,7 @@ void __cfg80211_connect_result(struct net_device *dev, const u8 *bssid, WARN_ON_ONCE(!wiphy_to_rdev(wdev->wiphy)->ops->connect); bss = cfg80211_get_bss(wdev->wiphy, NULL, bssid, wdev->ssid, wdev->ssid_len, - IEEE80211_BSS_TYPE_ESS, + wdev->conn_bss_type, IEEE80211_PRIVACY_ANY); if (bss) cfg80211_hold_bss(bss_from_pub(bss)); @@ -846,7 +846,7 @@ void cfg80211_roamed(struct net_device *dev, bss = cfg80211_get_bss(wdev->wiphy, channel, bssid, wdev->ssid, wdev->ssid_len, - IEEE80211_BSS_TYPE_ESS, IEEE80211_PRIVACY_ANY); + wdev->conn_bss_type, IEEE80211_PRIVACY_ANY); if (WARN_ON(!bss)) return; @@ -1023,6 +1023,9 @@ int cfg80211_connect(struct cfg80211_registered_device *rdev, memcpy(wdev->ssid, connect->ssid, connect->ssid_len); wdev->ssid_len = connect->ssid_len; + wdev->conn_bss_type = connect->pbss ? IEEE80211_BSS_TYPE_PBSS : + IEEE80211_BSS_TYPE_ESS; + if (!rdev->ops->connect) err = cfg80211_sme_connect(wdev, connect, prev_bssid); else diff --git a/net/wireless/util.c b/net/wireless/util.c index 92770427b..9f440a9de 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -393,9 +393,9 @@ unsigned int ieee80211_get_hdrlen_from_skb(const struct sk_buff *skb) } EXPORT_SYMBOL(ieee80211_get_hdrlen_from_skb); -unsigned int ieee80211_get_mesh_hdrlen(struct ieee80211s_hdr *meshhdr) +static unsigned int __ieee80211_get_mesh_hdrlen(u8 flags) { - int ae = meshhdr->flags & MESH_FLAGS_AE; + int ae = flags & MESH_FLAGS_AE; /* 802.11-2012, 8.2.4.7.3 */ switch (ae) { default: @@ -407,21 +407,31 @@ unsigned int ieee80211_get_mesh_hdrlen(struct ieee80211s_hdr *meshhdr) return 18; } } + +unsigned int ieee80211_get_mesh_hdrlen(struct ieee80211s_hdr *meshhdr) +{ + return __ieee80211_get_mesh_hdrlen(meshhdr->flags); +} EXPORT_SYMBOL(ieee80211_get_mesh_hdrlen); -int ieee80211_data_to_8023(struct sk_buff *skb, const u8 *addr, - enum nl80211_iftype iftype) +static int __ieee80211_data_to_8023(struct sk_buff *skb, struct ethhdr *ehdr, + const u8 *addr, enum nl80211_iftype iftype) { struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; - u16 hdrlen, ethertype; - u8 *payload; - u8 dst[ETH_ALEN]; - u8 src[ETH_ALEN] __aligned(2); + struct { + u8 hdr[ETH_ALEN] __aligned(2); + __be16 proto; + } payload; + struct ethhdr tmp; + u16 hdrlen; + u8 mesh_flags = 0; if (unlikely(!ieee80211_is_data_present(hdr->frame_control))) return -1; hdrlen = ieee80211_hdrlen(hdr->frame_control); + if (skb->len < hdrlen + 8) + return -1; /* convert IEEE 802.11 header + possible LLC headers into Ethernet * header @@ -432,8 +442,11 @@ int ieee80211_data_to_8023(struct sk_buff *skb, const u8 *addr, * 1 0 BSSID SA DA n/a * 1 1 RA TA DA SA */ - memcpy(dst, ieee80211_get_DA(hdr), ETH_ALEN); - memcpy(src, ieee80211_get_SA(hdr), ETH_ALEN); + memcpy(tmp.h_dest, ieee80211_get_DA(hdr), ETH_ALEN); + memcpy(tmp.h_source, ieee80211_get_SA(hdr), ETH_ALEN); + + if (iftype == NL80211_IFTYPE_MESH_POINT) + skb_copy_bits(skb, hdrlen, &mesh_flags, 1); switch (hdr->frame_control & cpu_to_le16(IEEE80211_FCTL_TODS | IEEE80211_FCTL_FROMDS)) { @@ -450,44 +463,31 @@ int ieee80211_data_to_8023(struct sk_buff *skb, const u8 *addr, iftype != NL80211_IFTYPE_STATION)) return -1; if (iftype == NL80211_IFTYPE_MESH_POINT) { - struct ieee80211s_hdr *meshdr = - (struct ieee80211s_hdr *) (skb->data + hdrlen); - /* make sure meshdr->flags is on the linear part */ - if (!pskb_may_pull(skb, hdrlen + 1)) - return -1; - if (meshdr->flags & MESH_FLAGS_AE_A4) + if (mesh_flags & MESH_FLAGS_AE_A4) return -1; - if (meshdr->flags & MESH_FLAGS_AE_A5_A6) { + if (mesh_flags & MESH_FLAGS_AE_A5_A6) { skb_copy_bits(skb, hdrlen + offsetof(struct ieee80211s_hdr, eaddr1), - dst, ETH_ALEN); - skb_copy_bits(skb, hdrlen + - offsetof(struct ieee80211s_hdr, eaddr2), - src, ETH_ALEN); + tmp.h_dest, 2 * ETH_ALEN); } - hdrlen += ieee80211_get_mesh_hdrlen(meshdr); + hdrlen += __ieee80211_get_mesh_hdrlen(mesh_flags); } break; case cpu_to_le16(IEEE80211_FCTL_FROMDS): if ((iftype != NL80211_IFTYPE_STATION && iftype != NL80211_IFTYPE_P2P_CLIENT && iftype != NL80211_IFTYPE_MESH_POINT) || - (is_multicast_ether_addr(dst) && - ether_addr_equal(src, addr))) + (is_multicast_ether_addr(tmp.h_dest) && + ether_addr_equal(tmp.h_source, addr))) return -1; if (iftype == NL80211_IFTYPE_MESH_POINT) { - struct ieee80211s_hdr *meshdr = - (struct ieee80211s_hdr *) (skb->data + hdrlen); - /* make sure meshdr->flags is on the linear part */ - if (!pskb_may_pull(skb, hdrlen + 1)) - return -1; - if (meshdr->flags & MESH_FLAGS_AE_A5_A6) + if (mesh_flags & MESH_FLAGS_AE_A5_A6) return -1; - if (meshdr->flags & MESH_FLAGS_AE_A4) + if (mesh_flags & MESH_FLAGS_AE_A4) skb_copy_bits(skb, hdrlen + offsetof(struct ieee80211s_hdr, eaddr1), - src, ETH_ALEN); - hdrlen += ieee80211_get_mesh_hdrlen(meshdr); + tmp.h_source, ETH_ALEN); + hdrlen += __ieee80211_get_mesh_hdrlen(mesh_flags); } break; case cpu_to_le16(0): @@ -498,33 +498,33 @@ int ieee80211_data_to_8023(struct sk_buff *skb, const u8 *addr, break; } - if (!pskb_may_pull(skb, hdrlen + 8)) - return -1; - - payload = skb->data + hdrlen; - ethertype = (payload[6] << 8) | payload[7]; + skb_copy_bits(skb, hdrlen, &payload, sizeof(payload)); + tmp.h_proto = payload.proto; - if (likely((ether_addr_equal(payload, rfc1042_header) && - ethertype != ETH_P_AARP && ethertype != ETH_P_IPX) || - ether_addr_equal(payload, bridge_tunnel_header))) { + if (likely((ether_addr_equal(payload.hdr, rfc1042_header) && + tmp.h_proto != htons(ETH_P_AARP) && + tmp.h_proto != htons(ETH_P_IPX)) || + ether_addr_equal(payload.hdr, bridge_tunnel_header))) /* remove RFC1042 or Bridge-Tunnel encapsulation and * replace EtherType */ - skb_pull(skb, hdrlen + 6); - memcpy(skb_push(skb, ETH_ALEN), src, ETH_ALEN); - memcpy(skb_push(skb, ETH_ALEN), dst, ETH_ALEN); - } else { - struct ethhdr *ehdr; - __be16 len; + hdrlen += ETH_ALEN + 2; + else + tmp.h_proto = htons(skb->len); - skb_pull(skb, hdrlen); - len = htons(skb->len); + pskb_pull(skb, hdrlen); + + if (!ehdr) ehdr = (struct ethhdr *) skb_push(skb, sizeof(struct ethhdr)); - memcpy(ehdr->h_dest, dst, ETH_ALEN); - memcpy(ehdr->h_source, src, ETH_ALEN); - ehdr->h_proto = len; - } + memcpy(ehdr, &tmp, sizeof(tmp)); + return 0; } + +int ieee80211_data_to_8023(struct sk_buff *skb, const u8 *addr, + enum nl80211_iftype iftype) +{ + return __ieee80211_data_to_8023(skb, NULL, addr, iftype); +} EXPORT_SYMBOL(ieee80211_data_to_8023); int ieee80211_data_from_8023(struct sk_buff *skb, const u8 *addr, @@ -636,7 +636,7 @@ int ieee80211_data_from_8023(struct sk_buff *skb, const u8 *addr, /* Update skb pointers to various headers since this modified frame * is going to go through Linux networking code that may potentially * need things like pointer to IP header. */ - skb_set_mac_header(skb, 0); + skb_reset_mac_header(skb); skb_set_network_header(skb, nh_pos); skb_set_transport_header(skb, h_pos); @@ -644,70 +644,147 @@ int ieee80211_data_from_8023(struct sk_buff *skb, const u8 *addr, } EXPORT_SYMBOL(ieee80211_data_from_8023); +static void +__frame_add_frag(struct sk_buff *skb, struct page *page, + void *ptr, int len, int size) +{ + struct skb_shared_info *sh = skb_shinfo(skb); + int page_offset; + + atomic_inc(&page->_count); + page_offset = ptr - page_address(page); + skb_add_rx_frag(skb, sh->nr_frags, page, page_offset, len, size); +} + +static void +__ieee80211_amsdu_copy_frag(struct sk_buff *skb, struct sk_buff *frame, + int offset, int len) +{ + struct skb_shared_info *sh = skb_shinfo(skb); + const skb_frag_t *frag = &sh->frags[-1]; + struct page *frag_page; + void *frag_ptr; + int frag_len, frag_size; + int head_size = skb->len - skb->data_len; + int cur_len; + + frag_page = virt_to_head_page(skb->head); + frag_ptr = skb->data; + frag_size = head_size; + + while (offset >= frag_size) { + offset -= frag_size; + frag++; + frag_page = skb_frag_page(frag); + frag_ptr = skb_frag_address(frag); + frag_size = skb_frag_size(frag); + } + + frag_ptr += offset; + frag_len = frag_size - offset; + + cur_len = min(len, frag_len); + + __frame_add_frag(frame, frag_page, frag_ptr, cur_len, frag_size); + len -= cur_len; + + while (len > 0) { + frag++; + frag_len = skb_frag_size(frag); + cur_len = min(len, frag_len); + __frame_add_frag(frame, skb_frag_page(frag), + skb_frag_address(frag), cur_len, frag_len); + len -= cur_len; + } +} + +static struct sk_buff * +__ieee80211_amsdu_copy(struct sk_buff *skb, unsigned int hlen, + int offset, int len, bool reuse_frag) +{ + struct sk_buff *frame; + int cur_len = len; + + if (skb->len - offset < len) + return NULL; + + /* + * When reusing framents, copy some data to the head to simplify + * ethernet header handling and speed up protocol header processing + * in the stack later. + */ + if (reuse_frag) + cur_len = min_t(int, len, 32); + + /* + * Allocate and reserve two bytes more for payload + * alignment since sizeof(struct ethhdr) is 14. + */ + frame = dev_alloc_skb(hlen + sizeof(struct ethhdr) + 2 + cur_len); + + skb_reserve(frame, hlen + sizeof(struct ethhdr) + 2); + skb_copy_bits(skb, offset, skb_put(frame, cur_len), cur_len); + + len -= cur_len; + if (!len) + return frame; + + offset += cur_len; + __ieee80211_amsdu_copy_frag(skb, frame, offset, len); + + return frame; +} void ieee80211_amsdu_to_8023s(struct sk_buff *skb, struct sk_buff_head *list, const u8 *addr, enum nl80211_iftype iftype, const unsigned int extra_headroom, bool has_80211_header) { + unsigned int hlen = ALIGN(extra_headroom, 4); struct sk_buff *frame = NULL; u16 ethertype; u8 *payload; - const struct ethhdr *eth; - int remaining, err; - u8 dst[ETH_ALEN], src[ETH_ALEN]; + int offset = 0, remaining, err; + struct ethhdr eth; + bool reuse_frag = skb->head_frag && !skb_has_frag_list(skb); + bool reuse_skb = false; + bool last = false; if (has_80211_header) { - err = ieee80211_data_to_8023(skb, addr, iftype); + err = __ieee80211_data_to_8023(skb, ð, addr, iftype); if (err) goto out; - - /* skip the wrapping header */ - eth = (struct ethhdr *) skb_pull(skb, sizeof(struct ethhdr)); - if (!eth) - goto out; - } else { - eth = (struct ethhdr *) skb->data; } - while (skb != frame) { + while (!last) { + unsigned int subframe_len; + int len; u8 padding; - __be16 len = eth->h_proto; - unsigned int subframe_len = sizeof(struct ethhdr) + ntohs(len); - - remaining = skb->len; - memcpy(dst, eth->h_dest, ETH_ALEN); - memcpy(src, eth->h_source, ETH_ALEN); + skb_copy_bits(skb, offset, ð, sizeof(eth)); + len = ntohs(eth.h_proto); + subframe_len = sizeof(struct ethhdr) + len; padding = (4 - subframe_len) & 0x3; + /* the last MSDU has no padding */ + remaining = skb->len - offset; if (subframe_len > remaining) goto purge; - skb_pull(skb, sizeof(struct ethhdr)); + offset += sizeof(struct ethhdr); /* reuse skb for the last subframe */ - if (remaining <= subframe_len + padding) + last = remaining <= subframe_len + padding; + if (!skb_is_nonlinear(skb) && !reuse_frag && last) { + skb_pull(skb, offset); frame = skb; - else { - unsigned int hlen = ALIGN(extra_headroom, 4); - /* - * Allocate and reserve two bytes more for payload - * alignment since sizeof(struct ethhdr) is 14. - */ - frame = dev_alloc_skb(hlen + subframe_len + 2); + reuse_skb = true; + } else { + frame = __ieee80211_amsdu_copy(skb, hlen, offset, len, + reuse_frag); if (!frame) goto purge; - skb_reserve(frame, hlen + sizeof(struct ethhdr) + 2); - memcpy(skb_put(frame, ntohs(len)), skb->data, - ntohs(len)); - - eth = (struct ethhdr *)skb_pull(skb, ntohs(len) + - padding); - if (!eth) { - dev_kfree_skb(frame); - goto purge; - } + offset += len + padding; } skb_reset_network_header(frame); @@ -716,24 +793,20 @@ void ieee80211_amsdu_to_8023s(struct sk_buff *skb, struct sk_buff_head *list, payload = frame->data; ethertype = (payload[6] << 8) | payload[7]; - if (likely((ether_addr_equal(payload, rfc1042_header) && ethertype != ETH_P_AARP && ethertype != ETH_P_IPX) || ether_addr_equal(payload, bridge_tunnel_header))) { - /* remove RFC1042 or Bridge-Tunnel - * encapsulation and replace EtherType */ - skb_pull(frame, 6); - memcpy(skb_push(frame, ETH_ALEN), src, ETH_ALEN); - memcpy(skb_push(frame, ETH_ALEN), dst, ETH_ALEN); - } else { - memcpy(skb_push(frame, sizeof(__be16)), &len, - sizeof(__be16)); - memcpy(skb_push(frame, ETH_ALEN), src, ETH_ALEN); - memcpy(skb_push(frame, ETH_ALEN), dst, ETH_ALEN); + eth.h_proto = htons(ethertype); + skb_pull(frame, ETH_ALEN + 2); } + + memcpy(skb_push(frame, sizeof(eth)), ð, sizeof(eth)); __skb_queue_tail(list, frame); } + if (!reuse_skb) + dev_kfree_skb(skb); + return; purge: diff --git a/net/x25/x25_facilities.c b/net/x25/x25_facilities.c index 7ecd04c21..997ff7b25 100644 --- a/net/x25/x25_facilities.c +++ b/net/x25/x25_facilities.c @@ -277,6 +277,7 @@ int x25_negotiate_facilities(struct sk_buff *skb, struct sock *sk, memset(&theirs, 0, sizeof(theirs)); memcpy(new, ours, sizeof(*new)); + memset(dte, 0, sizeof(*dte)); len = x25_parse_facilities(skb, &theirs, dte, &x25->vc_facil_mask); if (len < 0) diff --git a/net/xfrm/xfrm_algo.c b/net/xfrm/xfrm_algo.c index f07224d8b..250e567ba 100644 --- a/net/xfrm/xfrm_algo.c +++ b/net/xfrm/xfrm_algo.c @@ -9,6 +9,8 @@ * any later version. */ +#include <crypto/hash.h> +#include <crypto/skcipher.h> #include <linux/module.h> #include <linux/kernel.h> #include <linux/pfkeyv2.h> @@ -782,14 +784,13 @@ void xfrm_probe_algs(void) BUG_ON(in_softirq()); for (i = 0; i < aalg_entries(); i++) { - status = crypto_has_hash(aalg_list[i].name, 0, - CRYPTO_ALG_ASYNC); + status = crypto_has_ahash(aalg_list[i].name, 0, 0); if (aalg_list[i].available != status) aalg_list[i].available = status; } for (i = 0; i < ealg_entries(); i++) { - status = crypto_has_ablkcipher(ealg_list[i].name, 0, 0); + status = crypto_has_skcipher(ealg_list[i].name, 0, 0); if (ealg_list[i].available != status) ealg_list[i].available = status; } diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c index ff4a91fca..637387bba 100644 --- a/net/xfrm/xfrm_output.c +++ b/net/xfrm/xfrm_output.c @@ -99,6 +99,9 @@ static int xfrm_output_one(struct sk_buff *skb, int err) skb_dst_force(skb); + /* Inner headers are invalid now. */ + skb->encapsulation = 0; + err = x->type->output(x, skb); if (err == -EINPROGRESS) goto out; diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 805681a7d..2cc7af858 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -2449,7 +2449,7 @@ static int xfrm_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) int type, err; #ifdef CONFIG_COMPAT - if (is_compat_task()) + if (in_compat_syscall()) return -ENOTSUPP; #endif |