summaryrefslogtreecommitdiff
path: root/net/core
diff options
context:
space:
mode:
authorAndré Fabian Silva Delgado <emulatorman@parabola.nu>2016-06-10 05:30:17 -0300
committerAndré Fabian Silva Delgado <emulatorman@parabola.nu>2016-06-10 05:30:17 -0300
commitd635711daa98be86d4c7fd01499c34f566b54ccb (patch)
treeaa5cc3760a27c3d57146498cb82fa549547de06c /net/core
parentc91265cd0efb83778f015b4d4b1129bd2cfd075e (diff)
Linux-libre 4.6.2-gnu
Diffstat (limited to 'net/core')
-rw-r--r--net/core/Makefile3
-rw-r--r--net/core/dev.c40
-rw-r--r--net/core/devlink.c738
-rw-r--r--net/core/dst.c10
-rw-r--r--net/core/dst_cache.c168
-rw-r--r--net/core/ethtool.c638
-rw-r--r--net/core/filter.c257
-rw-r--r--net/core/flow.c14
-rw-r--r--net/core/flow_dissector.c58
-rw-r--r--net/core/gen_estimator.c2
-rw-r--r--net/core/gen_stats.c1
-rw-r--r--net/core/hwbm.c87
-rw-r--r--net/core/lwtunnel.c37
-rw-r--r--net/core/net-sysfs.c18
-rw-r--r--net/core/netclassid_cgroup.c1
-rw-r--r--net/core/netprio_cgroup.c1
-rw-r--r--net/core/pktgen.c4
-rw-r--r--net/core/rtnetlink.c137
-rw-r--r--net/core/skbuff.c172
-rw-r--r--net/core/sock.c16
20 files changed, 2205 insertions, 197 deletions
diff --git a/net/core/Makefile b/net/core/Makefile
index 0b835de04..d6508c2dd 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -24,3 +24,6 @@ obj-$(CONFIG_NET_PTP_CLASSIFY) += ptp_classifier.o
obj-$(CONFIG_CGROUP_NET_PRIO) += netprio_cgroup.o
obj-$(CONFIG_CGROUP_NET_CLASSID) += netclassid_cgroup.o
obj-$(CONFIG_LWTUNNEL) += lwtunnel.o
+obj-$(CONFIG_DST_CACHE) += dst_cache.o
+obj-$(CONFIG_HWBM) += hwbm.o
+obj-$(CONFIG_NET_DEVLINK) += devlink.o
diff --git a/net/core/dev.c b/net/core/dev.c
index 0ef061b2b..5c925ac50 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2802,7 +2802,7 @@ static netdev_features_t harmonize_features(struct sk_buff *skb,
if (skb->ip_summed != CHECKSUM_NONE &&
!can_checksum_protocol(features, type)) {
- features &= ~NETIF_F_CSUM_MASK;
+ features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK);
} else if (illegal_highdma(skb->dev, skb)) {
features &= ~NETIF_F_SG;
}
@@ -3829,8 +3829,14 @@ static void net_tx_action(struct softirq_action *h)
trace_consume_skb(skb);
else
trace_kfree_skb(skb, net_tx_action);
- __kfree_skb(skb);
+
+ if (skb->fclone != SKB_FCLONE_UNAVAILABLE)
+ __kfree_skb(skb);
+ else
+ __kfree_skb_defer(skb);
}
+
+ __kfree_skb_flush();
}
if (sd->output_queue) {
@@ -4154,7 +4160,10 @@ ncls:
ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
} else {
drop:
- atomic_long_inc(&skb->dev->rx_dropped);
+ if (!deliver_exact)
+ atomic_long_inc(&skb->dev->rx_dropped);
+ else
+ atomic_long_inc(&skb->dev->rx_nohandler);
kfree_skb(skb);
/* Jamal, now you will not able to escape explaining
* me how you were going to use this. :-)
@@ -4429,7 +4438,8 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
NAPI_GRO_CB(skb)->same_flow = 0;
NAPI_GRO_CB(skb)->flush = 0;
NAPI_GRO_CB(skb)->free = 0;
- NAPI_GRO_CB(skb)->udp_mark = 0;
+ NAPI_GRO_CB(skb)->encap_mark = 0;
+ NAPI_GRO_CB(skb)->is_fou = 0;
NAPI_GRO_CB(skb)->gro_remcsum_start = 0;
/* Setup for GRO checksum validation */
@@ -5152,6 +5162,7 @@ static void net_rx_action(struct softirq_action *h)
}
}
+ __kfree_skb_flush();
local_irq_disable();
list_splice_tail_init(&sd->poll_list, &list);
@@ -6435,6 +6446,7 @@ EXPORT_SYMBOL(dev_get_phys_port_id);
* dev_get_phys_port_name - Get device physical port name
* @dev: device
* @name: port name
+ * @len: limit of bytes to copy to name
*
* Get device physical port name
*/
@@ -7253,24 +7265,31 @@ void netdev_run_todo(void)
}
}
-/* Convert net_device_stats to rtnl_link_stats64. They have the same
- * fields in the same order, with only the type differing.
+/* Convert net_device_stats to rtnl_link_stats64. rtnl_link_stats64 has
+ * all the same fields in the same order as net_device_stats, with only
+ * the type differing, but rtnl_link_stats64 may have additional fields
+ * at the end for newer counters.
*/
void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
const struct net_device_stats *netdev_stats)
{
#if BITS_PER_LONG == 64
- BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
+ BUILD_BUG_ON(sizeof(*stats64) < sizeof(*netdev_stats));
- memcpy(stats64, netdev_stats, sizeof(*stats64));
+ memcpy(stats64, netdev_stats, sizeof(*netdev_stats));
+ /* zero out counters that only exist in rtnl_link_stats64 */
+ memset((char *)stats64 + sizeof(*netdev_stats), 0,
+ sizeof(*stats64) - sizeof(*netdev_stats));
#else
- size_t i, n = sizeof(*stats64) / sizeof(u64);
+ size_t i, n = sizeof(*netdev_stats) / sizeof(unsigned long);
const unsigned long *src = (const unsigned long *)netdev_stats;
u64 *dst = (u64 *)stats64;
- BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
- sizeof(*stats64) / sizeof(u64));
+ BUILD_BUG_ON(n > sizeof(*stats64) / sizeof(u64));
for (i = 0; i < n; i++)
dst[i] = src[i];
+ /* zero out counters that only exist in rtnl_link_stats64 */
+ memset((char *)stats64 + n * sizeof(u64), 0,
+ sizeof(*stats64) - n * sizeof(u64));
#endif
}
EXPORT_SYMBOL(netdev_stats_to_stats64);
@@ -7300,6 +7319,7 @@ struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
}
storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
storage->tx_dropped += atomic_long_read(&dev->tx_dropped);
+ storage->rx_nohandler += atomic_long_read(&dev->rx_nohandler);
return storage;
}
EXPORT_SYMBOL(dev_get_stats);
diff --git a/net/core/devlink.c b/net/core/devlink.c
new file mode 100644
index 000000000..590fa561c
--- /dev/null
+++ b/net/core/devlink.c
@@ -0,0 +1,738 @@
+/*
+ * net/core/devlink.c - Network physical/parent device Netlink interface
+ *
+ * Heavily inspired by net/wireless/
+ * Copyright (c) 2016 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2016 Jiri Pirko <jiri@mellanox.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/gfp.h>
+#include <linux/device.h>
+#include <linux/list.h>
+#include <linux/netdevice.h>
+#include <rdma/ib_verbs.h>
+#include <net/netlink.h>
+#include <net/genetlink.h>
+#include <net/rtnetlink.h>
+#include <net/net_namespace.h>
+#include <net/sock.h>
+#include <net/devlink.h>
+
+static LIST_HEAD(devlink_list);
+
+/* devlink_mutex
+ *
+ * An overall lock guarding every operation coming from userspace.
+ * It also guards devlink devices list and it is taken when
+ * driver registers/unregisters it.
+ */
+static DEFINE_MUTEX(devlink_mutex);
+
+/* devlink_port_mutex
+ *
+ * Shared lock to guard lists of ports in all devlink devices.
+ */
+static DEFINE_MUTEX(devlink_port_mutex);
+
+static struct net *devlink_net(const struct devlink *devlink)
+{
+ return read_pnet(&devlink->_net);
+}
+
+static void devlink_net_set(struct devlink *devlink, struct net *net)
+{
+ write_pnet(&devlink->_net, net);
+}
+
+static struct devlink *devlink_get_from_attrs(struct net *net,
+ struct nlattr **attrs)
+{
+ struct devlink *devlink;
+ char *busname;
+ char *devname;
+
+ if (!attrs[DEVLINK_ATTR_BUS_NAME] || !attrs[DEVLINK_ATTR_DEV_NAME])
+ return ERR_PTR(-EINVAL);
+
+ busname = nla_data(attrs[DEVLINK_ATTR_BUS_NAME]);
+ devname = nla_data(attrs[DEVLINK_ATTR_DEV_NAME]);
+
+ list_for_each_entry(devlink, &devlink_list, list) {
+ if (strcmp(devlink->dev->bus->name, busname) == 0 &&
+ strcmp(dev_name(devlink->dev), devname) == 0 &&
+ net_eq(devlink_net(devlink), net))
+ return devlink;
+ }
+
+ return ERR_PTR(-ENODEV);
+}
+
+static struct devlink *devlink_get_from_info(struct genl_info *info)
+{
+ return devlink_get_from_attrs(genl_info_net(info), info->attrs);
+}
+
+static struct devlink_port *devlink_port_get_by_index(struct devlink *devlink,
+ int port_index)
+{
+ struct devlink_port *devlink_port;
+
+ list_for_each_entry(devlink_port, &devlink->port_list, list) {
+ if (devlink_port->index == port_index)
+ return devlink_port;
+ }
+ return NULL;
+}
+
+static bool devlink_port_index_exists(struct devlink *devlink, int port_index)
+{
+ return devlink_port_get_by_index(devlink, port_index);
+}
+
+static struct devlink_port *devlink_port_get_from_attrs(struct devlink *devlink,
+ struct nlattr **attrs)
+{
+ if (attrs[DEVLINK_ATTR_PORT_INDEX]) {
+ u32 port_index = nla_get_u32(attrs[DEVLINK_ATTR_PORT_INDEX]);
+ struct devlink_port *devlink_port;
+
+ devlink_port = devlink_port_get_by_index(devlink, port_index);
+ if (!devlink_port)
+ return ERR_PTR(-ENODEV);
+ return devlink_port;
+ }
+ return ERR_PTR(-EINVAL);
+}
+
+static struct devlink_port *devlink_port_get_from_info(struct devlink *devlink,
+ struct genl_info *info)
+{
+ return devlink_port_get_from_attrs(devlink, info->attrs);
+}
+
+#define DEVLINK_NL_FLAG_NEED_PORT BIT(0)
+
+static int devlink_nl_pre_doit(const struct genl_ops *ops,
+ struct sk_buff *skb, struct genl_info *info)
+{
+ struct devlink *devlink;
+
+ mutex_lock(&devlink_mutex);
+ devlink = devlink_get_from_info(info);
+ if (IS_ERR(devlink)) {
+ mutex_unlock(&devlink_mutex);
+ return PTR_ERR(devlink);
+ }
+ info->user_ptr[0] = devlink;
+ if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_PORT) {
+ struct devlink_port *devlink_port;
+
+ mutex_lock(&devlink_port_mutex);
+ devlink_port = devlink_port_get_from_info(devlink, info);
+ if (IS_ERR(devlink_port)) {
+ mutex_unlock(&devlink_port_mutex);
+ mutex_unlock(&devlink_mutex);
+ return PTR_ERR(devlink_port);
+ }
+ info->user_ptr[1] = devlink_port;
+ }
+ return 0;
+}
+
+static void devlink_nl_post_doit(const struct genl_ops *ops,
+ struct sk_buff *skb, struct genl_info *info)
+{
+ if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_PORT)
+ mutex_unlock(&devlink_port_mutex);
+ mutex_unlock(&devlink_mutex);
+}
+
+static struct genl_family devlink_nl_family = {
+ .id = GENL_ID_GENERATE,
+ .name = DEVLINK_GENL_NAME,
+ .version = DEVLINK_GENL_VERSION,
+ .maxattr = DEVLINK_ATTR_MAX,
+ .netnsok = true,
+ .pre_doit = devlink_nl_pre_doit,
+ .post_doit = devlink_nl_post_doit,
+};
+
+enum devlink_multicast_groups {
+ DEVLINK_MCGRP_CONFIG,
+};
+
+static const struct genl_multicast_group devlink_nl_mcgrps[] = {
+ [DEVLINK_MCGRP_CONFIG] = { .name = DEVLINK_GENL_MCGRP_CONFIG_NAME },
+};
+
+static int devlink_nl_put_handle(struct sk_buff *msg, struct devlink *devlink)
+{
+ if (nla_put_string(msg, DEVLINK_ATTR_BUS_NAME, devlink->dev->bus->name))
+ return -EMSGSIZE;
+ if (nla_put_string(msg, DEVLINK_ATTR_DEV_NAME, dev_name(devlink->dev)))
+ return -EMSGSIZE;
+ return 0;
+}
+
+static int devlink_nl_fill(struct sk_buff *msg, struct devlink *devlink,
+ enum devlink_command cmd, u32 portid,
+ u32 seq, int flags)
+{
+ void *hdr;
+
+ hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ if (devlink_nl_put_handle(msg, devlink))
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+}
+
+static void devlink_notify(struct devlink *devlink, enum devlink_command cmd)
+{
+ struct sk_buff *msg;
+ int err;
+
+ WARN_ON(cmd != DEVLINK_CMD_NEW && cmd != DEVLINK_CMD_DEL);
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return;
+
+ err = devlink_nl_fill(msg, devlink, cmd, 0, 0, 0);
+ if (err) {
+ nlmsg_free(msg);
+ return;
+ }
+
+ genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink),
+ msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL);
+}
+
+static int devlink_nl_port_fill(struct sk_buff *msg, struct devlink *devlink,
+ struct devlink_port *devlink_port,
+ enum devlink_command cmd, u32 portid,
+ u32 seq, int flags)
+{
+ void *hdr;
+
+ hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ if (devlink_nl_put_handle(msg, devlink))
+ goto nla_put_failure;
+ if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, devlink_port->index))
+ goto nla_put_failure;
+ if (nla_put_u16(msg, DEVLINK_ATTR_PORT_TYPE, devlink_port->type))
+ goto nla_put_failure;
+ if (devlink_port->desired_type != DEVLINK_PORT_TYPE_NOTSET &&
+ nla_put_u16(msg, DEVLINK_ATTR_PORT_DESIRED_TYPE,
+ devlink_port->desired_type))
+ goto nla_put_failure;
+ if (devlink_port->type == DEVLINK_PORT_TYPE_ETH) {
+ struct net_device *netdev = devlink_port->type_dev;
+
+ if (netdev &&
+ (nla_put_u32(msg, DEVLINK_ATTR_PORT_NETDEV_IFINDEX,
+ netdev->ifindex) ||
+ nla_put_string(msg, DEVLINK_ATTR_PORT_NETDEV_NAME,
+ netdev->name)))
+ goto nla_put_failure;
+ }
+ if (devlink_port->type == DEVLINK_PORT_TYPE_IB) {
+ struct ib_device *ibdev = devlink_port->type_dev;
+
+ if (ibdev &&
+ nla_put_string(msg, DEVLINK_ATTR_PORT_IBDEV_NAME,
+ ibdev->name))
+ goto nla_put_failure;
+ }
+ if (devlink_port->split &&
+ nla_put_u32(msg, DEVLINK_ATTR_PORT_SPLIT_GROUP,
+ devlink_port->split_group))
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+}
+
+static void devlink_port_notify(struct devlink_port *devlink_port,
+ enum devlink_command cmd)
+{
+ struct devlink *devlink = devlink_port->devlink;
+ struct sk_buff *msg;
+ int err;
+
+ if (!devlink_port->registered)
+ return;
+
+ WARN_ON(cmd != DEVLINK_CMD_PORT_NEW && cmd != DEVLINK_CMD_PORT_DEL);
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return;
+
+ err = devlink_nl_port_fill(msg, devlink, devlink_port, cmd, 0, 0, 0);
+ if (err) {
+ nlmsg_free(msg);
+ return;
+ }
+
+ genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink),
+ msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL);
+}
+
+static int devlink_nl_cmd_get_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ struct devlink *devlink = info->user_ptr[0];
+ struct sk_buff *msg;
+ int err;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ err = devlink_nl_fill(msg, devlink, DEVLINK_CMD_NEW,
+ info->snd_portid, info->snd_seq, 0);
+ if (err) {
+ nlmsg_free(msg);
+ return err;
+ }
+
+ return genlmsg_reply(msg, info);
+}
+
+static int devlink_nl_cmd_get_dumpit(struct sk_buff *msg,
+ struct netlink_callback *cb)
+{
+ struct devlink *devlink;
+ int start = cb->args[0];
+ int idx = 0;
+ int err;
+
+ mutex_lock(&devlink_mutex);
+ list_for_each_entry(devlink, &devlink_list, list) {
+ if (!net_eq(devlink_net(devlink), sock_net(msg->sk)))
+ continue;
+ if (idx < start) {
+ idx++;
+ continue;
+ }
+ err = devlink_nl_fill(msg, devlink, DEVLINK_CMD_NEW,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, NLM_F_MULTI);
+ if (err)
+ goto out;
+ idx++;
+ }
+out:
+ mutex_unlock(&devlink_mutex);
+
+ cb->args[0] = idx;
+ return msg->len;
+}
+
+static int devlink_nl_cmd_port_get_doit(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct devlink *devlink = info->user_ptr[0];
+ struct devlink_port *devlink_port = info->user_ptr[1];
+ struct sk_buff *msg;
+ int err;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ err = devlink_nl_port_fill(msg, devlink, devlink_port,
+ DEVLINK_CMD_PORT_NEW,
+ info->snd_portid, info->snd_seq, 0);
+ if (err) {
+ nlmsg_free(msg);
+ return err;
+ }
+
+ return genlmsg_reply(msg, info);
+}
+
+static int devlink_nl_cmd_port_get_dumpit(struct sk_buff *msg,
+ struct netlink_callback *cb)
+{
+ struct devlink *devlink;
+ struct devlink_port *devlink_port;
+ int start = cb->args[0];
+ int idx = 0;
+ int err;
+
+ mutex_lock(&devlink_mutex);
+ mutex_lock(&devlink_port_mutex);
+ list_for_each_entry(devlink, &devlink_list, list) {
+ if (!net_eq(devlink_net(devlink), sock_net(msg->sk)))
+ continue;
+ list_for_each_entry(devlink_port, &devlink->port_list, list) {
+ if (idx < start) {
+ idx++;
+ continue;
+ }
+ err = devlink_nl_port_fill(msg, devlink, devlink_port,
+ DEVLINK_CMD_NEW,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ NLM_F_MULTI);
+ if (err)
+ goto out;
+ idx++;
+ }
+ }
+out:
+ mutex_unlock(&devlink_port_mutex);
+ mutex_unlock(&devlink_mutex);
+
+ cb->args[0] = idx;
+ return msg->len;
+}
+
+static int devlink_port_type_set(struct devlink *devlink,
+ struct devlink_port *devlink_port,
+ enum devlink_port_type port_type)
+
+{
+ int err;
+
+ if (devlink->ops && devlink->ops->port_type_set) {
+ if (port_type == DEVLINK_PORT_TYPE_NOTSET)
+ return -EINVAL;
+ err = devlink->ops->port_type_set(devlink_port, port_type);
+ if (err)
+ return err;
+ devlink_port->desired_type = port_type;
+ devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW);
+ return 0;
+ }
+ return -EOPNOTSUPP;
+}
+
+static int devlink_nl_cmd_port_set_doit(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct devlink *devlink = info->user_ptr[0];
+ struct devlink_port *devlink_port = info->user_ptr[1];
+ int err;
+
+ if (info->attrs[DEVLINK_ATTR_PORT_TYPE]) {
+ enum devlink_port_type port_type;
+
+ port_type = nla_get_u16(info->attrs[DEVLINK_ATTR_PORT_TYPE]);
+ err = devlink_port_type_set(devlink, devlink_port, port_type);
+ if (err)
+ return err;
+ }
+ return 0;
+}
+
+static int devlink_port_split(struct devlink *devlink,
+ u32 port_index, u32 count)
+
+{
+ if (devlink->ops && devlink->ops->port_split)
+ return devlink->ops->port_split(devlink, port_index, count);
+ return -EOPNOTSUPP;
+}
+
+static int devlink_nl_cmd_port_split_doit(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct devlink *devlink = info->user_ptr[0];
+ u32 port_index;
+ u32 count;
+
+ if (!info->attrs[DEVLINK_ATTR_PORT_INDEX] ||
+ !info->attrs[DEVLINK_ATTR_PORT_SPLIT_COUNT])
+ return -EINVAL;
+
+ port_index = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]);
+ count = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_SPLIT_COUNT]);
+ return devlink_port_split(devlink, port_index, count);
+}
+
+static int devlink_port_unsplit(struct devlink *devlink, u32 port_index)
+
+{
+ if (devlink->ops && devlink->ops->port_unsplit)
+ return devlink->ops->port_unsplit(devlink, port_index);
+ return -EOPNOTSUPP;
+}
+
+static int devlink_nl_cmd_port_unsplit_doit(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct devlink *devlink = info->user_ptr[0];
+ u32 port_index;
+
+ if (!info->attrs[DEVLINK_ATTR_PORT_INDEX])
+ return -EINVAL;
+
+ port_index = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]);
+ return devlink_port_unsplit(devlink, port_index);
+}
+
+static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING },
+ [DEVLINK_ATTR_PORT_INDEX] = { .type = NLA_U32 },
+ [DEVLINK_ATTR_PORT_TYPE] = { .type = NLA_U16 },
+ [DEVLINK_ATTR_PORT_SPLIT_COUNT] = { .type = NLA_U32 },
+};
+
+static const struct genl_ops devlink_nl_ops[] = {
+ {
+ .cmd = DEVLINK_CMD_GET,
+ .doit = devlink_nl_cmd_get_doit,
+ .dumpit = devlink_nl_cmd_get_dumpit,
+ .policy = devlink_nl_policy,
+ /* can be retrieved by unprivileged users */
+ },
+ {
+ .cmd = DEVLINK_CMD_PORT_GET,
+ .doit = devlink_nl_cmd_port_get_doit,
+ .dumpit = devlink_nl_cmd_port_get_dumpit,
+ .policy = devlink_nl_policy,
+ .internal_flags = DEVLINK_NL_FLAG_NEED_PORT,
+ /* can be retrieved by unprivileged users */
+ },
+ {
+ .cmd = DEVLINK_CMD_PORT_SET,
+ .doit = devlink_nl_cmd_port_set_doit,
+ .policy = devlink_nl_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = DEVLINK_NL_FLAG_NEED_PORT,
+ },
+ {
+ .cmd = DEVLINK_CMD_PORT_SPLIT,
+ .doit = devlink_nl_cmd_port_split_doit,
+ .policy = devlink_nl_policy,
+ .flags = GENL_ADMIN_PERM,
+ },
+ {
+ .cmd = DEVLINK_CMD_PORT_UNSPLIT,
+ .doit = devlink_nl_cmd_port_unsplit_doit,
+ .policy = devlink_nl_policy,
+ .flags = GENL_ADMIN_PERM,
+ },
+};
+
+/**
+ * devlink_alloc - Allocate new devlink instance resources
+ *
+ * @ops: ops
+ * @priv_size: size of user private data
+ *
+ * Allocate a zeroed devlink instance together with @priv_size bytes of
+ * user private data. Returns NULL if the allocation fails.
+ */
+struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size)
+{
+ struct devlink *devlink;
+
+ devlink = kzalloc(sizeof(*devlink) + priv_size, GFP_KERNEL);
+ if (!devlink)
+ return NULL;
+ devlink->ops = ops;
+ devlink_net_set(devlink, &init_net);
+ INIT_LIST_HEAD(&devlink->port_list);
+ return devlink;
+}
+EXPORT_SYMBOL_GPL(devlink_alloc);
+
+/**
+ * devlink_register - Register devlink instance
+ *
+ * @devlink: devlink
+ */
+int devlink_register(struct devlink *devlink, struct device *dev)
+{
+ mutex_lock(&devlink_mutex);
+ devlink->dev = dev;
+ list_add_tail(&devlink->list, &devlink_list);
+ devlink_notify(devlink, DEVLINK_CMD_NEW);
+ mutex_unlock(&devlink_mutex);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(devlink_register);
+
+/**
+ * devlink_unregister - Unregister devlink instance
+ *
+ * @devlink: devlink
+ */
+void devlink_unregister(struct devlink *devlink)
+{
+ mutex_lock(&devlink_mutex);
+ devlink_notify(devlink, DEVLINK_CMD_DEL);
+ list_del(&devlink->list);
+ mutex_unlock(&devlink_mutex);
+}
+EXPORT_SYMBOL_GPL(devlink_unregister);
+
+/**
+ * devlink_free - Free devlink instance resources
+ *
+ * @devlink: devlink
+ */
+void devlink_free(struct devlink *devlink)
+{
+ kfree(devlink);
+}
+EXPORT_SYMBOL_GPL(devlink_free);
+
+/**
+ * devlink_port_register - Register devlink port
+ *
+ * @devlink: devlink
+ * @devlink_port: devlink port
+ * @port_index: port index chosen by the driver, unique within @devlink
+ *
+ * Register devlink port with provided port index. User can use
+ * any indexing, even hw-related one. devlink_port structure
+ * is convenient to be embedded inside user driver private structure.
+ * Note that the caller should take care of zeroing the devlink_port
+ * structure.
+ */
+int devlink_port_register(struct devlink *devlink,
+ struct devlink_port *devlink_port,
+ unsigned int port_index)
+{
+ mutex_lock(&devlink_port_mutex);
+ if (devlink_port_index_exists(devlink, port_index)) {
+ mutex_unlock(&devlink_port_mutex);
+ return -EEXIST;
+ }
+ devlink_port->devlink = devlink;
+ devlink_port->index = port_index;
+ devlink_port->type = DEVLINK_PORT_TYPE_NOTSET;
+ devlink_port->registered = true;
+ list_add_tail(&devlink_port->list, &devlink->port_list);
+ mutex_unlock(&devlink_port_mutex);
+ devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(devlink_port_register);
+
+/**
+ * devlink_port_unregister - Unregister devlink port
+ *
+ * @devlink_port: devlink port
+ */
+void devlink_port_unregister(struct devlink_port *devlink_port)
+{
+ devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_DEL);
+ mutex_lock(&devlink_port_mutex);
+ list_del(&devlink_port->list);
+ mutex_unlock(&devlink_port_mutex);
+}
+EXPORT_SYMBOL_GPL(devlink_port_unregister);
+
+static void __devlink_port_type_set(struct devlink_port *devlink_port,
+ enum devlink_port_type type,
+ void *type_dev)
+{
+ devlink_port->type = type;
+ devlink_port->type_dev = type_dev;
+ devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW);
+}
+
+/**
+ * devlink_port_type_eth_set - Set port type to Ethernet
+ *
+ * @devlink_port: devlink port
+ * @netdev: related netdevice
+ */
+void devlink_port_type_eth_set(struct devlink_port *devlink_port,
+ struct net_device *netdev)
+{
+ __devlink_port_type_set(devlink_port,
+ DEVLINK_PORT_TYPE_ETH, netdev);
+}
+EXPORT_SYMBOL_GPL(devlink_port_type_eth_set);
+
+/**
+ * devlink_port_type_ib_set - Set port type to InfiniBand
+ *
+ * @devlink_port: devlink port
+ * @ibdev: related IB device
+ */
+void devlink_port_type_ib_set(struct devlink_port *devlink_port,
+ struct ib_device *ibdev)
+{
+ __devlink_port_type_set(devlink_port,
+ DEVLINK_PORT_TYPE_IB, ibdev);
+}
+EXPORT_SYMBOL_GPL(devlink_port_type_ib_set);
+
+/**
+ * devlink_port_type_clear - Clear port type
+ *
+ * @devlink_port: devlink port
+ */
+void devlink_port_type_clear(struct devlink_port *devlink_port)
+{
+ __devlink_port_type_set(devlink_port,
+ DEVLINK_PORT_TYPE_NOTSET, NULL);
+}
+EXPORT_SYMBOL_GPL(devlink_port_type_clear);
+
+/**
+ * devlink_port_split_set - Set port is split
+ *
+ * @devlink_port: devlink port
+ * @split_group: split group - identifies group split port is part of
+ */
+void devlink_port_split_set(struct devlink_port *devlink_port,
+ u32 split_group)
+{
+ devlink_port->split = true;
+ devlink_port->split_group = split_group;
+ devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW);
+}
+EXPORT_SYMBOL_GPL(devlink_port_split_set);
+
+static int __init devlink_module_init(void)
+{
+ return genl_register_family_with_ops_groups(&devlink_nl_family,
+ devlink_nl_ops,
+ devlink_nl_mcgrps);
+}
+
+static void __exit devlink_module_exit(void)
+{
+ genl_unregister_family(&devlink_nl_family);
+}
+
+module_init(devlink_module_init);
+module_exit(devlink_module_exit);
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Jiri Pirko <jiri@mellanox.com>");
+MODULE_DESCRIPTION("Network physical device Netlink interface");
+MODULE_ALIAS_GENL_FAMILY(DEVLINK_GENL_NAME);
diff --git a/net/core/dst.c b/net/core/dst.c
index a1656e3b8..b5cbbe07f 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -265,7 +265,7 @@ again:
lwtstate_put(dst->lwtstate);
if (dst->flags & DST_METADATA)
- kfree(dst);
+ metadata_dst_free((struct metadata_dst *)dst);
else
kmem_cache_free(dst->ops->kmem_cachep, dst);
@@ -395,6 +395,14 @@ struct metadata_dst *metadata_dst_alloc(u8 optslen, gfp_t flags)
}
EXPORT_SYMBOL_GPL(metadata_dst_alloc);
+void metadata_dst_free(struct metadata_dst *md_dst)
+{
+#ifdef CONFIG_DST_CACHE
+ dst_cache_destroy(&md_dst->u.tun_info.dst_cache);
+#endif
+ kfree(md_dst);
+}
+
struct metadata_dst __percpu *metadata_dst_alloc_percpu(u8 optslen, gfp_t flags)
{
int cpu;
diff --git a/net/core/dst_cache.c b/net/core/dst_cache.c
new file mode 100644
index 000000000..554d36449
--- /dev/null
+++ b/net/core/dst_cache.c
@@ -0,0 +1,168 @@
+/*
+ * net/core/dst_cache.c - dst entry cache
+ *
+ * Copyright (c) 2016 Paolo Abeni <pabeni@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/percpu.h>
+#include <net/dst_cache.h>
+#include <net/route.h>
+#if IS_ENABLED(CONFIG_IPV6)
+#include <net/ip6_fib.h>
+#endif
+#include <uapi/linux/in.h>
+
+struct dst_cache_pcpu {
+ unsigned long refresh_ts;
+ struct dst_entry *dst;
+ u32 cookie;
+ union {
+ struct in_addr in_saddr;
+ struct in6_addr in6_saddr;
+ };
+};
+
+static void dst_cache_per_cpu_dst_set(struct dst_cache_pcpu *dst_cache,
+ struct dst_entry *dst, u32 cookie)
+{
+ dst_release(dst_cache->dst);
+ if (dst)
+ dst_hold(dst);
+
+ dst_cache->cookie = cookie;
+ dst_cache->dst = dst;
+}
+
+static struct dst_entry *dst_cache_per_cpu_get(struct dst_cache *dst_cache,
+ struct dst_cache_pcpu *idst)
+{
+ struct dst_entry *dst;
+
+ dst = idst->dst;
+ if (!dst)
+ goto fail;
+
+ /* the cache already holds a dst reference; it can't go away */
+ dst_hold(dst);
+
+ if (unlikely(!time_after(idst->refresh_ts, dst_cache->reset_ts) ||
+ (dst->obsolete && !dst->ops->check(dst, idst->cookie)))) {
+ dst_cache_per_cpu_dst_set(idst, NULL, 0);
+ dst_release(dst);
+ goto fail;
+ }
+ return dst;
+
+fail:
+ idst->refresh_ts = jiffies;
+ return NULL;
+}
+
+struct dst_entry *dst_cache_get(struct dst_cache *dst_cache)
+{
+ if (!dst_cache->cache)
+ return NULL;
+
+ return dst_cache_per_cpu_get(dst_cache, this_cpu_ptr(dst_cache->cache));
+}
+EXPORT_SYMBOL_GPL(dst_cache_get);
+
+struct rtable *dst_cache_get_ip4(struct dst_cache *dst_cache, __be32 *saddr)
+{
+ struct dst_cache_pcpu *idst;
+ struct dst_entry *dst;
+
+ if (!dst_cache->cache)
+ return NULL;
+
+ idst = this_cpu_ptr(dst_cache->cache);
+ dst = dst_cache_per_cpu_get(dst_cache, idst);
+ if (!dst)
+ return NULL;
+
+ *saddr = idst->in_saddr.s_addr;
+ return container_of(dst, struct rtable, dst);
+}
+EXPORT_SYMBOL_GPL(dst_cache_get_ip4);
+
+void dst_cache_set_ip4(struct dst_cache *dst_cache, struct dst_entry *dst,
+ __be32 saddr)
+{
+ struct dst_cache_pcpu *idst;
+
+ if (!dst_cache->cache)
+ return;
+
+ idst = this_cpu_ptr(dst_cache->cache);
+ dst_cache_per_cpu_dst_set(idst, dst, 0);
+ idst->in_saddr.s_addr = saddr;
+}
+EXPORT_SYMBOL_GPL(dst_cache_set_ip4);
+
+#if IS_ENABLED(CONFIG_IPV6)
+void dst_cache_set_ip6(struct dst_cache *dst_cache, struct dst_entry *dst,
+ const struct in6_addr *addr)
+{
+ struct dst_cache_pcpu *idst;
+
+ if (!dst_cache->cache)
+ return;
+
+ idst = this_cpu_ptr(dst_cache->cache);
+ dst_cache_per_cpu_dst_set(idst, dst,
+ rt6_get_cookie((struct rt6_info *)dst));
+ idst->in6_saddr = *addr;
+}
+EXPORT_SYMBOL_GPL(dst_cache_set_ip6);
+
+struct dst_entry *dst_cache_get_ip6(struct dst_cache *dst_cache,
+ struct in6_addr *saddr)
+{
+ struct dst_cache_pcpu *idst;
+ struct dst_entry *dst;
+
+ if (!dst_cache->cache)
+ return NULL;
+
+ idst = this_cpu_ptr(dst_cache->cache);
+ dst = dst_cache_per_cpu_get(dst_cache, idst);
+ if (!dst)
+ return NULL;
+
+ *saddr = idst->in6_saddr;
+ return dst;
+}
+EXPORT_SYMBOL_GPL(dst_cache_get_ip6);
+#endif
+
+int dst_cache_init(struct dst_cache *dst_cache, gfp_t gfp)
+{
+ dst_cache->cache = alloc_percpu_gfp(struct dst_cache_pcpu,
+ gfp | __GFP_ZERO);
+ if (!dst_cache->cache)
+ return -ENOMEM;
+
+ dst_cache_reset(dst_cache);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(dst_cache_init);
+
+void dst_cache_destroy(struct dst_cache *dst_cache)
+{
+ int i;
+
+ if (!dst_cache->cache)
+ return;
+
+ for_each_possible_cpu(i)
+ dst_release(per_cpu_ptr(dst_cache->cache, i)->dst);
+
+ free_percpu(dst_cache->cache);
+}
+EXPORT_SYMBOL_GPL(dst_cache_destroy);
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index daf04709d..f426c5ad6 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -98,6 +98,7 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN]
[NETIF_F_RXALL_BIT] = "rx-all",
[NETIF_F_HW_L2FW_DOFFLOAD_BIT] = "l2-fwd-offload",
[NETIF_F_BUSY_POLL_BIT] = "busy-poll",
+ [NETIF_F_HW_TC_BIT] = "hw-tc-offload",
};
static const char
@@ -386,43 +387,461 @@ static int __ethtool_set_flags(struct net_device *dev, u32 data)
return 0;
}
-int __ethtool_get_settings(struct net_device *dev, struct ethtool_cmd *cmd)
+/* Expand a legacy 32-bit link mode mask into a link mode bitmap: clear
+ * all __ETHTOOL_LINK_MODE_MASK_NBITS bits of @dst, then store the legacy
+ * mask in its first word.
+ */
+static void convert_legacy_u32_to_link_mode(unsigned long *dst, u32 legacy_u32)
{
+	bitmap_zero(dst, __ETHTOOL_LINK_MODE_MASK_NBITS);
+	*dst = legacy_u32;
+}
+
+/* Fold a link mode bitmap down to a legacy 32-bit mask.
+ *
+ * The first word of @src is always written to @legacy_u32. Returns false
+ * when @src has any bit set above bit 31, i.e. the legacy mask is lossy.
+ */
+static bool convert_link_mode_to_legacy_u32(u32 *legacy_u32,
+					    const unsigned long *src)
+{
+	bool lossless = true;
+
+	/* TODO: following test will soon always be true */
+	if (__ETHTOOL_LINK_MODE_MASK_NBITS > 32) {
+		__ETHTOOL_DECLARE_LINK_MODE_MASK(high_bits);
+
+		/* build a mask covering every bit from 32 upwards */
+		bitmap_zero(high_bits, __ETHTOOL_LINK_MODE_MASK_NBITS);
+		bitmap_fill(high_bits, 32);
+		bitmap_complement(high_bits, high_bits,
+				  __ETHTOOL_LINK_MODE_MASK_NBITS);
+
+		/* src mask goes beyond bit 31 if it overlaps that mask */
+		lossless = !bitmap_intersects(high_bits, src,
+					      __ETHTOOL_LINK_MODE_MASK_NBITS);
+	}
+
+	*legacy_u32 = src[0];
+	return lossless;
+}
+
+/* Build a link_ksettings from a legacy ethtool_cmd.
+ *
+ * @link_ksettings is zeroed and then always filled in. Returns false if
+ * the legacy struct carried a non-zero deprecated field
+ * (transceiver/maxtxpkt/maxrxpkt); none of those is copied over.
+ */
+static bool
+convert_legacy_settings_to_link_ksettings(
+	struct ethtool_link_ksettings *link_ksettings,
+	const struct ethtool_cmd *legacy_settings)
+{
+	bool clean;
+
+	memset(link_ksettings, 0, sizeof(*link_ksettings));
+
+	/* This is used to tell users that driver is still using these
+	 * deprecated legacy fields, and they should not use
+	 * %ETHTOOL_GLINKSETTINGS/%ETHTOOL_SLINKSETTINGS
+	 */
+	clean = !(legacy_settings->transceiver ||
+		  legacy_settings->maxtxpkt ||
+		  legacy_settings->maxrxpkt);
+
+	convert_legacy_u32_to_link_mode(link_ksettings->link_modes.supported,
+					legacy_settings->supported);
+	convert_legacy_u32_to_link_mode(link_ksettings->link_modes.advertising,
+					legacy_settings->advertising);
+	convert_legacy_u32_to_link_mode(
+		link_ksettings->link_modes.lp_advertising,
+		legacy_settings->lp_advertising);
+
+	link_ksettings->base.speed = ethtool_cmd_speed(legacy_settings);
+	link_ksettings->base.duplex = legacy_settings->duplex;
+	link_ksettings->base.port = legacy_settings->port;
+	link_ksettings->base.phy_address = legacy_settings->phy_address;
+	link_ksettings->base.autoneg = legacy_settings->autoneg;
+	link_ksettings->base.mdio_support = legacy_settings->mdio_support;
+	link_ksettings->base.eth_tp_mdix = legacy_settings->eth_tp_mdix;
+	link_ksettings->base.eth_tp_mdix_ctrl
+		= legacy_settings->eth_tp_mdix_ctrl;
+
+	return clean;
+}
+
+/* Build a legacy ethtool_cmd from a link_ksettings (best effort).
+ *
+ * @legacy_settings is zeroed and then always filled in. Returns false if
+ * any of the three link mode bitmaps had bits that do not fit in the
+ * legacy 32-bit masks.
+ */
+static bool
+convert_link_ksettings_to_legacy_settings(
+	struct ethtool_cmd *legacy_settings,
+	const struct ethtool_link_ksettings *link_ksettings)
+{
+	bool complete;
+
+	/* this also clears the deprecated fields in legacy structure:
+	 * __u8 transceiver;
+	 * __u32 maxtxpkt;
+	 * __u32 maxrxpkt;
+	 */
+	memset(legacy_settings, 0, sizeof(*legacy_settings));
+
+	/* all three masks are converted even after a lossy one */
+	complete = convert_link_mode_to_legacy_u32(
+		&legacy_settings->supported,
+		link_ksettings->link_modes.supported);
+	if (!convert_link_mode_to_legacy_u32(
+		    &legacy_settings->advertising,
+		    link_ksettings->link_modes.advertising))
+		complete = false;
+	if (!convert_link_mode_to_legacy_u32(
+		    &legacy_settings->lp_advertising,
+		    link_ksettings->link_modes.lp_advertising))
+		complete = false;
+
+	ethtool_cmd_speed_set(legacy_settings, link_ksettings->base.speed);
+	legacy_settings->duplex = link_ksettings->base.duplex;
+	legacy_settings->port = link_ksettings->base.port;
+	legacy_settings->phy_address = link_ksettings->base.phy_address;
+	legacy_settings->autoneg = link_ksettings->base.autoneg;
+	legacy_settings->mdio_support = link_ksettings->base.mdio_support;
+	legacy_settings->eth_tp_mdix = link_ksettings->base.eth_tp_mdix;
+	legacy_settings->eth_tp_mdix_ctrl
+		= link_ksettings->base.eth_tp_mdix_ctrl;
+
+	return complete;
+}
+
+/* Number of 32-bit words needed to hold one link mode bitmap in the
+ * layout exchanged with user space.
+ */
+#define __ETHTOOL_LINK_MODE_MASK_NU32 \
+	DIV_ROUND_UP(__ETHTOOL_LINK_MODE_MASK_NBITS, 32)
+
+/* Layout of the struct passed from/to userland: the base settings
+ * followed by the three link mode bitmaps as arrays of 32-bit words.
+ */
+struct ethtool_link_usettings {
+	struct ethtool_link_settings base;
+	struct {
+		__u32 supported[__ETHTOOL_LINK_MODE_MASK_NU32];
+		__u32 advertising[__ETHTOOL_LINK_MODE_MASK_NU32];
+		__u32 lp_advertising[__ETHTOOL_LINK_MODE_MASK_NU32];
+	} link_modes;
+};
+
+/* Internal kernel helper to query a device's link settings.
+ *
+ * Backward compatibility note: this works both with drivers that
+ * implement get_link_ksettings and with legacy drivers that only
+ * implement get_settings. In the legacy case the ethtool_cmd result is
+ * converted, and its deprecated fields (transceiver/maxrxpkt/maxtxpkt)
+ * are silently dropped because struct ethtool_link_settings cannot
+ * report them.
+ *
+ * Returns the driver callback's result, or -EOPNOTSUPP when the driver
+ * implements neither callback.
+ */
+int __ethtool_get_link_ksettings(struct net_device *dev,
+				 struct ethtool_link_ksettings *link_ksettings)
+{
+	struct ethtool_cmd cmd;
+	int err;
+
ASSERT_RTNL();
+	if (dev->ethtool_ops->get_link_ksettings) {
+		memset(link_ksettings, 0, sizeof(*link_ksettings));
+		return dev->ethtool_ops->get_link_ksettings(dev,
+							    link_ksettings);
+	}
+
+	/* driver doesn't support %ethtool_link_ksettings API. revert to
+	 * legacy %ethtool_cmd API, unless it's not supported either.
+	 * TODO: remove when ethtool_ops::get_settings disappears internally
+	 */
if (!dev->ethtool_ops->get_settings)
return -EOPNOTSUPP;
- memset(cmd, 0, sizeof(struct ethtool_cmd));
- cmd->cmd = ETHTOOL_GSET;
- return dev->ethtool_ops->get_settings(dev, cmd);
+	memset(&cmd, 0, sizeof(cmd));
+	cmd.cmd = ETHTOOL_GSET;
+	err = dev->ethtool_ops->get_settings(dev, &cmd);
+	if (err >= 0) {
+		/* the conversion result is deliberately ignored: deprecated
+		 * fields transceiver/maxrxpkt/maxtxpkt are simply dropped
+		 */
+		convert_legacy_settings_to_link_ksettings(link_ksettings,
+							  &cmd);
+	}
+
+	return err;
}
-EXPORT_SYMBOL(__ethtool_get_settings);
+EXPORT_SYMBOL(__ethtool_get_link_ksettings);
-static int ethtool_get_settings(struct net_device *dev, void __user *useraddr)
+/* Read an ethtool_link_usettings from user space at @from and convert
+ * it into the kernel internal ethtool_link_ksettings @to.
+ * Returns 0 on success, -EFAULT if the user copy fails.
+ */
+static int load_link_ksettings_from_user(struct ethtool_link_ksettings *to,
+					 const void __user *from)
{
- int err;
- struct ethtool_cmd cmd;
+	struct ethtool_link_usettings user;
+
+	if (copy_from_user(&user, from, sizeof(user)))
+		return -EFAULT;
+
+	/* base settings are copied as is ... */
+	memcpy(&to->base, &user.base, sizeof(to->base));
+
+	/* ... each u32-array mask is converted to a kernel bitmap */
+	bitmap_from_u32array(to->link_modes.supported,
+			     __ETHTOOL_LINK_MODE_MASK_NBITS,
+			     user.link_modes.supported,
+			     __ETHTOOL_LINK_MODE_MASK_NU32);
+	bitmap_from_u32array(to->link_modes.advertising,
+			     __ETHTOOL_LINK_MODE_MASK_NBITS,
+			     user.link_modes.advertising,
+			     __ETHTOOL_LINK_MODE_MASK_NU32);
+	bitmap_from_u32array(to->link_modes.lp_advertising,
+			     __ETHTOOL_LINK_MODE_MASK_NBITS,
+			     user.link_modes.lp_advertising,
+			     __ETHTOOL_LINK_MODE_MASK_NU32);
+
+	return 0;
+}
+
+/* Convert a kernel internal ethtool_link_ksettings to the
+ * ethtool_link_usettings layout and copy it to user space at @to.
+ * Returns 0 on success, -EFAULT if the copy to user space fails.
+ */
+static int
+store_link_ksettings_for_user(void __user *to,
+			      const struct ethtool_link_ksettings *from)
+{
+	struct ethtool_link_usettings link_usettings;
+
+	/* only the base settings share a layout between the two structs,
+	 * so copy exactly that member; the link mode bitmaps are
+	 * converted word by word below
+	 */
+	memcpy(&link_usettings.base, &from->base,
+	       sizeof(link_usettings.base));
+	bitmap_to_u32array(link_usettings.link_modes.supported,
+			   __ETHTOOL_LINK_MODE_MASK_NU32,
+			   from->link_modes.supported,
+			   __ETHTOOL_LINK_MODE_MASK_NBITS);
+	bitmap_to_u32array(link_usettings.link_modes.advertising,
+			   __ETHTOOL_LINK_MODE_MASK_NU32,
+			   from->link_modes.advertising,
+			   __ETHTOOL_LINK_MODE_MASK_NBITS);
+	bitmap_to_u32array(link_usettings.link_modes.lp_advertising,
+			   __ETHTOOL_LINK_MODE_MASK_NU32,
+			   from->link_modes.lp_advertising,
+			   __ETHTOOL_LINK_MODE_MASK_NBITS);
+
+	if (copy_to_user(to, &link_usettings, sizeof(link_usettings)))
+		return -EFAULT;
+
+	return 0;
+}
+
+/* Query device for its ethtool_link_settings (%ETHTOOL_GLINKSETTINGS).
+ *
+ * Backward compatibility note: this function must fail when driver
+ * does not implement ethtool_ops::get_link_ksettings, even if legacy
+ * ethtool_ops::get_settings is implemented. This tells new versions
+ * of ethtool that they should use the legacy API %ETHTOOL_GSET for
+ * this driver, so that they can correctly access the ethtool_cmd
+ * deprecated fields (transceiver/maxrxpkt/maxtxpkt), until no driver
+ * implements ethtool_ops::get_settings anymore.
+ *
+ * When the user's link_mode_masks_nwords does not match the kernel's,
+ * only the base struct is written back, with link_mode_masks_nwords set
+ * to minus the required word count, and 0 is returned.
+ */
+static int ethtool_get_link_ksettings(struct net_device *dev,
+				      void __user *useraddr)
+{
+	struct ethtool_link_ksettings link_ksettings;
+	int err;
- err = __ethtool_get_settings(dev, &cmd);
+	ASSERT_RTNL();
+
+	if (!dev->ethtool_ops->get_link_ksettings)
+		return -EOPNOTSUPP;
+
+	/* handle bitmap nbits handshake */
+	if (copy_from_user(&link_ksettings.base, useraddr,
+			   sizeof(link_ksettings.base)))
+		return -EFAULT;
+
+	if (link_ksettings.base.link_mode_masks_nwords
+	    != __ETHTOOL_LINK_MODE_MASK_NU32) {
+		/* wrong link mode nbits requested */
+		memset(&link_ksettings, 0, sizeof(link_ksettings));
+		link_ksettings.base.cmd = ETHTOOL_GLINKSETTINGS;
+		/* send back number of words required as negative val */
+		compiletime_assert(__ETHTOOL_LINK_MODE_MASK_NU32 <= S8_MAX,
+				   "need too many bits for link modes!");
+		link_ksettings.base.link_mode_masks_nwords
+			= -((s8)__ETHTOOL_LINK_MODE_MASK_NU32);
+
+		/* copy the base fields back to user, not the link
+		 * mode bitmaps
+		 */
+		if (copy_to_user(useraddr, &link_ksettings.base,
+				 sizeof(link_ksettings.base)))
+			return -EFAULT;
+
+		return 0;
+	}
+
+	/* handshake successful: user/kernel agree on
+	 * link_mode_masks_nwords
+	 */
+
+	memset(&link_ksettings, 0, sizeof(link_ksettings));
+	err = dev->ethtool_ops->get_link_ksettings(dev, &link_ksettings);
if (err < 0)
return err;
+	/* make sure we tell the right values to user */
+	link_ksettings.base.cmd = ETHTOOL_GLINKSETTINGS;
+	link_ksettings.base.link_mode_masks_nwords
+		= __ETHTOOL_LINK_MODE_MASK_NU32;
+
+	return store_link_ksettings_for_user(useraddr, &link_ksettings);
+}
+
+/* Update device ethtool_link_settings (%ETHTOOL_SLINKSETTINGS).
+ *
+ * Backward compatibility note: this function must fail when driver
+ * does not implement ethtool_ops::set_link_ksettings, even if legacy
+ * ethtool_ops::set_settings is implemented. This tells new versions
+ * of ethtool that they should use the legacy API %ETHTOOL_SSET for
+ * this driver, so that they can correctly update the ethtool_cmd
+ * deprecated fields (transceiver/maxrxpkt/maxtxpkt).
+ *
+ * Returns -EINVAL when the user's link_mode_masks_nwords does not match
+ * the kernel's, -EFAULT on a failed user copy, otherwise the driver
+ * callback's result.
+ */
+static int ethtool_set_link_ksettings(struct net_device *dev,
+				      void __user *useraddr)
+{
+	struct ethtool_link_ksettings link_ksettings;
+	int err;
+
+	ASSERT_RTNL();
+
+	if (!dev->ethtool_ops->set_link_ksettings)
+		return -EOPNOTSUPP;
+
+	/* make sure nbits field has expected value */
+	if (copy_from_user(&link_ksettings.base, useraddr,
+			   sizeof(link_ksettings.base)))
+		return -EFAULT;
+
+	if (link_ksettings.base.link_mode_masks_nwords
+	    != __ETHTOOL_LINK_MODE_MASK_NU32)
+		return -EINVAL;
+
+	/* copy the whole structure, now that we know it has expected
+	 * format
+	 */
+	err = load_link_ksettings_from_user(&link_ksettings, useraddr);
+	if (err)
+		return err;
+
+	/* re-check nwords field: the base struct was fetched from user
+	 * space a second time by the full copy above
+	 */
+	if (link_ksettings.base.link_mode_masks_nwords
+	    != __ETHTOOL_LINK_MODE_MASK_NU32)
+		return -EINVAL;
+
+	return dev->ethtool_ops->set_link_ksettings(dev, &link_ksettings);
+}
+
+/* Log, at most once, that the calling task uses the legacy ethtool link
+ * settings API; @details is appended to the message.
+ */
+static void
+warn_incomplete_ethtool_legacy_settings_conversion(const char *details)
+{
+	char comm[sizeof(current->comm)];
+
+	/* the task name is only fetched when the message is printed */
+	pr_info_once("warning: `%s' uses legacy ethtool link settings API, %s\n",
+		     get_task_comm(comm, current), details);
+}
+
+/* Query device for its ethtool_cmd settings (%ETHTOOL_GSET).
+ *
+ * Backward compatibility note: for compatibility with legacy ethtool,
+ * this has to work with both drivers implementing get_link_ksettings
+ * API and drivers implementing get_settings API. When a driver
+ * implements get_link_ksettings and reports higher link mode bits, a
+ * message naming the calling task is logged once, but the command is
+ * still successful (only the lower link mode bits are reported back to
+ * user).
+ */
+static int ethtool_get_settings(struct net_device *dev, void __user *useraddr)
+{
+	struct ethtool_cmd cmd;
+	int err;
+
+	ASSERT_RTNL();
+
+	if (dev->ethtool_ops->get_link_ksettings) {
+		/* First, use link_ksettings API if it is supported */
+		struct ethtool_link_ksettings link_ksettings;
+		bool complete;
+
+		memset(&link_ksettings, 0, sizeof(link_ksettings));
+		err = dev->ethtool_ops->get_link_ksettings(dev,
+							   &link_ksettings);
+		if (err < 0)
+			return err;
+
+		complete = convert_link_ksettings_to_legacy_settings(
+			&cmd, &link_ksettings);
+		if (!complete)
+			warn_incomplete_ethtool_legacy_settings_conversion(
+				"link modes are only partially reported");
+
+		/* send a sensible cmd tag back to user */
+		cmd.cmd = ETHTOOL_GSET;
+	} else {
+		/* driver doesn't support %ethtool_link_ksettings
+		 * API. revert to legacy %ethtool_cmd API, unless it's
+		 * not supported either.
+		 */
+		if (!dev->ethtool_ops->get_settings)
+			return -EOPNOTSUPP;
+
+		memset(&cmd, 0, sizeof(cmd));
+		cmd.cmd = ETHTOOL_GSET;
+		err = dev->ethtool_ops->get_settings(dev, &cmd);
+		if (err < 0)
+			return err;
+	}
+
if (copy_to_user(useraddr, &cmd, sizeof(cmd)))
return -EFAULT;
+
return 0;
}
+/* Update device link settings with given ethtool_cmd (%ETHTOOL_SSET).
+ *
+ * Backward compatibility note: for compatibility with legacy ethtool,
+ * this has to work with both drivers implementing set_link_ksettings
+ * API and drivers implementing set_settings API. When the driver
+ * implements set_link_ksettings and the user's request sets any
+ * deprecated ethtool_cmd field (transceiver/maxrxpkt/maxtxpkt), the
+ * request is rejected with -EINVAL.
+ */
static int ethtool_set_settings(struct net_device *dev, void __user *useraddr)
{
struct ethtool_cmd cmd;
- if (!dev->ethtool_ops->set_settings)
- return -EOPNOTSUPP;
+	ASSERT_RTNL();
if (copy_from_user(&cmd, useraddr, sizeof(cmd)))
return -EFAULT;
+	/* first, try new %ethtool_link_ksettings API. */
+	if (dev->ethtool_ops->set_link_ksettings) {
+		struct ethtool_link_ksettings link_ksettings;
+		bool convertible;
+
+		convertible = convert_legacy_settings_to_link_ksettings(
+			&link_ksettings, &cmd);
+		if (!convertible)
+			return -EINVAL;
+
+		link_ksettings.base.cmd = ETHTOOL_SLINKSETTINGS;
+		link_ksettings.base.link_mode_masks_nwords
+			= __ETHTOOL_LINK_MODE_MASK_NU32;
+		return dev->ethtool_ops->set_link_ksettings(dev,
+							    &link_ksettings);
+	}
+
+	/* legacy %ethtool_cmd API */
+
+	/* TODO: return -EOPNOTSUPP when ethtool_ops::get_settings
+	 * disappears internally
+	 */
+
+	if (!dev->ethtool_ops->set_settings)
+		return -EOPNOTSUPP;
+
return dev->ethtool_ops->set_settings(dev, &cmd);
}
@@ -632,7 +1051,7 @@ static int ethtool_copy_validate_indir(u32 *indir, void __user *useraddr,
return 0;
}
-u8 netdev_rss_key[NETDEV_RSS_KEY_LEN];
+u8 netdev_rss_key[NETDEV_RSS_KEY_LEN] __read_mostly;
void netdev_rss_key_fill(void *buffer, size_t len)
{
@@ -642,6 +1061,37 @@ void netdev_rss_key_fill(void *buffer, size_t len)
}
EXPORT_SYMBOL(netdev_rss_key_fill);
+/* Find the largest value stored in the device's RX flow hash
+ * indirection table and return it through @max.
+ *
+ * Returns 0 on success, -EOPNOTSUPP when the driver lacks the needed
+ * callbacks or reports an empty table, -ENOMEM on allocation failure,
+ * or the error returned by get_rxfh(). @max is only written on success.
+ */
+static int ethtool_get_max_rxfh_channel(struct net_device *dev, u32 *max)
+{
+	u32 dev_size, current_max = 0;
+	u32 *indir;
+	u32 i;
+	int ret;
+
+	if (!dev->ethtool_ops->get_rxfh_indir_size ||
+	    !dev->ethtool_ops->get_rxfh)
+		return -EOPNOTSUPP;
+	dev_size = dev->ethtool_ops->get_rxfh_indir_size(dev);
+	if (dev_size == 0)
+		return -EOPNOTSUPP;
+
+	indir = kcalloc(dev_size, sizeof(indir[0]), GFP_USER);
+	if (!indir)
+		return -ENOMEM;
+
+	ret = dev->ethtool_ops->get_rxfh(dev, indir, NULL, NULL);
+	if (!ret) {
+		for (i = 0; i < dev_size; i++)
+			current_max = max(current_max, indir[i]);
+
+		*max = current_max;
+	}
+
+	kfree(indir);
+	return ret;
+}
+
static noinline_for_stack int ethtool_get_rxfh_indir(struct net_device *dev,
void __user *useraddr)
{
@@ -738,6 +1188,14 @@ static noinline_for_stack int ethtool_set_rxfh_indir(struct net_device *dev,
}
ret = ops->set_rxfh(dev, indir, NULL, ETH_RSS_HASH_NO_CHANGE);
+ if (ret)
+ goto out;
+
+ /* indicate whether rxfh was set to default */
+ if (user_size == 0)
+ dev->priv_flags &= ~IFF_RXFH_CONFIGURED;
+ else
+ dev->priv_flags |= IFF_RXFH_CONFIGURED;
out:
kfree(indir);
@@ -897,6 +1355,14 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev,
}
ret = ops->set_rxfh(dev, indir, hkey, rxfh.hfunc);
+ if (ret)
+ goto out;
+
+ /* indicate whether rxfh was set to default */
+ if (rxfh.indir_size == 0)
+ dev->priv_flags &= ~IFF_RXFH_CONFIGURED;
+ else if (rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE)
+ dev->priv_flags |= IFF_RXFH_CONFIGURED;
out:
kfree(rss_config);
@@ -1227,14 +1693,31 @@ static noinline_for_stack int ethtool_get_channels(struct net_device *dev,
+/* Apply the channel counts requested by user space (%ETHTOOL_SCHANNELS
+ * payload at @useraddr).
+ *
+ * The request is rejected with -EINVAL when a count exceeds the maximum
+ * reported by get_channels(), or when an RX flow indirection table is
+ * configured and the new combined + rx count does not exceed the
+ * largest value stored in that table. Returns -EOPNOTSUPP if the driver
+ * lacks set_channels or get_channels, -EFAULT on a failed user copy,
+ * otherwise the result of set_channels().
+ */
static noinline_for_stack int ethtool_set_channels(struct net_device *dev,
void __user *useraddr)
{
- struct ethtool_channels channels;
+	/* max is zero-initialised (apart from cmd) so that any field the
+	 * driver's get_channels() leaves alone is not stack garbage when
+	 * compared against the user's request below
+	 */
+	struct ethtool_channels channels, max = { .cmd = ETHTOOL_GCHANNELS };
+	u32 max_rx_in_use = 0;
- if (!dev->ethtool_ops->set_channels)
+	if (!dev->ethtool_ops->set_channels || !dev->ethtool_ops->get_channels)
return -EOPNOTSUPP;
if (copy_from_user(&channels, useraddr, sizeof(channels)))
return -EFAULT;
+	dev->ethtool_ops->get_channels(dev, &max);
+
+	/* ensure new counts are within the maximums */
+	if ((channels.rx_count > max.max_rx) ||
+	    (channels.tx_count > max.max_tx) ||
+	    (channels.combined_count > max.max_combined) ||
+	    (channels.other_count > max.max_other))
+		return -EINVAL;
+
+	/* ensure the new Rx count fits within the configured Rx flow
+	 * indirection table settings
+	 */
+	if (netif_is_rxfh_configured(dev) &&
+	    !ethtool_get_max_rxfh_channel(dev, &max_rx_in_use) &&
+	    (channels.combined_count + channels.rx_count) <= max_rx_in_use)
+		return -EINVAL;
+
return dev->ethtool_ops->set_channels(dev, &channels);
}
@@ -1823,13 +2306,121 @@ out:
return ret;
}
+/* Report coalesce settings for every queue selected in
+ * @per_queue_opt->queue_mask.
+ *
+ * One struct ethtool_coalesce per selected queue is written to user
+ * space, starting right after the ethtool_per_queue_op header at
+ * @useraddr. Returns 0 on success, -EOPNOTSUPP without driver support,
+ * -EFAULT on a failed user copy, or the driver's error.
+ */
+static int ethtool_get_per_queue_coalesce(struct net_device *dev,
+					  void __user *useraddr,
+					  struct ethtool_per_queue_op *per_queue_opt)
+{
+	DECLARE_BITMAP(queue_mask, MAX_NUM_QUEUE);
+	u32 queue;
+	int ret;
+
+	if (!dev->ethtool_ops->get_per_queue_coalesce)
+		return -EOPNOTSUPP;
+
+	/* results are stored after the per-queue header */
+	useraddr += sizeof(*per_queue_opt);
+
+	bitmap_from_u32array(queue_mask, MAX_NUM_QUEUE,
+			     per_queue_opt->queue_mask,
+			     DIV_ROUND_UP(MAX_NUM_QUEUE, 32));
+
+	for_each_set_bit(queue, queue_mask, MAX_NUM_QUEUE) {
+		struct ethtool_coalesce coalesce = { .cmd = ETHTOOL_GCOALESCE };
+
+		ret = dev->ethtool_ops->get_per_queue_coalesce(dev, queue,
+							       &coalesce);
+		if (ret)
+			return ret;
+		if (copy_to_user(useraddr, &coalesce, sizeof(coalesce)))
+			return -EFAULT;
+		useraddr += sizeof(coalesce);
+	}
+
+	return 0;
+}
+
+/* Apply user supplied coalesce settings to every queue selected in
+ * @per_queue_opt->queue_mask.
+ *
+ * The current settings of each queue are saved before it is changed.
+ * On any failure the queues that precede the failing one in the mask
+ * are set back to their saved settings. Returns 0 on success,
+ * -EOPNOTSUPP without driver support, -ENOMEM, -EFAULT, or the driver's
+ * error.
+ */
+static int ethtool_set_per_queue_coalesce(struct net_device *dev,
+					  void __user *useraddr,
+					  struct ethtool_per_queue_op *per_queue_opt)
+{
+	DECLARE_BITMAP(queue_mask, MAX_NUM_QUEUE);
+	struct ethtool_coalesce *saved, *slot;
+	int n_queue;
+	int i, ret = 0;
+	u32 bit;
+
+	if ((!dev->ethtool_ops->set_per_queue_coalesce) ||
+	    (!dev->ethtool_ops->get_per_queue_coalesce))
+		return -EOPNOTSUPP;
+
+	/* new settings follow the per-queue header */
+	useraddr += sizeof(*per_queue_opt);
+
+	bitmap_from_u32array(queue_mask, MAX_NUM_QUEUE,
+			     per_queue_opt->queue_mask,
+			     DIV_ROUND_UP(MAX_NUM_QUEUE, 32));
+	n_queue = bitmap_weight(queue_mask, MAX_NUM_QUEUE);
+	saved = kmalloc_array(n_queue, sizeof(*saved), GFP_KERNEL);
+	if (!saved)
+		return -ENOMEM;
+	slot = saved;
+
+	for_each_set_bit(bit, queue_mask, MAX_NUM_QUEUE) {
+		struct ethtool_coalesce requested;
+
+		/* remember the current settings before touching the queue */
+		ret = dev->ethtool_ops->get_per_queue_coalesce(dev, bit, slot);
+		if (ret)
+			goto roll_back;
+
+		slot++;
+
+		if (copy_from_user(&requested, useraddr, sizeof(requested))) {
+			ret = -EFAULT;
+			goto roll_back;
+		}
+
+		ret = dev->ethtool_ops->set_per_queue_coalesce(dev, bit,
+							       &requested);
+		if (ret)
+			goto roll_back;
+
+		useraddr += sizeof(requested);
+	}
+
+roll_back:
+	if (ret) {
+		/* restore the queues below the one that failed */
+		slot = saved;
+		for_each_set_bit(i, queue_mask, bit)
+			dev->ethtool_ops->set_per_queue_coalesce(dev, i,
+								 slot++);
+	}
+	kfree(saved);
+
+	return ret;
+}
+
+/* Dispatch an %ETHTOOL_PERQUEUE request to the handler for its
+ * sub-command. Returns -EFAULT if the header cannot be read from user
+ * space and -EOPNOTSUPP for sub-commands other than
+ * %ETHTOOL_GCOALESCE/%ETHTOOL_SCOALESCE.
+ */
+static int ethtool_set_per_queue(struct net_device *dev, void __user *useraddr)
+{
+	struct ethtool_per_queue_op per_queue_opt;
+	int rc;
+
+	if (copy_from_user(&per_queue_opt, useraddr, sizeof(per_queue_opt)))
+		return -EFAULT;
+
+	switch (per_queue_opt.sub_command) {
+	case ETHTOOL_GCOALESCE:
+		rc = ethtool_get_per_queue_coalesce(dev, useraddr,
+						    &per_queue_opt);
+		break;
+	case ETHTOOL_SCOALESCE:
+		rc = ethtool_set_per_queue_coalesce(dev, useraddr,
+						    &per_queue_opt);
+		break;
+	default:
+		rc = -EOPNOTSUPP;
+		break;
+	}
+
+	return rc;
+}
+
/* The main entry point in this file. Called from net/core/dev_ioctl.c */
int dev_ethtool(struct net *net, struct ifreq *ifr)
{
struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
void __user *useraddr = ifr->ifr_data;
- u32 ethcmd;
+ u32 ethcmd, sub_cmd;
int rc;
netdev_features_t old_features;
@@ -1839,8 +2430,14 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
if (copy_from_user(&ethcmd, useraddr, sizeof(ethcmd)))
return -EFAULT;
+ if (ethcmd == ETHTOOL_PERQUEUE) {
+ if (copy_from_user(&sub_cmd, useraddr + sizeof(ethcmd), sizeof(sub_cmd)))
+ return -EFAULT;
+ } else {
+ sub_cmd = ethcmd;
+ }
/* Allow some commands to be done by anyone */
- switch (ethcmd) {
+ switch (sub_cmd) {
case ETHTOOL_GSET:
case ETHTOOL_GDRVINFO:
case ETHTOOL_GMSGLVL:
@@ -2070,6 +2667,15 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
case ETHTOOL_GPHYSTATS:
rc = ethtool_get_phy_stats(dev, useraddr);
break;
+ case ETHTOOL_PERQUEUE:
+ rc = ethtool_set_per_queue(dev, useraddr);
+ break;
+ case ETHTOOL_GLINKSETTINGS:
+ rc = ethtool_get_link_ksettings(dev, useraddr);
+ break;
+ case ETHTOOL_SLINKSETTINGS:
+ rc = ethtool_set_link_ksettings(dev, useraddr);
+ break;
default:
rc = -EOPNOTSUPP;
}
diff --git a/net/core/filter.c b/net/core/filter.c
index fb2951c35..ca7f832b2 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -530,12 +530,14 @@ do_pass:
*insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_TMP);
break;
- /* RET_K, RET_A are remaped into 2 insns. */
+ /* RET_K is remapped into 2 insns. RET_A case doesn't need an
+ * extra mov as BPF_REG_0 is already mapped into BPF_REG_A.
+ */
case BPF_RET | BPF_A:
case BPF_RET | BPF_K:
- *insn++ = BPF_MOV32_RAW(BPF_RVAL(fp->code) == BPF_K ?
- BPF_K : BPF_X, BPF_REG_0,
- BPF_REG_A, fp->k);
+ if (BPF_RVAL(fp->code) == BPF_K)
+ *insn++ = BPF_MOV32_RAW(BPF_K, BPF_REG_0,
+ 0, fp->k);
*insn = BPF_EXIT_INSN();
break;
@@ -1180,7 +1182,7 @@ static int __reuseport_attach_prog(struct bpf_prog *prog, struct sock *sk)
if (bpf_prog_size(prog->len) > sysctl_optmem_max)
return -ENOMEM;
- if (sk_unhashed(sk)) {
+ if (sk_unhashed(sk) && sk->sk_reuseport) {
err = reuseport_alloc(sk);
if (err)
return err;
@@ -1338,18 +1340,25 @@ int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk)
return 0;
}
-#define BPF_LDST_LEN 16U
+/* Per-cpu scratch area of MAX_BPF_STACK bytes, viewable either as raw
+ * bytes (buff) or as an array of __be32 words (diff).
+ */
+struct bpf_scratchpad {
+	union {
+		__be32 diff[MAX_BPF_STACK / sizeof(__be32)];
+		u8 buff[MAX_BPF_STACK];
+	};
+};
+
+/* one scratchpad instance per CPU */
+static DEFINE_PER_CPU(struct bpf_scratchpad, bpf_sp);
static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags)
{
+ struct bpf_scratchpad *sp = this_cpu_ptr(&bpf_sp);
struct sk_buff *skb = (struct sk_buff *) (long) r1;
int offset = (int) r2;
void *from = (void *) (long) r3;
unsigned int len = (unsigned int) r4;
- char buf[BPF_LDST_LEN];
void *ptr;
- if (unlikely(flags & ~(BPF_F_RECOMPUTE_CSUM)))
+ if (unlikely(flags & ~(BPF_F_RECOMPUTE_CSUM | BPF_F_INVALIDATE_HASH)))
return -EINVAL;
/* bpf verifier guarantees that:
@@ -1360,14 +1369,12 @@ static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags)
*
* so check for invalid 'offset' and too large 'len'
*/
- if (unlikely((u32) offset > 0xffff || len > sizeof(buf)))
+ if (unlikely((u32) offset > 0xffff || len > sizeof(sp->buff)))
return -EFAULT;
-
- if (unlikely(skb_cloned(skb) &&
- !skb_clone_writable(skb, offset + len)))
+ if (unlikely(skb_try_make_writable(skb, offset + len)))
return -EFAULT;
- ptr = skb_header_pointer(skb, offset, len, buf);
+ ptr = skb_header_pointer(skb, offset, len, sp->buff);
if (unlikely(!ptr))
return -EFAULT;
@@ -1376,17 +1383,19 @@ static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags)
memcpy(ptr, from, len);
- if (ptr == buf)
+ if (ptr == sp->buff)
/* skb_store_bits cannot return -EFAULT here */
skb_store_bits(skb, offset, ptr, len);
if (flags & BPF_F_RECOMPUTE_CSUM)
skb_postpush_rcsum(skb, ptr, len);
+ if (flags & BPF_F_INVALIDATE_HASH)
+ skb_clear_hash(skb);
return 0;
}
-const struct bpf_func_proto bpf_skb_store_bytes_proto = {
+static const struct bpf_func_proto bpf_skb_store_bytes_proto = {
.func = bpf_skb_store_bytes,
.gpl_only = false,
.ret_type = RET_INTEGER,
@@ -1405,7 +1414,7 @@ static u64 bpf_skb_load_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
unsigned int len = (unsigned int) r4;
void *ptr;
- if (unlikely((u32) offset > 0xffff || len > BPF_LDST_LEN))
+ if (unlikely((u32) offset > 0xffff || len > MAX_BPF_STACK))
return -EFAULT;
ptr = skb_header_pointer(skb, offset, len, to);
@@ -1417,7 +1426,7 @@ static u64 bpf_skb_load_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
return 0;
}
-const struct bpf_func_proto bpf_skb_load_bytes_proto = {
+static const struct bpf_func_proto bpf_skb_load_bytes_proto = {
.func = bpf_skb_load_bytes,
.gpl_only = false,
.ret_type = RET_INTEGER,
@@ -1437,9 +1446,7 @@ static u64 bpf_l3_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags)
return -EINVAL;
if (unlikely((u32) offset > 0xffff))
return -EFAULT;
-
- if (unlikely(skb_cloned(skb) &&
- !skb_clone_writable(skb, offset + sizeof(sum))))
+ if (unlikely(skb_try_make_writable(skb, offset + sizeof(sum))))
return -EFAULT;
ptr = skb_header_pointer(skb, offset, sizeof(sum), &sum);
@@ -1447,6 +1454,12 @@ static u64 bpf_l3_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags)
return -EFAULT;
switch (flags & BPF_F_HDR_FIELD_MASK) {
+ case 0:
+ if (unlikely(from != 0))
+ return -EINVAL;
+
+ csum_replace_by_diff(ptr, to);
+ break;
case 2:
csum_replace2(ptr, from, to);
break;
@@ -1464,7 +1477,7 @@ static u64 bpf_l3_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags)
return 0;
}
-const struct bpf_func_proto bpf_l3_csum_replace_proto = {
+static const struct bpf_func_proto bpf_l3_csum_replace_proto = {
.func = bpf_l3_csum_replace,
.gpl_only = false,
.ret_type = RET_INTEGER,
@@ -1479,23 +1492,31 @@ static u64 bpf_l4_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags)
{
struct sk_buff *skb = (struct sk_buff *) (long) r1;
bool is_pseudo = flags & BPF_F_PSEUDO_HDR;
+ bool is_mmzero = flags & BPF_F_MARK_MANGLED_0;
int offset = (int) r2;
__sum16 sum, *ptr;
- if (unlikely(flags & ~(BPF_F_PSEUDO_HDR | BPF_F_HDR_FIELD_MASK)))
+ if (unlikely(flags & ~(BPF_F_MARK_MANGLED_0 | BPF_F_PSEUDO_HDR |
+ BPF_F_HDR_FIELD_MASK)))
return -EINVAL;
if (unlikely((u32) offset > 0xffff))
return -EFAULT;
-
- if (unlikely(skb_cloned(skb) &&
- !skb_clone_writable(skb, offset + sizeof(sum))))
+ if (unlikely(skb_try_make_writable(skb, offset + sizeof(sum))))
return -EFAULT;
ptr = skb_header_pointer(skb, offset, sizeof(sum), &sum);
if (unlikely(!ptr))
return -EFAULT;
+ if (is_mmzero && !*ptr)
+ return 0;
switch (flags & BPF_F_HDR_FIELD_MASK) {
+ case 0:
+ if (unlikely(from != 0))
+ return -EINVAL;
+
+ inet_proto_csum_replace_by_diff(ptr, skb, to, is_pseudo);
+ break;
case 2:
inet_proto_csum_replace2(ptr, skb, from, to, is_pseudo);
break;
@@ -1506,6 +1527,8 @@ static u64 bpf_l4_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags)
return -EINVAL;
}
+ if (is_mmzero && !*ptr)
+ *ptr = CSUM_MANGLED_0;
if (ptr == &sum)
/* skb_store_bits guaranteed to not return -EFAULT here */
skb_store_bits(skb, offset, ptr, sizeof(sum));
@@ -1513,7 +1536,7 @@ static u64 bpf_l4_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags)
return 0;
}
-const struct bpf_func_proto bpf_l4_csum_replace_proto = {
+static const struct bpf_func_proto bpf_l4_csum_replace_proto = {
.func = bpf_l4_csum_replace,
.gpl_only = false,
.ret_type = RET_INTEGER,
@@ -1524,6 +1547,45 @@ const struct bpf_func_proto bpf_l4_csum_replace_proto = {
.arg5_type = ARG_ANYTHING,
};
+/* BPF helper: checksum difference between a "from" and a "to" buffer.
+ *
+ * The complemented words of @r1 (@from_size bytes) followed by the words
+ * of @r3 (@to_size bytes) are staged in the per-cpu scratchpad and
+ * summed with csum_partial(), starting from @seed. Returns -EINVAL when
+ * either size is not a multiple of 4 or the combined size exceeds the
+ * scratchpad.
+ */
+static u64 bpf_csum_diff(u64 r1, u64 from_size, u64 r3, u64 to_size, u64 seed)
+{
+	struct bpf_scratchpad *sp = this_cpu_ptr(&bpf_sp);
+	__be32 *from = (__be32 *) (long) r1;
+	__be32 *to = (__be32 *) (long) r3;
+	u64 diff_size = from_size + to_size;
+	__be32 *out = sp->diff;
+	int i;
+
+	/* This is quite flexible, some examples:
+	 *
+	 * from_size == 0, to_size > 0, seed := csum --> pushing data
+	 * from_size > 0, to_size == 0, seed := csum --> pulling data
+	 * from_size > 0, to_size > 0, seed := 0 --> diffing data
+	 *
+	 * Even for diffing, from_size and to_size don't need to be equal.
+	 */
+	if (unlikely(((from_size | to_size) & (sizeof(__be32) - 1)) ||
+		     diff_size > sizeof(sp->diff)))
+		return -EINVAL;
+
+	/* old data enters the sum complemented, new data as is */
+	for (i = 0; i < from_size / sizeof(__be32); i++)
+		*out++ = ~from[i];
+	for (i = 0; i < to_size / sizeof(__be32); i++)
+		*out++ = to[i];
+
+	return csum_partial(sp->diff, diff_size, seed);
+}
+
+/* Helper prototype for bpf_csum_diff(): two (stack pointer, size or
+ * zero) pairs followed by an unconstrained seed.
+ */
+static const struct bpf_func_proto bpf_csum_diff_proto = {
+	.func		= bpf_csum_diff,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_STACK,
+	.arg2_type	= ARG_CONST_STACK_SIZE_OR_ZERO,
+	.arg3_type	= ARG_PTR_TO_STACK,
+	.arg4_type	= ARG_CONST_STACK_SIZE_OR_ZERO,
+	.arg5_type	= ARG_ANYTHING,
+};
+
static u64 bpf_clone_redirect(u64 r1, u64 ifindex, u64 flags, u64 r4, u64 r5)
{
struct sk_buff *skb = (struct sk_buff *) (long) r1, *skb2;
@@ -1548,11 +1610,10 @@ static u64 bpf_clone_redirect(u64 r1, u64 ifindex, u64 flags, u64 r4, u64 r5)
}
skb2->dev = dev;
- skb_sender_cpu_clear(skb2);
return dev_queue_xmit(skb2);
}
-const struct bpf_func_proto bpf_clone_redirect_proto = {
+static const struct bpf_func_proto bpf_clone_redirect_proto = {
.func = bpf_clone_redirect,
.gpl_only = false,
.ret_type = RET_INTEGER,
@@ -1601,11 +1662,10 @@ int skb_do_redirect(struct sk_buff *skb)
}
skb->dev = dev;
- skb_sender_cpu_clear(skb);
return dev_queue_xmit(skb);
}
-const struct bpf_func_proto bpf_redirect_proto = {
+static const struct bpf_func_proto bpf_redirect_proto = {
.func = bpf_redirect,
.gpl_only = false,
.ret_type = RET_INTEGER,
@@ -1627,14 +1687,7 @@ static const struct bpf_func_proto bpf_get_cgroup_classid_proto = {
static u64 bpf_get_route_realm(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
{
-#ifdef CONFIG_IP_ROUTE_CLASSID
- const struct dst_entry *dst;
-
- dst = skb_dst((struct sk_buff *) (unsigned long) r1);
- if (dst)
- return dst->tclassid;
-#endif
- return 0;
+	struct sk_buff *skb = (struct sk_buff *) (unsigned long) r1;
+
+	/* dst_tclassid() replaces the open-coded lookup removed above */
+	return dst_tclassid(skb);
}
static const struct bpf_func_proto bpf_get_route_realm_proto = {
@@ -1687,6 +1740,13 @@ bool bpf_helper_changes_skb_data(void *func)
return true;
if (func == bpf_skb_vlan_pop)
return true;
+ if (func == bpf_skb_store_bytes)
+ return true;
+ if (func == bpf_l3_csum_replace)
+ return true;
+ if (func == bpf_l4_csum_replace)
+ return true;
+
return false;
}
@@ -1708,12 +1768,16 @@ static u64 bpf_skb_get_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5)
return -EPROTO;
if (unlikely(size != sizeof(struct bpf_tunnel_key))) {
switch (size) {
+ case offsetof(struct bpf_tunnel_key, tunnel_label):
+ case offsetof(struct bpf_tunnel_key, tunnel_ext):
+ goto set_compat;
case offsetof(struct bpf_tunnel_key, remote_ipv6[1]):
/* Fixup deprecated structure layouts here, so we have
* a common path later on.
*/
if (ip_tunnel_info_af(info) != AF_INET)
return -EINVAL;
+set_compat:
to = (struct bpf_tunnel_key *)compat;
break;
default:
@@ -1725,11 +1789,13 @@ static u64 bpf_skb_get_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5)
to->tunnel_tos = info->key.tos;
to->tunnel_ttl = info->key.ttl;
- if (flags & BPF_F_TUNINFO_IPV6)
+ if (flags & BPF_F_TUNINFO_IPV6) {
memcpy(to->remote_ipv6, &info->key.u.ipv6.src,
sizeof(to->remote_ipv6));
- else
+ to->tunnel_label = be32_to_cpu(info->key.label);
+ } else {
to->remote_ipv4 = be32_to_cpu(info->key.u.ipv4.src);
+ }
if (unlikely(size != sizeof(struct bpf_tunnel_key)))
memcpy((void *)(long) r2, to, size);
@@ -1737,7 +1803,7 @@ static u64 bpf_skb_get_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5)
return 0;
}
-const struct bpf_func_proto bpf_skb_get_tunnel_key_proto = {
+static const struct bpf_func_proto bpf_skb_get_tunnel_key_proto = {
.func = bpf_skb_get_tunnel_key,
.gpl_only = false,
.ret_type = RET_INTEGER,
@@ -1747,6 +1813,32 @@ const struct bpf_func_proto bpf_skb_get_tunnel_key_proto = {
.arg4_type = ARG_ANYTHING,
};
+/* BPF helper: fetch the tunnel options of the skb's tunnel metadata.
+ *
+ * r1 carries the skb and r2 the destination buffer whose capacity is
+ * @size; r4 and r5 are not used.  The buffer is handed to
+ * ip_tunnel_info_opts_get() together with the tunnel info.
+ *
+ * Returns info->options_len on success, -ENOENT when the skb has no
+ * tunnel info or TUNNEL_OPTIONS_PRESENT is not set in its flags, and
+ * -ENOMEM when @size is smaller than the stored options length.
+ */
+static u64 bpf_skb_get_tunnel_opt(u64 r1, u64 r2, u64 size, u64 r4, u64 r5)
+{
+	struct sk_buff *skb = (struct sk_buff *) (long) r1;
+	u8 *dst = (u8 *) (long) r2;
+	const struct ip_tunnel_info *tun_info;
+
+	tun_info = skb_tunnel_info(skb);
+	if (unlikely(!tun_info))
+		return -ENOENT;
+	if (unlikely(!(tun_info->key.tun_flags & TUNNEL_OPTIONS_PRESENT)))
+		return -ENOENT;
+
+	/* Caller's buffer must be able to hold every option byte. */
+	if (unlikely(size < tun_info->options_len))
+		return -ENOMEM;
+
+	ip_tunnel_info_opts_get(dst, tun_info);
+
+	return tun_info->options_len;
+}
+
+/* Helper descriptor: (ctx, stack buffer, buffer size) -> integer. */
+static const struct bpf_func_proto bpf_skb_get_tunnel_opt_proto = {
+	.func		= bpf_skb_get_tunnel_opt,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_PTR_TO_STACK,
+	.arg3_type	= ARG_CONST_STACK_SIZE,
+};
+
static struct metadata_dst __percpu *md_dst;
static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5)
@@ -1757,10 +1849,13 @@ static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5)
u8 compat[sizeof(struct bpf_tunnel_key)];
struct ip_tunnel_info *info;
- if (unlikely(flags & ~(BPF_F_TUNINFO_IPV6 | BPF_F_ZERO_CSUM_TX)))
+ if (unlikely(flags & ~(BPF_F_TUNINFO_IPV6 | BPF_F_ZERO_CSUM_TX |
+ BPF_F_DONT_FRAGMENT)))
return -EINVAL;
if (unlikely(size != sizeof(struct bpf_tunnel_key))) {
switch (size) {
+ case offsetof(struct bpf_tunnel_key, tunnel_label):
+ case offsetof(struct bpf_tunnel_key, tunnel_ext):
case offsetof(struct bpf_tunnel_key, remote_ipv6[1]):
/* Fixup deprecated structure layouts here, so we have
* a common path later on.
@@ -1773,6 +1868,9 @@ static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5)
return -EINVAL;
}
}
+ if (unlikely((!(flags & BPF_F_TUNINFO_IPV6) && from->tunnel_label) ||
+ from->tunnel_ext))
+ return -EINVAL;
skb_dst_drop(skb);
dst_hold((struct dst_entry *) md);
@@ -1781,7 +1879,10 @@ static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5)
info = &md->u.tun_info;
info->mode = IP_TUNNEL_INFO_TX;
- info->key.tun_flags = TUNNEL_KEY | TUNNEL_CSUM;
+ info->key.tun_flags = TUNNEL_KEY | TUNNEL_CSUM | TUNNEL_NOCACHE;
+ if (flags & BPF_F_DONT_FRAGMENT)
+ info->key.tun_flags |= TUNNEL_DONT_FRAGMENT;
+
info->key.tun_id = cpu_to_be64(from->tunnel_id);
info->key.tos = from->tunnel_tos;
info->key.ttl = from->tunnel_ttl;
@@ -1790,6 +1891,8 @@ static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5)
info->mode |= IP_TUNNEL_INFO_IPV6;
memcpy(&info->key.u.ipv6.dst, from->remote_ipv6,
sizeof(from->remote_ipv6));
+ info->key.label = cpu_to_be32(from->tunnel_label) &
+ IPV6_FLOWLABEL_MASK;
} else {
info->key.u.ipv4.dst = cpu_to_be32(from->remote_ipv4);
if (flags & BPF_F_ZERO_CSUM_TX)
@@ -1799,7 +1902,7 @@ static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5)
return 0;
}
-const struct bpf_func_proto bpf_skb_set_tunnel_key_proto = {
+static const struct bpf_func_proto bpf_skb_set_tunnel_key_proto = {
.func = bpf_skb_set_tunnel_key,
.gpl_only = false,
.ret_type = RET_INTEGER,
@@ -1809,17 +1912,53 @@ const struct bpf_func_proto bpf_skb_set_tunnel_key_proto = {
.arg4_type = ARG_ANYTHING,
};
-static const struct bpf_func_proto *bpf_get_skb_set_tunnel_key_proto(void)
+/* BPF helper: attach tunnel options to the skb's tunnel metadata.
+ *
+ * r1 carries the skb and r2 the source buffer of @size bytes; r4 and
+ * r5 are not used.  The options are only accepted when the skb's
+ * tunnel info is the one embedded in this CPU's md_dst.
+ *
+ * Returns 0 on success, -EINVAL when the tunnel info is not this CPU's
+ * md_dst info or @size is not a multiple of sizeof(u32), and -ENOMEM
+ * when @size exceeds IP_TUNNEL_OPTS_MAX.
+ */
+static u64 bpf_skb_set_tunnel_opt(u64 r1, u64 r2, u64 size, u64 r4, u64 r5)
+{
+	struct sk_buff *skb = (struct sk_buff *) (long) r1;
+	u8 *src = (u8 *) (long) r2;
+	struct ip_tunnel_info *tun_info = skb_tunnel_info(skb);
+	const struct metadata_dst *cpu_md = this_cpu_ptr(md_dst);
+
+	if (unlikely(tun_info != &cpu_md->u.tun_info))
+		return -EINVAL;
+	/* Option blobs must be a whole number of 32-bit words. */
+	if (unlikely(size % sizeof(u32)))
+		return -EINVAL;
+	if (unlikely(size > IP_TUNNEL_OPTS_MAX))
+		return -ENOMEM;
+
+	ip_tunnel_info_opts_set(tun_info, src, size);
+
+	return 0;
+}
+
+/* Helper descriptor: (ctx, stack buffer, buffer size) -> integer. */
+static const struct bpf_func_proto bpf_skb_set_tunnel_opt_proto = {
+	.func		= bpf_skb_set_tunnel_opt,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_PTR_TO_STACK,
+	.arg3_type	= ARG_CONST_STACK_SIZE,
+};
+
+static const struct bpf_func_proto *
+bpf_get_skb_set_tunnel_proto(enum bpf_func_id which)
{
if (!md_dst) {
- /* race is not possible, since it's called from
- * verifier that is holding verifier mutex
+ /* Race is not possible, since it's called from verifier
+ * that is holding verifier mutex.
*/
- md_dst = metadata_dst_alloc_percpu(0, GFP_KERNEL);
+ md_dst = metadata_dst_alloc_percpu(IP_TUNNEL_OPTS_MAX,
+ GFP_KERNEL);
if (!md_dst)
return NULL;
}
- return &bpf_skb_set_tunnel_key_proto;
+
+ switch (which) {
+ case BPF_FUNC_skb_set_tunnel_key:
+ return &bpf_skb_set_tunnel_key_proto;
+ case BPF_FUNC_skb_set_tunnel_opt:
+ return &bpf_skb_set_tunnel_opt_proto;
+ default:
+ return NULL;
+ }
}
static const struct bpf_func_proto *
@@ -1856,6 +1995,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id)
return &bpf_skb_store_bytes_proto;
case BPF_FUNC_skb_load_bytes:
return &bpf_skb_load_bytes_proto;
+ case BPF_FUNC_csum_diff:
+ return &bpf_csum_diff_proto;
case BPF_FUNC_l3_csum_replace:
return &bpf_l3_csum_replace_proto;
case BPF_FUNC_l4_csum_replace:
@@ -1871,7 +2012,11 @@ tc_cls_act_func_proto(enum bpf_func_id func_id)
case BPF_FUNC_skb_get_tunnel_key:
return &bpf_skb_get_tunnel_key_proto;
case BPF_FUNC_skb_set_tunnel_key:
- return bpf_get_skb_set_tunnel_key_proto();
+ return bpf_get_skb_set_tunnel_proto(func_id);
+ case BPF_FUNC_skb_get_tunnel_opt:
+ return &bpf_skb_get_tunnel_opt_proto;
+ case BPF_FUNC_skb_set_tunnel_opt:
+ return bpf_get_skb_set_tunnel_proto(func_id);
case BPF_FUNC_redirect:
return &bpf_redirect_proto;
case BPF_FUNC_get_route_realm:
@@ -1920,16 +2065,14 @@ static bool sk_filter_is_valid_access(int off, int size,
static bool tc_cls_act_is_valid_access(int off, int size,
enum bpf_access_type type)
{
- if (off == offsetof(struct __sk_buff, tc_classid))
- return type == BPF_WRITE ? true : false;
-
if (type == BPF_WRITE) {
switch (off) {
case offsetof(struct __sk_buff, mark):
case offsetof(struct __sk_buff, tc_index):
case offsetof(struct __sk_buff, priority):
case offsetof(struct __sk_buff, cb[0]) ...
- offsetof(struct __sk_buff, cb[4]):
+ offsetof(struct __sk_buff, cb[4]):
+ case offsetof(struct __sk_buff, tc_classid):
break;
default:
return false;
@@ -2046,8 +2189,10 @@ static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg,
ctx_off -= offsetof(struct __sk_buff, tc_classid);
ctx_off += offsetof(struct sk_buff, cb);
ctx_off += offsetof(struct qdisc_skb_cb, tc_classid);
- WARN_ON(type != BPF_WRITE);
- *insn++ = BPF_STX_MEM(BPF_H, dst_reg, src_reg, ctx_off);
+ if (type == BPF_WRITE)
+ *insn++ = BPF_STX_MEM(BPF_H, dst_reg, src_reg, ctx_off);
+ else
+ *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, ctx_off);
break;
case offsetof(struct __sk_buff, tc_index):
diff --git a/net/core/flow.c b/net/core/flow.c
index 1033725be..3937b1b68 100644
--- a/net/core/flow.c
+++ b/net/core/flow.c
@@ -92,8 +92,11 @@ static void flow_cache_gc_task(struct work_struct *work)
list_splice_tail_init(&xfrm->flow_cache_gc_list, &gc_list);
spin_unlock_bh(&xfrm->flow_cache_gc_lock);
- list_for_each_entry_safe(fce, n, &gc_list, u.gc_list)
+ list_for_each_entry_safe(fce, n, &gc_list, u.gc_list) {
flow_entry_kill(fce, xfrm);
+ atomic_dec(&xfrm->flow_cache_gc_count);
+ WARN_ON(atomic_read(&xfrm->flow_cache_gc_count) < 0);
+ }
}
static void flow_cache_queue_garbage(struct flow_cache_percpu *fcp,
@@ -101,6 +104,7 @@ static void flow_cache_queue_garbage(struct flow_cache_percpu *fcp,
struct netns_xfrm *xfrm)
{
if (deleted) {
+ atomic_add(deleted, &xfrm->flow_cache_gc_count);
fcp->hash_count -= deleted;
spin_lock_bh(&xfrm->flow_cache_gc_lock);
list_splice_tail(gc_list, &xfrm->flow_cache_gc_list);
@@ -232,6 +236,13 @@ flow_cache_lookup(struct net *net, const struct flowi *key, u16 family, u8 dir,
if (fcp->hash_count > fc->high_watermark)
flow_cache_shrink(fc, fcp);
+ if (fcp->hash_count > 2 * fc->high_watermark ||
+ atomic_read(&net->xfrm.flow_cache_gc_count) > fc->high_watermark) {
+ atomic_inc(&net->xfrm.flow_cache_genid);
+ flo = ERR_PTR(-ENOBUFS);
+ goto ret_object;
+ }
+
fle = kmem_cache_alloc(flow_cachep, GFP_ATOMIC);
if (fle) {
fle->net = net;
@@ -446,6 +457,7 @@ int flow_cache_init(struct net *net)
INIT_WORK(&net->xfrm.flow_cache_gc_work, flow_cache_gc_task);
INIT_WORK(&net->xfrm.flow_cache_flush_work, flow_cache_flush_task);
mutex_init(&net->xfrm.flow_flush_sem);
+ atomic_set(&net->xfrm.flow_cache_gc_count, 0);
fc->hash_shift = 10;
fc->low_watermark = 2 * flow_cache_hash_size(fc);
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 12e700332..a669dea14 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -19,25 +19,12 @@
#include <net/flow_dissector.h>
#include <scsi/fc/fc_fcoe.h>
-static bool dissector_uses_key(const struct flow_dissector *flow_dissector,
- enum flow_dissector_key_id key_id)
-{
- return flow_dissector->used_keys & (1 << key_id);
-}
-
static void dissector_set_key(struct flow_dissector *flow_dissector,
enum flow_dissector_key_id key_id)
{
flow_dissector->used_keys |= (1 << key_id);
}
-static void *skb_flow_dissector_target(struct flow_dissector *flow_dissector,
- enum flow_dissector_key_id key_id,
- void *target_container)
-{
- return ((char *) target_container) + flow_dissector->offset[key_id];
-}
-
void skb_flow_dissector_init(struct flow_dissector *flow_dissector,
const struct flow_dissector_key *key,
unsigned int key_count)
@@ -178,15 +165,16 @@ ip:
ip_proto = iph->protocol;
- if (!dissector_uses_key(flow_dissector,
- FLOW_DISSECTOR_KEY_IPV4_ADDRS))
- break;
+ if (dissector_uses_key(flow_dissector,
+ FLOW_DISSECTOR_KEY_IPV4_ADDRS)) {
+ key_addrs = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_IPV4_ADDRS,
+ target_container);
- key_addrs = skb_flow_dissector_target(flow_dissector,
- FLOW_DISSECTOR_KEY_IPV4_ADDRS, target_container);
- memcpy(&key_addrs->v4addrs, &iph->saddr,
- sizeof(key_addrs->v4addrs));
- key_control->addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
+ memcpy(&key_addrs->v4addrs, &iph->saddr,
+ sizeof(key_addrs->v4addrs));
+ key_control->addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
+ }
if (ip_is_fragment(iph)) {
key_control->flags |= FLOW_DIS_IS_FRAGMENT;
@@ -219,13 +207,12 @@ ipv6:
if (dissector_uses_key(flow_dissector,
FLOW_DISSECTOR_KEY_IPV6_ADDRS)) {
- struct flow_dissector_key_ipv6_addrs *key_ipv6_addrs;
-
- key_ipv6_addrs = skb_flow_dissector_target(flow_dissector,
- FLOW_DISSECTOR_KEY_IPV6_ADDRS,
- target_container);
+ key_addrs = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_IPV6_ADDRS,
+ target_container);
- memcpy(key_ipv6_addrs, &iph->saddr, sizeof(*key_ipv6_addrs));
+ memcpy(&key_addrs->v6addrs, &iph->saddr,
+ sizeof(key_addrs->v6addrs));
key_control->addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
}
@@ -339,8 +326,11 @@ mpls:
}
case htons(ETH_P_FCOE):
- key_control->thoff = (u16)(nhoff + FCOE_HEADER_LEN);
- /* fall through */
+ if ((hlen - nhoff) < FCOE_HEADER_LEN)
+ goto out_bad;
+
+ nhoff += FCOE_HEADER_LEN;
+ goto out_good;
default:
goto out_bad;
}
@@ -447,13 +437,12 @@ ip_proto_again:
key_control->flags |= FLOW_DIS_IS_FRAGMENT;
nhoff += sizeof(_fh);
+ ip_proto = fh->nexthdr;
if (!(fh->frag_off & htons(IP6_OFFSET))) {
key_control->flags |= FLOW_DIS_FIRST_FRAG;
- if (flags & FLOW_DISSECTOR_F_PARSE_1ST_FRAG) {
- ip_proto = fh->nexthdr;
+ if (flags & FLOW_DISSECTOR_F_PARSE_1ST_FRAG)
goto ip_proto_again;
- }
}
goto out_good;
}
@@ -740,6 +729,11 @@ u32 __skb_get_poff(const struct sk_buff *skb, void *data,
{
u32 poff = keys->control.thoff;
+ /* skip L4 headers for fragments after the first */
+ if ((keys->control.flags & FLOW_DIS_IS_FRAGMENT) &&
+ !(keys->control.flags & FLOW_DIS_FIRST_FRAG))
+ return poff;
+
switch (keys->basic.ip_proto) {
case IPPROTO_TCP: {
/* access doff as u8 to avoid unaligned access */
diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c
index 92d886f4a..4573d8109 100644
--- a/net/core/gen_estimator.c
+++ b/net/core/gen_estimator.c
@@ -191,6 +191,7 @@ struct gen_estimator *gen_find_node(const struct gnet_stats_basic_packed *bstats
/**
* gen_new_estimator - create a new rate estimator
* @bstats: basic statistics
+ * @cpu_bstats: bstats per cpu
* @rate_est: rate estimator statistics
* @stats_lock: statistics lock
* @opt: rate estimator configuration TLV
@@ -287,6 +288,7 @@ EXPORT_SYMBOL(gen_kill_estimator);
/**
* gen_replace_estimator - replace rate estimator configuration
* @bstats: basic statistics
+ * @cpu_bstats: bstats per cpu
* @rate_est: rate estimator statistics
* @stats_lock: statistics lock
* @opt: rate estimator configuration TLV
diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c
index 1e2f46a69..e640462ea 100644
--- a/net/core/gen_stats.c
+++ b/net/core/gen_stats.c
@@ -140,6 +140,7 @@ EXPORT_SYMBOL(__gnet_stats_copy_basic);
/**
* gnet_stats_copy_basic - copy basic statistics into statistic TLV
* @d: dumping handle
+ * @cpu: copy statistic per cpu
* @b: basic statistics
*
* Appends the basic statistics to the top level TLV created by
diff --git a/net/core/hwbm.c b/net/core/hwbm.c
new file mode 100644
index 000000000..941c28486
--- /dev/null
+++ b/net/core/hwbm.c
@@ -0,0 +1,87 @@
+/* Support for hardware buffer manager.
+ *
+ * Copyright (C) 2016 Marvell
+ *
+ * Gregory CLEMENT <gregory.clement@free-electrons.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+#include <linux/kernel.h>
+#include <linux/printk.h>
+#include <linux/skbuff.h>
+#include <net/hwbm.h>
+
+/* Release one pool buffer.
+ *
+ * Buffers from pools whose frag_size is at most PAGE_SIZE are released
+ * with skb_free_frag(); larger ones go back through kfree().  This is
+ * the same size split hwbm_pool_refill() uses when allocating.
+ */
+void hwbm_buf_free(struct hwbm_pool *bm_pool, void *buf)
+{
+	if (unlikely(bm_pool->frag_size > PAGE_SIZE)) {
+		kfree(buf);
+		return;
+	}
+
+	skb_free_frag(buf);
+}
+EXPORT_SYMBOL_GPL(hwbm_buf_free);
+
+/* Refill processing for HW buffer management: allocate one buffer of
+ * bm_pool->frag_size bytes and pass it to the pool's construct()
+ * callback when one is set.
+ *
+ * Returns 0 on success and -ENOMEM when the allocation fails or the
+ * construct() callback reports failure (the buffer is freed again in
+ * that case).
+ *
+ * NOTE(review): when no construct() callback is installed the new
+ * buffer is not stored anywhere by this function - confirm that every
+ * pool user provides the callback.
+ */
+int hwbm_pool_refill(struct hwbm_pool *bm_pool, gfp_t gfp)
+{
+	int len = bm_pool->frag_size;
+	void *mem;
+
+	/* Page-sized or smaller requests come from the frag allocator. */
+	mem = likely(len <= PAGE_SIZE) ? netdev_alloc_frag(len)
+				       : kmalloc(len, gfp);
+	if (!mem)
+		return -ENOMEM;
+
+	if (bm_pool->construct && bm_pool->construct(bm_pool, mem)) {
+		hwbm_buf_free(bm_pool, mem);
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(hwbm_pool_refill);
+
+/* Add up to @buf_num buffers to @bm_pool by calling hwbm_pool_refill()
+ * repeatedly, stopping at the first refill failure.
+ *
+ * Returns the number of buffers actually added.  Returns 0 when the
+ * request is rejected because it would exceed bm_pool->size or wrap
+ * the buffer counter, and the current bm_pool->buf_num when the pool
+ * is already full.
+ *
+ * bm_pool->lock is taken for the whole operation and is released on
+ * every exit path.
+ *
+ * NOTE(review): the refills run with the spinlock held and interrupts
+ * disabled, so @gfp presumably must not permit sleeping - confirm
+ * against the callers.
+ */
+int hwbm_pool_add(struct hwbm_pool *bm_pool, unsigned int buf_num, gfp_t gfp)
+{
+	int err, i;
+	int ret = 0;
+	unsigned long flags;
+
+	spin_lock_irqsave(&bm_pool->lock, flags);
+	if (bm_pool->buf_num == bm_pool->size) {
+		pr_warn("pool already filled\n");
+		ret = bm_pool->buf_num;
+		goto out_unlock;
+	}
+
+	if (buf_num + bm_pool->buf_num > bm_pool->size) {
+		pr_warn("cannot allocate %u buffers for pool\n",
+			buf_num);
+		goto out_unlock;
+	}
+
+	/* Catch a request large enough to wrap the sum above. */
+	if ((buf_num + bm_pool->buf_num) < bm_pool->buf_num) {
+		pr_warn("Adding %u buffers to the %d current buffers will overflow\n",
+			buf_num, bm_pool->buf_num);
+		goto out_unlock;
+	}
+
+	for (i = 0; i < buf_num; i++) {
+		err = hwbm_pool_refill(bm_pool, gfp);
+		if (err < 0)
+			break;
+	}
+
+	/* Update BM driver with number of buffers added to pool */
+	bm_pool->buf_num += i;
+
+	pr_debug("hwpm pool: %d of %u buffers added\n", i, buf_num);
+	ret = i;
+
+out_unlock:
+	/* Single exit so the lock is never left held on an early return. */
+	spin_unlock_irqrestore(&bm_pool->lock, flags);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(hwbm_pool_add);
diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c
index 299cfc24d..669ecc9f8 100644
--- a/net/core/lwtunnel.c
+++ b/net/core/lwtunnel.c
@@ -27,6 +27,31 @@
#include <net/rtnetlink.h>
#include <net/ip6_fib.h>
+#ifdef CONFIG_MODULES
+
+/* Map an encap type to the string used to build the module name passed
+ * to request_module().  Returns NULL for types that have no string.
+ */
+static const char *lwtunnel_encap_str(enum lwtunnel_encap_types encap_type)
+{
+	const char *name = NULL;
+
+	/* Only lwt encaps implemented without using an interface for
+	 * the encap need to return a string here.
+	 */
+	switch (encap_type) {
+	case LWTUNNEL_ENCAP_ILA:
+		name = "ILA";
+		break;
+	case LWTUNNEL_ENCAP_MPLS:
+		name = "MPLS";
+		break;
+	case LWTUNNEL_ENCAP_NONE:
+	case LWTUNNEL_ENCAP_IP:
+	case LWTUNNEL_ENCAP_IP6:
+	case __LWTUNNEL_ENCAP_MAX:
+		/* should not have got here */
+		WARN_ON(1);
+		break;
+	}
+
+	return name;
+}
+
+#endif /* CONFIG_MODULES */
+
struct lwtunnel_state *lwtunnel_state_alloc(int encap_len)
{
struct lwtunnel_state *lws;
@@ -85,6 +110,18 @@ int lwtunnel_build_state(struct net_device *dev, u16 encap_type,
ret = -EOPNOTSUPP;
rcu_read_lock();
ops = rcu_dereference(lwtun_encaps[encap_type]);
+#ifdef CONFIG_MODULES
+ if (!ops) {
+ const char *encap_type_str = lwtunnel_encap_str(encap_type);
+
+ if (encap_type_str) {
+ rcu_read_unlock();
+ request_module("rtnl-lwt-%s", encap_type_str);
+ rcu_read_lock();
+ ops = rcu_dereference(lwtun_encaps[encap_type]);
+ }
+ }
+#endif
if (likely(ops && ops->build_state))
ret = ops->build_state(dev, encap, family, cfg, lws);
rcu_read_unlock();
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index b6c8a6629..2b3f76fe6 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -29,7 +29,6 @@
#ifdef CONFIG_SYSFS
static const char fmt_hex[] = "%#x\n";
-static const char fmt_long_hex[] = "%#lx\n";
static const char fmt_dec[] = "%d\n";
static const char fmt_ulong[] = "%lu\n";
static const char fmt_u64[] = "%llu\n";
@@ -199,9 +198,10 @@ static ssize_t speed_show(struct device *dev,
return restart_syscall();
if (netif_running(netdev)) {
- struct ethtool_cmd cmd;
- if (!__ethtool_get_settings(netdev, &cmd))
- ret = sprintf(buf, fmt_dec, ethtool_cmd_speed(&cmd));
+ struct ethtool_link_ksettings cmd;
+
+ if (!__ethtool_get_link_ksettings(netdev, &cmd))
+ ret = sprintf(buf, fmt_dec, cmd.base.speed);
}
rtnl_unlock();
return ret;
@@ -218,10 +218,12 @@ static ssize_t duplex_show(struct device *dev,
return restart_syscall();
if (netif_running(netdev)) {
- struct ethtool_cmd cmd;
- if (!__ethtool_get_settings(netdev, &cmd)) {
+ struct ethtool_link_ksettings cmd;
+
+ if (!__ethtool_get_link_ksettings(netdev, &cmd)) {
const char *duplex;
- switch (cmd.duplex) {
+
+ switch (cmd.base.duplex) {
case DUPLEX_HALF:
duplex = "half";
break;
@@ -574,6 +576,7 @@ NETSTAT_ENTRY(tx_heartbeat_errors);
NETSTAT_ENTRY(tx_window_errors);
NETSTAT_ENTRY(rx_compressed);
NETSTAT_ENTRY(tx_compressed);
+NETSTAT_ENTRY(rx_nohandler);
static struct attribute *netstat_attrs[] = {
&dev_attr_rx_packets.attr,
@@ -599,6 +602,7 @@ static struct attribute *netstat_attrs[] = {
&dev_attr_tx_window_errors.attr,
&dev_attr_rx_compressed.attr,
&dev_attr_tx_compressed.attr,
+ &dev_attr_rx_nohandler.attr,
NULL
};
diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c
index 0260c84ed..11fce1727 100644
--- a/net/core/netclassid_cgroup.c
+++ b/net/core/netclassid_cgroup.c
@@ -9,7 +9,6 @@
* Authors: Thomas Graf <tgraf@suug.ch>
*/
-#include <linux/module.h>
#include <linux/slab.h>
#include <linux/cgroup.h>
#include <linux/fdtable.h>
diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c
index f1efbc39e..2ec86fc55 100644
--- a/net/core/netprio_cgroup.c
+++ b/net/core/netprio_cgroup.c
@@ -11,7 +11,6 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-#include <linux/module.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/string.h>
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 1474cfd2d..20999aa59 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -2856,7 +2856,7 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev,
*vlan_encapsulated_proto = htons(ETH_P_IP);
}
- skb_set_mac_header(skb, 0);
+ skb_reset_mac_header(skb);
skb_set_network_header(skb, skb->len);
iph = (struct iphdr *) skb_put(skb, sizeof(struct iphdr));
@@ -2983,7 +2983,7 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev,
*vlan_encapsulated_proto = htons(ETH_P_IPV6);
}
- skb_set_mac_header(skb, 0);
+ skb_reset_mac_header(skb);
skb_set_network_header(skb, skb->len);
iph = (struct ipv6hdr *) skb_put(skb, sizeof(struct ipv6hdr));
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 215e6137f..65763c29f 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -804,6 +804,8 @@ static void copy_rtnl_link_stats(struct rtnl_link_stats *a,
a->rx_compressed = b->rx_compressed;
a->tx_compressed = b->tx_compressed;
+
+ a->rx_nohandler = b->rx_nohandler;
}
static void copy_rtnl_link_stats64(void *v, const struct rtnl_link_stats64 *b)
@@ -893,6 +895,8 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev,
+ nla_total_size(4) /* IFLA_PROMISCUITY */
+ nla_total_size(4) /* IFLA_NUM_TX_QUEUES */
+ nla_total_size(4) /* IFLA_NUM_RX_QUEUES */
+ + nla_total_size(4) /* IFLA_MAX_GSO_SEGS */
+ + nla_total_size(4) /* IFLA_MAX_GSO_SIZE */
+ nla_total_size(1) /* IFLA_OPERSTATE */
+ nla_total_size(1) /* IFLA_LINKMODE */
+ nla_total_size(4) /* IFLA_CARRIER_CHANGES */
@@ -1176,14 +1180,16 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb,
static int rtnl_fill_link_ifmap(struct sk_buff *skb, struct net_device *dev)
{
- struct rtnl_link_ifmap map = {
- .mem_start = dev->mem_start,
- .mem_end = dev->mem_end,
- .base_addr = dev->base_addr,
- .irq = dev->irq,
- .dma = dev->dma,
- .port = dev->if_port,
- };
+ struct rtnl_link_ifmap map;
+
+ memset(&map, 0, sizeof(map));
+ map.mem_start = dev->mem_start;
+ map.mem_end = dev->mem_end;
+ map.base_addr = dev->base_addr;
+ map.irq = dev->irq;
+ map.dma = dev->dma;
+ map.port = dev->if_port;
+
if (nla_put(skb, IFLA_MAP, sizeof(map), &map))
return -EMSGSIZE;
@@ -1222,6 +1228,8 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
nla_put_u32(skb, IFLA_GROUP, dev->group) ||
nla_put_u32(skb, IFLA_PROMISCUITY, dev->promiscuity) ||
nla_put_u32(skb, IFLA_NUM_TX_QUEUES, dev->num_tx_queues) ||
+ nla_put_u32(skb, IFLA_GSO_MAX_SEGS, dev->gso_max_segs) ||
+ nla_put_u32(skb, IFLA_GSO_MAX_SIZE, dev->gso_max_size) ||
#ifdef CONFIG_RPS
nla_put_u32(skb, IFLA_NUM_RX_QUEUES, dev->num_rx_queues) ||
#endif
@@ -1388,15 +1396,8 @@ static const struct nla_policy ifla_vf_policy[IFLA_VF_MAX+1] = {
[IFLA_VF_RSS_QUERY_EN] = { .len = sizeof(struct ifla_vf_rss_query_en) },
[IFLA_VF_STATS] = { .type = NLA_NESTED },
[IFLA_VF_TRUST] = { .len = sizeof(struct ifla_vf_trust) },
-};
-
-static const struct nla_policy ifla_vf_stats_policy[IFLA_VF_STATS_MAX + 1] = {
- [IFLA_VF_STATS_RX_PACKETS] = { .type = NLA_U64 },
- [IFLA_VF_STATS_TX_PACKETS] = { .type = NLA_U64 },
- [IFLA_VF_STATS_RX_BYTES] = { .type = NLA_U64 },
- [IFLA_VF_STATS_TX_BYTES] = { .type = NLA_U64 },
- [IFLA_VF_STATS_BROADCAST] = { .type = NLA_U64 },
- [IFLA_VF_STATS_MULTICAST] = { .type = NLA_U64 },
+ [IFLA_VF_IB_NODE_GUID] = { .len = sizeof(struct ifla_vf_guid) },
+ [IFLA_VF_IB_PORT_GUID] = { .len = sizeof(struct ifla_vf_guid) },
};
static const struct nla_policy ifla_port_policy[IFLA_PORT_MAX+1] = {
@@ -1413,6 +1414,58 @@ static const struct nla_policy ifla_port_policy[IFLA_PORT_MAX+1] = {
[IFLA_PORT_RESPONSE] = { .type = NLA_U16, },
};
+/* Resolve the link ops named by the IFLA_INFO_KIND attribute nested in
+ * @nla.  Returns NULL when the nested attributes fail to parse or no
+ * IFLA_INFO_KIND is present; otherwise returns whatever
+ * rtnl_link_ops_get() yields for the copied kind string.
+ */
+static const struct rtnl_link_ops *linkinfo_to_kind_ops(const struct nlattr *nla)
+{
+	struct nlattr *attrs[IFLA_INFO_MAX + 1];
+	char name[MODULE_NAME_LEN];
+
+	if (nla_parse_nested(attrs, IFLA_INFO_MAX, nla, ifla_info_policy) < 0)
+		return NULL;
+
+	if (!attrs[IFLA_INFO_KIND])
+		return NULL;
+
+	nla_strlcpy(name, attrs[IFLA_INFO_KIND], sizeof(name));
+
+	return rtnl_link_ops_get(name);
+}
+
+/* Return true when @dev must be skipped by a dump filtered on master
+ * ifindex @master_idx.  A zero @master_idx means "no filter"; otherwise
+ * the device is skipped unless its master upper device has exactly
+ * that ifindex.
+ */
+static bool link_master_filtered(struct net_device *dev, int master_idx)
+{
+	struct net_device *upper;
+
+	if (!master_idx)
+		return false;
+
+	upper = netdev_master_upper_dev_get(dev);
+
+	return !upper || upper->ifindex != master_idx;
+}
+
+/* Return true when a kind filter is set (@kind_ops non-NULL) and @dev
+ * uses different link ops, i.e. the device must be skipped.
+ */
+static bool link_kind_filtered(const struct net_device *dev,
+			       const struct rtnl_link_ops *kind_ops)
+{
+	return kind_ops && dev->rtnl_link_ops != kind_ops;
+}
+
+/* Return true when @dev is excluded from a link dump by either the
+ * master filter or the kind filter.
+ */
+static bool link_dump_filtered(struct net_device *dev,
+			       int master_idx,
+			       const struct rtnl_link_ops *kind_ops)
+{
+	if (link_master_filtered(dev, master_idx))
+		return true;
+
+	return link_kind_filtered(dev, kind_ops);
+}
+
static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
{
struct net *net = sock_net(skb->sk);
@@ -1422,6 +1475,9 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
struct hlist_head *head;
struct nlattr *tb[IFLA_MAX+1];
u32 ext_filter_mask = 0;
+ const struct rtnl_link_ops *kind_ops = NULL;
+ unsigned int flags = NLM_F_MULTI;
+ int master_idx = 0;
int err;
int hdrlen;
@@ -1444,18 +1500,29 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
if (tb[IFLA_EXT_MASK])
ext_filter_mask = nla_get_u32(tb[IFLA_EXT_MASK]);
+
+ if (tb[IFLA_MASTER])
+ master_idx = nla_get_u32(tb[IFLA_MASTER]);
+
+ if (tb[IFLA_LINKINFO])
+ kind_ops = linkinfo_to_kind_ops(tb[IFLA_LINKINFO]);
+
+ if (master_idx || kind_ops)
+ flags |= NLM_F_DUMP_FILTERED;
}
for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) {
idx = 0;
head = &net->dev_index_head[h];
hlist_for_each_entry(dev, head, index_hlist) {
+ if (link_dump_filtered(dev, master_idx, kind_ops))
+ continue;
if (idx < s_idx)
goto cont;
err = rtnl_fill_ifinfo(skb, dev, RTM_NEWLINK,
NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq, 0,
- NLM_F_MULTI,
+ flags,
ext_filter_mask);
/* If we ran out of room on the first message,
* we're in trouble
@@ -1535,6 +1602,22 @@ static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[])
return 0;
}
+/* Forward a VF GUID request to the device's ndo_set_vf_guid() hook and
+ * return its result.  The hook is called unconditionally, so callers
+ * are expected to have checked that it is set.
+ */
+static int handle_infiniband_guid(struct net_device *dev, struct ifla_vf_guid *ivt,
+				  int guid_type)
+{
+	return dev->netdev_ops->ndo_set_vf_guid(dev, ivt->vf, ivt->guid,
+						guid_type);
+}
+
+/* Set a VF GUID of type @guid_type.  Only ARPHRD_INFINIBAND devices
+ * are supported; any other device type yields -EOPNOTSUPP.
+ */
+static int handle_vf_guid(struct net_device *dev, struct ifla_vf_guid *ivt, int guid_type)
+{
+	if (dev->type == ARPHRD_INFINIBAND)
+		return handle_infiniband_guid(dev, ivt, guid_type);
+
+	return -EOPNOTSUPP;
+}
+
static int do_setvfinfo(struct net_device *dev, struct nlattr **tb)
{
const struct net_device_ops *ops = dev->netdev_ops;
@@ -1637,6 +1720,24 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb)
return err;
}
+ if (tb[IFLA_VF_IB_NODE_GUID]) {
+ struct ifla_vf_guid *ivt = nla_data(tb[IFLA_VF_IB_NODE_GUID]);
+
+ if (!ops->ndo_set_vf_guid)
+ return -EOPNOTSUPP;
+
+ return handle_vf_guid(dev, ivt, IFLA_VF_IB_NODE_GUID);
+ }
+
+ if (tb[IFLA_VF_IB_PORT_GUID]) {
+ struct ifla_vf_guid *ivt = nla_data(tb[IFLA_VF_IB_PORT_GUID]);
+
+ if (!ops->ndo_set_vf_guid)
+ return -EOPNOTSUPP;
+
+ return handle_vf_guid(dev, ivt, IFLA_VF_IB_PORT_GUID);
+ }
+
return err;
}
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 8616d1147..e561f9f07 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -349,8 +349,16 @@ struct sk_buff *build_skb(void *data, unsigned int frag_size)
}
EXPORT_SYMBOL(build_skb);
+/* Number of slots in napi_alloc_cache.skb_cache[]; the deferred-free
+ * path flushes the array once this many skbs have been recorded.
+ */
+#define NAPI_SKB_CACHE_SIZE 64
+
+/* Per-CPU NAPI allocation state (instantiated via DEFINE_PER_CPU below):
+ * the page fragment cache used by the napi frag/skb allocators plus a
+ * small array of skbs queued for bulk freeing.
+ */
+struct napi_alloc_cache {
+ /* fragment cache passed to __alloc_page_frag() */
+ struct page_frag_cache page;
+ /* number of valid entries currently held in skb_cache[] */
+ size_t skb_count;
+ /* skbs waiting to be returned with kmem_cache_free_bulk() */
+ void *skb_cache[NAPI_SKB_CACHE_SIZE];
+};
+
static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache);
-static DEFINE_PER_CPU(struct page_frag_cache, napi_alloc_cache);
+static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache);
static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
{
@@ -380,9 +388,9 @@ EXPORT_SYMBOL(netdev_alloc_frag);
static void *__napi_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
{
- struct page_frag_cache *nc = this_cpu_ptr(&napi_alloc_cache);
+ struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
- return __alloc_page_frag(nc, fragsz, gfp_mask);
+ return __alloc_page_frag(&nc->page, fragsz, gfp_mask);
}
void *napi_alloc_frag(unsigned int fragsz)
@@ -476,7 +484,7 @@ EXPORT_SYMBOL(__netdev_alloc_skb);
struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
gfp_t gfp_mask)
{
- struct page_frag_cache *nc = this_cpu_ptr(&napi_alloc_cache);
+ struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
struct sk_buff *skb;
void *data;
@@ -496,7 +504,7 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
if (sk_memalloc_socks())
gfp_mask |= __GFP_MEMALLOC;
- data = __alloc_page_frag(nc, len, gfp_mask);
+ data = __alloc_page_frag(&nc->page, len, gfp_mask);
if (unlikely(!data))
return NULL;
@@ -507,7 +515,7 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
}
/* use OR instead of assignment to avoid clearing of bits in mask */
- if (nc->pfmemalloc)
+ if (nc->page.pfmemalloc)
skb->pfmemalloc = 1;
skb->head_frag = 1;
@@ -749,6 +757,73 @@ void consume_skb(struct sk_buff *skb)
}
EXPORT_SYMBOL(consume_skb);
+/* Bulk-free every skb currently queued in this CPU's napi_alloc_cache
+ * and reset the queue.  Does nothing when the queue is empty.
+ */
+void __kfree_skb_flush(void)
+{
+	struct napi_alloc_cache *cache = this_cpu_ptr(&napi_alloc_cache);
+
+	/* nothing queued, nothing to flush */
+	if (!cache->skb_count)
+		return;
+
+	kmem_cache_free_bulk(skbuff_head_cache, cache->skb_count,
+			     cache->skb_cache);
+	cache->skb_count = 0;
+}
+
+/* Queue @skb on this CPU's napi_alloc_cache for a later bulk free.
+ * The skb's resources are released immediately; only the sk_buff
+ * object itself is deferred.  When the queue reaches
+ * NAPI_SKB_CACHE_SIZE entries it is flushed on the spot.
+ */
+static inline void _kfree_skb_defer(struct sk_buff *skb)
+{
+ struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
+
+ /* drop skb->head and call any destructors for packet */
+ skb_release_all(skb);
+
+ /* record skb to CPU local list */
+ nc->skb_cache[nc->skb_count++] = skb;
+
+#ifdef CONFIG_SLUB
+ /* SLUB writes into objects when freeing */
+ prefetchw(skb);
+#endif
+
+ /* flush skb_cache if it is filled */
+ if (unlikely(nc->skb_count == NAPI_SKB_CACHE_SIZE)) {
+ kmem_cache_free_bulk(skbuff_head_cache, NAPI_SKB_CACHE_SIZE,
+ nc->skb_cache);
+ nc->skb_count = 0;
+ }
+}
+/* Non-inline entry point that simply forwards to _kfree_skb_defer(). */
+void __kfree_skb_defer(struct sk_buff *skb)
+{
+ _kfree_skb_defer(skb);
+}
+
+/* Consume @skb, deferring the free of the sk_buff object to this CPU's
+ * bulk-free queue when possible.
+ *
+ * @skb may be NULL, in which case nothing happens.  A zero @budget
+ * makes the function fall back to dev_consume_skb_any().  If other
+ * references to the skb remain after dropping ours, nothing is freed.
+ */
+void napi_consume_skb(struct sk_buff *skb, int budget)
+{
+ if (unlikely(!skb))
+ return;
+
+ /* Zero budget indicates a non-NAPI context called us, like netpoll */
+ if (unlikely(!budget)) {
+ dev_consume_skb_any(skb);
+ return;
+ }
+
+ /* Sole reference: skip the atomic decrement and only issue a read
+ * barrier.  Otherwise drop our reference and bail out unless it
+ * was the last one.
+ */
+ if (likely(atomic_read(&skb->users) == 1))
+ smp_rmb();
+ else if (likely(!atomic_dec_and_test(&skb->users)))
+ return;
+ /* if reaching here SKB is ready to free */
+ trace_consume_skb(skb);
+
+ /* skbs whose fclone state is not SKB_FCLONE_UNAVAILABLE are not
+ * batched; free them directly through __kfree_skb()
+ */
+ if (skb->fclone != SKB_FCLONE_UNAVAILABLE) {
+ __kfree_skb(skb);
+ return;
+ }
+
+ _kfree_skb_defer(skb);
+}
+EXPORT_SYMBOL(napi_consume_skb);
+
/* Make sure a field is enclosed inside headers_start/headers_end section */
#define CHECK_SKB_FIELD(field) \
BUILD_BUG_ON(offsetof(struct sk_buff, field) < \
@@ -1843,6 +1918,7 @@ static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe,
struct splice_pipe_desc *spd, struct sock *sk)
{
int seg;
+ struct sk_buff *iter;
/* map the linear part :
* If skb->head_frag is set, this 'linear' part is backed by a
@@ -1869,6 +1945,19 @@ static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe,
return true;
}
+ skb_walk_frags(skb, iter) {
+ if (*offset >= iter->len) {
+ *offset -= iter->len;
+ continue;
+ }
+ /* __skb_splice_bits() only fails if the output has no room
+ * left, so no point in going over the frag_list for the error
+ * case.
+ */
+ if (__skb_splice_bits(iter, pipe, offset, len, spd, sk))
+ return true;
+ }
+
return false;
}
@@ -1895,9 +1984,7 @@ ssize_t skb_socket_splice(struct sock *sk,
/*
* Map data from the skb to a pipe. Should handle both the linear part,
- * the fragments, and the frag list. It does NOT handle frag lists within
- * the frag list, if such a thing exists. We'd probably need to recurse to
- * handle that cleanly.
+ * the fragments, and the frag list.
*/
int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset,
struct pipe_inode_info *pipe, unsigned int tlen,
@@ -1916,29 +2003,10 @@ int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset,
.ops = &nosteal_pipe_buf_ops,
.spd_release = sock_spd_release,
};
- struct sk_buff *frag_iter;
int ret = 0;
- /*
- * __skb_splice_bits() only fails if the output has no room left,
- * so no point in going over the frag_list for the error case.
- */
- if (__skb_splice_bits(skb, pipe, &offset, &tlen, &spd, sk))
- goto done;
- else if (!tlen)
- goto done;
-
- /*
- * now see if we have a frag_list to map
- */
- skb_walk_frags(skb, frag_iter) {
- if (!tlen)
- break;
- if (__skb_splice_bits(frag_iter, pipe, &offset, &tlen, &spd, sk))
- break;
- }
+ __skb_splice_bits(skb, pipe, &offset, &tlen, &spd, sk);
-done:
if (spd.nr_pages)
ret = splice_cb(sk, pipe, &spd);
@@ -3024,8 +3092,7 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb,
if (unlikely(!proto))
return ERR_PTR(-EINVAL);
- csum = !head_skb->encap_hdr_csum &&
- !!can_checksum_protocol(features, proto);
+ csum = !!can_checksum_protocol(features, proto);
headroom = skb_headroom(head_skb);
pos = skb_headlen(head_skb);
@@ -3118,13 +3185,15 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb,
if (nskb->len == len + doffset)
goto perform_csum_check;
- if (!sg && !nskb->remcsum_offload) {
- nskb->ip_summed = CHECKSUM_NONE;
- nskb->csum = skb_copy_and_csum_bits(head_skb, offset,
- skb_put(nskb, len),
- len, 0);
+ if (!sg) {
+ if (!nskb->remcsum_offload)
+ nskb->ip_summed = CHECKSUM_NONE;
+ SKB_GSO_CB(nskb)->csum =
+ skb_copy_and_csum_bits(head_skb, offset,
+ skb_put(nskb, len),
+ len, 0);
SKB_GSO_CB(nskb)->csum_start =
- skb_headroom(nskb) + doffset;
+ skb_headroom(nskb) + doffset;
continue;
}
@@ -3190,12 +3259,19 @@ skip_fraglist:
nskb->truesize += nskb->data_len;
perform_csum_check:
- if (!csum && !nskb->remcsum_offload) {
- nskb->csum = skb_checksum(nskb, doffset,
- nskb->len - doffset, 0);
- nskb->ip_summed = CHECKSUM_NONE;
+ if (!csum) {
+ if (skb_has_shared_frag(nskb)) {
+ err = __skb_linearize(nskb);
+ if (err)
+ goto err;
+ }
+ if (!nskb->remcsum_offload)
+ nskb->ip_summed = CHECKSUM_NONE;
+ SKB_GSO_CB(nskb)->csum =
+ skb_checksum(nskb, doffset,
+ nskb->len - doffset, 0);
SKB_GSO_CB(nskb)->csum_start =
- skb_headroom(nskb) + doffset;
+ skb_headroom(nskb) + doffset;
}
} while ((offset += len) < head_skb->len);
@@ -4237,7 +4313,6 @@ void skb_scrub_packet(struct sk_buff *skb, bool xnet)
skb->skb_iif = 0;
skb->ignore_df = 0;
skb_dst_drop(skb);
- skb_sender_cpu_clear(skb);
secpath_reset(skb);
nf_reset(skb);
nf_reset_trace(skb);
@@ -4427,15 +4502,16 @@ int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci)
__skb_push(skb, offset);
err = __vlan_insert_tag(skb, skb->vlan_proto,
skb_vlan_tag_get(skb));
- if (err)
+ if (err) {
+ __skb_pull(skb, offset);
return err;
+ }
+
skb->protocol = skb->vlan_proto;
skb->mac_len += VLAN_HLEN;
- __skb_pull(skb, offset);
- if (skb->ip_summed == CHECKSUM_COMPLETE)
- skb->csum = csum_add(skb->csum, csum_partial(skb->data
- + (2 * ETH_ALEN), VLAN_HLEN, 0));
+ skb_postpush_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN);
+ __skb_pull(skb, offset);
}
__vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci);
return 0;
diff --git a/net/core/sock.c b/net/core/sock.c
index 6c1c8bc93..7e73c26b6 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -221,7 +221,8 @@ static const char *const af_family_key_strings[AF_MAX+1] = {
"sk_lock-AF_TIPC" , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV" ,
"sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN" , "sk_lock-AF_PHONET" ,
"sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG" ,
- "sk_lock-AF_NFC" , "sk_lock-AF_VSOCK" , "sk_lock-AF_MAX"
+ "sk_lock-AF_NFC" , "sk_lock-AF_VSOCK" , "sk_lock-AF_KCM" ,
+ "sk_lock-AF_MAX"
};
static const char *const af_family_slock_key_strings[AF_MAX+1] = {
"slock-AF_UNSPEC", "slock-AF_UNIX" , "slock-AF_INET" ,
@@ -237,7 +238,8 @@ static const char *const af_family_slock_key_strings[AF_MAX+1] = {
"slock-AF_TIPC" , "slock-AF_BLUETOOTH", "slock-AF_IUCV" ,
"slock-AF_RXRPC" , "slock-AF_ISDN" , "slock-AF_PHONET" ,
"slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG" ,
- "slock-AF_NFC" , "slock-AF_VSOCK" ,"slock-AF_MAX"
+ "slock-AF_NFC" , "slock-AF_VSOCK" ,"slock-AF_KCM" ,
+ "slock-AF_MAX"
};
static const char *const af_family_clock_key_strings[AF_MAX+1] = {
"clock-AF_UNSPEC", "clock-AF_UNIX" , "clock-AF_INET" ,
@@ -253,7 +255,8 @@ static const char *const af_family_clock_key_strings[AF_MAX+1] = {
"clock-AF_TIPC" , "clock-AF_BLUETOOTH", "clock-AF_IUCV" ,
"clock-AF_RXRPC" , "clock-AF_ISDN" , "clock-AF_PHONET" ,
"clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG" ,
- "clock-AF_NFC" , "clock-AF_VSOCK" , "clock-AF_MAX"
+ "clock-AF_NFC" , "clock-AF_VSOCK" , "clock-AF_KCM" ,
+ "clock-AF_MAX"
};
/*
@@ -987,6 +990,10 @@ set_rcvbuf:
sk->sk_incoming_cpu = val;
break;
+ case SO_CNX_ADVICE:
+ if (val == 1)
+ dst_negative_advice(sk);
+ break;
default:
ret = -ENOPROTOOPT;
break;
@@ -1531,6 +1538,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
newsk = NULL;
goto out;
}
+ RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
newsk->sk_err = 0;
newsk->sk_priority = 0;
@@ -1903,7 +1911,7 @@ EXPORT_SYMBOL(sock_cmsg_send);
bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
{
if (pfrag->page) {
- if (atomic_read(&pfrag->page->_count) == 1) {
+ if (page_ref_count(pfrag->page) == 1) {
pfrag->offset = 0;
return true;
}