summaryrefslogtreecommitdiff
path: root/net/core
diff options
context:
space:
mode:
authorAndré Fabian Silva Delgado <emulatorman@parabola.nu>2016-03-25 03:53:42 -0300
committerAndré Fabian Silva Delgado <emulatorman@parabola.nu>2016-03-25 03:53:42 -0300
commit03dd4cb26d967f9588437b0fc9cc0e8353322bb7 (patch)
treefa581f6dc1c0596391690d1f67eceef3af8246dc /net/core
parentd4e493caf788ef44982e131ff9c786546904d934 (diff)
Linux-libre 4.5-gnu
Diffstat (limited to 'net/core')
-rw-r--r--net/core/Makefile2
-rw-r--r--net/core/datagram.c77
-rw-r--r--net/core/dev.c418
-rw-r--r--net/core/ethtool.c85
-rw-r--r--net/core/filter.c298
-rw-r--r--net/core/net-sysfs.c5
-rw-r--r--net/core/net-traces.c4
-rw-r--r--net/core/netclassid_cgroup.c11
-rw-r--r--net/core/netprio_cgroup.c19
-rw-r--r--net/core/pktgen.c4
-rw-r--r--net/core/rtnetlink.c27
-rw-r--r--net/core/scm.c4
-rw-r--r--net/core/skbuff.c22
-rw-r--r--net/core/sock.c132
-rw-r--r--net/core/sock_diag.c23
-rw-r--r--net/core/sock_reuseport.c258
-rw-r--r--net/core/stream.c2
17 files changed, 1141 insertions, 250 deletions
diff --git a/net/core/Makefile b/net/core/Makefile
index 086b01fbe..0b835de04 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -9,7 +9,7 @@ obj-$(CONFIG_SYSCTL) += sysctl_net_core.o
obj-y += dev.o ethtool.o dev_addr_lists.o dst.o netevent.o \
neighbour.o rtnetlink.o utils.o link_watch.o filter.o \
- sock_diag.o dev_ioctl.o tso.o
+ sock_diag.o dev_ioctl.o tso.o sock_reuseport.o
obj-$(CONFIG_XFRM) += flow.o
obj-y += net-sysfs.o
diff --git a/net/core/datagram.c b/net/core/datagram.c
index d62af69ad..fa9dc6450 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -83,8 +83,8 @@ static int receiver_wake_function(wait_queue_t *wait, unsigned int mode, int syn
/*
* Wait for the last received packet to be different from skb
*/
-static int wait_for_more_packets(struct sock *sk, int *err, long *timeo_p,
- const struct sk_buff *skb)
+int __skb_wait_for_more_packets(struct sock *sk, int *err, long *timeo_p,
+ const struct sk_buff *skb)
{
int error;
DEFINE_WAIT_FUNC(wait, receiver_wake_function);
@@ -130,6 +130,7 @@ out_noerr:
error = 1;
goto out;
}
+EXPORT_SYMBOL(__skb_wait_for_more_packets);
static struct sk_buff *skb_set_peeked(struct sk_buff *skb)
{
@@ -161,13 +162,15 @@ done:
}
/**
- * __skb_recv_datagram - Receive a datagram skbuff
+ * __skb_try_recv_datagram - Receive a datagram skbuff
* @sk: socket
* @flags: MSG_ flags
* @peeked: returns non-zero if this packet has been seen before
* @off: an offset in bytes to peek skb from. Returns an offset
* within an skb where data actually starts
* @err: error code returned
+ * @last: set to last peeked message to inform the wait function
+ * what to look for when peeking
*
* Get a datagram skbuff, understands the peeking, nonblocking wakeups
* and possible races. This replaces identical code in packet, raw and
@@ -175,9 +178,11 @@ done:
* the long standing peek and read race for datagram sockets. If you
* alter this routine remember it must be re-entrant.
*
- * This function will lock the socket if a skb is returned, so the caller
- * needs to unlock the socket in that case (usually by calling
- * skb_free_datagram)
+ * This function will lock the socket if a skb is returned, so
+ * the caller needs to unlock the socket in that case (usually by
+ * calling skb_free_datagram). Returns NULL with *err set to
+ * -EAGAIN if no data was available or to some other value if an
+ * error was detected.
*
* * It does not lock socket since today. This function is
* * free of race conditions. This measure should/can improve
@@ -191,13 +196,13 @@ done:
* quite explicitly by POSIX 1003.1g, don't change them without having
* the standard around please.
*/
-struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags,
- int *peeked, int *off, int *err)
+struct sk_buff *__skb_try_recv_datagram(struct sock *sk, unsigned int flags,
+ int *peeked, int *off, int *err,
+ struct sk_buff **last)
{
struct sk_buff_head *queue = &sk->sk_receive_queue;
- struct sk_buff *skb, *last;
+ struct sk_buff *skb;
unsigned long cpu_flags;
- long timeo;
/*
* Caller is allowed not to check sk->sk_err before skb_recv_datagram()
*/
@@ -206,8 +211,6 @@ struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags,
if (error)
goto no_packet;
- timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
-
do {
/* Again only user level code calls this function, so nothing
* interrupt level will suddenly eat the receive_queue.
@@ -217,10 +220,10 @@ struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags,
*/
int _off = *off;
- last = (struct sk_buff *)queue;
+ *last = (struct sk_buff *)queue;
spin_lock_irqsave(&queue->lock, cpu_flags);
skb_queue_walk(queue, skb) {
- last = skb;
+ *last = skb;
*peeked = skb->peeked;
if (flags & MSG_PEEK) {
if (_off >= skb->len && (skb->len || _off ||
@@ -231,8 +234,11 @@ struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags,
skb = skb_set_peeked(skb);
error = PTR_ERR(skb);
- if (IS_ERR(skb))
- goto unlock_err;
+ if (IS_ERR(skb)) {
+ spin_unlock_irqrestore(&queue->lock,
+ cpu_flags);
+ goto no_packet;
+ }
atomic_inc(&skb->users);
} else
@@ -242,25 +248,38 @@ struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags,
*off = _off;
return skb;
}
+
spin_unlock_irqrestore(&queue->lock, cpu_flags);
+ } while (sk_can_busy_loop(sk) &&
+ sk_busy_loop(sk, flags & MSG_DONTWAIT));
- if (sk_can_busy_loop(sk) &&
- sk_busy_loop(sk, flags & MSG_DONTWAIT))
- continue;
+ error = -EAGAIN;
- /* User doesn't want to wait */
- error = -EAGAIN;
- if (!timeo)
- goto no_packet;
+no_packet:
+ *err = error;
+ return NULL;
+}
+EXPORT_SYMBOL(__skb_try_recv_datagram);
- } while (!wait_for_more_packets(sk, err, &timeo, last));
+struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags,
+ int *peeked, int *off, int *err)
+{
+ struct sk_buff *skb, *last;
+ long timeo;
- return NULL;
+ timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
+
+ do {
+ skb = __skb_try_recv_datagram(sk, flags, peeked, off, err,
+ &last);
+ if (skb)
+ return skb;
+
+ if (*err != -EAGAIN)
+ break;
+ } while (timeo &&
+ !__skb_wait_for_more_packets(sk, err, &timeo, last));
-unlock_err:
- spin_unlock_irqrestore(&queue->lock, cpu_flags);
-no_packet:
- *err = error;
return NULL;
}
EXPORT_SYMBOL(__skb_recv_datagram);
diff --git a/net/core/dev.c b/net/core/dev.c
index 9efbdb3ff..0ef061b2b 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -96,6 +96,7 @@
#include <linux/skbuff.h>
#include <net/net_namespace.h>
#include <net/sock.h>
+#include <net/busy_poll.h>
#include <linux/rtnetlink.h>
#include <linux/stat.h>
#include <net/dst.h>
@@ -137,6 +138,7 @@
#include <linux/errqueue.h>
#include <linux/hrtimer.h>
#include <linux/netfilter_ingress.h>
+#include <linux/sctp.h>
#include "net-sysfs.h"
@@ -182,8 +184,8 @@ EXPORT_SYMBOL(dev_base_lock);
/* protects napi_hash addition/deletion and napi_gen_id */
static DEFINE_SPINLOCK(napi_hash_lock);
-static unsigned int napi_gen_id;
-static DEFINE_HASHTABLE(napi_hash, 8);
+static unsigned int napi_gen_id = NR_CPUS;
+static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);
static seqcount_t devnet_rename_seq;
@@ -1674,6 +1676,22 @@ void net_dec_ingress_queue(void)
EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
#endif
+#ifdef CONFIG_NET_EGRESS
+static struct static_key egress_needed __read_mostly;
+
+void net_inc_egress_queue(void)
+{
+ static_key_slow_inc(&egress_needed);
+}
+EXPORT_SYMBOL_GPL(net_inc_egress_queue);
+
+void net_dec_egress_queue(void)
+{
+ static_key_slow_dec(&egress_needed);
+}
+EXPORT_SYMBOL_GPL(net_dec_egress_queue);
+#endif
+
static struct static_key netstamp_needed __read_mostly;
#ifdef HAVE_JUMP_LABEL
/* We are not allowed to call static_key_slow_dec() from irq context
@@ -2470,6 +2488,141 @@ out:
}
EXPORT_SYMBOL(skb_checksum_help);
+/* skb_csum_offload_check - Driver helper function to determine if a device
+ * with limited checksum offload capabilities is able to offload the checksum
+ * for a given packet.
+ *
+ * Arguments:
+ * skb - sk_buff for the packet in question
+ * spec - contains the description of what device can offload
+ * csum_encapped - returns true if the checksum being offloaded is
+ * encpasulated. That is it is checksum for the transport header
+ * in the inner headers.
+ * checksum_help - when set indicates that helper function should
+ * call skb_checksum_help if offload checks fail
+ *
+ * Returns:
+ * true: Packet has passed the checksum checks and should be offloadable to
+ * the device (a driver may still need to check for additional
+ * restrictions of its device)
+ * false: Checksum is not offloadable. If checksum_help was set then
+ * skb_checksum_help was called to resolve checksum for non-GSO
+ * packets and when IP protocol is not SCTP
+ */
+bool __skb_csum_offload_chk(struct sk_buff *skb,
+ const struct skb_csum_offl_spec *spec,
+ bool *csum_encapped,
+ bool csum_help)
+{
+ struct iphdr *iph;
+ struct ipv6hdr *ipv6;
+ void *nhdr;
+ int protocol;
+ u8 ip_proto;
+
+ if (skb->protocol == htons(ETH_P_8021Q) ||
+ skb->protocol == htons(ETH_P_8021AD)) {
+ if (!spec->vlan_okay)
+ goto need_help;
+ }
+
+ /* We check whether the checksum refers to a transport layer checksum in
+ * the outermost header or an encapsulated transport layer checksum that
+ * corresponds to the inner headers of the skb. If the checksum is for
+ * something else in the packet we need help.
+ */
+ if (skb_checksum_start_offset(skb) == skb_transport_offset(skb)) {
+ /* Non-encapsulated checksum */
+ protocol = eproto_to_ipproto(vlan_get_protocol(skb));
+ nhdr = skb_network_header(skb);
+ *csum_encapped = false;
+ if (spec->no_not_encapped)
+ goto need_help;
+ } else if (skb->encapsulation && spec->encap_okay &&
+ skb_checksum_start_offset(skb) ==
+ skb_inner_transport_offset(skb)) {
+ /* Encapsulated checksum */
+ *csum_encapped = true;
+ switch (skb->inner_protocol_type) {
+ case ENCAP_TYPE_ETHER:
+ protocol = eproto_to_ipproto(skb->inner_protocol);
+ break;
+ case ENCAP_TYPE_IPPROTO:
+ protocol = skb->inner_protocol;
+ break;
+ }
+ nhdr = skb_inner_network_header(skb);
+ } else {
+ goto need_help;
+ }
+
+ switch (protocol) {
+ case IPPROTO_IP:
+ if (!spec->ipv4_okay)
+ goto need_help;
+ iph = nhdr;
+ ip_proto = iph->protocol;
+ if (iph->ihl != 5 && !spec->ip_options_okay)
+ goto need_help;
+ break;
+ case IPPROTO_IPV6:
+ if (!spec->ipv6_okay)
+ goto need_help;
+ if (spec->no_encapped_ipv6 && *csum_encapped)
+ goto need_help;
+ ipv6 = nhdr;
+ nhdr += sizeof(*ipv6);
+ ip_proto = ipv6->nexthdr;
+ break;
+ default:
+ goto need_help;
+ }
+
+ip_proto_again:
+ switch (ip_proto) {
+ case IPPROTO_TCP:
+ if (!spec->tcp_okay ||
+ skb->csum_offset != offsetof(struct tcphdr, check))
+ goto need_help;
+ break;
+ case IPPROTO_UDP:
+ if (!spec->udp_okay ||
+ skb->csum_offset != offsetof(struct udphdr, check))
+ goto need_help;
+ break;
+ case IPPROTO_SCTP:
+ if (!spec->sctp_okay ||
+ skb->csum_offset != offsetof(struct sctphdr, checksum))
+ goto cant_help;
+ break;
+ case NEXTHDR_HOP:
+ case NEXTHDR_ROUTING:
+ case NEXTHDR_DEST: {
+ u8 *opthdr = nhdr;
+
+ if (protocol != IPPROTO_IPV6 || !spec->ext_hdrs_okay)
+ goto need_help;
+
+ ip_proto = opthdr[0];
+ nhdr += (opthdr[1] + 1) << 3;
+
+ goto ip_proto_again;
+ }
+ default:
+ goto need_help;
+ }
+
+ /* Passed the tests for offloading checksum */
+ return true;
+
+need_help:
+ if (csum_help && !skb_shinfo(skb)->gso_size)
+ skb_checksum_help(skb);
+cant_help:
+ return false;
+}
+EXPORT_SYMBOL(__skb_csum_offload_chk);
+
__be16 skb_network_protocol(struct sk_buff *skb, int *depth)
{
__be16 type = skb->protocol;
@@ -2649,7 +2802,7 @@ static netdev_features_t harmonize_features(struct sk_buff *skb,
if (skb->ip_summed != CHECKSUM_NONE &&
!can_checksum_protocol(features, type)) {
- features &= ~NETIF_F_ALL_CSUM;
+ features &= ~NETIF_F_CSUM_MASK;
} else if (illegal_highdma(skb->dev, skb)) {
features &= ~NETIF_F_SG;
}
@@ -2796,7 +2949,7 @@ static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device
else
skb_set_transport_header(skb,
skb_checksum_start_offset(skb));
- if (!(features & NETIF_F_ALL_CSUM) &&
+ if (!(features & NETIF_F_CSUM_MASK) &&
skb_checksum_help(skb))
goto out_kfree_skb;
}
@@ -2875,7 +3028,6 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
bool contended;
int rc;
- qdisc_pkt_len_init(skb);
qdisc_calculate_pkt_len(skb, q);
/*
* Heuristic to force contended enqueues to serialize on a
@@ -2933,7 +3085,8 @@ static void skb_update_prio(struct sk_buff *skb)
struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
if (!skb->priority && skb->sk && map) {
- unsigned int prioidx = skb->sk->sk_cgrp_prioidx;
+ unsigned int prioidx =
+ sock_cgroup_prioidx(&skb->sk->sk_cgrp_data);
if (prioidx < map->priomap_len)
skb->priority = map->priomap[prioidx];
@@ -2967,6 +3120,49 @@ int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
}
EXPORT_SYMBOL(dev_loopback_xmit);
+#ifdef CONFIG_NET_EGRESS
+static struct sk_buff *
+sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
+{
+ struct tcf_proto *cl = rcu_dereference_bh(dev->egress_cl_list);
+ struct tcf_result cl_res;
+
+ if (!cl)
+ return skb;
+
+ /* skb->tc_verd and qdisc_skb_cb(skb)->pkt_len were already set
+ * earlier by the caller.
+ */
+ qdisc_bstats_cpu_update(cl->q, skb);
+
+ switch (tc_classify(skb, cl, &cl_res, false)) {
+ case TC_ACT_OK:
+ case TC_ACT_RECLASSIFY:
+ skb->tc_index = TC_H_MIN(cl_res.classid);
+ break;
+ case TC_ACT_SHOT:
+ qdisc_qstats_cpu_drop(cl->q);
+ *ret = NET_XMIT_DROP;
+ goto drop;
+ case TC_ACT_STOLEN:
+ case TC_ACT_QUEUED:
+ *ret = NET_XMIT_SUCCESS;
+drop:
+ kfree_skb(skb);
+ return NULL;
+ case TC_ACT_REDIRECT:
+ /* No need to push/pop skb's mac_header here on egress! */
+ skb_do_redirect(skb);
+ *ret = NET_XMIT_SUCCESS;
+ return NULL;
+ default:
+ break;
+ }
+
+ return skb;
+}
+#endif /* CONFIG_NET_EGRESS */
+
static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_XPS
@@ -3026,7 +3222,9 @@ struct netdev_queue *netdev_pick_tx(struct net_device *dev,
int queue_index = 0;
#ifdef CONFIG_XPS
- if (skb->sender_cpu == 0)
+ u32 sender_cpu = skb->sender_cpu - 1;
+
+ if (sender_cpu >= (u32)NR_CPUS)
skb->sender_cpu = raw_smp_processor_id() + 1;
#endif
@@ -3091,6 +3289,17 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
skb_update_prio(skb);
+ qdisc_pkt_len_init(skb);
+#ifdef CONFIG_NET_CLS_ACT
+ skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
+# ifdef CONFIG_NET_EGRESS
+ if (static_key_false(&egress_needed)) {
+ skb = sch_handle_egress(skb, &rc, dev);
+ if (!skb)
+ goto out;
+ }
+# endif
+#endif
/* If device/qdisc don't need skb->dst, release it right now while
* its hot in this cpu cache.
*/
@@ -3112,9 +3321,6 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
txq = netdev_pick_tx(dev, skb, accel_priv);
q = rcu_dereference_bh(txq->qdisc);
-#ifdef CONFIG_NET_CLS_ACT
- skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
-#endif
trace_net_dev_queue(skb);
if (q->enqueue) {
rc = __dev_xmit_skb(skb, q, dev, txq);
@@ -3671,9 +3877,9 @@ int (*br_fdb_test_addr_hook)(struct net_device *dev,
EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
#endif
-static inline struct sk_buff *handle_ing(struct sk_buff *skb,
- struct packet_type **pt_prev,
- int *ret, struct net_device *orig_dev)
+static inline struct sk_buff *
+sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
+ struct net_device *orig_dev)
{
#ifdef CONFIG_NET_CLS_ACT
struct tcf_proto *cl = rcu_dereference_bh(skb->dev->ingress_cl_list);
@@ -3867,7 +4073,7 @@ another_round:
skip_taps:
#ifdef CONFIG_NET_INGRESS
if (static_key_false(&ingress_needed)) {
- skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
+ skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev);
if (!skb)
goto out;
@@ -4361,6 +4567,7 @@ static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
{
+ skb_mark_napi_id(skb, napi);
trace_napi_gro_receive_entry(skb);
skb_gro_reset_offset(skb);
@@ -4394,7 +4601,10 @@ struct sk_buff *napi_get_frags(struct napi_struct *napi)
if (!skb) {
skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
- napi->skb = skb;
+ if (skb) {
+ napi->skb = skb;
+ skb_mark_napi_id(skb, napi);
+ }
}
return skb;
}
@@ -4669,7 +4879,7 @@ void napi_complete_done(struct napi_struct *n, int work_done)
EXPORT_SYMBOL(napi_complete_done);
/* must be called under rcu_read_lock(), as we dont take a reference */
-struct napi_struct *napi_by_id(unsigned int napi_id)
+static struct napi_struct *napi_by_id(unsigned int napi_id)
{
unsigned int hash = napi_id % HASH_SIZE(napi_hash);
struct napi_struct *napi;
@@ -4680,43 +4890,101 @@ struct napi_struct *napi_by_id(unsigned int napi_id)
return NULL;
}
-EXPORT_SYMBOL_GPL(napi_by_id);
-void napi_hash_add(struct napi_struct *napi)
+#if defined(CONFIG_NET_RX_BUSY_POLL)
+#define BUSY_POLL_BUDGET 8
+bool sk_busy_loop(struct sock *sk, int nonblock)
{
- if (!test_and_set_bit(NAPI_STATE_HASHED, &napi->state)) {
+ unsigned long end_time = !nonblock ? sk_busy_loop_end_time(sk) : 0;
+ int (*busy_poll)(struct napi_struct *dev);
+ struct napi_struct *napi;
+ int rc = false;
- spin_lock(&napi_hash_lock);
+ rcu_read_lock();
- /* 0 is not a valid id, we also skip an id that is taken
- * we expect both events to be extremely rare
- */
- napi->napi_id = 0;
- while (!napi->napi_id) {
- napi->napi_id = ++napi_gen_id;
- if (napi_by_id(napi->napi_id))
- napi->napi_id = 0;
+ napi = napi_by_id(sk->sk_napi_id);
+ if (!napi)
+ goto out;
+
+ /* Note: ndo_busy_poll method is optional in linux-4.5 */
+ busy_poll = napi->dev->netdev_ops->ndo_busy_poll;
+
+ do {
+ rc = 0;
+ local_bh_disable();
+ if (busy_poll) {
+ rc = busy_poll(napi);
+ } else if (napi_schedule_prep(napi)) {
+ void *have = netpoll_poll_lock(napi);
+
+ if (test_bit(NAPI_STATE_SCHED, &napi->state)) {
+ rc = napi->poll(napi, BUSY_POLL_BUDGET);
+ trace_napi_poll(napi);
+ if (rc == BUSY_POLL_BUDGET) {
+ napi_complete_done(napi, rc);
+ napi_schedule(napi);
+ }
+ }
+ netpoll_poll_unlock(have);
}
+ if (rc > 0)
+ NET_ADD_STATS_BH(sock_net(sk),
+ LINUX_MIB_BUSYPOLLRXPACKETS, rc);
+ local_bh_enable();
- hlist_add_head_rcu(&napi->napi_hash_node,
- &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
+ if (rc == LL_FLUSH_FAILED)
+ break; /* permanent failure */
- spin_unlock(&napi_hash_lock);
- }
+ cpu_relax();
+ } while (!nonblock && skb_queue_empty(&sk->sk_receive_queue) &&
+ !need_resched() && !busy_loop_timeout(end_time));
+
+ rc = !skb_queue_empty(&sk->sk_receive_queue);
+out:
+ rcu_read_unlock();
+ return rc;
+}
+EXPORT_SYMBOL(sk_busy_loop);
+
+#endif /* CONFIG_NET_RX_BUSY_POLL */
+
+void napi_hash_add(struct napi_struct *napi)
+{
+ if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state) ||
+ test_and_set_bit(NAPI_STATE_HASHED, &napi->state))
+ return;
+
+ spin_lock(&napi_hash_lock);
+
+ /* 0..NR_CPUS+1 range is reserved for sender_cpu use */
+ do {
+ if (unlikely(++napi_gen_id < NR_CPUS + 1))
+ napi_gen_id = NR_CPUS + 1;
+ } while (napi_by_id(napi_gen_id));
+ napi->napi_id = napi_gen_id;
+
+ hlist_add_head_rcu(&napi->napi_hash_node,
+ &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
+
+ spin_unlock(&napi_hash_lock);
}
EXPORT_SYMBOL_GPL(napi_hash_add);
/* Warning : caller is responsible to make sure rcu grace period
* is respected before freeing memory containing @napi
*/
-void napi_hash_del(struct napi_struct *napi)
+bool napi_hash_del(struct napi_struct *napi)
{
+ bool rcu_sync_needed = false;
+
spin_lock(&napi_hash_lock);
- if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state))
+ if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state)) {
+ rcu_sync_needed = true;
hlist_del_rcu(&napi->napi_hash_node);
-
+ }
spin_unlock(&napi_hash_lock);
+ return rcu_sync_needed;
}
EXPORT_SYMBOL_GPL(napi_hash_del);
@@ -4752,6 +5020,7 @@ void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
napi->poll_owner = -1;
#endif
set_bit(NAPI_STATE_SCHED, &napi->state);
+ napi_hash_add(napi);
}
EXPORT_SYMBOL(netif_napi_add);
@@ -4771,8 +5040,12 @@ void napi_disable(struct napi_struct *n)
}
EXPORT_SYMBOL(napi_disable);
+/* Must be called in process context */
void netif_napi_del(struct napi_struct *napi)
{
+ might_sleep();
+ if (napi_hash_del(napi))
+ synchronize_net();
list_del_init(&napi->dev_list);
napi_free_frags(napi);
@@ -5106,12 +5379,12 @@ void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
{
struct netdev_adjacent *lower;
- lower = list_entry((*iter)->next, struct netdev_adjacent, list);
+ lower = list_entry(*iter, struct netdev_adjacent, list);
if (&lower->list == &dev->adj_list.lower)
return NULL;
- *iter = &lower->list;
+ *iter = lower->list.next;
return lower->dev;
}
@@ -5359,7 +5632,7 @@ static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
static int __netdev_upper_dev_link(struct net_device *dev,
struct net_device *upper_dev, bool master,
- void *private)
+ void *upper_priv, void *upper_info)
{
struct netdev_notifier_changeupper_info changeupper_info;
struct netdev_adjacent *i, *j, *to_i, *to_j;
@@ -5383,6 +5656,7 @@ static int __netdev_upper_dev_link(struct net_device *dev,
changeupper_info.upper_dev = upper_dev;
changeupper_info.master = master;
changeupper_info.linking = true;
+ changeupper_info.upper_info = upper_info;
ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev,
&changeupper_info.info);
@@ -5390,7 +5664,7 @@ static int __netdev_upper_dev_link(struct net_device *dev,
if (ret)
return ret;
- ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, private,
+ ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, upper_priv,
master);
if (ret)
return ret;
@@ -5428,8 +5702,12 @@ static int __netdev_upper_dev_link(struct net_device *dev,
goto rollback_lower_mesh;
}
- call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
- &changeupper_info.info);
+ ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
+ &changeupper_info.info);
+ ret = notifier_to_errno(ret);
+ if (ret)
+ goto rollback_lower_mesh;
+
return 0;
rollback_lower_mesh:
@@ -5483,7 +5761,7 @@ rollback_mesh:
int netdev_upper_dev_link(struct net_device *dev,
struct net_device *upper_dev)
{
- return __netdev_upper_dev_link(dev, upper_dev, false, NULL);
+ return __netdev_upper_dev_link(dev, upper_dev, false, NULL, NULL);
}
EXPORT_SYMBOL(netdev_upper_dev_link);
@@ -5491,6 +5769,8 @@ EXPORT_SYMBOL(netdev_upper_dev_link);
* netdev_master_upper_dev_link - Add a master link to the upper device
* @dev: device
* @upper_dev: new upper device
+ * @upper_priv: upper device private
+ * @upper_info: upper info to be passed down via notifier
*
* Adds a link to device which is upper to this one. In this case, only
* one master upper device can be linked, although other non-master devices
@@ -5499,20 +5779,14 @@ EXPORT_SYMBOL(netdev_upper_dev_link);
* counts are adjusted and the function returns zero.
*/
int netdev_master_upper_dev_link(struct net_device *dev,
- struct net_device *upper_dev)
+ struct net_device *upper_dev,
+ void *upper_priv, void *upper_info)
{
- return __netdev_upper_dev_link(dev, upper_dev, true, NULL);
+ return __netdev_upper_dev_link(dev, upper_dev, true,
+ upper_priv, upper_info);
}
EXPORT_SYMBOL(netdev_master_upper_dev_link);
-int netdev_master_upper_dev_link_private(struct net_device *dev,
- struct net_device *upper_dev,
- void *private)
-{
- return __netdev_upper_dev_link(dev, upper_dev, true, private);
-}
-EXPORT_SYMBOL(netdev_master_upper_dev_link_private);
-
/**
* netdev_upper_dev_unlink - Removes a link to upper device
* @dev: device
@@ -5671,7 +5945,7 @@ EXPORT_SYMBOL(netdev_lower_dev_get_private);
int dev_get_nest_level(struct net_device *dev,
- bool (*type_check)(struct net_device *dev))
+ bool (*type_check)(const struct net_device *dev))
{
struct net_device *lower = NULL;
struct list_head *iter;
@@ -5693,6 +5967,26 @@ int dev_get_nest_level(struct net_device *dev,
}
EXPORT_SYMBOL(dev_get_nest_level);
+/**
+ * netdev_lower_change - Dispatch event about lower device state change
+ * @lower_dev: device
+ * @lower_state_info: state to dispatch
+ *
+ * Send NETDEV_CHANGELOWERSTATE to netdev notifiers with info.
+ * The caller must hold the RTNL lock.
+ */
+void netdev_lower_state_changed(struct net_device *lower_dev,
+ void *lower_state_info)
+{
+ struct netdev_notifier_changelowerstate_info changelowerstate_info;
+
+ ASSERT_RTNL();
+ changelowerstate_info.lower_state_info = lower_state_info;
+ call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE, lower_dev,
+ &changelowerstate_info.info);
+}
+EXPORT_SYMBOL(netdev_lower_state_changed);
+
static void dev_change_rx_flags(struct net_device *dev, int flags)
{
const struct net_device_ops *ops = dev->netdev_ops;
@@ -6383,9 +6677,9 @@ static netdev_features_t netdev_fix_features(struct net_device *dev,
/* UFO needs SG and checksumming */
if (features & NETIF_F_UFO) {
/* maybe split UFO into V4 and V6? */
- if (!((features & NETIF_F_GEN_CSUM) ||
- (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
- == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
+ if (!(features & NETIF_F_HW_CSUM) &&
+ ((features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) !=
+ (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM))) {
netdev_dbg(dev,
"Dropping NETIF_F_UFO since no checksum offload features.\n");
features &= ~NETIF_F_UFO;
@@ -7174,11 +7468,13 @@ EXPORT_SYMBOL(alloc_netdev_mqs);
* This function does the last stage of destroying an allocated device
* interface. The reference to the device object is released.
* If this is the last reference then it will be freed.
+ * Must be called in process context.
*/
void free_netdev(struct net_device *dev)
{
struct napi_struct *p, *n;
+ might_sleep();
netif_free_tx_queues(dev);
#ifdef CONFIG_SYSFS
kvfree(dev->_rx);
@@ -7487,16 +7783,16 @@ static int dev_cpu_callback(struct notifier_block *nfb,
netdev_features_t netdev_increment_features(netdev_features_t all,
netdev_features_t one, netdev_features_t mask)
{
- if (mask & NETIF_F_GEN_CSUM)
- mask |= NETIF_F_ALL_CSUM;
+ if (mask & NETIF_F_HW_CSUM)
+ mask |= NETIF_F_CSUM_MASK;
mask |= NETIF_F_VLAN_CHALLENGED;
- all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
+ all |= one & (NETIF_F_ONE_FOR_ALL | NETIF_F_CSUM_MASK) & mask;
all &= one | ~NETIF_F_ALL_FOR_ALL;
/* If one device supports hw checksumming, set for all. */
- if (all & NETIF_F_GEN_CSUM)
- all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
+ if (all & NETIF_F_HW_CSUM)
+ all &= ~(NETIF_F_CSUM_MASK & ~NETIF_F_HW_CSUM);
return all;
}
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 29edf7484..daf04709d 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -87,7 +87,7 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN]
[NETIF_F_GSO_UDP_TUNNEL_BIT] = "tx-udp_tnl-segmentation",
[NETIF_F_FCOE_CRC_BIT] = "tx-checksum-fcoe-crc",
- [NETIF_F_SCTP_CSUM_BIT] = "tx-checksum-sctp",
+ [NETIF_F_SCTP_CRC_BIT] = "tx-checksum-sctp",
[NETIF_F_FCOE_MTU_BIT] = "fcoe-mtu",
[NETIF_F_NTUPLE_BIT] = "rx-ntuple-filter",
[NETIF_F_RXHASH_BIT] = "rx-hashing",
@@ -191,6 +191,23 @@ static int ethtool_set_features(struct net_device *dev, void __user *useraddr)
return ret;
}
+static int phy_get_sset_count(struct phy_device *phydev)
+{
+ int ret;
+
+ if (phydev->drv->get_sset_count &&
+ phydev->drv->get_strings &&
+ phydev->drv->get_stats) {
+ mutex_lock(&phydev->lock);
+ ret = phydev->drv->get_sset_count(phydev);
+ mutex_unlock(&phydev->lock);
+
+ return ret;
+ }
+
+ return -EOPNOTSUPP;
+}
+
static int __ethtool_get_sset_count(struct net_device *dev, int sset)
{
const struct ethtool_ops *ops = dev->ethtool_ops;
@@ -204,6 +221,13 @@ static int __ethtool_get_sset_count(struct net_device *dev, int sset)
if (sset == ETH_SS_TUNABLES)
return ARRAY_SIZE(tunable_strings);
+ if (sset == ETH_SS_PHY_STATS) {
+ if (dev->phydev)
+ return phy_get_sset_count(dev->phydev);
+ else
+ return -EOPNOTSUPP;
+ }
+
if (ops->get_sset_count && ops->get_strings)
return ops->get_sset_count(dev, sset);
else
@@ -223,7 +247,17 @@ static void __ethtool_get_strings(struct net_device *dev,
sizeof(rss_hash_func_strings));
else if (stringset == ETH_SS_TUNABLES)
memcpy(data, tunable_strings, sizeof(tunable_strings));
- else
+ else if (stringset == ETH_SS_PHY_STATS) {
+ struct phy_device *phydev = dev->phydev;
+
+ if (phydev) {
+ mutex_lock(&phydev->lock);
+ phydev->drv->get_strings(phydev, data);
+ mutex_unlock(&phydev->lock);
+ } else {
+ return;
+ }
+ } else
/* ops->get_strings is valid because checked earlier */
ops->get_strings(dev, stringset, data);
}
@@ -235,7 +269,7 @@ static netdev_features_t ethtool_get_feature_mask(u32 eth_cmd)
switch (eth_cmd) {
case ETHTOOL_GTXCSUM:
case ETHTOOL_STXCSUM:
- return NETIF_F_ALL_CSUM | NETIF_F_SCTP_CSUM;
+ return NETIF_F_CSUM_MASK | NETIF_F_SCTP_CRC;
case ETHTOOL_GRXCSUM:
case ETHTOOL_SRXCSUM:
return NETIF_F_RXCSUM;
@@ -1401,6 +1435,47 @@ static int ethtool_get_stats(struct net_device *dev, void __user *useraddr)
return ret;
}
+static int ethtool_get_phy_stats(struct net_device *dev, void __user *useraddr)
+{
+ struct ethtool_stats stats;
+ struct phy_device *phydev = dev->phydev;
+ u64 *data;
+ int ret, n_stats;
+
+ if (!phydev)
+ return -EOPNOTSUPP;
+
+ n_stats = phy_get_sset_count(phydev);
+
+ if (n_stats < 0)
+ return n_stats;
+ WARN_ON(n_stats == 0);
+
+ if (copy_from_user(&stats, useraddr, sizeof(stats)))
+ return -EFAULT;
+
+ stats.n_stats = n_stats;
+ data = kmalloc_array(n_stats, sizeof(u64), GFP_USER);
+ if (!data)
+ return -ENOMEM;
+
+ mutex_lock(&phydev->lock);
+ phydev->drv->get_stats(phydev, &stats, data);
+ mutex_unlock(&phydev->lock);
+
+ ret = -EFAULT;
+ if (copy_to_user(useraddr, &stats, sizeof(stats)))
+ goto out;
+ useraddr += sizeof(stats);
+ if (copy_to_user(useraddr, data, stats.n_stats * sizeof(u64)))
+ goto out;
+ ret = 0;
+
+ out:
+ kfree(data);
+ return ret;
+}
+
static int ethtool_get_perm_addr(struct net_device *dev, void __user *useraddr)
{
struct ethtool_perm_addr epaddr;
@@ -1779,6 +1854,7 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
case ETHTOOL_GSSET_INFO:
case ETHTOOL_GSTRINGS:
case ETHTOOL_GSTATS:
+ case ETHTOOL_GPHYSTATS:
case ETHTOOL_GTSO:
case ETHTOOL_GPERMADDR:
case ETHTOOL_GUFO:
@@ -1991,6 +2067,9 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
case ETHTOOL_STUNABLE:
rc = ethtool_set_tunable(dev, useraddr);
break;
+ case ETHTOOL_GPHYSTATS:
+ rc = ethtool_get_phy_stats(dev, useraddr);
+ break;
default:
rc = -EOPNOTSUPP;
}
diff --git a/net/core/filter.c b/net/core/filter.c
index 37157c4c1..bba502f7c 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -50,6 +50,7 @@
#include <net/cls_cgroup.h>
#include <net/dst_metadata.h>
#include <net/dst.h>
+#include <net/sock_reuseport.h>
/**
* sk_filter - run a packet through a socket filter
@@ -348,12 +349,6 @@ static bool convert_bpf_extensions(struct sock_filter *fp,
* jump offsets, 2nd pass remapping:
* new_prog = kmalloc(sizeof(struct bpf_insn) * new_len);
* bpf_convert_filter(old_prog, old_len, new_prog, &new_len);
- *
- * User BPF's register A is mapped to our BPF register 6, user BPF
- * register X is mapped to BPF register 7; frame pointer is always
- * register 10; Context 'void *ctx' is stored in register 1, that is,
- * for socket filters: ctx == 'struct sk_buff *', for seccomp:
- * ctx == 'struct seccomp_data *'.
*/
static int bpf_convert_filter(struct sock_filter *prog, int len,
struct bpf_insn *new_prog, int *new_len)
@@ -381,9 +376,22 @@ do_pass:
new_insn = new_prog;
fp = prog;
- if (new_insn)
- *new_insn = BPF_MOV64_REG(BPF_REG_CTX, BPF_REG_ARG1);
- new_insn++;
+ /* Classic BPF related prologue emission. */
+ if (new_insn) {
+ /* Classic BPF expects A and X to be reset first. These need
+ * to be guaranteed to be the first two instructions.
+ */
+ *new_insn++ = BPF_ALU64_REG(BPF_XOR, BPF_REG_A, BPF_REG_A);
+ *new_insn++ = BPF_ALU64_REG(BPF_XOR, BPF_REG_X, BPF_REG_X);
+
+ /* All programs must keep CTX in callee saved BPF_REG_CTX.
+ * In eBPF case it's done by the compiler, here we need to
+ * do this ourself. Initial CTX is present in BPF_REG_ARG1.
+ */
+ *new_insn++ = BPF_MOV64_REG(BPF_REG_CTX, BPF_REG_ARG1);
+ } else {
+ new_insn += 3;
+ }
for (i = 0; i < len; fp++, i++) {
struct bpf_insn tmp_insns[6] = { };
@@ -1165,17 +1173,32 @@ static int __sk_attach_prog(struct bpf_prog *prog, struct sock *sk)
return 0;
}
-/**
- * sk_attach_filter - attach a socket filter
- * @fprog: the filter program
- * @sk: the socket to use
- *
- * Attach the user's filter code. We first run some sanity checks on
- * it to make sure it does not explode on us later. If an error
- * occurs or there is insufficient memory for the filter a negative
- * errno code is returned. On success the return is zero.
- */
-int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
+static int __reuseport_attach_prog(struct bpf_prog *prog, struct sock *sk)
+{
+ struct bpf_prog *old_prog;
+ int err;
+
+ if (bpf_prog_size(prog->len) > sysctl_optmem_max)
+ return -ENOMEM;
+
+ if (sk_unhashed(sk)) {
+ err = reuseport_alloc(sk);
+ if (err)
+ return err;
+ } else if (!rcu_access_pointer(sk->sk_reuseport_cb)) {
+ /* The socket wasn't bound with SO_REUSEPORT */
+ return -EINVAL;
+ }
+
+ old_prog = reuseport_attach_prog(sk, prog);
+ if (old_prog)
+ bpf_prog_destroy(old_prog);
+
+ return 0;
+}
+
+static
+struct bpf_prog *__get_filter(struct sock_fprog *fprog, struct sock *sk)
{
unsigned int fsize = bpf_classic_proglen(fprog);
unsigned int bpf_fsize = bpf_prog_size(fprog->len);
@@ -1183,19 +1206,19 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
int err;
if (sock_flag(sk, SOCK_FILTER_LOCKED))
- return -EPERM;
+ return ERR_PTR(-EPERM);
/* Make sure new filter is there and in the right amounts. */
if (fprog->filter == NULL)
- return -EINVAL;
+ return ERR_PTR(-EINVAL);
prog = bpf_prog_alloc(bpf_fsize, 0);
if (!prog)
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
if (copy_from_user(prog->insns, fprog->filter, fsize)) {
__bpf_prog_free(prog);
- return -EFAULT;
+ return ERR_PTR(-EFAULT);
}
prog->len = fprog->len;
@@ -1203,13 +1226,30 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
err = bpf_prog_store_orig_filter(prog, fprog);
if (err) {
__bpf_prog_free(prog);
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
}
/* bpf_prepare_filter() already takes care of freeing
* memory in case something goes wrong.
*/
- prog = bpf_prepare_filter(prog, NULL);
+ return bpf_prepare_filter(prog, NULL);
+}
+
+/**
+ * sk_attach_filter - attach a socket filter
+ * @fprog: the filter program
+ * @sk: the socket to use
+ *
+ * Attach the user's filter code. We first run some sanity checks on
+ * it to make sure it does not explode on us later. If an error
+ * occurs or there is insufficient memory for the filter a negative
+ * errno code is returned. On success the return is zero.
+ */
+int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
+{
+ struct bpf_prog *prog = __get_filter(fprog, sk);
+ int err;
+
if (IS_ERR(prog))
return PTR_ERR(prog);
@@ -1223,23 +1263,50 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
}
EXPORT_SYMBOL_GPL(sk_attach_filter);
-int sk_attach_bpf(u32 ufd, struct sock *sk)
+int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk)
{
- struct bpf_prog *prog;
+ struct bpf_prog *prog = __get_filter(fprog, sk);
int err;
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
+
+ err = __reuseport_attach_prog(prog, sk);
+ if (err < 0) {
+ __bpf_prog_release(prog);
+ return err;
+ }
+
+ return 0;
+}
+
+static struct bpf_prog *__get_bpf(u32 ufd, struct sock *sk)
+{
+ struct bpf_prog *prog;
+
if (sock_flag(sk, SOCK_FILTER_LOCKED))
- return -EPERM;
+ return ERR_PTR(-EPERM);
prog = bpf_prog_get(ufd);
if (IS_ERR(prog))
- return PTR_ERR(prog);
+ return prog;
if (prog->type != BPF_PROG_TYPE_SOCKET_FILTER) {
bpf_prog_put(prog);
- return -EINVAL;
+ return ERR_PTR(-EINVAL);
}
+ return prog;
+}
+
+int sk_attach_bpf(u32 ufd, struct sock *sk)
+{
+ struct bpf_prog *prog = __get_bpf(ufd, sk);
+ int err;
+
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
+
err = __sk_attach_prog(prog, sk);
if (err < 0) {
bpf_prog_put(prog);
@@ -1249,7 +1316,24 @@ int sk_attach_bpf(u32 ufd, struct sock *sk)
return 0;
}
-#define BPF_RECOMPUTE_CSUM(flags) ((flags) & 1)
+int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk)
+{
+ struct bpf_prog *prog = __get_bpf(ufd, sk);
+ int err;
+
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
+
+ err = __reuseport_attach_prog(prog, sk);
+ if (err < 0) {
+ bpf_prog_put(prog);
+ return err;
+ }
+
+ return 0;
+}
+
+#define BPF_LDST_LEN 16U
static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags)
{
@@ -1257,9 +1341,12 @@ static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags)
int offset = (int) r2;
void *from = (void *) (long) r3;
unsigned int len = (unsigned int) r4;
- char buf[16];
+ char buf[BPF_LDST_LEN];
void *ptr;
+ if (unlikely(flags & ~(BPF_F_RECOMPUTE_CSUM)))
+ return -EINVAL;
+
/* bpf verifier guarantees that:
* 'from' pointer points to bpf program stack
* 'len' bytes of it were initialized
@@ -1279,7 +1366,7 @@ static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags)
if (unlikely(!ptr))
return -EFAULT;
- if (BPF_RECOMPUTE_CSUM(flags))
+ if (flags & BPF_F_RECOMPUTE_CSUM)
skb_postpull_rcsum(skb, ptr, len);
memcpy(ptr, from, len);
@@ -1288,8 +1375,9 @@ static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags)
/* skb_store_bits cannot return -EFAULT here */
skb_store_bits(skb, offset, ptr, len);
- if (BPF_RECOMPUTE_CSUM(flags) && skb->ip_summed == CHECKSUM_COMPLETE)
- skb->csum = csum_add(skb->csum, csum_partial(ptr, len, 0));
+ if (flags & BPF_F_RECOMPUTE_CSUM)
+ skb_postpush_rcsum(skb, ptr, len);
+
return 0;
}
@@ -1304,8 +1392,35 @@ const struct bpf_func_proto bpf_skb_store_bytes_proto = {
.arg5_type = ARG_ANYTHING,
};
-#define BPF_HEADER_FIELD_SIZE(flags) ((flags) & 0x0f)
-#define BPF_IS_PSEUDO_HEADER(flags) ((flags) & 0x10)
+static u64 bpf_skb_load_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+ const struct sk_buff *skb = (const struct sk_buff *)(unsigned long) r1;
+ int offset = (int) r2;
+ void *to = (void *)(unsigned long) r3;
+ unsigned int len = (unsigned int) r4;
+ void *ptr;
+
+ if (unlikely((u32) offset > 0xffff || len > BPF_LDST_LEN))
+ return -EFAULT;
+
+ ptr = skb_header_pointer(skb, offset, len, to);
+ if (unlikely(!ptr))
+ return -EFAULT;
+ if (ptr != to)
+ memcpy(to, ptr, len);
+
+ return 0;
+}
+
+const struct bpf_func_proto bpf_skb_load_bytes_proto = {
+ .func = bpf_skb_load_bytes,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_PTR_TO_STACK,
+ .arg4_type = ARG_CONST_STACK_SIZE,
+};
static u64 bpf_l3_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags)
{
@@ -1313,6 +1428,8 @@ static u64 bpf_l3_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags)
int offset = (int) r2;
__sum16 sum, *ptr;
+ if (unlikely(flags & ~(BPF_F_HDR_FIELD_MASK)))
+ return -EINVAL;
if (unlikely((u32) offset > 0xffff))
return -EFAULT;
@@ -1324,7 +1441,7 @@ static u64 bpf_l3_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags)
if (unlikely(!ptr))
return -EFAULT;
- switch (BPF_HEADER_FIELD_SIZE(flags)) {
+ switch (flags & BPF_F_HDR_FIELD_MASK) {
case 2:
csum_replace2(ptr, from, to);
break;
@@ -1356,10 +1473,12 @@ const struct bpf_func_proto bpf_l3_csum_replace_proto = {
static u64 bpf_l4_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags)
{
struct sk_buff *skb = (struct sk_buff *) (long) r1;
- bool is_pseudo = !!BPF_IS_PSEUDO_HEADER(flags);
+ bool is_pseudo = flags & BPF_F_PSEUDO_HDR;
int offset = (int) r2;
__sum16 sum, *ptr;
+ if (unlikely(flags & ~(BPF_F_PSEUDO_HDR | BPF_F_HDR_FIELD_MASK)))
+ return -EINVAL;
if (unlikely((u32) offset > 0xffff))
return -EFAULT;
@@ -1371,7 +1490,7 @@ static u64 bpf_l4_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags)
if (unlikely(!ptr))
return -EFAULT;
- switch (BPF_HEADER_FIELD_SIZE(flags)) {
+ switch (flags & BPF_F_HDR_FIELD_MASK) {
case 2:
inet_proto_csum_replace2(ptr, skb, from, to, is_pseudo);
break;
@@ -1400,13 +1519,14 @@ const struct bpf_func_proto bpf_l4_csum_replace_proto = {
.arg5_type = ARG_ANYTHING,
};
-#define BPF_IS_REDIRECT_INGRESS(flags) ((flags) & 1)
-
static u64 bpf_clone_redirect(u64 r1, u64 ifindex, u64 flags, u64 r4, u64 r5)
{
struct sk_buff *skb = (struct sk_buff *) (long) r1, *skb2;
struct net_device *dev;
+ if (unlikely(flags & ~(BPF_F_INGRESS)))
+ return -EINVAL;
+
dev = dev_get_by_index_rcu(dev_net(skb->dev), ifindex);
if (unlikely(!dev))
return -EINVAL;
@@ -1415,8 +1535,12 @@ static u64 bpf_clone_redirect(u64 r1, u64 ifindex, u64 flags, u64 r4, u64 r5)
if (unlikely(!skb2))
return -ENOMEM;
- if (BPF_IS_REDIRECT_INGRESS(flags))
+ if (flags & BPF_F_INGRESS) {
+ if (skb_at_tc_ingress(skb2))
+ skb_postpush_rcsum(skb2, skb_mac_header(skb2),
+ skb2->mac_len);
return dev_forward_skb(dev, skb2);
+ }
skb2->dev = dev;
skb_sender_cpu_clear(skb2);
@@ -1438,12 +1562,17 @@ struct redirect_info {
};
static DEFINE_PER_CPU(struct redirect_info, redirect_info);
+
static u64 bpf_redirect(u64 ifindex, u64 flags, u64 r3, u64 r4, u64 r5)
{
struct redirect_info *ri = this_cpu_ptr(&redirect_info);
+ if (unlikely(flags & ~(BPF_F_INGRESS)))
+ return TC_ACT_SHOT;
+
ri->ifindex = ifindex;
ri->flags = flags;
+
return TC_ACT_REDIRECT;
}
@@ -1459,8 +1588,12 @@ int skb_do_redirect(struct sk_buff *skb)
return -EINVAL;
}
- if (BPF_IS_REDIRECT_INGRESS(ri->flags))
+ if (ri->flags & BPF_F_INGRESS) {
+ if (skb_at_tc_ingress(skb))
+ skb_postpush_rcsum(skb, skb_mac_header(skb),
+ skb->mac_len);
return dev_forward_skb(dev, skb);
+ }
skb->dev = dev;
skb_sender_cpu_clear(skb);
@@ -1552,19 +1685,49 @@ bool bpf_helper_changes_skb_data(void *func)
return false;
}
+static unsigned short bpf_tunnel_key_af(u64 flags)
+{
+ return flags & BPF_F_TUNINFO_IPV6 ? AF_INET6 : AF_INET;
+}
+
static u64 bpf_skb_get_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5)
{
struct sk_buff *skb = (struct sk_buff *) (long) r1;
struct bpf_tunnel_key *to = (struct bpf_tunnel_key *) (long) r2;
- struct ip_tunnel_info *info = skb_tunnel_info(skb);
+ const struct ip_tunnel_info *info = skb_tunnel_info(skb);
+ u8 compat[sizeof(struct bpf_tunnel_key)];
- if (unlikely(size != sizeof(struct bpf_tunnel_key) || flags || !info))
- return -EINVAL;
- if (ip_tunnel_info_af(info) != AF_INET)
+ if (unlikely(!info || (flags & ~(BPF_F_TUNINFO_IPV6))))
return -EINVAL;
+ if (ip_tunnel_info_af(info) != bpf_tunnel_key_af(flags))
+ return -EPROTO;
+ if (unlikely(size != sizeof(struct bpf_tunnel_key))) {
+ switch (size) {
+ case offsetof(struct bpf_tunnel_key, remote_ipv6[1]):
+ /* Fixup deprecated structure layouts here, so we have
+ * a common path later on.
+ */
+ if (ip_tunnel_info_af(info) != AF_INET)
+ return -EINVAL;
+ to = (struct bpf_tunnel_key *)compat;
+ break;
+ default:
+ return -EINVAL;
+ }
+ }
to->tunnel_id = be64_to_cpu(info->key.tun_id);
- to->remote_ipv4 = be32_to_cpu(info->key.u.ipv4.src);
+ to->tunnel_tos = info->key.tos;
+ to->tunnel_ttl = info->key.ttl;
+
+ if (flags & BPF_F_TUNINFO_IPV6)
+ memcpy(to->remote_ipv6, &info->key.u.ipv6.src,
+ sizeof(to->remote_ipv6));
+ else
+ to->remote_ipv4 = be32_to_cpu(info->key.u.ipv4.src);
+
+ if (unlikely(size != sizeof(struct bpf_tunnel_key)))
+ memcpy((void *)(long) r2, to, size);
return 0;
}
@@ -1586,10 +1749,25 @@ static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5)
struct sk_buff *skb = (struct sk_buff *) (long) r1;
struct bpf_tunnel_key *from = (struct bpf_tunnel_key *) (long) r2;
struct metadata_dst *md = this_cpu_ptr(md_dst);
+ u8 compat[sizeof(struct bpf_tunnel_key)];
struct ip_tunnel_info *info;
- if (unlikely(size != sizeof(struct bpf_tunnel_key) || flags))
+ if (unlikely(flags & ~(BPF_F_TUNINFO_IPV6 | BPF_F_ZERO_CSUM_TX)))
return -EINVAL;
+ if (unlikely(size != sizeof(struct bpf_tunnel_key))) {
+ switch (size) {
+ case offsetof(struct bpf_tunnel_key, remote_ipv6[1]):
+ /* Fixup deprecated structure layouts here, so we have
+ * a common path later on.
+ */
+ memcpy(compat, from, size);
+ memset(compat + size, 0, sizeof(compat) - size);
+ from = (struct bpf_tunnel_key *)compat;
+ break;
+ default:
+ return -EINVAL;
+ }
+ }
skb_dst_drop(skb);
dst_hold((struct dst_entry *) md);
@@ -1597,9 +1775,21 @@ static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5)
info = &md->u.tun_info;
info->mode = IP_TUNNEL_INFO_TX;
- info->key.tun_flags = TUNNEL_KEY;
+
+ info->key.tun_flags = TUNNEL_KEY | TUNNEL_CSUM;
info->key.tun_id = cpu_to_be64(from->tunnel_id);
- info->key.u.ipv4.dst = cpu_to_be32(from->remote_ipv4);
+ info->key.tos = from->tunnel_tos;
+ info->key.ttl = from->tunnel_ttl;
+
+ if (flags & BPF_F_TUNINFO_IPV6) {
+ info->mode |= IP_TUNNEL_INFO_IPV6;
+ memcpy(&info->key.u.ipv6.dst, from->remote_ipv6,
+ sizeof(from->remote_ipv6));
+ } else {
+ info->key.u.ipv4.dst = cpu_to_be32(from->remote_ipv4);
+ if (flags & BPF_F_ZERO_CSUM_TX)
+ info->key.tun_flags &= ~TUNNEL_CSUM;
+ }
return 0;
}
@@ -1659,6 +1849,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id)
switch (func_id) {
case BPF_FUNC_skb_store_bytes:
return &bpf_skb_store_bytes_proto;
+ case BPF_FUNC_skb_load_bytes:
+ return &bpf_skb_load_bytes_proto;
case BPF_FUNC_l3_csum_replace:
return &bpf_l3_csum_replace_proto;
case BPF_FUNC_l4_csum_replace:
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index f88a62ab0..b6c8a6629 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -471,6 +471,7 @@ static ssize_t phys_switch_id_show(struct device *dev,
if (dev_isalive(netdev)) {
struct switchdev_attr attr = {
+ .orig_dev = netdev,
.id = SWITCHDEV_ATTR_ID_PORT_PARENT_ID,
.flags = SWITCHDEV_F_NO_RECURSE,
};
@@ -1452,8 +1453,8 @@ static void netdev_release(struct device *d)
static const void *net_namespace(struct device *d)
{
- struct net_device *dev;
- dev = container_of(d, struct net_device, dev);
+ struct net_device *dev = to_net_dev(d);
+
return dev_net(dev);
}
diff --git a/net/core/net-traces.c b/net/core/net-traces.c
index adef015b2..92da5e4ce 100644
--- a/net/core/net-traces.c
+++ b/net/core/net-traces.c
@@ -32,6 +32,10 @@
#include <trace/events/sock.h>
#include <trace/events/udp.h>
#include <trace/events/fib.h>
+#if IS_ENABLED(CONFIG_IPV6)
+#include <trace/events/fib6.h>
+EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup);
+#endif
EXPORT_TRACEPOINT_SYMBOL_GPL(kfree_skb);
diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c
index d9ee8d08a..0260c84ed 100644
--- a/net/core/netclassid_cgroup.c
+++ b/net/core/netclassid_cgroup.c
@@ -61,9 +61,12 @@ static int update_classid_sock(const void *v, struct file *file, unsigned n)
int err;
struct socket *sock = sock_from_file(file, &err);
- if (sock)
- sock->sk->sk_classid = (u32)(unsigned long)v;
-
+ if (sock) {
+ spin_lock(&cgroup_sk_update_lock);
+ sock_cgroup_set_classid(&sock->sk->sk_cgrp_data,
+ (unsigned long)v);
+ spin_unlock(&cgroup_sk_update_lock);
+ }
return 0;
}
@@ -100,6 +103,8 @@ static int write_classid(struct cgroup_subsys_state *css, struct cftype *cft,
{
struct cgroup_cls_state *cs = css_cls_state(css);
+ cgroup_sk_alloc_disable();
+
cs->classid = (u32)value;
update_classid(css, (void *)(unsigned long)cs->classid);
diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c
index 40fd09fe0..f1efbc39e 100644
--- a/net/core/netprio_cgroup.c
+++ b/net/core/netprio_cgroup.c
@@ -27,6 +27,12 @@
#include <linux/fdtable.h>
+/*
+ * netprio allocates per-net_device priomap array which is indexed by
+ * css->id. Limiting css ID to 16bits doesn't lose anything.
+ */
+#define NETPRIO_ID_MAX USHRT_MAX
+
#define PRIOMAP_MIN_SZ 128
/*
@@ -144,6 +150,9 @@ static int cgrp_css_online(struct cgroup_subsys_state *css)
struct net_device *dev;
int ret = 0;
+ if (css->id > NETPRIO_ID_MAX)
+ return -ENOSPC;
+
if (!parent_css)
return 0;
@@ -200,6 +209,8 @@ static ssize_t write_priomap(struct kernfs_open_file *of,
if (!dev)
return -ENODEV;
+ cgroup_sk_alloc_disable();
+
rtnl_lock();
ret = netprio_set_prio(of_css(of), dev, prio);
@@ -213,8 +224,12 @@ static int update_netprio(const void *v, struct file *file, unsigned n)
{
int err;
struct socket *sock = sock_from_file(file, &err);
- if (sock)
- sock->sk->sk_cgrp_prioidx = (u32)(unsigned long)v;
+ if (sock) {
+ spin_lock(&cgroup_sk_update_lock);
+ sock_cgroup_set_prioidx(&sock->sk->sk_cgrp_data,
+ (unsigned long)v);
+ spin_unlock(&cgroup_sk_update_lock);
+ }
return 0;
}
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 4da4d51a2..1474cfd2d 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -2900,7 +2900,7 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev,
if (!(pkt_dev->flags & F_UDPCSUM)) {
skb->ip_summed = CHECKSUM_NONE;
- } else if (odev->features & NETIF_F_V4_CSUM) {
+ } else if (odev->features & (NETIF_F_HW_CSUM | NETIF_F_IP_CSUM)) {
skb->ip_summed = CHECKSUM_PARTIAL;
skb->csum = 0;
udp4_hwcsum(skb, iph->saddr, iph->daddr);
@@ -3034,7 +3034,7 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev,
if (!(pkt_dev->flags & F_UDPCSUM)) {
skb->ip_summed = CHECKSUM_NONE;
- } else if (odev->features & NETIF_F_V6_CSUM) {
+ } else if (odev->features & (NETIF_F_HW_CSUM | NETIF_F_IPV6_CSUM)) {
skb->ip_summed = CHECKSUM_PARTIAL;
skb->csum_start = skb_transport_header(skb) - skb->head;
skb->csum_offset = offsetof(struct udphdr, check);
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 34ba7a088..8261d95dd 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -1027,6 +1027,7 @@ static int rtnl_phys_switch_id_fill(struct sk_buff *skb, struct net_device *dev)
{
int err;
struct switchdev_attr attr = {
+ .orig_dev = dev,
.id = SWITCHDEV_ATTR_ID_PORT_PARENT_ID,
.flags = SWITCHDEV_F_NO_RECURSE,
};
@@ -2563,7 +2564,7 @@ static int nlmsg_populate_fdb_fill(struct sk_buff *skb,
struct net_device *dev,
u8 *addr, u16 vid, u32 pid, u32 seq,
int type, unsigned int flags,
- int nlflags)
+ int nlflags, u16 ndm_state)
{
struct nlmsghdr *nlh;
struct ndmsg *ndm;
@@ -2579,7 +2580,7 @@ static int nlmsg_populate_fdb_fill(struct sk_buff *skb,
ndm->ndm_flags = flags;
ndm->ndm_type = 0;
ndm->ndm_ifindex = dev->ifindex;
- ndm->ndm_state = NUD_PERMANENT;
+ ndm->ndm_state = ndm_state;
if (nla_put(skb, NDA_LLADDR, ETH_ALEN, addr))
goto nla_put_failure;
@@ -2600,7 +2601,8 @@ static inline size_t rtnl_fdb_nlmsg_size(void)
return NLMSG_ALIGN(sizeof(struct ndmsg)) + nla_total_size(ETH_ALEN);
}
-static void rtnl_fdb_notify(struct net_device *dev, u8 *addr, u16 vid, int type)
+static void rtnl_fdb_notify(struct net_device *dev, u8 *addr, u16 vid, int type,
+ u16 ndm_state)
{
struct net *net = dev_net(dev);
struct sk_buff *skb;
@@ -2611,7 +2613,7 @@ static void rtnl_fdb_notify(struct net_device *dev, u8 *addr, u16 vid, int type)
goto errout;
err = nlmsg_populate_fdb_fill(skb, dev, addr, vid,
- 0, 0, type, NTF_SELF, 0);
+ 0, 0, type, NTF_SELF, 0, ndm_state);
if (err < 0) {
kfree_skb(skb);
goto errout;
@@ -2746,7 +2748,8 @@ static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh)
nlh->nlmsg_flags);
if (!err) {
- rtnl_fdb_notify(dev, addr, vid, RTM_NEWNEIGH);
+ rtnl_fdb_notify(dev, addr, vid, RTM_NEWNEIGH,
+ ndm->ndm_state);
ndm->ndm_flags &= ~NTF_SELF;
}
}
@@ -2847,7 +2850,8 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh)
err = ndo_dflt_fdb_del(ndm, tb, dev, addr, vid);
if (!err) {
- rtnl_fdb_notify(dev, addr, vid, RTM_DELNEIGH);
+ rtnl_fdb_notify(dev, addr, vid, RTM_DELNEIGH,
+ ndm->ndm_state);
ndm->ndm_flags &= ~NTF_SELF;
}
}
@@ -2875,7 +2879,7 @@ static int nlmsg_populate_fdb(struct sk_buff *skb,
err = nlmsg_populate_fdb_fill(skb, dev, ha->addr, 0,
portid, seq,
RTM_NEWNEIGH, NTF_SELF,
- NLM_F_MULTI);
+ NLM_F_MULTI, NUD_PERMANENT);
if (err < 0)
return err;
skip:
@@ -2907,6 +2911,7 @@ int ndo_dflt_fdb_dump(struct sk_buff *skb,
nlmsg_populate_fdb(skb, cb, dev, &idx, &dev->mc);
out:
netif_addr_unlock_bh(dev);
+ cb->args[1] = err;
return idx;
}
EXPORT_SYMBOL(ndo_dflt_fdb_dump);
@@ -2940,6 +2945,7 @@ static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb)
ops = br_dev->netdev_ops;
}
+ cb->args[1] = 0;
for_each_netdev(net, dev) {
if (brport_idx && (dev->ifindex != brport_idx))
continue;
@@ -2967,12 +2973,16 @@ static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb)
idx = cops->ndo_fdb_dump(skb, cb, br_dev, dev,
idx);
}
+ if (cb->args[1] == -EMSGSIZE)
+ break;
if (dev->netdev_ops->ndo_fdb_dump)
idx = dev->netdev_ops->ndo_fdb_dump(skb, cb, dev, NULL,
idx);
else
idx = ndo_dflt_fdb_dump(skb, cb, dev, NULL, idx);
+ if (cb->args[1] == -EMSGSIZE)
+ break;
cops = NULL;
}
@@ -3347,7 +3357,7 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
{
struct net *net = sock_net(skb->sk);
rtnl_doit_func doit;
- int sz_idx, kind;
+ int kind;
int family;
int type;
int err;
@@ -3363,7 +3373,6 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
return 0;
family = ((struct rtgenmsg *)nlmsg_data(nlh))->rtgen_family;
- sz_idx = type>>2;
kind = type&3;
if (kind != 2 && !netlink_net_capable(skb, CAP_NET_ADMIN))
diff --git a/net/core/scm.c b/net/core/scm.c
index dce0acb92..2696aefdc 100644
--- a/net/core/scm.c
+++ b/net/core/scm.c
@@ -295,8 +295,8 @@ void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm)
/* Bump the usage count and install the file. */
sock = sock_from_file(fp[i], &err);
if (sock) {
- sock_update_netprioidx(sock->sk);
- sock_update_classid(sock->sk);
+ sock_update_netprioidx(&sock->sk->sk_cgrp_data);
+ sock_update_classid(&sock->sk->sk_cgrp_data);
}
fd_install(new_fd, get_file(fp[i]));
}
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 5bf88f58b..8616d1147 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -2948,6 +2948,24 @@ int skb_append_pagefrags(struct sk_buff *skb, struct page *page,
EXPORT_SYMBOL_GPL(skb_append_pagefrags);
/**
+ * skb_push_rcsum - push skb and update receive checksum
+ * @skb: buffer to update
+ * @len: length of data pulled
+ *
+ * This function performs an skb_push on the packet and updates
+ * the CHECKSUM_COMPLETE checksum. It should be used on
+ * receive path processing instead of skb_push unless you know
+ * that the checksum difference is zero (e.g., a valid IP header)
+ * or you are setting ip_summed to CHECKSUM_NONE.
+ */
+static unsigned char *skb_push_rcsum(struct sk_buff *skb, unsigned len)
+{
+ skb_push(skb, len);
+ skb_postpush_rcsum(skb, skb->data, len);
+ return skb->data;
+}
+
+/**
* skb_pull_rcsum - pull skb and update receive checksum
* @skb: buffer to update
* @len: length of data pulled
@@ -4084,9 +4102,9 @@ struct sk_buff *skb_checksum_trimmed(struct sk_buff *skb,
if (!pskb_may_pull(skb_chk, offset))
goto err;
- __skb_pull(skb_chk, offset);
+ skb_pull_rcsum(skb_chk, offset);
ret = skb_chkf(skb_chk);
- __skb_push(skb_chk, offset);
+ skb_push_rcsum(skb_chk, offset);
if (ret)
goto err;
diff --git a/net/core/sock.c b/net/core/sock.c
index 0d91f7dca..6c1c8bc93 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -134,6 +134,7 @@
#include <linux/sock_diag.h>
#include <linux/filter.h>
+#include <net/sock_reuseport.h>
#include <trace/events/sock.h>
@@ -194,44 +195,6 @@ bool sk_net_capable(const struct sock *sk, int cap)
}
EXPORT_SYMBOL(sk_net_capable);
-
-#ifdef CONFIG_MEMCG_KMEM
-int mem_cgroup_sockets_init(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
-{
- struct proto *proto;
- int ret = 0;
-
- mutex_lock(&proto_list_mutex);
- list_for_each_entry(proto, &proto_list, node) {
- if (proto->init_cgroup) {
- ret = proto->init_cgroup(memcg, ss);
- if (ret)
- goto out;
- }
- }
-
- mutex_unlock(&proto_list_mutex);
- return ret;
-out:
- list_for_each_entry_continue_reverse(proto, &proto_list, node)
- if (proto->destroy_cgroup)
- proto->destroy_cgroup(memcg);
- mutex_unlock(&proto_list_mutex);
- return ret;
-}
-
-void mem_cgroup_sockets_destroy(struct mem_cgroup *memcg)
-{
- struct proto *proto;
-
- mutex_lock(&proto_list_mutex);
- list_for_each_entry_reverse(proto, &proto_list, node)
- if (proto->destroy_cgroup)
- proto->destroy_cgroup(memcg);
- mutex_unlock(&proto_list_mutex);
-}
-#endif
-
/*
* Each address family might have different locking rules, so we have
* one slock key per address family:
@@ -239,11 +202,6 @@ void mem_cgroup_sockets_destroy(struct mem_cgroup *memcg)
static struct lock_class_key af_family_keys[AF_MAX];
static struct lock_class_key af_family_slock_keys[AF_MAX];
-#if defined(CONFIG_MEMCG_KMEM)
-struct static_key memcg_socket_limit_enabled;
-EXPORT_SYMBOL(memcg_socket_limit_enabled);
-#endif
-
/*
* Make lock validator output more readable. (we pre-construct these
* strings build-time, so that runtime initialization of socket
@@ -932,6 +890,32 @@ set_rcvbuf:
}
break;
+ case SO_ATTACH_REUSEPORT_CBPF:
+ ret = -EINVAL;
+ if (optlen == sizeof(struct sock_fprog)) {
+ struct sock_fprog fprog;
+
+ ret = -EFAULT;
+ if (copy_from_user(&fprog, optval, sizeof(fprog)))
+ break;
+
+ ret = sk_reuseport_attach_filter(&fprog, sk);
+ }
+ break;
+
+ case SO_ATTACH_REUSEPORT_EBPF:
+ ret = -EINVAL;
+ if (optlen == sizeof(u32)) {
+ u32 ufd;
+
+ ret = -EFAULT;
+ if (copy_from_user(&ufd, optval, sizeof(ufd)))
+ break;
+
+ ret = sk_reuseport_attach_bpf(ufd, sk);
+ }
+ break;
+
case SO_DETACH_FILTER:
ret = sk_detach_filter(sk);
break;
@@ -1362,6 +1346,7 @@ static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
if (!try_module_get(prot->owner))
goto out_free_sec;
sk_tx_queue_clear(sk);
+ cgroup_sk_alloc(&sk->sk_cgrp_data);
}
return sk;
@@ -1384,6 +1369,7 @@ static void sk_prot_free(struct proto *prot, struct sock *sk)
owner = prot->owner;
slab = prot->slab;
+ cgroup_sk_free(&sk->sk_cgrp_data);
security_sk_free(sk);
if (slab != NULL)
kmem_cache_free(slab, sk);
@@ -1392,17 +1378,6 @@ static void sk_prot_free(struct proto *prot, struct sock *sk)
module_put(owner);
}
-#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
-void sock_update_netprioidx(struct sock *sk)
-{
- if (in_interrupt())
- return;
-
- sk->sk_cgrp_prioidx = task_netprioidx(current);
-}
-EXPORT_SYMBOL_GPL(sock_update_netprioidx);
-#endif
-
/**
* sk_alloc - All socket objects are allocated here
* @net: the applicable net namespace
@@ -1431,8 +1406,8 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
sock_net_set(sk, net);
atomic_set(&sk->sk_wmem_alloc, 1);
- sock_update_classid(sk);
- sock_update_netprioidx(sk);
+ sock_update_classid(&sk->sk_cgrp_data);
+ sock_update_netprioidx(&sk->sk_cgrp_data);
}
return sk;
@@ -1452,6 +1427,8 @@ void sk_destruct(struct sock *sk)
sk_filter_uncharge(sk, filter);
RCU_INIT_POINTER(sk->sk_filter, NULL);
}
+ if (rcu_access_pointer(sk->sk_reuseport_cb))
+ reuseport_detach_sock(sk);
sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
@@ -1487,12 +1464,6 @@ void sk_free(struct sock *sk)
}
EXPORT_SYMBOL(sk_free);
-static void sk_update_clone(const struct sock *sk, struct sock *newsk)
-{
- if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
- sock_update_memcg(newsk);
-}
-
/**
* sk_clone_lock - clone a socket, and lock its clone
* @sk: the socket to clone
@@ -1587,7 +1558,8 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
sk_set_socket(newsk, NULL);
newsk->sk_wq = NULL;
- sk_update_clone(sk, newsk);
+ if (mem_cgroup_sockets_enabled && sk->sk_memcg)
+ sock_update_memcg(newsk);
if (newsk->sk_prot->sockets_allocated)
sk_sockets_allocated_inc(newsk);
@@ -2069,27 +2041,27 @@ int __sk_mem_schedule(struct sock *sk, int size, int kind)
struct proto *prot = sk->sk_prot;
int amt = sk_mem_pages(size);
long allocated;
- int parent_status = UNDER_LIMIT;
sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
- allocated = sk_memory_allocated_add(sk, amt, &parent_status);
+ allocated = sk_memory_allocated_add(sk, amt);
+
+ if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
+ !mem_cgroup_charge_skmem(sk->sk_memcg, amt))
+ goto suppress_allocation;
/* Under limit. */
- if (parent_status == UNDER_LIMIT &&
- allocated <= sk_prot_mem_limits(sk, 0)) {
+ if (allocated <= sk_prot_mem_limits(sk, 0)) {
sk_leave_memory_pressure(sk);
return 1;
}
- /* Under pressure. (we or our parents) */
- if ((parent_status > SOFT_LIMIT) ||
- allocated > sk_prot_mem_limits(sk, 1))
+ /* Under pressure. */
+ if (allocated > sk_prot_mem_limits(sk, 1))
sk_enter_memory_pressure(sk);
- /* Over hard limit (we or our parents) */
- if ((parent_status == OVER_LIMIT) ||
- (allocated > sk_prot_mem_limits(sk, 2)))
+ /* Over hard limit. */
+ if (allocated > sk_prot_mem_limits(sk, 2))
goto suppress_allocation;
/* guarantee minimum buffer size under pressure */
@@ -2138,6 +2110,9 @@ suppress_allocation:
sk_memory_allocated_sub(sk, amt);
+ if (mem_cgroup_sockets_enabled && sk->sk_memcg)
+ mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
+
return 0;
}
EXPORT_SYMBOL(__sk_mem_schedule);
@@ -2153,6 +2128,9 @@ void __sk_mem_reclaim(struct sock *sk, int amount)
sk_memory_allocated_sub(sk, amount);
sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
+ if (mem_cgroup_sockets_enabled && sk->sk_memcg)
+ mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
+
if (sk_under_memory_pressure(sk) &&
(sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
sk_leave_memory_pressure(sk);
@@ -2281,7 +2259,7 @@ static void sock_def_wakeup(struct sock *sk)
rcu_read_lock();
wq = rcu_dereference(sk->sk_wq);
- if (wq_has_sleeper(wq))
+ if (skwq_has_sleeper(wq))
wake_up_interruptible_all(&wq->wait);
rcu_read_unlock();
}
@@ -2292,7 +2270,7 @@ static void sock_def_error_report(struct sock *sk)
rcu_read_lock();
wq = rcu_dereference(sk->sk_wq);
- if (wq_has_sleeper(wq))
+ if (skwq_has_sleeper(wq))
wake_up_interruptible_poll(&wq->wait, POLLERR);
sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
rcu_read_unlock();
@@ -2304,7 +2282,7 @@ static void sock_def_readable(struct sock *sk)
rcu_read_lock();
wq = rcu_dereference(sk->sk_wq);
- if (wq_has_sleeper(wq))
+ if (skwq_has_sleeper(wq))
wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
POLLRDNORM | POLLRDBAND);
sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
@@ -2322,7 +2300,7 @@ static void sock_def_write_space(struct sock *sk)
*/
if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
wq = rcu_dereference(sk->sk_wq);
- if (wq_has_sleeper(wq))
+ if (skwq_has_sleeper(wq))
wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
POLLWRNORM | POLLWRBAND);
diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c
index 0c1d58d43..a996ce8c8 100644
--- a/net/core/sock_diag.c
+++ b/net/core/sock_diag.c
@@ -214,7 +214,7 @@ void sock_diag_unregister(const struct sock_diag_handler *hnld)
}
EXPORT_SYMBOL_GPL(sock_diag_unregister);
-static int __sock_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
+static int __sock_diag_cmd(struct sk_buff *skb, struct nlmsghdr *nlh)
{
int err;
struct sock_diag_req *req = nlmsg_data(nlh);
@@ -234,8 +234,12 @@ static int __sock_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
hndl = sock_diag_handlers[req->sdiag_family];
if (hndl == NULL)
err = -ENOENT;
- else
+ else if (nlh->nlmsg_type == SOCK_DIAG_BY_FAMILY)
err = hndl->dump(skb, nlh);
+ else if (nlh->nlmsg_type == SOCK_DESTROY && hndl->destroy)
+ err = hndl->destroy(skb, nlh);
+ else
+ err = -EOPNOTSUPP;
mutex_unlock(&sock_diag_table_mutex);
return err;
@@ -261,7 +265,8 @@ static int sock_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
return ret;
case SOCK_DIAG_BY_FAMILY:
- return __sock_diag_rcv_msg(skb, nlh);
+ case SOCK_DESTROY:
+ return __sock_diag_cmd(skb, nlh);
default:
return -EINVAL;
}
@@ -295,6 +300,18 @@ static int sock_diag_bind(struct net *net, int group)
return 0;
}
+int sock_diag_destroy(struct sock *sk, int err)
+{
+ if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+
+ if (!sk->sk_prot->diag_destroy)
+ return -EOPNOTSUPP;
+
+ return sk->sk_prot->diag_destroy(sk, err);
+}
+EXPORT_SYMBOL_GPL(sock_diag_destroy);
+
static int __net_init diag_net_init(struct net *net)
{
struct netlink_kernel_cfg cfg = {
diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c
new file mode 100644
index 000000000..e92b759d9
--- /dev/null
+++ b/net/core/sock_reuseport.c
@@ -0,0 +1,258 @@
+/*
+ * To speed up listener socket lookup, create an array to store all sockets
+ * listening on the same port. This allows a decision to be made after finding
+ * the first socket. An optional BPF program can also be configured for
+ * selecting the socket index from the array of available sockets.
+ */
+
+#include <net/sock_reuseport.h>
+#include <linux/bpf.h>
+#include <linux/rcupdate.h>
+
+#define INIT_SOCKS 128
+
+static DEFINE_SPINLOCK(reuseport_lock);
+
+static struct sock_reuseport *__reuseport_alloc(u16 max_socks)
+{
+ size_t size = sizeof(struct sock_reuseport) +
+ sizeof(struct sock *) * max_socks;
+ struct sock_reuseport *reuse = kzalloc(size, GFP_ATOMIC);
+
+ if (!reuse)
+ return NULL;
+
+ reuse->max_socks = max_socks;
+
+ RCU_INIT_POINTER(reuse->prog, NULL);
+ return reuse;
+}
+
+int reuseport_alloc(struct sock *sk)
+{
+ struct sock_reuseport *reuse;
+
+ /* bh lock used since this function call may precede hlist lock in
+ * soft irq of receive path or setsockopt from process context
+ */
+ spin_lock_bh(&reuseport_lock);
+ WARN_ONCE(rcu_dereference_protected(sk->sk_reuseport_cb,
+ lockdep_is_held(&reuseport_lock)),
+ "multiple allocations for the same socket");
+ reuse = __reuseport_alloc(INIT_SOCKS);
+ if (!reuse) {
+ spin_unlock_bh(&reuseport_lock);
+ return -ENOMEM;
+ }
+
+ reuse->socks[0] = sk;
+ reuse->num_socks = 1;
+ rcu_assign_pointer(sk->sk_reuseport_cb, reuse);
+
+ spin_unlock_bh(&reuseport_lock);
+
+ return 0;
+}
+EXPORT_SYMBOL(reuseport_alloc);
+
+static struct sock_reuseport *reuseport_grow(struct sock_reuseport *reuse)
+{
+ struct sock_reuseport *more_reuse;
+ u32 more_socks_size, i;
+
+ more_socks_size = reuse->max_socks * 2U;
+ if (more_socks_size > U16_MAX)
+ return NULL;
+
+ more_reuse = __reuseport_alloc(more_socks_size);
+ if (!more_reuse)
+ return NULL;
+
+ more_reuse->max_socks = more_socks_size;
+ more_reuse->num_socks = reuse->num_socks;
+ more_reuse->prog = reuse->prog;
+
+ memcpy(more_reuse->socks, reuse->socks,
+ reuse->num_socks * sizeof(struct sock *));
+
+ for (i = 0; i < reuse->num_socks; ++i)
+ rcu_assign_pointer(reuse->socks[i]->sk_reuseport_cb,
+ more_reuse);
+
+ /* Note: we use kfree_rcu here instead of reuseport_free_rcu so
+ * that reuse and more_reuse can temporarily share a reference
+ * to prog.
+ */
+ kfree_rcu(reuse, rcu);
+ return more_reuse;
+}
+
+/**
+ * reuseport_add_sock - Add a socket to the reuseport group of another.
+ * @sk: New socket to add to the group.
+ * @sk2: Socket belonging to the existing reuseport group.
+ * May return ENOMEM and not add socket to group under memory pressure.
+ */
+int reuseport_add_sock(struct sock *sk, struct sock *sk2)
+{
+ struct sock_reuseport *reuse;
+
+ if (!rcu_access_pointer(sk2->sk_reuseport_cb)) {
+ int err = reuseport_alloc(sk2);
+
+ if (err)
+ return err;
+ }
+
+ spin_lock_bh(&reuseport_lock);
+ reuse = rcu_dereference_protected(sk2->sk_reuseport_cb,
+ lockdep_is_held(&reuseport_lock)),
+ WARN_ONCE(rcu_dereference_protected(sk->sk_reuseport_cb,
+ lockdep_is_held(&reuseport_lock)),
+ "socket already in reuseport group");
+
+ if (reuse->num_socks == reuse->max_socks) {
+ reuse = reuseport_grow(reuse);
+ if (!reuse) {
+ spin_unlock_bh(&reuseport_lock);
+ return -ENOMEM;
+ }
+ }
+
+ reuse->socks[reuse->num_socks] = sk;
+ /* paired with smp_rmb() in reuseport_select_sock() */
+ smp_wmb();
+ reuse->num_socks++;
+ rcu_assign_pointer(sk->sk_reuseport_cb, reuse);
+
+ spin_unlock_bh(&reuseport_lock);
+
+ return 0;
+}
+EXPORT_SYMBOL(reuseport_add_sock);
+
+static void reuseport_free_rcu(struct rcu_head *head)
+{
+ struct sock_reuseport *reuse;
+
+ reuse = container_of(head, struct sock_reuseport, rcu);
+ if (reuse->prog)
+ bpf_prog_destroy(reuse->prog);
+ kfree(reuse);
+}
+
+void reuseport_detach_sock(struct sock *sk)
+{
+ struct sock_reuseport *reuse;
+ int i;
+
+ spin_lock_bh(&reuseport_lock);
+ reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
+ lockdep_is_held(&reuseport_lock));
+ rcu_assign_pointer(sk->sk_reuseport_cb, NULL);
+
+ for (i = 0; i < reuse->num_socks; i++) {
+ if (reuse->socks[i] == sk) {
+ reuse->socks[i] = reuse->socks[reuse->num_socks - 1];
+ reuse->num_socks--;
+ if (reuse->num_socks == 0)
+ call_rcu(&reuse->rcu, reuseport_free_rcu);
+ break;
+ }
+ }
+ spin_unlock_bh(&reuseport_lock);
+}
+EXPORT_SYMBOL(reuseport_detach_sock);
+
+static struct sock *run_bpf(struct sock_reuseport *reuse, u16 socks,
+ struct bpf_prog *prog, struct sk_buff *skb,
+ int hdr_len)
+{
+ struct sk_buff *nskb = NULL;
+ u32 index;
+
+ if (skb_shared(skb)) {
+ nskb = skb_clone(skb, GFP_ATOMIC);
+ if (!nskb)
+ return NULL;
+ skb = nskb;
+ }
+
+ /* temporarily advance data past protocol header */
+ if (!pskb_pull(skb, hdr_len)) {
+ kfree_skb(nskb);
+ return NULL;
+ }
+ index = bpf_prog_run_save_cb(prog, skb);
+ __skb_push(skb, hdr_len);
+
+ consume_skb(nskb);
+
+ if (index >= socks)
+ return NULL;
+
+ return reuse->socks[index];
+}
+
+/**
+ * reuseport_select_sock - Select a socket from an SO_REUSEPORT group.
+ * @sk: First socket in the group.
+ * @hash: When no BPF filter is available, use this hash to select.
+ * @skb: skb to run through BPF filter.
+ * @hdr_len: BPF filter expects skb data pointer at payload data. If
+ * the skb does not yet point at the payload, this parameter represents
+ * how far the pointer needs to advance to reach the payload.
+ * Returns a socket that should receive the packet (or NULL on error).
+ */
+struct sock *reuseport_select_sock(struct sock *sk,
+ u32 hash,
+ struct sk_buff *skb,
+ int hdr_len)
+{
+ struct sock_reuseport *reuse;
+ struct bpf_prog *prog;
+ struct sock *sk2 = NULL;
+ u16 socks;
+
+ rcu_read_lock();
+ reuse = rcu_dereference(sk->sk_reuseport_cb);
+
+ /* if memory allocation failed or add call is not yet complete */
+ if (!reuse)
+ goto out;
+
+ prog = rcu_dereference(reuse->prog);
+ socks = READ_ONCE(reuse->num_socks);
+ if (likely(socks)) {
+ /* paired with smp_wmb() in reuseport_add_sock() */
+ smp_rmb();
+
+ if (prog && skb)
+ sk2 = run_bpf(reuse, socks, prog, skb, hdr_len);
+ else
+ sk2 = reuse->socks[reciprocal_scale(hash, socks)];
+ }
+
+out:
+ rcu_read_unlock();
+ return sk2;
+}
+EXPORT_SYMBOL(reuseport_select_sock);
+
+struct bpf_prog *
+reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog)
+{
+ struct sock_reuseport *reuse;
+ struct bpf_prog *old_prog;
+
+ spin_lock_bh(&reuseport_lock);
+ reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
+ lockdep_is_held(&reuseport_lock));
+ old_prog = rcu_dereference_protected(reuse->prog,
+ lockdep_is_held(&reuseport_lock));
+ rcu_assign_pointer(reuse->prog, prog);
+ spin_unlock_bh(&reuseport_lock);
+
+ return old_prog;
+}
+EXPORT_SYMBOL(reuseport_attach_prog);
diff --git a/net/core/stream.c b/net/core/stream.c
index b96f7a79e..159516a11 100644
--- a/net/core/stream.c
+++ b/net/core/stream.c
@@ -35,7 +35,7 @@ void sk_stream_write_space(struct sock *sk)
rcu_read_lock();
wq = rcu_dereference(sk->sk_wq);
- if (wq_has_sleeper(wq))
+ if (skwq_has_sleeper(wq))
wake_up_interruptible_poll(&wq->wait, POLLOUT |
POLLWRNORM | POLLWRBAND);
if (wq && wq->fasync_list && !(sk->sk_shutdown & SEND_SHUTDOWN))