From 57f0f512b273f60d52568b8c6b77e17f5636edc0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Fabian=20Silva=20Delgado?= Date: Wed, 5 Aug 2015 17:04:01 -0300 Subject: Initial import --- net/openvswitch/Kconfig | 67 + net/openvswitch/Makefile | 20 + net/openvswitch/actions.c | 995 +++++++++++++++ net/openvswitch/datapath.c | 2333 ++++++++++++++++++++++++++++++++++ net/openvswitch/datapath.h | 205 +++ net/openvswitch/dp_notify.c | 102 ++ net/openvswitch/flow.c | 730 +++++++++++ net/openvswitch/flow.h | 284 +++++ net/openvswitch/flow_netlink.c | 2309 +++++++++++++++++++++++++++++++++ net/openvswitch/flow_netlink.h | 73 ++ net/openvswitch/flow_table.c | 778 ++++++++++++ net/openvswitch/flow_table.h | 90 ++ net/openvswitch/vport-geneve.c | 274 ++++ net/openvswitch/vport-gre.c | 313 +++++ net/openvswitch/vport-internal_dev.c | 288 +++++ net/openvswitch/vport-internal_dev.h | 30 + net/openvswitch/vport-netdev.c | 246 ++++ net/openvswitch/vport-netdev.h | 47 + net/openvswitch/vport-vxlan.c | 322 +++++ net/openvswitch/vport-vxlan.h | 11 + net/openvswitch/vport.c | 627 +++++++++ net/openvswitch/vport.h | 259 ++++ 22 files changed, 10403 insertions(+) create mode 100644 net/openvswitch/Kconfig create mode 100644 net/openvswitch/Makefile create mode 100644 net/openvswitch/actions.c create mode 100644 net/openvswitch/datapath.c create mode 100644 net/openvswitch/datapath.h create mode 100644 net/openvswitch/dp_notify.c create mode 100644 net/openvswitch/flow.c create mode 100644 net/openvswitch/flow.h create mode 100644 net/openvswitch/flow_netlink.c create mode 100644 net/openvswitch/flow_netlink.h create mode 100644 net/openvswitch/flow_table.c create mode 100644 net/openvswitch/flow_table.h create mode 100644 net/openvswitch/vport-geneve.c create mode 100644 net/openvswitch/vport-gre.c create mode 100644 net/openvswitch/vport-internal_dev.c create mode 100644 net/openvswitch/vport-internal_dev.h create mode 100644 net/openvswitch/vport-netdev.c create mode 100644 net/openvswitch/vport-netdev.h create mode 100644 net/openvswitch/vport-vxlan.c create mode 100644 net/openvswitch/vport-vxlan.h create mode 100644 net/openvswitch/vport.c create mode 100644 net/openvswitch/vport.h (limited to 'net/openvswitch') diff --git a/net/openvswitch/Kconfig b/net/openvswitch/Kconfig new file mode 100644 index 000000000..ed6b0f8dd --- /dev/null +++ b/net/openvswitch/Kconfig @@ -0,0 +1,67 @@ +# +# Open vSwitch +# + +config OPENVSWITCH + tristate "Open vSwitch" + depends on INET + select LIBCRC32C + select MPLS + select NET_MPLS_GSO + ---help--- + Open vSwitch is a multilayer Ethernet switch targeted at virtualized + environments. In addition to supporting a variety of features + expected in a traditional hardware switch, it enables fine-grained + programmatic extension and flow-based control of the network. This + control is useful in a wide variety of applications but is + particularly important in multi-server virtualization deployments, + which are often characterized by highly dynamic endpoints and the + need to maintain logical abstractions for multiple tenants. + + The Open vSwitch datapath provides an in-kernel fast path for packet + forwarding. It is complemented by a userspace daemon, ovs-vswitchd, + which is able to accept configuration from a variety of sources and + translate it into packet processing rules. + + See http://openvswitch.org for more information and userspace + utilities. + + To compile this code as a module, choose M here: the module will be + called openvswitch. + + If unsure, say N. + +config OPENVSWITCH_GRE + tristate "Open vSwitch GRE tunneling support" + depends on OPENVSWITCH + depends on NET_IPGRE_DEMUX + default OPENVSWITCH + ---help--- + If you say Y here, then the Open vSwitch will be able create GRE + vport. + + Say N to exclude this support and reduce the binary size. + + If unsure, say Y. + +config OPENVSWITCH_VXLAN + tristate "Open vSwitch VXLAN tunneling support" + depends on OPENVSWITCH + depends on VXLAN + default OPENVSWITCH + ---help--- + If you say Y here, then the Open vSwitch will be able create vxlan vport. + + Say N to exclude this support and reduce the binary size. + + If unsure, say Y. + +config OPENVSWITCH_GENEVE + tristate "Open vSwitch Geneve tunneling support" + depends on OPENVSWITCH + depends on GENEVE + default OPENVSWITCH + ---help--- + If you say Y here, then the Open vSwitch will be able create geneve vport. + + Say N to exclude this support and reduce the binary size. diff --git a/net/openvswitch/Makefile b/net/openvswitch/Makefile new file mode 100644 index 000000000..91b947841 --- /dev/null +++ b/net/openvswitch/Makefile @@ -0,0 +1,20 @@ +# +# Makefile for Open vSwitch. +# + +obj-$(CONFIG_OPENVSWITCH) += openvswitch.o + +openvswitch-y := \ + actions.o \ + datapath.o \ + dp_notify.o \ + flow.o \ + flow_netlink.o \ + flow_table.o \ + vport.o \ + vport-internal_dev.o \ + vport-netdev.o + +obj-$(CONFIG_OPENVSWITCH_GENEVE)+= vport-geneve.o +obj-$(CONFIG_OPENVSWITCH_VXLAN) += vport-vxlan.o +obj-$(CONFIG_OPENVSWITCH_GRE) += vport-gre.o diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c new file mode 100644 index 000000000..b491c1c29 --- /dev/null +++ b/net/openvswitch/actions.c @@ -0,0 +1,995 @@ +/* + * Copyright (c) 2007-2014 Nicira, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "datapath.h" +#include "flow.h" +#include "vport.h" + +static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, + struct sw_flow_key *key, + const struct nlattr *attr, int len); + +struct deferred_action { + struct sk_buff *skb; + const struct nlattr *actions; + + /* Store pkt_key clone when creating deferred action. */ + struct sw_flow_key pkt_key; +}; + +#define DEFERRED_ACTION_FIFO_SIZE 10 +struct action_fifo { + int head; + int tail; + /* Deferred action fifo queue storage. */ + struct deferred_action fifo[DEFERRED_ACTION_FIFO_SIZE]; +}; + +static struct action_fifo __percpu *action_fifos; +static DEFINE_PER_CPU(int, exec_actions_level); + +static void action_fifo_init(struct action_fifo *fifo) +{ + fifo->head = 0; + fifo->tail = 0; +} + +static bool action_fifo_is_empty(const struct action_fifo *fifo) +{ + return (fifo->head == fifo->tail); +} + +static struct deferred_action *action_fifo_get(struct action_fifo *fifo) +{ + if (action_fifo_is_empty(fifo)) + return NULL; + + return &fifo->fifo[fifo->tail++]; +} + +static struct deferred_action *action_fifo_put(struct action_fifo *fifo) +{ + if (fifo->head >= DEFERRED_ACTION_FIFO_SIZE - 1) + return NULL; + + return &fifo->fifo[fifo->head++]; +} + +/* Return true if fifo is not full */ +static struct deferred_action *add_deferred_actions(struct sk_buff *skb, + const struct sw_flow_key *key, + const struct nlattr *attr) +{ + struct action_fifo *fifo; + struct deferred_action *da; + + fifo = this_cpu_ptr(action_fifos); + da = action_fifo_put(fifo); + if (da) { + da->skb = skb; + da->actions = attr; + da->pkt_key = *key; + } + + return da; +} + +static void invalidate_flow_key(struct sw_flow_key *key) +{ + key->eth.type = htons(0); +} + +static bool is_flow_key_valid(const struct sw_flow_key *key) +{ + return !!key->eth.type; +} + +static int push_mpls(struct sk_buff *skb, struct sw_flow_key *key, + const struct ovs_action_push_mpls *mpls) +{ + __be32 *new_mpls_lse; + struct ethhdr *hdr; + + /* Networking stack do not allow simultaneous Tunnel and MPLS GSO. */ + if (skb->encapsulation) + return -ENOTSUPP; + + if (skb_cow_head(skb, MPLS_HLEN) < 0) + return -ENOMEM; + + skb_push(skb, MPLS_HLEN); + memmove(skb_mac_header(skb) - MPLS_HLEN, skb_mac_header(skb), + skb->mac_len); + skb_reset_mac_header(skb); + + new_mpls_lse = (__be32 *)skb_mpls_header(skb); + *new_mpls_lse = mpls->mpls_lse; + + if (skb->ip_summed == CHECKSUM_COMPLETE) + skb->csum = csum_add(skb->csum, csum_partial(new_mpls_lse, + MPLS_HLEN, 0)); + + hdr = eth_hdr(skb); + hdr->h_proto = mpls->mpls_ethertype; + + if (!skb->inner_protocol) + skb_set_inner_protocol(skb, skb->protocol); + skb->protocol = mpls->mpls_ethertype; + + invalidate_flow_key(key); + return 0; +} + +static int pop_mpls(struct sk_buff *skb, struct sw_flow_key *key, + const __be16 ethertype) +{ + struct ethhdr *hdr; + int err; + + err = skb_ensure_writable(skb, skb->mac_len + MPLS_HLEN); + if (unlikely(err)) + return err; + + skb_postpull_rcsum(skb, skb_mpls_header(skb), MPLS_HLEN); + + memmove(skb_mac_header(skb) + MPLS_HLEN, skb_mac_header(skb), + skb->mac_len); + + __skb_pull(skb, MPLS_HLEN); + skb_reset_mac_header(skb); + + /* skb_mpls_header() is used to locate the ethertype + * field correctly in the presence of VLAN tags. + */ + hdr = (struct ethhdr *)(skb_mpls_header(skb) - ETH_HLEN); + hdr->h_proto = ethertype; + if (eth_p_mpls(skb->protocol)) + skb->protocol = ethertype; + + invalidate_flow_key(key); + return 0; +} + +/* 'KEY' must not have any bits set outside of the 'MASK' */ +#define MASKED(OLD, KEY, MASK) ((KEY) | ((OLD) & ~(MASK))) +#define SET_MASKED(OLD, KEY, MASK) ((OLD) = MASKED(OLD, KEY, MASK)) + +static int set_mpls(struct sk_buff *skb, struct sw_flow_key *flow_key, + const __be32 *mpls_lse, const __be32 *mask) +{ + __be32 *stack; + __be32 lse; + int err; + + err = skb_ensure_writable(skb, skb->mac_len + MPLS_HLEN); + if (unlikely(err)) + return err; + + stack = (__be32 *)skb_mpls_header(skb); + lse = MASKED(*stack, *mpls_lse, *mask); + if (skb->ip_summed == CHECKSUM_COMPLETE) { + __be32 diff[] = { ~(*stack), lse }; + + skb->csum = ~csum_partial((char *)diff, sizeof(diff), + ~skb->csum); + } + + *stack = lse; + flow_key->mpls.top_lse = lse; + return 0; +} + +static int pop_vlan(struct sk_buff *skb, struct sw_flow_key *key) +{ + int err; + + err = skb_vlan_pop(skb); + if (skb_vlan_tag_present(skb)) + invalidate_flow_key(key); + else + key->eth.tci = 0; + return err; +} + +static int push_vlan(struct sk_buff *skb, struct sw_flow_key *key, + const struct ovs_action_push_vlan *vlan) +{ + if (skb_vlan_tag_present(skb)) + invalidate_flow_key(key); + else + key->eth.tci = vlan->vlan_tci; + return skb_vlan_push(skb, vlan->vlan_tpid, + ntohs(vlan->vlan_tci) & ~VLAN_TAG_PRESENT); +} + +/* 'src' is already properly masked. */ +static void ether_addr_copy_masked(u8 *dst_, const u8 *src_, const u8 *mask_) +{ + u16 *dst = (u16 *)dst_; + const u16 *src = (const u16 *)src_; + const u16 *mask = (const u16 *)mask_; + + SET_MASKED(dst[0], src[0], mask[0]); + SET_MASKED(dst[1], src[1], mask[1]); + SET_MASKED(dst[2], src[2], mask[2]); +} + +static int set_eth_addr(struct sk_buff *skb, struct sw_flow_key *flow_key, + const struct ovs_key_ethernet *key, + const struct ovs_key_ethernet *mask) +{ + int err; + + err = skb_ensure_writable(skb, ETH_HLEN); + if (unlikely(err)) + return err; + + skb_postpull_rcsum(skb, eth_hdr(skb), ETH_ALEN * 2); + + ether_addr_copy_masked(eth_hdr(skb)->h_source, key->eth_src, + mask->eth_src); + ether_addr_copy_masked(eth_hdr(skb)->h_dest, key->eth_dst, + mask->eth_dst); + + ovs_skb_postpush_rcsum(skb, eth_hdr(skb), ETH_ALEN * 2); + + ether_addr_copy(flow_key->eth.src, eth_hdr(skb)->h_source); + ether_addr_copy(flow_key->eth.dst, eth_hdr(skb)->h_dest); + return 0; +} + +static void set_ip_addr(struct sk_buff *skb, struct iphdr *nh, + __be32 *addr, __be32 new_addr) +{ + int transport_len = skb->len - skb_transport_offset(skb); + + if (nh->protocol == IPPROTO_TCP) { + if (likely(transport_len >= sizeof(struct tcphdr))) + inet_proto_csum_replace4(&tcp_hdr(skb)->check, skb, + *addr, new_addr, 1); + } else if (nh->protocol == IPPROTO_UDP) { + if (likely(transport_len >= sizeof(struct udphdr))) { + struct udphdr *uh = udp_hdr(skb); + + if (uh->check || skb->ip_summed == CHECKSUM_PARTIAL) { + inet_proto_csum_replace4(&uh->check, skb, + *addr, new_addr, 1); + if (!uh->check) + uh->check = CSUM_MANGLED_0; + } + } + } + + csum_replace4(&nh->check, *addr, new_addr); + skb_clear_hash(skb); + *addr = new_addr; +} + +static void update_ipv6_checksum(struct sk_buff *skb, u8 l4_proto, + __be32 addr[4], const __be32 new_addr[4]) +{ + int transport_len = skb->len - skb_transport_offset(skb); + + if (l4_proto == NEXTHDR_TCP) { + if (likely(transport_len >= sizeof(struct tcphdr))) + inet_proto_csum_replace16(&tcp_hdr(skb)->check, skb, + addr, new_addr, 1); + } else if (l4_proto == NEXTHDR_UDP) { + if (likely(transport_len >= sizeof(struct udphdr))) { + struct udphdr *uh = udp_hdr(skb); + + if (uh->check || skb->ip_summed == CHECKSUM_PARTIAL) { + inet_proto_csum_replace16(&uh->check, skb, + addr, new_addr, 1); + if (!uh->check) + uh->check = CSUM_MANGLED_0; + } + } + } else if (l4_proto == NEXTHDR_ICMP) { + if (likely(transport_len >= sizeof(struct icmp6hdr))) + inet_proto_csum_replace16(&icmp6_hdr(skb)->icmp6_cksum, + skb, addr, new_addr, 1); + } +} + +static void mask_ipv6_addr(const __be32 old[4], const __be32 addr[4], + const __be32 mask[4], __be32 masked[4]) +{ + masked[0] = MASKED(old[0], addr[0], mask[0]); + masked[1] = MASKED(old[1], addr[1], mask[1]); + masked[2] = MASKED(old[2], addr[2], mask[2]); + masked[3] = MASKED(old[3], addr[3], mask[3]); +} + +static void set_ipv6_addr(struct sk_buff *skb, u8 l4_proto, + __be32 addr[4], const __be32 new_addr[4], + bool recalculate_csum) +{ + if (recalculate_csum) + update_ipv6_checksum(skb, l4_proto, addr, new_addr); + + skb_clear_hash(skb); + memcpy(addr, new_addr, sizeof(__be32[4])); +} + +static void set_ipv6_fl(struct ipv6hdr *nh, u32 fl, u32 mask) +{ + /* Bits 21-24 are always unmasked, so this retains their values. */ + SET_MASKED(nh->flow_lbl[0], (u8)(fl >> 16), (u8)(mask >> 16)); + SET_MASKED(nh->flow_lbl[1], (u8)(fl >> 8), (u8)(mask >> 8)); + SET_MASKED(nh->flow_lbl[2], (u8)fl, (u8)mask); +} + +static void set_ip_ttl(struct sk_buff *skb, struct iphdr *nh, u8 new_ttl, + u8 mask) +{ + new_ttl = MASKED(nh->ttl, new_ttl, mask); + + csum_replace2(&nh->check, htons(nh->ttl << 8), htons(new_ttl << 8)); + nh->ttl = new_ttl; +} + +static int set_ipv4(struct sk_buff *skb, struct sw_flow_key *flow_key, + const struct ovs_key_ipv4 *key, + const struct ovs_key_ipv4 *mask) +{ + struct iphdr *nh; + __be32 new_addr; + int err; + + err = skb_ensure_writable(skb, skb_network_offset(skb) + + sizeof(struct iphdr)); + if (unlikely(err)) + return err; + + nh = ip_hdr(skb); + + /* Setting an IP addresses is typically only a side effect of + * matching on them in the current userspace implementation, so it + * makes sense to check if the value actually changed. + */ + if (mask->ipv4_src) { + new_addr = MASKED(nh->saddr, key->ipv4_src, mask->ipv4_src); + + if (unlikely(new_addr != nh->saddr)) { + set_ip_addr(skb, nh, &nh->saddr, new_addr); + flow_key->ipv4.addr.src = new_addr; + } + } + if (mask->ipv4_dst) { + new_addr = MASKED(nh->daddr, key->ipv4_dst, mask->ipv4_dst); + + if (unlikely(new_addr != nh->daddr)) { + set_ip_addr(skb, nh, &nh->daddr, new_addr); + flow_key->ipv4.addr.dst = new_addr; + } + } + if (mask->ipv4_tos) { + ipv4_change_dsfield(nh, ~mask->ipv4_tos, key->ipv4_tos); + flow_key->ip.tos = nh->tos; + } + if (mask->ipv4_ttl) { + set_ip_ttl(skb, nh, key->ipv4_ttl, mask->ipv4_ttl); + flow_key->ip.ttl = nh->ttl; + } + + return 0; +} + +static bool is_ipv6_mask_nonzero(const __be32 addr[4]) +{ + return !!(addr[0] | addr[1] | addr[2] | addr[3]); +} + +static int set_ipv6(struct sk_buff *skb, struct sw_flow_key *flow_key, + const struct ovs_key_ipv6 *key, + const struct ovs_key_ipv6 *mask) +{ + struct ipv6hdr *nh; + int err; + + err = skb_ensure_writable(skb, skb_network_offset(skb) + + sizeof(struct ipv6hdr)); + if (unlikely(err)) + return err; + + nh = ipv6_hdr(skb); + + /* Setting an IP addresses is typically only a side effect of + * matching on them in the current userspace implementation, so it + * makes sense to check if the value actually changed. + */ + if (is_ipv6_mask_nonzero(mask->ipv6_src)) { + __be32 *saddr = (__be32 *)&nh->saddr; + __be32 masked[4]; + + mask_ipv6_addr(saddr, key->ipv6_src, mask->ipv6_src, masked); + + if (unlikely(memcmp(saddr, masked, sizeof(masked)))) { + set_ipv6_addr(skb, key->ipv6_proto, saddr, masked, + true); + memcpy(&flow_key->ipv6.addr.src, masked, + sizeof(flow_key->ipv6.addr.src)); + } + } + if (is_ipv6_mask_nonzero(mask->ipv6_dst)) { + unsigned int offset = 0; + int flags = IP6_FH_F_SKIP_RH; + bool recalc_csum = true; + __be32 *daddr = (__be32 *)&nh->daddr; + __be32 masked[4]; + + mask_ipv6_addr(daddr, key->ipv6_dst, mask->ipv6_dst, masked); + + if (unlikely(memcmp(daddr, masked, sizeof(masked)))) { + if (ipv6_ext_hdr(nh->nexthdr)) + recalc_csum = (ipv6_find_hdr(skb, &offset, + NEXTHDR_ROUTING, + NULL, &flags) + != NEXTHDR_ROUTING); + + set_ipv6_addr(skb, key->ipv6_proto, daddr, masked, + recalc_csum); + memcpy(&flow_key->ipv6.addr.dst, masked, + sizeof(flow_key->ipv6.addr.dst)); + } + } + if (mask->ipv6_tclass) { + ipv6_change_dsfield(nh, ~mask->ipv6_tclass, key->ipv6_tclass); + flow_key->ip.tos = ipv6_get_dsfield(nh); + } + if (mask->ipv6_label) { + set_ipv6_fl(nh, ntohl(key->ipv6_label), + ntohl(mask->ipv6_label)); + flow_key->ipv6.label = + *(__be32 *)nh & htonl(IPV6_FLOWINFO_FLOWLABEL); + } + if (mask->ipv6_hlimit) { + SET_MASKED(nh->hop_limit, key->ipv6_hlimit, mask->ipv6_hlimit); + flow_key->ip.ttl = nh->hop_limit; + } + return 0; +} + +/* Must follow skb_ensure_writable() since that can move the skb data. */ +static void set_tp_port(struct sk_buff *skb, __be16 *port, + __be16 new_port, __sum16 *check) +{ + inet_proto_csum_replace2(check, skb, *port, new_port, 0); + *port = new_port; +} + +static int set_udp(struct sk_buff *skb, struct sw_flow_key *flow_key, + const struct ovs_key_udp *key, + const struct ovs_key_udp *mask) +{ + struct udphdr *uh; + __be16 src, dst; + int err; + + err = skb_ensure_writable(skb, skb_transport_offset(skb) + + sizeof(struct udphdr)); + if (unlikely(err)) + return err; + + uh = udp_hdr(skb); + /* Either of the masks is non-zero, so do not bother checking them. */ + src = MASKED(uh->source, key->udp_src, mask->udp_src); + dst = MASKED(uh->dest, key->udp_dst, mask->udp_dst); + + if (uh->check && skb->ip_summed != CHECKSUM_PARTIAL) { + if (likely(src != uh->source)) { + set_tp_port(skb, &uh->source, src, &uh->check); + flow_key->tp.src = src; + } + if (likely(dst != uh->dest)) { + set_tp_port(skb, &uh->dest, dst, &uh->check); + flow_key->tp.dst = dst; + } + + if (unlikely(!uh->check)) + uh->check = CSUM_MANGLED_0; + } else { + uh->source = src; + uh->dest = dst; + flow_key->tp.src = src; + flow_key->tp.dst = dst; + } + + skb_clear_hash(skb); + + return 0; +} + +static int set_tcp(struct sk_buff *skb, struct sw_flow_key *flow_key, + const struct ovs_key_tcp *key, + const struct ovs_key_tcp *mask) +{ + struct tcphdr *th; + __be16 src, dst; + int err; + + err = skb_ensure_writable(skb, skb_transport_offset(skb) + + sizeof(struct tcphdr)); + if (unlikely(err)) + return err; + + th = tcp_hdr(skb); + src = MASKED(th->source, key->tcp_src, mask->tcp_src); + if (likely(src != th->source)) { + set_tp_port(skb, &th->source, src, &th->check); + flow_key->tp.src = src; + } + dst = MASKED(th->dest, key->tcp_dst, mask->tcp_dst); + if (likely(dst != th->dest)) { + set_tp_port(skb, &th->dest, dst, &th->check); + flow_key->tp.dst = dst; + } + skb_clear_hash(skb); + + return 0; +} + +static int set_sctp(struct sk_buff *skb, struct sw_flow_key *flow_key, + const struct ovs_key_sctp *key, + const struct ovs_key_sctp *mask) +{ + unsigned int sctphoff = skb_transport_offset(skb); + struct sctphdr *sh; + __le32 old_correct_csum, new_csum, old_csum; + int err; + + err = skb_ensure_writable(skb, sctphoff + sizeof(struct sctphdr)); + if (unlikely(err)) + return err; + + sh = sctp_hdr(skb); + old_csum = sh->checksum; + old_correct_csum = sctp_compute_cksum(skb, sctphoff); + + sh->source = MASKED(sh->source, key->sctp_src, mask->sctp_src); + sh->dest = MASKED(sh->dest, key->sctp_dst, mask->sctp_dst); + + new_csum = sctp_compute_cksum(skb, sctphoff); + + /* Carry any checksum errors through. */ + sh->checksum = old_csum ^ old_correct_csum ^ new_csum; + + skb_clear_hash(skb); + flow_key->tp.src = sh->source; + flow_key->tp.dst = sh->dest; + + return 0; +} + +static void do_output(struct datapath *dp, struct sk_buff *skb, int out_port) +{ + struct vport *vport = ovs_vport_rcu(dp, out_port); + + if (likely(vport)) + ovs_vport_send(vport, skb); + else + kfree_skb(skb); +} + +static int output_userspace(struct datapath *dp, struct sk_buff *skb, + struct sw_flow_key *key, const struct nlattr *attr) +{ + struct ovs_tunnel_info info; + struct dp_upcall_info upcall; + const struct nlattr *a; + int rem; + + upcall.cmd = OVS_PACKET_CMD_ACTION; + upcall.userdata = NULL; + upcall.portid = 0; + upcall.egress_tun_info = NULL; + + for (a = nla_data(attr), rem = nla_len(attr); rem > 0; + a = nla_next(a, &rem)) { + switch (nla_type(a)) { + case OVS_USERSPACE_ATTR_USERDATA: + upcall.userdata = a; + break; + + case OVS_USERSPACE_ATTR_PID: + upcall.portid = nla_get_u32(a); + break; + + case OVS_USERSPACE_ATTR_EGRESS_TUN_PORT: { + /* Get out tunnel info. */ + struct vport *vport; + + vport = ovs_vport_rcu(dp, nla_get_u32(a)); + if (vport) { + int err; + + err = ovs_vport_get_egress_tun_info(vport, skb, + &info); + if (!err) + upcall.egress_tun_info = &info; + } + break; + } + + } /* End of switch. */ + } + + return ovs_dp_upcall(dp, skb, key, &upcall); +} + +static int sample(struct datapath *dp, struct sk_buff *skb, + struct sw_flow_key *key, const struct nlattr *attr) +{ + const struct nlattr *acts_list = NULL; + const struct nlattr *a; + int rem; + + for (a = nla_data(attr), rem = nla_len(attr); rem > 0; + a = nla_next(a, &rem)) { + switch (nla_type(a)) { + case OVS_SAMPLE_ATTR_PROBABILITY: + if (prandom_u32() >= nla_get_u32(a)) + return 0; + break; + + case OVS_SAMPLE_ATTR_ACTIONS: + acts_list = a; + break; + } + } + + rem = nla_len(acts_list); + a = nla_data(acts_list); + + /* Actions list is empty, do nothing */ + if (unlikely(!rem)) + return 0; + + /* The only known usage of sample action is having a single user-space + * action. Treat this usage as a special case. + * The output_userspace() should clone the skb to be sent to the + * user space. This skb will be consumed by its caller. + */ + if (likely(nla_type(a) == OVS_ACTION_ATTR_USERSPACE && + nla_is_last(a, rem))) + return output_userspace(dp, skb, key, a); + + skb = skb_clone(skb, GFP_ATOMIC); + if (!skb) + /* Skip the sample action when out of memory. */ + return 0; + + if (!add_deferred_actions(skb, key, a)) { + if (net_ratelimit()) + pr_warn("%s: deferred actions limit reached, dropping sample action\n", + ovs_dp_name(dp)); + + kfree_skb(skb); + } + return 0; +} + +static void execute_hash(struct sk_buff *skb, struct sw_flow_key *key, + const struct nlattr *attr) +{ + struct ovs_action_hash *hash_act = nla_data(attr); + u32 hash = 0; + + /* OVS_HASH_ALG_L4 is the only possible hash algorithm. */ + hash = skb_get_hash(skb); + hash = jhash_1word(hash, hash_act->hash_basis); + if (!hash) + hash = 0x1; + + key->ovs_flow_hash = hash; +} + +static int execute_set_action(struct sk_buff *skb, + struct sw_flow_key *flow_key, + const struct nlattr *a) +{ + /* Only tunnel set execution is supported without a mask. */ + if (nla_type(a) == OVS_KEY_ATTR_TUNNEL_INFO) { + OVS_CB(skb)->egress_tun_info = nla_data(a); + return 0; + } + + return -EINVAL; +} + +/* Mask is at the midpoint of the data. */ +#define get_mask(a, type) ((const type)nla_data(a) + 1) + +static int execute_masked_set_action(struct sk_buff *skb, + struct sw_flow_key *flow_key, + const struct nlattr *a) +{ + int err = 0; + + switch (nla_type(a)) { + case OVS_KEY_ATTR_PRIORITY: + SET_MASKED(skb->priority, nla_get_u32(a), *get_mask(a, u32 *)); + flow_key->phy.priority = skb->priority; + break; + + case OVS_KEY_ATTR_SKB_MARK: + SET_MASKED(skb->mark, nla_get_u32(a), *get_mask(a, u32 *)); + flow_key->phy.skb_mark = skb->mark; + break; + + case OVS_KEY_ATTR_TUNNEL_INFO: + /* Masked data not supported for tunnel. */ + err = -EINVAL; + break; + + case OVS_KEY_ATTR_ETHERNET: + err = set_eth_addr(skb, flow_key, nla_data(a), + get_mask(a, struct ovs_key_ethernet *)); + break; + + case OVS_KEY_ATTR_IPV4: + err = set_ipv4(skb, flow_key, nla_data(a), + get_mask(a, struct ovs_key_ipv4 *)); + break; + + case OVS_KEY_ATTR_IPV6: + err = set_ipv6(skb, flow_key, nla_data(a), + get_mask(a, struct ovs_key_ipv6 *)); + break; + + case OVS_KEY_ATTR_TCP: + err = set_tcp(skb, flow_key, nla_data(a), + get_mask(a, struct ovs_key_tcp *)); + break; + + case OVS_KEY_ATTR_UDP: + err = set_udp(skb, flow_key, nla_data(a), + get_mask(a, struct ovs_key_udp *)); + break; + + case OVS_KEY_ATTR_SCTP: + err = set_sctp(skb, flow_key, nla_data(a), + get_mask(a, struct ovs_key_sctp *)); + break; + + case OVS_KEY_ATTR_MPLS: + err = set_mpls(skb, flow_key, nla_data(a), get_mask(a, + __be32 *)); + break; + } + + return err; +} + +static int execute_recirc(struct datapath *dp, struct sk_buff *skb, + struct sw_flow_key *key, + const struct nlattr *a, int rem) +{ + struct deferred_action *da; + + if (!is_flow_key_valid(key)) { + int err; + + err = ovs_flow_key_update(skb, key); + if (err) + return err; + } + BUG_ON(!is_flow_key_valid(key)); + + if (!nla_is_last(a, rem)) { + /* Recirc action is the not the last action + * of the action list, need to clone the skb. + */ + skb = skb_clone(skb, GFP_ATOMIC); + + /* Skip the recirc action when out of memory, but + * continue on with the rest of the action list. + */ + if (!skb) + return 0; + } + + da = add_deferred_actions(skb, key, NULL); + if (da) { + da->pkt_key.recirc_id = nla_get_u32(a); + } else { + kfree_skb(skb); + + if (net_ratelimit()) + pr_warn("%s: deferred action limit reached, drop recirc action\n", + ovs_dp_name(dp)); + } + + return 0; +} + +/* Execute a list of actions against 'skb'. */ +static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, + struct sw_flow_key *key, + const struct nlattr *attr, int len) +{ + /* Every output action needs a separate clone of 'skb', but the common + * case is just a single output action, so that doing a clone and + * then freeing the original skbuff is wasteful. So the following code + * is slightly obscure just to avoid that. + */ + int prev_port = -1; + const struct nlattr *a; + int rem; + + for (a = attr, rem = len; rem > 0; + a = nla_next(a, &rem)) { + int err = 0; + + if (unlikely(prev_port != -1)) { + struct sk_buff *out_skb = skb_clone(skb, GFP_ATOMIC); + + if (out_skb) + do_output(dp, out_skb, prev_port); + + prev_port = -1; + } + + switch (nla_type(a)) { + case OVS_ACTION_ATTR_OUTPUT: + prev_port = nla_get_u32(a); + break; + + case OVS_ACTION_ATTR_USERSPACE: + output_userspace(dp, skb, key, a); + break; + + case OVS_ACTION_ATTR_HASH: + execute_hash(skb, key, a); + break; + + case OVS_ACTION_ATTR_PUSH_MPLS: + err = push_mpls(skb, key, nla_data(a)); + break; + + case OVS_ACTION_ATTR_POP_MPLS: + err = pop_mpls(skb, key, nla_get_be16(a)); + break; + + case OVS_ACTION_ATTR_PUSH_VLAN: + err = push_vlan(skb, key, nla_data(a)); + break; + + case OVS_ACTION_ATTR_POP_VLAN: + err = pop_vlan(skb, key); + break; + + case OVS_ACTION_ATTR_RECIRC: + err = execute_recirc(dp, skb, key, a, rem); + if (nla_is_last(a, rem)) { + /* If this is the last action, the skb has + * been consumed or freed. + * Return immediately. + */ + return err; + } + break; + + case OVS_ACTION_ATTR_SET: + err = execute_set_action(skb, key, nla_data(a)); + break; + + case OVS_ACTION_ATTR_SET_MASKED: + case OVS_ACTION_ATTR_SET_TO_MASKED: + err = execute_masked_set_action(skb, key, nla_data(a)); + break; + + case OVS_ACTION_ATTR_SAMPLE: + err = sample(dp, skb, key, a); + break; + } + + if (unlikely(err)) { + kfree_skb(skb); + return err; + } + } + + if (prev_port != -1) + do_output(dp, skb, prev_port); + else + consume_skb(skb); + + return 0; +} + +static void process_deferred_actions(struct datapath *dp) +{ + struct action_fifo *fifo = this_cpu_ptr(action_fifos); + + /* Do not touch the FIFO in case there is no deferred actions. */ + if (action_fifo_is_empty(fifo)) + return; + + /* Finishing executing all deferred actions. */ + do { + struct deferred_action *da = action_fifo_get(fifo); + struct sk_buff *skb = da->skb; + struct sw_flow_key *key = &da->pkt_key; + const struct nlattr *actions = da->actions; + + if (actions) + do_execute_actions(dp, skb, key, actions, + nla_len(actions)); + else + ovs_dp_process_packet(skb, key); + } while (!action_fifo_is_empty(fifo)); + + /* Reset FIFO for the next packet. */ + action_fifo_init(fifo); +} + +/* Execute a list of actions against 'skb'. */ +int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb, + const struct sw_flow_actions *acts, + struct sw_flow_key *key) +{ + int level = this_cpu_read(exec_actions_level); + int err; + + this_cpu_inc(exec_actions_level); + OVS_CB(skb)->egress_tun_info = NULL; + err = do_execute_actions(dp, skb, key, + acts->actions, acts->actions_len); + + if (!level) + process_deferred_actions(dp); + + this_cpu_dec(exec_actions_level); + return err; +} + +int action_fifos_init(void) +{ + action_fifos = alloc_percpu(struct action_fifo); + if (!action_fifos) + return -ENOMEM; + + return 0; +} + +void action_fifos_exit(void) +{ + free_percpu(action_fifos); +} diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c new file mode 100644 index 000000000..096c6276e --- /dev/null +++ b/net/openvswitch/datapath.c @@ -0,0 +1,2333 @@ +/* + * Copyright (c) 2007-2014 Nicira, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "datapath.h" +#include "flow.h" +#include "flow_table.h" +#include "flow_netlink.h" +#include "vport-internal_dev.h" +#include "vport-netdev.h" + +int ovs_net_id __read_mostly; +EXPORT_SYMBOL_GPL(ovs_net_id); + +static struct genl_family dp_packet_genl_family; +static struct genl_family dp_flow_genl_family; +static struct genl_family dp_datapath_genl_family; + +static const struct nla_policy flow_policy[]; + +static const struct genl_multicast_group ovs_dp_flow_multicast_group = { + .name = OVS_FLOW_MCGROUP, +}; + +static const struct genl_multicast_group ovs_dp_datapath_multicast_group = { + .name = OVS_DATAPATH_MCGROUP, +}; + +static const struct genl_multicast_group ovs_dp_vport_multicast_group = { + .name = OVS_VPORT_MCGROUP, +}; + +/* Check if need to build a reply message. + * OVS userspace sets the NLM_F_ECHO flag if it needs the reply. */ +static bool ovs_must_notify(struct genl_family *family, struct genl_info *info, + unsigned int group) +{ + return info->nlhdr->nlmsg_flags & NLM_F_ECHO || + genl_has_listeners(family, genl_info_net(info), group); +} + +static void ovs_notify(struct genl_family *family, + struct sk_buff *skb, struct genl_info *info) +{ + genl_notify(family, skb, genl_info_net(info), info->snd_portid, + 0, info->nlhdr, GFP_KERNEL); +} + +/** + * DOC: Locking: + * + * All writes e.g. Writes to device state (add/remove datapath, port, set + * operations on vports, etc.), Writes to other state (flow table + * modifications, set miscellaneous datapath parameters, etc.) are protected + * by ovs_lock. + * + * Reads are protected by RCU. + * + * There are a few special cases (mostly stats) that have their own + * synchronization but they nest under all of above and don't interact with + * each other. + * + * The RTNL lock nests inside ovs_mutex. + */ + +static DEFINE_MUTEX(ovs_mutex); + +void ovs_lock(void) +{ + mutex_lock(&ovs_mutex); +} + +void ovs_unlock(void) +{ + mutex_unlock(&ovs_mutex); +} + +#ifdef CONFIG_LOCKDEP +int lockdep_ovsl_is_held(void) +{ + if (debug_locks) + return lockdep_is_held(&ovs_mutex); + else + return 1; +} +EXPORT_SYMBOL_GPL(lockdep_ovsl_is_held); +#endif + +static struct vport *new_vport(const struct vport_parms *); +static int queue_gso_packets(struct datapath *dp, struct sk_buff *, + const struct sw_flow_key *, + const struct dp_upcall_info *); +static int queue_userspace_packet(struct datapath *dp, struct sk_buff *, + const struct sw_flow_key *, + const struct dp_upcall_info *); + +/* Must be called with rcu_read_lock. */ +static struct datapath *get_dp_rcu(struct net *net, int dp_ifindex) +{ + struct net_device *dev = dev_get_by_index_rcu(net, dp_ifindex); + + if (dev) { + struct vport *vport = ovs_internal_dev_get_vport(dev); + if (vport) + return vport->dp; + } + + return NULL; +} + +/* The caller must hold either ovs_mutex or rcu_read_lock to keep the + * returned dp pointer valid. + */ +static inline struct datapath *get_dp(struct net *net, int dp_ifindex) +{ + struct datapath *dp; + + WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_ovsl_is_held()); + rcu_read_lock(); + dp = get_dp_rcu(net, dp_ifindex); + rcu_read_unlock(); + + return dp; +} + +/* Must be called with rcu_read_lock or ovs_mutex. */ +const char *ovs_dp_name(const struct datapath *dp) +{ + struct vport *vport = ovs_vport_ovsl_rcu(dp, OVSP_LOCAL); + return vport->ops->get_name(vport); +} + +static int get_dpifindex(const struct datapath *dp) +{ + struct vport *local; + int ifindex; + + rcu_read_lock(); + + local = ovs_vport_rcu(dp, OVSP_LOCAL); + if (local) + ifindex = netdev_vport_priv(local)->dev->ifindex; + else + ifindex = 0; + + rcu_read_unlock(); + + return ifindex; +} + +static void destroy_dp_rcu(struct rcu_head *rcu) +{ + struct datapath *dp = container_of(rcu, struct datapath, rcu); + + ovs_flow_tbl_destroy(&dp->table); + free_percpu(dp->stats_percpu); + kfree(dp->ports); + kfree(dp); +} + +static struct hlist_head *vport_hash_bucket(const struct datapath *dp, + u16 port_no) +{ + return &dp->ports[port_no & (DP_VPORT_HASH_BUCKETS - 1)]; +} + +/* Called with ovs_mutex or RCU read lock. */ +struct vport *ovs_lookup_vport(const struct datapath *dp, u16 port_no) +{ + struct vport *vport; + struct hlist_head *head; + + head = vport_hash_bucket(dp, port_no); + hlist_for_each_entry_rcu(vport, head, dp_hash_node) { + if (vport->port_no == port_no) + return vport; + } + return NULL; +} + +/* Called with ovs_mutex. */ +static struct vport *new_vport(const struct vport_parms *parms) +{ + struct vport *vport; + + vport = ovs_vport_add(parms); + if (!IS_ERR(vport)) { + struct datapath *dp = parms->dp; + struct hlist_head *head = vport_hash_bucket(dp, vport->port_no); + + hlist_add_head_rcu(&vport->dp_hash_node, head); + } + return vport; +} + +void ovs_dp_detach_port(struct vport *p) +{ + ASSERT_OVSL(); + + /* First drop references to device. */ + hlist_del_rcu(&p->dp_hash_node); + + /* Then destroy it. */ + ovs_vport_del(p); +} + +/* Must be called with rcu_read_lock. */ +void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key) +{ + const struct vport *p = OVS_CB(skb)->input_vport; + struct datapath *dp = p->dp; + struct sw_flow *flow; + struct sw_flow_actions *sf_acts; + struct dp_stats_percpu *stats; + u64 *stats_counter; + u32 n_mask_hit; + + stats = this_cpu_ptr(dp->stats_percpu); + + /* Look up flow. */ + flow = ovs_flow_tbl_lookup_stats(&dp->table, key, &n_mask_hit); + if (unlikely(!flow)) { + struct dp_upcall_info upcall; + int error; + + upcall.cmd = OVS_PACKET_CMD_MISS; + upcall.userdata = NULL; + upcall.portid = ovs_vport_find_upcall_portid(p, skb); + upcall.egress_tun_info = NULL; + error = ovs_dp_upcall(dp, skb, key, &upcall); + if (unlikely(error)) + kfree_skb(skb); + else + consume_skb(skb); + stats_counter = &stats->n_missed; + goto out; + } + + ovs_flow_stats_update(flow, key->tp.flags, skb); + sf_acts = rcu_dereference(flow->sf_acts); + ovs_execute_actions(dp, skb, sf_acts, key); + + stats_counter = &stats->n_hit; + +out: + /* Update datapath statistics. */ + u64_stats_update_begin(&stats->syncp); + (*stats_counter)++; + stats->n_mask_hit += n_mask_hit; + u64_stats_update_end(&stats->syncp); +} + +int ovs_dp_upcall(struct datapath *dp, struct sk_buff *skb, + const struct sw_flow_key *key, + const struct dp_upcall_info *upcall_info) +{ + struct dp_stats_percpu *stats; + int err; + + if (upcall_info->portid == 0) { + err = -ENOTCONN; + goto err; + } + + if (!skb_is_gso(skb)) + err = queue_userspace_packet(dp, skb, key, upcall_info); + else + err = queue_gso_packets(dp, skb, key, upcall_info); + if (err) + goto err; + + return 0; + +err: + stats = this_cpu_ptr(dp->stats_percpu); + + u64_stats_update_begin(&stats->syncp); + stats->n_lost++; + u64_stats_update_end(&stats->syncp); + + return err; +} + +static int queue_gso_packets(struct datapath *dp, struct sk_buff *skb, + const struct sw_flow_key *key, + const struct dp_upcall_info *upcall_info) +{ + unsigned short gso_type = skb_shinfo(skb)->gso_type; + struct sw_flow_key later_key; + struct sk_buff *segs, *nskb; + struct ovs_skb_cb ovs_cb; + int err; + + ovs_cb = *OVS_CB(skb); + segs = __skb_gso_segment(skb, NETIF_F_SG, false); + *OVS_CB(skb) = ovs_cb; + if (IS_ERR(segs)) + return PTR_ERR(segs); + if (segs == NULL) + return -EINVAL; + + if (gso_type & SKB_GSO_UDP) { + /* The initial flow key extracted by ovs_flow_key_extract() + * in this case is for a first fragment, so we need to + * properly mark later fragments. + */ + later_key = *key; + later_key.ip.frag = OVS_FRAG_TYPE_LATER; + } + + /* Queue all of the segments. */ + skb = segs; + do { + *OVS_CB(skb) = ovs_cb; + if (gso_type & SKB_GSO_UDP && skb != segs) + key = &later_key; + + err = queue_userspace_packet(dp, skb, key, upcall_info); + if (err) + break; + + } while ((skb = skb->next)); + + /* Free all of the segments. */ + skb = segs; + do { + nskb = skb->next; + if (err) + kfree_skb(skb); + else + consume_skb(skb); + } while ((skb = nskb)); + return err; +} + +static size_t upcall_msg_size(const struct dp_upcall_info *upcall_info, + unsigned int hdrlen) +{ + size_t size = NLMSG_ALIGN(sizeof(struct ovs_header)) + + nla_total_size(hdrlen) /* OVS_PACKET_ATTR_PACKET */ + + nla_total_size(ovs_key_attr_size()); /* OVS_PACKET_ATTR_KEY */ + + /* OVS_PACKET_ATTR_USERDATA */ + if (upcall_info->userdata) + size += NLA_ALIGN(upcall_info->userdata->nla_len); + + /* OVS_PACKET_ATTR_EGRESS_TUN_KEY */ + if (upcall_info->egress_tun_info) + size += nla_total_size(ovs_tun_key_attr_size()); + + return size; +} + +static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb, + const struct sw_flow_key *key, + const struct dp_upcall_info *upcall_info) +{ + struct ovs_header *upcall; + struct sk_buff *nskb = NULL; + struct sk_buff *user_skb = NULL; /* to be queued to userspace */ + struct nlattr *nla; + struct genl_info info = { + .dst_sk = ovs_dp_get_net(dp)->genl_sock, + .snd_portid = upcall_info->portid, + }; + size_t len; + unsigned int hlen; + int err, dp_ifindex; + + dp_ifindex = get_dpifindex(dp); + if (!dp_ifindex) + return -ENODEV; + + if (skb_vlan_tag_present(skb)) { + nskb = skb_clone(skb, GFP_ATOMIC); + if (!nskb) + return -ENOMEM; + + nskb = __vlan_hwaccel_push_inside(nskb); + if (!nskb) + return -ENOMEM; + + skb = nskb; + } + + if (nla_attr_size(skb->len) > USHRT_MAX) { + err = -EFBIG; + goto out; + } + + /* Complete checksum if needed */ + if (skb->ip_summed == CHECKSUM_PARTIAL && + (err = skb_checksum_help(skb))) + goto out; + + /* Older versions of OVS user space enforce alignment of the last + * Netlink attribute to NLA_ALIGNTO which would require extensive + * padding logic. Only perform zerocopy if padding is not required. + */ + if (dp->user_features & OVS_DP_F_UNALIGNED) + hlen = skb_zerocopy_headlen(skb); + else + hlen = skb->len; + + len = upcall_msg_size(upcall_info, hlen); + user_skb = genlmsg_new_unicast(len, &info, GFP_ATOMIC); + if (!user_skb) { + err = -ENOMEM; + goto out; + } + + upcall = genlmsg_put(user_skb, 0, 0, &dp_packet_genl_family, + 0, upcall_info->cmd); + upcall->dp_ifindex = dp_ifindex; + + err = ovs_nla_put_key(key, key, OVS_PACKET_ATTR_KEY, false, user_skb); + BUG_ON(err); + + if (upcall_info->userdata) + __nla_put(user_skb, OVS_PACKET_ATTR_USERDATA, + nla_len(upcall_info->userdata), + nla_data(upcall_info->userdata)); + + if (upcall_info->egress_tun_info) { + nla = nla_nest_start(user_skb, OVS_PACKET_ATTR_EGRESS_TUN_KEY); + err = ovs_nla_put_egress_tunnel_key(user_skb, + upcall_info->egress_tun_info); + BUG_ON(err); + nla_nest_end(user_skb, nla); + } + + /* Only reserve room for attribute header, packet data is added + * in skb_zerocopy() */ + if (!(nla = nla_reserve(user_skb, OVS_PACKET_ATTR_PACKET, 0))) { + err = -ENOBUFS; + goto out; + } + nla->nla_len = nla_attr_size(skb->len); + + err = skb_zerocopy(user_skb, skb, skb->len, hlen); + if (err) + goto out; + + /* Pad OVS_PACKET_ATTR_PACKET if linear copy was performed */ + if (!(dp->user_features & OVS_DP_F_UNALIGNED)) { + size_t plen = NLA_ALIGN(user_skb->len) - user_skb->len; + + if (plen > 0) + memset(skb_put(user_skb, plen), 0, plen); + } + + ((struct nlmsghdr *) user_skb->data)->nlmsg_len = user_skb->len; + + err = genlmsg_unicast(ovs_dp_get_net(dp), user_skb, upcall_info->portid); + user_skb = NULL; +out: + if (err) + skb_tx_error(skb); + kfree_skb(user_skb); + kfree_skb(nskb); + return err; +} + +static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) +{ + struct ovs_header *ovs_header = info->userhdr; + struct nlattr **a = info->attrs; + struct sw_flow_actions *acts; + struct sk_buff *packet; + struct sw_flow *flow; + struct sw_flow_actions *sf_acts; + struct datapath *dp; + struct ethhdr *eth; + struct vport *input_vport; + int len; + int err; + bool log = !a[OVS_PACKET_ATTR_PROBE]; + + err = -EINVAL; + if (!a[OVS_PACKET_ATTR_PACKET] || !a[OVS_PACKET_ATTR_KEY] || + !a[OVS_PACKET_ATTR_ACTIONS]) + goto err; + + len = nla_len(a[OVS_PACKET_ATTR_PACKET]); + packet = __dev_alloc_skb(NET_IP_ALIGN + len, GFP_KERNEL); + err = -ENOMEM; + if (!packet) + goto err; + skb_reserve(packet, NET_IP_ALIGN); + + nla_memcpy(__skb_put(packet, len), a[OVS_PACKET_ATTR_PACKET], len); + + skb_reset_mac_header(packet); + eth = eth_hdr(packet); + + /* Normally, setting the skb 'protocol' field would be handled by a + * call to eth_type_trans(), but it assumes there's a sending + * device, which we may not have. */ + if (ntohs(eth->h_proto) >= ETH_P_802_3_MIN) + packet->protocol = eth->h_proto; + else + packet->protocol = htons(ETH_P_802_2); + + /* Build an sw_flow for sending this packet. */ + flow = ovs_flow_alloc(); + err = PTR_ERR(flow); + if (IS_ERR(flow)) + goto err_kfree_skb; + + err = ovs_flow_key_extract_userspace(a[OVS_PACKET_ATTR_KEY], packet, + &flow->key, log); + if (err) + goto err_flow_free; + + err = ovs_nla_copy_actions(a[OVS_PACKET_ATTR_ACTIONS], + &flow->key, &acts, log); + if (err) + goto err_flow_free; + + rcu_assign_pointer(flow->sf_acts, acts); + OVS_CB(packet)->egress_tun_info = NULL; + packet->priority = flow->key.phy.priority; + packet->mark = flow->key.phy.skb_mark; + + rcu_read_lock(); + dp = get_dp_rcu(sock_net(skb->sk), ovs_header->dp_ifindex); + err = -ENODEV; + if (!dp) + goto err_unlock; + + input_vport = ovs_vport_rcu(dp, flow->key.phy.in_port); + if (!input_vport) + input_vport = ovs_vport_rcu(dp, OVSP_LOCAL); + + if (!input_vport) + goto err_unlock; + + OVS_CB(packet)->input_vport = input_vport; + sf_acts = rcu_dereference(flow->sf_acts); + + local_bh_disable(); + err = ovs_execute_actions(dp, packet, sf_acts, &flow->key); + local_bh_enable(); + rcu_read_unlock(); + + ovs_flow_free(flow, false); + return err; + +err_unlock: + rcu_read_unlock(); +err_flow_free: + ovs_flow_free(flow, false); +err_kfree_skb: + kfree_skb(packet); +err: + return err; +} + +static const struct nla_policy packet_policy[OVS_PACKET_ATTR_MAX + 1] = { + [OVS_PACKET_ATTR_PACKET] = { .len = ETH_HLEN }, + [OVS_PACKET_ATTR_KEY] = { .type = NLA_NESTED }, + [OVS_PACKET_ATTR_ACTIONS] = { .type = NLA_NESTED }, + [OVS_PACKET_ATTR_PROBE] = { .type = NLA_FLAG }, +}; + +static const struct genl_ops dp_packet_genl_ops[] = { + { .cmd = OVS_PACKET_CMD_EXECUTE, + .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ + .policy = packet_policy, + .doit = ovs_packet_cmd_execute + } +}; + +static struct genl_family dp_packet_genl_family = { + .id = GENL_ID_GENERATE, + .hdrsize = sizeof(struct ovs_header), + .name = OVS_PACKET_FAMILY, + .version = OVS_PACKET_VERSION, + .maxattr = OVS_PACKET_ATTR_MAX, + .netnsok = true, + .parallel_ops = true, + .ops = dp_packet_genl_ops, + .n_ops = ARRAY_SIZE(dp_packet_genl_ops), +}; + +static void get_dp_stats(const struct datapath *dp, struct ovs_dp_stats *stats, + struct ovs_dp_megaflow_stats *mega_stats) +{ + int i; + + memset(mega_stats, 0, sizeof(*mega_stats)); + + stats->n_flows = ovs_flow_tbl_count(&dp->table); + mega_stats->n_masks = ovs_flow_tbl_num_masks(&dp->table); + + stats->n_hit = stats->n_missed = stats->n_lost = 0; + + for_each_possible_cpu(i) { + const struct dp_stats_percpu *percpu_stats; + struct dp_stats_percpu local_stats; + unsigned int start; + + percpu_stats = per_cpu_ptr(dp->stats_percpu, i); + + do { + start = u64_stats_fetch_begin_irq(&percpu_stats->syncp); + local_stats = *percpu_stats; + } while (u64_stats_fetch_retry_irq(&percpu_stats->syncp, start)); + + stats->n_hit += local_stats.n_hit; + stats->n_missed += local_stats.n_missed; + stats->n_lost += local_stats.n_lost; + mega_stats->n_mask_hit += local_stats.n_mask_hit; + } +} + +static bool should_fill_key(const struct sw_flow_id *sfid, uint32_t ufid_flags) +{ + return ovs_identifier_is_ufid(sfid) && + !(ufid_flags & OVS_UFID_F_OMIT_KEY); +} + +static bool should_fill_mask(uint32_t ufid_flags) +{ + return !(ufid_flags & OVS_UFID_F_OMIT_MASK); +} + +static bool should_fill_actions(uint32_t ufid_flags) +{ + return !(ufid_flags & OVS_UFID_F_OMIT_ACTIONS); +} + +static size_t ovs_flow_cmd_msg_size(const struct sw_flow_actions *acts, + const struct sw_flow_id *sfid, + uint32_t ufid_flags) +{ + size_t len = NLMSG_ALIGN(sizeof(struct ovs_header)); + + /* OVS_FLOW_ATTR_UFID */ + if (sfid && ovs_identifier_is_ufid(sfid)) + len += nla_total_size(sfid->ufid_len); + + /* OVS_FLOW_ATTR_KEY */ + if (!sfid || should_fill_key(sfid, ufid_flags)) + len += nla_total_size(ovs_key_attr_size()); + + /* OVS_FLOW_ATTR_MASK */ + if (should_fill_mask(ufid_flags)) + len += nla_total_size(ovs_key_attr_size()); + + /* OVS_FLOW_ATTR_ACTIONS */ + if (should_fill_actions(ufid_flags)) + len += nla_total_size(acts->actions_len); + + return len + + nla_total_size(sizeof(struct ovs_flow_stats)) /* OVS_FLOW_ATTR_STATS */ + + nla_total_size(1) /* OVS_FLOW_ATTR_TCP_FLAGS */ + + nla_total_size(8); /* OVS_FLOW_ATTR_USED */ +} + +/* Called with ovs_mutex or RCU read lock. */ +static int ovs_flow_cmd_fill_stats(const struct sw_flow *flow, + struct sk_buff *skb) +{ + struct ovs_flow_stats stats; + __be16 tcp_flags; + unsigned long used; + + ovs_flow_stats_get(flow, &stats, &used, &tcp_flags); + + if (used && + nla_put_u64(skb, OVS_FLOW_ATTR_USED, ovs_flow_used_time(used))) + return -EMSGSIZE; + + if (stats.n_packets && + nla_put(skb, OVS_FLOW_ATTR_STATS, sizeof(struct ovs_flow_stats), &stats)) + return -EMSGSIZE; + + if ((u8)ntohs(tcp_flags) && + nla_put_u8(skb, OVS_FLOW_ATTR_TCP_FLAGS, (u8)ntohs(tcp_flags))) + return -EMSGSIZE; + + return 0; +} + +/* Called with ovs_mutex or RCU read lock. */ +static int ovs_flow_cmd_fill_actions(const struct sw_flow *flow, + struct sk_buff *skb, int skb_orig_len) +{ + struct nlattr *start; + int err; + + /* If OVS_FLOW_ATTR_ACTIONS doesn't fit, skip dumping the actions if + * this is the first flow to be dumped into 'skb'. This is unusual for + * Netlink but individual action lists can be longer than + * NLMSG_GOODSIZE and thus entirely undumpable if we didn't do this. + * The userspace caller can always fetch the actions separately if it + * really wants them. (Most userspace callers in fact don't care.) + * + * This can only fail for dump operations because the skb is always + * properly sized for single flows. + */ + start = nla_nest_start(skb, OVS_FLOW_ATTR_ACTIONS); + if (start) { + const struct sw_flow_actions *sf_acts; + + sf_acts = rcu_dereference_ovsl(flow->sf_acts); + err = ovs_nla_put_actions(sf_acts->actions, + sf_acts->actions_len, skb); + + if (!err) + nla_nest_end(skb, start); + else { + if (skb_orig_len) + return err; + + nla_nest_cancel(skb, start); + } + } else if (skb_orig_len) { + return -EMSGSIZE; + } + + return 0; +} + +/* Called with ovs_mutex or RCU read lock. */ +static int ovs_flow_cmd_fill_info(const struct sw_flow *flow, int dp_ifindex, + struct sk_buff *skb, u32 portid, + u32 seq, u32 flags, u8 cmd, u32 ufid_flags) +{ + const int skb_orig_len = skb->len; + struct ovs_header *ovs_header; + int err; + + ovs_header = genlmsg_put(skb, portid, seq, &dp_flow_genl_family, + flags, cmd); + if (!ovs_header) + return -EMSGSIZE; + + ovs_header->dp_ifindex = dp_ifindex; + + err = ovs_nla_put_identifier(flow, skb); + if (err) + goto error; + + if (should_fill_key(&flow->id, ufid_flags)) { + err = ovs_nla_put_masked_key(flow, skb); + if (err) + goto error; + } + + if (should_fill_mask(ufid_flags)) { + err = ovs_nla_put_mask(flow, skb); + if (err) + goto error; + } + + err = ovs_flow_cmd_fill_stats(flow, skb); + if (err) + goto error; + + if (should_fill_actions(ufid_flags)) { + err = ovs_flow_cmd_fill_actions(flow, skb, skb_orig_len); + if (err) + goto error; + } + + genlmsg_end(skb, ovs_header); + return 0; + +error: + genlmsg_cancel(skb, ovs_header); + return err; +} + +/* May not be called with RCU read lock. */ +static struct sk_buff *ovs_flow_cmd_alloc_info(const struct sw_flow_actions *acts, + const struct sw_flow_id *sfid, + struct genl_info *info, + bool always, + uint32_t ufid_flags) +{ + struct sk_buff *skb; + size_t len; + + if (!always && !ovs_must_notify(&dp_flow_genl_family, info, 0)) + return NULL; + + len = ovs_flow_cmd_msg_size(acts, sfid, ufid_flags); + skb = genlmsg_new_unicast(len, info, GFP_KERNEL); + if (!skb) + return ERR_PTR(-ENOMEM); + + return skb; +} + +/* Called with ovs_mutex. */ +static struct sk_buff *ovs_flow_cmd_build_info(const struct sw_flow *flow, + int dp_ifindex, + struct genl_info *info, u8 cmd, + bool always, u32 ufid_flags) +{ + struct sk_buff *skb; + int retval; + + skb = ovs_flow_cmd_alloc_info(ovsl_dereference(flow->sf_acts), + &flow->id, info, always, ufid_flags); + if (IS_ERR_OR_NULL(skb)) + return skb; + + retval = ovs_flow_cmd_fill_info(flow, dp_ifindex, skb, + info->snd_portid, info->snd_seq, 0, + cmd, ufid_flags); + BUG_ON(retval < 0); + return skb; +} + +static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr **a = info->attrs; + struct ovs_header *ovs_header = info->userhdr; + struct sw_flow *flow = NULL, *new_flow; + struct sw_flow_mask mask; + struct sk_buff *reply; + struct datapath *dp; + struct sw_flow_key key; + struct sw_flow_actions *acts; + struct sw_flow_match match; + u32 ufid_flags = ovs_nla_get_ufid_flags(a[OVS_FLOW_ATTR_UFID_FLAGS]); + int error; + bool log = !a[OVS_FLOW_ATTR_PROBE]; + + /* Must have key and actions. */ + error = -EINVAL; + if (!a[OVS_FLOW_ATTR_KEY]) { + OVS_NLERR(log, "Flow key attr not present in new flow."); + goto error; + } + if (!a[OVS_FLOW_ATTR_ACTIONS]) { + OVS_NLERR(log, "Flow actions attr not present in new flow."); + goto error; + } + + /* Most of the time we need to allocate a new flow, do it before + * locking. + */ + new_flow = ovs_flow_alloc(); + if (IS_ERR(new_flow)) { + error = PTR_ERR(new_flow); + goto error; + } + + /* Extract key. */ + ovs_match_init(&match, &key, &mask); + error = ovs_nla_get_match(&match, a[OVS_FLOW_ATTR_KEY], + a[OVS_FLOW_ATTR_MASK], log); + if (error) + goto err_kfree_flow; + + ovs_flow_mask_key(&new_flow->key, &key, &mask); + + /* Extract flow identifier. */ + error = ovs_nla_get_identifier(&new_flow->id, a[OVS_FLOW_ATTR_UFID], + &key, log); + if (error) + goto err_kfree_flow; + + /* Validate actions. */ + error = ovs_nla_copy_actions(a[OVS_FLOW_ATTR_ACTIONS], &new_flow->key, + &acts, log); + if (error) { + OVS_NLERR(log, "Flow actions may not be safe on all matching packets."); + goto err_kfree_flow; + } + + reply = ovs_flow_cmd_alloc_info(acts, &new_flow->id, info, false, + ufid_flags); + if (IS_ERR(reply)) { + error = PTR_ERR(reply); + goto err_kfree_acts; + } + + ovs_lock(); + dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); + if (unlikely(!dp)) { + error = -ENODEV; + goto err_unlock_ovs; + } + + /* Check if this is a duplicate flow */ + if (ovs_identifier_is_ufid(&new_flow->id)) + flow = ovs_flow_tbl_lookup_ufid(&dp->table, &new_flow->id); + if (!flow) + flow = ovs_flow_tbl_lookup(&dp->table, &key); + if (likely(!flow)) { + rcu_assign_pointer(new_flow->sf_acts, acts); + + /* Put flow in bucket. */ + error = ovs_flow_tbl_insert(&dp->table, new_flow, &mask); + if (unlikely(error)) { + acts = NULL; + goto err_unlock_ovs; + } + + if (unlikely(reply)) { + error = ovs_flow_cmd_fill_info(new_flow, + ovs_header->dp_ifindex, + reply, info->snd_portid, + info->snd_seq, 0, + OVS_FLOW_CMD_NEW, + ufid_flags); + BUG_ON(error < 0); + } + ovs_unlock(); + } else { + struct sw_flow_actions *old_acts; + + /* Bail out if we're not allowed to modify an existing flow. + * We accept NLM_F_CREATE in place of the intended NLM_F_EXCL + * because Generic Netlink treats the latter as a dump + * request. We also accept NLM_F_EXCL in case that bug ever + * gets fixed. + */ + if (unlikely(info->nlhdr->nlmsg_flags & (NLM_F_CREATE + | NLM_F_EXCL))) { + error = -EEXIST; + goto err_unlock_ovs; + } + /* The flow identifier has to be the same for flow updates. + * Look for any overlapping flow. + */ + if (unlikely(!ovs_flow_cmp(flow, &match))) { + if (ovs_identifier_is_key(&flow->id)) + flow = ovs_flow_tbl_lookup_exact(&dp->table, + &match); + else /* UFID matches but key is different */ + flow = NULL; + if (!flow) { + error = -ENOENT; + goto err_unlock_ovs; + } + } + /* Update actions. */ + old_acts = ovsl_dereference(flow->sf_acts); + rcu_assign_pointer(flow->sf_acts, acts); + + if (unlikely(reply)) { + error = ovs_flow_cmd_fill_info(flow, + ovs_header->dp_ifindex, + reply, info->snd_portid, + info->snd_seq, 0, + OVS_FLOW_CMD_NEW, + ufid_flags); + BUG_ON(error < 0); + } + ovs_unlock(); + + ovs_nla_free_flow_actions(old_acts); + ovs_flow_free(new_flow, false); + } + + if (reply) + ovs_notify(&dp_flow_genl_family, reply, info); + return 0; + +err_unlock_ovs: + ovs_unlock(); + kfree_skb(reply); +err_kfree_acts: + kfree(acts); +err_kfree_flow: + ovs_flow_free(new_flow, false); +error: + return error; +} + +/* Factor out action copy to avoid "Wframe-larger-than=1024" warning. */ +static struct sw_flow_actions *get_flow_actions(const struct nlattr *a, + const struct sw_flow_key *key, + const struct sw_flow_mask *mask, + bool log) +{ + struct sw_flow_actions *acts; + struct sw_flow_key masked_key; + int error; + + ovs_flow_mask_key(&masked_key, key, mask); + error = ovs_nla_copy_actions(a, &masked_key, &acts, log); + if (error) { + OVS_NLERR(log, + "Actions may not be safe on all matching packets"); + return ERR_PTR(error); + } + + return acts; +} + +static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr **a = info->attrs; + struct ovs_header *ovs_header = info->userhdr; + struct sw_flow_key key; + struct sw_flow *flow; + struct sw_flow_mask mask; + struct sk_buff *reply = NULL; + struct datapath *dp; + struct sw_flow_actions *old_acts = NULL, *acts = NULL; + struct sw_flow_match match; + struct sw_flow_id sfid; + u32 ufid_flags = ovs_nla_get_ufid_flags(a[OVS_FLOW_ATTR_UFID_FLAGS]); + int error; + bool log = !a[OVS_FLOW_ATTR_PROBE]; + bool ufid_present; + + /* Extract key. */ + error = -EINVAL; + if (!a[OVS_FLOW_ATTR_KEY]) { + OVS_NLERR(log, "Flow key attribute not present in set flow."); + goto error; + } + + ufid_present = ovs_nla_get_ufid(&sfid, a[OVS_FLOW_ATTR_UFID], log); + ovs_match_init(&match, &key, &mask); + error = ovs_nla_get_match(&match, a[OVS_FLOW_ATTR_KEY], + a[OVS_FLOW_ATTR_MASK], log); + if (error) + goto error; + + /* Validate actions. */ + if (a[OVS_FLOW_ATTR_ACTIONS]) { + acts = get_flow_actions(a[OVS_FLOW_ATTR_ACTIONS], &key, &mask, + log); + if (IS_ERR(acts)) { + error = PTR_ERR(acts); + goto error; + } + + /* Can allocate before locking if have acts. */ + reply = ovs_flow_cmd_alloc_info(acts, &sfid, info, false, + ufid_flags); + if (IS_ERR(reply)) { + error = PTR_ERR(reply); + goto err_kfree_acts; + } + } + + ovs_lock(); + dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); + if (unlikely(!dp)) { + error = -ENODEV; + goto err_unlock_ovs; + } + /* Check that the flow exists. */ + if (ufid_present) + flow = ovs_flow_tbl_lookup_ufid(&dp->table, &sfid); + else + flow = ovs_flow_tbl_lookup_exact(&dp->table, &match); + if (unlikely(!flow)) { + error = -ENOENT; + goto err_unlock_ovs; + } + + /* Update actions, if present. */ + if (likely(acts)) { + old_acts = ovsl_dereference(flow->sf_acts); + rcu_assign_pointer(flow->sf_acts, acts); + + if (unlikely(reply)) { + error = ovs_flow_cmd_fill_info(flow, + ovs_header->dp_ifindex, + reply, info->snd_portid, + info->snd_seq, 0, + OVS_FLOW_CMD_NEW, + ufid_flags); + BUG_ON(error < 0); + } + } else { + /* Could not alloc without acts before locking. */ + reply = ovs_flow_cmd_build_info(flow, ovs_header->dp_ifindex, + info, OVS_FLOW_CMD_NEW, false, + ufid_flags); + + if (unlikely(IS_ERR(reply))) { + error = PTR_ERR(reply); + goto err_unlock_ovs; + } + } + + /* Clear stats. */ + if (a[OVS_FLOW_ATTR_CLEAR]) + ovs_flow_stats_clear(flow); + ovs_unlock(); + + if (reply) + ovs_notify(&dp_flow_genl_family, reply, info); + if (old_acts) + ovs_nla_free_flow_actions(old_acts); + + return 0; + +err_unlock_ovs: + ovs_unlock(); + kfree_skb(reply); +err_kfree_acts: + kfree(acts); +error: + return error; +} + +static int ovs_flow_cmd_get(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr **a = info->attrs; + struct ovs_header *ovs_header = info->userhdr; + struct sw_flow_key key; + struct sk_buff *reply; + struct sw_flow *flow; + struct datapath *dp; + struct sw_flow_match match; + struct sw_flow_id ufid; + u32 ufid_flags = ovs_nla_get_ufid_flags(a[OVS_FLOW_ATTR_UFID_FLAGS]); + int err = 0; + bool log = !a[OVS_FLOW_ATTR_PROBE]; + bool ufid_present; + + ufid_present = ovs_nla_get_ufid(&ufid, a[OVS_FLOW_ATTR_UFID], log); + if (a[OVS_FLOW_ATTR_KEY]) { + ovs_match_init(&match, &key, NULL); + err = ovs_nla_get_match(&match, a[OVS_FLOW_ATTR_KEY], NULL, + log); + } else if (!ufid_present) { + OVS_NLERR(log, + "Flow get message rejected, Key attribute missing."); + err = -EINVAL; + } + if (err) + return err; + + ovs_lock(); + dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); + if (!dp) { + err = -ENODEV; + goto unlock; + } + + if (ufid_present) + flow = ovs_flow_tbl_lookup_ufid(&dp->table, &ufid); + else + flow = ovs_flow_tbl_lookup_exact(&dp->table, &match); + if (!flow) { + err = -ENOENT; + goto unlock; + } + + reply = ovs_flow_cmd_build_info(flow, ovs_header->dp_ifindex, info, + OVS_FLOW_CMD_NEW, true, ufid_flags); + if (IS_ERR(reply)) { + err = PTR_ERR(reply); + goto unlock; + } + + ovs_unlock(); + return genlmsg_reply(reply, info); +unlock: + ovs_unlock(); + return err; +} + +static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr **a = info->attrs; + struct ovs_header *ovs_header = info->userhdr; + struct sw_flow_key key; + struct sk_buff *reply; + struct sw_flow *flow = NULL; + struct datapath *dp; + struct sw_flow_match match; + struct sw_flow_id ufid; + u32 ufid_flags = ovs_nla_get_ufid_flags(a[OVS_FLOW_ATTR_UFID_FLAGS]); + int err; + bool log = !a[OVS_FLOW_ATTR_PROBE]; + bool ufid_present; + + ufid_present = ovs_nla_get_ufid(&ufid, a[OVS_FLOW_ATTR_UFID], log); + if (a[OVS_FLOW_ATTR_KEY]) { + ovs_match_init(&match, &key, NULL); + err = ovs_nla_get_match(&match, a[OVS_FLOW_ATTR_KEY], NULL, + log); + if (unlikely(err)) + return err; + } + + ovs_lock(); + dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); + if (unlikely(!dp)) { + err = -ENODEV; + goto unlock; + } + + if (unlikely(!a[OVS_FLOW_ATTR_KEY] && !ufid_present)) { + err = ovs_flow_tbl_flush(&dp->table); + goto unlock; + } + + if (ufid_present) + flow = ovs_flow_tbl_lookup_ufid(&dp->table, &ufid); + else + flow = ovs_flow_tbl_lookup_exact(&dp->table, &match); + if (unlikely(!flow)) { + err = -ENOENT; + goto unlock; + } + + ovs_flow_tbl_remove(&dp->table, flow); + ovs_unlock(); + + reply = ovs_flow_cmd_alloc_info((const struct sw_flow_actions __force *) flow->sf_acts, + &flow->id, info, false, ufid_flags); + if (likely(reply)) { + if (likely(!IS_ERR(reply))) { + rcu_read_lock(); /*To keep RCU checker happy. */ + err = ovs_flow_cmd_fill_info(flow, ovs_header->dp_ifindex, + reply, info->snd_portid, + info->snd_seq, 0, + OVS_FLOW_CMD_DEL, + ufid_flags); + rcu_read_unlock(); + BUG_ON(err < 0); + + ovs_notify(&dp_flow_genl_family, reply, info); + } else { + netlink_set_err(sock_net(skb->sk)->genl_sock, 0, 0, PTR_ERR(reply)); + } + } + + ovs_flow_free(flow, true); + return 0; +unlock: + ovs_unlock(); + return err; +} + +static int ovs_flow_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct nlattr *a[__OVS_FLOW_ATTR_MAX]; + struct ovs_header *ovs_header = genlmsg_data(nlmsg_data(cb->nlh)); + struct table_instance *ti; + struct datapath *dp; + u32 ufid_flags; + int err; + + err = genlmsg_parse(cb->nlh, &dp_flow_genl_family, a, + OVS_FLOW_ATTR_MAX, flow_policy); + if (err) + return err; + ufid_flags = ovs_nla_get_ufid_flags(a[OVS_FLOW_ATTR_UFID_FLAGS]); + + rcu_read_lock(); + dp = get_dp_rcu(sock_net(skb->sk), ovs_header->dp_ifindex); + if (!dp) { + rcu_read_unlock(); + return -ENODEV; + } + + ti = rcu_dereference(dp->table.ti); + for (;;) { + struct sw_flow *flow; + u32 bucket, obj; + + bucket = cb->args[0]; + obj = cb->args[1]; + flow = ovs_flow_tbl_dump_next(ti, &bucket, &obj); + if (!flow) + break; + + if (ovs_flow_cmd_fill_info(flow, ovs_header->dp_ifindex, skb, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, NLM_F_MULTI, + OVS_FLOW_CMD_NEW, ufid_flags) < 0) + break; + + cb->args[0] = bucket; + cb->args[1] = obj; + } + rcu_read_unlock(); + return skb->len; +} + +static const struct nla_policy flow_policy[OVS_FLOW_ATTR_MAX + 1] = { + [OVS_FLOW_ATTR_KEY] = { .type = NLA_NESTED }, + [OVS_FLOW_ATTR_MASK] = { .type = NLA_NESTED }, + [OVS_FLOW_ATTR_ACTIONS] = { .type = NLA_NESTED }, + [OVS_FLOW_ATTR_CLEAR] = { .type = NLA_FLAG }, + [OVS_FLOW_ATTR_PROBE] = { .type = NLA_FLAG }, + [OVS_FLOW_ATTR_UFID] = { .type = NLA_UNSPEC, .len = 1 }, + [OVS_FLOW_ATTR_UFID_FLAGS] = { .type = NLA_U32 }, +}; + +static const struct genl_ops dp_flow_genl_ops[] = { + { .cmd = OVS_FLOW_CMD_NEW, + .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ + .policy = flow_policy, + .doit = ovs_flow_cmd_new + }, + { .cmd = OVS_FLOW_CMD_DEL, + .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ + .policy = flow_policy, + .doit = ovs_flow_cmd_del + }, + { .cmd = OVS_FLOW_CMD_GET, + .flags = 0, /* OK for unprivileged users. */ + .policy = flow_policy, + .doit = ovs_flow_cmd_get, + .dumpit = ovs_flow_cmd_dump + }, + { .cmd = OVS_FLOW_CMD_SET, + .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ + .policy = flow_policy, + .doit = ovs_flow_cmd_set, + }, +}; + +static struct genl_family dp_flow_genl_family = { + .id = GENL_ID_GENERATE, + .hdrsize = sizeof(struct ovs_header), + .name = OVS_FLOW_FAMILY, + .version = OVS_FLOW_VERSION, + .maxattr = OVS_FLOW_ATTR_MAX, + .netnsok = true, + .parallel_ops = true, + .ops = dp_flow_genl_ops, + .n_ops = ARRAY_SIZE(dp_flow_genl_ops), + .mcgrps = &ovs_dp_flow_multicast_group, + .n_mcgrps = 1, +}; + +static size_t ovs_dp_cmd_msg_size(void) +{ + size_t msgsize = NLMSG_ALIGN(sizeof(struct ovs_header)); + + msgsize += nla_total_size(IFNAMSIZ); + msgsize += nla_total_size(sizeof(struct ovs_dp_stats)); + msgsize += nla_total_size(sizeof(struct ovs_dp_megaflow_stats)); + msgsize += nla_total_size(sizeof(u32)); /* OVS_DP_ATTR_USER_FEATURES */ + + return msgsize; +} + +/* Called with ovs_mutex. */ +static int ovs_dp_cmd_fill_info(struct datapath *dp, struct sk_buff *skb, + u32 portid, u32 seq, u32 flags, u8 cmd) +{ + struct ovs_header *ovs_header; + struct ovs_dp_stats dp_stats; + struct ovs_dp_megaflow_stats dp_megaflow_stats; + int err; + + ovs_header = genlmsg_put(skb, portid, seq, &dp_datapath_genl_family, + flags, cmd); + if (!ovs_header) + goto error; + + ovs_header->dp_ifindex = get_dpifindex(dp); + + err = nla_put_string(skb, OVS_DP_ATTR_NAME, ovs_dp_name(dp)); + if (err) + goto nla_put_failure; + + get_dp_stats(dp, &dp_stats, &dp_megaflow_stats); + if (nla_put(skb, OVS_DP_ATTR_STATS, sizeof(struct ovs_dp_stats), + &dp_stats)) + goto nla_put_failure; + + if (nla_put(skb, OVS_DP_ATTR_MEGAFLOW_STATS, + sizeof(struct ovs_dp_megaflow_stats), + &dp_megaflow_stats)) + goto nla_put_failure; + + if (nla_put_u32(skb, OVS_DP_ATTR_USER_FEATURES, dp->user_features)) + goto nla_put_failure; + + genlmsg_end(skb, ovs_header); + return 0; + +nla_put_failure: + genlmsg_cancel(skb, ovs_header); +error: + return -EMSGSIZE; +} + +static struct sk_buff *ovs_dp_cmd_alloc_info(struct genl_info *info) +{ + return genlmsg_new_unicast(ovs_dp_cmd_msg_size(), info, GFP_KERNEL); +} + +/* Called with rcu_read_lock or ovs_mutex. */ +static struct datapath *lookup_datapath(struct net *net, + const struct ovs_header *ovs_header, + struct nlattr *a[OVS_DP_ATTR_MAX + 1]) +{ + struct datapath *dp; + + if (!a[OVS_DP_ATTR_NAME]) + dp = get_dp(net, ovs_header->dp_ifindex); + else { + struct vport *vport; + + vport = ovs_vport_locate(net, nla_data(a[OVS_DP_ATTR_NAME])); + dp = vport && vport->port_no == OVSP_LOCAL ? vport->dp : NULL; + } + return dp ? dp : ERR_PTR(-ENODEV); +} + +static void ovs_dp_reset_user_features(struct sk_buff *skb, struct genl_info *info) +{ + struct datapath *dp; + + dp = lookup_datapath(sock_net(skb->sk), info->userhdr, info->attrs); + if (IS_ERR(dp)) + return; + + WARN(dp->user_features, "Dropping previously announced user features\n"); + dp->user_features = 0; +} + +static void ovs_dp_change(struct datapath *dp, struct nlattr *a[]) +{ + if (a[OVS_DP_ATTR_USER_FEATURES]) + dp->user_features = nla_get_u32(a[OVS_DP_ATTR_USER_FEATURES]); +} + +static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr **a = info->attrs; + struct vport_parms parms; + struct sk_buff *reply; + struct datapath *dp; + struct vport *vport; + struct ovs_net *ovs_net; + int err, i; + + err = -EINVAL; + if (!a[OVS_DP_ATTR_NAME] || !a[OVS_DP_ATTR_UPCALL_PID]) + goto err; + + reply = ovs_dp_cmd_alloc_info(info); + if (!reply) + return -ENOMEM; + + err = -ENOMEM; + dp = kzalloc(sizeof(*dp), GFP_KERNEL); + if (dp == NULL) + goto err_free_reply; + + ovs_dp_set_net(dp, sock_net(skb->sk)); + + /* Allocate table. */ + err = ovs_flow_tbl_init(&dp->table); + if (err) + goto err_free_dp; + + dp->stats_percpu = netdev_alloc_pcpu_stats(struct dp_stats_percpu); + if (!dp->stats_percpu) { + err = -ENOMEM; + goto err_destroy_table; + } + + dp->ports = kmalloc(DP_VPORT_HASH_BUCKETS * sizeof(struct hlist_head), + GFP_KERNEL); + if (!dp->ports) { + err = -ENOMEM; + goto err_destroy_percpu; + } + + for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) + INIT_HLIST_HEAD(&dp->ports[i]); + + /* Set up our datapath device. */ + parms.name = nla_data(a[OVS_DP_ATTR_NAME]); + parms.type = OVS_VPORT_TYPE_INTERNAL; + parms.options = NULL; + parms.dp = dp; + parms.port_no = OVSP_LOCAL; + parms.upcall_portids = a[OVS_DP_ATTR_UPCALL_PID]; + + ovs_dp_change(dp, a); + + /* So far only local changes have been made, now need the lock. */ + ovs_lock(); + + vport = new_vport(&parms); + if (IS_ERR(vport)) { + err = PTR_ERR(vport); + if (err == -EBUSY) + err = -EEXIST; + + if (err == -EEXIST) { + /* An outdated user space instance that does not understand + * the concept of user_features has attempted to create a new + * datapath and is likely to reuse it. Drop all user features. + */ + if (info->genlhdr->version < OVS_DP_VER_FEATURES) + ovs_dp_reset_user_features(skb, info); + } + + goto err_destroy_ports_array; + } + + err = ovs_dp_cmd_fill_info(dp, reply, info->snd_portid, + info->snd_seq, 0, OVS_DP_CMD_NEW); + BUG_ON(err < 0); + + ovs_net = net_generic(ovs_dp_get_net(dp), ovs_net_id); + list_add_tail_rcu(&dp->list_node, &ovs_net->dps); + + ovs_unlock(); + + ovs_notify(&dp_datapath_genl_family, reply, info); + return 0; + +err_destroy_ports_array: + ovs_unlock(); + kfree(dp->ports); +err_destroy_percpu: + free_percpu(dp->stats_percpu); +err_destroy_table: + ovs_flow_tbl_destroy(&dp->table); +err_free_dp: + kfree(dp); +err_free_reply: + kfree_skb(reply); +err: + return err; +} + +/* Called with ovs_mutex. */ +static void __dp_destroy(struct datapath *dp) +{ + int i; + + for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) { + struct vport *vport; + struct hlist_node *n; + + hlist_for_each_entry_safe(vport, n, &dp->ports[i], dp_hash_node) + if (vport->port_no != OVSP_LOCAL) + ovs_dp_detach_port(vport); + } + + list_del_rcu(&dp->list_node); + + /* OVSP_LOCAL is datapath internal port. We need to make sure that + * all ports in datapath are destroyed first before freeing datapath. + */ + ovs_dp_detach_port(ovs_vport_ovsl(dp, OVSP_LOCAL)); + + /* RCU destroy the flow table */ + call_rcu(&dp->rcu, destroy_dp_rcu); +} + +static int ovs_dp_cmd_del(struct sk_buff *skb, struct genl_info *info) +{ + struct sk_buff *reply; + struct datapath *dp; + int err; + + reply = ovs_dp_cmd_alloc_info(info); + if (!reply) + return -ENOMEM; + + ovs_lock(); + dp = lookup_datapath(sock_net(skb->sk), info->userhdr, info->attrs); + err = PTR_ERR(dp); + if (IS_ERR(dp)) + goto err_unlock_free; + + err = ovs_dp_cmd_fill_info(dp, reply, info->snd_portid, + info->snd_seq, 0, OVS_DP_CMD_DEL); + BUG_ON(err < 0); + + __dp_destroy(dp); + ovs_unlock(); + + ovs_notify(&dp_datapath_genl_family, reply, info); + + return 0; + +err_unlock_free: + ovs_unlock(); + kfree_skb(reply); + return err; +} + +static int ovs_dp_cmd_set(struct sk_buff *skb, struct genl_info *info) +{ + struct sk_buff *reply; + struct datapath *dp; + int err; + + reply = ovs_dp_cmd_alloc_info(info); + if (!reply) + return -ENOMEM; + + ovs_lock(); + dp = lookup_datapath(sock_net(skb->sk), info->userhdr, info->attrs); + err = PTR_ERR(dp); + if (IS_ERR(dp)) + goto err_unlock_free; + + ovs_dp_change(dp, info->attrs); + + err = ovs_dp_cmd_fill_info(dp, reply, info->snd_portid, + info->snd_seq, 0, OVS_DP_CMD_NEW); + BUG_ON(err < 0); + + ovs_unlock(); + ovs_notify(&dp_datapath_genl_family, reply, info); + + return 0; + +err_unlock_free: + ovs_unlock(); + kfree_skb(reply); + return err; +} + +static int ovs_dp_cmd_get(struct sk_buff *skb, struct genl_info *info) +{ + struct sk_buff *reply; + struct datapath *dp; + int err; + + reply = ovs_dp_cmd_alloc_info(info); + if (!reply) + return -ENOMEM; + + ovs_lock(); + dp = lookup_datapath(sock_net(skb->sk), info->userhdr, info->attrs); + if (IS_ERR(dp)) { + err = PTR_ERR(dp); + goto err_unlock_free; + } + err = ovs_dp_cmd_fill_info(dp, reply, info->snd_portid, + info->snd_seq, 0, OVS_DP_CMD_NEW); + BUG_ON(err < 0); + ovs_unlock(); + + return genlmsg_reply(reply, info); + +err_unlock_free: + ovs_unlock(); + kfree_skb(reply); + return err; +} + +static int ovs_dp_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct ovs_net *ovs_net = net_generic(sock_net(skb->sk), ovs_net_id); + struct datapath *dp; + int skip = cb->args[0]; + int i = 0; + + ovs_lock(); + list_for_each_entry(dp, &ovs_net->dps, list_node) { + if (i >= skip && + ovs_dp_cmd_fill_info(dp, skb, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, NLM_F_MULTI, + OVS_DP_CMD_NEW) < 0) + break; + i++; + } + ovs_unlock(); + + cb->args[0] = i; + + return skb->len; +} + +static const struct nla_policy datapath_policy[OVS_DP_ATTR_MAX + 1] = { + [OVS_DP_ATTR_NAME] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ - 1 }, + [OVS_DP_ATTR_UPCALL_PID] = { .type = NLA_U32 }, + [OVS_DP_ATTR_USER_FEATURES] = { .type = NLA_U32 }, +}; + +static const struct genl_ops dp_datapath_genl_ops[] = { + { .cmd = OVS_DP_CMD_NEW, + .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ + .policy = datapath_policy, + .doit = ovs_dp_cmd_new + }, + { .cmd = OVS_DP_CMD_DEL, + .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ + .policy = datapath_policy, + .doit = ovs_dp_cmd_del + }, + { .cmd = OVS_DP_CMD_GET, + .flags = 0, /* OK for unprivileged users. */ + .policy = datapath_policy, + .doit = ovs_dp_cmd_get, + .dumpit = ovs_dp_cmd_dump + }, + { .cmd = OVS_DP_CMD_SET, + .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ + .policy = datapath_policy, + .doit = ovs_dp_cmd_set, + }, +}; + +static struct genl_family dp_datapath_genl_family = { + .id = GENL_ID_GENERATE, + .hdrsize = sizeof(struct ovs_header), + .name = OVS_DATAPATH_FAMILY, + .version = OVS_DATAPATH_VERSION, + .maxattr = OVS_DP_ATTR_MAX, + .netnsok = true, + .parallel_ops = true, + .ops = dp_datapath_genl_ops, + .n_ops = ARRAY_SIZE(dp_datapath_genl_ops), + .mcgrps = &ovs_dp_datapath_multicast_group, + .n_mcgrps = 1, +}; + +/* Called with ovs_mutex or RCU read lock. */ +static int ovs_vport_cmd_fill_info(struct vport *vport, struct sk_buff *skb, + u32 portid, u32 seq, u32 flags, u8 cmd) +{ + struct ovs_header *ovs_header; + struct ovs_vport_stats vport_stats; + int err; + + ovs_header = genlmsg_put(skb, portid, seq, &dp_vport_genl_family, + flags, cmd); + if (!ovs_header) + return -EMSGSIZE; + + ovs_header->dp_ifindex = get_dpifindex(vport->dp); + + if (nla_put_u32(skb, OVS_VPORT_ATTR_PORT_NO, vport->port_no) || + nla_put_u32(skb, OVS_VPORT_ATTR_TYPE, vport->ops->type) || + nla_put_string(skb, OVS_VPORT_ATTR_NAME, + vport->ops->get_name(vport))) + goto nla_put_failure; + + ovs_vport_get_stats(vport, &vport_stats); + if (nla_put(skb, OVS_VPORT_ATTR_STATS, sizeof(struct ovs_vport_stats), + &vport_stats)) + goto nla_put_failure; + + if (ovs_vport_get_upcall_portids(vport, skb)) + goto nla_put_failure; + + err = ovs_vport_get_options(vport, skb); + if (err == -EMSGSIZE) + goto error; + + genlmsg_end(skb, ovs_header); + return 0; + +nla_put_failure: + err = -EMSGSIZE; +error: + genlmsg_cancel(skb, ovs_header); + return err; +} + +static struct sk_buff *ovs_vport_cmd_alloc_info(void) +{ + return nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); +} + +/* Called with ovs_mutex, only via ovs_dp_notify_wq(). */ +struct sk_buff *ovs_vport_cmd_build_info(struct vport *vport, u32 portid, + u32 seq, u8 cmd) +{ + struct sk_buff *skb; + int retval; + + skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); + if (!skb) + return ERR_PTR(-ENOMEM); + + retval = ovs_vport_cmd_fill_info(vport, skb, portid, seq, 0, cmd); + BUG_ON(retval < 0); + + return skb; +} + +/* Called with ovs_mutex or RCU read lock. */ +static struct vport *lookup_vport(struct net *net, + const struct ovs_header *ovs_header, + struct nlattr *a[OVS_VPORT_ATTR_MAX + 1]) +{ + struct datapath *dp; + struct vport *vport; + + if (a[OVS_VPORT_ATTR_NAME]) { + vport = ovs_vport_locate(net, nla_data(a[OVS_VPORT_ATTR_NAME])); + if (!vport) + return ERR_PTR(-ENODEV); + if (ovs_header->dp_ifindex && + ovs_header->dp_ifindex != get_dpifindex(vport->dp)) + return ERR_PTR(-ENODEV); + return vport; + } else if (a[OVS_VPORT_ATTR_PORT_NO]) { + u32 port_no = nla_get_u32(a[OVS_VPORT_ATTR_PORT_NO]); + + if (port_no >= DP_MAX_PORTS) + return ERR_PTR(-EFBIG); + + dp = get_dp(net, ovs_header->dp_ifindex); + if (!dp) + return ERR_PTR(-ENODEV); + + vport = ovs_vport_ovsl_rcu(dp, port_no); + if (!vport) + return ERR_PTR(-ENODEV); + return vport; + } else + return ERR_PTR(-EINVAL); +} + +static int ovs_vport_cmd_new(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr **a = info->attrs; + struct ovs_header *ovs_header = info->userhdr; + struct vport_parms parms; + struct sk_buff *reply; + struct vport *vport; + struct datapath *dp; + u32 port_no; + int err; + + if (!a[OVS_VPORT_ATTR_NAME] || !a[OVS_VPORT_ATTR_TYPE] || + !a[OVS_VPORT_ATTR_UPCALL_PID]) + return -EINVAL; + + port_no = a[OVS_VPORT_ATTR_PORT_NO] + ? nla_get_u32(a[OVS_VPORT_ATTR_PORT_NO]) : 0; + if (port_no >= DP_MAX_PORTS) + return -EFBIG; + + reply = ovs_vport_cmd_alloc_info(); + if (!reply) + return -ENOMEM; + + ovs_lock(); +restart: + dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); + err = -ENODEV; + if (!dp) + goto exit_unlock_free; + + if (port_no) { + vport = ovs_vport_ovsl(dp, port_no); + err = -EBUSY; + if (vport) + goto exit_unlock_free; + } else { + for (port_no = 1; ; port_no++) { + if (port_no >= DP_MAX_PORTS) { + err = -EFBIG; + goto exit_unlock_free; + } + vport = ovs_vport_ovsl(dp, port_no); + if (!vport) + break; + } + } + + parms.name = nla_data(a[OVS_VPORT_ATTR_NAME]); + parms.type = nla_get_u32(a[OVS_VPORT_ATTR_TYPE]); + parms.options = a[OVS_VPORT_ATTR_OPTIONS]; + parms.dp = dp; + parms.port_no = port_no; + parms.upcall_portids = a[OVS_VPORT_ATTR_UPCALL_PID]; + + vport = new_vport(&parms); + err = PTR_ERR(vport); + if (IS_ERR(vport)) { + if (err == -EAGAIN) + goto restart; + goto exit_unlock_free; + } + + err = ovs_vport_cmd_fill_info(vport, reply, info->snd_portid, + info->snd_seq, 0, OVS_VPORT_CMD_NEW); + BUG_ON(err < 0); + ovs_unlock(); + + ovs_notify(&dp_vport_genl_family, reply, info); + return 0; + +exit_unlock_free: + ovs_unlock(); + kfree_skb(reply); + return err; +} + +static int ovs_vport_cmd_set(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr **a = info->attrs; + struct sk_buff *reply; + struct vport *vport; + int err; + + reply = ovs_vport_cmd_alloc_info(); + if (!reply) + return -ENOMEM; + + ovs_lock(); + vport = lookup_vport(sock_net(skb->sk), info->userhdr, a); + err = PTR_ERR(vport); + if (IS_ERR(vport)) + goto exit_unlock_free; + + if (a[OVS_VPORT_ATTR_TYPE] && + nla_get_u32(a[OVS_VPORT_ATTR_TYPE]) != vport->ops->type) { + err = -EINVAL; + goto exit_unlock_free; + } + + if (a[OVS_VPORT_ATTR_OPTIONS]) { + err = ovs_vport_set_options(vport, a[OVS_VPORT_ATTR_OPTIONS]); + if (err) + goto exit_unlock_free; + } + + + if (a[OVS_VPORT_ATTR_UPCALL_PID]) { + struct nlattr *ids = a[OVS_VPORT_ATTR_UPCALL_PID]; + + err = ovs_vport_set_upcall_portids(vport, ids); + if (err) + goto exit_unlock_free; + } + + err = ovs_vport_cmd_fill_info(vport, reply, info->snd_portid, + info->snd_seq, 0, OVS_VPORT_CMD_NEW); + BUG_ON(err < 0); + + ovs_unlock(); + ovs_notify(&dp_vport_genl_family, reply, info); + return 0; + +exit_unlock_free: + ovs_unlock(); + kfree_skb(reply); + return err; +} + +static int ovs_vport_cmd_del(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr **a = info->attrs; + struct sk_buff *reply; + struct vport *vport; + int err; + + reply = ovs_vport_cmd_alloc_info(); + if (!reply) + return -ENOMEM; + + ovs_lock(); + vport = lookup_vport(sock_net(skb->sk), info->userhdr, a); + err = PTR_ERR(vport); + if (IS_ERR(vport)) + goto exit_unlock_free; + + if (vport->port_no == OVSP_LOCAL) { + err = -EINVAL; + goto exit_unlock_free; + } + + err = ovs_vport_cmd_fill_info(vport, reply, info->snd_portid, + info->snd_seq, 0, OVS_VPORT_CMD_DEL); + BUG_ON(err < 0); + ovs_dp_detach_port(vport); + ovs_unlock(); + + ovs_notify(&dp_vport_genl_family, reply, info); + return 0; + +exit_unlock_free: + ovs_unlock(); + kfree_skb(reply); + return err; +} + +static int ovs_vport_cmd_get(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr **a = info->attrs; + struct ovs_header *ovs_header = info->userhdr; + struct sk_buff *reply; + struct vport *vport; + int err; + + reply = ovs_vport_cmd_alloc_info(); + if (!reply) + return -ENOMEM; + + rcu_read_lock(); + vport = lookup_vport(sock_net(skb->sk), ovs_header, a); + err = PTR_ERR(vport); + if (IS_ERR(vport)) + goto exit_unlock_free; + err = ovs_vport_cmd_fill_info(vport, reply, info->snd_portid, + info->snd_seq, 0, OVS_VPORT_CMD_NEW); + BUG_ON(err < 0); + rcu_read_unlock(); + + return genlmsg_reply(reply, info); + +exit_unlock_free: + rcu_read_unlock(); + kfree_skb(reply); + return err; +} + +static int ovs_vport_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct ovs_header *ovs_header = genlmsg_data(nlmsg_data(cb->nlh)); + struct datapath *dp; + int bucket = cb->args[0], skip = cb->args[1]; + int i, j = 0; + + rcu_read_lock(); + dp = get_dp_rcu(sock_net(skb->sk), ovs_header->dp_ifindex); + if (!dp) { + rcu_read_unlock(); + return -ENODEV; + } + for (i = bucket; i < DP_VPORT_HASH_BUCKETS; i++) { + struct vport *vport; + + j = 0; + hlist_for_each_entry_rcu(vport, &dp->ports[i], dp_hash_node) { + if (j >= skip && + ovs_vport_cmd_fill_info(vport, skb, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NLM_F_MULTI, + OVS_VPORT_CMD_NEW) < 0) + goto out; + + j++; + } + skip = 0; + } +out: + rcu_read_unlock(); + + cb->args[0] = i; + cb->args[1] = j; + + return skb->len; +} + +static const struct nla_policy vport_policy[OVS_VPORT_ATTR_MAX + 1] = { + [OVS_VPORT_ATTR_NAME] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ - 1 }, + [OVS_VPORT_ATTR_STATS] = { .len = sizeof(struct ovs_vport_stats) }, + [OVS_VPORT_ATTR_PORT_NO] = { .type = NLA_U32 }, + [OVS_VPORT_ATTR_TYPE] = { .type = NLA_U32 }, + [OVS_VPORT_ATTR_UPCALL_PID] = { .type = NLA_U32 }, + [OVS_VPORT_ATTR_OPTIONS] = { .type = NLA_NESTED }, +}; + +static const struct genl_ops dp_vport_genl_ops[] = { + { .cmd = OVS_VPORT_CMD_NEW, + .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ + .policy = vport_policy, + .doit = ovs_vport_cmd_new + }, + { .cmd = OVS_VPORT_CMD_DEL, + .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ + .policy = vport_policy, + .doit = ovs_vport_cmd_del + }, + { .cmd = OVS_VPORT_CMD_GET, + .flags = 0, /* OK for unprivileged users. */ + .policy = vport_policy, + .doit = ovs_vport_cmd_get, + .dumpit = ovs_vport_cmd_dump + }, + { .cmd = OVS_VPORT_CMD_SET, + .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ + .policy = vport_policy, + .doit = ovs_vport_cmd_set, + }, +}; + +struct genl_family dp_vport_genl_family = { + .id = GENL_ID_GENERATE, + .hdrsize = sizeof(struct ovs_header), + .name = OVS_VPORT_FAMILY, + .version = OVS_VPORT_VERSION, + .maxattr = OVS_VPORT_ATTR_MAX, + .netnsok = true, + .parallel_ops = true, + .ops = dp_vport_genl_ops, + .n_ops = ARRAY_SIZE(dp_vport_genl_ops), + .mcgrps = &ovs_dp_vport_multicast_group, + .n_mcgrps = 1, +}; + +static struct genl_family * const dp_genl_families[] = { + &dp_datapath_genl_family, + &dp_vport_genl_family, + &dp_flow_genl_family, + &dp_packet_genl_family, +}; + +static void dp_unregister_genl(int n_families) +{ + int i; + + for (i = 0; i < n_families; i++) + genl_unregister_family(dp_genl_families[i]); +} + +static int dp_register_genl(void) +{ + int err; + int i; + + for (i = 0; i < ARRAY_SIZE(dp_genl_families); i++) { + + err = genl_register_family(dp_genl_families[i]); + if (err) + goto error; + } + + return 0; + +error: + dp_unregister_genl(i); + return err; +} + +static int __net_init ovs_init_net(struct net *net) +{ + struct ovs_net *ovs_net = net_generic(net, ovs_net_id); + + INIT_LIST_HEAD(&ovs_net->dps); + INIT_WORK(&ovs_net->dp_notify_work, ovs_dp_notify_wq); + return 0; +} + +static void __net_exit list_vports_from_net(struct net *net, struct net *dnet, + struct list_head *head) +{ + struct ovs_net *ovs_net = net_generic(net, ovs_net_id); + struct datapath *dp; + + list_for_each_entry(dp, &ovs_net->dps, list_node) { + int i; + + for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) { + struct vport *vport; + + hlist_for_each_entry(vport, &dp->ports[i], dp_hash_node) { + struct netdev_vport *netdev_vport; + + if (vport->ops->type != OVS_VPORT_TYPE_INTERNAL) + continue; + + netdev_vport = netdev_vport_priv(vport); + if (dev_net(netdev_vport->dev) == dnet) + list_add(&vport->detach_list, head); + } + } + } +} + +static void __net_exit ovs_exit_net(struct net *dnet) +{ + struct datapath *dp, *dp_next; + struct ovs_net *ovs_net = net_generic(dnet, ovs_net_id); + struct vport *vport, *vport_next; + struct net *net; + LIST_HEAD(head); + + ovs_lock(); + list_for_each_entry_safe(dp, dp_next, &ovs_net->dps, list_node) + __dp_destroy(dp); + + rtnl_lock(); + for_each_net(net) + list_vports_from_net(net, dnet, &head); + rtnl_unlock(); + + /* Detach all vports from given namespace. */ + list_for_each_entry_safe(vport, vport_next, &head, detach_list) { + list_del(&vport->detach_list); + ovs_dp_detach_port(vport); + } + + ovs_unlock(); + + cancel_work_sync(&ovs_net->dp_notify_work); +} + +static struct pernet_operations ovs_net_ops = { + .init = ovs_init_net, + .exit = ovs_exit_net, + .id = &ovs_net_id, + .size = sizeof(struct ovs_net), +}; + +static int __init dp_init(void) +{ + int err; + + BUILD_BUG_ON(sizeof(struct ovs_skb_cb) > FIELD_SIZEOF(struct sk_buff, cb)); + + pr_info("Open vSwitch switching datapath\n"); + + err = action_fifos_init(); + if (err) + goto error; + + err = ovs_internal_dev_rtnl_link_register(); + if (err) + goto error_action_fifos_exit; + + err = ovs_flow_init(); + if (err) + goto error_unreg_rtnl_link; + + err = ovs_vport_init(); + if (err) + goto error_flow_exit; + + err = register_pernet_device(&ovs_net_ops); + if (err) + goto error_vport_exit; + + err = register_netdevice_notifier(&ovs_dp_device_notifier); + if (err) + goto error_netns_exit; + + err = ovs_netdev_init(); + if (err) + goto error_unreg_notifier; + + err = dp_register_genl(); + if (err < 0) + goto error_unreg_netdev; + + return 0; + +error_unreg_netdev: + ovs_netdev_exit(); +error_unreg_notifier: + unregister_netdevice_notifier(&ovs_dp_device_notifier); +error_netns_exit: + unregister_pernet_device(&ovs_net_ops); +error_vport_exit: + ovs_vport_exit(); +error_flow_exit: + ovs_flow_exit(); +error_unreg_rtnl_link: + ovs_internal_dev_rtnl_link_unregister(); +error_action_fifos_exit: + action_fifos_exit(); +error: + return err; +} + +static void dp_cleanup(void) +{ + dp_unregister_genl(ARRAY_SIZE(dp_genl_families)); + ovs_netdev_exit(); + unregister_netdevice_notifier(&ovs_dp_device_notifier); + unregister_pernet_device(&ovs_net_ops); + rcu_barrier(); + ovs_vport_exit(); + ovs_flow_exit(); + ovs_internal_dev_rtnl_link_unregister(); + action_fifos_exit(); +} + +module_init(dp_init); +module_exit(dp_cleanup); + +MODULE_DESCRIPTION("Open vSwitch switching datapath"); +MODULE_LICENSE("GPL"); diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h new file mode 100644 index 000000000..4ec4a480b --- /dev/null +++ b/net/openvswitch/datapath.h @@ -0,0 +1,205 @@ +/* + * Copyright (c) 2007-2014 Nicira, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + */ + +#ifndef DATAPATH_H +#define DATAPATH_H 1 + +#include +#include +#include +#include +#include +#include + +#include "flow.h" +#include "flow_table.h" +#include "vport.h" + +#define DP_MAX_PORTS USHRT_MAX +#define DP_VPORT_HASH_BUCKETS 1024 + +#define SAMPLE_ACTION_DEPTH 3 + +/** + * struct dp_stats_percpu - per-cpu packet processing statistics for a given + * datapath. + * @n_hit: Number of received packets for which a matching flow was found in + * the flow table. + * @n_miss: Number of received packets that had no matching flow in the flow + * table. The sum of @n_hit and @n_miss is the number of packets that have + * been received by the datapath. + * @n_lost: Number of received packets that had no matching flow in the flow + * table that could not be sent to userspace (normally due to an overflow in + * one of the datapath's queues). + * @n_mask_hit: Number of masks looked up for flow match. + * @n_mask_hit / (@n_hit + @n_missed) will be the average masks looked + * up per packet. + */ +struct dp_stats_percpu { + u64 n_hit; + u64 n_missed; + u64 n_lost; + u64 n_mask_hit; + struct u64_stats_sync syncp; +}; + +/** + * struct datapath - datapath for flow-based packet switching + * @rcu: RCU callback head for deferred destruction. + * @list_node: Element in global 'dps' list. + * @table: flow table. + * @ports: Hash table for ports. %OVSP_LOCAL port always exists. Protected by + * ovs_mutex and RCU. + * @stats_percpu: Per-CPU datapath statistics. + * @net: Reference to net namespace. + * + * Context: See the comment on locking at the top of datapath.c for additional + * locking information. + */ +struct datapath { + struct rcu_head rcu; + struct list_head list_node; + + /* Flow table. */ + struct flow_table table; + + /* Switch ports. */ + struct hlist_head *ports; + + /* Stats. */ + struct dp_stats_percpu __percpu *stats_percpu; + + /* Network namespace ref. */ + possible_net_t net; + + u32 user_features; +}; + +/** + * struct ovs_skb_cb - OVS data in skb CB + * @egress_tun_key: Tunnel information about this packet on egress path. + * NULL if the packet is not being tunneled. + * @input_vport: The original vport packet came in on. This value is cached + * when a packet is received by OVS. + */ +struct ovs_skb_cb { + struct ovs_tunnel_info *egress_tun_info; + struct vport *input_vport; +}; +#define OVS_CB(skb) ((struct ovs_skb_cb *)(skb)->cb) + +/** + * struct dp_upcall - metadata to include with a packet to send to userspace + * @cmd: One of %OVS_PACKET_CMD_*. + * @userdata: If nonnull, its variable-length value is passed to userspace as + * %OVS_PACKET_ATTR_USERDATA. + * @portid: Netlink portid to which packet should be sent. If @portid is 0 + * then no packet is sent and the packet is accounted in the datapath's @n_lost + * counter. + * @egress_tun_info: If nonnull, becomes %OVS_PACKET_ATTR_EGRESS_TUN_KEY. + */ +struct dp_upcall_info { + const struct ovs_tunnel_info *egress_tun_info; + const struct nlattr *userdata; + u32 portid; + u8 cmd; +}; + +/** + * struct ovs_net - Per net-namespace data for ovs. + * @dps: List of datapaths to enable dumping them all out. + * Protected by genl_mutex. + */ +struct ovs_net { + struct list_head dps; + struct work_struct dp_notify_work; + struct vport_net vport_net; +}; + +extern int ovs_net_id; +void ovs_lock(void); +void ovs_unlock(void); + +#ifdef CONFIG_LOCKDEP +int lockdep_ovsl_is_held(void); +#else +#define lockdep_ovsl_is_held() 1 +#endif + +#define ASSERT_OVSL() WARN_ON(!lockdep_ovsl_is_held()) +#define ovsl_dereference(p) \ + rcu_dereference_protected(p, lockdep_ovsl_is_held()) +#define rcu_dereference_ovsl(p) \ + rcu_dereference_check(p, lockdep_ovsl_is_held()) + +static inline struct net *ovs_dp_get_net(const struct datapath *dp) +{ + return read_pnet(&dp->net); +} + +static inline void ovs_dp_set_net(struct datapath *dp, struct net *net) +{ + write_pnet(&dp->net, net); +} + +struct vport *ovs_lookup_vport(const struct datapath *dp, u16 port_no); + +static inline struct vport *ovs_vport_rcu(const struct datapath *dp, int port_no) +{ + WARN_ON_ONCE(!rcu_read_lock_held()); + return ovs_lookup_vport(dp, port_no); +} + +static inline struct vport *ovs_vport_ovsl_rcu(const struct datapath *dp, int port_no) +{ + WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_ovsl_is_held()); + return ovs_lookup_vport(dp, port_no); +} + +static inline struct vport *ovs_vport_ovsl(const struct datapath *dp, int port_no) +{ + ASSERT_OVSL(); + return ovs_lookup_vport(dp, port_no); +} + +extern struct notifier_block ovs_dp_device_notifier; +extern struct genl_family dp_vport_genl_family; + +void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key); +void ovs_dp_detach_port(struct vport *); +int ovs_dp_upcall(struct datapath *, struct sk_buff *, + const struct sw_flow_key *, const struct dp_upcall_info *); + +const char *ovs_dp_name(const struct datapath *dp); +struct sk_buff *ovs_vport_cmd_build_info(struct vport *, u32 pid, u32 seq, + u8 cmd); + +int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb, + const struct sw_flow_actions *, struct sw_flow_key *); + +void ovs_dp_notify_wq(struct work_struct *work); + +int action_fifos_init(void); +void action_fifos_exit(void); + +#define OVS_NLERR(logging_allowed, fmt, ...) \ +do { \ + if (logging_allowed && net_ratelimit()) \ + pr_info("netlink: " fmt "\n", ##__VA_ARGS__); \ +} while (0) +#endif /* datapath.h */ diff --git a/net/openvswitch/dp_notify.c b/net/openvswitch/dp_notify.c new file mode 100644 index 000000000..2c631fe76 --- /dev/null +++ b/net/openvswitch/dp_notify.c @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2007-2012 Nicira, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + */ + +#include +#include +#include + +#include "datapath.h" +#include "vport-internal_dev.h" +#include "vport-netdev.h" + +static void dp_detach_port_notify(struct vport *vport) +{ + struct sk_buff *notify; + struct datapath *dp; + + dp = vport->dp; + notify = ovs_vport_cmd_build_info(vport, 0, 0, + OVS_VPORT_CMD_DEL); + ovs_dp_detach_port(vport); + if (IS_ERR(notify)) { + genl_set_err(&dp_vport_genl_family, ovs_dp_get_net(dp), 0, + 0, PTR_ERR(notify)); + return; + } + + genlmsg_multicast_netns(&dp_vport_genl_family, + ovs_dp_get_net(dp), notify, 0, + 0, GFP_KERNEL); +} + +void ovs_dp_notify_wq(struct work_struct *work) +{ + struct ovs_net *ovs_net = container_of(work, struct ovs_net, dp_notify_work); + struct datapath *dp; + + ovs_lock(); + list_for_each_entry(dp, &ovs_net->dps, list_node) { + int i; + + for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) { + struct vport *vport; + struct hlist_node *n; + + hlist_for_each_entry_safe(vport, n, &dp->ports[i], dp_hash_node) { + struct netdev_vport *netdev_vport; + + if (vport->ops->type != OVS_VPORT_TYPE_NETDEV) + continue; + + netdev_vport = netdev_vport_priv(vport); + if (!(netdev_vport->dev->priv_flags & IFF_OVS_DATAPATH)) + dp_detach_port_notify(vport); + } + } + } + ovs_unlock(); +} + +static int dp_device_event(struct notifier_block *unused, unsigned long event, + void *ptr) +{ + struct ovs_net *ovs_net; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct vport *vport = NULL; + + if (!ovs_is_internal_dev(dev)) + vport = ovs_netdev_get_vport(dev); + + if (!vport) + return NOTIFY_DONE; + + if (event == NETDEV_UNREGISTER) { + /* upper_dev_unlink and decrement promisc immediately */ + ovs_netdev_detach_dev(vport); + + /* schedule vport destroy, dev_put and genl notification */ + ovs_net = net_generic(dev_net(dev), ovs_net_id); + queue_work(system_wq, &ovs_net->dp_notify_work); + } + + return NOTIFY_DONE; +} + +struct notifier_block ovs_dp_device_notifier = { + .notifier_call = dp_device_event +}; diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c new file mode 100644 index 000000000..2dacc7b5a --- /dev/null +++ b/net/openvswitch/flow.c @@ -0,0 +1,730 @@ +/* + * Copyright (c) 2007-2014 Nicira, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "datapath.h" +#include "flow.h" +#include "flow_netlink.h" + +u64 ovs_flow_used_time(unsigned long flow_jiffies) +{ + struct timespec cur_ts; + u64 cur_ms, idle_ms; + + ktime_get_ts(&cur_ts); + idle_ms = jiffies_to_msecs(jiffies - flow_jiffies); + cur_ms = (u64)cur_ts.tv_sec * MSEC_PER_SEC + + cur_ts.tv_nsec / NSEC_PER_MSEC; + + return cur_ms - idle_ms; +} + +#define TCP_FLAGS_BE16(tp) (*(__be16 *)&tcp_flag_word(tp) & htons(0x0FFF)) + +void ovs_flow_stats_update(struct sw_flow *flow, __be16 tcp_flags, + const struct sk_buff *skb) +{ + struct flow_stats *stats; + int node = numa_node_id(); + int len = skb->len + (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0); + + stats = rcu_dereference(flow->stats[node]); + + /* Check if already have node-specific stats. */ + if (likely(stats)) { + spin_lock(&stats->lock); + /* Mark if we write on the pre-allocated stats. */ + if (node == 0 && unlikely(flow->stats_last_writer != node)) + flow->stats_last_writer = node; + } else { + stats = rcu_dereference(flow->stats[0]); /* Pre-allocated. */ + spin_lock(&stats->lock); + + /* If the current NUMA-node is the only writer on the + * pre-allocated stats keep using them. + */ + if (unlikely(flow->stats_last_writer != node)) { + /* A previous locker may have already allocated the + * stats, so we need to check again. If node-specific + * stats were already allocated, we update the pre- + * allocated stats as we have already locked them. + */ + if (likely(flow->stats_last_writer != NUMA_NO_NODE) + && likely(!rcu_access_pointer(flow->stats[node]))) { + /* Try to allocate node-specific stats. */ + struct flow_stats *new_stats; + + new_stats = + kmem_cache_alloc_node(flow_stats_cache, + GFP_NOWAIT | + __GFP_THISNODE | + __GFP_NOWARN | + __GFP_NOMEMALLOC, + node); + if (likely(new_stats)) { + new_stats->used = jiffies; + new_stats->packet_count = 1; + new_stats->byte_count = len; + new_stats->tcp_flags = tcp_flags; + spin_lock_init(&new_stats->lock); + + rcu_assign_pointer(flow->stats[node], + new_stats); + goto unlock; + } + } + flow->stats_last_writer = node; + } + } + + stats->used = jiffies; + stats->packet_count++; + stats->byte_count += len; + stats->tcp_flags |= tcp_flags; +unlock: + spin_unlock(&stats->lock); +} + +/* Must be called with rcu_read_lock or ovs_mutex. */ +void ovs_flow_stats_get(const struct sw_flow *flow, + struct ovs_flow_stats *ovs_stats, + unsigned long *used, __be16 *tcp_flags) +{ + int node; + + *used = 0; + *tcp_flags = 0; + memset(ovs_stats, 0, sizeof(*ovs_stats)); + + for_each_node(node) { + struct flow_stats *stats = rcu_dereference_ovsl(flow->stats[node]); + + if (stats) { + /* Local CPU may write on non-local stats, so we must + * block bottom-halves here. + */ + spin_lock_bh(&stats->lock); + if (!*used || time_after(stats->used, *used)) + *used = stats->used; + *tcp_flags |= stats->tcp_flags; + ovs_stats->n_packets += stats->packet_count; + ovs_stats->n_bytes += stats->byte_count; + spin_unlock_bh(&stats->lock); + } + } +} + +/* Called with ovs_mutex. */ +void ovs_flow_stats_clear(struct sw_flow *flow) +{ + int node; + + for_each_node(node) { + struct flow_stats *stats = ovsl_dereference(flow->stats[node]); + + if (stats) { + spin_lock_bh(&stats->lock); + stats->used = 0; + stats->packet_count = 0; + stats->byte_count = 0; + stats->tcp_flags = 0; + spin_unlock_bh(&stats->lock); + } + } +} + +static int check_header(struct sk_buff *skb, int len) +{ + if (unlikely(skb->len < len)) + return -EINVAL; + if (unlikely(!pskb_may_pull(skb, len))) + return -ENOMEM; + return 0; +} + +static bool arphdr_ok(struct sk_buff *skb) +{ + return pskb_may_pull(skb, skb_network_offset(skb) + + sizeof(struct arp_eth_header)); +} + +static int check_iphdr(struct sk_buff *skb) +{ + unsigned int nh_ofs = skb_network_offset(skb); + unsigned int ip_len; + int err; + + err = check_header(skb, nh_ofs + sizeof(struct iphdr)); + if (unlikely(err)) + return err; + + ip_len = ip_hdrlen(skb); + if (unlikely(ip_len < sizeof(struct iphdr) || + skb->len < nh_ofs + ip_len)) + return -EINVAL; + + skb_set_transport_header(skb, nh_ofs + ip_len); + return 0; +} + +static bool tcphdr_ok(struct sk_buff *skb) +{ + int th_ofs = skb_transport_offset(skb); + int tcp_len; + + if (unlikely(!pskb_may_pull(skb, th_ofs + sizeof(struct tcphdr)))) + return false; + + tcp_len = tcp_hdrlen(skb); + if (unlikely(tcp_len < sizeof(struct tcphdr) || + skb->len < th_ofs + tcp_len)) + return false; + + return true; +} + +static bool udphdr_ok(struct sk_buff *skb) +{ + return pskb_may_pull(skb, skb_transport_offset(skb) + + sizeof(struct udphdr)); +} + +static bool sctphdr_ok(struct sk_buff *skb) +{ + return pskb_may_pull(skb, skb_transport_offset(skb) + + sizeof(struct sctphdr)); +} + +static bool icmphdr_ok(struct sk_buff *skb) +{ + return pskb_may_pull(skb, skb_transport_offset(skb) + + sizeof(struct icmphdr)); +} + +static int parse_ipv6hdr(struct sk_buff *skb, struct sw_flow_key *key) +{ + unsigned int nh_ofs = skb_network_offset(skb); + unsigned int nh_len; + int payload_ofs; + struct ipv6hdr *nh; + uint8_t nexthdr; + __be16 frag_off; + int err; + + err = check_header(skb, nh_ofs + sizeof(*nh)); + if (unlikely(err)) + return err; + + nh = ipv6_hdr(skb); + nexthdr = nh->nexthdr; + payload_ofs = (u8 *)(nh + 1) - skb->data; + + key->ip.proto = NEXTHDR_NONE; + key->ip.tos = ipv6_get_dsfield(nh); + key->ip.ttl = nh->hop_limit; + key->ipv6.label = *(__be32 *)nh & htonl(IPV6_FLOWINFO_FLOWLABEL); + key->ipv6.addr.src = nh->saddr; + key->ipv6.addr.dst = nh->daddr; + + payload_ofs = ipv6_skip_exthdr(skb, payload_ofs, &nexthdr, &frag_off); + if (unlikely(payload_ofs < 0)) + return -EINVAL; + + if (frag_off) { + if (frag_off & htons(~0x7)) + key->ip.frag = OVS_FRAG_TYPE_LATER; + else + key->ip.frag = OVS_FRAG_TYPE_FIRST; + } else { + key->ip.frag = OVS_FRAG_TYPE_NONE; + } + + nh_len = payload_ofs - nh_ofs; + skb_set_transport_header(skb, nh_ofs + nh_len); + key->ip.proto = nexthdr; + return nh_len; +} + +static bool icmp6hdr_ok(struct sk_buff *skb) +{ + return pskb_may_pull(skb, skb_transport_offset(skb) + + sizeof(struct icmp6hdr)); +} + +static int parse_vlan(struct sk_buff *skb, struct sw_flow_key *key) +{ + struct qtag_prefix { + __be16 eth_type; /* ETH_P_8021Q */ + __be16 tci; + }; + struct qtag_prefix *qp; + + if (unlikely(skb->len < sizeof(struct qtag_prefix) + sizeof(__be16))) + return 0; + + if (unlikely(!pskb_may_pull(skb, sizeof(struct qtag_prefix) + + sizeof(__be16)))) + return -ENOMEM; + + qp = (struct qtag_prefix *) skb->data; + key->eth.tci = qp->tci | htons(VLAN_TAG_PRESENT); + __skb_pull(skb, sizeof(struct qtag_prefix)); + + return 0; +} + +static __be16 parse_ethertype(struct sk_buff *skb) +{ + struct llc_snap_hdr { + u8 dsap; /* Always 0xAA */ + u8 ssap; /* Always 0xAA */ + u8 ctrl; + u8 oui[3]; + __be16 ethertype; + }; + struct llc_snap_hdr *llc; + __be16 proto; + + proto = *(__be16 *) skb->data; + __skb_pull(skb, sizeof(__be16)); + + if (ntohs(proto) >= ETH_P_802_3_MIN) + return proto; + + if (skb->len < sizeof(struct llc_snap_hdr)) + return htons(ETH_P_802_2); + + if (unlikely(!pskb_may_pull(skb, sizeof(struct llc_snap_hdr)))) + return htons(0); + + llc = (struct llc_snap_hdr *) skb->data; + if (llc->dsap != LLC_SAP_SNAP || + llc->ssap != LLC_SAP_SNAP || + (llc->oui[0] | llc->oui[1] | llc->oui[2]) != 0) + return htons(ETH_P_802_2); + + __skb_pull(skb, sizeof(struct llc_snap_hdr)); + + if (ntohs(llc->ethertype) >= ETH_P_802_3_MIN) + return llc->ethertype; + + return htons(ETH_P_802_2); +} + +static int parse_icmpv6(struct sk_buff *skb, struct sw_flow_key *key, + int nh_len) +{ + struct icmp6hdr *icmp = icmp6_hdr(skb); + + /* The ICMPv6 type and code fields use the 16-bit transport port + * fields, so we need to store them in 16-bit network byte order. + */ + key->tp.src = htons(icmp->icmp6_type); + key->tp.dst = htons(icmp->icmp6_code); + memset(&key->ipv6.nd, 0, sizeof(key->ipv6.nd)); + + if (icmp->icmp6_code == 0 && + (icmp->icmp6_type == NDISC_NEIGHBOUR_SOLICITATION || + icmp->icmp6_type == NDISC_NEIGHBOUR_ADVERTISEMENT)) { + int icmp_len = skb->len - skb_transport_offset(skb); + struct nd_msg *nd; + int offset; + + /* In order to process neighbor discovery options, we need the + * entire packet. + */ + if (unlikely(icmp_len < sizeof(*nd))) + return 0; + + if (unlikely(skb_linearize(skb))) + return -ENOMEM; + + nd = (struct nd_msg *)skb_transport_header(skb); + key->ipv6.nd.target = nd->target; + + icmp_len -= sizeof(*nd); + offset = 0; + while (icmp_len >= 8) { + struct nd_opt_hdr *nd_opt = + (struct nd_opt_hdr *)(nd->opt + offset); + int opt_len = nd_opt->nd_opt_len * 8; + + if (unlikely(!opt_len || opt_len > icmp_len)) + return 0; + + /* Store the link layer address if the appropriate + * option is provided. It is considered an error if + * the same link layer option is specified twice. + */ + if (nd_opt->nd_opt_type == ND_OPT_SOURCE_LL_ADDR + && opt_len == 8) { + if (unlikely(!is_zero_ether_addr(key->ipv6.nd.sll))) + goto invalid; + ether_addr_copy(key->ipv6.nd.sll, + &nd->opt[offset+sizeof(*nd_opt)]); + } else if (nd_opt->nd_opt_type == ND_OPT_TARGET_LL_ADDR + && opt_len == 8) { + if (unlikely(!is_zero_ether_addr(key->ipv6.nd.tll))) + goto invalid; + ether_addr_copy(key->ipv6.nd.tll, + &nd->opt[offset+sizeof(*nd_opt)]); + } + + icmp_len -= opt_len; + offset += opt_len; + } + } + + return 0; + +invalid: + memset(&key->ipv6.nd.target, 0, sizeof(key->ipv6.nd.target)); + memset(key->ipv6.nd.sll, 0, sizeof(key->ipv6.nd.sll)); + memset(key->ipv6.nd.tll, 0, sizeof(key->ipv6.nd.tll)); + + return 0; +} + +/** + * key_extract - extracts a flow key from an Ethernet frame. + * @skb: sk_buff that contains the frame, with skb->data pointing to the + * Ethernet header + * @key: output flow key + * + * The caller must ensure that skb->len >= ETH_HLEN. + * + * Returns 0 if successful, otherwise a negative errno value. + * + * Initializes @skb header pointers as follows: + * + * - skb->mac_header: the Ethernet header. + * + * - skb->network_header: just past the Ethernet header, or just past the + * VLAN header, to the first byte of the Ethernet payload. + * + * - skb->transport_header: If key->eth.type is ETH_P_IP or ETH_P_IPV6 + * on output, then just past the IP header, if one is present and + * of a correct length, otherwise the same as skb->network_header. + * For other key->eth.type values it is left untouched. + */ +static int key_extract(struct sk_buff *skb, struct sw_flow_key *key) +{ + int error; + struct ethhdr *eth; + + /* Flags are always used as part of stats */ + key->tp.flags = 0; + + skb_reset_mac_header(skb); + + /* Link layer. We are guaranteed to have at least the 14 byte Ethernet + * header in the linear data area. + */ + eth = eth_hdr(skb); + ether_addr_copy(key->eth.src, eth->h_source); + ether_addr_copy(key->eth.dst, eth->h_dest); + + __skb_pull(skb, 2 * ETH_ALEN); + /* We are going to push all headers that we pull, so no need to + * update skb->csum here. + */ + + key->eth.tci = 0; + if (skb_vlan_tag_present(skb)) + key->eth.tci = htons(skb->vlan_tci); + else if (eth->h_proto == htons(ETH_P_8021Q)) + if (unlikely(parse_vlan(skb, key))) + return -ENOMEM; + + key->eth.type = parse_ethertype(skb); + if (unlikely(key->eth.type == htons(0))) + return -ENOMEM; + + skb_reset_network_header(skb); + skb_reset_mac_len(skb); + __skb_push(skb, skb->data - skb_mac_header(skb)); + + /* Network layer. */ + if (key->eth.type == htons(ETH_P_IP)) { + struct iphdr *nh; + __be16 offset; + + error = check_iphdr(skb); + if (unlikely(error)) { + memset(&key->ip, 0, sizeof(key->ip)); + memset(&key->ipv4, 0, sizeof(key->ipv4)); + if (error == -EINVAL) { + skb->transport_header = skb->network_header; + error = 0; + } + return error; + } + + nh = ip_hdr(skb); + key->ipv4.addr.src = nh->saddr; + key->ipv4.addr.dst = nh->daddr; + + key->ip.proto = nh->protocol; + key->ip.tos = nh->tos; + key->ip.ttl = nh->ttl; + + offset = nh->frag_off & htons(IP_OFFSET); + if (offset) { + key->ip.frag = OVS_FRAG_TYPE_LATER; + return 0; + } + if (nh->frag_off & htons(IP_MF) || + skb_shinfo(skb)->gso_type & SKB_GSO_UDP) + key->ip.frag = OVS_FRAG_TYPE_FIRST; + else + key->ip.frag = OVS_FRAG_TYPE_NONE; + + /* Transport layer. */ + if (key->ip.proto == IPPROTO_TCP) { + if (tcphdr_ok(skb)) { + struct tcphdr *tcp = tcp_hdr(skb); + key->tp.src = tcp->source; + key->tp.dst = tcp->dest; + key->tp.flags = TCP_FLAGS_BE16(tcp); + } else { + memset(&key->tp, 0, sizeof(key->tp)); + } + + } else if (key->ip.proto == IPPROTO_UDP) { + if (udphdr_ok(skb)) { + struct udphdr *udp = udp_hdr(skb); + key->tp.src = udp->source; + key->tp.dst = udp->dest; + } else { + memset(&key->tp, 0, sizeof(key->tp)); + } + } else if (key->ip.proto == IPPROTO_SCTP) { + if (sctphdr_ok(skb)) { + struct sctphdr *sctp = sctp_hdr(skb); + key->tp.src = sctp->source; + key->tp.dst = sctp->dest; + } else { + memset(&key->tp, 0, sizeof(key->tp)); + } + } else if (key->ip.proto == IPPROTO_ICMP) { + if (icmphdr_ok(skb)) { + struct icmphdr *icmp = icmp_hdr(skb); + /* The ICMP type and code fields use the 16-bit + * transport port fields, so we need to store + * them in 16-bit network byte order. */ + key->tp.src = htons(icmp->type); + key->tp.dst = htons(icmp->code); + } else { + memset(&key->tp, 0, sizeof(key->tp)); + } + } + + } else if (key->eth.type == htons(ETH_P_ARP) || + key->eth.type == htons(ETH_P_RARP)) { + struct arp_eth_header *arp; + bool arp_available = arphdr_ok(skb); + + arp = (struct arp_eth_header *)skb_network_header(skb); + + if (arp_available && + arp->ar_hrd == htons(ARPHRD_ETHER) && + arp->ar_pro == htons(ETH_P_IP) && + arp->ar_hln == ETH_ALEN && + arp->ar_pln == 4) { + + /* We only match on the lower 8 bits of the opcode. */ + if (ntohs(arp->ar_op) <= 0xff) + key->ip.proto = ntohs(arp->ar_op); + else + key->ip.proto = 0; + + memcpy(&key->ipv4.addr.src, arp->ar_sip, sizeof(key->ipv4.addr.src)); + memcpy(&key->ipv4.addr.dst, arp->ar_tip, sizeof(key->ipv4.addr.dst)); + ether_addr_copy(key->ipv4.arp.sha, arp->ar_sha); + ether_addr_copy(key->ipv4.arp.tha, arp->ar_tha); + } else { + memset(&key->ip, 0, sizeof(key->ip)); + memset(&key->ipv4, 0, sizeof(key->ipv4)); + } + } else if (eth_p_mpls(key->eth.type)) { + size_t stack_len = MPLS_HLEN; + + /* In the presence of an MPLS label stack the end of the L2 + * header and the beginning of the L3 header differ. + * + * Advance network_header to the beginning of the L3 + * header. mac_len corresponds to the end of the L2 header. + */ + while (1) { + __be32 lse; + + error = check_header(skb, skb->mac_len + stack_len); + if (unlikely(error)) + return 0; + + memcpy(&lse, skb_network_header(skb), MPLS_HLEN); + + if (stack_len == MPLS_HLEN) + memcpy(&key->mpls.top_lse, &lse, MPLS_HLEN); + + skb_set_network_header(skb, skb->mac_len + stack_len); + if (lse & htonl(MPLS_LS_S_MASK)) + break; + + stack_len += MPLS_HLEN; + } + } else if (key->eth.type == htons(ETH_P_IPV6)) { + int nh_len; /* IPv6 Header + Extensions */ + + nh_len = parse_ipv6hdr(skb, key); + if (unlikely(nh_len < 0)) { + memset(&key->ip, 0, sizeof(key->ip)); + memset(&key->ipv6.addr, 0, sizeof(key->ipv6.addr)); + if (nh_len == -EINVAL) { + skb->transport_header = skb->network_header; + error = 0; + } else { + error = nh_len; + } + return error; + } + + if (key->ip.frag == OVS_FRAG_TYPE_LATER) + return 0; + if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP) + key->ip.frag = OVS_FRAG_TYPE_FIRST; + + /* Transport layer. */ + if (key->ip.proto == NEXTHDR_TCP) { + if (tcphdr_ok(skb)) { + struct tcphdr *tcp = tcp_hdr(skb); + key->tp.src = tcp->source; + key->tp.dst = tcp->dest; + key->tp.flags = TCP_FLAGS_BE16(tcp); + } else { + memset(&key->tp, 0, sizeof(key->tp)); + } + } else if (key->ip.proto == NEXTHDR_UDP) { + if (udphdr_ok(skb)) { + struct udphdr *udp = udp_hdr(skb); + key->tp.src = udp->source; + key->tp.dst = udp->dest; + } else { + memset(&key->tp, 0, sizeof(key->tp)); + } + } else if (key->ip.proto == NEXTHDR_SCTP) { + if (sctphdr_ok(skb)) { + struct sctphdr *sctp = sctp_hdr(skb); + key->tp.src = sctp->source; + key->tp.dst = sctp->dest; + } else { + memset(&key->tp, 0, sizeof(key->tp)); + } + } else if (key->ip.proto == NEXTHDR_ICMP) { + if (icmp6hdr_ok(skb)) { + error = parse_icmpv6(skb, key, nh_len); + if (error) + return error; + } else { + memset(&key->tp, 0, sizeof(key->tp)); + } + } + } + return 0; +} + +int ovs_flow_key_update(struct sk_buff *skb, struct sw_flow_key *key) +{ + return key_extract(skb, key); +} + +int ovs_flow_key_extract(const struct ovs_tunnel_info *tun_info, + struct sk_buff *skb, struct sw_flow_key *key) +{ + /* Extract metadata from packet. */ + if (tun_info) { + memcpy(&key->tun_key, &tun_info->tunnel, sizeof(key->tun_key)); + + if (tun_info->options) { + BUILD_BUG_ON((1 << (sizeof(tun_info->options_len) * + 8)) - 1 + > sizeof(key->tun_opts)); + memcpy(TUN_METADATA_OPTS(key, tun_info->options_len), + tun_info->options, tun_info->options_len); + key->tun_opts_len = tun_info->options_len; + } else { + key->tun_opts_len = 0; + } + } else { + key->tun_opts_len = 0; + memset(&key->tun_key, 0, sizeof(key->tun_key)); + } + + key->phy.priority = skb->priority; + key->phy.in_port = OVS_CB(skb)->input_vport->port_no; + key->phy.skb_mark = skb->mark; + key->ovs_flow_hash = 0; + key->recirc_id = 0; + + return key_extract(skb, key); +} + +int ovs_flow_key_extract_userspace(const struct nlattr *attr, + struct sk_buff *skb, + struct sw_flow_key *key, bool log) +{ + int err; + + memset(key, 0, OVS_SW_FLOW_KEY_METADATA_SIZE); + + /* Extract metadata from netlink attributes. */ + err = ovs_nla_get_flow_metadata(attr, key, log); + if (err) + return err; + + return key_extract(skb, key); +} diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h new file mode 100644 index 000000000..a076e445c --- /dev/null +++ b/net/openvswitch/flow.h @@ -0,0 +1,284 @@ +/* + * Copyright (c) 2007-2014 Nicira, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + */ + +#ifndef FLOW_H +#define FLOW_H 1 + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct sk_buff; + +/* Used to memset ovs_key_ipv4_tunnel padding. */ +#define OVS_TUNNEL_KEY_SIZE \ + (offsetof(struct ovs_key_ipv4_tunnel, tp_dst) + \ + FIELD_SIZEOF(struct ovs_key_ipv4_tunnel, tp_dst)) + +struct ovs_key_ipv4_tunnel { + __be64 tun_id; + __be32 ipv4_src; + __be32 ipv4_dst; + __be16 tun_flags; + u8 ipv4_tos; + u8 ipv4_ttl; + __be16 tp_src; + __be16 tp_dst; +} __packed __aligned(4); /* Minimize padding. */ + +struct ovs_tunnel_info { + struct ovs_key_ipv4_tunnel tunnel; + const void *options; + u8 options_len; +}; + +/* Store options at the end of the array if they are less than the + * maximum size. This allows us to get the benefits of variable length + * matching for small options. + */ +#define TUN_METADATA_OFFSET(opt_len) \ + (FIELD_SIZEOF(struct sw_flow_key, tun_opts) - opt_len) +#define TUN_METADATA_OPTS(flow_key, opt_len) \ + ((void *)((flow_key)->tun_opts + TUN_METADATA_OFFSET(opt_len))) + +static inline void __ovs_flow_tun_info_init(struct ovs_tunnel_info *tun_info, + __be32 saddr, __be32 daddr, + u8 tos, u8 ttl, + __be16 tp_src, + __be16 tp_dst, + __be64 tun_id, + __be16 tun_flags, + const void *opts, + u8 opts_len) +{ + tun_info->tunnel.tun_id = tun_id; + tun_info->tunnel.ipv4_src = saddr; + tun_info->tunnel.ipv4_dst = daddr; + tun_info->tunnel.ipv4_tos = tos; + tun_info->tunnel.ipv4_ttl = ttl; + tun_info->tunnel.tun_flags = tun_flags; + + /* For the tunnel types on the top of IPsec, the tp_src and tp_dst of + * the upper tunnel are used. + * E.g: GRE over IPSEC, the tp_src and tp_port are zero. + */ + tun_info->tunnel.tp_src = tp_src; + tun_info->tunnel.tp_dst = tp_dst; + + /* Clear struct padding. */ + if (sizeof(tun_info->tunnel) != OVS_TUNNEL_KEY_SIZE) + memset((unsigned char *)&tun_info->tunnel + OVS_TUNNEL_KEY_SIZE, + 0, sizeof(tun_info->tunnel) - OVS_TUNNEL_KEY_SIZE); + + tun_info->options = opts; + tun_info->options_len = opts_len; +} + +static inline void ovs_flow_tun_info_init(struct ovs_tunnel_info *tun_info, + const struct iphdr *iph, + __be16 tp_src, + __be16 tp_dst, + __be64 tun_id, + __be16 tun_flags, + const void *opts, + u8 opts_len) +{ + __ovs_flow_tun_info_init(tun_info, iph->saddr, iph->daddr, + iph->tos, iph->ttl, + tp_src, tp_dst, + tun_id, tun_flags, + opts, opts_len); +} + +#define OVS_SW_FLOW_KEY_METADATA_SIZE \ + (offsetof(struct sw_flow_key, recirc_id) + \ + FIELD_SIZEOF(struct sw_flow_key, recirc_id)) + +struct sw_flow_key { + u8 tun_opts[255]; + u8 tun_opts_len; + struct ovs_key_ipv4_tunnel tun_key; /* Encapsulating tunnel key. */ + struct { + u32 priority; /* Packet QoS priority. */ + u32 skb_mark; /* SKB mark. */ + u16 in_port; /* Input switch port (or DP_MAX_PORTS). */ + } __packed phy; /* Safe when right after 'tun_key'. */ + u32 ovs_flow_hash; /* Datapath computed hash value. */ + u32 recirc_id; /* Recirculation ID. */ + struct { + u8 src[ETH_ALEN]; /* Ethernet source address. */ + u8 dst[ETH_ALEN]; /* Ethernet destination address. */ + __be16 tci; /* 0 if no VLAN, VLAN_TAG_PRESENT set otherwise. */ + __be16 type; /* Ethernet frame type. */ + } eth; + union { + struct { + __be32 top_lse; /* top label stack entry */ + } mpls; + struct { + u8 proto; /* IP protocol or lower 8 bits of ARP opcode. */ + u8 tos; /* IP ToS. */ + u8 ttl; /* IP TTL/hop limit. */ + u8 frag; /* One of OVS_FRAG_TYPE_*. */ + } ip; + }; + struct { + __be16 src; /* TCP/UDP/SCTP source port. */ + __be16 dst; /* TCP/UDP/SCTP destination port. */ + __be16 flags; /* TCP flags. */ + } tp; + union { + struct { + struct { + __be32 src; /* IP source address. */ + __be32 dst; /* IP destination address. */ + } addr; + struct { + u8 sha[ETH_ALEN]; /* ARP source hardware address. */ + u8 tha[ETH_ALEN]; /* ARP target hardware address. */ + } arp; + } ipv4; + struct { + struct { + struct in6_addr src; /* IPv6 source address. */ + struct in6_addr dst; /* IPv6 destination address. */ + } addr; + __be32 label; /* IPv6 flow label. */ + struct { + struct in6_addr target; /* ND target address. */ + u8 sll[ETH_ALEN]; /* ND source link layer address. */ + u8 tll[ETH_ALEN]; /* ND target link layer address. */ + } nd; + } ipv6; + }; +} __aligned(BITS_PER_LONG/8); /* Ensure that we can do comparisons as longs. */ + +struct sw_flow_key_range { + unsigned short int start; + unsigned short int end; +}; + +struct sw_flow_mask { + int ref_count; + struct rcu_head rcu; + struct list_head list; + struct sw_flow_key_range range; + struct sw_flow_key key; +}; + +struct sw_flow_match { + struct sw_flow_key *key; + struct sw_flow_key_range range; + struct sw_flow_mask *mask; +}; + +#define MAX_UFID_LENGTH 16 /* 128 bits */ + +struct sw_flow_id { + u32 ufid_len; + union { + u32 ufid[MAX_UFID_LENGTH / 4]; + struct sw_flow_key *unmasked_key; + }; +}; + +struct sw_flow_actions { + struct rcu_head rcu; + u32 actions_len; + struct nlattr actions[]; +}; + +struct flow_stats { + u64 packet_count; /* Number of packets matched. */ + u64 byte_count; /* Number of bytes matched. */ + unsigned long used; /* Last used time (in jiffies). */ + spinlock_t lock; /* Lock for atomic stats update. */ + __be16 tcp_flags; /* Union of seen TCP flags. */ +}; + +struct sw_flow { + struct rcu_head rcu; + struct { + struct hlist_node node[2]; + u32 hash; + } flow_table, ufid_table; + int stats_last_writer; /* NUMA-node id of the last writer on + * 'stats[0]'. + */ + struct sw_flow_key key; + struct sw_flow_id id; + struct sw_flow_mask *mask; + struct sw_flow_actions __rcu *sf_acts; + struct flow_stats __rcu *stats[]; /* One for each NUMA node. First one + * is allocated at flow creation time, + * the rest are allocated on demand + * while holding the 'stats[0].lock'. + */ +}; + +struct arp_eth_header { + __be16 ar_hrd; /* format of hardware address */ + __be16 ar_pro; /* format of protocol address */ + unsigned char ar_hln; /* length of hardware address */ + unsigned char ar_pln; /* length of protocol address */ + __be16 ar_op; /* ARP opcode (command) */ + + /* Ethernet+IPv4 specific members. */ + unsigned char ar_sha[ETH_ALEN]; /* sender hardware address */ + unsigned char ar_sip[4]; /* sender IP address */ + unsigned char ar_tha[ETH_ALEN]; /* target hardware address */ + unsigned char ar_tip[4]; /* target IP address */ +} __packed; + +static inline bool ovs_identifier_is_ufid(const struct sw_flow_id *sfid) +{ + return sfid->ufid_len; +} + +static inline bool ovs_identifier_is_key(const struct sw_flow_id *sfid) +{ + return !ovs_identifier_is_ufid(sfid); +} + +void ovs_flow_stats_update(struct sw_flow *, __be16 tcp_flags, + const struct sk_buff *); +void ovs_flow_stats_get(const struct sw_flow *, struct ovs_flow_stats *, + unsigned long *used, __be16 *tcp_flags); +void ovs_flow_stats_clear(struct sw_flow *); +u64 ovs_flow_used_time(unsigned long flow_jiffies); + +int ovs_flow_key_update(struct sk_buff *skb, struct sw_flow_key *key); +int ovs_flow_key_extract(const struct ovs_tunnel_info *tun_info, + struct sk_buff *skb, + struct sw_flow_key *key); +/* Extract key from packet coming from userspace. */ +int ovs_flow_key_extract_userspace(const struct nlattr *attr, + struct sk_buff *skb, + struct sw_flow_key *key, bool log); + +#endif /* flow.h */ diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c new file mode 100644 index 000000000..c691b1a1e --- /dev/null +++ b/net/openvswitch/flow_netlink.c @@ -0,0 +1,2309 @@ +/* + * Copyright (c) 2007-2014 Nicira, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include "flow.h" +#include "datapath.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "flow_netlink.h" +#include "vport-vxlan.h" + +struct ovs_len_tbl { + int len; + const struct ovs_len_tbl *next; +}; + +#define OVS_ATTR_NESTED -1 + +static void update_range(struct sw_flow_match *match, + size_t offset, size_t size, bool is_mask) +{ + struct sw_flow_key_range *range; + size_t start = rounddown(offset, sizeof(long)); + size_t end = roundup(offset + size, sizeof(long)); + + if (!is_mask) + range = &match->range; + else + range = &match->mask->range; + + if (range->start == range->end) { + range->start = start; + range->end = end; + return; + } + + if (range->start > start) + range->start = start; + + if (range->end < end) + range->end = end; +} + +#define SW_FLOW_KEY_PUT(match, field, value, is_mask) \ + do { \ + update_range(match, offsetof(struct sw_flow_key, field), \ + sizeof((match)->key->field), is_mask); \ + if (is_mask) \ + (match)->mask->key.field = value; \ + else \ + (match)->key->field = value; \ + } while (0) + +#define SW_FLOW_KEY_MEMCPY_OFFSET(match, offset, value_p, len, is_mask) \ + do { \ + update_range(match, offset, len, is_mask); \ + if (is_mask) \ + memcpy((u8 *)&(match)->mask->key + offset, value_p, \ + len); \ + else \ + memcpy((u8 *)(match)->key + offset, value_p, len); \ + } while (0) + +#define SW_FLOW_KEY_MEMCPY(match, field, value_p, len, is_mask) \ + SW_FLOW_KEY_MEMCPY_OFFSET(match, offsetof(struct sw_flow_key, field), \ + value_p, len, is_mask) + +#define SW_FLOW_KEY_MEMSET_FIELD(match, field, value, is_mask) \ + do { \ + update_range(match, offsetof(struct sw_flow_key, field), \ + sizeof((match)->key->field), is_mask); \ + if (is_mask) \ + memset((u8 *)&(match)->mask->key.field, value, \ + sizeof((match)->mask->key.field)); \ + else \ + memset((u8 *)&(match)->key->field, value, \ + sizeof((match)->key->field)); \ + } while (0) + +static bool match_validate(const struct sw_flow_match *match, + u64 key_attrs, u64 mask_attrs, bool log) +{ + u64 key_expected = 1 << OVS_KEY_ATTR_ETHERNET; + u64 mask_allowed = key_attrs; /* At most allow all key attributes */ + + /* The following mask attributes allowed only if they + * pass the validation tests. */ + mask_allowed &= ~((1 << OVS_KEY_ATTR_IPV4) + | (1 << OVS_KEY_ATTR_IPV6) + | (1 << OVS_KEY_ATTR_TCP) + | (1 << OVS_KEY_ATTR_TCP_FLAGS) + | (1 << OVS_KEY_ATTR_UDP) + | (1 << OVS_KEY_ATTR_SCTP) + | (1 << OVS_KEY_ATTR_ICMP) + | (1 << OVS_KEY_ATTR_ICMPV6) + | (1 << OVS_KEY_ATTR_ARP) + | (1 << OVS_KEY_ATTR_ND) + | (1 << OVS_KEY_ATTR_MPLS)); + + /* Always allowed mask fields. */ + mask_allowed |= ((1 << OVS_KEY_ATTR_TUNNEL) + | (1 << OVS_KEY_ATTR_IN_PORT) + | (1 << OVS_KEY_ATTR_ETHERTYPE)); + + /* Check key attributes. */ + if (match->key->eth.type == htons(ETH_P_ARP) + || match->key->eth.type == htons(ETH_P_RARP)) { + key_expected |= 1 << OVS_KEY_ATTR_ARP; + if (match->mask && (match->mask->key.eth.type == htons(0xffff))) + mask_allowed |= 1 << OVS_KEY_ATTR_ARP; + } + + if (eth_p_mpls(match->key->eth.type)) { + key_expected |= 1 << OVS_KEY_ATTR_MPLS; + if (match->mask && (match->mask->key.eth.type == htons(0xffff))) + mask_allowed |= 1 << OVS_KEY_ATTR_MPLS; + } + + if (match->key->eth.type == htons(ETH_P_IP)) { + key_expected |= 1 << OVS_KEY_ATTR_IPV4; + if (match->mask && (match->mask->key.eth.type == htons(0xffff))) + mask_allowed |= 1 << OVS_KEY_ATTR_IPV4; + + if (match->key->ip.frag != OVS_FRAG_TYPE_LATER) { + if (match->key->ip.proto == IPPROTO_UDP) { + key_expected |= 1 << OVS_KEY_ATTR_UDP; + if (match->mask && (match->mask->key.ip.proto == 0xff)) + mask_allowed |= 1 << OVS_KEY_ATTR_UDP; + } + + if (match->key->ip.proto == IPPROTO_SCTP) { + key_expected |= 1 << OVS_KEY_ATTR_SCTP; + if (match->mask && (match->mask->key.ip.proto == 0xff)) + mask_allowed |= 1 << OVS_KEY_ATTR_SCTP; + } + + if (match->key->ip.proto == IPPROTO_TCP) { + key_expected |= 1 << OVS_KEY_ATTR_TCP; + key_expected |= 1 << OVS_KEY_ATTR_TCP_FLAGS; + if (match->mask && (match->mask->key.ip.proto == 0xff)) { + mask_allowed |= 1 << OVS_KEY_ATTR_TCP; + mask_allowed |= 1 << OVS_KEY_ATTR_TCP_FLAGS; + } + } + + if (match->key->ip.proto == IPPROTO_ICMP) { + key_expected |= 1 << OVS_KEY_ATTR_ICMP; + if (match->mask && (match->mask->key.ip.proto == 0xff)) + mask_allowed |= 1 << OVS_KEY_ATTR_ICMP; + } + } + } + + if (match->key->eth.type == htons(ETH_P_IPV6)) { + key_expected |= 1 << OVS_KEY_ATTR_IPV6; + if (match->mask && (match->mask->key.eth.type == htons(0xffff))) + mask_allowed |= 1 << OVS_KEY_ATTR_IPV6; + + if (match->key->ip.frag != OVS_FRAG_TYPE_LATER) { + if (match->key->ip.proto == IPPROTO_UDP) { + key_expected |= 1 << OVS_KEY_ATTR_UDP; + if (match->mask && (match->mask->key.ip.proto == 0xff)) + mask_allowed |= 1 << OVS_KEY_ATTR_UDP; + } + + if (match->key->ip.proto == IPPROTO_SCTP) { + key_expected |= 1 << OVS_KEY_ATTR_SCTP; + if (match->mask && (match->mask->key.ip.proto == 0xff)) + mask_allowed |= 1 << OVS_KEY_ATTR_SCTP; + } + + if (match->key->ip.proto == IPPROTO_TCP) { + key_expected |= 1 << OVS_KEY_ATTR_TCP; + key_expected |= 1 << OVS_KEY_ATTR_TCP_FLAGS; + if (match->mask && (match->mask->key.ip.proto == 0xff)) { + mask_allowed |= 1 << OVS_KEY_ATTR_TCP; + mask_allowed |= 1 << OVS_KEY_ATTR_TCP_FLAGS; + } + } + + if (match->key->ip.proto == IPPROTO_ICMPV6) { + key_expected |= 1 << OVS_KEY_ATTR_ICMPV6; + if (match->mask && (match->mask->key.ip.proto == 0xff)) + mask_allowed |= 1 << OVS_KEY_ATTR_ICMPV6; + + if (match->key->tp.src == + htons(NDISC_NEIGHBOUR_SOLICITATION) || + match->key->tp.src == htons(NDISC_NEIGHBOUR_ADVERTISEMENT)) { + key_expected |= 1 << OVS_KEY_ATTR_ND; + if (match->mask && (match->mask->key.tp.src == htons(0xff))) + mask_allowed |= 1 << OVS_KEY_ATTR_ND; + } + } + } + } + + if ((key_attrs & key_expected) != key_expected) { + /* Key attributes check failed. */ + OVS_NLERR(log, "Missing key (keys=%llx, expected=%llx)", + (unsigned long long)key_attrs, + (unsigned long long)key_expected); + return false; + } + + if ((mask_attrs & mask_allowed) != mask_attrs) { + /* Mask attributes check failed. */ + OVS_NLERR(log, "Unexpected mask (mask=%llx, allowed=%llx)", + (unsigned long long)mask_attrs, + (unsigned long long)mask_allowed); + return false; + } + + return true; +} + +size_t ovs_tun_key_attr_size(void) +{ + /* Whenever adding new OVS_TUNNEL_KEY_ FIELDS, we should consider + * updating this function. + */ + return nla_total_size(8) /* OVS_TUNNEL_KEY_ATTR_ID */ + + nla_total_size(4) /* OVS_TUNNEL_KEY_ATTR_IPV4_SRC */ + + nla_total_size(4) /* OVS_TUNNEL_KEY_ATTR_IPV4_DST */ + + nla_total_size(1) /* OVS_TUNNEL_KEY_ATTR_TOS */ + + nla_total_size(1) /* OVS_TUNNEL_KEY_ATTR_TTL */ + + nla_total_size(0) /* OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT */ + + nla_total_size(0) /* OVS_TUNNEL_KEY_ATTR_CSUM */ + + nla_total_size(0) /* OVS_TUNNEL_KEY_ATTR_OAM */ + + nla_total_size(256) /* OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS */ + /* OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS is mutually exclusive with + * OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS and covered by it. + */ + + nla_total_size(2) /* OVS_TUNNEL_KEY_ATTR_TP_SRC */ + + nla_total_size(2); /* OVS_TUNNEL_KEY_ATTR_TP_DST */ +} + +size_t ovs_key_attr_size(void) +{ + /* Whenever adding new OVS_KEY_ FIELDS, we should consider + * updating this function. + */ + BUILD_BUG_ON(OVS_KEY_ATTR_TUNNEL_INFO != 22); + + return nla_total_size(4) /* OVS_KEY_ATTR_PRIORITY */ + + nla_total_size(0) /* OVS_KEY_ATTR_TUNNEL */ + + ovs_tun_key_attr_size() + + nla_total_size(4) /* OVS_KEY_ATTR_IN_PORT */ + + nla_total_size(4) /* OVS_KEY_ATTR_SKB_MARK */ + + nla_total_size(4) /* OVS_KEY_ATTR_DP_HASH */ + + nla_total_size(4) /* OVS_KEY_ATTR_RECIRC_ID */ + + nla_total_size(12) /* OVS_KEY_ATTR_ETHERNET */ + + nla_total_size(2) /* OVS_KEY_ATTR_ETHERTYPE */ + + nla_total_size(4) /* OVS_KEY_ATTR_VLAN */ + + nla_total_size(0) /* OVS_KEY_ATTR_ENCAP */ + + nla_total_size(2) /* OVS_KEY_ATTR_ETHERTYPE */ + + nla_total_size(40) /* OVS_KEY_ATTR_IPV6 */ + + nla_total_size(2) /* OVS_KEY_ATTR_ICMPV6 */ + + nla_total_size(28); /* OVS_KEY_ATTR_ND */ +} + +static const struct ovs_len_tbl ovs_tunnel_key_lens[OVS_TUNNEL_KEY_ATTR_MAX + 1] = { + [OVS_TUNNEL_KEY_ATTR_ID] = { .len = sizeof(u64) }, + [OVS_TUNNEL_KEY_ATTR_IPV4_SRC] = { .len = sizeof(u32) }, + [OVS_TUNNEL_KEY_ATTR_IPV4_DST] = { .len = sizeof(u32) }, + [OVS_TUNNEL_KEY_ATTR_TOS] = { .len = 1 }, + [OVS_TUNNEL_KEY_ATTR_TTL] = { .len = 1 }, + [OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT] = { .len = 0 }, + [OVS_TUNNEL_KEY_ATTR_CSUM] = { .len = 0 }, + [OVS_TUNNEL_KEY_ATTR_TP_SRC] = { .len = sizeof(u16) }, + [OVS_TUNNEL_KEY_ATTR_TP_DST] = { .len = sizeof(u16) }, + [OVS_TUNNEL_KEY_ATTR_OAM] = { .len = 0 }, + [OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS] = { .len = OVS_ATTR_NESTED }, + [OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS] = { .len = OVS_ATTR_NESTED }, +}; + +/* The size of the argument for each %OVS_KEY_ATTR_* Netlink attribute. */ +static const struct ovs_len_tbl ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = { + [OVS_KEY_ATTR_ENCAP] = { .len = OVS_ATTR_NESTED }, + [OVS_KEY_ATTR_PRIORITY] = { .len = sizeof(u32) }, + [OVS_KEY_ATTR_IN_PORT] = { .len = sizeof(u32) }, + [OVS_KEY_ATTR_SKB_MARK] = { .len = sizeof(u32) }, + [OVS_KEY_ATTR_ETHERNET] = { .len = sizeof(struct ovs_key_ethernet) }, + [OVS_KEY_ATTR_VLAN] = { .len = sizeof(__be16) }, + [OVS_KEY_ATTR_ETHERTYPE] = { .len = sizeof(__be16) }, + [OVS_KEY_ATTR_IPV4] = { .len = sizeof(struct ovs_key_ipv4) }, + [OVS_KEY_ATTR_IPV6] = { .len = sizeof(struct ovs_key_ipv6) }, + [OVS_KEY_ATTR_TCP] = { .len = sizeof(struct ovs_key_tcp) }, + [OVS_KEY_ATTR_TCP_FLAGS] = { .len = sizeof(__be16) }, + [OVS_KEY_ATTR_UDP] = { .len = sizeof(struct ovs_key_udp) }, + [OVS_KEY_ATTR_SCTP] = { .len = sizeof(struct ovs_key_sctp) }, + [OVS_KEY_ATTR_ICMP] = { .len = sizeof(struct ovs_key_icmp) }, + [OVS_KEY_ATTR_ICMPV6] = { .len = sizeof(struct ovs_key_icmpv6) }, + [OVS_KEY_ATTR_ARP] = { .len = sizeof(struct ovs_key_arp) }, + [OVS_KEY_ATTR_ND] = { .len = sizeof(struct ovs_key_nd) }, + [OVS_KEY_ATTR_RECIRC_ID] = { .len = sizeof(u32) }, + [OVS_KEY_ATTR_DP_HASH] = { .len = sizeof(u32) }, + [OVS_KEY_ATTR_TUNNEL] = { .len = OVS_ATTR_NESTED, + .next = ovs_tunnel_key_lens, }, + [OVS_KEY_ATTR_MPLS] = { .len = sizeof(struct ovs_key_mpls) }, +}; + +static bool is_all_zero(const u8 *fp, size_t size) +{ + int i; + + if (!fp) + return false; + + for (i = 0; i < size; i++) + if (fp[i]) + return false; + + return true; +} + +static int __parse_flow_nlattrs(const struct nlattr *attr, + const struct nlattr *a[], + u64 *attrsp, bool log, bool nz) +{ + const struct nlattr *nla; + u64 attrs; + int rem; + + attrs = *attrsp; + nla_for_each_nested(nla, attr, rem) { + u16 type = nla_type(nla); + int expected_len; + + if (type > OVS_KEY_ATTR_MAX) { + OVS_NLERR(log, "Key type %d is out of range max %d", + type, OVS_KEY_ATTR_MAX); + return -EINVAL; + } + + if (attrs & (1 << type)) { + OVS_NLERR(log, "Duplicate key (type %d).", type); + return -EINVAL; + } + + expected_len = ovs_key_lens[type].len; + if (nla_len(nla) != expected_len && expected_len != OVS_ATTR_NESTED) { + OVS_NLERR(log, "Key %d has unexpected len %d expected %d", + type, nla_len(nla), expected_len); + return -EINVAL; + } + + if (!nz || !is_all_zero(nla_data(nla), expected_len)) { + attrs |= 1 << type; + a[type] = nla; + } + } + if (rem) { + OVS_NLERR(log, "Message has %d unknown bytes.", rem); + return -EINVAL; + } + + *attrsp = attrs; + return 0; +} + +static int parse_flow_mask_nlattrs(const struct nlattr *attr, + const struct nlattr *a[], u64 *attrsp, + bool log) +{ + return __parse_flow_nlattrs(attr, a, attrsp, log, true); +} + +static int parse_flow_nlattrs(const struct nlattr *attr, + const struct nlattr *a[], u64 *attrsp, + bool log) +{ + return __parse_flow_nlattrs(attr, a, attrsp, log, false); +} + +static int genev_tun_opt_from_nlattr(const struct nlattr *a, + struct sw_flow_match *match, bool is_mask, + bool log) +{ + unsigned long opt_key_offset; + + if (nla_len(a) > sizeof(match->key->tun_opts)) { + OVS_NLERR(log, "Geneve option length err (len %d, max %zu).", + nla_len(a), sizeof(match->key->tun_opts)); + return -EINVAL; + } + + if (nla_len(a) % 4 != 0) { + OVS_NLERR(log, "Geneve opt len %d is not a multiple of 4.", + nla_len(a)); + return -EINVAL; + } + + /* We need to record the length of the options passed + * down, otherwise packets with the same format but + * additional options will be silently matched. + */ + if (!is_mask) { + SW_FLOW_KEY_PUT(match, tun_opts_len, nla_len(a), + false); + } else { + /* This is somewhat unusual because it looks at + * both the key and mask while parsing the + * attributes (and by extension assumes the key + * is parsed first). Normally, we would verify + * that each is the correct length and that the + * attributes line up in the validate function. + * However, that is difficult because this is + * variable length and we won't have the + * information later. + */ + if (match->key->tun_opts_len != nla_len(a)) { + OVS_NLERR(log, "Geneve option len %d != mask len %d", + match->key->tun_opts_len, nla_len(a)); + return -EINVAL; + } + + SW_FLOW_KEY_PUT(match, tun_opts_len, 0xff, true); + } + + opt_key_offset = TUN_METADATA_OFFSET(nla_len(a)); + SW_FLOW_KEY_MEMCPY_OFFSET(match, opt_key_offset, nla_data(a), + nla_len(a), is_mask); + return 0; +} + +static const struct nla_policy vxlan_opt_policy[OVS_VXLAN_EXT_MAX + 1] = { + [OVS_VXLAN_EXT_GBP] = { .type = NLA_U32 }, +}; + +static int vxlan_tun_opt_from_nlattr(const struct nlattr *a, + struct sw_flow_match *match, bool is_mask, + bool log) +{ + struct nlattr *tb[OVS_VXLAN_EXT_MAX+1]; + unsigned long opt_key_offset; + struct ovs_vxlan_opts opts; + int err; + + BUILD_BUG_ON(sizeof(opts) > sizeof(match->key->tun_opts)); + + err = nla_parse_nested(tb, OVS_VXLAN_EXT_MAX, a, vxlan_opt_policy); + if (err < 0) + return err; + + memset(&opts, 0, sizeof(opts)); + + if (tb[OVS_VXLAN_EXT_GBP]) + opts.gbp = nla_get_u32(tb[OVS_VXLAN_EXT_GBP]); + + if (!is_mask) + SW_FLOW_KEY_PUT(match, tun_opts_len, sizeof(opts), false); + else + SW_FLOW_KEY_PUT(match, tun_opts_len, 0xff, true); + + opt_key_offset = TUN_METADATA_OFFSET(sizeof(opts)); + SW_FLOW_KEY_MEMCPY_OFFSET(match, opt_key_offset, &opts, sizeof(opts), + is_mask); + return 0; +} + +static int ipv4_tun_from_nlattr(const struct nlattr *attr, + struct sw_flow_match *match, bool is_mask, + bool log) +{ + struct nlattr *a; + int rem; + bool ttl = false; + __be16 tun_flags = 0; + int opts_type = 0; + + nla_for_each_nested(a, attr, rem) { + int type = nla_type(a); + int err; + + if (type > OVS_TUNNEL_KEY_ATTR_MAX) { + OVS_NLERR(log, "Tunnel attr %d out of range max %d", + type, OVS_TUNNEL_KEY_ATTR_MAX); + return -EINVAL; + } + + if (ovs_tunnel_key_lens[type].len != nla_len(a) && + ovs_tunnel_key_lens[type].len != OVS_ATTR_NESTED) { + OVS_NLERR(log, "Tunnel attr %d has unexpected len %d expected %d", + type, nla_len(a), ovs_tunnel_key_lens[type].len); + return -EINVAL; + } + + switch (type) { + case OVS_TUNNEL_KEY_ATTR_ID: + SW_FLOW_KEY_PUT(match, tun_key.tun_id, + nla_get_be64(a), is_mask); + tun_flags |= TUNNEL_KEY; + break; + case OVS_TUNNEL_KEY_ATTR_IPV4_SRC: + SW_FLOW_KEY_PUT(match, tun_key.ipv4_src, + nla_get_in_addr(a), is_mask); + break; + case OVS_TUNNEL_KEY_ATTR_IPV4_DST: + SW_FLOW_KEY_PUT(match, tun_key.ipv4_dst, + nla_get_in_addr(a), is_mask); + break; + case OVS_TUNNEL_KEY_ATTR_TOS: + SW_FLOW_KEY_PUT(match, tun_key.ipv4_tos, + nla_get_u8(a), is_mask); + break; + case OVS_TUNNEL_KEY_ATTR_TTL: + SW_FLOW_KEY_PUT(match, tun_key.ipv4_ttl, + nla_get_u8(a), is_mask); + ttl = true; + break; + case OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT: + tun_flags |= TUNNEL_DONT_FRAGMENT; + break; + case OVS_TUNNEL_KEY_ATTR_CSUM: + tun_flags |= TUNNEL_CSUM; + break; + case OVS_TUNNEL_KEY_ATTR_TP_SRC: + SW_FLOW_KEY_PUT(match, tun_key.tp_src, + nla_get_be16(a), is_mask); + break; + case OVS_TUNNEL_KEY_ATTR_TP_DST: + SW_FLOW_KEY_PUT(match, tun_key.tp_dst, + nla_get_be16(a), is_mask); + break; + case OVS_TUNNEL_KEY_ATTR_OAM: + tun_flags |= TUNNEL_OAM; + break; + case OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS: + if (opts_type) { + OVS_NLERR(log, "Multiple metadata blocks provided"); + return -EINVAL; + } + + err = genev_tun_opt_from_nlattr(a, match, is_mask, log); + if (err) + return err; + + tun_flags |= TUNNEL_GENEVE_OPT; + opts_type = type; + break; + case OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS: + if (opts_type) { + OVS_NLERR(log, "Multiple metadata blocks provided"); + return -EINVAL; + } + + err = vxlan_tun_opt_from_nlattr(a, match, is_mask, log); + if (err) + return err; + + tun_flags |= TUNNEL_VXLAN_OPT; + opts_type = type; + break; + default: + OVS_NLERR(log, "Unknown IPv4 tunnel attribute %d", + type); + return -EINVAL; + } + } + + SW_FLOW_KEY_PUT(match, tun_key.tun_flags, tun_flags, is_mask); + + if (rem > 0) { + OVS_NLERR(log, "IPv4 tunnel attribute has %d unknown bytes.", + rem); + return -EINVAL; + } + + if (!is_mask) { + if (!match->key->tun_key.ipv4_dst) { + OVS_NLERR(log, "IPv4 tunnel dst address is zero"); + return -EINVAL; + } + + if (!ttl) { + OVS_NLERR(log, "IPv4 tunnel TTL not specified."); + return -EINVAL; + } + } + + return opts_type; +} + +static int vxlan_opt_to_nlattr(struct sk_buff *skb, + const void *tun_opts, int swkey_tun_opts_len) +{ + const struct ovs_vxlan_opts *opts = tun_opts; + struct nlattr *nla; + + nla = nla_nest_start(skb, OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS); + if (!nla) + return -EMSGSIZE; + + if (nla_put_u32(skb, OVS_VXLAN_EXT_GBP, opts->gbp) < 0) + return -EMSGSIZE; + + nla_nest_end(skb, nla); + return 0; +} + +static int __ipv4_tun_to_nlattr(struct sk_buff *skb, + const struct ovs_key_ipv4_tunnel *output, + const void *tun_opts, int swkey_tun_opts_len) +{ + if (output->tun_flags & TUNNEL_KEY && + nla_put_be64(skb, OVS_TUNNEL_KEY_ATTR_ID, output->tun_id)) + return -EMSGSIZE; + if (output->ipv4_src && + nla_put_in_addr(skb, OVS_TUNNEL_KEY_ATTR_IPV4_SRC, + output->ipv4_src)) + return -EMSGSIZE; + if (output->ipv4_dst && + nla_put_in_addr(skb, OVS_TUNNEL_KEY_ATTR_IPV4_DST, + output->ipv4_dst)) + return -EMSGSIZE; + if (output->ipv4_tos && + nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TOS, output->ipv4_tos)) + return -EMSGSIZE; + if (nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TTL, output->ipv4_ttl)) + return -EMSGSIZE; + if ((output->tun_flags & TUNNEL_DONT_FRAGMENT) && + nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT)) + return -EMSGSIZE; + if ((output->tun_flags & TUNNEL_CSUM) && + nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_CSUM)) + return -EMSGSIZE; + if (output->tp_src && + nla_put_be16(skb, OVS_TUNNEL_KEY_ATTR_TP_SRC, output->tp_src)) + return -EMSGSIZE; + if (output->tp_dst && + nla_put_be16(skb, OVS_TUNNEL_KEY_ATTR_TP_DST, output->tp_dst)) + return -EMSGSIZE; + if ((output->tun_flags & TUNNEL_OAM) && + nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_OAM)) + return -EMSGSIZE; + if (tun_opts) { + if (output->tun_flags & TUNNEL_GENEVE_OPT && + nla_put(skb, OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS, + swkey_tun_opts_len, tun_opts)) + return -EMSGSIZE; + else if (output->tun_flags & TUNNEL_VXLAN_OPT && + vxlan_opt_to_nlattr(skb, tun_opts, swkey_tun_opts_len)) + return -EMSGSIZE; + } + + return 0; +} + +static int ipv4_tun_to_nlattr(struct sk_buff *skb, + const struct ovs_key_ipv4_tunnel *output, + const void *tun_opts, int swkey_tun_opts_len) +{ + struct nlattr *nla; + int err; + + nla = nla_nest_start(skb, OVS_KEY_ATTR_TUNNEL); + if (!nla) + return -EMSGSIZE; + + err = __ipv4_tun_to_nlattr(skb, output, tun_opts, swkey_tun_opts_len); + if (err) + return err; + + nla_nest_end(skb, nla); + return 0; +} + +int ovs_nla_put_egress_tunnel_key(struct sk_buff *skb, + const struct ovs_tunnel_info *egress_tun_info) +{ + return __ipv4_tun_to_nlattr(skb, &egress_tun_info->tunnel, + egress_tun_info->options, + egress_tun_info->options_len); +} + +static int metadata_from_nlattrs(struct sw_flow_match *match, u64 *attrs, + const struct nlattr **a, bool is_mask, + bool log) +{ + if (*attrs & (1 << OVS_KEY_ATTR_DP_HASH)) { + u32 hash_val = nla_get_u32(a[OVS_KEY_ATTR_DP_HASH]); + + SW_FLOW_KEY_PUT(match, ovs_flow_hash, hash_val, is_mask); + *attrs &= ~(1 << OVS_KEY_ATTR_DP_HASH); + } + + if (*attrs & (1 << OVS_KEY_ATTR_RECIRC_ID)) { + u32 recirc_id = nla_get_u32(a[OVS_KEY_ATTR_RECIRC_ID]); + + SW_FLOW_KEY_PUT(match, recirc_id, recirc_id, is_mask); + *attrs &= ~(1 << OVS_KEY_ATTR_RECIRC_ID); + } + + if (*attrs & (1 << OVS_KEY_ATTR_PRIORITY)) { + SW_FLOW_KEY_PUT(match, phy.priority, + nla_get_u32(a[OVS_KEY_ATTR_PRIORITY]), is_mask); + *attrs &= ~(1 << OVS_KEY_ATTR_PRIORITY); + } + + if (*attrs & (1 << OVS_KEY_ATTR_IN_PORT)) { + u32 in_port = nla_get_u32(a[OVS_KEY_ATTR_IN_PORT]); + + if (is_mask) { + in_port = 0xffffffff; /* Always exact match in_port. */ + } else if (in_port >= DP_MAX_PORTS) { + OVS_NLERR(log, "Port %d exceeds max allowable %d", + in_port, DP_MAX_PORTS); + return -EINVAL; + } + + SW_FLOW_KEY_PUT(match, phy.in_port, in_port, is_mask); + *attrs &= ~(1 << OVS_KEY_ATTR_IN_PORT); + } else if (!is_mask) { + SW_FLOW_KEY_PUT(match, phy.in_port, DP_MAX_PORTS, is_mask); + } + + if (*attrs & (1 << OVS_KEY_ATTR_SKB_MARK)) { + uint32_t mark = nla_get_u32(a[OVS_KEY_ATTR_SKB_MARK]); + + SW_FLOW_KEY_PUT(match, phy.skb_mark, mark, is_mask); + *attrs &= ~(1 << OVS_KEY_ATTR_SKB_MARK); + } + if (*attrs & (1 << OVS_KEY_ATTR_TUNNEL)) { + if (ipv4_tun_from_nlattr(a[OVS_KEY_ATTR_TUNNEL], match, + is_mask, log) < 0) + return -EINVAL; + *attrs &= ~(1 << OVS_KEY_ATTR_TUNNEL); + } + return 0; +} + +static int ovs_key_from_nlattrs(struct sw_flow_match *match, u64 attrs, + const struct nlattr **a, bool is_mask, + bool log) +{ + int err; + + err = metadata_from_nlattrs(match, &attrs, a, is_mask, log); + if (err) + return err; + + if (attrs & (1 << OVS_KEY_ATTR_ETHERNET)) { + const struct ovs_key_ethernet *eth_key; + + eth_key = nla_data(a[OVS_KEY_ATTR_ETHERNET]); + SW_FLOW_KEY_MEMCPY(match, eth.src, + eth_key->eth_src, ETH_ALEN, is_mask); + SW_FLOW_KEY_MEMCPY(match, eth.dst, + eth_key->eth_dst, ETH_ALEN, is_mask); + attrs &= ~(1 << OVS_KEY_ATTR_ETHERNET); + } + + if (attrs & (1 << OVS_KEY_ATTR_VLAN)) { + __be16 tci; + + tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]); + if (!(tci & htons(VLAN_TAG_PRESENT))) { + if (is_mask) + OVS_NLERR(log, "VLAN TCI mask does not have exact match for VLAN_TAG_PRESENT bit."); + else + OVS_NLERR(log, "VLAN TCI does not have VLAN_TAG_PRESENT bit set."); + + return -EINVAL; + } + + SW_FLOW_KEY_PUT(match, eth.tci, tci, is_mask); + attrs &= ~(1 << OVS_KEY_ATTR_VLAN); + } + + if (attrs & (1 << OVS_KEY_ATTR_ETHERTYPE)) { + __be16 eth_type; + + eth_type = nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]); + if (is_mask) { + /* Always exact match EtherType. */ + eth_type = htons(0xffff); + } else if (ntohs(eth_type) < ETH_P_802_3_MIN) { + OVS_NLERR(log, "EtherType %x is less than min %x", + ntohs(eth_type), ETH_P_802_3_MIN); + return -EINVAL; + } + + SW_FLOW_KEY_PUT(match, eth.type, eth_type, is_mask); + attrs &= ~(1 << OVS_KEY_ATTR_ETHERTYPE); + } else if (!is_mask) { + SW_FLOW_KEY_PUT(match, eth.type, htons(ETH_P_802_2), is_mask); + } + + if (attrs & (1 << OVS_KEY_ATTR_IPV4)) { + const struct ovs_key_ipv4 *ipv4_key; + + ipv4_key = nla_data(a[OVS_KEY_ATTR_IPV4]); + if (!is_mask && ipv4_key->ipv4_frag > OVS_FRAG_TYPE_MAX) { + OVS_NLERR(log, "IPv4 frag type %d is out of range max %d", + ipv4_key->ipv4_frag, OVS_FRAG_TYPE_MAX); + return -EINVAL; + } + SW_FLOW_KEY_PUT(match, ip.proto, + ipv4_key->ipv4_proto, is_mask); + SW_FLOW_KEY_PUT(match, ip.tos, + ipv4_key->ipv4_tos, is_mask); + SW_FLOW_KEY_PUT(match, ip.ttl, + ipv4_key->ipv4_ttl, is_mask); + SW_FLOW_KEY_PUT(match, ip.frag, + ipv4_key->ipv4_frag, is_mask); + SW_FLOW_KEY_PUT(match, ipv4.addr.src, + ipv4_key->ipv4_src, is_mask); + SW_FLOW_KEY_PUT(match, ipv4.addr.dst, + ipv4_key->ipv4_dst, is_mask); + attrs &= ~(1 << OVS_KEY_ATTR_IPV4); + } + + if (attrs & (1 << OVS_KEY_ATTR_IPV6)) { + const struct ovs_key_ipv6 *ipv6_key; + + ipv6_key = nla_data(a[OVS_KEY_ATTR_IPV6]); + if (!is_mask && ipv6_key->ipv6_frag > OVS_FRAG_TYPE_MAX) { + OVS_NLERR(log, "IPv6 frag type %d is out of range max %d", + ipv6_key->ipv6_frag, OVS_FRAG_TYPE_MAX); + return -EINVAL; + } + + if (!is_mask && ipv6_key->ipv6_label & htonl(0xFFF00000)) { + OVS_NLERR(log, "IPv6 flow label %x is out of range (max=%x).\n", + ntohl(ipv6_key->ipv6_label), (1 << 20) - 1); + return -EINVAL; + } + + SW_FLOW_KEY_PUT(match, ipv6.label, + ipv6_key->ipv6_label, is_mask); + SW_FLOW_KEY_PUT(match, ip.proto, + ipv6_key->ipv6_proto, is_mask); + SW_FLOW_KEY_PUT(match, ip.tos, + ipv6_key->ipv6_tclass, is_mask); + SW_FLOW_KEY_PUT(match, ip.ttl, + ipv6_key->ipv6_hlimit, is_mask); + SW_FLOW_KEY_PUT(match, ip.frag, + ipv6_key->ipv6_frag, is_mask); + SW_FLOW_KEY_MEMCPY(match, ipv6.addr.src, + ipv6_key->ipv6_src, + sizeof(match->key->ipv6.addr.src), + is_mask); + SW_FLOW_KEY_MEMCPY(match, ipv6.addr.dst, + ipv6_key->ipv6_dst, + sizeof(match->key->ipv6.addr.dst), + is_mask); + + attrs &= ~(1 << OVS_KEY_ATTR_IPV6); + } + + if (attrs & (1 << OVS_KEY_ATTR_ARP)) { + const struct ovs_key_arp *arp_key; + + arp_key = nla_data(a[OVS_KEY_ATTR_ARP]); + if (!is_mask && (arp_key->arp_op & htons(0xff00))) { + OVS_NLERR(log, "Unknown ARP opcode (opcode=%d).", + arp_key->arp_op); + return -EINVAL; + } + + SW_FLOW_KEY_PUT(match, ipv4.addr.src, + arp_key->arp_sip, is_mask); + SW_FLOW_KEY_PUT(match, ipv4.addr.dst, + arp_key->arp_tip, is_mask); + SW_FLOW_KEY_PUT(match, ip.proto, + ntohs(arp_key->arp_op), is_mask); + SW_FLOW_KEY_MEMCPY(match, ipv4.arp.sha, + arp_key->arp_sha, ETH_ALEN, is_mask); + SW_FLOW_KEY_MEMCPY(match, ipv4.arp.tha, + arp_key->arp_tha, ETH_ALEN, is_mask); + + attrs &= ~(1 << OVS_KEY_ATTR_ARP); + } + + if (attrs & (1 << OVS_KEY_ATTR_MPLS)) { + const struct ovs_key_mpls *mpls_key; + + mpls_key = nla_data(a[OVS_KEY_ATTR_MPLS]); + SW_FLOW_KEY_PUT(match, mpls.top_lse, + mpls_key->mpls_lse, is_mask); + + attrs &= ~(1 << OVS_KEY_ATTR_MPLS); + } + + if (attrs & (1 << OVS_KEY_ATTR_TCP)) { + const struct ovs_key_tcp *tcp_key; + + tcp_key = nla_data(a[OVS_KEY_ATTR_TCP]); + SW_FLOW_KEY_PUT(match, tp.src, tcp_key->tcp_src, is_mask); + SW_FLOW_KEY_PUT(match, tp.dst, tcp_key->tcp_dst, is_mask); + attrs &= ~(1 << OVS_KEY_ATTR_TCP); + } + + if (attrs & (1 << OVS_KEY_ATTR_TCP_FLAGS)) { + SW_FLOW_KEY_PUT(match, tp.flags, + nla_get_be16(a[OVS_KEY_ATTR_TCP_FLAGS]), + is_mask); + attrs &= ~(1 << OVS_KEY_ATTR_TCP_FLAGS); + } + + if (attrs & (1 << OVS_KEY_ATTR_UDP)) { + const struct ovs_key_udp *udp_key; + + udp_key = nla_data(a[OVS_KEY_ATTR_UDP]); + SW_FLOW_KEY_PUT(match, tp.src, udp_key->udp_src, is_mask); + SW_FLOW_KEY_PUT(match, tp.dst, udp_key->udp_dst, is_mask); + attrs &= ~(1 << OVS_KEY_ATTR_UDP); + } + + if (attrs & (1 << OVS_KEY_ATTR_SCTP)) { + const struct ovs_key_sctp *sctp_key; + + sctp_key = nla_data(a[OVS_KEY_ATTR_SCTP]); + SW_FLOW_KEY_PUT(match, tp.src, sctp_key->sctp_src, is_mask); + SW_FLOW_KEY_PUT(match, tp.dst, sctp_key->sctp_dst, is_mask); + attrs &= ~(1 << OVS_KEY_ATTR_SCTP); + } + + if (attrs & (1 << OVS_KEY_ATTR_ICMP)) { + const struct ovs_key_icmp *icmp_key; + + icmp_key = nla_data(a[OVS_KEY_ATTR_ICMP]); + SW_FLOW_KEY_PUT(match, tp.src, + htons(icmp_key->icmp_type), is_mask); + SW_FLOW_KEY_PUT(match, tp.dst, + htons(icmp_key->icmp_code), is_mask); + attrs &= ~(1 << OVS_KEY_ATTR_ICMP); + } + + if (attrs & (1 << OVS_KEY_ATTR_ICMPV6)) { + const struct ovs_key_icmpv6 *icmpv6_key; + + icmpv6_key = nla_data(a[OVS_KEY_ATTR_ICMPV6]); + SW_FLOW_KEY_PUT(match, tp.src, + htons(icmpv6_key->icmpv6_type), is_mask); + SW_FLOW_KEY_PUT(match, tp.dst, + htons(icmpv6_key->icmpv6_code), is_mask); + attrs &= ~(1 << OVS_KEY_ATTR_ICMPV6); + } + + if (attrs & (1 << OVS_KEY_ATTR_ND)) { + const struct ovs_key_nd *nd_key; + + nd_key = nla_data(a[OVS_KEY_ATTR_ND]); + SW_FLOW_KEY_MEMCPY(match, ipv6.nd.target, + nd_key->nd_target, + sizeof(match->key->ipv6.nd.target), + is_mask); + SW_FLOW_KEY_MEMCPY(match, ipv6.nd.sll, + nd_key->nd_sll, ETH_ALEN, is_mask); + SW_FLOW_KEY_MEMCPY(match, ipv6.nd.tll, + nd_key->nd_tll, ETH_ALEN, is_mask); + attrs &= ~(1 << OVS_KEY_ATTR_ND); + } + + if (attrs != 0) { + OVS_NLERR(log, "Unknown key attributes %llx", + (unsigned long long)attrs); + return -EINVAL; + } + + return 0; +} + +static void nlattr_set(struct nlattr *attr, u8 val, + const struct ovs_len_tbl *tbl) +{ + struct nlattr *nla; + int rem; + + /* The nlattr stream should already have been validated */ + nla_for_each_nested(nla, attr, rem) { + if (tbl && tbl[nla_type(nla)].len == OVS_ATTR_NESTED) + nlattr_set(nla, val, tbl[nla_type(nla)].next); + else + memset(nla_data(nla), val, nla_len(nla)); + } +} + +static void mask_set_nlattr(struct nlattr *attr, u8 val) +{ + nlattr_set(attr, val, ovs_key_lens); +} + +/** + * ovs_nla_get_match - parses Netlink attributes into a flow key and + * mask. In case the 'mask' is NULL, the flow is treated as exact match + * flow. Otherwise, it is treated as a wildcarded flow, except the mask + * does not include any don't care bit. + * @match: receives the extracted flow match information. + * @key: Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink attribute + * sequence. The fields should of the packet that triggered the creation + * of this flow. + * @mask: Optional. Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink + * attribute specifies the mask field of the wildcarded flow. + * @log: Boolean to allow kernel error logging. Normally true, but when + * probing for feature compatibility this should be passed in as false to + * suppress unnecessary error logging. + */ +int ovs_nla_get_match(struct sw_flow_match *match, + const struct nlattr *nla_key, + const struct nlattr *nla_mask, + bool log) +{ + const struct nlattr *a[OVS_KEY_ATTR_MAX + 1]; + const struct nlattr *encap; + struct nlattr *newmask = NULL; + u64 key_attrs = 0; + u64 mask_attrs = 0; + bool encap_valid = false; + int err; + + err = parse_flow_nlattrs(nla_key, a, &key_attrs, log); + if (err) + return err; + + if ((key_attrs & (1 << OVS_KEY_ATTR_ETHERNET)) && + (key_attrs & (1 << OVS_KEY_ATTR_ETHERTYPE)) && + (nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]) == htons(ETH_P_8021Q))) { + __be16 tci; + + if (!((key_attrs & (1 << OVS_KEY_ATTR_VLAN)) && + (key_attrs & (1 << OVS_KEY_ATTR_ENCAP)))) { + OVS_NLERR(log, "Invalid Vlan frame."); + return -EINVAL; + } + + key_attrs &= ~(1 << OVS_KEY_ATTR_ETHERTYPE); + tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]); + encap = a[OVS_KEY_ATTR_ENCAP]; + key_attrs &= ~(1 << OVS_KEY_ATTR_ENCAP); + encap_valid = true; + + if (tci & htons(VLAN_TAG_PRESENT)) { + err = parse_flow_nlattrs(encap, a, &key_attrs, log); + if (err) + return err; + } else if (!tci) { + /* Corner case for truncated 802.1Q header. */ + if (nla_len(encap)) { + OVS_NLERR(log, "Truncated 802.1Q header has non-zero encap attribute."); + return -EINVAL; + } + } else { + OVS_NLERR(log, "Encap attr is set for non-VLAN frame"); + return -EINVAL; + } + } + + err = ovs_key_from_nlattrs(match, key_attrs, a, false, log); + if (err) + return err; + + if (match->mask) { + if (!nla_mask) { + /* Create an exact match mask. We need to set to 0xff + * all the 'match->mask' fields that have been touched + * in 'match->key'. We cannot simply memset + * 'match->mask', because padding bytes and fields not + * specified in 'match->key' should be left to 0. + * Instead, we use a stream of netlink attributes, + * copied from 'key' and set to 0xff. + * ovs_key_from_nlattrs() will take care of filling + * 'match->mask' appropriately. + */ + newmask = kmemdup(nla_key, + nla_total_size(nla_len(nla_key)), + GFP_KERNEL); + if (!newmask) + return -ENOMEM; + + mask_set_nlattr(newmask, 0xff); + + /* The userspace does not send tunnel attributes that + * are 0, but we should not wildcard them nonetheless. + */ + if (match->key->tun_key.ipv4_dst) + SW_FLOW_KEY_MEMSET_FIELD(match, tun_key, + 0xff, true); + + nla_mask = newmask; + } + + err = parse_flow_mask_nlattrs(nla_mask, a, &mask_attrs, log); + if (err) + goto free_newmask; + + /* Always match on tci. */ + SW_FLOW_KEY_PUT(match, eth.tci, htons(0xffff), true); + + if (mask_attrs & 1 << OVS_KEY_ATTR_ENCAP) { + __be16 eth_type = 0; + __be16 tci = 0; + + if (!encap_valid) { + OVS_NLERR(log, "Encap mask attribute is set for non-VLAN frame."); + err = -EINVAL; + goto free_newmask; + } + + mask_attrs &= ~(1 << OVS_KEY_ATTR_ENCAP); + if (a[OVS_KEY_ATTR_ETHERTYPE]) + eth_type = nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]); + + if (eth_type == htons(0xffff)) { + mask_attrs &= ~(1 << OVS_KEY_ATTR_ETHERTYPE); + encap = a[OVS_KEY_ATTR_ENCAP]; + err = parse_flow_mask_nlattrs(encap, a, + &mask_attrs, log); + if (err) + goto free_newmask; + } else { + OVS_NLERR(log, "VLAN frames must have an exact match on the TPID (mask=%x).", + ntohs(eth_type)); + err = -EINVAL; + goto free_newmask; + } + + if (a[OVS_KEY_ATTR_VLAN]) + tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]); + + if (!(tci & htons(VLAN_TAG_PRESENT))) { + OVS_NLERR(log, "VLAN tag present bit must have an exact match (tci_mask=%x).", + ntohs(tci)); + err = -EINVAL; + goto free_newmask; + } + } + + err = ovs_key_from_nlattrs(match, mask_attrs, a, true, log); + if (err) + goto free_newmask; + } + + if (!match_validate(match, key_attrs, mask_attrs, log)) + err = -EINVAL; + +free_newmask: + kfree(newmask); + return err; +} + +static size_t get_ufid_len(const struct nlattr *attr, bool log) +{ + size_t len; + + if (!attr) + return 0; + + len = nla_len(attr); + if (len < 1 || len > MAX_UFID_LENGTH) { + OVS_NLERR(log, "ufid size %u bytes exceeds the range (1, %d)", + nla_len(attr), MAX_UFID_LENGTH); + return 0; + } + + return len; +} + +/* Initializes 'flow->ufid', returning true if 'attr' contains a valid UFID, + * or false otherwise. + */ +bool ovs_nla_get_ufid(struct sw_flow_id *sfid, const struct nlattr *attr, + bool log) +{ + sfid->ufid_len = get_ufid_len(attr, log); + if (sfid->ufid_len) + memcpy(sfid->ufid, nla_data(attr), sfid->ufid_len); + + return sfid->ufid_len; +} + +int ovs_nla_get_identifier(struct sw_flow_id *sfid, const struct nlattr *ufid, + const struct sw_flow_key *key, bool log) +{ + struct sw_flow_key *new_key; + + if (ovs_nla_get_ufid(sfid, ufid, log)) + return 0; + + /* If UFID was not provided, use unmasked key. */ + new_key = kmalloc(sizeof(*new_key), GFP_KERNEL); + if (!new_key) + return -ENOMEM; + memcpy(new_key, key, sizeof(*key)); + sfid->unmasked_key = new_key; + + return 0; +} + +u32 ovs_nla_get_ufid_flags(const struct nlattr *attr) +{ + return attr ? nla_get_u32(attr) : 0; +} + +/** + * ovs_nla_get_flow_metadata - parses Netlink attributes into a flow key. + * @key: Receives extracted in_port, priority, tun_key and skb_mark. + * @attr: Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink attribute + * sequence. + * @log: Boolean to allow kernel error logging. Normally true, but when + * probing for feature compatibility this should be passed in as false to + * suppress unnecessary error logging. + * + * This parses a series of Netlink attributes that form a flow key, which must + * take the same form accepted by flow_from_nlattrs(), but only enough of it to + * get the metadata, that is, the parts of the flow key that cannot be + * extracted from the packet itself. + */ + +int ovs_nla_get_flow_metadata(const struct nlattr *attr, + struct sw_flow_key *key, + bool log) +{ + const struct nlattr *a[OVS_KEY_ATTR_MAX + 1]; + struct sw_flow_match match; + u64 attrs = 0; + int err; + + err = parse_flow_nlattrs(attr, a, &attrs, log); + if (err) + return -EINVAL; + + memset(&match, 0, sizeof(match)); + match.key = key; + + key->phy.in_port = DP_MAX_PORTS; + + return metadata_from_nlattrs(&match, &attrs, a, false, log); +} + +static int __ovs_nla_put_key(const struct sw_flow_key *swkey, + const struct sw_flow_key *output, bool is_mask, + struct sk_buff *skb) +{ + struct ovs_key_ethernet *eth_key; + struct nlattr *nla, *encap; + + if (nla_put_u32(skb, OVS_KEY_ATTR_RECIRC_ID, output->recirc_id)) + goto nla_put_failure; + + if (nla_put_u32(skb, OVS_KEY_ATTR_DP_HASH, output->ovs_flow_hash)) + goto nla_put_failure; + + if (nla_put_u32(skb, OVS_KEY_ATTR_PRIORITY, output->phy.priority)) + goto nla_put_failure; + + if ((swkey->tun_key.ipv4_dst || is_mask)) { + const void *opts = NULL; + + if (output->tun_key.tun_flags & TUNNEL_OPTIONS_PRESENT) + opts = TUN_METADATA_OPTS(output, swkey->tun_opts_len); + + if (ipv4_tun_to_nlattr(skb, &output->tun_key, opts, + swkey->tun_opts_len)) + goto nla_put_failure; + } + + if (swkey->phy.in_port == DP_MAX_PORTS) { + if (is_mask && (output->phy.in_port == 0xffff)) + if (nla_put_u32(skb, OVS_KEY_ATTR_IN_PORT, 0xffffffff)) + goto nla_put_failure; + } else { + u16 upper_u16; + upper_u16 = !is_mask ? 0 : 0xffff; + + if (nla_put_u32(skb, OVS_KEY_ATTR_IN_PORT, + (upper_u16 << 16) | output->phy.in_port)) + goto nla_put_failure; + } + + if (nla_put_u32(skb, OVS_KEY_ATTR_SKB_MARK, output->phy.skb_mark)) + goto nla_put_failure; + + nla = nla_reserve(skb, OVS_KEY_ATTR_ETHERNET, sizeof(*eth_key)); + if (!nla) + goto nla_put_failure; + + eth_key = nla_data(nla); + ether_addr_copy(eth_key->eth_src, output->eth.src); + ether_addr_copy(eth_key->eth_dst, output->eth.dst); + + if (swkey->eth.tci || swkey->eth.type == htons(ETH_P_8021Q)) { + __be16 eth_type; + eth_type = !is_mask ? htons(ETH_P_8021Q) : htons(0xffff); + if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE, eth_type) || + nla_put_be16(skb, OVS_KEY_ATTR_VLAN, output->eth.tci)) + goto nla_put_failure; + encap = nla_nest_start(skb, OVS_KEY_ATTR_ENCAP); + if (!swkey->eth.tci) + goto unencap; + } else + encap = NULL; + + if (swkey->eth.type == htons(ETH_P_802_2)) { + /* + * Ethertype 802.2 is represented in the netlink with omitted + * OVS_KEY_ATTR_ETHERTYPE in the flow key attribute, and + * 0xffff in the mask attribute. Ethertype can also + * be wildcarded. + */ + if (is_mask && output->eth.type) + if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE, + output->eth.type)) + goto nla_put_failure; + goto unencap; + } + + if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE, output->eth.type)) + goto nla_put_failure; + + if (swkey->eth.type == htons(ETH_P_IP)) { + struct ovs_key_ipv4 *ipv4_key; + + nla = nla_reserve(skb, OVS_KEY_ATTR_IPV4, sizeof(*ipv4_key)); + if (!nla) + goto nla_put_failure; + ipv4_key = nla_data(nla); + ipv4_key->ipv4_src = output->ipv4.addr.src; + ipv4_key->ipv4_dst = output->ipv4.addr.dst; + ipv4_key->ipv4_proto = output->ip.proto; + ipv4_key->ipv4_tos = output->ip.tos; + ipv4_key->ipv4_ttl = output->ip.ttl; + ipv4_key->ipv4_frag = output->ip.frag; + } else if (swkey->eth.type == htons(ETH_P_IPV6)) { + struct ovs_key_ipv6 *ipv6_key; + + nla = nla_reserve(skb, OVS_KEY_ATTR_IPV6, sizeof(*ipv6_key)); + if (!nla) + goto nla_put_failure; + ipv6_key = nla_data(nla); + memcpy(ipv6_key->ipv6_src, &output->ipv6.addr.src, + sizeof(ipv6_key->ipv6_src)); + memcpy(ipv6_key->ipv6_dst, &output->ipv6.addr.dst, + sizeof(ipv6_key->ipv6_dst)); + ipv6_key->ipv6_label = output->ipv6.label; + ipv6_key->ipv6_proto = output->ip.proto; + ipv6_key->ipv6_tclass = output->ip.tos; + ipv6_key->ipv6_hlimit = output->ip.ttl; + ipv6_key->ipv6_frag = output->ip.frag; + } else if (swkey->eth.type == htons(ETH_P_ARP) || + swkey->eth.type == htons(ETH_P_RARP)) { + struct ovs_key_arp *arp_key; + + nla = nla_reserve(skb, OVS_KEY_ATTR_ARP, sizeof(*arp_key)); + if (!nla) + goto nla_put_failure; + arp_key = nla_data(nla); + memset(arp_key, 0, sizeof(struct ovs_key_arp)); + arp_key->arp_sip = output->ipv4.addr.src; + arp_key->arp_tip = output->ipv4.addr.dst; + arp_key->arp_op = htons(output->ip.proto); + ether_addr_copy(arp_key->arp_sha, output->ipv4.arp.sha); + ether_addr_copy(arp_key->arp_tha, output->ipv4.arp.tha); + } else if (eth_p_mpls(swkey->eth.type)) { + struct ovs_key_mpls *mpls_key; + + nla = nla_reserve(skb, OVS_KEY_ATTR_MPLS, sizeof(*mpls_key)); + if (!nla) + goto nla_put_failure; + mpls_key = nla_data(nla); + mpls_key->mpls_lse = output->mpls.top_lse; + } + + if ((swkey->eth.type == htons(ETH_P_IP) || + swkey->eth.type == htons(ETH_P_IPV6)) && + swkey->ip.frag != OVS_FRAG_TYPE_LATER) { + + if (swkey->ip.proto == IPPROTO_TCP) { + struct ovs_key_tcp *tcp_key; + + nla = nla_reserve(skb, OVS_KEY_ATTR_TCP, sizeof(*tcp_key)); + if (!nla) + goto nla_put_failure; + tcp_key = nla_data(nla); + tcp_key->tcp_src = output->tp.src; + tcp_key->tcp_dst = output->tp.dst; + if (nla_put_be16(skb, OVS_KEY_ATTR_TCP_FLAGS, + output->tp.flags)) + goto nla_put_failure; + } else if (swkey->ip.proto == IPPROTO_UDP) { + struct ovs_key_udp *udp_key; + + nla = nla_reserve(skb, OVS_KEY_ATTR_UDP, sizeof(*udp_key)); + if (!nla) + goto nla_put_failure; + udp_key = nla_data(nla); + udp_key->udp_src = output->tp.src; + udp_key->udp_dst = output->tp.dst; + } else if (swkey->ip.proto == IPPROTO_SCTP) { + struct ovs_key_sctp *sctp_key; + + nla = nla_reserve(skb, OVS_KEY_ATTR_SCTP, sizeof(*sctp_key)); + if (!nla) + goto nla_put_failure; + sctp_key = nla_data(nla); + sctp_key->sctp_src = output->tp.src; + sctp_key->sctp_dst = output->tp.dst; + } else if (swkey->eth.type == htons(ETH_P_IP) && + swkey->ip.proto == IPPROTO_ICMP) { + struct ovs_key_icmp *icmp_key; + + nla = nla_reserve(skb, OVS_KEY_ATTR_ICMP, sizeof(*icmp_key)); + if (!nla) + goto nla_put_failure; + icmp_key = nla_data(nla); + icmp_key->icmp_type = ntohs(output->tp.src); + icmp_key->icmp_code = ntohs(output->tp.dst); + } else if (swkey->eth.type == htons(ETH_P_IPV6) && + swkey->ip.proto == IPPROTO_ICMPV6) { + struct ovs_key_icmpv6 *icmpv6_key; + + nla = nla_reserve(skb, OVS_KEY_ATTR_ICMPV6, + sizeof(*icmpv6_key)); + if (!nla) + goto nla_put_failure; + icmpv6_key = nla_data(nla); + icmpv6_key->icmpv6_type = ntohs(output->tp.src); + icmpv6_key->icmpv6_code = ntohs(output->tp.dst); + + if (icmpv6_key->icmpv6_type == NDISC_NEIGHBOUR_SOLICITATION || + icmpv6_key->icmpv6_type == NDISC_NEIGHBOUR_ADVERTISEMENT) { + struct ovs_key_nd *nd_key; + + nla = nla_reserve(skb, OVS_KEY_ATTR_ND, sizeof(*nd_key)); + if (!nla) + goto nla_put_failure; + nd_key = nla_data(nla); + memcpy(nd_key->nd_target, &output->ipv6.nd.target, + sizeof(nd_key->nd_target)); + ether_addr_copy(nd_key->nd_sll, output->ipv6.nd.sll); + ether_addr_copy(nd_key->nd_tll, output->ipv6.nd.tll); + } + } + } + +unencap: + if (encap) + nla_nest_end(skb, encap); + + return 0; + +nla_put_failure: + return -EMSGSIZE; +} + +int ovs_nla_put_key(const struct sw_flow_key *swkey, + const struct sw_flow_key *output, int attr, bool is_mask, + struct sk_buff *skb) +{ + int err; + struct nlattr *nla; + + nla = nla_nest_start(skb, attr); + if (!nla) + return -EMSGSIZE; + err = __ovs_nla_put_key(swkey, output, is_mask, skb); + if (err) + return err; + nla_nest_end(skb, nla); + + return 0; +} + +/* Called with ovs_mutex or RCU read lock. */ +int ovs_nla_put_identifier(const struct sw_flow *flow, struct sk_buff *skb) +{ + if (ovs_identifier_is_ufid(&flow->id)) + return nla_put(skb, OVS_FLOW_ATTR_UFID, flow->id.ufid_len, + flow->id.ufid); + + return ovs_nla_put_key(flow->id.unmasked_key, flow->id.unmasked_key, + OVS_FLOW_ATTR_KEY, false, skb); +} + +/* Called with ovs_mutex or RCU read lock. */ +int ovs_nla_put_masked_key(const struct sw_flow *flow, struct sk_buff *skb) +{ + return ovs_nla_put_key(&flow->key, &flow->key, + OVS_FLOW_ATTR_KEY, false, skb); +} + +/* Called with ovs_mutex or RCU read lock. */ +int ovs_nla_put_mask(const struct sw_flow *flow, struct sk_buff *skb) +{ + return ovs_nla_put_key(&flow->key, &flow->mask->key, + OVS_FLOW_ATTR_MASK, true, skb); +} + +#define MAX_ACTIONS_BUFSIZE (32 * 1024) + +static struct sw_flow_actions *nla_alloc_flow_actions(int size, bool log) +{ + struct sw_flow_actions *sfa; + + if (size > MAX_ACTIONS_BUFSIZE) { + OVS_NLERR(log, "Flow action size %u bytes exceeds max", size); + return ERR_PTR(-EINVAL); + } + + sfa = kmalloc(sizeof(*sfa) + size, GFP_KERNEL); + if (!sfa) + return ERR_PTR(-ENOMEM); + + sfa->actions_len = 0; + return sfa; +} + +/* Schedules 'sf_acts' to be freed after the next RCU grace period. + * The caller must hold rcu_read_lock for this to be sensible. */ +void ovs_nla_free_flow_actions(struct sw_flow_actions *sf_acts) +{ + kfree_rcu(sf_acts, rcu); +} + +static struct nlattr *reserve_sfa_size(struct sw_flow_actions **sfa, + int attr_len, bool log) +{ + + struct sw_flow_actions *acts; + int new_acts_size; + int req_size = NLA_ALIGN(attr_len); + int next_offset = offsetof(struct sw_flow_actions, actions) + + (*sfa)->actions_len; + + if (req_size <= (ksize(*sfa) - next_offset)) + goto out; + + new_acts_size = ksize(*sfa) * 2; + + if (new_acts_size > MAX_ACTIONS_BUFSIZE) { + if ((MAX_ACTIONS_BUFSIZE - next_offset) < req_size) + return ERR_PTR(-EMSGSIZE); + new_acts_size = MAX_ACTIONS_BUFSIZE; + } + + acts = nla_alloc_flow_actions(new_acts_size, log); + if (IS_ERR(acts)) + return (void *)acts; + + memcpy(acts->actions, (*sfa)->actions, (*sfa)->actions_len); + acts->actions_len = (*sfa)->actions_len; + kfree(*sfa); + *sfa = acts; + +out: + (*sfa)->actions_len += req_size; + return (struct nlattr *) ((unsigned char *)(*sfa) + next_offset); +} + +static struct nlattr *__add_action(struct sw_flow_actions **sfa, + int attrtype, void *data, int len, bool log) +{ + struct nlattr *a; + + a = reserve_sfa_size(sfa, nla_attr_size(len), log); + if (IS_ERR(a)) + return a; + + a->nla_type = attrtype; + a->nla_len = nla_attr_size(len); + + if (data) + memcpy(nla_data(a), data, len); + memset((unsigned char *) a + a->nla_len, 0, nla_padlen(len)); + + return a; +} + +static int add_action(struct sw_flow_actions **sfa, int attrtype, + void *data, int len, bool log) +{ + struct nlattr *a; + + a = __add_action(sfa, attrtype, data, len, log); + + return PTR_ERR_OR_ZERO(a); +} + +static inline int add_nested_action_start(struct sw_flow_actions **sfa, + int attrtype, bool log) +{ + int used = (*sfa)->actions_len; + int err; + + err = add_action(sfa, attrtype, NULL, 0, log); + if (err) + return err; + + return used; +} + +static inline void add_nested_action_end(struct sw_flow_actions *sfa, + int st_offset) +{ + struct nlattr *a = (struct nlattr *) ((unsigned char *)sfa->actions + + st_offset); + + a->nla_len = sfa->actions_len - st_offset; +} + +static int __ovs_nla_copy_actions(const struct nlattr *attr, + const struct sw_flow_key *key, + int depth, struct sw_flow_actions **sfa, + __be16 eth_type, __be16 vlan_tci, bool log); + +static int validate_and_copy_sample(const struct nlattr *attr, + const struct sw_flow_key *key, int depth, + struct sw_flow_actions **sfa, + __be16 eth_type, __be16 vlan_tci, bool log) +{ + const struct nlattr *attrs[OVS_SAMPLE_ATTR_MAX + 1]; + const struct nlattr *probability, *actions; + const struct nlattr *a; + int rem, start, err, st_acts; + + memset(attrs, 0, sizeof(attrs)); + nla_for_each_nested(a, attr, rem) { + int type = nla_type(a); + if (!type || type > OVS_SAMPLE_ATTR_MAX || attrs[type]) + return -EINVAL; + attrs[type] = a; + } + if (rem) + return -EINVAL; + + probability = attrs[OVS_SAMPLE_ATTR_PROBABILITY]; + if (!probability || nla_len(probability) != sizeof(u32)) + return -EINVAL; + + actions = attrs[OVS_SAMPLE_ATTR_ACTIONS]; + if (!actions || (nla_len(actions) && nla_len(actions) < NLA_HDRLEN)) + return -EINVAL; + + /* validation done, copy sample action. */ + start = add_nested_action_start(sfa, OVS_ACTION_ATTR_SAMPLE, log); + if (start < 0) + return start; + err = add_action(sfa, OVS_SAMPLE_ATTR_PROBABILITY, + nla_data(probability), sizeof(u32), log); + if (err) + return err; + st_acts = add_nested_action_start(sfa, OVS_SAMPLE_ATTR_ACTIONS, log); + if (st_acts < 0) + return st_acts; + + err = __ovs_nla_copy_actions(actions, key, depth + 1, sfa, + eth_type, vlan_tci, log); + if (err) + return err; + + add_nested_action_end(*sfa, st_acts); + add_nested_action_end(*sfa, start); + + return 0; +} + +void ovs_match_init(struct sw_flow_match *match, + struct sw_flow_key *key, + struct sw_flow_mask *mask) +{ + memset(match, 0, sizeof(*match)); + match->key = key; + match->mask = mask; + + memset(key, 0, sizeof(*key)); + + if (mask) { + memset(&mask->key, 0, sizeof(mask->key)); + mask->range.start = mask->range.end = 0; + } +} + +static int validate_geneve_opts(struct sw_flow_key *key) +{ + struct geneve_opt *option; + int opts_len = key->tun_opts_len; + bool crit_opt = false; + + option = (struct geneve_opt *)TUN_METADATA_OPTS(key, key->tun_opts_len); + while (opts_len > 0) { + int len; + + if (opts_len < sizeof(*option)) + return -EINVAL; + + len = sizeof(*option) + option->length * 4; + if (len > opts_len) + return -EINVAL; + + crit_opt |= !!(option->type & GENEVE_CRIT_OPT_TYPE); + + option = (struct geneve_opt *)((u8 *)option + len); + opts_len -= len; + }; + + key->tun_key.tun_flags |= crit_opt ? TUNNEL_CRIT_OPT : 0; + + return 0; +} + +static int validate_and_copy_set_tun(const struct nlattr *attr, + struct sw_flow_actions **sfa, bool log) +{ + struct sw_flow_match match; + struct sw_flow_key key; + struct ovs_tunnel_info *tun_info; + struct nlattr *a; + int err = 0, start, opts_type; + + ovs_match_init(&match, &key, NULL); + opts_type = ipv4_tun_from_nlattr(nla_data(attr), &match, false, log); + if (opts_type < 0) + return opts_type; + + if (key.tun_opts_len) { + switch (opts_type) { + case OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS: + err = validate_geneve_opts(&key); + if (err < 0) + return err; + break; + case OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS: + break; + } + }; + + start = add_nested_action_start(sfa, OVS_ACTION_ATTR_SET, log); + if (start < 0) + return start; + + a = __add_action(sfa, OVS_KEY_ATTR_TUNNEL_INFO, NULL, + sizeof(*tun_info) + key.tun_opts_len, log); + if (IS_ERR(a)) + return PTR_ERR(a); + + tun_info = nla_data(a); + tun_info->tunnel = key.tun_key; + tun_info->options_len = key.tun_opts_len; + + if (tun_info->options_len) { + /* We need to store the options in the action itself since + * everything else will go away after flow setup. We can append + * it to tun_info and then point there. + */ + memcpy((tun_info + 1), + TUN_METADATA_OPTS(&key, key.tun_opts_len), key.tun_opts_len); + tun_info->options = (tun_info + 1); + } else { + tun_info->options = NULL; + } + + add_nested_action_end(*sfa, start); + + return err; +} + +/* Return false if there are any non-masked bits set. + * Mask follows data immediately, before any netlink padding. + */ +static bool validate_masked(u8 *data, int len) +{ + u8 *mask = data + len; + + while (len--) + if (*data++ & ~*mask++) + return false; + + return true; +} + +static int validate_set(const struct nlattr *a, + const struct sw_flow_key *flow_key, + struct sw_flow_actions **sfa, + bool *skip_copy, __be16 eth_type, bool masked, bool log) +{ + const struct nlattr *ovs_key = nla_data(a); + int key_type = nla_type(ovs_key); + size_t key_len; + + /* There can be only one key in a action */ + if (nla_total_size(nla_len(ovs_key)) != nla_len(a)) + return -EINVAL; + + key_len = nla_len(ovs_key); + if (masked) + key_len /= 2; + + if (key_type > OVS_KEY_ATTR_MAX || + (ovs_key_lens[key_type].len != key_len && + ovs_key_lens[key_type].len != OVS_ATTR_NESTED)) + return -EINVAL; + + if (masked && !validate_masked(nla_data(ovs_key), key_len)) + return -EINVAL; + + switch (key_type) { + const struct ovs_key_ipv4 *ipv4_key; + const struct ovs_key_ipv6 *ipv6_key; + int err; + + case OVS_KEY_ATTR_PRIORITY: + case OVS_KEY_ATTR_SKB_MARK: + case OVS_KEY_ATTR_ETHERNET: + break; + + case OVS_KEY_ATTR_TUNNEL: + if (eth_p_mpls(eth_type)) + return -EINVAL; + + if (masked) + return -EINVAL; /* Masked tunnel set not supported. */ + + *skip_copy = true; + err = validate_and_copy_set_tun(a, sfa, log); + if (err) + return err; + break; + + case OVS_KEY_ATTR_IPV4: + if (eth_type != htons(ETH_P_IP)) + return -EINVAL; + + ipv4_key = nla_data(ovs_key); + + if (masked) { + const struct ovs_key_ipv4 *mask = ipv4_key + 1; + + /* Non-writeable fields. */ + if (mask->ipv4_proto || mask->ipv4_frag) + return -EINVAL; + } else { + if (ipv4_key->ipv4_proto != flow_key->ip.proto) + return -EINVAL; + + if (ipv4_key->ipv4_frag != flow_key->ip.frag) + return -EINVAL; + } + break; + + case OVS_KEY_ATTR_IPV6: + if (eth_type != htons(ETH_P_IPV6)) + return -EINVAL; + + ipv6_key = nla_data(ovs_key); + + if (masked) { + const struct ovs_key_ipv6 *mask = ipv6_key + 1; + + /* Non-writeable fields. */ + if (mask->ipv6_proto || mask->ipv6_frag) + return -EINVAL; + + /* Invalid bits in the flow label mask? */ + if (ntohl(mask->ipv6_label) & 0xFFF00000) + return -EINVAL; + } else { + if (ipv6_key->ipv6_proto != flow_key->ip.proto) + return -EINVAL; + + if (ipv6_key->ipv6_frag != flow_key->ip.frag) + return -EINVAL; + } + if (ntohl(ipv6_key->ipv6_label) & 0xFFF00000) + return -EINVAL; + + break; + + case OVS_KEY_ATTR_TCP: + if ((eth_type != htons(ETH_P_IP) && + eth_type != htons(ETH_P_IPV6)) || + flow_key->ip.proto != IPPROTO_TCP) + return -EINVAL; + + break; + + case OVS_KEY_ATTR_UDP: + if ((eth_type != htons(ETH_P_IP) && + eth_type != htons(ETH_P_IPV6)) || + flow_key->ip.proto != IPPROTO_UDP) + return -EINVAL; + + break; + + case OVS_KEY_ATTR_MPLS: + if (!eth_p_mpls(eth_type)) + return -EINVAL; + break; + + case OVS_KEY_ATTR_SCTP: + if ((eth_type != htons(ETH_P_IP) && + eth_type != htons(ETH_P_IPV6)) || + flow_key->ip.proto != IPPROTO_SCTP) + return -EINVAL; + + break; + + default: + return -EINVAL; + } + + /* Convert non-masked non-tunnel set actions to masked set actions. */ + if (!masked && key_type != OVS_KEY_ATTR_TUNNEL) { + int start, len = key_len * 2; + struct nlattr *at; + + *skip_copy = true; + + start = add_nested_action_start(sfa, + OVS_ACTION_ATTR_SET_TO_MASKED, + log); + if (start < 0) + return start; + + at = __add_action(sfa, key_type, NULL, len, log); + if (IS_ERR(at)) + return PTR_ERR(at); + + memcpy(nla_data(at), nla_data(ovs_key), key_len); /* Key. */ + memset(nla_data(at) + key_len, 0xff, key_len); /* Mask. */ + /* Clear non-writeable bits from otherwise writeable fields. */ + if (key_type == OVS_KEY_ATTR_IPV6) { + struct ovs_key_ipv6 *mask = nla_data(at) + key_len; + + mask->ipv6_label &= htonl(0x000FFFFF); + } + add_nested_action_end(*sfa, start); + } + + return 0; +} + +static int validate_userspace(const struct nlattr *attr) +{ + static const struct nla_policy userspace_policy[OVS_USERSPACE_ATTR_MAX + 1] = { + [OVS_USERSPACE_ATTR_PID] = {.type = NLA_U32 }, + [OVS_USERSPACE_ATTR_USERDATA] = {.type = NLA_UNSPEC }, + [OVS_USERSPACE_ATTR_EGRESS_TUN_PORT] = {.type = NLA_U32 }, + }; + struct nlattr *a[OVS_USERSPACE_ATTR_MAX + 1]; + int error; + + error = nla_parse_nested(a, OVS_USERSPACE_ATTR_MAX, + attr, userspace_policy); + if (error) + return error; + + if (!a[OVS_USERSPACE_ATTR_PID] || + !nla_get_u32(a[OVS_USERSPACE_ATTR_PID])) + return -EINVAL; + + return 0; +} + +static int copy_action(const struct nlattr *from, + struct sw_flow_actions **sfa, bool log) +{ + int totlen = NLA_ALIGN(from->nla_len); + struct nlattr *to; + + to = reserve_sfa_size(sfa, from->nla_len, log); + if (IS_ERR(to)) + return PTR_ERR(to); + + memcpy(to, from, totlen); + return 0; +} + +static int __ovs_nla_copy_actions(const struct nlattr *attr, + const struct sw_flow_key *key, + int depth, struct sw_flow_actions **sfa, + __be16 eth_type, __be16 vlan_tci, bool log) +{ + const struct nlattr *a; + int rem, err; + + if (depth >= SAMPLE_ACTION_DEPTH) + return -EOVERFLOW; + + nla_for_each_nested(a, attr, rem) { + /* Expected argument lengths, (u32)-1 for variable length. */ + static const u32 action_lens[OVS_ACTION_ATTR_MAX + 1] = { + [OVS_ACTION_ATTR_OUTPUT] = sizeof(u32), + [OVS_ACTION_ATTR_RECIRC] = sizeof(u32), + [OVS_ACTION_ATTR_USERSPACE] = (u32)-1, + [OVS_ACTION_ATTR_PUSH_MPLS] = sizeof(struct ovs_action_push_mpls), + [OVS_ACTION_ATTR_POP_MPLS] = sizeof(__be16), + [OVS_ACTION_ATTR_PUSH_VLAN] = sizeof(struct ovs_action_push_vlan), + [OVS_ACTION_ATTR_POP_VLAN] = 0, + [OVS_ACTION_ATTR_SET] = (u32)-1, + [OVS_ACTION_ATTR_SET_MASKED] = (u32)-1, + [OVS_ACTION_ATTR_SAMPLE] = (u32)-1, + [OVS_ACTION_ATTR_HASH] = sizeof(struct ovs_action_hash) + }; + const struct ovs_action_push_vlan *vlan; + int type = nla_type(a); + bool skip_copy; + + if (type > OVS_ACTION_ATTR_MAX || + (action_lens[type] != nla_len(a) && + action_lens[type] != (u32)-1)) + return -EINVAL; + + skip_copy = false; + switch (type) { + case OVS_ACTION_ATTR_UNSPEC: + return -EINVAL; + + case OVS_ACTION_ATTR_USERSPACE: + err = validate_userspace(a); + if (err) + return err; + break; + + case OVS_ACTION_ATTR_OUTPUT: + if (nla_get_u32(a) >= DP_MAX_PORTS) + return -EINVAL; + break; + + case OVS_ACTION_ATTR_HASH: { + const struct ovs_action_hash *act_hash = nla_data(a); + + switch (act_hash->hash_alg) { + case OVS_HASH_ALG_L4: + break; + default: + return -EINVAL; + } + + break; + } + + case OVS_ACTION_ATTR_POP_VLAN: + vlan_tci = htons(0); + break; + + case OVS_ACTION_ATTR_PUSH_VLAN: + vlan = nla_data(a); + if (vlan->vlan_tpid != htons(ETH_P_8021Q)) + return -EINVAL; + if (!(vlan->vlan_tci & htons(VLAN_TAG_PRESENT))) + return -EINVAL; + vlan_tci = vlan->vlan_tci; + break; + + case OVS_ACTION_ATTR_RECIRC: + break; + + case OVS_ACTION_ATTR_PUSH_MPLS: { + const struct ovs_action_push_mpls *mpls = nla_data(a); + + if (!eth_p_mpls(mpls->mpls_ethertype)) + return -EINVAL; + /* Prohibit push MPLS other than to a white list + * for packets that have a known tag order. + */ + if (vlan_tci & htons(VLAN_TAG_PRESENT) || + (eth_type != htons(ETH_P_IP) && + eth_type != htons(ETH_P_IPV6) && + eth_type != htons(ETH_P_ARP) && + eth_type != htons(ETH_P_RARP) && + !eth_p_mpls(eth_type))) + return -EINVAL; + eth_type = mpls->mpls_ethertype; + break; + } + + case OVS_ACTION_ATTR_POP_MPLS: + if (vlan_tci & htons(VLAN_TAG_PRESENT) || + !eth_p_mpls(eth_type)) + return -EINVAL; + + /* Disallow subsequent L2.5+ set and mpls_pop actions + * as there is no check here to ensure that the new + * eth_type is valid and thus set actions could + * write off the end of the packet or otherwise + * corrupt it. + * + * Support for these actions is planned using packet + * recirculation. + */ + eth_type = htons(0); + break; + + case OVS_ACTION_ATTR_SET: + err = validate_set(a, key, sfa, + &skip_copy, eth_type, false, log); + if (err) + return err; + break; + + case OVS_ACTION_ATTR_SET_MASKED: + err = validate_set(a, key, sfa, + &skip_copy, eth_type, true, log); + if (err) + return err; + break; + + case OVS_ACTION_ATTR_SAMPLE: + err = validate_and_copy_sample(a, key, depth, sfa, + eth_type, vlan_tci, log); + if (err) + return err; + skip_copy = true; + break; + + default: + OVS_NLERR(log, "Unknown Action type %d", type); + return -EINVAL; + } + if (!skip_copy) { + err = copy_action(a, sfa, log); + if (err) + return err; + } + } + + if (rem > 0) + return -EINVAL; + + return 0; +} + +/* 'key' must be the masked key. */ +int ovs_nla_copy_actions(const struct nlattr *attr, + const struct sw_flow_key *key, + struct sw_flow_actions **sfa, bool log) +{ + int err; + + *sfa = nla_alloc_flow_actions(nla_len(attr), log); + if (IS_ERR(*sfa)) + return PTR_ERR(*sfa); + + err = __ovs_nla_copy_actions(attr, key, 0, sfa, key->eth.type, + key->eth.tci, log); + if (err) + kfree(*sfa); + + return err; +} + +static int sample_action_to_attr(const struct nlattr *attr, struct sk_buff *skb) +{ + const struct nlattr *a; + struct nlattr *start; + int err = 0, rem; + + start = nla_nest_start(skb, OVS_ACTION_ATTR_SAMPLE); + if (!start) + return -EMSGSIZE; + + nla_for_each_nested(a, attr, rem) { + int type = nla_type(a); + struct nlattr *st_sample; + + switch (type) { + case OVS_SAMPLE_ATTR_PROBABILITY: + if (nla_put(skb, OVS_SAMPLE_ATTR_PROBABILITY, + sizeof(u32), nla_data(a))) + return -EMSGSIZE; + break; + case OVS_SAMPLE_ATTR_ACTIONS: + st_sample = nla_nest_start(skb, OVS_SAMPLE_ATTR_ACTIONS); + if (!st_sample) + return -EMSGSIZE; + err = ovs_nla_put_actions(nla_data(a), nla_len(a), skb); + if (err) + return err; + nla_nest_end(skb, st_sample); + break; + } + } + + nla_nest_end(skb, start); + return err; +} + +static int set_action_to_attr(const struct nlattr *a, struct sk_buff *skb) +{ + const struct nlattr *ovs_key = nla_data(a); + int key_type = nla_type(ovs_key); + struct nlattr *start; + int err; + + switch (key_type) { + case OVS_KEY_ATTR_TUNNEL_INFO: { + struct ovs_tunnel_info *tun_info = nla_data(ovs_key); + + start = nla_nest_start(skb, OVS_ACTION_ATTR_SET); + if (!start) + return -EMSGSIZE; + + err = ipv4_tun_to_nlattr(skb, &tun_info->tunnel, + tun_info->options_len ? + tun_info->options : NULL, + tun_info->options_len); + if (err) + return err; + nla_nest_end(skb, start); + break; + } + default: + if (nla_put(skb, OVS_ACTION_ATTR_SET, nla_len(a), ovs_key)) + return -EMSGSIZE; + break; + } + + return 0; +} + +static int masked_set_action_to_set_action_attr(const struct nlattr *a, + struct sk_buff *skb) +{ + const struct nlattr *ovs_key = nla_data(a); + struct nlattr *nla; + size_t key_len = nla_len(ovs_key) / 2; + + /* Revert the conversion we did from a non-masked set action to + * masked set action. + */ + nla = nla_nest_start(skb, OVS_ACTION_ATTR_SET); + if (!nla) + return -EMSGSIZE; + + if (nla_put(skb, nla_type(ovs_key), key_len, nla_data(ovs_key))) + return -EMSGSIZE; + + nla_nest_end(skb, nla); + return 0; +} + +int ovs_nla_put_actions(const struct nlattr *attr, int len, struct sk_buff *skb) +{ + const struct nlattr *a; + int rem, err; + + nla_for_each_attr(a, attr, len, rem) { + int type = nla_type(a); + + switch (type) { + case OVS_ACTION_ATTR_SET: + err = set_action_to_attr(a, skb); + if (err) + return err; + break; + + case OVS_ACTION_ATTR_SET_TO_MASKED: + err = masked_set_action_to_set_action_attr(a, skb); + if (err) + return err; + break; + + case OVS_ACTION_ATTR_SAMPLE: + err = sample_action_to_attr(a, skb); + if (err) + return err; + break; + default: + if (nla_put(skb, type, nla_len(a), nla_data(a))) + return -EMSGSIZE; + break; + } + } + + return 0; +} diff --git a/net/openvswitch/flow_netlink.h b/net/openvswitch/flow_netlink.h new file mode 100644 index 000000000..5c3d75bff --- /dev/null +++ b/net/openvswitch/flow_netlink.h @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2007-2013 Nicira, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + */ + + +#ifndef FLOW_NETLINK_H +#define FLOW_NETLINK_H 1 + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "flow.h" + +size_t ovs_tun_key_attr_size(void); +size_t ovs_key_attr_size(void); + +void ovs_match_init(struct sw_flow_match *match, + struct sw_flow_key *key, struct sw_flow_mask *mask); + +int ovs_nla_put_key(const struct sw_flow_key *, const struct sw_flow_key *, + int attr, bool is_mask, struct sk_buff *); +int ovs_nla_get_flow_metadata(const struct nlattr *, struct sw_flow_key *, + bool log); + +int ovs_nla_put_identifier(const struct sw_flow *flow, struct sk_buff *skb); +int ovs_nla_put_masked_key(const struct sw_flow *flow, struct sk_buff *skb); +int ovs_nla_put_mask(const struct sw_flow *flow, struct sk_buff *skb); + +int ovs_nla_get_match(struct sw_flow_match *, const struct nlattr *key, + const struct nlattr *mask, bool log); +int ovs_nla_put_egress_tunnel_key(struct sk_buff *, + const struct ovs_tunnel_info *); + +bool ovs_nla_get_ufid(struct sw_flow_id *, const struct nlattr *, bool log); +int ovs_nla_get_identifier(struct sw_flow_id *sfid, const struct nlattr *ufid, + const struct sw_flow_key *key, bool log); +u32 ovs_nla_get_ufid_flags(const struct nlattr *attr); + +int ovs_nla_copy_actions(const struct nlattr *attr, + const struct sw_flow_key *key, + struct sw_flow_actions **sfa, bool log); +int ovs_nla_put_actions(const struct nlattr *attr, + int len, struct sk_buff *skb); + +void ovs_nla_free_flow_actions(struct sw_flow_actions *); + +#endif /* flow_netlink.h */ diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c new file mode 100644 index 000000000..4613df8c8 --- /dev/null +++ b/net/openvswitch/flow_table.c @@ -0,0 +1,778 @@ +/* + * Copyright (c) 2007-2014 Nicira, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + */ + +#include "flow.h" +#include "datapath.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define TBL_MIN_BUCKETS 1024 +#define REHASH_INTERVAL (10 * 60 * HZ) + +static struct kmem_cache *flow_cache; +struct kmem_cache *flow_stats_cache __read_mostly; + +static u16 range_n_bytes(const struct sw_flow_key_range *range) +{ + return range->end - range->start; +} + +void ovs_flow_mask_key(struct sw_flow_key *dst, const struct sw_flow_key *src, + const struct sw_flow_mask *mask) +{ + const long *m = (const long *)((const u8 *)&mask->key + + mask->range.start); + const long *s = (const long *)((const u8 *)src + + mask->range.start); + long *d = (long *)((u8 *)dst + mask->range.start); + int i; + + /* The memory outside of the 'mask->range' are not set since + * further operations on 'dst' only uses contents within + * 'mask->range'. + */ + for (i = 0; i < range_n_bytes(&mask->range); i += sizeof(long)) + *d++ = *s++ & *m++; +} + +struct sw_flow *ovs_flow_alloc(void) +{ + struct sw_flow *flow; + struct flow_stats *stats; + int node; + + flow = kmem_cache_alloc(flow_cache, GFP_KERNEL); + if (!flow) + return ERR_PTR(-ENOMEM); + + flow->sf_acts = NULL; + flow->mask = NULL; + flow->id.unmasked_key = NULL; + flow->id.ufid_len = 0; + flow->stats_last_writer = NUMA_NO_NODE; + + /* Initialize the default stat node. */ + stats = kmem_cache_alloc_node(flow_stats_cache, + GFP_KERNEL | __GFP_ZERO, 0); + if (!stats) + goto err; + + spin_lock_init(&stats->lock); + + RCU_INIT_POINTER(flow->stats[0], stats); + + for_each_node(node) + if (node != 0) + RCU_INIT_POINTER(flow->stats[node], NULL); + + return flow; +err: + kmem_cache_free(flow_cache, flow); + return ERR_PTR(-ENOMEM); +} + +int ovs_flow_tbl_count(const struct flow_table *table) +{ + return table->count; +} + +static struct flex_array *alloc_buckets(unsigned int n_buckets) +{ + struct flex_array *buckets; + int i, err; + + buckets = flex_array_alloc(sizeof(struct hlist_head), + n_buckets, GFP_KERNEL); + if (!buckets) + return NULL; + + err = flex_array_prealloc(buckets, 0, n_buckets, GFP_KERNEL); + if (err) { + flex_array_free(buckets); + return NULL; + } + + for (i = 0; i < n_buckets; i++) + INIT_HLIST_HEAD((struct hlist_head *) + flex_array_get(buckets, i)); + + return buckets; +} + +static void flow_free(struct sw_flow *flow) +{ + int node; + + if (ovs_identifier_is_key(&flow->id)) + kfree(flow->id.unmasked_key); + kfree((struct sw_flow_actions __force *)flow->sf_acts); + for_each_node(node) + if (flow->stats[node]) + kmem_cache_free(flow_stats_cache, + (struct flow_stats __force *)flow->stats[node]); + kmem_cache_free(flow_cache, flow); +} + +static void rcu_free_flow_callback(struct rcu_head *rcu) +{ + struct sw_flow *flow = container_of(rcu, struct sw_flow, rcu); + + flow_free(flow); +} + +void ovs_flow_free(struct sw_flow *flow, bool deferred) +{ + if (!flow) + return; + + if (deferred) + call_rcu(&flow->rcu, rcu_free_flow_callback); + else + flow_free(flow); +} + +static void free_buckets(struct flex_array *buckets) +{ + flex_array_free(buckets); +} + + +static void __table_instance_destroy(struct table_instance *ti) +{ + free_buckets(ti->buckets); + kfree(ti); +} + +static struct table_instance *table_instance_alloc(int new_size) +{ + struct table_instance *ti = kmalloc(sizeof(*ti), GFP_KERNEL); + + if (!ti) + return NULL; + + ti->buckets = alloc_buckets(new_size); + + if (!ti->buckets) { + kfree(ti); + return NULL; + } + ti->n_buckets = new_size; + ti->node_ver = 0; + ti->keep_flows = false; + get_random_bytes(&ti->hash_seed, sizeof(u32)); + + return ti; +} + +int ovs_flow_tbl_init(struct flow_table *table) +{ + struct table_instance *ti, *ufid_ti; + + ti = table_instance_alloc(TBL_MIN_BUCKETS); + + if (!ti) + return -ENOMEM; + + ufid_ti = table_instance_alloc(TBL_MIN_BUCKETS); + if (!ufid_ti) + goto free_ti; + + rcu_assign_pointer(table->ti, ti); + rcu_assign_pointer(table->ufid_ti, ufid_ti); + INIT_LIST_HEAD(&table->mask_list); + table->last_rehash = jiffies; + table->count = 0; + table->ufid_count = 0; + return 0; + +free_ti: + __table_instance_destroy(ti); + return -ENOMEM; +} + +static void flow_tbl_destroy_rcu_cb(struct rcu_head *rcu) +{ + struct table_instance *ti = container_of(rcu, struct table_instance, rcu); + + __table_instance_destroy(ti); +} + +static void table_instance_destroy(struct table_instance *ti, + struct table_instance *ufid_ti, + bool deferred) +{ + int i; + + if (!ti) + return; + + BUG_ON(!ufid_ti); + if (ti->keep_flows) + goto skip_flows; + + for (i = 0; i < ti->n_buckets; i++) { + struct sw_flow *flow; + struct hlist_head *head = flex_array_get(ti->buckets, i); + struct hlist_node *n; + int ver = ti->node_ver; + int ufid_ver = ufid_ti->node_ver; + + hlist_for_each_entry_safe(flow, n, head, flow_table.node[ver]) { + hlist_del_rcu(&flow->flow_table.node[ver]); + if (ovs_identifier_is_ufid(&flow->id)) + hlist_del_rcu(&flow->ufid_table.node[ufid_ver]); + ovs_flow_free(flow, deferred); + } + } + +skip_flows: + if (deferred) { + call_rcu(&ti->rcu, flow_tbl_destroy_rcu_cb); + call_rcu(&ufid_ti->rcu, flow_tbl_destroy_rcu_cb); + } else { + __table_instance_destroy(ti); + __table_instance_destroy(ufid_ti); + } +} + +/* No need for locking this function is called from RCU callback or + * error path. + */ +void ovs_flow_tbl_destroy(struct flow_table *table) +{ + struct table_instance *ti = rcu_dereference_raw(table->ti); + struct table_instance *ufid_ti = rcu_dereference_raw(table->ufid_ti); + + table_instance_destroy(ti, ufid_ti, false); +} + +struct sw_flow *ovs_flow_tbl_dump_next(struct table_instance *ti, + u32 *bucket, u32 *last) +{ + struct sw_flow *flow; + struct hlist_head *head; + int ver; + int i; + + ver = ti->node_ver; + while (*bucket < ti->n_buckets) { + i = 0; + head = flex_array_get(ti->buckets, *bucket); + hlist_for_each_entry_rcu(flow, head, flow_table.node[ver]) { + if (i < *last) { + i++; + continue; + } + *last = i + 1; + return flow; + } + (*bucket)++; + *last = 0; + } + + return NULL; +} + +static struct hlist_head *find_bucket(struct table_instance *ti, u32 hash) +{ + hash = jhash_1word(hash, ti->hash_seed); + return flex_array_get(ti->buckets, + (hash & (ti->n_buckets - 1))); +} + +static void table_instance_insert(struct table_instance *ti, + struct sw_flow *flow) +{ + struct hlist_head *head; + + head = find_bucket(ti, flow->flow_table.hash); + hlist_add_head_rcu(&flow->flow_table.node[ti->node_ver], head); +} + +static void ufid_table_instance_insert(struct table_instance *ti, + struct sw_flow *flow) +{ + struct hlist_head *head; + + head = find_bucket(ti, flow->ufid_table.hash); + hlist_add_head_rcu(&flow->ufid_table.node[ti->node_ver], head); +} + +static void flow_table_copy_flows(struct table_instance *old, + struct table_instance *new, bool ufid) +{ + int old_ver; + int i; + + old_ver = old->node_ver; + new->node_ver = !old_ver; + + /* Insert in new table. */ + for (i = 0; i < old->n_buckets; i++) { + struct sw_flow *flow; + struct hlist_head *head; + + head = flex_array_get(old->buckets, i); + + if (ufid) + hlist_for_each_entry(flow, head, + ufid_table.node[old_ver]) + ufid_table_instance_insert(new, flow); + else + hlist_for_each_entry(flow, head, + flow_table.node[old_ver]) + table_instance_insert(new, flow); + } + + old->keep_flows = true; +} + +static struct table_instance *table_instance_rehash(struct table_instance *ti, + int n_buckets, bool ufid) +{ + struct table_instance *new_ti; + + new_ti = table_instance_alloc(n_buckets); + if (!new_ti) + return NULL; + + flow_table_copy_flows(ti, new_ti, ufid); + + return new_ti; +} + +int ovs_flow_tbl_flush(struct flow_table *flow_table) +{ + struct table_instance *old_ti, *new_ti; + struct table_instance *old_ufid_ti, *new_ufid_ti; + + new_ti = table_instance_alloc(TBL_MIN_BUCKETS); + if (!new_ti) + return -ENOMEM; + new_ufid_ti = table_instance_alloc(TBL_MIN_BUCKETS); + if (!new_ufid_ti) + goto err_free_ti; + + old_ti = ovsl_dereference(flow_table->ti); + old_ufid_ti = ovsl_dereference(flow_table->ufid_ti); + + rcu_assign_pointer(flow_table->ti, new_ti); + rcu_assign_pointer(flow_table->ufid_ti, new_ufid_ti); + flow_table->last_rehash = jiffies; + flow_table->count = 0; + flow_table->ufid_count = 0; + + table_instance_destroy(old_ti, old_ufid_ti, true); + return 0; + +err_free_ti: + __table_instance_destroy(new_ti); + return -ENOMEM; +} + +static u32 flow_hash(const struct sw_flow_key *key, + const struct sw_flow_key_range *range) +{ + int key_start = range->start; + int key_end = range->end; + const u32 *hash_key = (const u32 *)((const u8 *)key + key_start); + int hash_u32s = (key_end - key_start) >> 2; + + /* Make sure number of hash bytes are multiple of u32. */ + BUILD_BUG_ON(sizeof(long) % sizeof(u32)); + + return jhash2(hash_key, hash_u32s, 0); +} + +static int flow_key_start(const struct sw_flow_key *key) +{ + if (key->tun_key.ipv4_dst) + return 0; + else + return rounddown(offsetof(struct sw_flow_key, phy), + sizeof(long)); +} + +static bool cmp_key(const struct sw_flow_key *key1, + const struct sw_flow_key *key2, + int key_start, int key_end) +{ + const long *cp1 = (const long *)((const u8 *)key1 + key_start); + const long *cp2 = (const long *)((const u8 *)key2 + key_start); + long diffs = 0; + int i; + + for (i = key_start; i < key_end; i += sizeof(long)) + diffs |= *cp1++ ^ *cp2++; + + return diffs == 0; +} + +static bool flow_cmp_masked_key(const struct sw_flow *flow, + const struct sw_flow_key *key, + const struct sw_flow_key_range *range) +{ + return cmp_key(&flow->key, key, range->start, range->end); +} + +static bool ovs_flow_cmp_unmasked_key(const struct sw_flow *flow, + const struct sw_flow_match *match) +{ + struct sw_flow_key *key = match->key; + int key_start = flow_key_start(key); + int key_end = match->range.end; + + BUG_ON(ovs_identifier_is_ufid(&flow->id)); + return cmp_key(flow->id.unmasked_key, key, key_start, key_end); +} + +static struct sw_flow *masked_flow_lookup(struct table_instance *ti, + const struct sw_flow_key *unmasked, + const struct sw_flow_mask *mask) +{ + struct sw_flow *flow; + struct hlist_head *head; + u32 hash; + struct sw_flow_key masked_key; + + ovs_flow_mask_key(&masked_key, unmasked, mask); + hash = flow_hash(&masked_key, &mask->range); + head = find_bucket(ti, hash); + hlist_for_each_entry_rcu(flow, head, flow_table.node[ti->node_ver]) { + if (flow->mask == mask && flow->flow_table.hash == hash && + flow_cmp_masked_key(flow, &masked_key, &mask->range)) + return flow; + } + return NULL; +} + +struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *tbl, + const struct sw_flow_key *key, + u32 *n_mask_hit) +{ + struct table_instance *ti = rcu_dereference_ovsl(tbl->ti); + struct sw_flow_mask *mask; + struct sw_flow *flow; + + *n_mask_hit = 0; + list_for_each_entry_rcu(mask, &tbl->mask_list, list) { + (*n_mask_hit)++; + flow = masked_flow_lookup(ti, key, mask); + if (flow) /* Found */ + return flow; + } + return NULL; +} + +struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *tbl, + const struct sw_flow_key *key) +{ + u32 __always_unused n_mask_hit; + + return ovs_flow_tbl_lookup_stats(tbl, key, &n_mask_hit); +} + +struct sw_flow *ovs_flow_tbl_lookup_exact(struct flow_table *tbl, + const struct sw_flow_match *match) +{ + struct table_instance *ti = rcu_dereference_ovsl(tbl->ti); + struct sw_flow_mask *mask; + struct sw_flow *flow; + + /* Always called under ovs-mutex. */ + list_for_each_entry(mask, &tbl->mask_list, list) { + flow = masked_flow_lookup(ti, match->key, mask); + if (flow && ovs_identifier_is_key(&flow->id) && + ovs_flow_cmp_unmasked_key(flow, match)) + return flow; + } + return NULL; +} + +static u32 ufid_hash(const struct sw_flow_id *sfid) +{ + return jhash(sfid->ufid, sfid->ufid_len, 0); +} + +static bool ovs_flow_cmp_ufid(const struct sw_flow *flow, + const struct sw_flow_id *sfid) +{ + if (flow->id.ufid_len != sfid->ufid_len) + return false; + + return !memcmp(flow->id.ufid, sfid->ufid, sfid->ufid_len); +} + +bool ovs_flow_cmp(const struct sw_flow *flow, const struct sw_flow_match *match) +{ + if (ovs_identifier_is_ufid(&flow->id)) + return flow_cmp_masked_key(flow, match->key, &match->range); + + return ovs_flow_cmp_unmasked_key(flow, match); +} + +struct sw_flow *ovs_flow_tbl_lookup_ufid(struct flow_table *tbl, + const struct sw_flow_id *ufid) +{ + struct table_instance *ti = rcu_dereference_ovsl(tbl->ufid_ti); + struct sw_flow *flow; + struct hlist_head *head; + u32 hash; + + hash = ufid_hash(ufid); + head = find_bucket(ti, hash); + hlist_for_each_entry_rcu(flow, head, ufid_table.node[ti->node_ver]) { + if (flow->ufid_table.hash == hash && + ovs_flow_cmp_ufid(flow, ufid)) + return flow; + } + return NULL; +} + +int ovs_flow_tbl_num_masks(const struct flow_table *table) +{ + struct sw_flow_mask *mask; + int num = 0; + + list_for_each_entry(mask, &table->mask_list, list) + num++; + + return num; +} + +static struct table_instance *table_instance_expand(struct table_instance *ti, + bool ufid) +{ + return table_instance_rehash(ti, ti->n_buckets * 2, ufid); +} + +/* Remove 'mask' from the mask list, if it is not needed any more. */ +static void flow_mask_remove(struct flow_table *tbl, struct sw_flow_mask *mask) +{ + if (mask) { + /* ovs-lock is required to protect mask-refcount and + * mask list. + */ + ASSERT_OVSL(); + BUG_ON(!mask->ref_count); + mask->ref_count--; + + if (!mask->ref_count) { + list_del_rcu(&mask->list); + kfree_rcu(mask, rcu); + } + } +} + +/* Must be called with OVS mutex held. */ +void ovs_flow_tbl_remove(struct flow_table *table, struct sw_flow *flow) +{ + struct table_instance *ti = ovsl_dereference(table->ti); + struct table_instance *ufid_ti = ovsl_dereference(table->ufid_ti); + + BUG_ON(table->count == 0); + hlist_del_rcu(&flow->flow_table.node[ti->node_ver]); + table->count--; + if (ovs_identifier_is_ufid(&flow->id)) { + hlist_del_rcu(&flow->ufid_table.node[ufid_ti->node_ver]); + table->ufid_count--; + } + + /* RCU delete the mask. 'flow->mask' is not NULLed, as it should be + * accessible as long as the RCU read lock is held. + */ + flow_mask_remove(table, flow->mask); +} + +static struct sw_flow_mask *mask_alloc(void) +{ + struct sw_flow_mask *mask; + + mask = kmalloc(sizeof(*mask), GFP_KERNEL); + if (mask) + mask->ref_count = 1; + + return mask; +} + +static bool mask_equal(const struct sw_flow_mask *a, + const struct sw_flow_mask *b) +{ + const u8 *a_ = (const u8 *)&a->key + a->range.start; + const u8 *b_ = (const u8 *)&b->key + b->range.start; + + return (a->range.end == b->range.end) + && (a->range.start == b->range.start) + && (memcmp(a_, b_, range_n_bytes(&a->range)) == 0); +} + +static struct sw_flow_mask *flow_mask_find(const struct flow_table *tbl, + const struct sw_flow_mask *mask) +{ + struct list_head *ml; + + list_for_each(ml, &tbl->mask_list) { + struct sw_flow_mask *m; + m = container_of(ml, struct sw_flow_mask, list); + if (mask_equal(mask, m)) + return m; + } + + return NULL; +} + +/* Add 'mask' into the mask list, if it is not already there. */ +static int flow_mask_insert(struct flow_table *tbl, struct sw_flow *flow, + const struct sw_flow_mask *new) +{ + struct sw_flow_mask *mask; + mask = flow_mask_find(tbl, new); + if (!mask) { + /* Allocate a new mask if none exsits. */ + mask = mask_alloc(); + if (!mask) + return -ENOMEM; + mask->key = new->key; + mask->range = new->range; + list_add_rcu(&mask->list, &tbl->mask_list); + } else { + BUG_ON(!mask->ref_count); + mask->ref_count++; + } + + flow->mask = mask; + return 0; +} + +/* Must be called with OVS mutex held. */ +static void flow_key_insert(struct flow_table *table, struct sw_flow *flow) +{ + struct table_instance *new_ti = NULL; + struct table_instance *ti; + + flow->flow_table.hash = flow_hash(&flow->key, &flow->mask->range); + ti = ovsl_dereference(table->ti); + table_instance_insert(ti, flow); + table->count++; + + /* Expand table, if necessary, to make room. */ + if (table->count > ti->n_buckets) + new_ti = table_instance_expand(ti, false); + else if (time_after(jiffies, table->last_rehash + REHASH_INTERVAL)) + new_ti = table_instance_rehash(ti, ti->n_buckets, false); + + if (new_ti) { + rcu_assign_pointer(table->ti, new_ti); + call_rcu(&ti->rcu, flow_tbl_destroy_rcu_cb); + table->last_rehash = jiffies; + } +} + +/* Must be called with OVS mutex held. */ +static void flow_ufid_insert(struct flow_table *table, struct sw_flow *flow) +{ + struct table_instance *ti; + + flow->ufid_table.hash = ufid_hash(&flow->id); + ti = ovsl_dereference(table->ufid_ti); + ufid_table_instance_insert(ti, flow); + table->ufid_count++; + + /* Expand table, if necessary, to make room. */ + if (table->ufid_count > ti->n_buckets) { + struct table_instance *new_ti; + + new_ti = table_instance_expand(ti, true); + if (new_ti) { + rcu_assign_pointer(table->ufid_ti, new_ti); + call_rcu(&ti->rcu, flow_tbl_destroy_rcu_cb); + } + } +} + +/* Must be called with OVS mutex held. */ +int ovs_flow_tbl_insert(struct flow_table *table, struct sw_flow *flow, + const struct sw_flow_mask *mask) +{ + int err; + + err = flow_mask_insert(table, flow, mask); + if (err) + return err; + flow_key_insert(table, flow); + if (ovs_identifier_is_ufid(&flow->id)) + flow_ufid_insert(table, flow); + + return 0; +} + +/* Initializes the flow module. + * Returns zero if successful or a negative error code. */ +int ovs_flow_init(void) +{ + BUILD_BUG_ON(__alignof__(struct sw_flow_key) % __alignof__(long)); + BUILD_BUG_ON(sizeof(struct sw_flow_key) % sizeof(long)); + + flow_cache = kmem_cache_create("sw_flow", sizeof(struct sw_flow) + + (num_possible_nodes() + * sizeof(struct flow_stats *)), + 0, 0, NULL); + if (flow_cache == NULL) + return -ENOMEM; + + flow_stats_cache + = kmem_cache_create("sw_flow_stats", sizeof(struct flow_stats), + 0, SLAB_HWCACHE_ALIGN, NULL); + if (flow_stats_cache == NULL) { + kmem_cache_destroy(flow_cache); + flow_cache = NULL; + return -ENOMEM; + } + + return 0; +} + +/* Uninitializes the flow module. */ +void ovs_flow_exit(void) +{ + kmem_cache_destroy(flow_stats_cache); + kmem_cache_destroy(flow_cache); +} diff --git a/net/openvswitch/flow_table.h b/net/openvswitch/flow_table.h new file mode 100644 index 000000000..616eda10d --- /dev/null +++ b/net/openvswitch/flow_table.h @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2007-2013 Nicira, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + */ + +#ifndef FLOW_TABLE_H +#define FLOW_TABLE_H 1 + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "flow.h" + +struct table_instance { + struct flex_array *buckets; + unsigned int n_buckets; + struct rcu_head rcu; + int node_ver; + u32 hash_seed; + bool keep_flows; +}; + +struct flow_table { + struct table_instance __rcu *ti; + struct table_instance __rcu *ufid_ti; + struct list_head mask_list; + unsigned long last_rehash; + unsigned int count; + unsigned int ufid_count; +}; + +extern struct kmem_cache *flow_stats_cache; + +int ovs_flow_init(void); +void ovs_flow_exit(void); + +struct sw_flow *ovs_flow_alloc(void); +void ovs_flow_free(struct sw_flow *, bool deferred); + +int ovs_flow_tbl_init(struct flow_table *); +int ovs_flow_tbl_count(const struct flow_table *table); +void ovs_flow_tbl_destroy(struct flow_table *table); +int ovs_flow_tbl_flush(struct flow_table *flow_table); + +int ovs_flow_tbl_insert(struct flow_table *table, struct sw_flow *flow, + const struct sw_flow_mask *mask); +void ovs_flow_tbl_remove(struct flow_table *table, struct sw_flow *flow); +int ovs_flow_tbl_num_masks(const struct flow_table *table); +struct sw_flow *ovs_flow_tbl_dump_next(struct table_instance *table, + u32 *bucket, u32 *idx); +struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *, + const struct sw_flow_key *, + u32 *n_mask_hit); +struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *, + const struct sw_flow_key *); +struct sw_flow *ovs_flow_tbl_lookup_exact(struct flow_table *tbl, + const struct sw_flow_match *match); +struct sw_flow *ovs_flow_tbl_lookup_ufid(struct flow_table *, + const struct sw_flow_id *); + +bool ovs_flow_cmp(const struct sw_flow *, const struct sw_flow_match *); + +void ovs_flow_mask_key(struct sw_flow_key *dst, const struct sw_flow_key *src, + const struct sw_flow_mask *mask); +#endif /* flow_table.h */ diff --git a/net/openvswitch/vport-geneve.c b/net/openvswitch/vport-geneve.c new file mode 100644 index 000000000..bf02fd580 --- /dev/null +++ b/net/openvswitch/vport-geneve.c @@ -0,0 +1,274 @@ +/* + * Copyright (c) 2014 Nicira, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "datapath.h" +#include "vport.h" + +static struct vport_ops ovs_geneve_vport_ops; + +/** + * struct geneve_port - Keeps track of open UDP ports + * @gs: The socket created for this port number. + * @name: vport name. + */ +struct geneve_port { + struct geneve_sock *gs; + char name[IFNAMSIZ]; +}; + +static LIST_HEAD(geneve_ports); + +static inline struct geneve_port *geneve_vport(const struct vport *vport) +{ + return vport_priv(vport); +} + +static inline struct genevehdr *geneve_hdr(const struct sk_buff *skb) +{ + return (struct genevehdr *)(udp_hdr(skb) + 1); +} + +/* Convert 64 bit tunnel ID to 24 bit VNI. */ +static void tunnel_id_to_vni(__be64 tun_id, __u8 *vni) +{ +#ifdef __BIG_ENDIAN + vni[0] = (__force __u8)(tun_id >> 16); + vni[1] = (__force __u8)(tun_id >> 8); + vni[2] = (__force __u8)tun_id; +#else + vni[0] = (__force __u8)((__force u64)tun_id >> 40); + vni[1] = (__force __u8)((__force u64)tun_id >> 48); + vni[2] = (__force __u8)((__force u64)tun_id >> 56); +#endif +} + +/* Convert 24 bit VNI to 64 bit tunnel ID. */ +static __be64 vni_to_tunnel_id(const __u8 *vni) +{ +#ifdef __BIG_ENDIAN + return (vni[0] << 16) | (vni[1] << 8) | vni[2]; +#else + return (__force __be64)(((__force u64)vni[0] << 40) | + ((__force u64)vni[1] << 48) | + ((__force u64)vni[2] << 56)); +#endif +} + +static void geneve_rcv(struct geneve_sock *gs, struct sk_buff *skb) +{ + struct vport *vport = gs->rcv_data; + struct genevehdr *geneveh = geneve_hdr(skb); + int opts_len; + struct ovs_tunnel_info tun_info; + __be64 key; + __be16 flags; + + opts_len = geneveh->opt_len * 4; + + flags = TUNNEL_KEY | TUNNEL_GENEVE_OPT | + (udp_hdr(skb)->check != 0 ? TUNNEL_CSUM : 0) | + (geneveh->oam ? TUNNEL_OAM : 0) | + (geneveh->critical ? TUNNEL_CRIT_OPT : 0); + + key = vni_to_tunnel_id(geneveh->vni); + + ovs_flow_tun_info_init(&tun_info, ip_hdr(skb), + udp_hdr(skb)->source, udp_hdr(skb)->dest, + key, flags, + geneveh->options, opts_len); + + ovs_vport_receive(vport, skb, &tun_info); +} + +static int geneve_get_options(const struct vport *vport, + struct sk_buff *skb) +{ + struct geneve_port *geneve_port = geneve_vport(vport); + struct inet_sock *sk = inet_sk(geneve_port->gs->sock->sk); + + if (nla_put_u16(skb, OVS_TUNNEL_ATTR_DST_PORT, ntohs(sk->inet_sport))) + return -EMSGSIZE; + return 0; +} + +static void geneve_tnl_destroy(struct vport *vport) +{ + struct geneve_port *geneve_port = geneve_vport(vport); + + geneve_sock_release(geneve_port->gs); + + ovs_vport_deferred_free(vport); +} + +static struct vport *geneve_tnl_create(const struct vport_parms *parms) +{ + struct net *net = ovs_dp_get_net(parms->dp); + struct nlattr *options = parms->options; + struct geneve_port *geneve_port; + struct geneve_sock *gs; + struct vport *vport; + struct nlattr *a; + int err; + u16 dst_port; + + if (!options) { + err = -EINVAL; + goto error; + } + + a = nla_find_nested(options, OVS_TUNNEL_ATTR_DST_PORT); + if (a && nla_len(a) == sizeof(u16)) { + dst_port = nla_get_u16(a); + } else { + /* Require destination port from userspace. */ + err = -EINVAL; + goto error; + } + + vport = ovs_vport_alloc(sizeof(struct geneve_port), + &ovs_geneve_vport_ops, parms); + if (IS_ERR(vport)) + return vport; + + geneve_port = geneve_vport(vport); + strncpy(geneve_port->name, parms->name, IFNAMSIZ); + + gs = geneve_sock_add(net, htons(dst_port), geneve_rcv, vport, true, 0); + if (IS_ERR(gs)) { + ovs_vport_free(vport); + return (void *)gs; + } + geneve_port->gs = gs; + + return vport; +error: + return ERR_PTR(err); +} + +static int geneve_tnl_send(struct vport *vport, struct sk_buff *skb) +{ + const struct ovs_key_ipv4_tunnel *tun_key; + struct ovs_tunnel_info *tun_info; + struct net *net = ovs_dp_get_net(vport->dp); + struct geneve_port *geneve_port = geneve_vport(vport); + __be16 dport = inet_sk(geneve_port->gs->sock->sk)->inet_sport; + __be16 sport; + struct rtable *rt; + struct flowi4 fl; + u8 vni[3], opts_len, *opts; + __be16 df; + int err; + + tun_info = OVS_CB(skb)->egress_tun_info; + if (unlikely(!tun_info)) { + err = -EINVAL; + goto error; + } + + tun_key = &tun_info->tunnel; + rt = ovs_tunnel_route_lookup(net, tun_key, skb->mark, &fl, IPPROTO_UDP); + if (IS_ERR(rt)) { + err = PTR_ERR(rt); + goto error; + } + + df = tun_key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0; + sport = udp_flow_src_port(net, skb, 1, USHRT_MAX, true); + tunnel_id_to_vni(tun_key->tun_id, vni); + skb->ignore_df = 1; + + if (tun_key->tun_flags & TUNNEL_GENEVE_OPT) { + opts = (u8 *)tun_info->options; + opts_len = tun_info->options_len; + } else { + opts = NULL; + opts_len = 0; + } + + err = geneve_xmit_skb(geneve_port->gs, rt, skb, fl.saddr, + tun_key->ipv4_dst, tun_key->ipv4_tos, + tun_key->ipv4_ttl, df, sport, dport, + tun_key->tun_flags, vni, opts_len, opts, + !!(tun_key->tun_flags & TUNNEL_CSUM), false); + if (err < 0) + ip_rt_put(rt); + return err; + +error: + kfree_skb(skb); + return err; +} + +static const char *geneve_get_name(const struct vport *vport) +{ + struct geneve_port *geneve_port = geneve_vport(vport); + + return geneve_port->name; +} + +static int geneve_get_egress_tun_info(struct vport *vport, struct sk_buff *skb, + struct ovs_tunnel_info *egress_tun_info) +{ + struct geneve_port *geneve_port = geneve_vport(vport); + struct net *net = ovs_dp_get_net(vport->dp); + __be16 dport = inet_sk(geneve_port->gs->sock->sk)->inet_sport; + __be16 sport = udp_flow_src_port(net, skb, 1, USHRT_MAX, true); + + /* Get tp_src and tp_dst, refert to geneve_build_header(). + */ + return ovs_tunnel_get_egress_info(egress_tun_info, + ovs_dp_get_net(vport->dp), + OVS_CB(skb)->egress_tun_info, + IPPROTO_UDP, skb->mark, sport, dport); +} + +static struct vport_ops ovs_geneve_vport_ops = { + .type = OVS_VPORT_TYPE_GENEVE, + .create = geneve_tnl_create, + .destroy = geneve_tnl_destroy, + .get_name = geneve_get_name, + .get_options = geneve_get_options, + .send = geneve_tnl_send, + .owner = THIS_MODULE, + .get_egress_tun_info = geneve_get_egress_tun_info, +}; + +static int __init ovs_geneve_tnl_init(void) +{ + return ovs_vport_ops_register(&ovs_geneve_vport_ops); +} + +static void __exit ovs_geneve_tnl_exit(void) +{ + ovs_vport_ops_unregister(&ovs_geneve_vport_ops); +} + +module_init(ovs_geneve_tnl_init); +module_exit(ovs_geneve_tnl_exit); + +MODULE_DESCRIPTION("OVS: Geneve swiching port"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("vport-type-5"); diff --git a/net/openvswitch/vport-gre.c b/net/openvswitch/vport-gre.c new file mode 100644 index 000000000..f17ac9642 --- /dev/null +++ b/net/openvswitch/vport-gre.c @@ -0,0 +1,313 @@ +/* + * Copyright (c) 2007-2014 Nicira, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "datapath.h" +#include "vport.h" + +static struct vport_ops ovs_gre_vport_ops; + +/* Returns the least-significant 32 bits of a __be64. */ +static __be32 be64_get_low32(__be64 x) +{ +#ifdef __BIG_ENDIAN + return (__force __be32)x; +#else + return (__force __be32)((__force u64)x >> 32); +#endif +} + +static __be16 filter_tnl_flags(__be16 flags) +{ + return flags & (TUNNEL_CSUM | TUNNEL_KEY); +} + +static struct sk_buff *__build_header(struct sk_buff *skb, + int tunnel_hlen) +{ + struct tnl_ptk_info tpi; + const struct ovs_key_ipv4_tunnel *tun_key; + + tun_key = &OVS_CB(skb)->egress_tun_info->tunnel; + + skb = gre_handle_offloads(skb, !!(tun_key->tun_flags & TUNNEL_CSUM)); + if (IS_ERR(skb)) + return skb; + + tpi.flags = filter_tnl_flags(tun_key->tun_flags); + tpi.proto = htons(ETH_P_TEB); + tpi.key = be64_get_low32(tun_key->tun_id); + tpi.seq = 0; + gre_build_header(skb, &tpi, tunnel_hlen); + + return skb; +} + +static __be64 key_to_tunnel_id(__be32 key, __be32 seq) +{ +#ifdef __BIG_ENDIAN + return (__force __be64)((__force u64)seq << 32 | (__force u32)key); +#else + return (__force __be64)((__force u64)key << 32 | (__force u32)seq); +#endif +} + +/* Called with rcu_read_lock and BH disabled. */ +static int gre_rcv(struct sk_buff *skb, + const struct tnl_ptk_info *tpi) +{ + struct ovs_tunnel_info tun_info; + struct ovs_net *ovs_net; + struct vport *vport; + __be64 key; + + ovs_net = net_generic(dev_net(skb->dev), ovs_net_id); + vport = rcu_dereference(ovs_net->vport_net.gre_vport); + if (unlikely(!vport)) + return PACKET_REJECT; + + key = key_to_tunnel_id(tpi->key, tpi->seq); + ovs_flow_tun_info_init(&tun_info, ip_hdr(skb), 0, 0, key, + filter_tnl_flags(tpi->flags), NULL, 0); + + ovs_vport_receive(vport, skb, &tun_info); + return PACKET_RCVD; +} + +/* Called with rcu_read_lock and BH disabled. */ +static int gre_err(struct sk_buff *skb, u32 info, + const struct tnl_ptk_info *tpi) +{ + struct ovs_net *ovs_net; + struct vport *vport; + + ovs_net = net_generic(dev_net(skb->dev), ovs_net_id); + vport = rcu_dereference(ovs_net->vport_net.gre_vport); + + if (unlikely(!vport)) + return PACKET_REJECT; + else + return PACKET_RCVD; +} + +static int gre_tnl_send(struct vport *vport, struct sk_buff *skb) +{ + struct net *net = ovs_dp_get_net(vport->dp); + const struct ovs_key_ipv4_tunnel *tun_key; + struct flowi4 fl; + struct rtable *rt; + int min_headroom; + int tunnel_hlen; + __be16 df; + int err; + + if (unlikely(!OVS_CB(skb)->egress_tun_info)) { + err = -EINVAL; + goto err_free_skb; + } + + tun_key = &OVS_CB(skb)->egress_tun_info->tunnel; + rt = ovs_tunnel_route_lookup(net, tun_key, skb->mark, &fl, IPPROTO_GRE); + if (IS_ERR(rt)) { + err = PTR_ERR(rt); + goto err_free_skb; + } + + tunnel_hlen = ip_gre_calc_hlen(tun_key->tun_flags); + + min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len + + tunnel_hlen + sizeof(struct iphdr) + + (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0); + if (skb_headroom(skb) < min_headroom || skb_header_cloned(skb)) { + int head_delta = SKB_DATA_ALIGN(min_headroom - + skb_headroom(skb) + + 16); + err = pskb_expand_head(skb, max_t(int, head_delta, 0), + 0, GFP_ATOMIC); + if (unlikely(err)) + goto err_free_rt; + } + + skb = vlan_hwaccel_push_inside(skb); + if (unlikely(!skb)) { + err = -ENOMEM; + goto err_free_rt; + } + + /* Push Tunnel header. */ + skb = __build_header(skb, tunnel_hlen); + if (IS_ERR(skb)) { + err = PTR_ERR(skb); + skb = NULL; + goto err_free_rt; + } + + df = tun_key->tun_flags & TUNNEL_DONT_FRAGMENT ? + htons(IP_DF) : 0; + + skb->ignore_df = 1; + + return iptunnel_xmit(skb->sk, rt, skb, fl.saddr, + tun_key->ipv4_dst, IPPROTO_GRE, + tun_key->ipv4_tos, tun_key->ipv4_ttl, df, false); +err_free_rt: + ip_rt_put(rt); +err_free_skb: + kfree_skb(skb); + return err; +} + +static struct gre_cisco_protocol gre_protocol = { + .handler = gre_rcv, + .err_handler = gre_err, + .priority = 1, +}; + +static int gre_ports; +static int gre_init(void) +{ + int err; + + gre_ports++; + if (gre_ports > 1) + return 0; + + err = gre_cisco_register(&gre_protocol); + if (err) + pr_warn("cannot register gre protocol handler\n"); + + return err; +} + +static void gre_exit(void) +{ + gre_ports--; + if (gre_ports > 0) + return; + + gre_cisco_unregister(&gre_protocol); +} + +static const char *gre_get_name(const struct vport *vport) +{ + return vport_priv(vport); +} + +static struct vport *gre_create(const struct vport_parms *parms) +{ + struct net *net = ovs_dp_get_net(parms->dp); + struct ovs_net *ovs_net; + struct vport *vport; + int err; + + err = gre_init(); + if (err) + return ERR_PTR(err); + + ovs_net = net_generic(net, ovs_net_id); + if (ovsl_dereference(ovs_net->vport_net.gre_vport)) { + vport = ERR_PTR(-EEXIST); + goto error; + } + + vport = ovs_vport_alloc(IFNAMSIZ, &ovs_gre_vport_ops, parms); + if (IS_ERR(vport)) + goto error; + + strncpy(vport_priv(vport), parms->name, IFNAMSIZ); + rcu_assign_pointer(ovs_net->vport_net.gre_vport, vport); + return vport; + +error: + gre_exit(); + return vport; +} + +static void gre_tnl_destroy(struct vport *vport) +{ + struct net *net = ovs_dp_get_net(vport->dp); + struct ovs_net *ovs_net; + + ovs_net = net_generic(net, ovs_net_id); + + RCU_INIT_POINTER(ovs_net->vport_net.gre_vport, NULL); + ovs_vport_deferred_free(vport); + gre_exit(); +} + +static int gre_get_egress_tun_info(struct vport *vport, struct sk_buff *skb, + struct ovs_tunnel_info *egress_tun_info) +{ + return ovs_tunnel_get_egress_info(egress_tun_info, + ovs_dp_get_net(vport->dp), + OVS_CB(skb)->egress_tun_info, + IPPROTO_GRE, skb->mark, 0, 0); +} + +static struct vport_ops ovs_gre_vport_ops = { + .type = OVS_VPORT_TYPE_GRE, + .create = gre_create, + .destroy = gre_tnl_destroy, + .get_name = gre_get_name, + .send = gre_tnl_send, + .get_egress_tun_info = gre_get_egress_tun_info, + .owner = THIS_MODULE, +}; + +static int __init ovs_gre_tnl_init(void) +{ + return ovs_vport_ops_register(&ovs_gre_vport_ops); +} + +static void __exit ovs_gre_tnl_exit(void) +{ + ovs_vport_ops_unregister(&ovs_gre_vport_ops); +} + +module_init(ovs_gre_tnl_init); +module_exit(ovs_gre_tnl_exit); + +MODULE_DESCRIPTION("OVS: GRE switching port"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("vport-type-3"); diff --git a/net/openvswitch/vport-internal_dev.c b/net/openvswitch/vport-internal_dev.c new file mode 100644 index 000000000..6a55f7105 --- /dev/null +++ b/net/openvswitch/vport-internal_dev.c @@ -0,0 +1,288 @@ +/* + * Copyright (c) 2007-2012 Nicira, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "datapath.h" +#include "vport-internal_dev.h" +#include "vport-netdev.h" + +struct internal_dev { + struct vport *vport; +}; + +static struct vport_ops ovs_internal_vport_ops; + +static struct internal_dev *internal_dev_priv(struct net_device *netdev) +{ + return netdev_priv(netdev); +} + +/* This function is only called by the kernel network layer.*/ +static struct rtnl_link_stats64 *internal_dev_get_stats(struct net_device *netdev, + struct rtnl_link_stats64 *stats) +{ + struct vport *vport = ovs_internal_dev_get_vport(netdev); + struct ovs_vport_stats vport_stats; + + ovs_vport_get_stats(vport, &vport_stats); + + /* The tx and rx stats need to be swapped because the + * switch and host OS have opposite perspectives. */ + stats->rx_packets = vport_stats.tx_packets; + stats->tx_packets = vport_stats.rx_packets; + stats->rx_bytes = vport_stats.tx_bytes; + stats->tx_bytes = vport_stats.rx_bytes; + stats->rx_errors = vport_stats.tx_errors; + stats->tx_errors = vport_stats.rx_errors; + stats->rx_dropped = vport_stats.tx_dropped; + stats->tx_dropped = vport_stats.rx_dropped; + + return stats; +} + +/* Called with rcu_read_lock_bh. */ +static int internal_dev_xmit(struct sk_buff *skb, struct net_device *netdev) +{ + rcu_read_lock(); + ovs_vport_receive(internal_dev_priv(netdev)->vport, skb, NULL); + rcu_read_unlock(); + return 0; +} + +static int internal_dev_open(struct net_device *netdev) +{ + netif_start_queue(netdev); + return 0; +} + +static int internal_dev_stop(struct net_device *netdev) +{ + netif_stop_queue(netdev); + return 0; +} + +static void internal_dev_getinfo(struct net_device *netdev, + struct ethtool_drvinfo *info) +{ + strlcpy(info->driver, "openvswitch", sizeof(info->driver)); +} + +static const struct ethtool_ops internal_dev_ethtool_ops = { + .get_drvinfo = internal_dev_getinfo, + .get_link = ethtool_op_get_link, +}; + +static int internal_dev_change_mtu(struct net_device *netdev, int new_mtu) +{ + if (new_mtu < 68) + return -EINVAL; + + netdev->mtu = new_mtu; + return 0; +} + +static void internal_dev_destructor(struct net_device *dev) +{ + struct vport *vport = ovs_internal_dev_get_vport(dev); + + ovs_vport_free(vport); + free_netdev(dev); +} + +static const struct net_device_ops internal_dev_netdev_ops = { + .ndo_open = internal_dev_open, + .ndo_stop = internal_dev_stop, + .ndo_start_xmit = internal_dev_xmit, + .ndo_set_mac_address = eth_mac_addr, + .ndo_change_mtu = internal_dev_change_mtu, + .ndo_get_stats64 = internal_dev_get_stats, +}; + +static struct rtnl_link_ops internal_dev_link_ops __read_mostly = { + .kind = "openvswitch", +}; + +static void do_setup(struct net_device *netdev) +{ + ether_setup(netdev); + + netdev->netdev_ops = &internal_dev_netdev_ops; + + netdev->priv_flags &= ~IFF_TX_SKB_SHARING; + netdev->priv_flags |= IFF_LIVE_ADDR_CHANGE; + netdev->destructor = internal_dev_destructor; + netdev->ethtool_ops = &internal_dev_ethtool_ops; + netdev->rtnl_link_ops = &internal_dev_link_ops; + netdev->tx_queue_len = 0; + + netdev->features = NETIF_F_LLTX | NETIF_F_SG | NETIF_F_FRAGLIST | + NETIF_F_HIGHDMA | NETIF_F_HW_CSUM | + NETIF_F_GSO_SOFTWARE | NETIF_F_GSO_ENCAP_ALL; + + netdev->vlan_features = netdev->features; + netdev->hw_enc_features = netdev->features; + netdev->features |= NETIF_F_HW_VLAN_CTAG_TX; + netdev->hw_features = netdev->features & ~NETIF_F_LLTX; + + eth_hw_addr_random(netdev); +} + +static struct vport *internal_dev_create(const struct vport_parms *parms) +{ + struct vport *vport; + struct netdev_vport *netdev_vport; + struct internal_dev *internal_dev; + int err; + + vport = ovs_vport_alloc(sizeof(struct netdev_vport), + &ovs_internal_vport_ops, parms); + if (IS_ERR(vport)) { + err = PTR_ERR(vport); + goto error; + } + + netdev_vport = netdev_vport_priv(vport); + + netdev_vport->dev = alloc_netdev(sizeof(struct internal_dev), + parms->name, NET_NAME_UNKNOWN, + do_setup); + if (!netdev_vport->dev) { + err = -ENOMEM; + goto error_free_vport; + } + + dev_net_set(netdev_vport->dev, ovs_dp_get_net(vport->dp)); + internal_dev = internal_dev_priv(netdev_vport->dev); + internal_dev->vport = vport; + + /* Restrict bridge port to current netns. */ + if (vport->port_no == OVSP_LOCAL) + netdev_vport->dev->features |= NETIF_F_NETNS_LOCAL; + + rtnl_lock(); + err = register_netdevice(netdev_vport->dev); + if (err) + goto error_free_netdev; + + dev_set_promiscuity(netdev_vport->dev, 1); + rtnl_unlock(); + netif_start_queue(netdev_vport->dev); + + return vport; + +error_free_netdev: + rtnl_unlock(); + free_netdev(netdev_vport->dev); +error_free_vport: + ovs_vport_free(vport); +error: + return ERR_PTR(err); +} + +static void internal_dev_destroy(struct vport *vport) +{ + struct netdev_vport *netdev_vport = netdev_vport_priv(vport); + + netif_stop_queue(netdev_vport->dev); + rtnl_lock(); + dev_set_promiscuity(netdev_vport->dev, -1); + + /* unregister_netdevice() waits for an RCU grace period. */ + unregister_netdevice(netdev_vport->dev); + + rtnl_unlock(); +} + +static int internal_dev_recv(struct vport *vport, struct sk_buff *skb) +{ + struct net_device *netdev = netdev_vport_priv(vport)->dev; + int len; + + if (unlikely(!(netdev->flags & IFF_UP))) { + kfree_skb(skb); + return 0; + } + + len = skb->len; + + skb_dst_drop(skb); + nf_reset(skb); + secpath_reset(skb); + + skb->dev = netdev; + skb->pkt_type = PACKET_HOST; + skb->protocol = eth_type_trans(skb, netdev); + skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); + + netif_rx(skb); + + return len; +} + +static struct vport_ops ovs_internal_vport_ops = { + .type = OVS_VPORT_TYPE_INTERNAL, + .create = internal_dev_create, + .destroy = internal_dev_destroy, + .get_name = ovs_netdev_get_name, + .send = internal_dev_recv, +}; + +int ovs_is_internal_dev(const struct net_device *netdev) +{ + return netdev->netdev_ops == &internal_dev_netdev_ops; +} + +struct vport *ovs_internal_dev_get_vport(struct net_device *netdev) +{ + if (!ovs_is_internal_dev(netdev)) + return NULL; + + return internal_dev_priv(netdev)->vport; +} + +int ovs_internal_dev_rtnl_link_register(void) +{ + int err; + + err = rtnl_link_register(&internal_dev_link_ops); + if (err < 0) + return err; + + err = ovs_vport_ops_register(&ovs_internal_vport_ops); + if (err < 0) + rtnl_link_unregister(&internal_dev_link_ops); + + return err; +} + +void ovs_internal_dev_rtnl_link_unregister(void) +{ + ovs_vport_ops_unregister(&ovs_internal_vport_ops); + rtnl_link_unregister(&internal_dev_link_ops); +} diff --git a/net/openvswitch/vport-internal_dev.h b/net/openvswitch/vport-internal_dev.h new file mode 100644 index 000000000..1b179a190 --- /dev/null +++ b/net/openvswitch/vport-internal_dev.h @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2007-2011 Nicira, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + */ + +#ifndef VPORT_INTERNAL_DEV_H +#define VPORT_INTERNAL_DEV_H 1 + +#include "datapath.h" +#include "vport.h" + +int ovs_is_internal_dev(const struct net_device *); +struct vport *ovs_internal_dev_get_vport(struct net_device *); +int ovs_internal_dev_rtnl_link_register(void); +void ovs_internal_dev_rtnl_link_unregister(void); + +#endif /* vport-internal_dev.h */ diff --git a/net/openvswitch/vport-netdev.c b/net/openvswitch/vport-netdev.c new file mode 100644 index 000000000..33e6d6e29 --- /dev/null +++ b/net/openvswitch/vport-netdev.c @@ -0,0 +1,246 @@ +/* + * Copyright (c) 2007-2012 Nicira, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "datapath.h" +#include "vport-internal_dev.h" +#include "vport-netdev.h" + +static struct vport_ops ovs_netdev_vport_ops; + +/* Must be called with rcu_read_lock. */ +static void netdev_port_receive(struct vport *vport, struct sk_buff *skb) +{ + if (unlikely(!vport)) + goto error; + + if (unlikely(skb_warn_if_lro(skb))) + goto error; + + /* Make our own copy of the packet. Otherwise we will mangle the + * packet for anyone who came before us (e.g. tcpdump via AF_PACKET). + */ + skb = skb_share_check(skb, GFP_ATOMIC); + if (unlikely(!skb)) + return; + + skb_push(skb, ETH_HLEN); + ovs_skb_postpush_rcsum(skb, skb->data, ETH_HLEN); + + ovs_vport_receive(vport, skb, NULL); + return; + +error: + kfree_skb(skb); +} + +/* Called with rcu_read_lock and bottom-halves disabled. */ +static rx_handler_result_t netdev_frame_hook(struct sk_buff **pskb) +{ + struct sk_buff *skb = *pskb; + struct vport *vport; + + if (unlikely(skb->pkt_type == PACKET_LOOPBACK)) + return RX_HANDLER_PASS; + + vport = ovs_netdev_get_vport(skb->dev); + + netdev_port_receive(vport, skb); + + return RX_HANDLER_CONSUMED; +} + +static struct net_device *get_dpdev(const struct datapath *dp) +{ + struct vport *local; + + local = ovs_vport_ovsl(dp, OVSP_LOCAL); + BUG_ON(!local); + return netdev_vport_priv(local)->dev; +} + +static struct vport *netdev_create(const struct vport_parms *parms) +{ + struct vport *vport; + struct netdev_vport *netdev_vport; + int err; + + vport = ovs_vport_alloc(sizeof(struct netdev_vport), + &ovs_netdev_vport_ops, parms); + if (IS_ERR(vport)) { + err = PTR_ERR(vport); + goto error; + } + + netdev_vport = netdev_vport_priv(vport); + + netdev_vport->dev = dev_get_by_name(ovs_dp_get_net(vport->dp), parms->name); + if (!netdev_vport->dev) { + err = -ENODEV; + goto error_free_vport; + } + + if (netdev_vport->dev->flags & IFF_LOOPBACK || + netdev_vport->dev->type != ARPHRD_ETHER || + ovs_is_internal_dev(netdev_vport->dev)) { + err = -EINVAL; + goto error_put; + } + + rtnl_lock(); + err = netdev_master_upper_dev_link(netdev_vport->dev, + get_dpdev(vport->dp)); + if (err) + goto error_unlock; + + err = netdev_rx_handler_register(netdev_vport->dev, netdev_frame_hook, + vport); + if (err) + goto error_master_upper_dev_unlink; + + dev_disable_lro(netdev_vport->dev); + dev_set_promiscuity(netdev_vport->dev, 1); + netdev_vport->dev->priv_flags |= IFF_OVS_DATAPATH; + rtnl_unlock(); + + return vport; + +error_master_upper_dev_unlink: + netdev_upper_dev_unlink(netdev_vport->dev, get_dpdev(vport->dp)); +error_unlock: + rtnl_unlock(); +error_put: + dev_put(netdev_vport->dev); +error_free_vport: + ovs_vport_free(vport); +error: + return ERR_PTR(err); +} + +static void free_port_rcu(struct rcu_head *rcu) +{ + struct netdev_vport *netdev_vport = container_of(rcu, + struct netdev_vport, rcu); + + dev_put(netdev_vport->dev); + ovs_vport_free(vport_from_priv(netdev_vport)); +} + +void ovs_netdev_detach_dev(struct vport *vport) +{ + struct netdev_vport *netdev_vport = netdev_vport_priv(vport); + + ASSERT_RTNL(); + netdev_vport->dev->priv_flags &= ~IFF_OVS_DATAPATH; + netdev_rx_handler_unregister(netdev_vport->dev); + netdev_upper_dev_unlink(netdev_vport->dev, + netdev_master_upper_dev_get(netdev_vport->dev)); + dev_set_promiscuity(netdev_vport->dev, -1); +} + +static void netdev_destroy(struct vport *vport) +{ + struct netdev_vport *netdev_vport = netdev_vport_priv(vport); + + rtnl_lock(); + if (netdev_vport->dev->priv_flags & IFF_OVS_DATAPATH) + ovs_netdev_detach_dev(vport); + rtnl_unlock(); + + call_rcu(&netdev_vport->rcu, free_port_rcu); +} + +const char *ovs_netdev_get_name(const struct vport *vport) +{ + const struct netdev_vport *netdev_vport = netdev_vport_priv(vport); + return netdev_vport->dev->name; +} + +static unsigned int packet_length(const struct sk_buff *skb) +{ + unsigned int length = skb->len - ETH_HLEN; + + if (skb->protocol == htons(ETH_P_8021Q)) + length -= VLAN_HLEN; + + return length; +} + +static int netdev_send(struct vport *vport, struct sk_buff *skb) +{ + struct netdev_vport *netdev_vport = netdev_vport_priv(vport); + int mtu = netdev_vport->dev->mtu; + int len; + + if (unlikely(packet_length(skb) > mtu && !skb_is_gso(skb))) { + net_warn_ratelimited("%s: dropped over-mtu packet: %d > %d\n", + netdev_vport->dev->name, + packet_length(skb), mtu); + goto drop; + } + + skb->dev = netdev_vport->dev; + len = skb->len; + dev_queue_xmit(skb); + + return len; + +drop: + kfree_skb(skb); + return 0; +} + +/* Returns null if this device is not attached to a datapath. */ +struct vport *ovs_netdev_get_vport(struct net_device *dev) +{ + if (likely(dev->priv_flags & IFF_OVS_DATAPATH)) + return (struct vport *) + rcu_dereference_rtnl(dev->rx_handler_data); + else + return NULL; +} + +static struct vport_ops ovs_netdev_vport_ops = { + .type = OVS_VPORT_TYPE_NETDEV, + .create = netdev_create, + .destroy = netdev_destroy, + .get_name = ovs_netdev_get_name, + .send = netdev_send, +}; + +int __init ovs_netdev_init(void) +{ + return ovs_vport_ops_register(&ovs_netdev_vport_ops); +} + +void ovs_netdev_exit(void) +{ + ovs_vport_ops_unregister(&ovs_netdev_vport_ops); +} diff --git a/net/openvswitch/vport-netdev.h b/net/openvswitch/vport-netdev.h new file mode 100644 index 000000000..6f7038e79 --- /dev/null +++ b/net/openvswitch/vport-netdev.h @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2007-2011 Nicira, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + */ + +#ifndef VPORT_NETDEV_H +#define VPORT_NETDEV_H 1 + +#include +#include + +#include "vport.h" + +struct vport *ovs_netdev_get_vport(struct net_device *dev); + +struct netdev_vport { + struct rcu_head rcu; + + struct net_device *dev; +}; + +static inline struct netdev_vport * +netdev_vport_priv(const struct vport *vport) +{ + return vport_priv(vport); +} + +const char *ovs_netdev_get_name(const struct vport *); +void ovs_netdev_detach_dev(struct vport *); + +int __init ovs_netdev_init(void); +void ovs_netdev_exit(void); + +#endif /* vport_netdev.h */ diff --git a/net/openvswitch/vport-vxlan.c b/net/openvswitch/vport-vxlan.c new file mode 100644 index 000000000..6d39766e7 --- /dev/null +++ b/net/openvswitch/vport-vxlan.c @@ -0,0 +1,322 @@ +/* + * Copyright (c) 2014 Nicira, Inc. + * Copyright (c) 2013 Cisco Systems, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "datapath.h" +#include "vport.h" +#include "vport-vxlan.h" + +/** + * struct vxlan_port - Keeps track of open UDP ports + * @vs: vxlan_sock created for the port. + * @name: vport name. + */ +struct vxlan_port { + struct vxlan_sock *vs; + char name[IFNAMSIZ]; + u32 exts; /* VXLAN_F_* in */ +}; + +static struct vport_ops ovs_vxlan_vport_ops; + +static inline struct vxlan_port *vxlan_vport(const struct vport *vport) +{ + return vport_priv(vport); +} + +/* Called with rcu_read_lock and BH disabled. */ +static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb, + struct vxlan_metadata *md) +{ + struct ovs_tunnel_info tun_info; + struct vxlan_port *vxlan_port; + struct vport *vport = vs->data; + struct iphdr *iph; + struct ovs_vxlan_opts opts = { + .gbp = md->gbp, + }; + __be64 key; + __be16 flags; + + flags = TUNNEL_KEY | (udp_hdr(skb)->check != 0 ? TUNNEL_CSUM : 0); + vxlan_port = vxlan_vport(vport); + if (vxlan_port->exts & VXLAN_F_GBP && md->gbp) + flags |= TUNNEL_VXLAN_OPT; + + /* Save outer tunnel values */ + iph = ip_hdr(skb); + key = cpu_to_be64(ntohl(md->vni) >> 8); + ovs_flow_tun_info_init(&tun_info, iph, + udp_hdr(skb)->source, udp_hdr(skb)->dest, + key, flags, &opts, sizeof(opts)); + + ovs_vport_receive(vport, skb, &tun_info); +} + +static int vxlan_get_options(const struct vport *vport, struct sk_buff *skb) +{ + struct vxlan_port *vxlan_port = vxlan_vport(vport); + __be16 dst_port = inet_sk(vxlan_port->vs->sock->sk)->inet_sport; + + if (nla_put_u16(skb, OVS_TUNNEL_ATTR_DST_PORT, ntohs(dst_port))) + return -EMSGSIZE; + + if (vxlan_port->exts) { + struct nlattr *exts; + + exts = nla_nest_start(skb, OVS_TUNNEL_ATTR_EXTENSION); + if (!exts) + return -EMSGSIZE; + + if (vxlan_port->exts & VXLAN_F_GBP && + nla_put_flag(skb, OVS_VXLAN_EXT_GBP)) + return -EMSGSIZE; + + nla_nest_end(skb, exts); + } + + return 0; +} + +static void vxlan_tnl_destroy(struct vport *vport) +{ + struct vxlan_port *vxlan_port = vxlan_vport(vport); + + vxlan_sock_release(vxlan_port->vs); + + ovs_vport_deferred_free(vport); +} + +static const struct nla_policy exts_policy[OVS_VXLAN_EXT_MAX+1] = { + [OVS_VXLAN_EXT_GBP] = { .type = NLA_FLAG, }, +}; + +static int vxlan_configure_exts(struct vport *vport, struct nlattr *attr) +{ + struct nlattr *exts[OVS_VXLAN_EXT_MAX+1]; + struct vxlan_port *vxlan_port; + int err; + + if (nla_len(attr) < sizeof(struct nlattr)) + return -EINVAL; + + err = nla_parse_nested(exts, OVS_VXLAN_EXT_MAX, attr, exts_policy); + if (err < 0) + return err; + + vxlan_port = vxlan_vport(vport); + + if (exts[OVS_VXLAN_EXT_GBP]) + vxlan_port->exts |= VXLAN_F_GBP; + + return 0; +} + +static struct vport *vxlan_tnl_create(const struct vport_parms *parms) +{ + struct net *net = ovs_dp_get_net(parms->dp); + struct nlattr *options = parms->options; + struct vxlan_port *vxlan_port; + struct vxlan_sock *vs; + struct vport *vport; + struct nlattr *a; + u16 dst_port; + int err; + + if (!options) { + err = -EINVAL; + goto error; + } + a = nla_find_nested(options, OVS_TUNNEL_ATTR_DST_PORT); + if (a && nla_len(a) == sizeof(u16)) { + dst_port = nla_get_u16(a); + } else { + /* Require destination port from userspace. */ + err = -EINVAL; + goto error; + } + + vport = ovs_vport_alloc(sizeof(struct vxlan_port), + &ovs_vxlan_vport_ops, parms); + if (IS_ERR(vport)) + return vport; + + vxlan_port = vxlan_vport(vport); + strncpy(vxlan_port->name, parms->name, IFNAMSIZ); + + a = nla_find_nested(options, OVS_TUNNEL_ATTR_EXTENSION); + if (a) { + err = vxlan_configure_exts(vport, a); + if (err) { + ovs_vport_free(vport); + goto error; + } + } + + vs = vxlan_sock_add(net, htons(dst_port), vxlan_rcv, vport, true, + vxlan_port->exts); + if (IS_ERR(vs)) { + ovs_vport_free(vport); + return (void *)vs; + } + vxlan_port->vs = vs; + + return vport; + +error: + return ERR_PTR(err); +} + +static int vxlan_ext_gbp(struct sk_buff *skb) +{ + const struct ovs_tunnel_info *tun_info; + const struct ovs_vxlan_opts *opts; + + tun_info = OVS_CB(skb)->egress_tun_info; + opts = tun_info->options; + + if (tun_info->tunnel.tun_flags & TUNNEL_VXLAN_OPT && + tun_info->options_len >= sizeof(*opts)) + return opts->gbp; + else + return 0; +} + +static int vxlan_tnl_send(struct vport *vport, struct sk_buff *skb) +{ + struct net *net = ovs_dp_get_net(vport->dp); + struct vxlan_port *vxlan_port = vxlan_vport(vport); + struct sock *sk = vxlan_port->vs->sock->sk; + __be16 dst_port = inet_sk(sk)->inet_sport; + const struct ovs_key_ipv4_tunnel *tun_key; + struct vxlan_metadata md = {0}; + struct rtable *rt; + struct flowi4 fl; + __be16 src_port; + __be16 df; + int err; + u32 vxflags; + + if (unlikely(!OVS_CB(skb)->egress_tun_info)) { + err = -EINVAL; + goto error; + } + + tun_key = &OVS_CB(skb)->egress_tun_info->tunnel; + rt = ovs_tunnel_route_lookup(net, tun_key, skb->mark, &fl, IPPROTO_UDP); + if (IS_ERR(rt)) { + err = PTR_ERR(rt); + goto error; + } + + df = tun_key->tun_flags & TUNNEL_DONT_FRAGMENT ? + htons(IP_DF) : 0; + + skb->ignore_df = 1; + + src_port = udp_flow_src_port(net, skb, 0, 0, true); + md.vni = htonl(be64_to_cpu(tun_key->tun_id) << 8); + md.gbp = vxlan_ext_gbp(skb); + vxflags = vxlan_port->exts | + (tun_key->tun_flags & TUNNEL_CSUM ? VXLAN_F_UDP_CSUM : 0); + + err = vxlan_xmit_skb(rt, sk, skb, fl.saddr, tun_key->ipv4_dst, + tun_key->ipv4_tos, tun_key->ipv4_ttl, df, + src_port, dst_port, + &md, false, vxflags); + if (err < 0) + ip_rt_put(rt); + return err; +error: + kfree_skb(skb); + return err; +} + +static int vxlan_get_egress_tun_info(struct vport *vport, struct sk_buff *skb, + struct ovs_tunnel_info *egress_tun_info) +{ + struct net *net = ovs_dp_get_net(vport->dp); + struct vxlan_port *vxlan_port = vxlan_vport(vport); + __be16 dst_port = inet_sk(vxlan_port->vs->sock->sk)->inet_sport; + __be16 src_port; + int port_min; + int port_max; + + inet_get_local_port_range(net, &port_min, &port_max); + src_port = udp_flow_src_port(net, skb, 0, 0, true); + + return ovs_tunnel_get_egress_info(egress_tun_info, net, + OVS_CB(skb)->egress_tun_info, + IPPROTO_UDP, skb->mark, + src_port, dst_port); +} + +static const char *vxlan_get_name(const struct vport *vport) +{ + struct vxlan_port *vxlan_port = vxlan_vport(vport); + return vxlan_port->name; +} + +static struct vport_ops ovs_vxlan_vport_ops = { + .type = OVS_VPORT_TYPE_VXLAN, + .create = vxlan_tnl_create, + .destroy = vxlan_tnl_destroy, + .get_name = vxlan_get_name, + .get_options = vxlan_get_options, + .send = vxlan_tnl_send, + .get_egress_tun_info = vxlan_get_egress_tun_info, + .owner = THIS_MODULE, +}; + +static int __init ovs_vxlan_tnl_init(void) +{ + return ovs_vport_ops_register(&ovs_vxlan_vport_ops); +} + +static void __exit ovs_vxlan_tnl_exit(void) +{ + ovs_vport_ops_unregister(&ovs_vxlan_vport_ops); +} + +module_init(ovs_vxlan_tnl_init); +module_exit(ovs_vxlan_tnl_exit); + +MODULE_DESCRIPTION("OVS: VXLAN switching port"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("vport-type-4"); diff --git a/net/openvswitch/vport-vxlan.h b/net/openvswitch/vport-vxlan.h new file mode 100644 index 000000000..4b08233e7 --- /dev/null +++ b/net/openvswitch/vport-vxlan.h @@ -0,0 +1,11 @@ +#ifndef VPORT_VXLAN_H +#define VPORT_VXLAN_H 1 + +#include +#include + +struct ovs_vxlan_opts { + __u32 gbp; +}; + +#endif diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c new file mode 100644 index 000000000..067a3fff1 --- /dev/null +++ b/net/openvswitch/vport.c @@ -0,0 +1,627 @@ +/* + * Copyright (c) 2007-2014 Nicira, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "datapath.h" +#include "vport.h" +#include "vport-internal_dev.h" + +static void ovs_vport_record_error(struct vport *, + enum vport_err_type err_type); + +static LIST_HEAD(vport_ops_list); + +/* Protected by RCU read lock for reading, ovs_mutex for writing. */ +static struct hlist_head *dev_table; +#define VPORT_HASH_BUCKETS 1024 + +/** + * ovs_vport_init - initialize vport subsystem + * + * Called at module load time to initialize the vport subsystem. + */ +int ovs_vport_init(void) +{ + dev_table = kzalloc(VPORT_HASH_BUCKETS * sizeof(struct hlist_head), + GFP_KERNEL); + if (!dev_table) + return -ENOMEM; + + return 0; +} + +/** + * ovs_vport_exit - shutdown vport subsystem + * + * Called at module exit time to shutdown the vport subsystem. + */ +void ovs_vport_exit(void) +{ + kfree(dev_table); +} + +static struct hlist_head *hash_bucket(const struct net *net, const char *name) +{ + unsigned int hash = jhash(name, strlen(name), (unsigned long) net); + return &dev_table[hash & (VPORT_HASH_BUCKETS - 1)]; +} + +int ovs_vport_ops_register(struct vport_ops *ops) +{ + int err = -EEXIST; + struct vport_ops *o; + + ovs_lock(); + list_for_each_entry(o, &vport_ops_list, list) + if (ops->type == o->type) + goto errout; + + list_add_tail(&ops->list, &vport_ops_list); + err = 0; +errout: + ovs_unlock(); + return err; +} +EXPORT_SYMBOL_GPL(ovs_vport_ops_register); + +void ovs_vport_ops_unregister(struct vport_ops *ops) +{ + ovs_lock(); + list_del(&ops->list); + ovs_unlock(); +} +EXPORT_SYMBOL_GPL(ovs_vport_ops_unregister); + +/** + * ovs_vport_locate - find a port that has already been created + * + * @name: name of port to find + * + * Must be called with ovs or RCU read lock. + */ +struct vport *ovs_vport_locate(const struct net *net, const char *name) +{ + struct hlist_head *bucket = hash_bucket(net, name); + struct vport *vport; + + hlist_for_each_entry_rcu(vport, bucket, hash_node) + if (!strcmp(name, vport->ops->get_name(vport)) && + net_eq(ovs_dp_get_net(vport->dp), net)) + return vport; + + return NULL; +} + +/** + * ovs_vport_alloc - allocate and initialize new vport + * + * @priv_size: Size of private data area to allocate. + * @ops: vport device ops + * + * Allocate and initialize a new vport defined by @ops. The vport will contain + * a private data area of size @priv_size that can be accessed using + * vport_priv(). vports that are no longer needed should be released with + * vport_free(). + */ +struct vport *ovs_vport_alloc(int priv_size, const struct vport_ops *ops, + const struct vport_parms *parms) +{ + struct vport *vport; + size_t alloc_size; + + alloc_size = sizeof(struct vport); + if (priv_size) { + alloc_size = ALIGN(alloc_size, VPORT_ALIGN); + alloc_size += priv_size; + } + + vport = kzalloc(alloc_size, GFP_KERNEL); + if (!vport) + return ERR_PTR(-ENOMEM); + + vport->dp = parms->dp; + vport->port_no = parms->port_no; + vport->ops = ops; + INIT_HLIST_NODE(&vport->dp_hash_node); + + if (ovs_vport_set_upcall_portids(vport, parms->upcall_portids)) { + kfree(vport); + return ERR_PTR(-EINVAL); + } + + vport->percpu_stats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); + if (!vport->percpu_stats) { + kfree(vport); + return ERR_PTR(-ENOMEM); + } + + return vport; +} +EXPORT_SYMBOL_GPL(ovs_vport_alloc); + +/** + * ovs_vport_free - uninitialize and free vport + * + * @vport: vport to free + * + * Frees a vport allocated with vport_alloc() when it is no longer needed. + * + * The caller must ensure that an RCU grace period has passed since the last + * time @vport was in a datapath. + */ +void ovs_vport_free(struct vport *vport) +{ + /* vport is freed from RCU callback or error path, Therefore + * it is safe to use raw dereference. + */ + kfree(rcu_dereference_raw(vport->upcall_portids)); + free_percpu(vport->percpu_stats); + kfree(vport); +} +EXPORT_SYMBOL_GPL(ovs_vport_free); + +static struct vport_ops *ovs_vport_lookup(const struct vport_parms *parms) +{ + struct vport_ops *ops; + + list_for_each_entry(ops, &vport_ops_list, list) + if (ops->type == parms->type) + return ops; + + return NULL; +} + +/** + * ovs_vport_add - add vport device (for kernel callers) + * + * @parms: Information about new vport. + * + * Creates a new vport with the specified configuration (which is dependent on + * device type). ovs_mutex must be held. + */ +struct vport *ovs_vport_add(const struct vport_parms *parms) +{ + struct vport_ops *ops; + struct vport *vport; + + ops = ovs_vport_lookup(parms); + if (ops) { + struct hlist_head *bucket; + + if (!try_module_get(ops->owner)) + return ERR_PTR(-EAFNOSUPPORT); + + vport = ops->create(parms); + if (IS_ERR(vport)) { + module_put(ops->owner); + return vport; + } + + bucket = hash_bucket(ovs_dp_get_net(vport->dp), + vport->ops->get_name(vport)); + hlist_add_head_rcu(&vport->hash_node, bucket); + return vport; + } + + /* Unlock to attempt module load and return -EAGAIN if load + * was successful as we need to restart the port addition + * workflow. + */ + ovs_unlock(); + request_module("vport-type-%d", parms->type); + ovs_lock(); + + if (!ovs_vport_lookup(parms)) + return ERR_PTR(-EAFNOSUPPORT); + else + return ERR_PTR(-EAGAIN); +} + +/** + * ovs_vport_set_options - modify existing vport device (for kernel callers) + * + * @vport: vport to modify. + * @options: New configuration. + * + * Modifies an existing device with the specified configuration (which is + * dependent on device type). ovs_mutex must be held. + */ +int ovs_vport_set_options(struct vport *vport, struct nlattr *options) +{ + if (!vport->ops->set_options) + return -EOPNOTSUPP; + return vport->ops->set_options(vport, options); +} + +/** + * ovs_vport_del - delete existing vport device + * + * @vport: vport to delete. + * + * Detaches @vport from its datapath and destroys it. It is possible to fail + * for reasons such as lack of memory. ovs_mutex must be held. + */ +void ovs_vport_del(struct vport *vport) +{ + ASSERT_OVSL(); + + hlist_del_rcu(&vport->hash_node); + module_put(vport->ops->owner); + vport->ops->destroy(vport); +} + +/** + * ovs_vport_get_stats - retrieve device stats + * + * @vport: vport from which to retrieve the stats + * @stats: location to store stats + * + * Retrieves transmit, receive, and error stats for the given device. + * + * Must be called with ovs_mutex or rcu_read_lock. + */ +void ovs_vport_get_stats(struct vport *vport, struct ovs_vport_stats *stats) +{ + int i; + + memset(stats, 0, sizeof(*stats)); + + /* We potentially have 2 sources of stats that need to be combined: + * those we have collected (split into err_stats and percpu_stats) from + * set_stats() and device error stats from netdev->get_stats() (for + * errors that happen downstream and therefore aren't reported through + * our vport_record_error() function). + * Stats from first source are reported by ovs (OVS_VPORT_ATTR_STATS). + * netdev-stats can be directly read over netlink-ioctl. + */ + + stats->rx_errors = atomic_long_read(&vport->err_stats.rx_errors); + stats->tx_errors = atomic_long_read(&vport->err_stats.tx_errors); + stats->tx_dropped = atomic_long_read(&vport->err_stats.tx_dropped); + stats->rx_dropped = atomic_long_read(&vport->err_stats.rx_dropped); + + for_each_possible_cpu(i) { + const struct pcpu_sw_netstats *percpu_stats; + struct pcpu_sw_netstats local_stats; + unsigned int start; + + percpu_stats = per_cpu_ptr(vport->percpu_stats, i); + + do { + start = u64_stats_fetch_begin_irq(&percpu_stats->syncp); + local_stats = *percpu_stats; + } while (u64_stats_fetch_retry_irq(&percpu_stats->syncp, start)); + + stats->rx_bytes += local_stats.rx_bytes; + stats->rx_packets += local_stats.rx_packets; + stats->tx_bytes += local_stats.tx_bytes; + stats->tx_packets += local_stats.tx_packets; + } +} + +/** + * ovs_vport_get_options - retrieve device options + * + * @vport: vport from which to retrieve the options. + * @skb: sk_buff where options should be appended. + * + * Retrieves the configuration of the given device, appending an + * %OVS_VPORT_ATTR_OPTIONS attribute that in turn contains nested + * vport-specific attributes to @skb. + * + * Returns 0 if successful, -EMSGSIZE if @skb has insufficient room, or another + * negative error code if a real error occurred. If an error occurs, @skb is + * left unmodified. + * + * Must be called with ovs_mutex or rcu_read_lock. + */ +int ovs_vport_get_options(const struct vport *vport, struct sk_buff *skb) +{ + struct nlattr *nla; + int err; + + if (!vport->ops->get_options) + return 0; + + nla = nla_nest_start(skb, OVS_VPORT_ATTR_OPTIONS); + if (!nla) + return -EMSGSIZE; + + err = vport->ops->get_options(vport, skb); + if (err) { + nla_nest_cancel(skb, nla); + return err; + } + + nla_nest_end(skb, nla); + return 0; +} + +/** + * ovs_vport_set_upcall_portids - set upcall portids of @vport. + * + * @vport: vport to modify. + * @ids: new configuration, an array of port ids. + * + * Sets the vport's upcall_portids to @ids. + * + * Returns 0 if successful, -EINVAL if @ids is zero length or cannot be parsed + * as an array of U32. + * + * Must be called with ovs_mutex. + */ +int ovs_vport_set_upcall_portids(struct vport *vport, const struct nlattr *ids) +{ + struct vport_portids *old, *vport_portids; + + if (!nla_len(ids) || nla_len(ids) % sizeof(u32)) + return -EINVAL; + + old = ovsl_dereference(vport->upcall_portids); + + vport_portids = kmalloc(sizeof(*vport_portids) + nla_len(ids), + GFP_KERNEL); + if (!vport_portids) + return -ENOMEM; + + vport_portids->n_ids = nla_len(ids) / sizeof(u32); + vport_portids->rn_ids = reciprocal_value(vport_portids->n_ids); + nla_memcpy(vport_portids->ids, ids, nla_len(ids)); + + rcu_assign_pointer(vport->upcall_portids, vport_portids); + + if (old) + kfree_rcu(old, rcu); + return 0; +} + +/** + * ovs_vport_get_upcall_portids - get the upcall_portids of @vport. + * + * @vport: vport from which to retrieve the portids. + * @skb: sk_buff where portids should be appended. + * + * Retrieves the configuration of the given vport, appending the + * %OVS_VPORT_ATTR_UPCALL_PID attribute which is the array of upcall + * portids to @skb. + * + * Returns 0 if successful, -EMSGSIZE if @skb has insufficient room. + * If an error occurs, @skb is left unmodified. Must be called with + * ovs_mutex or rcu_read_lock. + */ +int ovs_vport_get_upcall_portids(const struct vport *vport, + struct sk_buff *skb) +{ + struct vport_portids *ids; + + ids = rcu_dereference_ovsl(vport->upcall_portids); + + if (vport->dp->user_features & OVS_DP_F_VPORT_PIDS) + return nla_put(skb, OVS_VPORT_ATTR_UPCALL_PID, + ids->n_ids * sizeof(u32), (void *)ids->ids); + else + return nla_put_u32(skb, OVS_VPORT_ATTR_UPCALL_PID, ids->ids[0]); +} + +/** + * ovs_vport_find_upcall_portid - find the upcall portid to send upcall. + * + * @vport: vport from which the missed packet is received. + * @skb: skb that the missed packet was received. + * + * Uses the skb_get_hash() to select the upcall portid to send the + * upcall. + * + * Returns the portid of the target socket. Must be called with rcu_read_lock. + */ +u32 ovs_vport_find_upcall_portid(const struct vport *vport, struct sk_buff *skb) +{ + struct vport_portids *ids; + u32 ids_index; + u32 hash; + + ids = rcu_dereference(vport->upcall_portids); + + if (ids->n_ids == 1 && ids->ids[0] == 0) + return 0; + + hash = skb_get_hash(skb); + ids_index = hash - ids->n_ids * reciprocal_divide(hash, ids->rn_ids); + return ids->ids[ids_index]; +} + +/** + * ovs_vport_receive - pass up received packet to the datapath for processing + * + * @vport: vport that received the packet + * @skb: skb that was received + * @tun_key: tunnel (if any) that carried packet + * + * Must be called with rcu_read_lock. The packet cannot be shared and + * skb->data should point to the Ethernet header. + */ +void ovs_vport_receive(struct vport *vport, struct sk_buff *skb, + const struct ovs_tunnel_info *tun_info) +{ + struct pcpu_sw_netstats *stats; + struct sw_flow_key key; + int error; + + stats = this_cpu_ptr(vport->percpu_stats); + u64_stats_update_begin(&stats->syncp); + stats->rx_packets++; + stats->rx_bytes += skb->len + + (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0); + u64_stats_update_end(&stats->syncp); + + OVS_CB(skb)->input_vport = vport; + OVS_CB(skb)->egress_tun_info = NULL; + /* Extract flow from 'skb' into 'key'. */ + error = ovs_flow_key_extract(tun_info, skb, &key); + if (unlikely(error)) { + kfree_skb(skb); + return; + } + ovs_dp_process_packet(skb, &key); +} +EXPORT_SYMBOL_GPL(ovs_vport_receive); + +/** + * ovs_vport_send - send a packet on a device + * + * @vport: vport on which to send the packet + * @skb: skb to send + * + * Sends the given packet and returns the length of data sent. Either ovs + * lock or rcu_read_lock must be held. + */ +int ovs_vport_send(struct vport *vport, struct sk_buff *skb) +{ + int sent = vport->ops->send(vport, skb); + + if (likely(sent > 0)) { + struct pcpu_sw_netstats *stats; + + stats = this_cpu_ptr(vport->percpu_stats); + + u64_stats_update_begin(&stats->syncp); + stats->tx_packets++; + stats->tx_bytes += sent; + u64_stats_update_end(&stats->syncp); + } else if (sent < 0) { + ovs_vport_record_error(vport, VPORT_E_TX_ERROR); + } else { + ovs_vport_record_error(vport, VPORT_E_TX_DROPPED); + } + return sent; +} + +/** + * ovs_vport_record_error - indicate device error to generic stats layer + * + * @vport: vport that encountered the error + * @err_type: one of enum vport_err_type types to indicate the error type + * + * If using the vport generic stats layer indicate that an error of the given + * type has occurred. + */ +static void ovs_vport_record_error(struct vport *vport, + enum vport_err_type err_type) +{ + switch (err_type) { + case VPORT_E_RX_DROPPED: + atomic_long_inc(&vport->err_stats.rx_dropped); + break; + + case VPORT_E_RX_ERROR: + atomic_long_inc(&vport->err_stats.rx_errors); + break; + + case VPORT_E_TX_DROPPED: + atomic_long_inc(&vport->err_stats.tx_dropped); + break; + + case VPORT_E_TX_ERROR: + atomic_long_inc(&vport->err_stats.tx_errors); + break; + } + +} + +static void free_vport_rcu(struct rcu_head *rcu) +{ + struct vport *vport = container_of(rcu, struct vport, rcu); + + ovs_vport_free(vport); +} + +void ovs_vport_deferred_free(struct vport *vport) +{ + if (!vport) + return; + + call_rcu(&vport->rcu, free_vport_rcu); +} +EXPORT_SYMBOL_GPL(ovs_vport_deferred_free); + +int ovs_tunnel_get_egress_info(struct ovs_tunnel_info *egress_tun_info, + struct net *net, + const struct ovs_tunnel_info *tun_info, + u8 ipproto, + u32 skb_mark, + __be16 tp_src, + __be16 tp_dst) +{ + const struct ovs_key_ipv4_tunnel *tun_key; + struct rtable *rt; + struct flowi4 fl; + + if (unlikely(!tun_info)) + return -EINVAL; + + tun_key = &tun_info->tunnel; + + /* Route lookup to get srouce IP address. + * The process may need to be changed if the corresponding process + * in vports ops changed. + */ + rt = ovs_tunnel_route_lookup(net, tun_key, skb_mark, &fl, ipproto); + if (IS_ERR(rt)) + return PTR_ERR(rt); + + ip_rt_put(rt); + + /* Generate egress_tun_info based on tun_info, + * saddr, tp_src and tp_dst + */ + __ovs_flow_tun_info_init(egress_tun_info, + fl.saddr, tun_key->ipv4_dst, + tun_key->ipv4_tos, + tun_key->ipv4_ttl, + tp_src, tp_dst, + tun_key->tun_id, + tun_key->tun_flags, + tun_info->options, + tun_info->options_len); + + return 0; +} +EXPORT_SYMBOL_GPL(ovs_tunnel_get_egress_info); + +int ovs_vport_get_egress_tun_info(struct vport *vport, struct sk_buff *skb, + struct ovs_tunnel_info *info) +{ + /* get_egress_tun_info() is only implemented on tunnel ports. */ + if (unlikely(!vport->ops->get_egress_tun_info)) + return -EINVAL; + + return vport->ops->get_egress_tun_info(vport, skb, info); +} diff --git a/net/openvswitch/vport.h b/net/openvswitch/vport.h new file mode 100644 index 000000000..bc85331a6 --- /dev/null +++ b/net/openvswitch/vport.h @@ -0,0 +1,259 @@ +/* + * Copyright (c) 2007-2012 Nicira, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + */ + +#ifndef VPORT_H +#define VPORT_H 1 + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "datapath.h" + +struct vport; +struct vport_parms; + +/* The following definitions are for users of the vport subsytem: */ + +struct vport_net { + struct vport __rcu *gre_vport; +}; + +int ovs_vport_init(void); +void ovs_vport_exit(void); + +struct vport *ovs_vport_add(const struct vport_parms *); +void ovs_vport_del(struct vport *); + +struct vport *ovs_vport_locate(const struct net *net, const char *name); + +void ovs_vport_get_stats(struct vport *, struct ovs_vport_stats *); + +int ovs_vport_set_options(struct vport *, struct nlattr *options); +int ovs_vport_get_options(const struct vport *, struct sk_buff *); + +int ovs_vport_set_upcall_portids(struct vport *, const struct nlattr *pids); +int ovs_vport_get_upcall_portids(const struct vport *, struct sk_buff *); +u32 ovs_vport_find_upcall_portid(const struct vport *, struct sk_buff *); + +int ovs_vport_send(struct vport *, struct sk_buff *); + +int ovs_tunnel_get_egress_info(struct ovs_tunnel_info *egress_tun_info, + struct net *net, + const struct ovs_tunnel_info *tun_info, + u8 ipproto, + u32 skb_mark, + __be16 tp_src, + __be16 tp_dst); +int ovs_vport_get_egress_tun_info(struct vport *vport, struct sk_buff *skb, + struct ovs_tunnel_info *info); + +/* The following definitions are for implementers of vport devices: */ + +struct vport_err_stats { + atomic_long_t rx_dropped; + atomic_long_t rx_errors; + atomic_long_t tx_dropped; + atomic_long_t tx_errors; +}; +/** + * struct vport_portids - array of netlink portids of a vport. + * must be protected by rcu. + * @rn_ids: The reciprocal value of @n_ids. + * @rcu: RCU callback head for deferred destruction. + * @n_ids: Size of @ids array. + * @ids: Array storing the Netlink socket pids to be used for packets received + * on this port that miss the flow table. + */ +struct vport_portids { + struct reciprocal_value rn_ids; + struct rcu_head rcu; + u32 n_ids; + u32 ids[]; +}; + +/** + * struct vport - one port within a datapath + * @rcu: RCU callback head for deferred destruction. + * @dp: Datapath to which this port belongs. + * @upcall_portids: RCU protected 'struct vport_portids'. + * @port_no: Index into @dp's @ports array. + * @hash_node: Element in @dev_table hash table in vport.c. + * @dp_hash_node: Element in @datapath->ports hash table in datapath.c. + * @ops: Class structure. + * @percpu_stats: Points to per-CPU statistics used and maintained by vport + * @err_stats: Points to error statistics used and maintained by vport + * @detach_list: list used for detaching vport in net-exit call. + */ +struct vport { + struct rcu_head rcu; + struct datapath *dp; + struct vport_portids __rcu *upcall_portids; + u16 port_no; + + struct hlist_node hash_node; + struct hlist_node dp_hash_node; + const struct vport_ops *ops; + + struct pcpu_sw_netstats __percpu *percpu_stats; + + struct vport_err_stats err_stats; + struct list_head detach_list; +}; + +/** + * struct vport_parms - parameters for creating a new vport + * + * @name: New vport's name. + * @type: New vport's type. + * @options: %OVS_VPORT_ATTR_OPTIONS attribute from Netlink message, %NULL if + * none was supplied. + * @dp: New vport's datapath. + * @port_no: New vport's port number. + */ +struct vport_parms { + const char *name; + enum ovs_vport_type type; + struct nlattr *options; + + /* For ovs_vport_alloc(). */ + struct datapath *dp; + u16 port_no; + struct nlattr *upcall_portids; +}; + +/** + * struct vport_ops - definition of a type of virtual port + * + * @type: %OVS_VPORT_TYPE_* value for this type of virtual port. + * @create: Create a new vport configured as specified. On success returns + * a new vport allocated with ovs_vport_alloc(), otherwise an ERR_PTR() value. + * @destroy: Destroys a vport. Must call vport_free() on the vport but not + * before an RCU grace period has elapsed. + * @set_options: Modify the configuration of an existing vport. May be %NULL + * if modification is not supported. + * @get_options: Appends vport-specific attributes for the configuration of an + * existing vport to a &struct sk_buff. May be %NULL for a vport that does not + * have any configuration. + * @get_name: Get the device's name. + * @send: Send a packet on the device. Returns the length of the packet sent, + * zero for dropped packets or negative for error. + * @get_egress_tun_info: Get the egress tunnel 5-tuple and other info for + * a packet. + */ +struct vport_ops { + enum ovs_vport_type type; + + /* Called with ovs_mutex. */ + struct vport *(*create)(const struct vport_parms *); + void (*destroy)(struct vport *); + + int (*set_options)(struct vport *, struct nlattr *); + int (*get_options)(const struct vport *, struct sk_buff *); + + /* Called with rcu_read_lock or ovs_mutex. */ + const char *(*get_name)(const struct vport *); + + int (*send)(struct vport *, struct sk_buff *); + int (*get_egress_tun_info)(struct vport *, struct sk_buff *, + struct ovs_tunnel_info *); + + struct module *owner; + struct list_head list; +}; + +enum vport_err_type { + VPORT_E_RX_DROPPED, + VPORT_E_RX_ERROR, + VPORT_E_TX_DROPPED, + VPORT_E_TX_ERROR, +}; + +struct vport *ovs_vport_alloc(int priv_size, const struct vport_ops *, + const struct vport_parms *); +void ovs_vport_free(struct vport *); +void ovs_vport_deferred_free(struct vport *vport); + +#define VPORT_ALIGN 8 + +/** + * vport_priv - access private data area of vport + * + * @vport: vport to access + * + * If a nonzero size was passed in priv_size of vport_alloc() a private data + * area was allocated on creation. This allows that area to be accessed and + * used for any purpose needed by the vport implementer. + */ +static inline void *vport_priv(const struct vport *vport) +{ + return (u8 *)(uintptr_t)vport + ALIGN(sizeof(struct vport), VPORT_ALIGN); +} + +/** + * vport_from_priv - lookup vport from private data pointer + * + * @priv: Start of private data area. + * + * It is sometimes useful to translate from a pointer to the private data + * area to the vport, such as in the case where the private data pointer is + * the result of a hash table lookup. @priv must point to the start of the + * private data area. + */ +static inline struct vport *vport_from_priv(void *priv) +{ + return (struct vport *)((u8 *)priv - ALIGN(sizeof(struct vport), VPORT_ALIGN)); +} + +void ovs_vport_receive(struct vport *, struct sk_buff *, + const struct ovs_tunnel_info *); + +static inline void ovs_skb_postpush_rcsum(struct sk_buff *skb, + const void *start, unsigned int len) +{ + if (skb->ip_summed == CHECKSUM_COMPLETE) + skb->csum = csum_add(skb->csum, csum_partial(start, len, 0)); +} + +int ovs_vport_ops_register(struct vport_ops *ops); +void ovs_vport_ops_unregister(struct vport_ops *ops); + +static inline struct rtable *ovs_tunnel_route_lookup(struct net *net, + const struct ovs_key_ipv4_tunnel *key, + u32 mark, + struct flowi4 *fl, + u8 protocol) +{ + struct rtable *rt; + + memset(fl, 0, sizeof(*fl)); + fl->daddr = key->ipv4_dst; + fl->saddr = key->ipv4_src; + fl->flowi4_tos = RT_TOS(key->ipv4_tos); + fl->flowi4_mark = mark; + fl->flowi4_proto = protocol; + + rt = ip_route_output_key(net, fl); + return rt; +} +#endif /* vport.h */ -- cgit v1.2.3