summaryrefslogtreecommitdiff
path: root/net/ipv4
diff options
context:
space:
mode:
authorAndré Fabian Silva Delgado <emulatorman@parabola.nu>2015-08-05 17:04:01 -0300
committerAndré Fabian Silva Delgado <emulatorman@parabola.nu>2015-08-05 17:04:01 -0300
commit57f0f512b273f60d52568b8c6b77e17f5636edc0 (patch)
tree5e910f0e82173f4ef4f51111366a3f1299037a7b /net/ipv4
Initial import
Diffstat (limited to 'net/ipv4')
-rw-r--r--net/ipv4/Kconfig696
-rw-r--r--net/ipv4/Makefile62
-rw-r--r--net/ipv4/af_inet.c1838
-rw-r--r--net/ipv4/ah4.c589
-rw-r--r--net/ipv4/arp.c1375
-rw-r--r--net/ipv4/cipso_ipv4.c2357
-rw-r--r--net/ipv4/datagram.c123
-rw-r--r--net/ipv4/devinet.c2386
-rw-r--r--net/ipv4/esp4.c731
-rw-r--r--net/ipv4/fib_frontend.c1251
-rw-r--r--net/ipv4/fib_lookup.h52
-rw-r--r--net/ipv4/fib_rules.c369
-rw-r--r--net/ipv4/fib_semantics.c1328
-rw-r--r--net/ipv4/fib_trie.c2659
-rw-r--r--net/ipv4/fou.c999
-rw-r--r--net/ipv4/geneve.c453
-rw-r--r--net/ipv4/gre_demux.c367
-rw-r--r--net/ipv4/gre_offload.c268
-rw-r--r--net/ipv4/icmp.c1218
-rw-r--r--net/ipv4/igmp.c2800
-rw-r--r--net/ipv4/inet_connection_sock.c978
-rw-r--r--net/ipv4/inet_diag.c1165
-rw-r--r--net/ipv4/inet_fragment.c463
-rw-r--r--net/ipv4/inet_hashtables.c618
-rw-r--r--net/ipv4/inet_lro.c374
-rw-r--r--net/ipv4/inet_timewait_sock.c328
-rw-r--r--net/ipv4/inetpeer.c558
-rw-r--r--net/ipv4/ip_forward.c159
-rw-r--r--net/ipv4/ip_fragment.c871
-rw-r--r--net/ipv4/ip_gre.c926
-rw-r--r--net/ipv4/ip_input.c467
-rw-r--r--net/ipv4/ip_options.c663
-rw-r--r--net/ipv4/ip_output.c1591
-rw-r--r--net/ipv4/ip_sockglue.c1544
-rw-r--r--net/ipv4/ip_tunnel.c1191
-rw-r--r--net/ipv4/ip_tunnel_core.c206
-rw-r--r--net/ipv4/ip_vti.c599
-rw-r--r--net/ipv4/ipcomp.c204
-rw-r--r--net/ipv4/ipconfig.c1693
-rw-r--r--net/ipv4/ipip.c569
-rw-r--r--net/ipv4/ipmr.c2792
-rw-r--r--net/ipv4/netfilter.c207
-rw-r--r--net/ipv4/netfilter/Kconfig406
-rw-r--r--net/ipv4/netfilter/Makefile72
-rw-r--r--net/ipv4/netfilter/arp_tables.c1926
-rw-r--r--net/ipv4/netfilter/arpt_mangle.c91
-rw-r--r--net/ipv4/netfilter/arptable_filter.c91
-rw-r--r--net/ipv4/netfilter/ip_tables.c2282
-rw-r--r--net/ipv4/netfilter/ipt_CLUSTERIP.c794
-rw-r--r--net/ipv4/netfilter/ipt_ECN.c138
-rw-r--r--net/ipv4/netfilter/ipt_MASQUERADE.c91
-rw-r--r--net/ipv4/netfilter/ipt_REJECT.c113
-rw-r--r--net/ipv4/netfilter/ipt_SYNPROXY.c480
-rw-r--r--net/ipv4/netfilter/ipt_ah.c91
-rw-r--r--net/ipv4/netfilter/ipt_rpfilter.c144
-rw-r--r--net/ipv4/netfilter/iptable_filter.c109
-rw-r--r--net/ipv4/netfilter/iptable_mangle.c147
-rw-r--r--net/ipv4/netfilter/iptable_nat.c154
-rw-r--r--net/ipv4/netfilter/iptable_raw.c89
-rw-r--r--net/ipv4/netfilter/iptable_security.c109
-rw-r--r--net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c542
-rw-r--r--net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c469
-rw-r--r--net/ipv4/netfilter/nf_conntrack_proto_icmp.c428
-rw-r--r--net/ipv4/netfilter/nf_defrag_ipv4.c129
-rw-r--r--net/ipv4/netfilter/nf_log_arp.c161
-rw-r--r--net/ipv4/netfilter/nf_log_ipv4.c397
-rw-r--r--net/ipv4/netfilter/nf_nat_h323.c631
-rw-r--r--net/ipv4/netfilter/nf_nat_l3proto_ipv4.c481
-rw-r--r--net/ipv4/netfilter/nf_nat_masquerade_ipv4.c153
-rw-r--r--net/ipv4/netfilter/nf_nat_pptp.c311
-rw-r--r--net/ipv4/netfilter/nf_nat_proto_gre.c149
-rw-r--r--net/ipv4/netfilter/nf_nat_proto_icmp.c83
-rw-r--r--net/ipv4/netfilter/nf_nat_snmp_basic.c1313
-rw-r--r--net/ipv4/netfilter/nf_reject_ipv4.c192
-rw-r--r--net/ipv4/netfilter/nf_tables_arp.c102
-rw-r--r--net/ipv4/netfilter/nf_tables_ipv4.c125
-rw-r--r--net/ipv4/netfilter/nft_chain_nat_ipv4.c107
-rw-r--r--net/ipv4/netfilter/nft_chain_route_ipv4.c88
-rw-r--r--net/ipv4/netfilter/nft_masq_ipv4.c76
-rw-r--r--net/ipv4/netfilter/nft_redir_ipv4.c76
-rw-r--r--net/ipv4/netfilter/nft_reject_ipv4.c76
-rw-r--r--net/ipv4/ping.c1223
-rw-r--r--net/ipv4/proc.c540
-rw-r--r--net/ipv4/protocol.c79
-rw-r--r--net/ipv4/raw.c1100
-rw-r--r--net/ipv4/route.c2800
-rw-r--r--net/ipv4/syncookies.c401
-rw-r--r--net/ipv4/sysctl_net_ipv4.c970
-rw-r--r--net/ipv4/tcp.c3225
-rw-r--r--net/ipv4/tcp_bic.c239
-rw-r--r--net/ipv4/tcp_cong.c441
-rw-r--r--net/ipv4/tcp_cubic.c504
-rw-r--r--net/ipv4/tcp_dctcp.c345
-rw-r--r--net/ipv4/tcp_diag.c68
-rw-r--r--net/ipv4/tcp_fastopen.c313
-rw-r--r--net/ipv4/tcp_highspeed.c185
-rw-r--r--net/ipv4/tcp_htcp.c317
-rw-r--r--net/ipv4/tcp_hybla.c192
-rw-r--r--net/ipv4/tcp_illinois.c355
-rw-r--r--net/ipv4/tcp_input.c6299
-rw-r--r--net/ipv4/tcp_ipv4.c2468
-rw-r--r--net/ipv4/tcp_lp.c342
-rw-r--r--net/ipv4/tcp_memcontrol.c233
-rw-r--r--net/ipv4/tcp_metrics.c1198
-rw-r--r--net/ipv4/tcp_minisocks.c830
-rw-r--r--net/ipv4/tcp_offload.c327
-rw-r--r--net/ipv4/tcp_output.c3525
-rw-r--r--net/ipv4/tcp_probe.c300
-rw-r--r--net/ipv4/tcp_scalable.c62
-rw-r--r--net/ipv4/tcp_timer.c652
-rw-r--r--net/ipv4/tcp_vegas.c337
-rw-r--r--net/ipv4/tcp_vegas.h25
-rw-r--r--net/ipv4/tcp_veno.c232
-rw-r--r--net/ipv4/tcp_westwood.c305
-rw-r--r--net/ipv4/tcp_yeah.c255
-rw-r--r--net/ipv4/tunnel4.c192
-rw-r--r--net/ipv4/udp.c2555
-rw-r--r--net/ipv4/udp_diag.c222
-rw-r--r--net/ipv4/udp_impl.h34
-rw-r--r--net/ipv4/udp_offload.c442
-rw-r--r--net/ipv4/udp_tunnel.c108
-rw-r--r--net/ipv4/udplite.c141
-rw-r--r--net/ipv4/xfrm4_input.c158
-rw-r--r--net/ipv4/xfrm4_mode_beet.c156
-rw-r--r--net/ipv4/xfrm4_mode_transport.c80
-rw-r--r--net/ipv4/xfrm4_mode_tunnel.c125
-rw-r--r--net/ipv4/xfrm4_output.c111
-rw-r--r--net/ipv4/xfrm4_policy.c332
-rw-r--r--net/ipv4/xfrm4_protocol.c301
-rw-r--r--net/ipv4/xfrm4_state.c100
-rw-r--r--net/ipv4/xfrm4_tunnel.c117
131 files changed, 89364 insertions, 0 deletions
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
new file mode 100644
index 000000000..b295af069
--- /dev/null
+++ b/net/ipv4/Kconfig
@@ -0,0 +1,696 @@
+#
+# IP configuration
+#
+config IP_MULTICAST
+ bool "IP: multicasting"
+ help
+ This is code for addressing several networked computers at once,
+ enlarging your kernel by about 2 KB. You need multicasting if you
+ intend to participate in the MBONE, a high bandwidth network on top
+ of the Internet which carries audio and video broadcasts. More
+ information about the MBONE is on the WWW at
+ <http://www.savetz.com/mbone/>. For most people, it's safe to say N.
+
+config IP_ADVANCED_ROUTER
+ bool "IP: advanced router"
+ ---help---
+ If you intend to run your Linux box mostly as a router, i.e. as a
+ computer that forwards and redistributes network packets, say Y; you
+ will then be presented with several options that allow more precise
+ control about the routing process.
+
+ The answer to this question won't directly affect the kernel:
+ answering N will just cause the configurator to skip all the
+ questions about advanced routing.
+
+ Note that your box can only act as a router if you enable IP
+ forwarding in your kernel; you can do that by saying Y to "/proc
+ file system support" and "Sysctl support" below and executing the
+ line
+
+ echo "1" > /proc/sys/net/ipv4/ip_forward
+
+ at boot time after the /proc file system has been mounted.
+
+ If you turn on IP forwarding, you should consider the rp_filter, which
+ automatically rejects incoming packets if the routing table entry
+ for their source address doesn't match the network interface they're
+ arriving on. This has security advantages because it prevents the
+ so-called IP spoofing, however it can pose problems if you use
+ asymmetric routing (packets from you to a host take a different path
+ than packets from that host to you) or if you operate a non-routing
+ host which has several IP addresses on different interfaces. To turn
+ rp_filter on use:
+
+ echo 1 > /proc/sys/net/ipv4/conf/<device>/rp_filter
+ or
+ echo 1 > /proc/sys/net/ipv4/conf/all/rp_filter
+
+ Note that some distributions enable it in startup scripts.
+ For details about rp_filter strict and loose mode read
+ <file:Documentation/networking/ip-sysctl.txt>.
+
+ If unsure, say N here.
+
+config IP_FIB_TRIE_STATS
+ bool "FIB TRIE statistics"
+ depends on IP_ADVANCED_ROUTER
+ ---help---
+ Keep track of statistics on structure of FIB TRIE table.
+ Useful for testing and measuring TRIE performance.
+
+config IP_MULTIPLE_TABLES
+ bool "IP: policy routing"
+ depends on IP_ADVANCED_ROUTER
+ select FIB_RULES
+ ---help---
+ Normally, a router decides what to do with a received packet based
+ solely on the packet's final destination address. If you say Y here,
+ the Linux router will also be able to take the packet's source
+ address into account. Furthermore, the TOS (Type-Of-Service) field
+ of the packet can be used for routing decisions as well.
+
+ If you are interested in this, please see the preliminary
+ documentation at <http://www.compendium.com.ar/policy-routing.txt>
+ and <ftp://post.tepkom.ru/pub/vol2/Linux/docs/advanced-routing.tex>.
+ You will need supporting software from
+ <ftp://ftp.tux.org/pub/net/ip-routing/>.
+
+ If unsure, say N.
+
+config IP_ROUTE_MULTIPATH
+ bool "IP: equal cost multipath"
+ depends on IP_ADVANCED_ROUTER
+ help
+ Normally, the routing tables specify a single action to be taken in
+ a deterministic manner for a given packet. If you say Y here
+ however, it becomes possible to attach several actions to a packet
+ pattern, in effect specifying several alternative paths to travel
+ for those packets. The router considers all these paths to be of
+ equal "cost" and chooses one of them in a non-deterministic fashion
+ if a matching packet arrives.
+
+config IP_ROUTE_VERBOSE
+ bool "IP: verbose route monitoring"
+ depends on IP_ADVANCED_ROUTER
+ help
+ If you say Y here, which is recommended, then the kernel will print
+ verbose messages regarding the routing, for example warnings about
+ received packets which look strange and could be evidence of an
+ attack or a misconfigured system somewhere. The information is
+ handled by the klogd daemon which is responsible for kernel messages
+ ("man klogd").
+
+config IP_ROUTE_CLASSID
+ bool
+
+config IP_PNP
+ bool "IP: kernel level autoconfiguration"
+ help
+ This enables automatic configuration of IP addresses of devices and
+ of the routing table during kernel boot, based on either information
+ supplied on the kernel command line or by BOOTP or RARP protocols.
+ You need to say Y only for diskless machines requiring network
+ access to boot (in which case you want to say Y to "Root file system
+ on NFS" as well), because all other machines configure the network
+ in their startup scripts.
+
+config IP_PNP_DHCP
+ bool "IP: DHCP support"
+ depends on IP_PNP
+ ---help---
+ If you want your Linux box to mount its whole root file system (the
+ one containing the directory /) from some other computer over the
+ net via NFS and you want the IP address of your computer to be
+ discovered automatically at boot time using the DHCP protocol (a
+ special protocol designed for doing this job), say Y here. In case
+ the boot ROM of your network card was designed for booting Linux and
+ does DHCP itself, providing all necessary information on the kernel
+ command line, you can say N here.
+
+ If unsure, say Y. Note that if you want to use DHCP, a DHCP server
+ must be operating on your network. Read
+ <file:Documentation/filesystems/nfs/nfsroot.txt> for details.
+
+config IP_PNP_BOOTP
+ bool "IP: BOOTP support"
+ depends on IP_PNP
+ ---help---
+ If you want your Linux box to mount its whole root file system (the
+ one containing the directory /) from some other computer over the
+ net via NFS and you want the IP address of your computer to be
+ discovered automatically at boot time using the BOOTP protocol (a
+ special protocol designed for doing this job), say Y here. In case
+ the boot ROM of your network card was designed for booting Linux and
+ does BOOTP itself, providing all necessary information on the kernel
+ command line, you can say N here. If unsure, say Y. Note that if you
+ want to use BOOTP, a BOOTP server must be operating on your network.
+ Read <file:Documentation/filesystems/nfs/nfsroot.txt> for details.
+
+config IP_PNP_RARP
+ bool "IP: RARP support"
+ depends on IP_PNP
+ help
+ If you want your Linux box to mount its whole root file system (the
+ one containing the directory /) from some other computer over the
+ net via NFS and you want the IP address of your computer to be
+ discovered automatically at boot time using the RARP protocol (an
+ older protocol which is being obsoleted by BOOTP and DHCP), say Y
+ here. Note that if you want to use RARP, a RARP server must be
+ operating on your network. Read
+ <file:Documentation/filesystems/nfs/nfsroot.txt> for details.
+
+config NET_IPIP
+ tristate "IP: tunneling"
+ select INET_TUNNEL
+ select NET_IP_TUNNEL
+ ---help---
+ Tunneling means encapsulating data of one protocol type within
+ another protocol and sending it over a channel that understands the
+ encapsulating protocol. This particular tunneling driver implements
+ encapsulation of IP within IP, which sounds kind of pointless, but
+ can be useful if you want to make your (or some other) machine
+ appear on a different network than it physically is, or to use
+ mobile-IP facilities (allowing laptops to seamlessly move between
+ networks without changing their IP addresses).
+
+ Saying Y to this option will produce two modules ( = code which can
+ be inserted in and removed from the running kernel whenever you
+ want). Most people won't need this and can say N.
+
+config NET_IPGRE_DEMUX
+ tristate "IP: GRE demultiplexer"
+ help
+ This is helper module to demultiplex GRE packets on GRE version field criteria.
+ Required by ip_gre and pptp modules.
+
+config NET_IP_TUNNEL
+ tristate
+ default n
+
+config NET_IPGRE
+ tristate "IP: GRE tunnels over IP"
+ depends on (IPV6 || IPV6=n) && NET_IPGRE_DEMUX
+ select NET_IP_TUNNEL
+ help
+ Tunneling means encapsulating data of one protocol type within
+ another protocol and sending it over a channel that understands the
+ encapsulating protocol. This particular tunneling driver implements
+ GRE (Generic Routing Encapsulation) and at this time allows
+ encapsulating of IPv4 or IPv6 over existing IPv4 infrastructure.
+ This driver is useful if the other endpoint is a Cisco router: Cisco
+ likes GRE much better than the other Linux tunneling driver ("IP
+ tunneling" above). In addition, GRE allows multicast redistribution
+ through the tunnel.
+
+config NET_IPGRE_BROADCAST
+ bool "IP: broadcast GRE over IP"
+ depends on IP_MULTICAST && NET_IPGRE
+ help
+ One application of GRE/IP is to construct a broadcast WAN (Wide Area
+ Network), which looks like a normal Ethernet LAN (Local Area
+ Network), but can be distributed all over the Internet. If you want
+ to do that, say Y here and to "IP multicast routing" below.
+
+config IP_MROUTE
+ bool "IP: multicast routing"
+ depends on IP_MULTICAST
+ help
+ This is used if you want your machine to act as a router for IP
+ packets that have several destination addresses. It is needed on the
+ MBONE, a high bandwidth network on top of the Internet which carries
+ audio and video broadcasts. In order to do that, you would most
+ likely run the program mrouted. If you haven't heard about it, you
+ don't need it.
+
+config IP_MROUTE_MULTIPLE_TABLES
+ bool "IP: multicast policy routing"
+ depends on IP_MROUTE && IP_ADVANCED_ROUTER
+ select FIB_RULES
+ help
+ Normally, a multicast router runs a userspace daemon and decides
+ what to do with a multicast packet based on the source and
+ destination addresses. If you say Y here, the multicast router
+ will also be able to take interfaces and packet marks into
+ account and run multiple instances of userspace daemons
+ simultaneously, each one handling a single table.
+
+ If unsure, say N.
+
+config IP_PIMSM_V1
+ bool "IP: PIM-SM version 1 support"
+ depends on IP_MROUTE
+ help
+ Kernel side support for Sparse Mode PIM (Protocol Independent
+ Multicast) version 1. This multicast routing protocol is used widely
+ because Cisco supports it. You need special software to use it
+ (pimd-v1). Please see <http://netweb.usc.edu/pim/> for more
+ information about PIM.
+
+ Say Y if you want to use PIM-SM v1. Note that you can say N here if
+ you just want to use Dense Mode PIM.
+
+config IP_PIMSM_V2
+ bool "IP: PIM-SM version 2 support"
+ depends on IP_MROUTE
+ help
+ Kernel side support for Sparse Mode PIM version 2. In order to use
+ this, you need an experimental routing daemon supporting it (pimd or
+ gated-5). This routing protocol is not used widely, so say N unless
+ you want to play with it.
+
+config SYN_COOKIES
+ bool "IP: TCP syncookie support"
+ ---help---
+ Normal TCP/IP networking is open to an attack known as "SYN
+ flooding". This denial-of-service attack prevents legitimate remote
+ users from being able to connect to your computer during an ongoing
+ attack and requires very little work from the attacker, who can
+ operate from anywhere on the Internet.
+
+ SYN cookies provide protection against this type of attack. If you
+ say Y here, the TCP/IP stack will use a cryptographic challenge
+ protocol known as "SYN cookies" to enable legitimate users to
+ continue to connect, even when your machine is under attack. There
+ is no need for the legitimate users to change their TCP/IP software;
+ SYN cookies work transparently to them. For technical information
+ about SYN cookies, check out <http://cr.yp.to/syncookies.html>.
+
+ If you are SYN flooded, the source address reported by the kernel is
+ likely to have been forged by the attacker; it is only reported as
+ an aid in tracing the packets to their actual source and should not
+ be taken as absolute truth.
+
+ SYN cookies may prevent correct error reporting on clients when the
+ server is really overloaded. If this happens frequently better turn
+ them off.
+
+ If you say Y here, you can disable SYN cookies at run time by
+ saying Y to "/proc file system support" and
+ "Sysctl support" below and executing the command
+
+ echo 0 > /proc/sys/net/ipv4/tcp_syncookies
+
+ after the /proc file system has been mounted.
+
+ If unsure, say N.
+
+config NET_IPVTI
+ tristate "Virtual (secure) IP: tunneling"
+ select INET_TUNNEL
+ select NET_IP_TUNNEL
+ depends on INET_XFRM_MODE_TUNNEL
+ ---help---
+ Tunneling means encapsulating data of one protocol type within
+ another protocol and sending it over a channel that understands the
+ encapsulating protocol. This can be used with xfrm mode tunnel to give
+ the notion of a secure tunnel for IPSEC and then use routing protocol
+ on top.
+
+config NET_UDP_TUNNEL
+ tristate
+ select NET_IP_TUNNEL
+ default n
+
+config NET_FOU
+ tristate "IP: Foo (IP protocols) over UDP"
+ select XFRM
+ select NET_UDP_TUNNEL
+ ---help---
+ Foo over UDP allows any IP protocol to be directly encapsulated
+ over UDP include tunnels (IPIP, GRE, SIT). By encapsulating in UDP
+ network mechanisms and optimizations for UDP (such as ECMP
+ and RSS) can be leveraged to provide better service.
+
+config NET_FOU_IP_TUNNELS
+ bool "IP: FOU encapsulation of IP tunnels"
+ depends on NET_IPIP || NET_IPGRE || IPV6_SIT
+ select NET_FOU
+ ---help---
+ Allow configuration of FOU or GUE encapsulation for IP tunnels.
+ When this option is enabled IP tunnels can be configured to use
+ FOU or GUE encapsulation.
+
+config GENEVE
+ tristate "Generic Network Virtualization Encapsulation (Geneve)"
+ depends on INET
+ select NET_UDP_TUNNEL
+ ---help---
+ This allows one to create Geneve virtual interfaces that provide
+ Layer 2 Networks over Layer 3 Networks. Geneve is often used
+ to tunnel virtual network infrastructure in virtualized environments.
+ For more information see:
+ http://tools.ietf.org/html/draft-gross-geneve-01
+
+ To compile this driver as a module, choose M here: the module
+
+
+config INET_AH
+ tristate "IP: AH transformation"
+ select XFRM_ALGO
+ select CRYPTO
+ select CRYPTO_HMAC
+ select CRYPTO_MD5
+ select CRYPTO_SHA1
+ ---help---
+ Support for IPsec AH.
+
+ If unsure, say Y.
+
+config INET_ESP
+ tristate "IP: ESP transformation"
+ select XFRM_ALGO
+ select CRYPTO
+ select CRYPTO_AUTHENC
+ select CRYPTO_HMAC
+ select CRYPTO_MD5
+ select CRYPTO_CBC
+ select CRYPTO_SHA1
+ select CRYPTO_DES
+ ---help---
+ Support for IPsec ESP.
+
+ If unsure, say Y.
+
+config INET_IPCOMP
+ tristate "IP: IPComp transformation"
+ select INET_XFRM_TUNNEL
+ select XFRM_IPCOMP
+ ---help---
+ Support for IP Payload Compression Protocol (IPComp) (RFC3173),
+ typically needed for IPsec.
+
+ If unsure, say Y.
+
+config INET_XFRM_TUNNEL
+ tristate
+ select INET_TUNNEL
+ default n
+
+config INET_TUNNEL
+ tristate
+ default n
+
+config INET_XFRM_MODE_TRANSPORT
+ tristate "IP: IPsec transport mode"
+ default y
+ select XFRM
+ ---help---
+ Support for IPsec transport mode.
+
+ If unsure, say Y.
+
+config INET_XFRM_MODE_TUNNEL
+ tristate "IP: IPsec tunnel mode"
+ default y
+ select XFRM
+ ---help---
+ Support for IPsec tunnel mode.
+
+ If unsure, say Y.
+
+config INET_XFRM_MODE_BEET
+ tristate "IP: IPsec BEET mode"
+ default y
+ select XFRM
+ ---help---
+ Support for IPsec BEET mode.
+
+ If unsure, say Y.
+
+config INET_LRO
+ tristate "Large Receive Offload (ipv4/tcp)"
+ default y
+ ---help---
+ Support for Large Receive Offload (ipv4/tcp).
+
+ If unsure, say Y.
+
+config INET_DIAG
+ tristate "INET: socket monitoring interface"
+ default y
+ ---help---
+ Support for INET (TCP, DCCP, etc) socket monitoring interface used by
+ native Linux tools such as ss. ss is included in iproute2, currently
+ downloadable at:
+
+ http://www.linuxfoundation.org/collaborate/workgroups/networking/iproute2
+
+ If unsure, say Y.
+
+config INET_TCP_DIAG
+ depends on INET_DIAG
+ def_tristate INET_DIAG
+
+config INET_UDP_DIAG
+ tristate "UDP: socket monitoring interface"
+ depends on INET_DIAG && (IPV6 || IPV6=n)
+ default n
+ ---help---
+ Support for UDP socket monitoring interface used by the ss tool.
+ If unsure, say Y.
+
+menuconfig TCP_CONG_ADVANCED
+ bool "TCP: advanced congestion control"
+ ---help---
+ Support for selection of various TCP congestion control
+ modules.
+
+ Nearly all users can safely say no here, and a safe default
+ selection will be made (CUBIC with new Reno as a fallback).
+
+ If unsure, say N.
+
+if TCP_CONG_ADVANCED
+
+config TCP_CONG_BIC
+ tristate "Binary Increase Congestion (BIC) control"
+ default m
+ ---help---
+ BIC-TCP is a sender-side only change that ensures a linear RTT
+ fairness under large windows while offering both scalability and
+ bounded TCP-friendliness. The protocol combines two schemes
+ called additive increase and binary search increase. When the
+ congestion window is large, additive increase with a large
+ increment ensures linear RTT fairness as well as good
+ scalability. Under small congestion windows, binary search
+ increase provides TCP friendliness.
+ See http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/
+
+config TCP_CONG_CUBIC
+ tristate "CUBIC TCP"
+ default y
+ ---help---
+ This is version 2.0 of BIC-TCP which uses a cubic growth function
+ among other techniques.
+ See http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/cubic-paper.pdf
+
+config TCP_CONG_WESTWOOD
+ tristate "TCP Westwood+"
+ default m
+ ---help---
+ TCP Westwood+ is a sender-side only modification of the TCP Reno
+ protocol stack that optimizes the performance of TCP congestion
+ control. It is based on end-to-end bandwidth estimation to set
+ congestion window and slow start threshold after a congestion
+ episode. Using this estimation, TCP Westwood+ adaptively sets a
+ slow start threshold and a congestion window which takes into
+ account the bandwidth used at the time congestion is experienced.
+ TCP Westwood+ significantly increases fairness wrt TCP Reno in
+ wired networks and throughput over wireless links.
+
+config TCP_CONG_HTCP
+ tristate "H-TCP"
+ default m
+ ---help---
+ H-TCP is a send-side only modifications of the TCP Reno
+ protocol stack that optimizes the performance of TCP
+ congestion control for high speed network links. It uses a
+ modeswitch to change the alpha and beta parameters of TCP Reno
+ based on network conditions and in a way so as to be fair with
+ other Reno and H-TCP flows.
+
+config TCP_CONG_HSTCP
+ tristate "High Speed TCP"
+ default n
+ ---help---
+ Sally Floyd's High Speed TCP (RFC 3649) congestion control.
+ A modification to TCP's congestion control mechanism for use
+ with large congestion windows. A table indicates how much to
+ increase the congestion window by when an ACK is received.
+ For more detail see http://www.icir.org/floyd/hstcp.html
+
+config TCP_CONG_HYBLA
+ tristate "TCP-Hybla congestion control algorithm"
+ default n
+ ---help---
+ TCP-Hybla is a sender-side only change that eliminates penalization of
+ long-RTT, large-bandwidth connections, like when satellite legs are
+ involved, especially when sharing a common bottleneck with normal
+ terrestrial connections.
+
+config TCP_CONG_VEGAS
+ tristate "TCP Vegas"
+ default n
+ ---help---
+ TCP Vegas is a sender-side only change to TCP that anticipates
+ the onset of congestion by estimating the bandwidth. TCP Vegas
+ adjusts the sending rate by modifying the congestion
+ window. TCP Vegas should provide less packet loss, but it is
+ not as aggressive as TCP Reno.
+
+config TCP_CONG_SCALABLE
+ tristate "Scalable TCP"
+ default n
+ ---help---
+ Scalable TCP is a sender-side only change to TCP which uses a
+ MIMD congestion control algorithm which has some nice scaling
+ properties, though is known to have fairness issues.
+ See http://www.deneholme.net/tom/scalable/
+
+config TCP_CONG_LP
+ tristate "TCP Low Priority"
+ default n
+ ---help---
+ TCP Low Priority (TCP-LP), a distributed algorithm whose goal is
+ to utilize only the excess network bandwidth as compared to the
+ ``fair share`` of bandwidth as targeted by TCP.
+ See http://www-ece.rice.edu/networks/TCP-LP/
+
+config TCP_CONG_VENO
+ tristate "TCP Veno"
+ default n
+ ---help---
+ TCP Veno is a sender-side only enhancement of TCP to obtain better
+ throughput over wireless networks. TCP Veno makes use of state
+ distinguishing to circumvent the difficult judgment of the packet loss
+ type. TCP Veno cuts down less congestion window in response to random
+ loss packets.
+ See <http://ieeexplore.ieee.org/xpl/freeabs_all.jsp?arnumber=1177186>
+
+config TCP_CONG_YEAH
+ tristate "YeAH TCP"
+ select TCP_CONG_VEGAS
+ default n
+ ---help---
+ YeAH-TCP is a sender-side high-speed enabled TCP congestion control
+ algorithm, which uses a mixed loss/delay approach to compute the
+ congestion window. It's design goals target high efficiency,
+ internal, RTT and Reno fairness, resilience to link loss while
+ keeping network elements load as low as possible.
+
+ For further details look here:
+ http://wil.cs.caltech.edu/pfldnet2007/paper/YeAH_TCP.pdf
+
+config TCP_CONG_ILLINOIS
+ tristate "TCP Illinois"
+ default n
+ ---help---
+ TCP-Illinois is a sender-side modification of TCP Reno for
+ high speed long delay links. It uses round-trip-time to
+ adjust the alpha and beta parameters to achieve a higher average
+ throughput and maintain fairness.
+
+ For further details see:
+ http://www.ews.uiuc.edu/~shaoliu/tcpillinois/index.html
+
+config TCP_CONG_DCTCP
+ tristate "DataCenter TCP (DCTCP)"
+ default n
+ ---help---
+ DCTCP leverages Explicit Congestion Notification (ECN) in the network to
+ provide multi-bit feedback to the end hosts. It is designed to provide:
+
+ - High burst tolerance (incast due to partition/aggregate),
+ - Low latency (short flows, queries),
+ - High throughput (continuous data updates, large file transfers) with
+ commodity, shallow-buffered switches.
+
+ All switches in the data center network running DCTCP must support
+ ECN marking and be configured for marking when reaching defined switch
+ buffer thresholds. The default ECN marking threshold heuristic for
+ DCTCP on switches is 20 packets (30KB) at 1Gbps, and 65 packets
+ (~100KB) at 10Gbps, but might need further careful tweaking.
+
+ For further details see:
+ http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp-final.pdf
+
+choice
+ prompt "Default TCP congestion control"
+ default DEFAULT_CUBIC
+ help
+ Select the TCP congestion control that will be used by default
+ for all connections.
+
+ config DEFAULT_BIC
+ bool "Bic" if TCP_CONG_BIC=y
+
+ config DEFAULT_CUBIC
+ bool "Cubic" if TCP_CONG_CUBIC=y
+
+ config DEFAULT_HTCP
+ bool "Htcp" if TCP_CONG_HTCP=y
+
+ config DEFAULT_HYBLA
+ bool "Hybla" if TCP_CONG_HYBLA=y
+
+ config DEFAULT_VEGAS
+ bool "Vegas" if TCP_CONG_VEGAS=y
+
+ config DEFAULT_YEAH
+ bool "YeAH" if TCP_CONG_YEAH=y
+
+ config DEFAULT_VENO
+ bool "Veno" if TCP_CONG_VENO=y
+
+ config DEFAULT_WESTWOOD
+ bool "Westwood" if TCP_CONG_WESTWOOD=y
+
+ config DEFAULT_DCTCP
+ bool "DCTCP" if TCP_CONG_DCTCP=y
+
+ config DEFAULT_RENO
+ bool "Reno"
+endchoice
+
+endif
+
+config TCP_CONG_CUBIC
+ tristate
+ depends on !TCP_CONG_ADVANCED
+ default y
+
+config DEFAULT_TCP_CONG
+ string
+ default "bic" if DEFAULT_BIC
+ default "cubic" if DEFAULT_CUBIC
+ default "htcp" if DEFAULT_HTCP
+ default "hybla" if DEFAULT_HYBLA
+ default "vegas" if DEFAULT_VEGAS
+ default "yeah" if DEFAULT_YEAH
+ default "westwood" if DEFAULT_WESTWOOD
+ default "veno" if DEFAULT_VENO
+ default "reno" if DEFAULT_RENO
+ default "dctcp" if DEFAULT_DCTCP
+ default "cubic"
+
+config TCP_MD5SIG
+ bool "TCP: MD5 Signature Option support (RFC2385)"
+ select CRYPTO
+ select CRYPTO_MD5
+ ---help---
+ RFC2385 specifies a method of giving MD5 protection to TCP sessions.
+ Its main (only?) use is to protect BGP sessions between core routers
+ on the Internet.
+
+ If unsure, say N.
+
+config TCP_STEALTH
+ bool "TCP: Stealth TCP socket support"
+ default n
+ ---help---
+ This option enables support for stealth TCP sockets. If you do not
+ know what this means, you do not need it.
+
+ If unsure, say N.
+
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
new file mode 100644
index 000000000..518c04ed6
--- /dev/null
+++ b/net/ipv4/Makefile
@@ -0,0 +1,62 @@
+#
+# Makefile for the Linux TCP/IP (INET) layer.
+#
+
+obj-y := route.o inetpeer.o protocol.o \
+ ip_input.o ip_fragment.o ip_forward.o ip_options.o \
+ ip_output.o ip_sockglue.o inet_hashtables.o \
+ inet_timewait_sock.o inet_connection_sock.o \
+ tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \
+ tcp_minisocks.o tcp_cong.o tcp_metrics.o tcp_fastopen.o \
+ tcp_offload.o datagram.o raw.o udp.o udplite.o \
+ udp_offload.o arp.o icmp.o devinet.o af_inet.o igmp.o \
+ fib_frontend.o fib_semantics.o fib_trie.o \
+ inet_fragment.o ping.o ip_tunnel_core.o gre_offload.o
+
+obj-$(CONFIG_NET_IP_TUNNEL) += ip_tunnel.o
+obj-$(CONFIG_SYSCTL) += sysctl_net_ipv4.o
+obj-$(CONFIG_PROC_FS) += proc.o
+obj-$(CONFIG_IP_MULTIPLE_TABLES) += fib_rules.o
+obj-$(CONFIG_IP_MROUTE) += ipmr.o
+obj-$(CONFIG_NET_IPIP) += ipip.o
+gre-y := gre_demux.o
+obj-$(CONFIG_NET_FOU) += fou.o
+obj-$(CONFIG_NET_IPGRE_DEMUX) += gre.o
+obj-$(CONFIG_NET_IPGRE) += ip_gre.o
+obj-$(CONFIG_NET_UDP_TUNNEL) += udp_tunnel.o
+obj-$(CONFIG_NET_IPVTI) += ip_vti.o
+obj-$(CONFIG_SYN_COOKIES) += syncookies.o
+obj-$(CONFIG_INET_AH) += ah4.o
+obj-$(CONFIG_INET_ESP) += esp4.o
+obj-$(CONFIG_INET_IPCOMP) += ipcomp.o
+obj-$(CONFIG_INET_XFRM_TUNNEL) += xfrm4_tunnel.o
+obj-$(CONFIG_INET_XFRM_MODE_BEET) += xfrm4_mode_beet.o
+obj-$(CONFIG_INET_LRO) += inet_lro.o
+obj-$(CONFIG_INET_TUNNEL) += tunnel4.o
+obj-$(CONFIG_INET_XFRM_MODE_TRANSPORT) += xfrm4_mode_transport.o
+obj-$(CONFIG_INET_XFRM_MODE_TUNNEL) += xfrm4_mode_tunnel.o
+obj-$(CONFIG_IP_PNP) += ipconfig.o
+obj-$(CONFIG_NETFILTER) += netfilter.o netfilter/
+obj-$(CONFIG_INET_DIAG) += inet_diag.o
+obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o
+obj-$(CONFIG_INET_UDP_DIAG) += udp_diag.o
+obj-$(CONFIG_NET_TCPPROBE) += tcp_probe.o
+obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o
+obj-$(CONFIG_TCP_CONG_CUBIC) += tcp_cubic.o
+obj-$(CONFIG_TCP_CONG_DCTCP) += tcp_dctcp.o
+obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o
+obj-$(CONFIG_TCP_CONG_HSTCP) += tcp_highspeed.o
+obj-$(CONFIG_TCP_CONG_HYBLA) += tcp_hybla.o
+obj-$(CONFIG_TCP_CONG_HTCP) += tcp_htcp.o
+obj-$(CONFIG_TCP_CONG_VEGAS) += tcp_vegas.o
+obj-$(CONFIG_TCP_CONG_VENO) += tcp_veno.o
+obj-$(CONFIG_TCP_CONG_SCALABLE) += tcp_scalable.o
+obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o
+obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o
+obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o
+obj-$(CONFIG_MEMCG_KMEM) += tcp_memcontrol.o
+obj-$(CONFIG_NETLABEL) += cipso_ipv4.o
+obj-$(CONFIG_GENEVE) += geneve.o
+
+obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \
+ xfrm4_output.o xfrm4_protocol.o
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
new file mode 100644
index 000000000..a5aa54ea6
--- /dev/null
+++ b/net/ipv4/af_inet.c
@@ -0,0 +1,1838 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * PF_INET protocol family socket handler.
+ *
+ * Authors: Ross Biro
+ * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ * Florian La Roche, <flla@stud.uni-sb.de>
+ * Alan Cox, <A.Cox@swansea.ac.uk>
+ *
+ * Changes (see also sock.c)
+ *
+ * piggy,
+ * Karl Knutson : Socket protocol table
+ * A.N.Kuznetsov : Socket death error in accept().
+ * John Richardson : Fix non blocking error in connect()
+ * so sockets that fail to connect
+ * don't return -EINPROGRESS.
+ * Alan Cox : Asynchronous I/O support
+ * Alan Cox : Keep correct socket pointer on sock
+ * structures
+ * when accept() ed
+ * Alan Cox : Semantics of SO_LINGER aren't state
+ * moved to close when you look carefully.
+ * With this fixed and the accept bug fixed
+ * some RPC stuff seems happier.
+ * Niibe Yutaka : 4.4BSD style write async I/O
+ * Alan Cox,
+ * Tony Gale : Fixed reuse semantics.
+ * Alan Cox : bind() shouldn't abort existing but dead
+ * sockets. Stops FTP netin:.. I hope.
+ * Alan Cox : bind() works correctly for RAW sockets.
+ * Note that FreeBSD at least was broken
+ * in this respect so be careful with
+ * compatibility tests...
+ * Alan Cox : routing cache support
+ * Alan Cox : memzero the socket structure for
+ * compactness.
+ * Matt Day : nonblock connect error handler
+ * Alan Cox : Allow large numbers of pending sockets
+ * (eg for big web sites), but only if
+ * specifically application requested.
+ * Alan Cox : New buffering throughout IP. Used
+ * dumbly.
+ * Alan Cox : New buffering now used smartly.
+ * Alan Cox : BSD rather than common sense
+ * interpretation of listen.
+ * Germano Caronni : Assorted small races.
+ * Alan Cox : sendmsg/recvmsg basic support.
+ * Alan Cox : Only sendmsg/recvmsg now supported.
+ * Alan Cox : Locked down bind (see security list).
+ * Alan Cox : Loosened bind a little.
+ * Mike McLagan : ADD/DEL DLCI Ioctls
+ * Willy Konynenberg : Transparent proxying support.
+ * David S. Miller : New socket lookup architecture.
+ * Some other random speedups.
+ * Cyrus Durgin : Cleaned up file for kmod hacks.
+ * Andi Kleen : Fix inet_stream_connect TCP race.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define pr_fmt(fmt) "IPv4: " fmt
+
+#include <linux/err.h>
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/timer.h>
+#include <linux/string.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <linux/capability.h>
+#include <linux/fcntl.h>
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+#include <linux/stat.h>
+#include <linux/init.h>
+#include <linux/poll.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/random.h>
+#include <linux/slab.h>
+
+#include <asm/uaccess.h>
+
+#include <linux/inet.h>
+#include <linux/igmp.h>
+#include <linux/inetdevice.h>
+#include <linux/netdevice.h>
+#include <net/checksum.h>
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <net/arp.h>
+#include <net/route.h>
+#include <net/ip_fib.h>
+#include <net/inet_connection_sock.h>
+#include <net/tcp.h>
+#include <net/udp.h>
+#include <net/udplite.h>
+#include <net/ping.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/raw.h>
+#include <net/icmp.h>
+#include <net/inet_common.h>
+#include <net/xfrm.h>
+#include <net/net_namespace.h>
+#include <net/secure_seq.h>
+#ifdef CONFIG_IP_MROUTE
+#include <linux/mroute.h>
+#endif
+
+
+/* The inetsw table contains everything that inet_create needs to
+ * build a new socket.
+ */
+static struct list_head inetsw[SOCK_MAX];
+static DEFINE_SPINLOCK(inetsw_lock);
+
+/* New destruction routine */
+
+void inet_sock_destruct(struct sock *sk)
+{
+ struct inet_sock *inet = inet_sk(sk);
+
+ __skb_queue_purge(&sk->sk_receive_queue);
+ __skb_queue_purge(&sk->sk_error_queue);
+
+ sk_mem_reclaim(sk);
+
+ if (sk->sk_type == SOCK_STREAM && sk->sk_state != TCP_CLOSE) {
+ pr_err("Attempt to release TCP socket in state %d %p\n",
+ sk->sk_state, sk);
+ return;
+ }
+ if (!sock_flag(sk, SOCK_DEAD)) {
+ pr_err("Attempt to release alive inet socket %p\n", sk);
+ return;
+ }
+
+ WARN_ON(atomic_read(&sk->sk_rmem_alloc));
+ WARN_ON(atomic_read(&sk->sk_wmem_alloc));
+ WARN_ON(sk->sk_wmem_queued);
+ WARN_ON(sk->sk_forward_alloc);
+
+ kfree(rcu_dereference_protected(inet->inet_opt, 1));
+ dst_release(rcu_dereference_check(sk->sk_dst_cache, 1));
+ dst_release(sk->sk_rx_dst);
+ sk_refcnt_debug_dec(sk);
+}
+EXPORT_SYMBOL(inet_sock_destruct);
+
+/*
+ * The routines beyond this point handle the behaviour of an AF_INET
+ * socket object. Mostly it punts to the subprotocols of IP to do
+ * the work.
+ */
+
+/*
+ * Automatically bind an unbound socket.
+ */
+
+static int inet_autobind(struct sock *sk)
+{
+ struct inet_sock *inet;
+ /* We may need to bind the socket. */
+ lock_sock(sk);
+ inet = inet_sk(sk);
+ if (!inet->inet_num) {
+ if (sk->sk_prot->get_port(sk, 0)) {
+ release_sock(sk);
+ return -EAGAIN;
+ }
+ inet->inet_sport = htons(inet->inet_num);
+ }
+ release_sock(sk);
+ return 0;
+}
+
+/*
+ * Move a socket into listening state.
+ */
+int inet_listen(struct socket *sock, int backlog)
+{
+ struct sock *sk = sock->sk;
+ unsigned char old_state;
+ int err;
+
+ lock_sock(sk);
+
+ err = -EINVAL;
+ if (sock->state != SS_UNCONNECTED || sock->type != SOCK_STREAM)
+ goto out;
+
+ old_state = sk->sk_state;
+ if (!((1 << old_state) & (TCPF_CLOSE | TCPF_LISTEN)))
+ goto out;
+
+ /* Really, if the socket is already in listen state
+ * we can only allow the backlog to be adjusted.
+ */
+ if (old_state != TCP_LISTEN) {
+ /* Check special setups for testing purpose to enable TFO w/o
+ * requiring TCP_FASTOPEN sockopt.
+ * Note that only TCP sockets (SOCK_STREAM) will reach here.
+ * Also fastopenq may already been allocated because this
+ * socket was in TCP_LISTEN state previously but was
+ * shutdown() (rather than close()).
+ */
+ if ((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) != 0 &&
+ !inet_csk(sk)->icsk_accept_queue.fastopenq) {
+ if ((sysctl_tcp_fastopen & TFO_SERVER_WO_SOCKOPT1) != 0)
+ err = fastopen_init_queue(sk, backlog);
+ else if ((sysctl_tcp_fastopen &
+ TFO_SERVER_WO_SOCKOPT2) != 0)
+ err = fastopen_init_queue(sk,
+ ((uint)sysctl_tcp_fastopen) >> 16);
+ else
+ err = 0;
+ if (err)
+ goto out;
+
+ tcp_fastopen_init_key_once(true);
+ }
+ err = inet_csk_listen_start(sk, backlog);
+ if (err)
+ goto out;
+ }
+ sk->sk_max_ack_backlog = backlog;
+ err = 0;
+
+out:
+ release_sock(sk);
+ return err;
+}
+EXPORT_SYMBOL(inet_listen);
+
+/*
+ * Create an inet socket.
+ */
+
+static int inet_create(struct net *net, struct socket *sock, int protocol,
+ int kern)
+{
+ struct sock *sk;
+ struct inet_protosw *answer;
+ struct inet_sock *inet;
+ struct proto *answer_prot;
+ unsigned char answer_flags;
+ int try_loading_module = 0;
+ int err;
+
+ sock->state = SS_UNCONNECTED;
+
+ /* Look for the requested type/protocol pair. */
+lookup_protocol:
+ err = -ESOCKTNOSUPPORT;
+ rcu_read_lock();
+ list_for_each_entry_rcu(answer, &inetsw[sock->type], list) {
+
+ err = 0;
+ /* Check the non-wild match. */
+ if (protocol == answer->protocol) {
+ if (protocol != IPPROTO_IP)
+ break;
+ } else {
+ /* Check for the two wild cases. */
+ if (IPPROTO_IP == protocol) {
+ protocol = answer->protocol;
+ break;
+ }
+ if (IPPROTO_IP == answer->protocol)
+ break;
+ }
+ err = -EPROTONOSUPPORT;
+ }
+
+ if (unlikely(err)) {
+ if (try_loading_module < 2) {
+ rcu_read_unlock();
+ /*
+ * Be more specific, e.g. net-pf-2-proto-132-type-1
+ * (net-pf-PF_INET-proto-IPPROTO_SCTP-type-SOCK_STREAM)
+ */
+ if (++try_loading_module == 1)
+ request_module("net-pf-%d-proto-%d-type-%d",
+ PF_INET, protocol, sock->type);
+ /*
+ * Fall back to generic, e.g. net-pf-2-proto-132
+ * (net-pf-PF_INET-proto-IPPROTO_SCTP)
+ */
+ else
+ request_module("net-pf-%d-proto-%d",
+ PF_INET, protocol);
+ goto lookup_protocol;
+ } else
+ goto out_rcu_unlock;
+ }
+
+ err = -EPERM;
+ if (sock->type == SOCK_RAW && !kern &&
+ !ns_capable(net->user_ns, CAP_NET_RAW))
+ goto out_rcu_unlock;
+
+ sock->ops = answer->ops;
+ answer_prot = answer->prot;
+ answer_flags = answer->flags;
+ rcu_read_unlock();
+
+ WARN_ON(!answer_prot->slab);
+
+ err = -ENOBUFS;
+ sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot);
+ if (!sk)
+ goto out;
+
+ err = 0;
+ if (INET_PROTOSW_REUSE & answer_flags)
+ sk->sk_reuse = SK_CAN_REUSE;
+
+ inet = inet_sk(sk);
+ inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0;
+
+ inet->nodefrag = 0;
+
+ if (SOCK_RAW == sock->type) {
+ inet->inet_num = protocol;
+ if (IPPROTO_RAW == protocol)
+ inet->hdrincl = 1;
+ }
+
+ if (net->ipv4.sysctl_ip_no_pmtu_disc)
+ inet->pmtudisc = IP_PMTUDISC_DONT;
+ else
+ inet->pmtudisc = IP_PMTUDISC_WANT;
+
+ inet->inet_id = 0;
+
+ sock_init_data(sock, sk);
+
+ sk->sk_destruct = inet_sock_destruct;
+ sk->sk_protocol = protocol;
+ sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv;
+
+ inet->uc_ttl = -1;
+ inet->mc_loop = 1;
+ inet->mc_ttl = 1;
+ inet->mc_all = 1;
+ inet->mc_index = 0;
+ inet->mc_list = NULL;
+ inet->rcv_tos = 0;
+
+ sk_refcnt_debug_inc(sk);
+
+ if (inet->inet_num) {
+ /* It assumes that any protocol which allows
+ * the user to assign a number at socket
+ * creation time automatically
+ * shares.
+ */
+ inet->inet_sport = htons(inet->inet_num);
+ /* Add to protocol hash chains. */
+ sk->sk_prot->hash(sk);
+ }
+
+ if (sk->sk_prot->init) {
+ err = sk->sk_prot->init(sk);
+ if (err)
+ sk_common_release(sk);
+ }
+out:
+ return err;
+out_rcu_unlock:
+ rcu_read_unlock();
+ goto out;
+}
+
+
+/*
+ * The peer socket should always be NULL (or else). When we call this
+ * function we are destroying the object and from then on nobody
+ * should refer to it.
+ */
+int inet_release(struct socket *sock)
+{
+ struct sock *sk = sock->sk;
+
+ if (sk) {
+ long timeout;
+
+ /* Applications forget to leave groups before exiting */
+ ip_mc_drop_socket(sk);
+
+ /* If linger is set, we don't return until the close
+ * is complete. Otherwise we return immediately. The
+ * actually closing is done the same either way.
+ *
+ * If the close is due to the process exiting, we never
+ * linger..
+ */
+ timeout = 0;
+ if (sock_flag(sk, SOCK_LINGER) &&
+ !(current->flags & PF_EXITING))
+ timeout = sk->sk_lingertime;
+ sock->sk = NULL;
+ sk->sk_prot->close(sk, timeout);
+ }
+ return 0;
+}
+EXPORT_SYMBOL(inet_release);
+
+int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
+{
+ struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
+ struct sock *sk = sock->sk;
+ struct inet_sock *inet = inet_sk(sk);
+ struct net *net = sock_net(sk);
+ unsigned short snum;
+ int chk_addr_ret;
+ int err;
+
+ /* If the socket has its own bind function then use it. (RAW) */
+ if (sk->sk_prot->bind) {
+ err = sk->sk_prot->bind(sk, uaddr, addr_len);
+ goto out;
+ }
+ err = -EINVAL;
+ if (addr_len < sizeof(struct sockaddr_in))
+ goto out;
+
+ if (addr->sin_family != AF_INET) {
+ /* Compatibility games : accept AF_UNSPEC (mapped to AF_INET)
+ * only if s_addr is INADDR_ANY.
+ */
+ err = -EAFNOSUPPORT;
+ if (addr->sin_family != AF_UNSPEC ||
+ addr->sin_addr.s_addr != htonl(INADDR_ANY))
+ goto out;
+ }
+
+ chk_addr_ret = inet_addr_type(net, addr->sin_addr.s_addr);
+
+ /* Not specified by any standard per-se, however it breaks too
+ * many applications when removed. It is unfortunate since
+ * allowing applications to make a non-local bind solves
+ * several problems with systems using dynamic addressing.
+ * (ie. your servers still start up even if your ISDN link
+ * is temporarily down)
+ */
+ err = -EADDRNOTAVAIL;
+ if (!net->ipv4.sysctl_ip_nonlocal_bind &&
+ !(inet->freebind || inet->transparent) &&
+ addr->sin_addr.s_addr != htonl(INADDR_ANY) &&
+ chk_addr_ret != RTN_LOCAL &&
+ chk_addr_ret != RTN_MULTICAST &&
+ chk_addr_ret != RTN_BROADCAST)
+ goto out;
+
+ snum = ntohs(addr->sin_port);
+ err = -EACCES;
+ if (snum && snum < PROT_SOCK &&
+ !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE))
+ goto out;
+
+ /* We keep a pair of addresses. rcv_saddr is the one
+ * used by hash lookups, and saddr is used for transmit.
+ *
+ * In the BSD API these are the same except where it
+ * would be illegal to use them (multicast/broadcast) in
+ * which case the sending device address is used.
+ */
+ lock_sock(sk);
+
+ /* Check these errors (active socket, double bind). */
+ err = -EINVAL;
+ if (sk->sk_state != TCP_CLOSE || inet->inet_num)
+ goto out_release_sock;
+
+ inet->inet_rcv_saddr = inet->inet_saddr = addr->sin_addr.s_addr;
+ if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST)
+ inet->inet_saddr = 0; /* Use device */
+
+ /* Make sure we are allowed to bind here. */
+ if (sk->sk_prot->get_port(sk, snum)) {
+ inet->inet_saddr = inet->inet_rcv_saddr = 0;
+ err = -EADDRINUSE;
+ goto out_release_sock;
+ }
+
+ if (inet->inet_rcv_saddr)
+ sk->sk_userlocks |= SOCK_BINDADDR_LOCK;
+ if (snum)
+ sk->sk_userlocks |= SOCK_BINDPORT_LOCK;
+ inet->inet_sport = htons(inet->inet_num);
+ inet->inet_daddr = 0;
+ inet->inet_dport = 0;
+ sk_dst_reset(sk);
+ err = 0;
+out_release_sock:
+ release_sock(sk);
+out:
+ return err;
+}
+EXPORT_SYMBOL(inet_bind);
+
+int inet_dgram_connect(struct socket *sock, struct sockaddr *uaddr,
+ int addr_len, int flags)
+{
+ struct sock *sk = sock->sk;
+
+ if (addr_len < sizeof(uaddr->sa_family))
+ return -EINVAL;
+ if (uaddr->sa_family == AF_UNSPEC)
+ return sk->sk_prot->disconnect(sk, flags);
+
+ if (!inet_sk(sk)->inet_num && inet_autobind(sk))
+ return -EAGAIN;
+ return sk->sk_prot->connect(sk, uaddr, addr_len);
+}
+EXPORT_SYMBOL(inet_dgram_connect);
+
+static long inet_wait_for_connect(struct sock *sk, long timeo, int writebias)
+{
+ DEFINE_WAIT(wait);
+
+ prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+ sk->sk_write_pending += writebias;
+
+ /* Basic assumption: if someone sets sk->sk_err, he _must_
+ * change state of the socket from TCP_SYN_*.
+ * Connect() does not allow to get error notifications
+ * without closing the socket.
+ */
+ while ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
+ release_sock(sk);
+ timeo = schedule_timeout(timeo);
+ lock_sock(sk);
+ if (signal_pending(current) || !timeo)
+ break;
+ prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+ }
+ finish_wait(sk_sleep(sk), &wait);
+ sk->sk_write_pending -= writebias;
+ return timeo;
+}
+
+/*
+ * Connect to a remote host. There is regrettably still a little
+ * TCP 'magic' in here.
+ */
+int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
+ int addr_len, int flags)
+{
+ struct sock *sk = sock->sk;
+ int err;
+ long timeo;
+
+ if (addr_len < sizeof(uaddr->sa_family))
+ return -EINVAL;
+
+ if (uaddr->sa_family == AF_UNSPEC) {
+ err = sk->sk_prot->disconnect(sk, flags);
+ sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED;
+ goto out;
+ }
+
+ switch (sock->state) {
+ default:
+ err = -EINVAL;
+ goto out;
+ case SS_CONNECTED:
+ err = -EISCONN;
+ goto out;
+ case SS_CONNECTING:
+ err = -EALREADY;
+ /* Fall out of switch with err, set for this state */
+ break;
+ case SS_UNCONNECTED:
+ err = -EISCONN;
+ if (sk->sk_state != TCP_CLOSE)
+ goto out;
+
+ err = sk->sk_prot->connect(sk, uaddr, addr_len);
+ if (err < 0)
+ goto out;
+
+ sock->state = SS_CONNECTING;
+
+ /* Just entered SS_CONNECTING state; the only
+ * difference is that return value in non-blocking
+ * case is EINPROGRESS, rather than EALREADY.
+ */
+ err = -EINPROGRESS;
+ break;
+ }
+
+ timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
+
+ if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
+ int writebias = (sk->sk_protocol == IPPROTO_TCP) &&
+ tcp_sk(sk)->fastopen_req &&
+ tcp_sk(sk)->fastopen_req->data ? 1 : 0;
+
+ /* Error code is set above */
+ if (!timeo || !inet_wait_for_connect(sk, timeo, writebias))
+ goto out;
+
+ err = sock_intr_errno(timeo);
+ if (signal_pending(current))
+ goto out;
+ }
+
+ /* Connection was closed by RST, timeout, ICMP error
+ * or another process disconnected us.
+ */
+ if (sk->sk_state == TCP_CLOSE)
+ goto sock_error;
+
+ /* sk->sk_err may be not zero now, if RECVERR was ordered by user
+ * and error was received after socket entered established state.
+ * Hence, it is handled normally after connect() return successfully.
+ */
+
+ sock->state = SS_CONNECTED;
+ err = 0;
+out:
+ return err;
+
+sock_error:
+ err = sock_error(sk) ? : -ECONNABORTED;
+ sock->state = SS_UNCONNECTED;
+ if (sk->sk_prot->disconnect(sk, flags))
+ sock->state = SS_DISCONNECTING;
+ goto out;
+}
+EXPORT_SYMBOL(__inet_stream_connect);
+
+int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
+ int addr_len, int flags)
+{
+ int err;
+
+ lock_sock(sock->sk);
+ err = __inet_stream_connect(sock, uaddr, addr_len, flags);
+ release_sock(sock->sk);
+ return err;
+}
+EXPORT_SYMBOL(inet_stream_connect);
+
+/*
+ * Accept a pending connection. The TCP layer now gives BSD semantics.
+ */
+
+int inet_accept(struct socket *sock, struct socket *newsock, int flags)
+{
+ struct sock *sk1 = sock->sk;
+ int err = -EINVAL;
+ struct sock *sk2 = sk1->sk_prot->accept(sk1, flags, &err);
+
+ if (!sk2)
+ goto do_err;
+
+ lock_sock(sk2);
+
+ sock_rps_record_flow(sk2);
+ WARN_ON(!((1 << sk2->sk_state) &
+ (TCPF_ESTABLISHED | TCPF_SYN_RECV |
+ TCPF_CLOSE_WAIT | TCPF_CLOSE)));
+
+ sock_graft(sk2, newsock);
+
+ newsock->state = SS_CONNECTED;
+ err = 0;
+ release_sock(sk2);
+do_err:
+ return err;
+}
+EXPORT_SYMBOL(inet_accept);
+
+
+/*
+ * This does both peername and sockname.
+ */
+int inet_getname(struct socket *sock, struct sockaddr *uaddr,
+ int *uaddr_len, int peer)
+{
+ struct sock *sk = sock->sk;
+ struct inet_sock *inet = inet_sk(sk);
+ DECLARE_SOCKADDR(struct sockaddr_in *, sin, uaddr);
+
+ sin->sin_family = AF_INET;
+ if (peer) {
+ if (!inet->inet_dport ||
+ (((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_SYN_SENT)) &&
+ peer == 1))
+ return -ENOTCONN;
+ sin->sin_port = inet->inet_dport;
+ sin->sin_addr.s_addr = inet->inet_daddr;
+ } else {
+ __be32 addr = inet->inet_rcv_saddr;
+ if (!addr)
+ addr = inet->inet_saddr;
+ sin->sin_port = inet->inet_sport;
+ sin->sin_addr.s_addr = addr;
+ }
+ memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
+ *uaddr_len = sizeof(*sin);
+ return 0;
+}
+EXPORT_SYMBOL(inet_getname);
+
+int inet_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
+{
+ struct sock *sk = sock->sk;
+
+ sock_rps_record_flow(sk);
+
+ /* We may need to bind the socket. */
+ if (!inet_sk(sk)->inet_num && !sk->sk_prot->no_autobind &&
+ inet_autobind(sk))
+ return -EAGAIN;
+
+ return sk->sk_prot->sendmsg(sk, msg, size);
+}
+EXPORT_SYMBOL(inet_sendmsg);
+
+ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset,
+ size_t size, int flags)
+{
+ struct sock *sk = sock->sk;
+
+ sock_rps_record_flow(sk);
+
+ /* We may need to bind the socket. */
+ if (!inet_sk(sk)->inet_num && !sk->sk_prot->no_autobind &&
+ inet_autobind(sk))
+ return -EAGAIN;
+
+ if (sk->sk_prot->sendpage)
+ return sk->sk_prot->sendpage(sk, page, offset, size, flags);
+ return sock_no_sendpage(sock, page, offset, size, flags);
+}
+EXPORT_SYMBOL(inet_sendpage);
+
+int inet_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
+ int flags)
+{
+ struct sock *sk = sock->sk;
+ int addr_len = 0;
+ int err;
+
+ sock_rps_record_flow(sk);
+
+ err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
+ flags & ~MSG_DONTWAIT, &addr_len);
+ if (err >= 0)
+ msg->msg_namelen = addr_len;
+ return err;
+}
+EXPORT_SYMBOL(inet_recvmsg);
+
+int inet_shutdown(struct socket *sock, int how)
+{
+ struct sock *sk = sock->sk;
+ int err = 0;
+
+ /* This should really check to make sure
+ * the socket is a TCP socket. (WHY AC...)
+ */
+ how++; /* maps 0->1 has the advantage of making bit 1 rcvs and
+ 1->2 bit 2 snds.
+ 2->3 */
+ if ((how & ~SHUTDOWN_MASK) || !how) /* MAXINT->0 */
+ return -EINVAL;
+
+ lock_sock(sk);
+ if (sock->state == SS_CONNECTING) {
+ if ((1 << sk->sk_state) &
+ (TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_CLOSE))
+ sock->state = SS_DISCONNECTING;
+ else
+ sock->state = SS_CONNECTED;
+ }
+
+ switch (sk->sk_state) {
+ case TCP_CLOSE:
+ err = -ENOTCONN;
+ /* Hack to wake up other listeners, who can poll for
+ POLLHUP, even on eg. unconnected UDP sockets -- RR */
+ default:
+ sk->sk_shutdown |= how;
+ if (sk->sk_prot->shutdown)
+ sk->sk_prot->shutdown(sk, how);
+ break;
+
+ /* Remaining two branches are temporary solution for missing
+ * close() in multithreaded environment. It is _not_ a good idea,
+ * but we have no choice until close() is repaired at VFS level.
+ */
+ case TCP_LISTEN:
+ if (!(how & RCV_SHUTDOWN))
+ break;
+ /* Fall through */
+ case TCP_SYN_SENT:
+ err = sk->sk_prot->disconnect(sk, O_NONBLOCK);
+ sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED;
+ break;
+ }
+
+ /* Wake up anyone sleeping in poll. */
+ sk->sk_state_change(sk);
+ release_sock(sk);
+ return err;
+}
+EXPORT_SYMBOL(inet_shutdown);
+
+/*
+ * ioctl() calls you can issue on an INET socket. Most of these are
+ * device configuration and stuff and very rarely used. Some ioctls
+ * pass on to the socket itself.
+ *
+ * NOTE: I like the idea of a module for the config stuff. ie ifconfig
+ * loads the devconfigure module does its configuring and unloads it.
+ * There's a good 20K of config code hanging around the kernel.
+ */
+
+int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+ struct sock *sk = sock->sk;
+ int err = 0;
+ struct net *net = sock_net(sk);
+
+ switch (cmd) {
+ case SIOCGSTAMP:
+ err = sock_get_timestamp(sk, (struct timeval __user *)arg);
+ break;
+ case SIOCGSTAMPNS:
+ err = sock_get_timestampns(sk, (struct timespec __user *)arg);
+ break;
+ case SIOCADDRT:
+ case SIOCDELRT:
+ case SIOCRTMSG:
+ err = ip_rt_ioctl(net, cmd, (void __user *)arg);
+ break;
+ case SIOCDARP:
+ case SIOCGARP:
+ case SIOCSARP:
+ err = arp_ioctl(net, cmd, (void __user *)arg);
+ break;
+ case SIOCGIFADDR:
+ case SIOCSIFADDR:
+ case SIOCGIFBRDADDR:
+ case SIOCSIFBRDADDR:
+ case SIOCGIFNETMASK:
+ case SIOCSIFNETMASK:
+ case SIOCGIFDSTADDR:
+ case SIOCSIFDSTADDR:
+ case SIOCSIFPFLAGS:
+ case SIOCGIFPFLAGS:
+ case SIOCSIFFLAGS:
+ err = devinet_ioctl(net, cmd, (void __user *)arg);
+ break;
+ default:
+ if (sk->sk_prot->ioctl)
+ err = sk->sk_prot->ioctl(sk, cmd, arg);
+ else
+ err = -ENOIOCTLCMD;
+ break;
+ }
+ return err;
+}
+EXPORT_SYMBOL(inet_ioctl);
+
+#ifdef CONFIG_COMPAT
+static int inet_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+ struct sock *sk = sock->sk;
+ int err = -ENOIOCTLCMD;
+
+ if (sk->sk_prot->compat_ioctl)
+ err = sk->sk_prot->compat_ioctl(sk, cmd, arg);
+
+ return err;
+}
+#endif
+
+const struct proto_ops inet_stream_ops = {
+ .family = PF_INET,
+ .owner = THIS_MODULE,
+ .release = inet_release,
+ .bind = inet_bind,
+ .connect = inet_stream_connect,
+ .socketpair = sock_no_socketpair,
+ .accept = inet_accept,
+ .getname = inet_getname,
+ .poll = tcp_poll,
+ .ioctl = inet_ioctl,
+ .listen = inet_listen,
+ .shutdown = inet_shutdown,
+ .setsockopt = sock_common_setsockopt,
+ .getsockopt = sock_common_getsockopt,
+ .sendmsg = inet_sendmsg,
+ .recvmsg = inet_recvmsg,
+ .mmap = sock_no_mmap,
+ .sendpage = inet_sendpage,
+ .splice_read = tcp_splice_read,
+#ifdef CONFIG_COMPAT
+ .compat_setsockopt = compat_sock_common_setsockopt,
+ .compat_getsockopt = compat_sock_common_getsockopt,
+ .compat_ioctl = inet_compat_ioctl,
+#endif
+};
+EXPORT_SYMBOL(inet_stream_ops);
+
+const struct proto_ops inet_dgram_ops = {
+ .family = PF_INET,
+ .owner = THIS_MODULE,
+ .release = inet_release,
+ .bind = inet_bind,
+ .connect = inet_dgram_connect,
+ .socketpair = sock_no_socketpair,
+ .accept = sock_no_accept,
+ .getname = inet_getname,
+ .poll = udp_poll,
+ .ioctl = inet_ioctl,
+ .listen = sock_no_listen,
+ .shutdown = inet_shutdown,
+ .setsockopt = sock_common_setsockopt,
+ .getsockopt = sock_common_getsockopt,
+ .sendmsg = inet_sendmsg,
+ .recvmsg = inet_recvmsg,
+ .mmap = sock_no_mmap,
+ .sendpage = inet_sendpage,
+#ifdef CONFIG_COMPAT
+ .compat_setsockopt = compat_sock_common_setsockopt,
+ .compat_getsockopt = compat_sock_common_getsockopt,
+ .compat_ioctl = inet_compat_ioctl,
+#endif
+};
+EXPORT_SYMBOL(inet_dgram_ops);
+
+/*
+ * For SOCK_RAW sockets; should be the same as inet_dgram_ops but without
+ * udp_poll
+ */
+static const struct proto_ops inet_sockraw_ops = {
+ .family = PF_INET,
+ .owner = THIS_MODULE,
+ .release = inet_release,
+ .bind = inet_bind,
+ .connect = inet_dgram_connect,
+ .socketpair = sock_no_socketpair,
+ .accept = sock_no_accept,
+ .getname = inet_getname,
+ .poll = datagram_poll,
+ .ioctl = inet_ioctl,
+ .listen = sock_no_listen,
+ .shutdown = inet_shutdown,
+ .setsockopt = sock_common_setsockopt,
+ .getsockopt = sock_common_getsockopt,
+ .sendmsg = inet_sendmsg,
+ .recvmsg = inet_recvmsg,
+ .mmap = sock_no_mmap,
+ .sendpage = inet_sendpage,
+#ifdef CONFIG_COMPAT
+ .compat_setsockopt = compat_sock_common_setsockopt,
+ .compat_getsockopt = compat_sock_common_getsockopt,
+ .compat_ioctl = inet_compat_ioctl,
+#endif
+};
+
+static const struct net_proto_family inet_family_ops = {
+ .family = PF_INET,
+ .create = inet_create,
+ .owner = THIS_MODULE,
+};
+
+/* Upon startup we insert all the elements in inetsw_array[] into
+ * the linked list inetsw.
+ */
+static struct inet_protosw inetsw_array[] =
+{
+ {
+ .type = SOCK_STREAM,
+ .protocol = IPPROTO_TCP,
+ .prot = &tcp_prot,
+ .ops = &inet_stream_ops,
+ .flags = INET_PROTOSW_PERMANENT |
+ INET_PROTOSW_ICSK,
+ },
+
+ {
+ .type = SOCK_DGRAM,
+ .protocol = IPPROTO_UDP,
+ .prot = &udp_prot,
+ .ops = &inet_dgram_ops,
+ .flags = INET_PROTOSW_PERMANENT,
+ },
+
+ {
+ .type = SOCK_DGRAM,
+ .protocol = IPPROTO_ICMP,
+ .prot = &ping_prot,
+ .ops = &inet_dgram_ops,
+ .flags = INET_PROTOSW_REUSE,
+ },
+
+ {
+ .type = SOCK_RAW,
+ .protocol = IPPROTO_IP, /* wild card */
+ .prot = &raw_prot,
+ .ops = &inet_sockraw_ops,
+ .flags = INET_PROTOSW_REUSE,
+ }
+};
+
+#define INETSW_ARRAY_LEN ARRAY_SIZE(inetsw_array)
+
+void inet_register_protosw(struct inet_protosw *p)
+{
+ struct list_head *lh;
+ struct inet_protosw *answer;
+ int protocol = p->protocol;
+ struct list_head *last_perm;
+
+ spin_lock_bh(&inetsw_lock);
+
+ if (p->type >= SOCK_MAX)
+ goto out_illegal;
+
+ /* If we are trying to override a permanent protocol, bail. */
+ answer = NULL;
+ last_perm = &inetsw[p->type];
+ list_for_each(lh, &inetsw[p->type]) {
+ answer = list_entry(lh, struct inet_protosw, list);
+
+ /* Check only the non-wild match. */
+ if (INET_PROTOSW_PERMANENT & answer->flags) {
+ if (protocol == answer->protocol)
+ break;
+ last_perm = lh;
+ }
+
+ answer = NULL;
+ }
+ if (answer)
+ goto out_permanent;
+
+ /* Add the new entry after the last permanent entry if any, so that
+ * the new entry does not override a permanent entry when matched with
+ * a wild-card protocol. But it is allowed to override any existing
+ * non-permanent entry. This means that when we remove this entry, the
+ * system automatically returns to the old behavior.
+ */
+ list_add_rcu(&p->list, last_perm);
+out:
+ spin_unlock_bh(&inetsw_lock);
+
+ return;
+
+out_permanent:
+ pr_err("Attempt to override permanent protocol %d\n", protocol);
+ goto out;
+
+out_illegal:
+ pr_err("Ignoring attempt to register invalid socket type %d\n",
+ p->type);
+ goto out;
+}
+EXPORT_SYMBOL(inet_register_protosw);
+
+void inet_unregister_protosw(struct inet_protosw *p)
+{
+ if (INET_PROTOSW_PERMANENT & p->flags) {
+ pr_err("Attempt to unregister permanent protocol %d\n",
+ p->protocol);
+ } else {
+ spin_lock_bh(&inetsw_lock);
+ list_del_rcu(&p->list);
+ spin_unlock_bh(&inetsw_lock);
+
+ synchronize_net();
+ }
+}
+EXPORT_SYMBOL(inet_unregister_protosw);
+
+/*
+ * Shall we try to damage output packets if routing dev changes?
+ */
+
+int sysctl_ip_dynaddr __read_mostly;
+
+static int inet_sk_reselect_saddr(struct sock *sk)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ __be32 old_saddr = inet->inet_saddr;
+ __be32 daddr = inet->inet_daddr;
+ struct flowi4 *fl4;
+ struct rtable *rt;
+ __be32 new_saddr;
+ struct ip_options_rcu *inet_opt;
+
+ inet_opt = rcu_dereference_protected(inet->inet_opt,
+ sock_owned_by_user(sk));
+ if (inet_opt && inet_opt->opt.srr)
+ daddr = inet_opt->opt.faddr;
+
+ /* Query new route. */
+ fl4 = &inet->cork.fl.u.ip4;
+ rt = ip_route_connect(fl4, daddr, 0, RT_CONN_FLAGS(sk),
+ sk->sk_bound_dev_if, sk->sk_protocol,
+ inet->inet_sport, inet->inet_dport, sk);
+ if (IS_ERR(rt))
+ return PTR_ERR(rt);
+
+ sk_setup_caps(sk, &rt->dst);
+
+ new_saddr = fl4->saddr;
+
+ if (new_saddr == old_saddr)
+ return 0;
+
+ if (sysctl_ip_dynaddr > 1) {
+ pr_info("%s(): shifting inet->saddr from %pI4 to %pI4\n",
+ __func__, &old_saddr, &new_saddr);
+ }
+
+ inet->inet_saddr = inet->inet_rcv_saddr = new_saddr;
+
+ /*
+ * XXX The only one ugly spot where we need to
+ * XXX really change the sockets identity after
+ * XXX it has entered the hashes. -DaveM
+ *
+ * Besides that, it does not check for connection
+ * uniqueness. Wait for troubles.
+ */
+ __sk_prot_rehash(sk);
+ return 0;
+}
+
+int inet_sk_rebuild_header(struct sock *sk)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0);
+ __be32 daddr;
+ struct ip_options_rcu *inet_opt;
+ struct flowi4 *fl4;
+ int err;
+
+ /* Route is OK, nothing to do. */
+ if (rt)
+ return 0;
+
+ /* Reroute. */
+ rcu_read_lock();
+ inet_opt = rcu_dereference(inet->inet_opt);
+ daddr = inet->inet_daddr;
+ if (inet_opt && inet_opt->opt.srr)
+ daddr = inet_opt->opt.faddr;
+ rcu_read_unlock();
+ fl4 = &inet->cork.fl.u.ip4;
+ rt = ip_route_output_ports(sock_net(sk), fl4, sk, daddr, inet->inet_saddr,
+ inet->inet_dport, inet->inet_sport,
+ sk->sk_protocol, RT_CONN_FLAGS(sk),
+ sk->sk_bound_dev_if);
+ if (!IS_ERR(rt)) {
+ err = 0;
+ sk_setup_caps(sk, &rt->dst);
+ } else {
+ err = PTR_ERR(rt);
+
+ /* Routing failed... */
+ sk->sk_route_caps = 0;
+ /*
+ * Other protocols have to map its equivalent state to TCP_SYN_SENT.
+ * DCCP maps its DCCP_REQUESTING state to TCP_SYN_SENT. -acme
+ */
+ if (!sysctl_ip_dynaddr ||
+ sk->sk_state != TCP_SYN_SENT ||
+ (sk->sk_userlocks & SOCK_BINDADDR_LOCK) ||
+ (err = inet_sk_reselect_saddr(sk)) != 0)
+ sk->sk_err_soft = -err;
+ }
+
+ return err;
+}
+EXPORT_SYMBOL(inet_sk_rebuild_header);
+
+static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
+ netdev_features_t features)
+{
+ struct sk_buff *segs = ERR_PTR(-EINVAL);
+ const struct net_offload *ops;
+ unsigned int offset = 0;
+ bool udpfrag, encap;
+ struct iphdr *iph;
+ int proto;
+ int nhoff;
+ int ihl;
+ int id;
+
+ if (unlikely(skb_shinfo(skb)->gso_type &
+ ~(SKB_GSO_TCPV4 |
+ SKB_GSO_UDP |
+ SKB_GSO_DODGY |
+ SKB_GSO_TCP_ECN |
+ SKB_GSO_GRE |
+ SKB_GSO_GRE_CSUM |
+ SKB_GSO_IPIP |
+ SKB_GSO_SIT |
+ SKB_GSO_TCPV6 |
+ SKB_GSO_UDP_TUNNEL |
+ SKB_GSO_UDP_TUNNEL_CSUM |
+ SKB_GSO_TUNNEL_REMCSUM |
+ 0)))
+ goto out;
+
+ skb_reset_network_header(skb);
+ nhoff = skb_network_header(skb) - skb_mac_header(skb);
+ if (unlikely(!pskb_may_pull(skb, sizeof(*iph))))
+ goto out;
+
+ iph = ip_hdr(skb);
+ ihl = iph->ihl * 4;
+ if (ihl < sizeof(*iph))
+ goto out;
+
+ id = ntohs(iph->id);
+ proto = iph->protocol;
+
+ /* Warning: after this point, iph might be no longer valid */
+ if (unlikely(!pskb_may_pull(skb, ihl)))
+ goto out;
+ __skb_pull(skb, ihl);
+
+ encap = SKB_GSO_CB(skb)->encap_level > 0;
+ if (encap)
+ features &= skb->dev->hw_enc_features;
+ SKB_GSO_CB(skb)->encap_level += ihl;
+
+ skb_reset_transport_header(skb);
+
+ segs = ERR_PTR(-EPROTONOSUPPORT);
+
+ if (skb->encapsulation &&
+ skb_shinfo(skb)->gso_type & (SKB_GSO_SIT|SKB_GSO_IPIP))
+ udpfrag = proto == IPPROTO_UDP && encap;
+ else
+ udpfrag = proto == IPPROTO_UDP && !skb->encapsulation;
+
+ ops = rcu_dereference(inet_offloads[proto]);
+ if (likely(ops && ops->callbacks.gso_segment))
+ segs = ops->callbacks.gso_segment(skb, features);
+
+ if (IS_ERR_OR_NULL(segs))
+ goto out;
+
+ skb = segs;
+ do {
+ iph = (struct iphdr *)(skb_mac_header(skb) + nhoff);
+ if (udpfrag) {
+ iph->id = htons(id);
+ iph->frag_off = htons(offset >> 3);
+ if (skb->next)
+ iph->frag_off |= htons(IP_MF);
+ offset += skb->len - nhoff - ihl;
+ } else {
+ iph->id = htons(id++);
+ }
+ iph->tot_len = htons(skb->len - nhoff);
+ ip_send_check(iph);
+ if (encap)
+ skb_reset_inner_headers(skb);
+ skb->network_header = (u8 *)iph - skb->head;
+ } while ((skb = skb->next));
+
+out:
+ return segs;
+}
+
+static struct sk_buff **inet_gro_receive(struct sk_buff **head,
+ struct sk_buff *skb)
+{
+ const struct net_offload *ops;
+ struct sk_buff **pp = NULL;
+ struct sk_buff *p;
+ const struct iphdr *iph;
+ unsigned int hlen;
+ unsigned int off;
+ unsigned int id;
+ int flush = 1;
+ int proto;
+
+ off = skb_gro_offset(skb);
+ hlen = off + sizeof(*iph);
+ iph = skb_gro_header_fast(skb, off);
+ if (skb_gro_header_hard(skb, hlen)) {
+ iph = skb_gro_header_slow(skb, hlen, off);
+ if (unlikely(!iph))
+ goto out;
+ }
+
+ proto = iph->protocol;
+
+ rcu_read_lock();
+ ops = rcu_dereference(inet_offloads[proto]);
+ if (!ops || !ops->callbacks.gro_receive)
+ goto out_unlock;
+
+ if (*(u8 *)iph != 0x45)
+ goto out_unlock;
+
+ if (unlikely(ip_fast_csum((u8 *)iph, 5)))
+ goto out_unlock;
+
+ id = ntohl(*(__be32 *)&iph->id);
+ flush = (u16)((ntohl(*(__be32 *)iph) ^ skb_gro_len(skb)) | (id & ~IP_DF));
+ id >>= 16;
+
+ for (p = *head; p; p = p->next) {
+ struct iphdr *iph2;
+
+ if (!NAPI_GRO_CB(p)->same_flow)
+ continue;
+
+ iph2 = (struct iphdr *)(p->data + off);
+ /* The above works because, with the exception of the top
+ * (inner most) layer, we only aggregate pkts with the same
+ * hdr length so all the hdrs we'll need to verify will start
+ * at the same offset.
+ */
+ if ((iph->protocol ^ iph2->protocol) |
+ ((__force u32)iph->saddr ^ (__force u32)iph2->saddr) |
+ ((__force u32)iph->daddr ^ (__force u32)iph2->daddr)) {
+ NAPI_GRO_CB(p)->same_flow = 0;
+ continue;
+ }
+
+ /* All fields must match except length and checksum. */
+ NAPI_GRO_CB(p)->flush |=
+ (iph->ttl ^ iph2->ttl) |
+ (iph->tos ^ iph2->tos) |
+ ((iph->frag_off ^ iph2->frag_off) & htons(IP_DF));
+
+ /* Save the IP ID check to be included later when we get to
+ * the transport layer so only the inner most IP ID is checked.
+ * This is because some GSO/TSO implementations do not
+ * correctly increment the IP ID for the outer hdrs.
+ */
+ NAPI_GRO_CB(p)->flush_id =
+ ((u16)(ntohs(iph2->id) + NAPI_GRO_CB(p)->count) ^ id);
+ NAPI_GRO_CB(p)->flush |= flush;
+ }
+
+ NAPI_GRO_CB(skb)->flush |= flush;
+ skb_set_network_header(skb, off);
+ /* The above will be needed by the transport layer if there is one
+ * immediately following this IP hdr.
+ */
+
+ /* Note : No need to call skb_gro_postpull_rcsum() here,
+ * as we already checked checksum over ipv4 header was 0
+ */
+ skb_gro_pull(skb, sizeof(*iph));
+ skb_set_transport_header(skb, skb_gro_offset(skb));
+
+ pp = ops->callbacks.gro_receive(head, skb);
+
+out_unlock:
+ rcu_read_unlock();
+
+out:
+ NAPI_GRO_CB(skb)->flush |= flush;
+
+ return pp;
+}
+
+int inet_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len)
+{
+ if (sk->sk_family == AF_INET)
+ return ip_recv_error(sk, msg, len, addr_len);
+#if IS_ENABLED(CONFIG_IPV6)
+ if (sk->sk_family == AF_INET6)
+ return pingv6_ops.ipv6_recv_error(sk, msg, len, addr_len);
+#endif
+ return -EINVAL;
+}
+
+static int inet_gro_complete(struct sk_buff *skb, int nhoff)
+{
+ __be16 newlen = htons(skb->len - nhoff);
+ struct iphdr *iph = (struct iphdr *)(skb->data + nhoff);
+ const struct net_offload *ops;
+ int proto = iph->protocol;
+ int err = -ENOSYS;
+
+ if (skb->encapsulation)
+ skb_set_inner_network_header(skb, nhoff);
+
+ csum_replace2(&iph->check, iph->tot_len, newlen);
+ iph->tot_len = newlen;
+
+ rcu_read_lock();
+ ops = rcu_dereference(inet_offloads[proto]);
+ if (WARN_ON(!ops || !ops->callbacks.gro_complete))
+ goto out_unlock;
+
+ /* Only need to add sizeof(*iph) to get to the next hdr below
+ * because any hdr with option will have been flushed in
+ * inet_gro_receive().
+ */
+ err = ops->callbacks.gro_complete(skb, nhoff + sizeof(*iph));
+
+out_unlock:
+ rcu_read_unlock();
+
+ return err;
+}
+
+int inet_ctl_sock_create(struct sock **sk, unsigned short family,
+ unsigned short type, unsigned char protocol,
+ struct net *net)
+{
+ struct socket *sock;
+ int rc = sock_create_kern(family, type, protocol, &sock);
+
+ if (rc == 0) {
+ *sk = sock->sk;
+ (*sk)->sk_allocation = GFP_ATOMIC;
+ /*
+ * Unhash it so that IP input processing does not even see it,
+ * we do not wish this socket to see incoming packets.
+ */
+ (*sk)->sk_prot->unhash(*sk);
+
+ sk_change_net(*sk, net);
+ }
+ return rc;
+}
+EXPORT_SYMBOL_GPL(inet_ctl_sock_create);
+
+unsigned long snmp_fold_field(void __percpu *mib, int offt)
+{
+ unsigned long res = 0;
+ int i;
+
+ for_each_possible_cpu(i)
+ res += *(((unsigned long *) per_cpu_ptr(mib, i)) + offt);
+ return res;
+}
+EXPORT_SYMBOL_GPL(snmp_fold_field);
+
+#if BITS_PER_LONG==32
+
+u64 snmp_fold_field64(void __percpu *mib, int offt, size_t syncp_offset)
+{
+ u64 res = 0;
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ void *bhptr;
+ struct u64_stats_sync *syncp;
+ u64 v;
+ unsigned int start;
+
+ bhptr = per_cpu_ptr(mib, cpu);
+ syncp = (struct u64_stats_sync *)(bhptr + syncp_offset);
+ do {
+ start = u64_stats_fetch_begin_irq(syncp);
+ v = *(((u64 *) bhptr) + offt);
+ } while (u64_stats_fetch_retry_irq(syncp, start));
+
+ res += v;
+ }
+ return res;
+}
+EXPORT_SYMBOL_GPL(snmp_fold_field64);
+#endif
+
+#ifdef CONFIG_IP_MULTICAST
+static const struct net_protocol igmp_protocol = {
+ .handler = igmp_rcv,
+ .netns_ok = 1,
+};
+#endif
+
+static const struct net_protocol tcp_protocol = {
+ .early_demux = tcp_v4_early_demux,
+ .handler = tcp_v4_rcv,
+ .err_handler = tcp_v4_err,
+ .no_policy = 1,
+ .netns_ok = 1,
+ .icmp_strict_tag_validation = 1,
+};
+
+static const struct net_protocol udp_protocol = {
+ .early_demux = udp_v4_early_demux,
+ .handler = udp_rcv,
+ .err_handler = udp_err,
+ .no_policy = 1,
+ .netns_ok = 1,
+};
+
+static const struct net_protocol icmp_protocol = {
+ .handler = icmp_rcv,
+ .err_handler = icmp_err,
+ .no_policy = 1,
+ .netns_ok = 1,
+};
+
+static __net_init int ipv4_mib_init_net(struct net *net)
+{
+ int i;
+
+ net->mib.tcp_statistics = alloc_percpu(struct tcp_mib);
+ if (!net->mib.tcp_statistics)
+ goto err_tcp_mib;
+ net->mib.ip_statistics = alloc_percpu(struct ipstats_mib);
+ if (!net->mib.ip_statistics)
+ goto err_ip_mib;
+
+ for_each_possible_cpu(i) {
+ struct ipstats_mib *af_inet_stats;
+ af_inet_stats = per_cpu_ptr(net->mib.ip_statistics, i);
+ u64_stats_init(&af_inet_stats->syncp);
+ }
+
+ net->mib.net_statistics = alloc_percpu(struct linux_mib);
+ if (!net->mib.net_statistics)
+ goto err_net_mib;
+ net->mib.udp_statistics = alloc_percpu(struct udp_mib);
+ if (!net->mib.udp_statistics)
+ goto err_udp_mib;
+ net->mib.udplite_statistics = alloc_percpu(struct udp_mib);
+ if (!net->mib.udplite_statistics)
+ goto err_udplite_mib;
+ net->mib.icmp_statistics = alloc_percpu(struct icmp_mib);
+ if (!net->mib.icmp_statistics)
+ goto err_icmp_mib;
+ net->mib.icmpmsg_statistics = kzalloc(sizeof(struct icmpmsg_mib),
+ GFP_KERNEL);
+ if (!net->mib.icmpmsg_statistics)
+ goto err_icmpmsg_mib;
+
+ tcp_mib_init(net);
+ return 0;
+
+err_icmpmsg_mib:
+ free_percpu(net->mib.icmp_statistics);
+err_icmp_mib:
+ free_percpu(net->mib.udplite_statistics);
+err_udplite_mib:
+ free_percpu(net->mib.udp_statistics);
+err_udp_mib:
+ free_percpu(net->mib.net_statistics);
+err_net_mib:
+ free_percpu(net->mib.ip_statistics);
+err_ip_mib:
+ free_percpu(net->mib.tcp_statistics);
+err_tcp_mib:
+ return -ENOMEM;
+}
+
+static __net_exit void ipv4_mib_exit_net(struct net *net)
+{
+ kfree(net->mib.icmpmsg_statistics);
+ free_percpu(net->mib.icmp_statistics);
+ free_percpu(net->mib.udplite_statistics);
+ free_percpu(net->mib.udp_statistics);
+ free_percpu(net->mib.net_statistics);
+ free_percpu(net->mib.ip_statistics);
+ free_percpu(net->mib.tcp_statistics);
+}
+
+static __net_initdata struct pernet_operations ipv4_mib_ops = {
+ .init = ipv4_mib_init_net,
+ .exit = ipv4_mib_exit_net,
+};
+
+static int __init init_ipv4_mibs(void)
+{
+ return register_pernet_subsys(&ipv4_mib_ops);
+}
+
+static __net_init int inet_init_net(struct net *net)
+{
+ /*
+ * Set defaults for local port range
+ */
+ seqlock_init(&net->ipv4.ip_local_ports.lock);
+ net->ipv4.ip_local_ports.range[0] = 32768;
+ net->ipv4.ip_local_ports.range[1] = 61000;
+
+ seqlock_init(&net->ipv4.ping_group_range.lock);
+ /*
+ * Sane defaults - nobody may create ping sockets.
+ * Boot scripts should set this to distro-specific group.
+ */
+ net->ipv4.ping_group_range.range[0] = make_kgid(&init_user_ns, 1);
+ net->ipv4.ping_group_range.range[1] = make_kgid(&init_user_ns, 0);
+ return 0;
+}
+
+static __net_exit void inet_exit_net(struct net *net)
+{
+}
+
+static __net_initdata struct pernet_operations af_inet_ops = {
+ .init = inet_init_net,
+ .exit = inet_exit_net,
+};
+
+static int __init init_inet_pernet_ops(void)
+{
+ return register_pernet_subsys(&af_inet_ops);
+}
+
+static int ipv4_proc_init(void);
+
+/*
+ * IP protocol layer initialiser
+ */
+
+static struct packet_offload ip_packet_offload __read_mostly = {
+ .type = cpu_to_be16(ETH_P_IP),
+ .callbacks = {
+ .gso_segment = inet_gso_segment,
+ .gro_receive = inet_gro_receive,
+ .gro_complete = inet_gro_complete,
+ },
+};
+
+static const struct net_offload ipip_offload = {
+ .callbacks = {
+ .gso_segment = inet_gso_segment,
+ .gro_receive = inet_gro_receive,
+ .gro_complete = inet_gro_complete,
+ },
+};
+
+static int __init ipv4_offload_init(void)
+{
+ /*
+ * Add offloads
+ */
+ if (udpv4_offload_init() < 0)
+ pr_crit("%s: Cannot add UDP protocol offload\n", __func__);
+ if (tcpv4_offload_init() < 0)
+ pr_crit("%s: Cannot add TCP protocol offload\n", __func__);
+
+ dev_add_offload(&ip_packet_offload);
+ inet_add_offload(&ipip_offload, IPPROTO_IPIP);
+ return 0;
+}
+
+fs_initcall(ipv4_offload_init);
+
+static struct packet_type ip_packet_type __read_mostly = {
+ .type = cpu_to_be16(ETH_P_IP),
+ .func = ip_rcv,
+};
+
+static int __init inet_init(void)
+{
+ struct inet_protosw *q;
+ struct list_head *r;
+ int rc = -EINVAL;
+
+ sock_skb_cb_check_size(sizeof(struct inet_skb_parm));
+
+ rc = proto_register(&tcp_prot, 1);
+ if (rc)
+ goto out;
+
+ rc = proto_register(&udp_prot, 1);
+ if (rc)
+ goto out_unregister_tcp_proto;
+
+ rc = proto_register(&raw_prot, 1);
+ if (rc)
+ goto out_unregister_udp_proto;
+
+ rc = proto_register(&ping_prot, 1);
+ if (rc)
+ goto out_unregister_raw_proto;
+
+ /*
+ * Tell SOCKET that we are alive...
+ */
+
+ (void)sock_register(&inet_family_ops);
+
+#ifdef CONFIG_SYSCTL
+ ip_static_sysctl_init();
+#endif
+
+ /*
+ * Add all the base protocols.
+ */
+
+ if (inet_add_protocol(&icmp_protocol, IPPROTO_ICMP) < 0)
+ pr_crit("%s: Cannot add ICMP protocol\n", __func__);
+ if (inet_add_protocol(&udp_protocol, IPPROTO_UDP) < 0)
+ pr_crit("%s: Cannot add UDP protocol\n", __func__);
+ if (inet_add_protocol(&tcp_protocol, IPPROTO_TCP) < 0)
+ pr_crit("%s: Cannot add TCP protocol\n", __func__);
+#ifdef CONFIG_IP_MULTICAST
+ if (inet_add_protocol(&igmp_protocol, IPPROTO_IGMP) < 0)
+ pr_crit("%s: Cannot add IGMP protocol\n", __func__);
+#endif
+
+ /* Register the socket-side information for inet_create. */
+ for (r = &inetsw[0]; r < &inetsw[SOCK_MAX]; ++r)
+ INIT_LIST_HEAD(r);
+
+ for (q = inetsw_array; q < &inetsw_array[INETSW_ARRAY_LEN]; ++q)
+ inet_register_protosw(q);
+
+ /*
+ * Set the ARP module up
+ */
+
+ arp_init();
+
+ /*
+ * Set the IP module up
+ */
+
+ ip_init();
+
+ tcp_v4_init();
+
+ /* Setup TCP slab cache for open requests. */
+ tcp_init();
+
+ /* Setup UDP memory threshold */
+ udp_init();
+
+ /* Add UDP-Lite (RFC 3828) */
+ udplite4_register();
+
+ ping_init();
+
+ /*
+ * Set the ICMP layer up
+ */
+
+ if (icmp_init() < 0)
+ panic("Failed to create the ICMP control socket.\n");
+
+ /*
+ * Initialise the multicast router
+ */
+#if defined(CONFIG_IP_MROUTE)
+ if (ip_mr_init())
+ pr_crit("%s: Cannot init ipv4 mroute\n", __func__);
+#endif
+
+ if (init_inet_pernet_ops())
+ pr_crit("%s: Cannot init ipv4 inet pernet ops\n", __func__);
+ /*
+ * Initialise per-cpu ipv4 mibs
+ */
+
+ if (init_ipv4_mibs())
+ pr_crit("%s: Cannot init ipv4 mibs\n", __func__);
+
+ ipv4_proc_init();
+
+ ipfrag_init();
+
+ dev_add_pack(&ip_packet_type);
+
+ rc = 0;
+out:
+ return rc;
+out_unregister_raw_proto:
+ proto_unregister(&raw_prot);
+out_unregister_udp_proto:
+ proto_unregister(&udp_prot);
+out_unregister_tcp_proto:
+ proto_unregister(&tcp_prot);
+ goto out;
+}
+
+fs_initcall(inet_init);
+
+/* ------------------------------------------------------------------------ */
+
+#ifdef CONFIG_PROC_FS
+static int __init ipv4_proc_init(void)
+{
+ int rc = 0;
+
+ if (raw_proc_init())
+ goto out_raw;
+ if (tcp4_proc_init())
+ goto out_tcp;
+ if (udp4_proc_init())
+ goto out_udp;
+ if (ping_proc_init())
+ goto out_ping;
+ if (ip_misc_proc_init())
+ goto out_misc;
+out:
+ return rc;
+out_misc:
+ ping_proc_exit();
+out_ping:
+ udp4_proc_exit();
+out_udp:
+ tcp4_proc_exit();
+out_tcp:
+ raw_proc_exit();
+out_raw:
+ rc = -ENOMEM;
+ goto out;
+}
+
+#else /* CONFIG_PROC_FS */
+static int __init ipv4_proc_init(void)
+{
+ return 0;
+}
+#endif /* CONFIG_PROC_FS */
+
+MODULE_ALIAS_NETPROTO(PF_INET);
+
diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c
new file mode 100644
index 000000000..ac9a32ec3
--- /dev/null
+++ b/net/ipv4/ah4.c
@@ -0,0 +1,589 @@
+#define pr_fmt(fmt) "IPsec: " fmt
+
+#include <crypto/hash.h>
+#include <linux/err.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <net/ip.h>
+#include <net/xfrm.h>
+#include <net/ah.h>
+#include <linux/crypto.h>
+#include <linux/pfkeyv2.h>
+#include <linux/scatterlist.h>
+#include <net/icmp.h>
+#include <net/protocol.h>
+
+struct ah_skb_cb {
+ struct xfrm_skb_cb xfrm;
+ void *tmp;
+};
+
+#define AH_SKB_CB(__skb) ((struct ah_skb_cb *)&((__skb)->cb[0]))
+
+static void *ah_alloc_tmp(struct crypto_ahash *ahash, int nfrags,
+ unsigned int size)
+{
+ unsigned int len;
+
+ len = size + crypto_ahash_digestsize(ahash) +
+ (crypto_ahash_alignmask(ahash) &
+ ~(crypto_tfm_ctx_alignment() - 1));
+
+ len = ALIGN(len, crypto_tfm_ctx_alignment());
+
+ len += sizeof(struct ahash_request) + crypto_ahash_reqsize(ahash);
+ len = ALIGN(len, __alignof__(struct scatterlist));
+
+ len += sizeof(struct scatterlist) * nfrags;
+
+ return kmalloc(len, GFP_ATOMIC);
+}
+
+static inline u8 *ah_tmp_auth(void *tmp, unsigned int offset)
+{
+ return tmp + offset;
+}
+
+static inline u8 *ah_tmp_icv(struct crypto_ahash *ahash, void *tmp,
+ unsigned int offset)
+{
+ return PTR_ALIGN((u8 *)tmp + offset, crypto_ahash_alignmask(ahash) + 1);
+}
+
+static inline struct ahash_request *ah_tmp_req(struct crypto_ahash *ahash,
+ u8 *icv)
+{
+ struct ahash_request *req;
+
+ req = (void *)PTR_ALIGN(icv + crypto_ahash_digestsize(ahash),
+ crypto_tfm_ctx_alignment());
+
+ ahash_request_set_tfm(req, ahash);
+
+ return req;
+}
+
+static inline struct scatterlist *ah_req_sg(struct crypto_ahash *ahash,
+ struct ahash_request *req)
+{
+ return (void *)ALIGN((unsigned long)(req + 1) +
+ crypto_ahash_reqsize(ahash),
+ __alignof__(struct scatterlist));
+}
+
+/* Clear mutable options and find final destination to substitute
+ * into IP header for icv calculation. Options are already checked
+ * for validity, so paranoia is not required. */
+
+static int ip_clear_mutable_options(const struct iphdr *iph, __be32 *daddr)
+{
+ unsigned char *optptr = (unsigned char *)(iph+1);
+ int l = iph->ihl*4 - sizeof(struct iphdr);
+ int optlen;
+
+ while (l > 0) {
+ switch (*optptr) {
+ case IPOPT_END:
+ return 0;
+ case IPOPT_NOOP:
+ l--;
+ optptr++;
+ continue;
+ }
+ optlen = optptr[1];
+ if (optlen<2 || optlen>l)
+ return -EINVAL;
+ switch (*optptr) {
+ case IPOPT_SEC:
+ case 0x85: /* Some "Extended Security" crap. */
+ case IPOPT_CIPSO:
+ case IPOPT_RA:
+ case 0x80|21: /* RFC1770 */
+ break;
+ case IPOPT_LSRR:
+ case IPOPT_SSRR:
+ if (optlen < 6)
+ return -EINVAL;
+ memcpy(daddr, optptr+optlen-4, 4);
+ /* Fall through */
+ default:
+ memset(optptr, 0, optlen);
+ }
+ l -= optlen;
+ optptr += optlen;
+ }
+ return 0;
+}
+
+static void ah_output_done(struct crypto_async_request *base, int err)
+{
+ u8 *icv;
+ struct iphdr *iph;
+ struct sk_buff *skb = base->data;
+ struct xfrm_state *x = skb_dst(skb)->xfrm;
+ struct ah_data *ahp = x->data;
+ struct iphdr *top_iph = ip_hdr(skb);
+ struct ip_auth_hdr *ah = ip_auth_hdr(skb);
+ int ihl = ip_hdrlen(skb);
+
+ iph = AH_SKB_CB(skb)->tmp;
+ icv = ah_tmp_icv(ahp->ahash, iph, ihl);
+ memcpy(ah->auth_data, icv, ahp->icv_trunc_len);
+
+ top_iph->tos = iph->tos;
+ top_iph->ttl = iph->ttl;
+ top_iph->frag_off = iph->frag_off;
+ if (top_iph->ihl != 5) {
+ top_iph->daddr = iph->daddr;
+ memcpy(top_iph+1, iph+1, top_iph->ihl*4 - sizeof(struct iphdr));
+ }
+
+ kfree(AH_SKB_CB(skb)->tmp);
+ xfrm_output_resume(skb, err);
+}
+
+static int ah_output(struct xfrm_state *x, struct sk_buff *skb)
+{
+ int err;
+ int nfrags;
+ int ihl;
+ u8 *icv;
+ struct sk_buff *trailer;
+ struct crypto_ahash *ahash;
+ struct ahash_request *req;
+ struct scatterlist *sg;
+ struct iphdr *iph, *top_iph;
+ struct ip_auth_hdr *ah;
+ struct ah_data *ahp;
+ int seqhi_len = 0;
+ __be32 *seqhi;
+ int sglists = 0;
+ struct scatterlist *seqhisg;
+
+ ahp = x->data;
+ ahash = ahp->ahash;
+
+ if ((err = skb_cow_data(skb, 0, &trailer)) < 0)
+ goto out;
+ nfrags = err;
+
+ skb_push(skb, -skb_network_offset(skb));
+ ah = ip_auth_hdr(skb);
+ ihl = ip_hdrlen(skb);
+
+ if (x->props.flags & XFRM_STATE_ESN) {
+ sglists = 1;
+ seqhi_len = sizeof(*seqhi);
+ }
+ err = -ENOMEM;
+ iph = ah_alloc_tmp(ahash, nfrags + sglists, ihl + seqhi_len);
+ if (!iph)
+ goto out;
+ seqhi = (__be32 *)((char *)iph + ihl);
+ icv = ah_tmp_icv(ahash, seqhi, seqhi_len);
+ req = ah_tmp_req(ahash, icv);
+ sg = ah_req_sg(ahash, req);
+ seqhisg = sg + nfrags;
+
+ memset(ah->auth_data, 0, ahp->icv_trunc_len);
+
+ top_iph = ip_hdr(skb);
+
+ iph->tos = top_iph->tos;
+ iph->ttl = top_iph->ttl;
+ iph->frag_off = top_iph->frag_off;
+
+ if (top_iph->ihl != 5) {
+ iph->daddr = top_iph->daddr;
+ memcpy(iph+1, top_iph+1, top_iph->ihl*4 - sizeof(struct iphdr));
+ err = ip_clear_mutable_options(top_iph, &top_iph->daddr);
+ if (err)
+ goto out_free;
+ }
+
+ ah->nexthdr = *skb_mac_header(skb);
+ *skb_mac_header(skb) = IPPROTO_AH;
+
+ top_iph->tos = 0;
+ top_iph->tot_len = htons(skb->len);
+ top_iph->frag_off = 0;
+ top_iph->ttl = 0;
+ top_iph->check = 0;
+
+ if (x->props.flags & XFRM_STATE_ALIGN4)
+ ah->hdrlen = (XFRM_ALIGN4(sizeof(*ah) + ahp->icv_trunc_len) >> 2) - 2;
+ else
+ ah->hdrlen = (XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len) >> 2) - 2;
+
+ ah->reserved = 0;
+ ah->spi = x->id.spi;
+ ah->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low);
+
+ sg_init_table(sg, nfrags + sglists);
+ skb_to_sgvec_nomark(skb, sg, 0, skb->len);
+
+ if (x->props.flags & XFRM_STATE_ESN) {
+ /* Attach seqhi sg right after packet payload */
+ *seqhi = htonl(XFRM_SKB_CB(skb)->seq.output.hi);
+ sg_set_buf(seqhisg, seqhi, seqhi_len);
+ }
+ ahash_request_set_crypt(req, sg, icv, skb->len + seqhi_len);
+ ahash_request_set_callback(req, 0, ah_output_done, skb);
+
+ AH_SKB_CB(skb)->tmp = iph;
+
+ err = crypto_ahash_digest(req);
+ if (err) {
+ if (err == -EINPROGRESS)
+ goto out;
+
+ if (err == -EBUSY)
+ err = NET_XMIT_DROP;
+ goto out_free;
+ }
+
+ memcpy(ah->auth_data, icv, ahp->icv_trunc_len);
+
+ top_iph->tos = iph->tos;
+ top_iph->ttl = iph->ttl;
+ top_iph->frag_off = iph->frag_off;
+ if (top_iph->ihl != 5) {
+ top_iph->daddr = iph->daddr;
+ memcpy(top_iph+1, iph+1, top_iph->ihl*4 - sizeof(struct iphdr));
+ }
+
+out_free:
+ kfree(iph);
+out:
+ return err;
+}
+
+static void ah_input_done(struct crypto_async_request *base, int err)
+{
+ u8 *auth_data;
+ u8 *icv;
+ struct iphdr *work_iph;
+ struct sk_buff *skb = base->data;
+ struct xfrm_state *x = xfrm_input_state(skb);
+ struct ah_data *ahp = x->data;
+ struct ip_auth_hdr *ah = ip_auth_hdr(skb);
+ int ihl = ip_hdrlen(skb);
+ int ah_hlen = (ah->hdrlen + 2) << 2;
+
+ work_iph = AH_SKB_CB(skb)->tmp;
+ auth_data = ah_tmp_auth(work_iph, ihl);
+ icv = ah_tmp_icv(ahp->ahash, auth_data, ahp->icv_trunc_len);
+
+ err = memcmp(icv, auth_data, ahp->icv_trunc_len) ? -EBADMSG: 0;
+ if (err)
+ goto out;
+
+ err = ah->nexthdr;
+
+ skb->network_header += ah_hlen;
+ memcpy(skb_network_header(skb), work_iph, ihl);
+ __skb_pull(skb, ah_hlen + ihl);
+
+ if (x->props.mode == XFRM_MODE_TUNNEL)
+ skb_reset_transport_header(skb);
+ else
+ skb_set_transport_header(skb, -ihl);
+out:
+ kfree(AH_SKB_CB(skb)->tmp);
+ xfrm_input_resume(skb, err);
+}
+
+static int ah_input(struct xfrm_state *x, struct sk_buff *skb)
+{
+ int ah_hlen;
+ int ihl;
+ int nexthdr;
+ int nfrags;
+ u8 *auth_data;
+ u8 *icv;
+ struct sk_buff *trailer;
+ struct crypto_ahash *ahash;
+ struct ahash_request *req;
+ struct scatterlist *sg;
+ struct iphdr *iph, *work_iph;
+ struct ip_auth_hdr *ah;
+ struct ah_data *ahp;
+ int err = -ENOMEM;
+ int seqhi_len = 0;
+ __be32 *seqhi;
+ int sglists = 0;
+ struct scatterlist *seqhisg;
+
+ if (!pskb_may_pull(skb, sizeof(*ah)))
+ goto out;
+
+ ah = (struct ip_auth_hdr *)skb->data;
+ ahp = x->data;
+ ahash = ahp->ahash;
+
+ nexthdr = ah->nexthdr;
+ ah_hlen = (ah->hdrlen + 2) << 2;
+
+ if (x->props.flags & XFRM_STATE_ALIGN4) {
+ if (ah_hlen != XFRM_ALIGN4(sizeof(*ah) + ahp->icv_full_len) &&
+ ah_hlen != XFRM_ALIGN4(sizeof(*ah) + ahp->icv_trunc_len))
+ goto out;
+ } else {
+ if (ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_full_len) &&
+ ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len))
+ goto out;
+ }
+
+ if (!pskb_may_pull(skb, ah_hlen))
+ goto out;
+
+ /* We are going to _remove_ AH header to keep sockets happy,
+ * so... Later this can change. */
+ if (skb_unclone(skb, GFP_ATOMIC))
+ goto out;
+
+ skb->ip_summed = CHECKSUM_NONE;
+
+
+ if ((err = skb_cow_data(skb, 0, &trailer)) < 0)
+ goto out;
+ nfrags = err;
+
+ ah = (struct ip_auth_hdr *)skb->data;
+ iph = ip_hdr(skb);
+ ihl = ip_hdrlen(skb);
+
+ if (x->props.flags & XFRM_STATE_ESN) {
+ sglists = 1;
+ seqhi_len = sizeof(*seqhi);
+ }
+
+ work_iph = ah_alloc_tmp(ahash, nfrags + sglists, ihl +
+ ahp->icv_trunc_len + seqhi_len);
+ if (!work_iph)
+ goto out;
+
+ seqhi = (__be32 *)((char *)work_iph + ihl);
+ auth_data = ah_tmp_auth(seqhi, seqhi_len);
+ icv = ah_tmp_icv(ahash, auth_data, ahp->icv_trunc_len);
+ req = ah_tmp_req(ahash, icv);
+ sg = ah_req_sg(ahash, req);
+ seqhisg = sg + nfrags;
+
+ memcpy(work_iph, iph, ihl);
+ memcpy(auth_data, ah->auth_data, ahp->icv_trunc_len);
+ memset(ah->auth_data, 0, ahp->icv_trunc_len);
+
+ iph->ttl = 0;
+ iph->tos = 0;
+ iph->frag_off = 0;
+ iph->check = 0;
+ if (ihl > sizeof(*iph)) {
+ __be32 dummy;
+ err = ip_clear_mutable_options(iph, &dummy);
+ if (err)
+ goto out_free;
+ }
+
+ skb_push(skb, ihl);
+
+ sg_init_table(sg, nfrags + sglists);
+ skb_to_sgvec_nomark(skb, sg, 0, skb->len);
+
+ if (x->props.flags & XFRM_STATE_ESN) {
+ /* Attach seqhi sg right after packet payload */
+ *seqhi = XFRM_SKB_CB(skb)->seq.input.hi;
+ sg_set_buf(seqhisg, seqhi, seqhi_len);
+ }
+ ahash_request_set_crypt(req, sg, icv, skb->len + seqhi_len);
+ ahash_request_set_callback(req, 0, ah_input_done, skb);
+
+ AH_SKB_CB(skb)->tmp = work_iph;
+
+ err = crypto_ahash_digest(req);
+ if (err) {
+ if (err == -EINPROGRESS)
+ goto out;
+
+ goto out_free;
+ }
+
+ err = memcmp(icv, auth_data, ahp->icv_trunc_len) ? -EBADMSG: 0;
+ if (err)
+ goto out_free;
+
+ skb->network_header += ah_hlen;
+ memcpy(skb_network_header(skb), work_iph, ihl);
+ __skb_pull(skb, ah_hlen + ihl);
+ if (x->props.mode == XFRM_MODE_TUNNEL)
+ skb_reset_transport_header(skb);
+ else
+ skb_set_transport_header(skb, -ihl);
+
+ err = nexthdr;
+
+out_free:
+ kfree (work_iph);
+out:
+ return err;
+}
+
+static int ah4_err(struct sk_buff *skb, u32 info)
+{
+ struct net *net = dev_net(skb->dev);
+ const struct iphdr *iph = (const struct iphdr *)skb->data;
+ struct ip_auth_hdr *ah = (struct ip_auth_hdr *)(skb->data+(iph->ihl<<2));
+ struct xfrm_state *x;
+
+ switch (icmp_hdr(skb)->type) {
+ case ICMP_DEST_UNREACH:
+ if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED)
+ return 0;
+ case ICMP_REDIRECT:
+ break;
+ default:
+ return 0;
+ }
+
+ x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr,
+ ah->spi, IPPROTO_AH, AF_INET);
+ if (!x)
+ return 0;
+
+ if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH)
+ ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_AH, 0);
+ else
+ ipv4_redirect(skb, net, 0, 0, IPPROTO_AH, 0);
+ xfrm_state_put(x);
+
+ return 0;
+}
+
+static int ah_init_state(struct xfrm_state *x)
+{
+ struct ah_data *ahp = NULL;
+ struct xfrm_algo_desc *aalg_desc;
+ struct crypto_ahash *ahash;
+
+ if (!x->aalg)
+ goto error;
+
+ if (x->encap)
+ goto error;
+
+ ahp = kzalloc(sizeof(*ahp), GFP_KERNEL);
+ if (!ahp)
+ return -ENOMEM;
+
+ ahash = crypto_alloc_ahash(x->aalg->alg_name, 0, 0);
+ if (IS_ERR(ahash))
+ goto error;
+
+ ahp->ahash = ahash;
+ if (crypto_ahash_setkey(ahash, x->aalg->alg_key,
+ (x->aalg->alg_key_len + 7) / 8))
+ goto error;
+
+ /*
+ * Lookup the algorithm description maintained by xfrm_algo,
+ * verify crypto transform properties, and store information
+ * we need for AH processing. This lookup cannot fail here
+ * after a successful crypto_alloc_ahash().
+ */
+ aalg_desc = xfrm_aalg_get_byname(x->aalg->alg_name, 0);
+ BUG_ON(!aalg_desc);
+
+ if (aalg_desc->uinfo.auth.icv_fullbits/8 !=
+ crypto_ahash_digestsize(ahash)) {
+ pr_info("%s: %s digestsize %u != %hu\n",
+ __func__, x->aalg->alg_name,
+ crypto_ahash_digestsize(ahash),
+ aalg_desc->uinfo.auth.icv_fullbits / 8);
+ goto error;
+ }
+
+ ahp->icv_full_len = aalg_desc->uinfo.auth.icv_fullbits/8;
+ ahp->icv_trunc_len = x->aalg->alg_trunc_len/8;
+
+ if (x->props.flags & XFRM_STATE_ALIGN4)
+ x->props.header_len = XFRM_ALIGN4(sizeof(struct ip_auth_hdr) +
+ ahp->icv_trunc_len);
+ else
+ x->props.header_len = XFRM_ALIGN8(sizeof(struct ip_auth_hdr) +
+ ahp->icv_trunc_len);
+ if (x->props.mode == XFRM_MODE_TUNNEL)
+ x->props.header_len += sizeof(struct iphdr);
+ x->data = ahp;
+
+ return 0;
+
+error:
+ if (ahp) {
+ crypto_free_ahash(ahp->ahash);
+ kfree(ahp);
+ }
+ return -EINVAL;
+}
+
+static void ah_destroy(struct xfrm_state *x)
+{
+ struct ah_data *ahp = x->data;
+
+ if (!ahp)
+ return;
+
+ crypto_free_ahash(ahp->ahash);
+ kfree(ahp);
+}
+
+static int ah4_rcv_cb(struct sk_buff *skb, int err)
+{
+ return 0;
+}
+
+static const struct xfrm_type ah_type =
+{
+ .description = "AH4",
+ .owner = THIS_MODULE,
+ .proto = IPPROTO_AH,
+ .flags = XFRM_TYPE_REPLAY_PROT,
+ .init_state = ah_init_state,
+ .destructor = ah_destroy,
+ .input = ah_input,
+ .output = ah_output
+};
+
+static struct xfrm4_protocol ah4_protocol = {
+ .handler = xfrm4_rcv,
+ .input_handler = xfrm_input,
+ .cb_handler = ah4_rcv_cb,
+ .err_handler = ah4_err,
+ .priority = 0,
+};
+
+static int __init ah4_init(void)
+{
+ if (xfrm_register_type(&ah_type, AF_INET) < 0) {
+ pr_info("%s: can't add xfrm type\n", __func__);
+ return -EAGAIN;
+ }
+ if (xfrm4_protocol_register(&ah4_protocol, IPPROTO_AH) < 0) {
+ pr_info("%s: can't add protocol\n", __func__);
+ xfrm_unregister_type(&ah_type, AF_INET);
+ return -EAGAIN;
+ }
+ return 0;
+}
+
+static void __exit ah4_fini(void)
+{
+ if (xfrm4_protocol_deregister(&ah4_protocol, IPPROTO_AH) < 0)
+ pr_info("%s: can't remove protocol\n", __func__);
+ if (xfrm_unregister_type(&ah_type, AF_INET) < 0)
+ pr_info("%s: can't remove xfrm type\n", __func__);
+}
+
+module_init(ah4_init);
+module_exit(ah4_fini);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_XFRM_TYPE(AF_INET, XFRM_PROTO_AH);
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
new file mode 100644
index 000000000..933a92820
--- /dev/null
+++ b/net/ipv4/arp.c
@@ -0,0 +1,1375 @@
+/* linux/net/ipv4/arp.c
+ *
+ * Copyright (C) 1994 by Florian La Roche
+ *
+ * This module implements the Address Resolution Protocol ARP (RFC 826),
+ * which is used to convert IP addresses (or in the future maybe other
+ * high-level addresses) into a low-level hardware address (like an Ethernet
+ * address).
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Fixes:
+ * Alan Cox : Removed the Ethernet assumptions in
+ * Florian's code
+ * Alan Cox : Fixed some small errors in the ARP
+ * logic
+ * Alan Cox : Allow >4K in /proc
+ * Alan Cox : Make ARP add its own protocol entry
+ * Ross Martin : Rewrote arp_rcv() and arp_get_info()
+ * Stephen Henson : Add AX25 support to arp_get_info()
+ * Alan Cox : Drop data when a device is downed.
+ * Alan Cox : Use init_timer().
+ * Alan Cox : Double lock fixes.
+ * Martin Seine : Move the arphdr structure
+ * to if_arp.h for compatibility.
+ * with BSD based programs.
+ * Andrew Tridgell : Added ARP netmask code and
+ * re-arranged proxy handling.
+ * Alan Cox : Changed to use notifiers.
+ * Niibe Yutaka : Reply for this device or proxies only.
+ * Alan Cox : Don't proxy across hardware types!
+ * Jonathan Naylor : Added support for NET/ROM.
+ * Mike Shaver : RFC1122 checks.
+ * Jonathan Naylor : Only lookup the hardware address for
+ * the correct hardware type.
+ * Germano Caronni : Assorted subtle races.
+ * Craig Schlenter : Don't modify permanent entry
+ * during arp_rcv.
+ * Russ Nelson : Tidied up a few bits.
+ * Alexey Kuznetsov: Major changes to caching and behaviour,
+ * eg intelligent arp probing and
+ * generation
+ * of host down events.
+ * Alan Cox : Missing unlock in device events.
+ * Eckes : ARP ioctl control errors.
+ * Alexey Kuznetsov: Arp free fix.
+ * Manuel Rodriguez: Gratuitous ARP.
+ * Jonathan Layes : Added arpd support through kerneld
+ * message queue (960314)
+ * Mike Shaver : /proc/sys/net/ipv4/arp_* support
+ * Mike McLagan : Routing by source
+ * Stuart Cheshire : Metricom and grat arp fixes
+ * *** FOR 2.1 clean this up ***
+ * Lawrence V. Stefani: (08/12/96) Added FDDI support.
+ * Alan Cox : Took the AP1000 nasty FDDI hack and
+ * folded into the mainstream FDDI code.
+ * Ack spit, Linus how did you allow that
+ * one in...
+ * Jes Sorensen : Make FDDI work again in 2.1.x and
+ * clean up the APFDDI & gen. FDDI bits.
+ * Alexey Kuznetsov: new arp state machine;
+ * now it is in net/core/neighbour.c.
+ * Krzysztof Halasa: Added Frame Relay ARP support.
+ * Arnaldo C. Melo : convert /proc/net/arp to seq_file
+ * Shmulik Hen: Split arp_send to arp_create and
+ * arp_xmit so intermediate drivers like
+ * bonding can change the skb before
+ * sending (e.g. insert 8021q tag).
+ * Harald Welte : convert to make use of jenkins hash
+ * Jesper D. Brouer: Proxy ARP PVLAN RFC 3069 support.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/capability.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/errno.h>
+#include <linux/in.h>
+#include <linux/mm.h>
+#include <linux/inet.h>
+#include <linux/inetdevice.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/fddidevice.h>
+#include <linux/if_arp.h>
+#include <linux/skbuff.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/stat.h>
+#include <linux/init.h>
+#include <linux/net.h>
+#include <linux/rcupdate.h>
+#include <linux/slab.h>
+#ifdef CONFIG_SYSCTL
+#include <linux/sysctl.h>
+#endif
+
+#include <net/net_namespace.h>
+#include <net/ip.h>
+#include <net/icmp.h>
+#include <net/route.h>
+#include <net/protocol.h>
+#include <net/tcp.h>
+#include <net/sock.h>
+#include <net/arp.h>
+#include <net/ax25.h>
+#include <net/netrom.h>
+
+#include <linux/uaccess.h>
+
+#include <linux/netfilter_arp.h>
+
+/*
+ * Interface to generic neighbour cache.
+ */
+static u32 arp_hash(const void *pkey, const struct net_device *dev, __u32 *hash_rnd);
+static bool arp_key_eq(const struct neighbour *n, const void *pkey);
+static int arp_constructor(struct neighbour *neigh);
+static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb);
+static void arp_error_report(struct neighbour *neigh, struct sk_buff *skb);
+static void parp_redo(struct sk_buff *skb);
+
+static const struct neigh_ops arp_generic_ops = {
+ .family = AF_INET,
+ .solicit = arp_solicit,
+ .error_report = arp_error_report,
+ .output = neigh_resolve_output,
+ .connected_output = neigh_connected_output,
+};
+
+static const struct neigh_ops arp_hh_ops = {
+ .family = AF_INET,
+ .solicit = arp_solicit,
+ .error_report = arp_error_report,
+ .output = neigh_resolve_output,
+ .connected_output = neigh_resolve_output,
+};
+
+static const struct neigh_ops arp_direct_ops = {
+ .family = AF_INET,
+ .output = neigh_direct_output,
+ .connected_output = neigh_direct_output,
+};
+
+struct neigh_table arp_tbl = {
+ .family = AF_INET,
+ .key_len = 4,
+ .protocol = cpu_to_be16(ETH_P_IP),
+ .hash = arp_hash,
+ .key_eq = arp_key_eq,
+ .constructor = arp_constructor,
+ .proxy_redo = parp_redo,
+ .id = "arp_cache",
+ .parms = {
+ .tbl = &arp_tbl,
+ .reachable_time = 30 * HZ,
+ .data = {
+ [NEIGH_VAR_MCAST_PROBES] = 3,
+ [NEIGH_VAR_UCAST_PROBES] = 3,
+ [NEIGH_VAR_RETRANS_TIME] = 1 * HZ,
+ [NEIGH_VAR_BASE_REACHABLE_TIME] = 30 * HZ,
+ [NEIGH_VAR_DELAY_PROBE_TIME] = 5 * HZ,
+ [NEIGH_VAR_GC_STALETIME] = 60 * HZ,
+ [NEIGH_VAR_QUEUE_LEN_BYTES] = 64 * 1024,
+ [NEIGH_VAR_PROXY_QLEN] = 64,
+ [NEIGH_VAR_ANYCAST_DELAY] = 1 * HZ,
+ [NEIGH_VAR_PROXY_DELAY] = (8 * HZ) / 10,
+ [NEIGH_VAR_LOCKTIME] = 1 * HZ,
+ },
+ },
+ .gc_interval = 30 * HZ,
+ .gc_thresh1 = 128,
+ .gc_thresh2 = 512,
+ .gc_thresh3 = 1024,
+};
+EXPORT_SYMBOL(arp_tbl);
+
+int arp_mc_map(__be32 addr, u8 *haddr, struct net_device *dev, int dir)
+{
+ switch (dev->type) {
+ case ARPHRD_ETHER:
+ case ARPHRD_FDDI:
+ case ARPHRD_IEEE802:
+ ip_eth_mc_map(addr, haddr);
+ return 0;
+ case ARPHRD_INFINIBAND:
+ ip_ib_mc_map(addr, dev->broadcast, haddr);
+ return 0;
+ case ARPHRD_IPGRE:
+ ip_ipgre_mc_map(addr, dev->broadcast, haddr);
+ return 0;
+ default:
+ if (dir) {
+ memcpy(haddr, dev->broadcast, dev->addr_len);
+ return 0;
+ }
+ }
+ return -EINVAL;
+}
+
+
+static u32 arp_hash(const void *pkey,
+ const struct net_device *dev,
+ __u32 *hash_rnd)
+{
+ return arp_hashfn(pkey, dev, hash_rnd);
+}
+
+static bool arp_key_eq(const struct neighbour *neigh, const void *pkey)
+{
+ return neigh_key_eq32(neigh, pkey);
+}
+
+static int arp_constructor(struct neighbour *neigh)
+{
+ __be32 addr = *(__be32 *)neigh->primary_key;
+ struct net_device *dev = neigh->dev;
+ struct in_device *in_dev;
+ struct neigh_parms *parms;
+
+ rcu_read_lock();
+ in_dev = __in_dev_get_rcu(dev);
+ if (!in_dev) {
+ rcu_read_unlock();
+ return -EINVAL;
+ }
+
+ neigh->type = inet_addr_type(dev_net(dev), addr);
+
+ parms = in_dev->arp_parms;
+ __neigh_parms_put(neigh->parms);
+ neigh->parms = neigh_parms_clone(parms);
+ rcu_read_unlock();
+
+ if (!dev->header_ops) {
+ neigh->nud_state = NUD_NOARP;
+ neigh->ops = &arp_direct_ops;
+ neigh->output = neigh_direct_output;
+ } else {
+ /* Good devices (checked by reading texts, but only Ethernet is
+ tested)
+
+ ARPHRD_ETHER: (ethernet, apfddi)
+ ARPHRD_FDDI: (fddi)
+ ARPHRD_IEEE802: (tr)
+ ARPHRD_METRICOM: (strip)
+ ARPHRD_ARCNET:
+ etc. etc. etc.
+
+ ARPHRD_IPDDP will also work, if author repairs it.
+ I did not it, because this driver does not work even
+ in old paradigm.
+ */
+
+ if (neigh->type == RTN_MULTICAST) {
+ neigh->nud_state = NUD_NOARP;
+ arp_mc_map(addr, neigh->ha, dev, 1);
+ } else if (dev->flags & (IFF_NOARP | IFF_LOOPBACK)) {
+ neigh->nud_state = NUD_NOARP;
+ memcpy(neigh->ha, dev->dev_addr, dev->addr_len);
+ } else if (neigh->type == RTN_BROADCAST ||
+ (dev->flags & IFF_POINTOPOINT)) {
+ neigh->nud_state = NUD_NOARP;
+ memcpy(neigh->ha, dev->broadcast, dev->addr_len);
+ }
+
+ if (dev->header_ops->cache)
+ neigh->ops = &arp_hh_ops;
+ else
+ neigh->ops = &arp_generic_ops;
+
+ if (neigh->nud_state & NUD_VALID)
+ neigh->output = neigh->ops->connected_output;
+ else
+ neigh->output = neigh->ops->output;
+ }
+ return 0;
+}
+
+static void arp_error_report(struct neighbour *neigh, struct sk_buff *skb)
+{
+ dst_link_failure(skb);
+ kfree_skb(skb);
+}
+
+static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)
+{
+ __be32 saddr = 0;
+ u8 dst_ha[MAX_ADDR_LEN], *dst_hw = NULL;
+ struct net_device *dev = neigh->dev;
+ __be32 target = *(__be32 *)neigh->primary_key;
+ int probes = atomic_read(&neigh->probes);
+ struct in_device *in_dev;
+
+ rcu_read_lock();
+ in_dev = __in_dev_get_rcu(dev);
+ if (!in_dev) {
+ rcu_read_unlock();
+ return;
+ }
+ switch (IN_DEV_ARP_ANNOUNCE(in_dev)) {
+ default:
+ case 0: /* By default announce any local IP */
+ if (skb && inet_addr_type(dev_net(dev),
+ ip_hdr(skb)->saddr) == RTN_LOCAL)
+ saddr = ip_hdr(skb)->saddr;
+ break;
+ case 1: /* Restrict announcements of saddr in same subnet */
+ if (!skb)
+ break;
+ saddr = ip_hdr(skb)->saddr;
+ if (inet_addr_type(dev_net(dev), saddr) == RTN_LOCAL) {
+ /* saddr should be known to target */
+ if (inet_addr_onlink(in_dev, target, saddr))
+ break;
+ }
+ saddr = 0;
+ break;
+ case 2: /* Avoid secondary IPs, get a primary/preferred one */
+ break;
+ }
+ rcu_read_unlock();
+
+ if (!saddr)
+ saddr = inet_select_addr(dev, target, RT_SCOPE_LINK);
+
+ probes -= NEIGH_VAR(neigh->parms, UCAST_PROBES);
+ if (probes < 0) {
+ if (!(neigh->nud_state & NUD_VALID))
+ pr_debug("trying to ucast probe in NUD_INVALID\n");
+ neigh_ha_snapshot(dst_ha, neigh, dev);
+ dst_hw = dst_ha;
+ } else {
+ probes -= NEIGH_VAR(neigh->parms, APP_PROBES);
+ if (probes < 0) {
+ neigh_app_ns(neigh);
+ return;
+ }
+ }
+
+ arp_send(ARPOP_REQUEST, ETH_P_ARP, target, dev, saddr,
+ dst_hw, dev->dev_addr, NULL);
+}
+
+static int arp_ignore(struct in_device *in_dev, __be32 sip, __be32 tip)
+{
+ struct net *net = dev_net(in_dev->dev);
+ int scope;
+
+ switch (IN_DEV_ARP_IGNORE(in_dev)) {
+ case 0: /* Reply, the tip is already validated */
+ return 0;
+ case 1: /* Reply only if tip is configured on the incoming interface */
+ sip = 0;
+ scope = RT_SCOPE_HOST;
+ break;
+ case 2: /*
+ * Reply only if tip is configured on the incoming interface
+ * and is in same subnet as sip
+ */
+ scope = RT_SCOPE_HOST;
+ break;
+ case 3: /* Do not reply for scope host addresses */
+ sip = 0;
+ scope = RT_SCOPE_LINK;
+ in_dev = NULL;
+ break;
+ case 4: /* Reserved */
+ case 5:
+ case 6:
+ case 7:
+ return 0;
+ case 8: /* Do not reply */
+ return 1;
+ default:
+ return 0;
+ }
+ return !inet_confirm_addr(net, in_dev, sip, tip, scope);
+}
+
+static int arp_filter(__be32 sip, __be32 tip, struct net_device *dev)
+{
+ struct rtable *rt;
+ int flag = 0;
+ /*unsigned long now; */
+ struct net *net = dev_net(dev);
+
+ rt = ip_route_output(net, sip, tip, 0, 0);
+ if (IS_ERR(rt))
+ return 1;
+ if (rt->dst.dev != dev) {
+ NET_INC_STATS_BH(net, LINUX_MIB_ARPFILTER);
+ flag = 1;
+ }
+ ip_rt_put(rt);
+ return flag;
+}
+
+/*
+ * Check if we can use proxy ARP for this path
+ */
+static inline int arp_fwd_proxy(struct in_device *in_dev,
+ struct net_device *dev, struct rtable *rt)
+{
+ struct in_device *out_dev;
+ int imi, omi = -1;
+
+ if (rt->dst.dev == dev)
+ return 0;
+
+ if (!IN_DEV_PROXY_ARP(in_dev))
+ return 0;
+ imi = IN_DEV_MEDIUM_ID(in_dev);
+ if (imi == 0)
+ return 1;
+ if (imi == -1)
+ return 0;
+
+ /* place to check for proxy_arp for routes */
+
+ out_dev = __in_dev_get_rcu(rt->dst.dev);
+ if (out_dev)
+ omi = IN_DEV_MEDIUM_ID(out_dev);
+
+ return omi != imi && omi != -1;
+}
+
+/*
+ * Check for RFC3069 proxy arp private VLAN (allow to send back to same dev)
+ *
+ * RFC3069 supports proxy arp replies back to the same interface. This
+ * is done to support (ethernet) switch features, like RFC 3069, where
+ * the individual ports are not allowed to communicate with each
+ * other, BUT they are allowed to talk to the upstream router. As
+ * described in RFC 3069, it is possible to allow these hosts to
+ * communicate through the upstream router, by proxy_arp'ing.
+ *
+ * RFC 3069: "VLAN Aggregation for Efficient IP Address Allocation"
+ *
+ * This technology is known by different names:
+ * In RFC 3069 it is called VLAN Aggregation.
+ * Cisco and Allied Telesyn call it Private VLAN.
+ * Hewlett-Packard call it Source-Port filtering or port-isolation.
+ * Ericsson call it MAC-Forced Forwarding (RFC Draft).
+ *
+ */
+static inline int arp_fwd_pvlan(struct in_device *in_dev,
+ struct net_device *dev, struct rtable *rt,
+ __be32 sip, __be32 tip)
+{
+ /* Private VLAN is only concerned about the same ethernet segment */
+ if (rt->dst.dev != dev)
+ return 0;
+
+ /* Don't reply on self probes (often done by windowz boxes)*/
+ if (sip == tip)
+ return 0;
+
+ if (IN_DEV_PROXY_ARP_PVLAN(in_dev))
+ return 1;
+ else
+ return 0;
+}
+
+/*
+ * Interface to link layer: send routine and receive handler.
+ */
+
+/*
+ * Create an arp packet. If dest_hw is not set, we create a broadcast
+ * message.
+ */
+struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip,
+ struct net_device *dev, __be32 src_ip,
+ const unsigned char *dest_hw,
+ const unsigned char *src_hw,
+ const unsigned char *target_hw)
+{
+ struct sk_buff *skb;
+ struct arphdr *arp;
+ unsigned char *arp_ptr;
+ int hlen = LL_RESERVED_SPACE(dev);
+ int tlen = dev->needed_tailroom;
+
+ /*
+ * Allocate a buffer
+ */
+
+ skb = alloc_skb(arp_hdr_len(dev) + hlen + tlen, GFP_ATOMIC);
+ if (!skb)
+ return NULL;
+
+ skb_reserve(skb, hlen);
+ skb_reset_network_header(skb);
+ arp = (struct arphdr *) skb_put(skb, arp_hdr_len(dev));
+ skb->dev = dev;
+ skb->protocol = htons(ETH_P_ARP);
+ if (!src_hw)
+ src_hw = dev->dev_addr;
+ if (!dest_hw)
+ dest_hw = dev->broadcast;
+
+ /*
+ * Fill the device header for the ARP frame
+ */
+ if (dev_hard_header(skb, dev, ptype, dest_hw, src_hw, skb->len) < 0)
+ goto out;
+
+ /*
+ * Fill out the arp protocol part.
+ *
+ * The arp hardware type should match the device type, except for FDDI,
+ * which (according to RFC 1390) should always equal 1 (Ethernet).
+ */
+ /*
+ * Exceptions everywhere. AX.25 uses the AX.25 PID value not the
+ * DIX code for the protocol. Make these device structure fields.
+ */
+ switch (dev->type) {
+ default:
+ arp->ar_hrd = htons(dev->type);
+ arp->ar_pro = htons(ETH_P_IP);
+ break;
+
+#if IS_ENABLED(CONFIG_AX25)
+ case ARPHRD_AX25:
+ arp->ar_hrd = htons(ARPHRD_AX25);
+ arp->ar_pro = htons(AX25_P_IP);
+ break;
+
+#if IS_ENABLED(CONFIG_NETROM)
+ case ARPHRD_NETROM:
+ arp->ar_hrd = htons(ARPHRD_NETROM);
+ arp->ar_pro = htons(AX25_P_IP);
+ break;
+#endif
+#endif
+
+#if IS_ENABLED(CONFIG_FDDI)
+ case ARPHRD_FDDI:
+ arp->ar_hrd = htons(ARPHRD_ETHER);
+ arp->ar_pro = htons(ETH_P_IP);
+ break;
+#endif
+ }
+
+ arp->ar_hln = dev->addr_len;
+ arp->ar_pln = 4;
+ arp->ar_op = htons(type);
+
+ arp_ptr = (unsigned char *)(arp + 1);
+
+ memcpy(arp_ptr, src_hw, dev->addr_len);
+ arp_ptr += dev->addr_len;
+ memcpy(arp_ptr, &src_ip, 4);
+ arp_ptr += 4;
+
+ switch (dev->type) {
+#if IS_ENABLED(CONFIG_FIREWIRE_NET)
+ case ARPHRD_IEEE1394:
+ break;
+#endif
+ default:
+ if (target_hw)
+ memcpy(arp_ptr, target_hw, dev->addr_len);
+ else
+ memset(arp_ptr, 0, dev->addr_len);
+ arp_ptr += dev->addr_len;
+ }
+ memcpy(arp_ptr, &dest_ip, 4);
+
+ return skb;
+
+out:
+ kfree_skb(skb);
+ return NULL;
+}
+EXPORT_SYMBOL(arp_create);
+
+/*
+ * Send an arp packet.
+ */
+void arp_xmit(struct sk_buff *skb)
+{
+ /* Send it off, maybe filter it using firewalling first. */
+ NF_HOOK(NFPROTO_ARP, NF_ARP_OUT, NULL, skb,
+ NULL, skb->dev, dev_queue_xmit_sk);
+}
+EXPORT_SYMBOL(arp_xmit);
+
+/*
+ * Create and send an arp packet.
+ */
+void arp_send(int type, int ptype, __be32 dest_ip,
+ struct net_device *dev, __be32 src_ip,
+ const unsigned char *dest_hw, const unsigned char *src_hw,
+ const unsigned char *target_hw)
+{
+ struct sk_buff *skb;
+
+ /*
+ * No arp on this interface.
+ */
+
+ if (dev->flags&IFF_NOARP)
+ return;
+
+ skb = arp_create(type, ptype, dest_ip, dev, src_ip,
+ dest_hw, src_hw, target_hw);
+ if (!skb)
+ return;
+
+ arp_xmit(skb);
+}
+EXPORT_SYMBOL(arp_send);
+
+/*
+ * Process an arp request.
+ */
+
+static int arp_process(struct sock *sk, struct sk_buff *skb)
+{
+ struct net_device *dev = skb->dev;
+ struct in_device *in_dev = __in_dev_get_rcu(dev);
+ struct arphdr *arp;
+ unsigned char *arp_ptr;
+ struct rtable *rt;
+ unsigned char *sha;
+ __be32 sip, tip;
+ u16 dev_type = dev->type;
+ int addr_type;
+ struct neighbour *n;
+ struct net *net = dev_net(dev);
+ bool is_garp = false;
+
+ /* arp_rcv below verifies the ARP header and verifies the device
+ * is ARP'able.
+ */
+
+ if (!in_dev)
+ goto out;
+
+ arp = arp_hdr(skb);
+
+ switch (dev_type) {
+ default:
+ if (arp->ar_pro != htons(ETH_P_IP) ||
+ htons(dev_type) != arp->ar_hrd)
+ goto out;
+ break;
+ case ARPHRD_ETHER:
+ case ARPHRD_FDDI:
+ case ARPHRD_IEEE802:
+ /*
+ * ETHERNET, and Fibre Channel (which are IEEE 802
+ * devices, according to RFC 2625) devices will accept ARP
+ * hardware types of either 1 (Ethernet) or 6 (IEEE 802.2).
+ * This is the case also of FDDI, where the RFC 1390 says that
+ * FDDI devices should accept ARP hardware of (1) Ethernet,
+ * however, to be more robust, we'll accept both 1 (Ethernet)
+ * or 6 (IEEE 802.2)
+ */
+ if ((arp->ar_hrd != htons(ARPHRD_ETHER) &&
+ arp->ar_hrd != htons(ARPHRD_IEEE802)) ||
+ arp->ar_pro != htons(ETH_P_IP))
+ goto out;
+ break;
+ case ARPHRD_AX25:
+ if (arp->ar_pro != htons(AX25_P_IP) ||
+ arp->ar_hrd != htons(ARPHRD_AX25))
+ goto out;
+ break;
+ case ARPHRD_NETROM:
+ if (arp->ar_pro != htons(AX25_P_IP) ||
+ arp->ar_hrd != htons(ARPHRD_NETROM))
+ goto out;
+ break;
+ }
+
+ /* Understand only these message types */
+
+ if (arp->ar_op != htons(ARPOP_REPLY) &&
+ arp->ar_op != htons(ARPOP_REQUEST))
+ goto out;
+
+/*
+ * Extract fields
+ */
+ arp_ptr = (unsigned char *)(arp + 1);
+ sha = arp_ptr;
+ arp_ptr += dev->addr_len;
+ memcpy(&sip, arp_ptr, 4);
+ arp_ptr += 4;
+ switch (dev_type) {
+#if IS_ENABLED(CONFIG_FIREWIRE_NET)
+ case ARPHRD_IEEE1394:
+ break;
+#endif
+ default:
+ arp_ptr += dev->addr_len;
+ }
+ memcpy(&tip, arp_ptr, 4);
+/*
+ * Check for bad requests for 127.x.x.x and requests for multicast
+ * addresses. If this is one such, delete it.
+ */
+ if (ipv4_is_multicast(tip) ||
+ (!IN_DEV_ROUTE_LOCALNET(in_dev) && ipv4_is_loopback(tip)))
+ goto out;
+
+/*
+ * Special case: We must set Frame Relay source Q.922 address
+ */
+ if (dev_type == ARPHRD_DLCI)
+ sha = dev->broadcast;
+
+/*
+ * Process entry. The idea here is we want to send a reply if it is a
+ * request for us or if it is a request for someone else that we hold
+ * a proxy for. We want to add an entry to our cache if it is a reply
+ * to us or if it is a request for our address.
+ * (The assumption for this last is that if someone is requesting our
+ * address, they are probably intending to talk to us, so it saves time
+ * if we cache their address. Their address is also probably not in
+ * our cache, since ours is not in their cache.)
+ *
+ * Putting this another way, we only care about replies if they are to
+ * us, in which case we add them to the cache. For requests, we care
+ * about those for us and those for our proxies. We reply to both,
+ * and in the case of requests for us we add the requester to the arp
+ * cache.
+ */
+
+ /* Special case: IPv4 duplicate address detection packet (RFC2131) */
+ if (sip == 0) {
+ if (arp->ar_op == htons(ARPOP_REQUEST) &&
+ inet_addr_type(net, tip) == RTN_LOCAL &&
+ !arp_ignore(in_dev, sip, tip))
+ arp_send(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip, sha,
+ dev->dev_addr, sha);
+ goto out;
+ }
+
+ if (arp->ar_op == htons(ARPOP_REQUEST) &&
+ ip_route_input_noref(skb, tip, sip, 0, dev) == 0) {
+
+ rt = skb_rtable(skb);
+ addr_type = rt->rt_type;
+
+ if (addr_type == RTN_LOCAL) {
+ int dont_send;
+
+ dont_send = arp_ignore(in_dev, sip, tip);
+ if (!dont_send && IN_DEV_ARPFILTER(in_dev))
+ dont_send = arp_filter(sip, tip, dev);
+ if (!dont_send) {
+ n = neigh_event_ns(&arp_tbl, sha, &sip, dev);
+ if (n) {
+ arp_send(ARPOP_REPLY, ETH_P_ARP, sip,
+ dev, tip, sha, dev->dev_addr,
+ sha);
+ neigh_release(n);
+ }
+ }
+ goto out;
+ } else if (IN_DEV_FORWARD(in_dev)) {
+ if (addr_type == RTN_UNICAST &&
+ (arp_fwd_proxy(in_dev, dev, rt) ||
+ arp_fwd_pvlan(in_dev, dev, rt, sip, tip) ||
+ (rt->dst.dev != dev &&
+ pneigh_lookup(&arp_tbl, net, &tip, dev, 0)))) {
+ n = neigh_event_ns(&arp_tbl, sha, &sip, dev);
+ if (n)
+ neigh_release(n);
+
+ if (NEIGH_CB(skb)->flags & LOCALLY_ENQUEUED ||
+ skb->pkt_type == PACKET_HOST ||
+ NEIGH_VAR(in_dev->arp_parms, PROXY_DELAY) == 0) {
+ arp_send(ARPOP_REPLY, ETH_P_ARP, sip,
+ dev, tip, sha, dev->dev_addr,
+ sha);
+ } else {
+ pneigh_enqueue(&arp_tbl,
+ in_dev->arp_parms, skb);
+ return 0;
+ }
+ goto out;
+ }
+ }
+ }
+
+ /* Update our ARP tables */
+
+ n = __neigh_lookup(&arp_tbl, &sip, dev, 0);
+
+ if (IN_DEV_ARP_ACCEPT(in_dev)) {
+ /* Unsolicited ARP is not accepted by default.
+ It is possible, that this option should be enabled for some
+ devices (strip is candidate)
+ */
+ is_garp = arp->ar_op == htons(ARPOP_REQUEST) && tip == sip &&
+ inet_addr_type(net, sip) == RTN_UNICAST;
+
+ if (!n &&
+ ((arp->ar_op == htons(ARPOP_REPLY) &&
+ inet_addr_type(net, sip) == RTN_UNICAST) || is_garp))
+ n = __neigh_lookup(&arp_tbl, &sip, dev, 1);
+ }
+
+ if (n) {
+ int state = NUD_REACHABLE;
+ int override;
+
+ /* If several different ARP replies follows back-to-back,
+ use the FIRST one. It is possible, if several proxy
+ agents are active. Taking the first reply prevents
+ arp trashing and chooses the fastest router.
+ */
+ override = time_after(jiffies,
+ n->updated +
+ NEIGH_VAR(n->parms, LOCKTIME)) ||
+ is_garp;
+
+ /* Broadcast replies and request packets
+ do not assert neighbour reachability.
+ */
+ if (arp->ar_op != htons(ARPOP_REPLY) ||
+ skb->pkt_type != PACKET_HOST)
+ state = NUD_STALE;
+ neigh_update(n, sha, state,
+ override ? NEIGH_UPDATE_F_OVERRIDE : 0);
+ neigh_release(n);
+ }
+
+out:
+ consume_skb(skb);
+ return 0;
+}
+
+static void parp_redo(struct sk_buff *skb)
+{
+ arp_process(NULL, skb);
+}
+
+
+/*
+ * Receive an arp request from the device layer.
+ */
+
+static int arp_rcv(struct sk_buff *skb, struct net_device *dev,
+ struct packet_type *pt, struct net_device *orig_dev)
+{
+ const struct arphdr *arp;
+
+ /* do not tweak dropwatch on an ARP we will ignore */
+ if (dev->flags & IFF_NOARP ||
+ skb->pkt_type == PACKET_OTHERHOST ||
+ skb->pkt_type == PACKET_LOOPBACK)
+ goto consumeskb;
+
+ skb = skb_share_check(skb, GFP_ATOMIC);
+ if (!skb)
+ goto out_of_mem;
+
+ /* ARP header, plus 2 device addresses, plus 2 IP addresses. */
+ if (!pskb_may_pull(skb, arp_hdr_len(dev)))
+ goto freeskb;
+
+ arp = arp_hdr(skb);
+ if (arp->ar_hln != dev->addr_len || arp->ar_pln != 4)
+ goto freeskb;
+
+ memset(NEIGH_CB(skb), 0, sizeof(struct neighbour_cb));
+
+ return NF_HOOK(NFPROTO_ARP, NF_ARP_IN, NULL, skb,
+ dev, NULL, arp_process);
+
+consumeskb:
+ consume_skb(skb);
+ return 0;
+freeskb:
+ kfree_skb(skb);
+out_of_mem:
+ return 0;
+}
+
+/*
+ * User level interface (ioctl)
+ */
+
+/*
+ * Set (create) an ARP cache entry.
+ */
+
+static int arp_req_set_proxy(struct net *net, struct net_device *dev, int on)
+{
+ if (!dev) {
+ IPV4_DEVCONF_ALL(net, PROXY_ARP) = on;
+ return 0;
+ }
+ if (__in_dev_get_rtnl(dev)) {
+ IN_DEV_CONF_SET(__in_dev_get_rtnl(dev), PROXY_ARP, on);
+ return 0;
+ }
+ return -ENXIO;
+}
+
+static int arp_req_set_public(struct net *net, struct arpreq *r,
+ struct net_device *dev)
+{
+ __be32 ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr;
+ __be32 mask = ((struct sockaddr_in *)&r->arp_netmask)->sin_addr.s_addr;
+
+ if (mask && mask != htonl(0xFFFFFFFF))
+ return -EINVAL;
+ if (!dev && (r->arp_flags & ATF_COM)) {
+ dev = dev_getbyhwaddr_rcu(net, r->arp_ha.sa_family,
+ r->arp_ha.sa_data);
+ if (!dev)
+ return -ENODEV;
+ }
+ if (mask) {
+ if (!pneigh_lookup(&arp_tbl, net, &ip, dev, 1))
+ return -ENOBUFS;
+ return 0;
+ }
+
+ return arp_req_set_proxy(net, dev, 1);
+}
+
+static int arp_req_set(struct net *net, struct arpreq *r,
+ struct net_device *dev)
+{
+ __be32 ip;
+ struct neighbour *neigh;
+ int err;
+
+ if (r->arp_flags & ATF_PUBL)
+ return arp_req_set_public(net, r, dev);
+
+ ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr;
+ if (r->arp_flags & ATF_PERM)
+ r->arp_flags |= ATF_COM;
+ if (!dev) {
+ struct rtable *rt = ip_route_output(net, ip, 0, RTO_ONLINK, 0);
+
+ if (IS_ERR(rt))
+ return PTR_ERR(rt);
+ dev = rt->dst.dev;
+ ip_rt_put(rt);
+ if (!dev)
+ return -EINVAL;
+ }
+ switch (dev->type) {
+#if IS_ENABLED(CONFIG_FDDI)
+ case ARPHRD_FDDI:
+ /*
+ * According to RFC 1390, FDDI devices should accept ARP
+ * hardware types of 1 (Ethernet). However, to be more
+ * robust, we'll accept hardware types of either 1 (Ethernet)
+ * or 6 (IEEE 802.2).
+ */
+ if (r->arp_ha.sa_family != ARPHRD_FDDI &&
+ r->arp_ha.sa_family != ARPHRD_ETHER &&
+ r->arp_ha.sa_family != ARPHRD_IEEE802)
+ return -EINVAL;
+ break;
+#endif
+ default:
+ if (r->arp_ha.sa_family != dev->type)
+ return -EINVAL;
+ break;
+ }
+
+ neigh = __neigh_lookup_errno(&arp_tbl, &ip, dev);
+ err = PTR_ERR(neigh);
+ if (!IS_ERR(neigh)) {
+ unsigned int state = NUD_STALE;
+ if (r->arp_flags & ATF_PERM)
+ state = NUD_PERMANENT;
+ err = neigh_update(neigh, (r->arp_flags & ATF_COM) ?
+ r->arp_ha.sa_data : NULL, state,
+ NEIGH_UPDATE_F_OVERRIDE |
+ NEIGH_UPDATE_F_ADMIN);
+ neigh_release(neigh);
+ }
+ return err;
+}
+
+static unsigned int arp_state_to_flags(struct neighbour *neigh)
+{
+ if (neigh->nud_state&NUD_PERMANENT)
+ return ATF_PERM | ATF_COM;
+ else if (neigh->nud_state&NUD_VALID)
+ return ATF_COM;
+ else
+ return 0;
+}
+
+/*
+ * Get an ARP cache entry.
+ */
+
+static int arp_req_get(struct arpreq *r, struct net_device *dev)
+{
+ __be32 ip = ((struct sockaddr_in *) &r->arp_pa)->sin_addr.s_addr;
+ struct neighbour *neigh;
+ int err = -ENXIO;
+
+ neigh = neigh_lookup(&arp_tbl, &ip, dev);
+ if (neigh) {
+ read_lock_bh(&neigh->lock);
+ memcpy(r->arp_ha.sa_data, neigh->ha, dev->addr_len);
+ r->arp_flags = arp_state_to_flags(neigh);
+ read_unlock_bh(&neigh->lock);
+ r->arp_ha.sa_family = dev->type;
+ strlcpy(r->arp_dev, dev->name, sizeof(r->arp_dev));
+ neigh_release(neigh);
+ err = 0;
+ }
+ return err;
+}
+
+static int arp_invalidate(struct net_device *dev, __be32 ip)
+{
+ struct neighbour *neigh = neigh_lookup(&arp_tbl, &ip, dev);
+ int err = -ENXIO;
+
+ if (neigh) {
+ if (neigh->nud_state & ~NUD_NOARP)
+ err = neigh_update(neigh, NULL, NUD_FAILED,
+ NEIGH_UPDATE_F_OVERRIDE|
+ NEIGH_UPDATE_F_ADMIN);
+ neigh_release(neigh);
+ }
+
+ return err;
+}
+
+static int arp_req_delete_public(struct net *net, struct arpreq *r,
+ struct net_device *dev)
+{
+ __be32 ip = ((struct sockaddr_in *) &r->arp_pa)->sin_addr.s_addr;
+ __be32 mask = ((struct sockaddr_in *)&r->arp_netmask)->sin_addr.s_addr;
+
+ if (mask == htonl(0xFFFFFFFF))
+ return pneigh_delete(&arp_tbl, net, &ip, dev);
+
+ if (mask)
+ return -EINVAL;
+
+ return arp_req_set_proxy(net, dev, 0);
+}
+
+static int arp_req_delete(struct net *net, struct arpreq *r,
+ struct net_device *dev)
+{
+ __be32 ip;
+
+ if (r->arp_flags & ATF_PUBL)
+ return arp_req_delete_public(net, r, dev);
+
+ ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr;
+ if (!dev) {
+ struct rtable *rt = ip_route_output(net, ip, 0, RTO_ONLINK, 0);
+ if (IS_ERR(rt))
+ return PTR_ERR(rt);
+ dev = rt->dst.dev;
+ ip_rt_put(rt);
+ if (!dev)
+ return -EINVAL;
+ }
+ return arp_invalidate(dev, ip);
+}
+
+/*
+ * Handle an ARP layer I/O control request.
+ */
+
+int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg)
+{
+ int err;
+ struct arpreq r;
+ struct net_device *dev = NULL;
+
+ switch (cmd) {
+ case SIOCDARP:
+ case SIOCSARP:
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+ case SIOCGARP:
+ err = copy_from_user(&r, arg, sizeof(struct arpreq));
+ if (err)
+ return -EFAULT;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ if (r.arp_pa.sa_family != AF_INET)
+ return -EPFNOSUPPORT;
+
+ if (!(r.arp_flags & ATF_PUBL) &&
+ (r.arp_flags & (ATF_NETMASK | ATF_DONTPUB)))
+ return -EINVAL;
+ if (!(r.arp_flags & ATF_NETMASK))
+ ((struct sockaddr_in *)&r.arp_netmask)->sin_addr.s_addr =
+ htonl(0xFFFFFFFFUL);
+ rtnl_lock();
+ if (r.arp_dev[0]) {
+ err = -ENODEV;
+ dev = __dev_get_by_name(net, r.arp_dev);
+ if (!dev)
+ goto out;
+
+ /* Mmmm... It is wrong... ARPHRD_NETROM==0 */
+ if (!r.arp_ha.sa_family)
+ r.arp_ha.sa_family = dev->type;
+ err = -EINVAL;
+ if ((r.arp_flags & ATF_COM) && r.arp_ha.sa_family != dev->type)
+ goto out;
+ } else if (cmd == SIOCGARP) {
+ err = -ENODEV;
+ goto out;
+ }
+
+ switch (cmd) {
+ case SIOCDARP:
+ err = arp_req_delete(net, &r, dev);
+ break;
+ case SIOCSARP:
+ err = arp_req_set(net, &r, dev);
+ break;
+ case SIOCGARP:
+ err = arp_req_get(&r, dev);
+ break;
+ }
+out:
+ rtnl_unlock();
+ if (cmd == SIOCGARP && !err && copy_to_user(arg, &r, sizeof(r)))
+ err = -EFAULT;
+ return err;
+}
+
+static int arp_netdev_event(struct notifier_block *this, unsigned long event,
+ void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct netdev_notifier_change_info *change_info;
+
+ switch (event) {
+ case NETDEV_CHANGEADDR:
+ neigh_changeaddr(&arp_tbl, dev);
+ rt_cache_flush(dev_net(dev));
+ break;
+ case NETDEV_CHANGE:
+ change_info = ptr;
+ if (change_info->flags_changed & IFF_NOARP)
+ neigh_changeaddr(&arp_tbl, dev);
+ break;
+ default:
+ break;
+ }
+
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block arp_netdev_notifier = {
+ .notifier_call = arp_netdev_event,
+};
+
+/* Note, that it is not on notifier chain.
+ It is necessary, that this routine was called after route cache will be
+ flushed.
+ */
+void arp_ifdown(struct net_device *dev)
+{
+ neigh_ifdown(&arp_tbl, dev);
+}
+
+
+/*
+ * Called once on startup.
+ */
+
+static struct packet_type arp_packet_type __read_mostly = {
+ .type = cpu_to_be16(ETH_P_ARP),
+ .func = arp_rcv,
+};
+
+static int arp_proc_init(void);
+
+void __init arp_init(void)
+{
+ neigh_table_init(NEIGH_ARP_TABLE, &arp_tbl);
+
+ dev_add_pack(&arp_packet_type);
+ arp_proc_init();
+#ifdef CONFIG_SYSCTL
+ neigh_sysctl_register(NULL, &arp_tbl.parms, NULL);
+#endif
+ register_netdevice_notifier(&arp_netdev_notifier);
+}
+
+#ifdef CONFIG_PROC_FS
+#if IS_ENABLED(CONFIG_AX25)
+
+/* ------------------------------------------------------------------------ */
+/*
+ * ax25 -> ASCII conversion
+ */
+static char *ax2asc2(ax25_address *a, char *buf)
+{
+ char c, *s;
+ int n;
+
+ for (n = 0, s = buf; n < 6; n++) {
+ c = (a->ax25_call[n] >> 1) & 0x7F;
+
+ if (c != ' ')
+ *s++ = c;
+ }
+
+ *s++ = '-';
+ n = (a->ax25_call[6] >> 1) & 0x0F;
+ if (n > 9) {
+ *s++ = '1';
+ n -= 10;
+ }
+
+ *s++ = n + '0';
+ *s++ = '\0';
+
+ if (*buf == '\0' || *buf == '-')
+ return "*";
+
+ return buf;
+}
+#endif /* CONFIG_AX25 */
+
+#define HBUFFERLEN 30
+
+static void arp_format_neigh_entry(struct seq_file *seq,
+ struct neighbour *n)
+{
+ char hbuffer[HBUFFERLEN];
+ int k, j;
+ char tbuf[16];
+ struct net_device *dev = n->dev;
+ int hatype = dev->type;
+
+ read_lock(&n->lock);
+ /* Convert hardware address to XX:XX:XX:XX ... form. */
+#if IS_ENABLED(CONFIG_AX25)
+ if (hatype == ARPHRD_AX25 || hatype == ARPHRD_NETROM)
+ ax2asc2((ax25_address *)n->ha, hbuffer);
+ else {
+#endif
+ for (k = 0, j = 0; k < HBUFFERLEN - 3 && j < dev->addr_len; j++) {
+ hbuffer[k++] = hex_asc_hi(n->ha[j]);
+ hbuffer[k++] = hex_asc_lo(n->ha[j]);
+ hbuffer[k++] = ':';
+ }
+ if (k != 0)
+ --k;
+ hbuffer[k] = 0;
+#if IS_ENABLED(CONFIG_AX25)
+ }
+#endif
+ sprintf(tbuf, "%pI4", n->primary_key);
+ seq_printf(seq, "%-16s 0x%-10x0x%-10x%s * %s\n",
+ tbuf, hatype, arp_state_to_flags(n), hbuffer, dev->name);
+ read_unlock(&n->lock);
+}
+
+static void arp_format_pneigh_entry(struct seq_file *seq,
+ struct pneigh_entry *n)
+{
+ struct net_device *dev = n->dev;
+ int hatype = dev ? dev->type : 0;
+ char tbuf[16];
+
+ sprintf(tbuf, "%pI4", n->key);
+ seq_printf(seq, "%-16s 0x%-10x0x%-10x%s * %s\n",
+ tbuf, hatype, ATF_PUBL | ATF_PERM, "00:00:00:00:00:00",
+ dev ? dev->name : "*");
+}
+
+static int arp_seq_show(struct seq_file *seq, void *v)
+{
+ if (v == SEQ_START_TOKEN) {
+ seq_puts(seq, "IP address HW type Flags "
+ "HW address Mask Device\n");
+ } else {
+ struct neigh_seq_state *state = seq->private;
+
+ if (state->flags & NEIGH_SEQ_IS_PNEIGH)
+ arp_format_pneigh_entry(seq, v);
+ else
+ arp_format_neigh_entry(seq, v);
+ }
+
+ return 0;
+}
+
+static void *arp_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ /* Don't want to confuse "arp -a" w/ magic entries,
+ * so we tell the generic iterator to skip NUD_NOARP.
+ */
+ return neigh_seq_start(seq, pos, &arp_tbl, NEIGH_SEQ_SKIP_NOARP);
+}
+
+/* ------------------------------------------------------------------------ */
+
+static const struct seq_operations arp_seq_ops = {
+ .start = arp_seq_start,
+ .next = neigh_seq_next,
+ .stop = neigh_seq_stop,
+ .show = arp_seq_show,
+};
+
+static int arp_seq_open(struct inode *inode, struct file *file)
+{
+ return seq_open_net(inode, file, &arp_seq_ops,
+ sizeof(struct neigh_seq_state));
+}
+
+static const struct file_operations arp_seq_fops = {
+ .owner = THIS_MODULE,
+ .open = arp_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_net,
+};
+
+
+static int __net_init arp_net_init(struct net *net)
+{
+ if (!proc_create("arp", S_IRUGO, net->proc_net, &arp_seq_fops))
+ return -ENOMEM;
+ return 0;
+}
+
+static void __net_exit arp_net_exit(struct net *net)
+{
+ remove_proc_entry("arp", net->proc_net);
+}
+
+static struct pernet_operations arp_net_ops = {
+ .init = arp_net_init,
+ .exit = arp_net_exit,
+};
+
+static int __init arp_proc_init(void)
+{
+ return register_pernet_subsys(&arp_net_ops);
+}
+
+#else /* CONFIG_PROC_FS */
+
+static int __init arp_proc_init(void)
+{
+ return 0;
+}
+
+#endif /* CONFIG_PROC_FS */
diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c
new file mode 100644
index 000000000..bdb2a07ec
--- /dev/null
+++ b/net/ipv4/cipso_ipv4.c
@@ -0,0 +1,2357 @@
+/*
+ * CIPSO - Commercial IP Security Option
+ *
+ * This is an implementation of the CIPSO 2.2 protocol as specified in
+ * draft-ietf-cipso-ipsecurity-01.txt with additional tag types as found in
+ * FIPS-188. While CIPSO never became a full IETF RFC standard many vendors
+ * have chosen to adopt the protocol and over the years it has become a
+ * de-facto standard for labeled networking.
+ *
+ * The CIPSO draft specification can be found in the kernel's Documentation
+ * directory as well as the following URL:
+ * http://tools.ietf.org/id/draft-ietf-cipso-ipsecurity-01.txt
+ * The FIPS-188 specification can be found at the following URL:
+ * http://www.itl.nist.gov/fipspubs/fip188.htm
+ *
+ * Author: Paul Moore <paul.moore@hp.com>
+ *
+ */
+
+/*
+ * (c) Copyright Hewlett-Packard Development Company, L.P., 2006, 2008
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/rcupdate.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/string.h>
+#include <linux/jhash.h>
+#include <linux/audit.h>
+#include <linux/slab.h>
+#include <net/ip.h>
+#include <net/icmp.h>
+#include <net/tcp.h>
+#include <net/netlabel.h>
+#include <net/cipso_ipv4.h>
+#include <linux/atomic.h>
+#include <linux/bug.h>
+#include <asm/unaligned.h>
+
+/* List of available DOI definitions */
+/* XXX - This currently assumes a minimal number of different DOIs in use,
+ * if in practice there are a lot of different DOIs this list should
+ * probably be turned into a hash table or something similar so we
+ * can do quick lookups. */
+static DEFINE_SPINLOCK(cipso_v4_doi_list_lock);
+static LIST_HEAD(cipso_v4_doi_list);
+
+/* Label mapping cache */
+int cipso_v4_cache_enabled = 1;
+int cipso_v4_cache_bucketsize = 10;
+#define CIPSO_V4_CACHE_BUCKETBITS 7
+#define CIPSO_V4_CACHE_BUCKETS (1 << CIPSO_V4_CACHE_BUCKETBITS)
+#define CIPSO_V4_CACHE_REORDERLIMIT 10
+struct cipso_v4_map_cache_bkt {
+ spinlock_t lock;
+ u32 size;
+ struct list_head list;
+};
+
+struct cipso_v4_map_cache_entry {
+ u32 hash;
+ unsigned char *key;
+ size_t key_len;
+
+ struct netlbl_lsm_cache *lsm_data;
+
+ u32 activity;
+ struct list_head list;
+};
+
+static struct cipso_v4_map_cache_bkt *cipso_v4_cache;
+
+/* Restricted bitmap (tag #1) flags */
+int cipso_v4_rbm_optfmt = 0;
+int cipso_v4_rbm_strictvalid = 1;
+
+/*
+ * Protocol Constants
+ */
+
+/* Maximum size of the CIPSO IP option, derived from the fact that the maximum
+ * IPv4 header size is 60 bytes and the base IPv4 header is 20 bytes long. */
+#define CIPSO_V4_OPT_LEN_MAX 40
+
+/* Length of the base CIPSO option, this includes the option type (1 byte), the
+ * option length (1 byte), and the DOI (4 bytes). */
+#define CIPSO_V4_HDR_LEN 6
+
+/* Base length of the restrictive category bitmap tag (tag #1). */
+#define CIPSO_V4_TAG_RBM_BLEN 4
+
+/* Base length of the enumerated category tag (tag #2). */
+#define CIPSO_V4_TAG_ENUM_BLEN 4
+
+/* Base length of the ranged categories bitmap tag (tag #5). */
+#define CIPSO_V4_TAG_RNG_BLEN 4
+/* The maximum number of category ranges permitted in the ranged category tag
+ * (tag #5). You may note that the IETF draft states that the maximum number
+ * of category ranges is 7, but if the low end of the last category range is
+ * zero then it is possible to fit 8 category ranges because the zero should
+ * be omitted. */
+#define CIPSO_V4_TAG_RNG_CAT_MAX 8
+
+/* Base length of the local tag (non-standard tag).
+ * Tag definition (may change between kernel versions)
+ *
+ * 0 8 16 24 32
+ * +----------+----------+----------+----------+
+ * | 10000000 | 00000110 | 32-bit secid value |
+ * +----------+----------+----------+----------+
+ * | in (host byte order)|
+ * +----------+----------+
+ *
+ */
+#define CIPSO_V4_TAG_LOC_BLEN 6
+
+/*
+ * Helper Functions
+ */
+
+/**
+ * cipso_v4_bitmap_walk - Walk a bitmap looking for a bit
+ * @bitmap: the bitmap
+ * @bitmap_len: length in bits
+ * @offset: starting offset
+ * @state: if non-zero, look for a set (1) bit else look for a cleared (0) bit
+ *
+ * Description:
+ * Starting at @offset, walk the bitmap from left to right until either the
+ * desired bit is found or we reach the end. Return the bit offset, -1 if
+ * not found, or -2 if error.
+ */
+static int cipso_v4_bitmap_walk(const unsigned char *bitmap,
+ u32 bitmap_len,
+ u32 offset,
+ u8 state)
+{
+ u32 bit_spot;
+ u32 byte_offset;
+ unsigned char bitmask;
+ unsigned char byte;
+
+ /* gcc always rounds to zero when doing integer division */
+ byte_offset = offset / 8;
+ byte = bitmap[byte_offset];
+ bit_spot = offset;
+ bitmask = 0x80 >> (offset % 8);
+
+ while (bit_spot < bitmap_len) {
+ if ((state && (byte & bitmask) == bitmask) ||
+ (state == 0 && (byte & bitmask) == 0))
+ return bit_spot;
+
+ bit_spot++;
+ bitmask >>= 1;
+ if (bitmask == 0) {
+ byte = bitmap[++byte_offset];
+ bitmask = 0x80;
+ }
+ }
+
+ return -1;
+}
+
+/**
+ * cipso_v4_bitmap_setbit - Sets a single bit in a bitmap
+ * @bitmap: the bitmap
+ * @bit: the bit
+ * @state: if non-zero, set the bit (1) else clear the bit (0)
+ *
+ * Description:
+ * Set a single bit in the bitmask. Returns zero on success, negative values
+ * on error.
+ */
+static void cipso_v4_bitmap_setbit(unsigned char *bitmap,
+ u32 bit,
+ u8 state)
+{
+ u32 byte_spot;
+ u8 bitmask;
+
+ /* gcc always rounds to zero when doing integer division */
+ byte_spot = bit / 8;
+ bitmask = 0x80 >> (bit % 8);
+ if (state)
+ bitmap[byte_spot] |= bitmask;
+ else
+ bitmap[byte_spot] &= ~bitmask;
+}
+
+/**
+ * cipso_v4_cache_entry_free - Frees a cache entry
+ * @entry: the entry to free
+ *
+ * Description:
+ * This function frees the memory associated with a cache entry including the
+ * LSM cache data if there are no longer any users, i.e. reference count == 0.
+ *
+ */
+static void cipso_v4_cache_entry_free(struct cipso_v4_map_cache_entry *entry)
+{
+ if (entry->lsm_data)
+ netlbl_secattr_cache_free(entry->lsm_data);
+ kfree(entry->key);
+ kfree(entry);
+}
+
+/**
+ * cipso_v4_map_cache_hash - Hashing function for the CIPSO cache
+ * @key: the hash key
+ * @key_len: the length of the key in bytes
+ *
+ * Description:
+ * The CIPSO tag hashing function. Returns a 32-bit hash value.
+ *
+ */
+static u32 cipso_v4_map_cache_hash(const unsigned char *key, u32 key_len)
+{
+ return jhash(key, key_len, 0);
+}
+
+/*
+ * Label Mapping Cache Functions
+ */
+
+/**
+ * cipso_v4_cache_init - Initialize the CIPSO cache
+ *
+ * Description:
+ * Initializes the CIPSO label mapping cache, this function should be called
+ * before any of the other functions defined in this file. Returns zero on
+ * success, negative values on error.
+ *
+ */
+static int __init cipso_v4_cache_init(void)
+{
+ u32 iter;
+
+ cipso_v4_cache = kcalloc(CIPSO_V4_CACHE_BUCKETS,
+ sizeof(struct cipso_v4_map_cache_bkt),
+ GFP_KERNEL);
+ if (!cipso_v4_cache)
+ return -ENOMEM;
+
+ for (iter = 0; iter < CIPSO_V4_CACHE_BUCKETS; iter++) {
+ spin_lock_init(&cipso_v4_cache[iter].lock);
+ cipso_v4_cache[iter].size = 0;
+ INIT_LIST_HEAD(&cipso_v4_cache[iter].list);
+ }
+
+ return 0;
+}
+
+/**
+ * cipso_v4_cache_invalidate - Invalidates the current CIPSO cache
+ *
+ * Description:
+ * Invalidates and frees any entries in the CIPSO cache. Returns zero on
+ * success and negative values on failure.
+ *
+ */
+void cipso_v4_cache_invalidate(void)
+{
+ struct cipso_v4_map_cache_entry *entry, *tmp_entry;
+ u32 iter;
+
+ for (iter = 0; iter < CIPSO_V4_CACHE_BUCKETS; iter++) {
+ spin_lock_bh(&cipso_v4_cache[iter].lock);
+ list_for_each_entry_safe(entry,
+ tmp_entry,
+ &cipso_v4_cache[iter].list, list) {
+ list_del(&entry->list);
+ cipso_v4_cache_entry_free(entry);
+ }
+ cipso_v4_cache[iter].size = 0;
+ spin_unlock_bh(&cipso_v4_cache[iter].lock);
+ }
+}
+
+/**
+ * cipso_v4_cache_check - Check the CIPSO cache for a label mapping
+ * @key: the buffer to check
+ * @key_len: buffer length in bytes
+ * @secattr: the security attribute struct to use
+ *
+ * Description:
+ * This function checks the cache to see if a label mapping already exists for
+ * the given key. If there is a match then the cache is adjusted and the
+ * @secattr struct is populated with the correct LSM security attributes. The
+ * cache is adjusted in the following manner if the entry is not already the
+ * first in the cache bucket:
+ *
+ * 1. The cache entry's activity counter is incremented
+ * 2. The previous (higher ranking) entry's activity counter is decremented
+ * 3. If the difference between the two activity counters is geater than
+ * CIPSO_V4_CACHE_REORDERLIMIT the two entries are swapped
+ *
+ * Returns zero on success, -ENOENT for a cache miss, and other negative values
+ * on error.
+ *
+ */
+static int cipso_v4_cache_check(const unsigned char *key,
+ u32 key_len,
+ struct netlbl_lsm_secattr *secattr)
+{
+ u32 bkt;
+ struct cipso_v4_map_cache_entry *entry;
+ struct cipso_v4_map_cache_entry *prev_entry = NULL;
+ u32 hash;
+
+ if (!cipso_v4_cache_enabled)
+ return -ENOENT;
+
+ hash = cipso_v4_map_cache_hash(key, key_len);
+ bkt = hash & (CIPSO_V4_CACHE_BUCKETS - 1);
+ spin_lock_bh(&cipso_v4_cache[bkt].lock);
+ list_for_each_entry(entry, &cipso_v4_cache[bkt].list, list) {
+ if (entry->hash == hash &&
+ entry->key_len == key_len &&
+ memcmp(entry->key, key, key_len) == 0) {
+ entry->activity += 1;
+ atomic_inc(&entry->lsm_data->refcount);
+ secattr->cache = entry->lsm_data;
+ secattr->flags |= NETLBL_SECATTR_CACHE;
+ secattr->type = NETLBL_NLTYPE_CIPSOV4;
+ if (!prev_entry) {
+ spin_unlock_bh(&cipso_v4_cache[bkt].lock);
+ return 0;
+ }
+
+ if (prev_entry->activity > 0)
+ prev_entry->activity -= 1;
+ if (entry->activity > prev_entry->activity &&
+ entry->activity - prev_entry->activity >
+ CIPSO_V4_CACHE_REORDERLIMIT) {
+ __list_del(entry->list.prev, entry->list.next);
+ __list_add(&entry->list,
+ prev_entry->list.prev,
+ &prev_entry->list);
+ }
+
+ spin_unlock_bh(&cipso_v4_cache[bkt].lock);
+ return 0;
+ }
+ prev_entry = entry;
+ }
+ spin_unlock_bh(&cipso_v4_cache[bkt].lock);
+
+ return -ENOENT;
+}
+
+/**
+ * cipso_v4_cache_add - Add an entry to the CIPSO cache
+ * @skb: the packet
+ * @secattr: the packet's security attributes
+ *
+ * Description:
+ * Add a new entry into the CIPSO label mapping cache. Add the new entry to
+ * head of the cache bucket's list, if the cache bucket is out of room remove
+ * the last entry in the list first. It is important to note that there is
+ * currently no checking for duplicate keys. Returns zero on success,
+ * negative values on failure.
+ *
+ */
+int cipso_v4_cache_add(const unsigned char *cipso_ptr,
+ const struct netlbl_lsm_secattr *secattr)
+{
+ int ret_val = -EPERM;
+ u32 bkt;
+ struct cipso_v4_map_cache_entry *entry = NULL;
+ struct cipso_v4_map_cache_entry *old_entry = NULL;
+ u32 cipso_ptr_len;
+
+ if (!cipso_v4_cache_enabled || cipso_v4_cache_bucketsize <= 0)
+ return 0;
+
+ cipso_ptr_len = cipso_ptr[1];
+
+ entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
+ if (!entry)
+ return -ENOMEM;
+ entry->key = kmemdup(cipso_ptr, cipso_ptr_len, GFP_ATOMIC);
+ if (!entry->key) {
+ ret_val = -ENOMEM;
+ goto cache_add_failure;
+ }
+ entry->key_len = cipso_ptr_len;
+ entry->hash = cipso_v4_map_cache_hash(cipso_ptr, cipso_ptr_len);
+ atomic_inc(&secattr->cache->refcount);
+ entry->lsm_data = secattr->cache;
+
+ bkt = entry->hash & (CIPSO_V4_CACHE_BUCKETS - 1);
+ spin_lock_bh(&cipso_v4_cache[bkt].lock);
+ if (cipso_v4_cache[bkt].size < cipso_v4_cache_bucketsize) {
+ list_add(&entry->list, &cipso_v4_cache[bkt].list);
+ cipso_v4_cache[bkt].size += 1;
+ } else {
+ old_entry = list_entry(cipso_v4_cache[bkt].list.prev,
+ struct cipso_v4_map_cache_entry, list);
+ list_del(&old_entry->list);
+ list_add(&entry->list, &cipso_v4_cache[bkt].list);
+ cipso_v4_cache_entry_free(old_entry);
+ }
+ spin_unlock_bh(&cipso_v4_cache[bkt].lock);
+
+ return 0;
+
+cache_add_failure:
+ if (entry)
+ cipso_v4_cache_entry_free(entry);
+ return ret_val;
+}
+
+/*
+ * DOI List Functions
+ */
+
+/**
+ * cipso_v4_doi_search - Searches for a DOI definition
+ * @doi: the DOI to search for
+ *
+ * Description:
+ * Search the DOI definition list for a DOI definition with a DOI value that
+ * matches @doi. The caller is responsible for calling rcu_read_[un]lock().
+ * Returns a pointer to the DOI definition on success and NULL on failure.
+ */
+static struct cipso_v4_doi *cipso_v4_doi_search(u32 doi)
+{
+ struct cipso_v4_doi *iter;
+
+ list_for_each_entry_rcu(iter, &cipso_v4_doi_list, list)
+ if (iter->doi == doi && atomic_read(&iter->refcount))
+ return iter;
+ return NULL;
+}
+
+/**
+ * cipso_v4_doi_add - Add a new DOI to the CIPSO protocol engine
+ * @doi_def: the DOI structure
+ * @audit_info: NetLabel audit information
+ *
+ * Description:
+ * The caller defines a new DOI for use by the CIPSO engine and calls this
+ * function to add it to the list of acceptable domains. The caller must
+ * ensure that the mapping table specified in @doi_def->map meets all of the
+ * requirements of the mapping type (see cipso_ipv4.h for details). Returns
+ * zero on success and non-zero on failure.
+ *
+ */
+int cipso_v4_doi_add(struct cipso_v4_doi *doi_def,
+ struct netlbl_audit *audit_info)
+{
+ int ret_val = -EINVAL;
+ u32 iter;
+ u32 doi;
+ u32 doi_type;
+ struct audit_buffer *audit_buf;
+
+ doi = doi_def->doi;
+ doi_type = doi_def->type;
+
+ if (doi_def->doi == CIPSO_V4_DOI_UNKNOWN)
+ goto doi_add_return;
+ for (iter = 0; iter < CIPSO_V4_TAG_MAXCNT; iter++) {
+ switch (doi_def->tags[iter]) {
+ case CIPSO_V4_TAG_RBITMAP:
+ break;
+ case CIPSO_V4_TAG_RANGE:
+ case CIPSO_V4_TAG_ENUM:
+ if (doi_def->type != CIPSO_V4_MAP_PASS)
+ goto doi_add_return;
+ break;
+ case CIPSO_V4_TAG_LOCAL:
+ if (doi_def->type != CIPSO_V4_MAP_LOCAL)
+ goto doi_add_return;
+ break;
+ case CIPSO_V4_TAG_INVALID:
+ if (iter == 0)
+ goto doi_add_return;
+ break;
+ default:
+ goto doi_add_return;
+ }
+ }
+
+ atomic_set(&doi_def->refcount, 1);
+
+ spin_lock(&cipso_v4_doi_list_lock);
+ if (cipso_v4_doi_search(doi_def->doi)) {
+ spin_unlock(&cipso_v4_doi_list_lock);
+ ret_val = -EEXIST;
+ goto doi_add_return;
+ }
+ list_add_tail_rcu(&doi_def->list, &cipso_v4_doi_list);
+ spin_unlock(&cipso_v4_doi_list_lock);
+ ret_val = 0;
+
+doi_add_return:
+ audit_buf = netlbl_audit_start(AUDIT_MAC_CIPSOV4_ADD, audit_info);
+ if (audit_buf) {
+ const char *type_str;
+ switch (doi_type) {
+ case CIPSO_V4_MAP_TRANS:
+ type_str = "trans";
+ break;
+ case CIPSO_V4_MAP_PASS:
+ type_str = "pass";
+ break;
+ case CIPSO_V4_MAP_LOCAL:
+ type_str = "local";
+ break;
+ default:
+ type_str = "(unknown)";
+ }
+ audit_log_format(audit_buf,
+ " cipso_doi=%u cipso_type=%s res=%u",
+ doi, type_str, ret_val == 0 ? 1 : 0);
+ audit_log_end(audit_buf);
+ }
+
+ return ret_val;
+}
+
+/**
+ * cipso_v4_doi_free - Frees a DOI definition
+ * @doi_def: the DOI definition
+ *
+ * Description:
+ * This function frees all of the memory associated with a DOI definition.
+ *
+ */
+void cipso_v4_doi_free(struct cipso_v4_doi *doi_def)
+{
+ if (!doi_def)
+ return;
+
+ switch (doi_def->type) {
+ case CIPSO_V4_MAP_TRANS:
+ kfree(doi_def->map.std->lvl.cipso);
+ kfree(doi_def->map.std->lvl.local);
+ kfree(doi_def->map.std->cat.cipso);
+ kfree(doi_def->map.std->cat.local);
+ break;
+ }
+ kfree(doi_def);
+}
+
+/**
+ * cipso_v4_doi_free_rcu - Frees a DOI definition via the RCU pointer
+ * @entry: the entry's RCU field
+ *
+ * Description:
+ * This function is designed to be used as a callback to the call_rcu()
+ * function so that the memory allocated to the DOI definition can be released
+ * safely.
+ *
+ */
+static void cipso_v4_doi_free_rcu(struct rcu_head *entry)
+{
+ struct cipso_v4_doi *doi_def;
+
+ doi_def = container_of(entry, struct cipso_v4_doi, rcu);
+ cipso_v4_doi_free(doi_def);
+}
+
+/**
+ * cipso_v4_doi_remove - Remove an existing DOI from the CIPSO protocol engine
+ * @doi: the DOI value
+ * @audit_secid: the LSM secid to use in the audit message
+ *
+ * Description:
+ * Removes a DOI definition from the CIPSO engine. The NetLabel routines will
+ * be called to release their own LSM domain mappings as well as our own
+ * domain list. Returns zero on success and negative values on failure.
+ *
+ */
+int cipso_v4_doi_remove(u32 doi, struct netlbl_audit *audit_info)
+{
+ int ret_val;
+ struct cipso_v4_doi *doi_def;
+ struct audit_buffer *audit_buf;
+
+ spin_lock(&cipso_v4_doi_list_lock);
+ doi_def = cipso_v4_doi_search(doi);
+ if (!doi_def) {
+ spin_unlock(&cipso_v4_doi_list_lock);
+ ret_val = -ENOENT;
+ goto doi_remove_return;
+ }
+ if (!atomic_dec_and_test(&doi_def->refcount)) {
+ spin_unlock(&cipso_v4_doi_list_lock);
+ ret_val = -EBUSY;
+ goto doi_remove_return;
+ }
+ list_del_rcu(&doi_def->list);
+ spin_unlock(&cipso_v4_doi_list_lock);
+
+ cipso_v4_cache_invalidate();
+ call_rcu(&doi_def->rcu, cipso_v4_doi_free_rcu);
+ ret_val = 0;
+
+doi_remove_return:
+ audit_buf = netlbl_audit_start(AUDIT_MAC_CIPSOV4_DEL, audit_info);
+ if (audit_buf) {
+ audit_log_format(audit_buf,
+ " cipso_doi=%u res=%u",
+ doi, ret_val == 0 ? 1 : 0);
+ audit_log_end(audit_buf);
+ }
+
+ return ret_val;
+}
+
+/**
+ * cipso_v4_doi_getdef - Returns a reference to a valid DOI definition
+ * @doi: the DOI value
+ *
+ * Description:
+ * Searches for a valid DOI definition and if one is found it is returned to
+ * the caller. Otherwise NULL is returned. The caller must ensure that
+ * rcu_read_lock() is held while accessing the returned definition and the DOI
+ * definition reference count is decremented when the caller is done.
+ *
+ */
+struct cipso_v4_doi *cipso_v4_doi_getdef(u32 doi)
+{
+ struct cipso_v4_doi *doi_def;
+
+ rcu_read_lock();
+ doi_def = cipso_v4_doi_search(doi);
+ if (!doi_def)
+ goto doi_getdef_return;
+ if (!atomic_inc_not_zero(&doi_def->refcount))
+ doi_def = NULL;
+
+doi_getdef_return:
+ rcu_read_unlock();
+ return doi_def;
+}
+
+/**
+ * cipso_v4_doi_putdef - Releases a reference for the given DOI definition
+ * @doi_def: the DOI definition
+ *
+ * Description:
+ * Releases a DOI definition reference obtained from cipso_v4_doi_getdef().
+ *
+ */
+void cipso_v4_doi_putdef(struct cipso_v4_doi *doi_def)
+{
+ if (!doi_def)
+ return;
+
+ if (!atomic_dec_and_test(&doi_def->refcount))
+ return;
+ spin_lock(&cipso_v4_doi_list_lock);
+ list_del_rcu(&doi_def->list);
+ spin_unlock(&cipso_v4_doi_list_lock);
+
+ cipso_v4_cache_invalidate();
+ call_rcu(&doi_def->rcu, cipso_v4_doi_free_rcu);
+}
+
+/**
+ * cipso_v4_doi_walk - Iterate through the DOI definitions
+ * @skip_cnt: skip past this number of DOI definitions, updated
+ * @callback: callback for each DOI definition
+ * @cb_arg: argument for the callback function
+ *
+ * Description:
+ * Iterate over the DOI definition list, skipping the first @skip_cnt entries.
+ * For each entry call @callback, if @callback returns a negative value stop
+ * 'walking' through the list and return. Updates the value in @skip_cnt upon
+ * return. Returns zero on success, negative values on failure.
+ *
+ */
+int cipso_v4_doi_walk(u32 *skip_cnt,
+ int (*callback) (struct cipso_v4_doi *doi_def, void *arg),
+ void *cb_arg)
+{
+ int ret_val = -ENOENT;
+ u32 doi_cnt = 0;
+ struct cipso_v4_doi *iter_doi;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(iter_doi, &cipso_v4_doi_list, list)
+ if (atomic_read(&iter_doi->refcount) > 0) {
+ if (doi_cnt++ < *skip_cnt)
+ continue;
+ ret_val = callback(iter_doi, cb_arg);
+ if (ret_val < 0) {
+ doi_cnt--;
+ goto doi_walk_return;
+ }
+ }
+
+doi_walk_return:
+ rcu_read_unlock();
+ *skip_cnt = doi_cnt;
+ return ret_val;
+}
+
+/*
+ * Label Mapping Functions
+ */
+
+/**
+ * cipso_v4_map_lvl_valid - Checks to see if the given level is understood
+ * @doi_def: the DOI definition
+ * @level: the level to check
+ *
+ * Description:
+ * Checks the given level against the given DOI definition and returns a
+ * negative value if the level does not have a valid mapping and a zero value
+ * if the level is defined by the DOI.
+ *
+ */
+static int cipso_v4_map_lvl_valid(const struct cipso_v4_doi *doi_def, u8 level)
+{
+ switch (doi_def->type) {
+ case CIPSO_V4_MAP_PASS:
+ return 0;
+ case CIPSO_V4_MAP_TRANS:
+ if (doi_def->map.std->lvl.cipso[level] < CIPSO_V4_INV_LVL)
+ return 0;
+ break;
+ }
+
+ return -EFAULT;
+}
+
+/**
+ * cipso_v4_map_lvl_hton - Perform a level mapping from the host to the network
+ * @doi_def: the DOI definition
+ * @host_lvl: the host MLS level
+ * @net_lvl: the network/CIPSO MLS level
+ *
+ * Description:
+ * Perform a label mapping to translate a local MLS level to the correct
+ * CIPSO level using the given DOI definition. Returns zero on success,
+ * negative values otherwise.
+ *
+ */
+static int cipso_v4_map_lvl_hton(const struct cipso_v4_doi *doi_def,
+ u32 host_lvl,
+ u32 *net_lvl)
+{
+ switch (doi_def->type) {
+ case CIPSO_V4_MAP_PASS:
+ *net_lvl = host_lvl;
+ return 0;
+ case CIPSO_V4_MAP_TRANS:
+ if (host_lvl < doi_def->map.std->lvl.local_size &&
+ doi_def->map.std->lvl.local[host_lvl] < CIPSO_V4_INV_LVL) {
+ *net_lvl = doi_def->map.std->lvl.local[host_lvl];
+ return 0;
+ }
+ return -EPERM;
+ }
+
+ return -EINVAL;
+}
+
+/**
+ * cipso_v4_map_lvl_ntoh - Perform a level mapping from the network to the host
+ * @doi_def: the DOI definition
+ * @net_lvl: the network/CIPSO MLS level
+ * @host_lvl: the host MLS level
+ *
+ * Description:
+ * Perform a label mapping to translate a CIPSO level to the correct local MLS
+ * level using the given DOI definition. Returns zero on success, negative
+ * values otherwise.
+ *
+ */
+static int cipso_v4_map_lvl_ntoh(const struct cipso_v4_doi *doi_def,
+ u32 net_lvl,
+ u32 *host_lvl)
+{
+ struct cipso_v4_std_map_tbl *map_tbl;
+
+ switch (doi_def->type) {
+ case CIPSO_V4_MAP_PASS:
+ *host_lvl = net_lvl;
+ return 0;
+ case CIPSO_V4_MAP_TRANS:
+ map_tbl = doi_def->map.std;
+ if (net_lvl < map_tbl->lvl.cipso_size &&
+ map_tbl->lvl.cipso[net_lvl] < CIPSO_V4_INV_LVL) {
+ *host_lvl = doi_def->map.std->lvl.cipso[net_lvl];
+ return 0;
+ }
+ return -EPERM;
+ }
+
+ return -EINVAL;
+}
+
+/**
+ * cipso_v4_map_cat_rbm_valid - Checks to see if the category bitmap is valid
+ * @doi_def: the DOI definition
+ * @bitmap: category bitmap
+ * @bitmap_len: bitmap length in bytes
+ *
+ * Description:
+ * Checks the given category bitmap against the given DOI definition and
+ * returns a negative value if any of the categories in the bitmap do not have
+ * a valid mapping and a zero value if all of the categories are valid.
+ *
+ */
+static int cipso_v4_map_cat_rbm_valid(const struct cipso_v4_doi *doi_def,
+ const unsigned char *bitmap,
+ u32 bitmap_len)
+{
+ int cat = -1;
+ u32 bitmap_len_bits = bitmap_len * 8;
+ u32 cipso_cat_size;
+ u32 *cipso_array;
+
+ switch (doi_def->type) {
+ case CIPSO_V4_MAP_PASS:
+ return 0;
+ case CIPSO_V4_MAP_TRANS:
+ cipso_cat_size = doi_def->map.std->cat.cipso_size;
+ cipso_array = doi_def->map.std->cat.cipso;
+ for (;;) {
+ cat = cipso_v4_bitmap_walk(bitmap,
+ bitmap_len_bits,
+ cat + 1,
+ 1);
+ if (cat < 0)
+ break;
+ if (cat >= cipso_cat_size ||
+ cipso_array[cat] >= CIPSO_V4_INV_CAT)
+ return -EFAULT;
+ }
+
+ if (cat == -1)
+ return 0;
+ break;
+ }
+
+ return -EFAULT;
+}
+
+/**
+ * cipso_v4_map_cat_rbm_hton - Perform a category mapping from host to network
+ * @doi_def: the DOI definition
+ * @secattr: the security attributes
+ * @net_cat: the zero'd out category bitmap in network/CIPSO format
+ * @net_cat_len: the length of the CIPSO bitmap in bytes
+ *
+ * Description:
+ * Perform a label mapping to translate a local MLS category bitmap to the
+ * correct CIPSO bitmap using the given DOI definition. Returns the minimum
+ * size in bytes of the network bitmap on success, negative values otherwise.
+ *
+ */
+static int cipso_v4_map_cat_rbm_hton(const struct cipso_v4_doi *doi_def,
+ const struct netlbl_lsm_secattr *secattr,
+ unsigned char *net_cat,
+ u32 net_cat_len)
+{
+ int host_spot = -1;
+ u32 net_spot = CIPSO_V4_INV_CAT;
+ u32 net_spot_max = 0;
+ u32 net_clen_bits = net_cat_len * 8;
+ u32 host_cat_size = 0;
+ u32 *host_cat_array = NULL;
+
+ if (doi_def->type == CIPSO_V4_MAP_TRANS) {
+ host_cat_size = doi_def->map.std->cat.local_size;
+ host_cat_array = doi_def->map.std->cat.local;
+ }
+
+ for (;;) {
+ host_spot = netlbl_catmap_walk(secattr->attr.mls.cat,
+ host_spot + 1);
+ if (host_spot < 0)
+ break;
+
+ switch (doi_def->type) {
+ case CIPSO_V4_MAP_PASS:
+ net_spot = host_spot;
+ break;
+ case CIPSO_V4_MAP_TRANS:
+ if (host_spot >= host_cat_size)
+ return -EPERM;
+ net_spot = host_cat_array[host_spot];
+ if (net_spot >= CIPSO_V4_INV_CAT)
+ return -EPERM;
+ break;
+ }
+ if (net_spot >= net_clen_bits)
+ return -ENOSPC;
+ cipso_v4_bitmap_setbit(net_cat, net_spot, 1);
+
+ if (net_spot > net_spot_max)
+ net_spot_max = net_spot;
+ }
+
+ if (++net_spot_max % 8)
+ return net_spot_max / 8 + 1;
+ return net_spot_max / 8;
+}
+
+/**
+ * cipso_v4_map_cat_rbm_ntoh - Perform a category mapping from network to host
+ * @doi_def: the DOI definition
+ * @net_cat: the category bitmap in network/CIPSO format
+ * @net_cat_len: the length of the CIPSO bitmap in bytes
+ * @secattr: the security attributes
+ *
+ * Description:
+ * Perform a label mapping to translate a CIPSO bitmap to the correct local
+ * MLS category bitmap using the given DOI definition. Returns zero on
+ * success, negative values on failure.
+ *
+ */
+static int cipso_v4_map_cat_rbm_ntoh(const struct cipso_v4_doi *doi_def,
+ const unsigned char *net_cat,
+ u32 net_cat_len,
+ struct netlbl_lsm_secattr *secattr)
+{
+ int ret_val;
+ int net_spot = -1;
+ u32 host_spot = CIPSO_V4_INV_CAT;
+ u32 net_clen_bits = net_cat_len * 8;
+ u32 net_cat_size = 0;
+ u32 *net_cat_array = NULL;
+
+ if (doi_def->type == CIPSO_V4_MAP_TRANS) {
+ net_cat_size = doi_def->map.std->cat.cipso_size;
+ net_cat_array = doi_def->map.std->cat.cipso;
+ }
+
+ for (;;) {
+ net_spot = cipso_v4_bitmap_walk(net_cat,
+ net_clen_bits,
+ net_spot + 1,
+ 1);
+ if (net_spot < 0) {
+ if (net_spot == -2)
+ return -EFAULT;
+ return 0;
+ }
+
+ switch (doi_def->type) {
+ case CIPSO_V4_MAP_PASS:
+ host_spot = net_spot;
+ break;
+ case CIPSO_V4_MAP_TRANS:
+ if (net_spot >= net_cat_size)
+ return -EPERM;
+ host_spot = net_cat_array[net_spot];
+ if (host_spot >= CIPSO_V4_INV_CAT)
+ return -EPERM;
+ break;
+ }
+ ret_val = netlbl_catmap_setbit(&secattr->attr.mls.cat,
+ host_spot,
+ GFP_ATOMIC);
+ if (ret_val != 0)
+ return ret_val;
+ }
+
+ return -EINVAL;
+}
+
+/**
+ * cipso_v4_map_cat_enum_valid - Checks to see if the categories are valid
+ * @doi_def: the DOI definition
+ * @enumcat: category list
+ * @enumcat_len: length of the category list in bytes
+ *
+ * Description:
+ * Checks the given categories against the given DOI definition and returns a
+ * negative value if any of the categories do not have a valid mapping and a
+ * zero value if all of the categories are valid.
+ *
+ */
+static int cipso_v4_map_cat_enum_valid(const struct cipso_v4_doi *doi_def,
+ const unsigned char *enumcat,
+ u32 enumcat_len)
+{
+ u16 cat;
+ int cat_prev = -1;
+ u32 iter;
+
+ if (doi_def->type != CIPSO_V4_MAP_PASS || enumcat_len & 0x01)
+ return -EFAULT;
+
+ for (iter = 0; iter < enumcat_len; iter += 2) {
+ cat = get_unaligned_be16(&enumcat[iter]);
+ if (cat <= cat_prev)
+ return -EFAULT;
+ cat_prev = cat;
+ }
+
+ return 0;
+}
+
+/**
+ * cipso_v4_map_cat_enum_hton - Perform a category mapping from host to network
+ * @doi_def: the DOI definition
+ * @secattr: the security attributes
+ * @net_cat: the zero'd out category list in network/CIPSO format
+ * @net_cat_len: the length of the CIPSO category list in bytes
+ *
+ * Description:
+ * Perform a label mapping to translate a local MLS category bitmap to the
+ * correct CIPSO category list using the given DOI definition. Returns the
+ * size in bytes of the network category bitmap on success, negative values
+ * otherwise.
+ *
+ */
+static int cipso_v4_map_cat_enum_hton(const struct cipso_v4_doi *doi_def,
+ const struct netlbl_lsm_secattr *secattr,
+ unsigned char *net_cat,
+ u32 net_cat_len)
+{
+ int cat = -1;
+ u32 cat_iter = 0;
+
+ for (;;) {
+ cat = netlbl_catmap_walk(secattr->attr.mls.cat, cat + 1);
+ if (cat < 0)
+ break;
+ if ((cat_iter + 2) > net_cat_len)
+ return -ENOSPC;
+
+ *((__be16 *)&net_cat[cat_iter]) = htons(cat);
+ cat_iter += 2;
+ }
+
+ return cat_iter;
+}
+
+/**
+ * cipso_v4_map_cat_enum_ntoh - Perform a category mapping from network to host
+ * @doi_def: the DOI definition
+ * @net_cat: the category list in network/CIPSO format
+ * @net_cat_len: the length of the CIPSO bitmap in bytes
+ * @secattr: the security attributes
+ *
+ * Description:
+ * Perform a label mapping to translate a CIPSO category list to the correct
+ * local MLS category bitmap using the given DOI definition. Returns zero on
+ * success, negative values on failure.
+ *
+ */
+static int cipso_v4_map_cat_enum_ntoh(const struct cipso_v4_doi *doi_def,
+ const unsigned char *net_cat,
+ u32 net_cat_len,
+ struct netlbl_lsm_secattr *secattr)
+{
+ int ret_val;
+ u32 iter;
+
+ for (iter = 0; iter < net_cat_len; iter += 2) {
+ ret_val = netlbl_catmap_setbit(&secattr->attr.mls.cat,
+ get_unaligned_be16(&net_cat[iter]),
+ GFP_ATOMIC);
+ if (ret_val != 0)
+ return ret_val;
+ }
+
+ return 0;
+}
+
+/**
+ * cipso_v4_map_cat_rng_valid - Checks to see if the categories are valid
+ * @doi_def: the DOI definition
+ * @rngcat: category list
+ * @rngcat_len: length of the category list in bytes
+ *
+ * Description:
+ * Checks the given categories against the given DOI definition and returns a
+ * negative value if any of the categories do not have a valid mapping and a
+ * zero value if all of the categories are valid.
+ *
+ */
+static int cipso_v4_map_cat_rng_valid(const struct cipso_v4_doi *doi_def,
+ const unsigned char *rngcat,
+ u32 rngcat_len)
+{
+ u16 cat_high;
+ u16 cat_low;
+ u32 cat_prev = CIPSO_V4_MAX_REM_CATS + 1;
+ u32 iter;
+
+ if (doi_def->type != CIPSO_V4_MAP_PASS || rngcat_len & 0x01)
+ return -EFAULT;
+
+ for (iter = 0; iter < rngcat_len; iter += 4) {
+ cat_high = get_unaligned_be16(&rngcat[iter]);
+ if ((iter + 4) <= rngcat_len)
+ cat_low = get_unaligned_be16(&rngcat[iter + 2]);
+ else
+ cat_low = 0;
+
+ if (cat_high > cat_prev)
+ return -EFAULT;
+
+ cat_prev = cat_low;
+ }
+
+ return 0;
+}
+
+/**
+ * cipso_v4_map_cat_rng_hton - Perform a category mapping from host to network
+ * @doi_def: the DOI definition
+ * @secattr: the security attributes
+ * @net_cat: the zero'd out category list in network/CIPSO format
+ * @net_cat_len: the length of the CIPSO category list in bytes
+ *
+ * Description:
+ * Perform a label mapping to translate a local MLS category bitmap to the
+ * correct CIPSO category list using the given DOI definition. Returns the
+ * size in bytes of the network category bitmap on success, negative values
+ * otherwise.
+ *
+ */
+static int cipso_v4_map_cat_rng_hton(const struct cipso_v4_doi *doi_def,
+ const struct netlbl_lsm_secattr *secattr,
+ unsigned char *net_cat,
+ u32 net_cat_len)
+{
+ int iter = -1;
+ u16 array[CIPSO_V4_TAG_RNG_CAT_MAX * 2];
+ u32 array_cnt = 0;
+ u32 cat_size = 0;
+
+ /* make sure we don't overflow the 'array[]' variable */
+ if (net_cat_len >
+ (CIPSO_V4_OPT_LEN_MAX - CIPSO_V4_HDR_LEN - CIPSO_V4_TAG_RNG_BLEN))
+ return -ENOSPC;
+
+ for (;;) {
+ iter = netlbl_catmap_walk(secattr->attr.mls.cat, iter + 1);
+ if (iter < 0)
+ break;
+ cat_size += (iter == 0 ? 0 : sizeof(u16));
+ if (cat_size > net_cat_len)
+ return -ENOSPC;
+ array[array_cnt++] = iter;
+
+ iter = netlbl_catmap_walkrng(secattr->attr.mls.cat, iter);
+ if (iter < 0)
+ return -EFAULT;
+ cat_size += sizeof(u16);
+ if (cat_size > net_cat_len)
+ return -ENOSPC;
+ array[array_cnt++] = iter;
+ }
+
+ for (iter = 0; array_cnt > 0;) {
+ *((__be16 *)&net_cat[iter]) = htons(array[--array_cnt]);
+ iter += 2;
+ array_cnt--;
+ if (array[array_cnt] != 0) {
+ *((__be16 *)&net_cat[iter]) = htons(array[array_cnt]);
+ iter += 2;
+ }
+ }
+
+ return cat_size;
+}
+
+/**
+ * cipso_v4_map_cat_rng_ntoh - Perform a category mapping from network to host
+ * @doi_def: the DOI definition
+ * @net_cat: the category list in network/CIPSO format
+ * @net_cat_len: the length of the CIPSO bitmap in bytes
+ * @secattr: the security attributes
+ *
+ * Description:
+ * Perform a label mapping to translate a CIPSO category list to the correct
+ * local MLS category bitmap using the given DOI definition. Returns zero on
+ * success, negative values on failure.
+ *
+ */
+static int cipso_v4_map_cat_rng_ntoh(const struct cipso_v4_doi *doi_def,
+ const unsigned char *net_cat,
+ u32 net_cat_len,
+ struct netlbl_lsm_secattr *secattr)
+{
+ int ret_val;
+ u32 net_iter;
+ u16 cat_low;
+ u16 cat_high;
+
+ for (net_iter = 0; net_iter < net_cat_len; net_iter += 4) {
+ cat_high = get_unaligned_be16(&net_cat[net_iter]);
+ if ((net_iter + 4) <= net_cat_len)
+ cat_low = get_unaligned_be16(&net_cat[net_iter + 2]);
+ else
+ cat_low = 0;
+
+ ret_val = netlbl_catmap_setrng(&secattr->attr.mls.cat,
+ cat_low,
+ cat_high,
+ GFP_ATOMIC);
+ if (ret_val != 0)
+ return ret_val;
+ }
+
+ return 0;
+}
+
+/*
+ * Protocol Handling Functions
+ */
+
+/**
+ * cipso_v4_gentag_hdr - Generate a CIPSO option header
+ * @doi_def: the DOI definition
+ * @len: the total tag length in bytes, not including this header
+ * @buf: the CIPSO option buffer
+ *
+ * Description:
+ * Write a CIPSO header into the beginning of @buffer.
+ *
+ */
+static void cipso_v4_gentag_hdr(const struct cipso_v4_doi *doi_def,
+ unsigned char *buf,
+ u32 len)
+{
+ buf[0] = IPOPT_CIPSO;
+ buf[1] = CIPSO_V4_HDR_LEN + len;
+ *(__be32 *)&buf[2] = htonl(doi_def->doi);
+}
+
+/**
+ * cipso_v4_gentag_rbm - Generate a CIPSO restricted bitmap tag (type #1)
+ * @doi_def: the DOI definition
+ * @secattr: the security attributes
+ * @buffer: the option buffer
+ * @buffer_len: length of buffer in bytes
+ *
+ * Description:
+ * Generate a CIPSO option using the restricted bitmap tag, tag type #1. The
+ * actual buffer length may be larger than the indicated size due to
+ * translation between host and network category bitmaps. Returns the size of
+ * the tag on success, negative values on failure.
+ *
+ */
+static int cipso_v4_gentag_rbm(const struct cipso_v4_doi *doi_def,
+ const struct netlbl_lsm_secattr *secattr,
+ unsigned char *buffer,
+ u32 buffer_len)
+{
+ int ret_val;
+ u32 tag_len;
+ u32 level;
+
+ if ((secattr->flags & NETLBL_SECATTR_MLS_LVL) == 0)
+ return -EPERM;
+
+ ret_val = cipso_v4_map_lvl_hton(doi_def,
+ secattr->attr.mls.lvl,
+ &level);
+ if (ret_val != 0)
+ return ret_val;
+
+ if (secattr->flags & NETLBL_SECATTR_MLS_CAT) {
+ ret_val = cipso_v4_map_cat_rbm_hton(doi_def,
+ secattr,
+ &buffer[4],
+ buffer_len - 4);
+ if (ret_val < 0)
+ return ret_val;
+
+ /* This will send packets using the "optimized" format when
+ * possible as specified in section 3.4.2.6 of the
+ * CIPSO draft. */
+ if (cipso_v4_rbm_optfmt && ret_val > 0 && ret_val <= 10)
+ tag_len = 14;
+ else
+ tag_len = 4 + ret_val;
+ } else
+ tag_len = 4;
+
+ buffer[0] = CIPSO_V4_TAG_RBITMAP;
+ buffer[1] = tag_len;
+ buffer[3] = level;
+
+ return tag_len;
+}
+
+/**
+ * cipso_v4_parsetag_rbm - Parse a CIPSO restricted bitmap tag
+ * @doi_def: the DOI definition
+ * @tag: the CIPSO tag
+ * @secattr: the security attributes
+ *
+ * Description:
+ * Parse a CIPSO restricted bitmap tag (tag type #1) and return the security
+ * attributes in @secattr. Return zero on success, negatives values on
+ * failure.
+ *
+ */
+static int cipso_v4_parsetag_rbm(const struct cipso_v4_doi *doi_def,
+ const unsigned char *tag,
+ struct netlbl_lsm_secattr *secattr)
+{
+ int ret_val;
+ u8 tag_len = tag[1];
+ u32 level;
+
+ ret_val = cipso_v4_map_lvl_ntoh(doi_def, tag[3], &level);
+ if (ret_val != 0)
+ return ret_val;
+ secattr->attr.mls.lvl = level;
+ secattr->flags |= NETLBL_SECATTR_MLS_LVL;
+
+ if (tag_len > 4) {
+ ret_val = cipso_v4_map_cat_rbm_ntoh(doi_def,
+ &tag[4],
+ tag_len - 4,
+ secattr);
+ if (ret_val != 0) {
+ netlbl_catmap_free(secattr->attr.mls.cat);
+ return ret_val;
+ }
+
+ secattr->flags |= NETLBL_SECATTR_MLS_CAT;
+ }
+
+ return 0;
+}
+
+/**
+ * cipso_v4_gentag_enum - Generate a CIPSO enumerated tag (type #2)
+ * @doi_def: the DOI definition
+ * @secattr: the security attributes
+ * @buffer: the option buffer
+ * @buffer_len: length of buffer in bytes
+ *
+ * Description:
+ * Generate a CIPSO option using the enumerated tag, tag type #2. Returns the
+ * size of the tag on success, negative values on failure.
+ *
+ */
+static int cipso_v4_gentag_enum(const struct cipso_v4_doi *doi_def,
+ const struct netlbl_lsm_secattr *secattr,
+ unsigned char *buffer,
+ u32 buffer_len)
+{
+ int ret_val;
+ u32 tag_len;
+ u32 level;
+
+ if (!(secattr->flags & NETLBL_SECATTR_MLS_LVL))
+ return -EPERM;
+
+ ret_val = cipso_v4_map_lvl_hton(doi_def,
+ secattr->attr.mls.lvl,
+ &level);
+ if (ret_val != 0)
+ return ret_val;
+
+ if (secattr->flags & NETLBL_SECATTR_MLS_CAT) {
+ ret_val = cipso_v4_map_cat_enum_hton(doi_def,
+ secattr,
+ &buffer[4],
+ buffer_len - 4);
+ if (ret_val < 0)
+ return ret_val;
+
+ tag_len = 4 + ret_val;
+ } else
+ tag_len = 4;
+
+ buffer[0] = CIPSO_V4_TAG_ENUM;
+ buffer[1] = tag_len;
+ buffer[3] = level;
+
+ return tag_len;
+}
+
+/**
+ * cipso_v4_parsetag_enum - Parse a CIPSO enumerated tag
+ * @doi_def: the DOI definition
+ * @tag: the CIPSO tag
+ * @secattr: the security attributes
+ *
+ * Description:
+ * Parse a CIPSO enumerated tag (tag type #2) and return the security
+ * attributes in @secattr. Return zero on success, negatives values on
+ * failure.
+ *
+ */
+static int cipso_v4_parsetag_enum(const struct cipso_v4_doi *doi_def,
+ const unsigned char *tag,
+ struct netlbl_lsm_secattr *secattr)
+{
+ int ret_val;
+ u8 tag_len = tag[1];
+ u32 level;
+
+ ret_val = cipso_v4_map_lvl_ntoh(doi_def, tag[3], &level);
+ if (ret_val != 0)
+ return ret_val;
+ secattr->attr.mls.lvl = level;
+ secattr->flags |= NETLBL_SECATTR_MLS_LVL;
+
+ if (tag_len > 4) {
+ ret_val = cipso_v4_map_cat_enum_ntoh(doi_def,
+ &tag[4],
+ tag_len - 4,
+ secattr);
+ if (ret_val != 0) {
+ netlbl_catmap_free(secattr->attr.mls.cat);
+ return ret_val;
+ }
+
+ secattr->flags |= NETLBL_SECATTR_MLS_CAT;
+ }
+
+ return 0;
+}
+
+/**
+ * cipso_v4_gentag_rng - Generate a CIPSO ranged tag (type #5)
+ * @doi_def: the DOI definition
+ * @secattr: the security attributes
+ * @buffer: the option buffer
+ * @buffer_len: length of buffer in bytes
+ *
+ * Description:
+ * Generate a CIPSO option using the ranged tag, tag type #5. Returns the
+ * size of the tag on success, negative values on failure.
+ *
+ */
+static int cipso_v4_gentag_rng(const struct cipso_v4_doi *doi_def,
+ const struct netlbl_lsm_secattr *secattr,
+ unsigned char *buffer,
+ u32 buffer_len)
+{
+ int ret_val;
+ u32 tag_len;
+ u32 level;
+
+ if (!(secattr->flags & NETLBL_SECATTR_MLS_LVL))
+ return -EPERM;
+
+ ret_val = cipso_v4_map_lvl_hton(doi_def,
+ secattr->attr.mls.lvl,
+ &level);
+ if (ret_val != 0)
+ return ret_val;
+
+ if (secattr->flags & NETLBL_SECATTR_MLS_CAT) {
+ ret_val = cipso_v4_map_cat_rng_hton(doi_def,
+ secattr,
+ &buffer[4],
+ buffer_len - 4);
+ if (ret_val < 0)
+ return ret_val;
+
+ tag_len = 4 + ret_val;
+ } else
+ tag_len = 4;
+
+ buffer[0] = CIPSO_V4_TAG_RANGE;
+ buffer[1] = tag_len;
+ buffer[3] = level;
+
+ return tag_len;
+}
+
+/**
+ * cipso_v4_parsetag_rng - Parse a CIPSO ranged tag
+ * @doi_def: the DOI definition
+ * @tag: the CIPSO tag
+ * @secattr: the security attributes
+ *
+ * Description:
+ * Parse a CIPSO ranged tag (tag type #5) and return the security attributes
+ * in @secattr. Return zero on success, negatives values on failure.
+ *
+ */
+static int cipso_v4_parsetag_rng(const struct cipso_v4_doi *doi_def,
+ const unsigned char *tag,
+ struct netlbl_lsm_secattr *secattr)
+{
+ int ret_val;
+ u8 tag_len = tag[1];
+ u32 level;
+
+ ret_val = cipso_v4_map_lvl_ntoh(doi_def, tag[3], &level);
+ if (ret_val != 0)
+ return ret_val;
+ secattr->attr.mls.lvl = level;
+ secattr->flags |= NETLBL_SECATTR_MLS_LVL;
+
+ if (tag_len > 4) {
+ ret_val = cipso_v4_map_cat_rng_ntoh(doi_def,
+ &tag[4],
+ tag_len - 4,
+ secattr);
+ if (ret_val != 0) {
+ netlbl_catmap_free(secattr->attr.mls.cat);
+ return ret_val;
+ }
+
+ secattr->flags |= NETLBL_SECATTR_MLS_CAT;
+ }
+
+ return 0;
+}
+
+/**
+ * cipso_v4_gentag_loc - Generate a CIPSO local tag (non-standard)
+ * @doi_def: the DOI definition
+ * @secattr: the security attributes
+ * @buffer: the option buffer
+ * @buffer_len: length of buffer in bytes
+ *
+ * Description:
+ * Generate a CIPSO option using the local tag. Returns the size of the tag
+ * on success, negative values on failure.
+ *
+ */
+static int cipso_v4_gentag_loc(const struct cipso_v4_doi *doi_def,
+ const struct netlbl_lsm_secattr *secattr,
+ unsigned char *buffer,
+ u32 buffer_len)
+{
+ if (!(secattr->flags & NETLBL_SECATTR_SECID))
+ return -EPERM;
+
+ buffer[0] = CIPSO_V4_TAG_LOCAL;
+ buffer[1] = CIPSO_V4_TAG_LOC_BLEN;
+ *(u32 *)&buffer[2] = secattr->attr.secid;
+
+ return CIPSO_V4_TAG_LOC_BLEN;
+}
+
+/**
+ * cipso_v4_parsetag_loc - Parse a CIPSO local tag
+ * @doi_def: the DOI definition
+ * @tag: the CIPSO tag
+ * @secattr: the security attributes
+ *
+ * Description:
+ * Parse a CIPSO local tag and return the security attributes in @secattr.
+ * Return zero on success, negatives values on failure.
+ *
+ */
+static int cipso_v4_parsetag_loc(const struct cipso_v4_doi *doi_def,
+ const unsigned char *tag,
+ struct netlbl_lsm_secattr *secattr)
+{
+ secattr->attr.secid = *(u32 *)&tag[2];
+ secattr->flags |= NETLBL_SECATTR_SECID;
+
+ return 0;
+}
+
+/**
+ * cipso_v4_optptr - Find the CIPSO option in the packet
+ * @skb: the packet
+ *
+ * Description:
+ * Parse the packet's IP header looking for a CIPSO option. Returns a pointer
+ * to the start of the CIPSO option on success, NULL if one if not found.
+ *
+ */
+unsigned char *cipso_v4_optptr(const struct sk_buff *skb)
+{
+ const struct iphdr *iph = ip_hdr(skb);
+ unsigned char *optptr = (unsigned char *)&(ip_hdr(skb)[1]);
+ int optlen;
+ int taglen;
+
+ for (optlen = iph->ihl*4 - sizeof(struct iphdr); optlen > 0; ) {
+ if (optptr[0] == IPOPT_CIPSO)
+ return optptr;
+ taglen = optptr[1];
+ optlen -= taglen;
+ optptr += taglen;
+ }
+
+ return NULL;
+}
+
+/**
+ * cipso_v4_validate - Validate a CIPSO option
+ * @option: the start of the option, on error it is set to point to the error
+ *
+ * Description:
+ * This routine is called to validate a CIPSO option, it checks all of the
+ * fields to ensure that they are at least valid, see the draft snippet below
+ * for details. If the option is valid then a zero value is returned and
+ * the value of @option is unchanged. If the option is invalid then a
+ * non-zero value is returned and @option is adjusted to point to the
+ * offending portion of the option. From the IETF draft ...
+ *
+ * "If any field within the CIPSO options, such as the DOI identifier, is not
+ * recognized the IP datagram is discarded and an ICMP 'parameter problem'
+ * (type 12) is generated and returned. The ICMP code field is set to 'bad
+ * parameter' (code 0) and the pointer is set to the start of the CIPSO field
+ * that is unrecognized."
+ *
+ */
+int cipso_v4_validate(const struct sk_buff *skb, unsigned char **option)
+{
+ unsigned char *opt = *option;
+ unsigned char *tag;
+ unsigned char opt_iter;
+ unsigned char err_offset = 0;
+ u8 opt_len;
+ u8 tag_len;
+ struct cipso_v4_doi *doi_def = NULL;
+ u32 tag_iter;
+
+ /* caller already checks for length values that are too large */
+ opt_len = opt[1];
+ if (opt_len < 8) {
+ err_offset = 1;
+ goto validate_return;
+ }
+
+ rcu_read_lock();
+ doi_def = cipso_v4_doi_search(get_unaligned_be32(&opt[2]));
+ if (!doi_def) {
+ err_offset = 2;
+ goto validate_return_locked;
+ }
+
+ opt_iter = CIPSO_V4_HDR_LEN;
+ tag = opt + opt_iter;
+ while (opt_iter < opt_len) {
+ for (tag_iter = 0; doi_def->tags[tag_iter] != tag[0];)
+ if (doi_def->tags[tag_iter] == CIPSO_V4_TAG_INVALID ||
+ ++tag_iter == CIPSO_V4_TAG_MAXCNT) {
+ err_offset = opt_iter;
+ goto validate_return_locked;
+ }
+
+ tag_len = tag[1];
+ if (tag_len > (opt_len - opt_iter)) {
+ err_offset = opt_iter + 1;
+ goto validate_return_locked;
+ }
+
+ switch (tag[0]) {
+ case CIPSO_V4_TAG_RBITMAP:
+ if (tag_len < CIPSO_V4_TAG_RBM_BLEN) {
+ err_offset = opt_iter + 1;
+ goto validate_return_locked;
+ }
+
+ /* We are already going to do all the verification
+ * necessary at the socket layer so from our point of
+ * view it is safe to turn these checks off (and less
+ * work), however, the CIPSO draft says we should do
+ * all the CIPSO validations here but it doesn't
+ * really specify _exactly_ what we need to validate
+ * ... so, just make it a sysctl tunable. */
+ if (cipso_v4_rbm_strictvalid) {
+ if (cipso_v4_map_lvl_valid(doi_def,
+ tag[3]) < 0) {
+ err_offset = opt_iter + 3;
+ goto validate_return_locked;
+ }
+ if (tag_len > CIPSO_V4_TAG_RBM_BLEN &&
+ cipso_v4_map_cat_rbm_valid(doi_def,
+ &tag[4],
+ tag_len - 4) < 0) {
+ err_offset = opt_iter + 4;
+ goto validate_return_locked;
+ }
+ }
+ break;
+ case CIPSO_V4_TAG_ENUM:
+ if (tag_len < CIPSO_V4_TAG_ENUM_BLEN) {
+ err_offset = opt_iter + 1;
+ goto validate_return_locked;
+ }
+
+ if (cipso_v4_map_lvl_valid(doi_def,
+ tag[3]) < 0) {
+ err_offset = opt_iter + 3;
+ goto validate_return_locked;
+ }
+ if (tag_len > CIPSO_V4_TAG_ENUM_BLEN &&
+ cipso_v4_map_cat_enum_valid(doi_def,
+ &tag[4],
+ tag_len - 4) < 0) {
+ err_offset = opt_iter + 4;
+ goto validate_return_locked;
+ }
+ break;
+ case CIPSO_V4_TAG_RANGE:
+ if (tag_len < CIPSO_V4_TAG_RNG_BLEN) {
+ err_offset = opt_iter + 1;
+ goto validate_return_locked;
+ }
+
+ if (cipso_v4_map_lvl_valid(doi_def,
+ tag[3]) < 0) {
+ err_offset = opt_iter + 3;
+ goto validate_return_locked;
+ }
+ if (tag_len > CIPSO_V4_TAG_RNG_BLEN &&
+ cipso_v4_map_cat_rng_valid(doi_def,
+ &tag[4],
+ tag_len - 4) < 0) {
+ err_offset = opt_iter + 4;
+ goto validate_return_locked;
+ }
+ break;
+ case CIPSO_V4_TAG_LOCAL:
+ /* This is a non-standard tag that we only allow for
+ * local connections, so if the incoming interface is
+ * not the loopback device drop the packet. Further,
+ * there is no legitimate reason for setting this from
+ * userspace so reject it if skb is NULL. */
+ if (!skb || !(skb->dev->flags & IFF_LOOPBACK)) {
+ err_offset = opt_iter;
+ goto validate_return_locked;
+ }
+ if (tag_len != CIPSO_V4_TAG_LOC_BLEN) {
+ err_offset = opt_iter + 1;
+ goto validate_return_locked;
+ }
+ break;
+ default:
+ err_offset = opt_iter;
+ goto validate_return_locked;
+ }
+
+ tag += tag_len;
+ opt_iter += tag_len;
+ }
+
+validate_return_locked:
+ rcu_read_unlock();
+validate_return:
+ *option = opt + err_offset;
+ return err_offset;
+}
+
+/**
+ * cipso_v4_error - Send the correct response for a bad packet
+ * @skb: the packet
+ * @error: the error code
+ * @gateway: CIPSO gateway flag
+ *
+ * Description:
+ * Based on the error code given in @error, send an ICMP error message back to
+ * the originating host. From the IETF draft ...
+ *
+ * "If the contents of the CIPSO [option] are valid but the security label is
+ * outside of the configured host or port label range, the datagram is
+ * discarded and an ICMP 'destination unreachable' (type 3) is generated and
+ * returned. The code field of the ICMP is set to 'communication with
+ * destination network administratively prohibited' (code 9) or to
+ * 'communication with destination host administratively prohibited'
+ * (code 10). The value of the code is dependent on whether the originator
+ * of the ICMP message is acting as a CIPSO host or a CIPSO gateway. The
+ * recipient of the ICMP message MUST be able to handle either value. The
+ * same procedure is performed if a CIPSO [option] can not be added to an
+ * IP packet because it is too large to fit in the IP options area."
+ *
+ * "If the error is triggered by receipt of an ICMP message, the message is
+ * discarded and no response is permitted (consistent with general ICMP
+ * processing rules)."
+ *
+ */
+void cipso_v4_error(struct sk_buff *skb, int error, u32 gateway)
+{
+ if (ip_hdr(skb)->protocol == IPPROTO_ICMP || error != -EACCES)
+ return;
+
+ if (gateway)
+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_NET_ANO, 0);
+ else
+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_ANO, 0);
+}
+
+/**
+ * cipso_v4_genopt - Generate a CIPSO option
+ * @buf: the option buffer
+ * @buf_len: the size of opt_buf
+ * @doi_def: the CIPSO DOI to use
+ * @secattr: the security attributes
+ *
+ * Description:
+ * Generate a CIPSO option using the DOI definition and security attributes
+ * passed to the function. Returns the length of the option on success and
+ * negative values on failure.
+ *
+ */
+static int cipso_v4_genopt(unsigned char *buf, u32 buf_len,
+ const struct cipso_v4_doi *doi_def,
+ const struct netlbl_lsm_secattr *secattr)
+{
+ int ret_val;
+ u32 iter;
+
+ if (buf_len <= CIPSO_V4_HDR_LEN)
+ return -ENOSPC;
+
+ /* XXX - This code assumes only one tag per CIPSO option which isn't
+ * really a good assumption to make but since we only support the MAC
+ * tags right now it is a safe assumption. */
+ iter = 0;
+ do {
+ memset(buf, 0, buf_len);
+ switch (doi_def->tags[iter]) {
+ case CIPSO_V4_TAG_RBITMAP:
+ ret_val = cipso_v4_gentag_rbm(doi_def,
+ secattr,
+ &buf[CIPSO_V4_HDR_LEN],
+ buf_len - CIPSO_V4_HDR_LEN);
+ break;
+ case CIPSO_V4_TAG_ENUM:
+ ret_val = cipso_v4_gentag_enum(doi_def,
+ secattr,
+ &buf[CIPSO_V4_HDR_LEN],
+ buf_len - CIPSO_V4_HDR_LEN);
+ break;
+ case CIPSO_V4_TAG_RANGE:
+ ret_val = cipso_v4_gentag_rng(doi_def,
+ secattr,
+ &buf[CIPSO_V4_HDR_LEN],
+ buf_len - CIPSO_V4_HDR_LEN);
+ break;
+ case CIPSO_V4_TAG_LOCAL:
+ ret_val = cipso_v4_gentag_loc(doi_def,
+ secattr,
+ &buf[CIPSO_V4_HDR_LEN],
+ buf_len - CIPSO_V4_HDR_LEN);
+ break;
+ default:
+ return -EPERM;
+ }
+
+ iter++;
+ } while (ret_val < 0 &&
+ iter < CIPSO_V4_TAG_MAXCNT &&
+ doi_def->tags[iter] != CIPSO_V4_TAG_INVALID);
+ if (ret_val < 0)
+ return ret_val;
+ cipso_v4_gentag_hdr(doi_def, buf, ret_val);
+ return CIPSO_V4_HDR_LEN + ret_val;
+}
+
+/**
+ * cipso_v4_sock_setattr - Add a CIPSO option to a socket
+ * @sk: the socket
+ * @doi_def: the CIPSO DOI to use
+ * @secattr: the specific security attributes of the socket
+ *
+ * Description:
+ * Set the CIPSO option on the given socket using the DOI definition and
+ * security attributes passed to the function. This function requires
+ * exclusive access to @sk, which means it either needs to be in the
+ * process of being created or locked. Returns zero on success and negative
+ * values on failure.
+ *
+ */
+int cipso_v4_sock_setattr(struct sock *sk,
+ const struct cipso_v4_doi *doi_def,
+ const struct netlbl_lsm_secattr *secattr)
+{
+ int ret_val = -EPERM;
+ unsigned char *buf = NULL;
+ u32 buf_len;
+ u32 opt_len;
+ struct ip_options_rcu *old, *opt = NULL;
+ struct inet_sock *sk_inet;
+ struct inet_connection_sock *sk_conn;
+
+ /* In the case of sock_create_lite(), the sock->sk field is not
+ * defined yet but it is not a problem as the only users of these
+ * "lite" PF_INET sockets are functions which do an accept() call
+ * afterwards so we will label the socket as part of the accept(). */
+ if (!sk)
+ return 0;
+
+ /* We allocate the maximum CIPSO option size here so we are probably
+ * being a little wasteful, but it makes our life _much_ easier later
+ * on and after all we are only talking about 40 bytes. */
+ buf_len = CIPSO_V4_OPT_LEN_MAX;
+ buf = kmalloc(buf_len, GFP_ATOMIC);
+ if (!buf) {
+ ret_val = -ENOMEM;
+ goto socket_setattr_failure;
+ }
+
+ ret_val = cipso_v4_genopt(buf, buf_len, doi_def, secattr);
+ if (ret_val < 0)
+ goto socket_setattr_failure;
+ buf_len = ret_val;
+
+ /* We can't use ip_options_get() directly because it makes a call to
+ * ip_options_get_alloc() which allocates memory with GFP_KERNEL and
+ * we won't always have CAP_NET_RAW even though we _always_ want to
+ * set the IPOPT_CIPSO option. */
+ opt_len = (buf_len + 3) & ~3;
+ opt = kzalloc(sizeof(*opt) + opt_len, GFP_ATOMIC);
+ if (!opt) {
+ ret_val = -ENOMEM;
+ goto socket_setattr_failure;
+ }
+ memcpy(opt->opt.__data, buf, buf_len);
+ opt->opt.optlen = opt_len;
+ opt->opt.cipso = sizeof(struct iphdr);
+ kfree(buf);
+ buf = NULL;
+
+ sk_inet = inet_sk(sk);
+
+ old = rcu_dereference_protected(sk_inet->inet_opt, sock_owned_by_user(sk));
+ if (sk_inet->is_icsk) {
+ sk_conn = inet_csk(sk);
+ if (old)
+ sk_conn->icsk_ext_hdr_len -= old->opt.optlen;
+ sk_conn->icsk_ext_hdr_len += opt->opt.optlen;
+ sk_conn->icsk_sync_mss(sk, sk_conn->icsk_pmtu_cookie);
+ }
+ rcu_assign_pointer(sk_inet->inet_opt, opt);
+ if (old)
+ kfree_rcu(old, rcu);
+
+ return 0;
+
+socket_setattr_failure:
+ kfree(buf);
+ kfree(opt);
+ return ret_val;
+}
+
+/**
+ * cipso_v4_req_setattr - Add a CIPSO option to a connection request socket
+ * @req: the connection request socket
+ * @doi_def: the CIPSO DOI to use
+ * @secattr: the specific security attributes of the socket
+ *
+ * Description:
+ * Set the CIPSO option on the given socket using the DOI definition and
+ * security attributes passed to the function. Returns zero on success and
+ * negative values on failure.
+ *
+ */
+int cipso_v4_req_setattr(struct request_sock *req,
+ const struct cipso_v4_doi *doi_def,
+ const struct netlbl_lsm_secattr *secattr)
+{
+ int ret_val = -EPERM;
+ unsigned char *buf = NULL;
+ u32 buf_len;
+ u32 opt_len;
+ struct ip_options_rcu *opt = NULL;
+ struct inet_request_sock *req_inet;
+
+ /* We allocate the maximum CIPSO option size here so we are probably
+ * being a little wasteful, but it makes our life _much_ easier later
+ * on and after all we are only talking about 40 bytes. */
+ buf_len = CIPSO_V4_OPT_LEN_MAX;
+ buf = kmalloc(buf_len, GFP_ATOMIC);
+ if (!buf) {
+ ret_val = -ENOMEM;
+ goto req_setattr_failure;
+ }
+
+ ret_val = cipso_v4_genopt(buf, buf_len, doi_def, secattr);
+ if (ret_val < 0)
+ goto req_setattr_failure;
+ buf_len = ret_val;
+
+ /* We can't use ip_options_get() directly because it makes a call to
+ * ip_options_get_alloc() which allocates memory with GFP_KERNEL and
+ * we won't always have CAP_NET_RAW even though we _always_ want to
+ * set the IPOPT_CIPSO option. */
+ opt_len = (buf_len + 3) & ~3;
+ opt = kzalloc(sizeof(*opt) + opt_len, GFP_ATOMIC);
+ if (!opt) {
+ ret_val = -ENOMEM;
+ goto req_setattr_failure;
+ }
+ memcpy(opt->opt.__data, buf, buf_len);
+ opt->opt.optlen = opt_len;
+ opt->opt.cipso = sizeof(struct iphdr);
+ kfree(buf);
+ buf = NULL;
+
+ req_inet = inet_rsk(req);
+ opt = xchg(&req_inet->opt, opt);
+ if (opt)
+ kfree_rcu(opt, rcu);
+
+ return 0;
+
+req_setattr_failure:
+ kfree(buf);
+ kfree(opt);
+ return ret_val;
+}
+
+/**
+ * cipso_v4_delopt - Delete the CIPSO option from a set of IP options
+ * @opt_ptr: IP option pointer
+ *
+ * Description:
+ * Deletes the CIPSO IP option from a set of IP options and makes the necessary
+ * adjustments to the IP option structure. Returns zero on success, negative
+ * values on failure.
+ *
+ */
+static int cipso_v4_delopt(struct ip_options_rcu **opt_ptr)
+{
+ int hdr_delta = 0;
+ struct ip_options_rcu *opt = *opt_ptr;
+
+ if (opt->opt.srr || opt->opt.rr || opt->opt.ts || opt->opt.router_alert) {
+ u8 cipso_len;
+ u8 cipso_off;
+ unsigned char *cipso_ptr;
+ int iter;
+ int optlen_new;
+
+ cipso_off = opt->opt.cipso - sizeof(struct iphdr);
+ cipso_ptr = &opt->opt.__data[cipso_off];
+ cipso_len = cipso_ptr[1];
+
+ if (opt->opt.srr > opt->opt.cipso)
+ opt->opt.srr -= cipso_len;
+ if (opt->opt.rr > opt->opt.cipso)
+ opt->opt.rr -= cipso_len;
+ if (opt->opt.ts > opt->opt.cipso)
+ opt->opt.ts -= cipso_len;
+ if (opt->opt.router_alert > opt->opt.cipso)
+ opt->opt.router_alert -= cipso_len;
+ opt->opt.cipso = 0;
+
+ memmove(cipso_ptr, cipso_ptr + cipso_len,
+ opt->opt.optlen - cipso_off - cipso_len);
+
+ /* determining the new total option length is tricky because of
+ * the padding necessary, the only thing i can think to do at
+ * this point is walk the options one-by-one, skipping the
+ * padding at the end to determine the actual option size and
+ * from there we can determine the new total option length */
+ iter = 0;
+ optlen_new = 0;
+ while (iter < opt->opt.optlen)
+ if (opt->opt.__data[iter] != IPOPT_NOP) {
+ iter += opt->opt.__data[iter + 1];
+ optlen_new = iter;
+ } else
+ iter++;
+ hdr_delta = opt->opt.optlen;
+ opt->opt.optlen = (optlen_new + 3) & ~3;
+ hdr_delta -= opt->opt.optlen;
+ } else {
+ /* only the cipso option was present on the socket so we can
+ * remove the entire option struct */
+ *opt_ptr = NULL;
+ hdr_delta = opt->opt.optlen;
+ kfree_rcu(opt, rcu);
+ }
+
+ return hdr_delta;
+}
+
+/**
+ * cipso_v4_sock_delattr - Delete the CIPSO option from a socket
+ * @sk: the socket
+ *
+ * Description:
+ * Removes the CIPSO option from a socket, if present.
+ *
+ */
+void cipso_v4_sock_delattr(struct sock *sk)
+{
+ int hdr_delta;
+ struct ip_options_rcu *opt;
+ struct inet_sock *sk_inet;
+
+ sk_inet = inet_sk(sk);
+ opt = rcu_dereference_protected(sk_inet->inet_opt, 1);
+ if (!opt || opt->opt.cipso == 0)
+ return;
+
+ hdr_delta = cipso_v4_delopt(&sk_inet->inet_opt);
+ if (sk_inet->is_icsk && hdr_delta > 0) {
+ struct inet_connection_sock *sk_conn = inet_csk(sk);
+ sk_conn->icsk_ext_hdr_len -= hdr_delta;
+ sk_conn->icsk_sync_mss(sk, sk_conn->icsk_pmtu_cookie);
+ }
+}
+
+/**
+ * cipso_v4_req_delattr - Delete the CIPSO option from a request socket
+ * @reg: the request socket
+ *
+ * Description:
+ * Removes the CIPSO option from a request socket, if present.
+ *
+ */
+void cipso_v4_req_delattr(struct request_sock *req)
+{
+ struct ip_options_rcu *opt;
+ struct inet_request_sock *req_inet;
+
+ req_inet = inet_rsk(req);
+ opt = req_inet->opt;
+ if (!opt || opt->opt.cipso == 0)
+ return;
+
+ cipso_v4_delopt(&req_inet->opt);
+}
+
+/**
+ * cipso_v4_getattr - Helper function for the cipso_v4_*_getattr functions
+ * @cipso: the CIPSO v4 option
+ * @secattr: the security attributes
+ *
+ * Description:
+ * Inspect @cipso and return the security attributes in @secattr. Returns zero
+ * on success and negative values on failure.
+ *
+ */
+int cipso_v4_getattr(const unsigned char *cipso,
+ struct netlbl_lsm_secattr *secattr)
+{
+ int ret_val = -ENOMSG;
+ u32 doi;
+ struct cipso_v4_doi *doi_def;
+
+ if (cipso_v4_cache_check(cipso, cipso[1], secattr) == 0)
+ return 0;
+
+ doi = get_unaligned_be32(&cipso[2]);
+ rcu_read_lock();
+ doi_def = cipso_v4_doi_search(doi);
+ if (!doi_def)
+ goto getattr_return;
+ /* XXX - This code assumes only one tag per CIPSO option which isn't
+ * really a good assumption to make but since we only support the MAC
+ * tags right now it is a safe assumption. */
+ switch (cipso[6]) {
+ case CIPSO_V4_TAG_RBITMAP:
+ ret_val = cipso_v4_parsetag_rbm(doi_def, &cipso[6], secattr);
+ break;
+ case CIPSO_V4_TAG_ENUM:
+ ret_val = cipso_v4_parsetag_enum(doi_def, &cipso[6], secattr);
+ break;
+ case CIPSO_V4_TAG_RANGE:
+ ret_val = cipso_v4_parsetag_rng(doi_def, &cipso[6], secattr);
+ break;
+ case CIPSO_V4_TAG_LOCAL:
+ ret_val = cipso_v4_parsetag_loc(doi_def, &cipso[6], secattr);
+ break;
+ }
+ if (ret_val == 0)
+ secattr->type = NETLBL_NLTYPE_CIPSOV4;
+
+getattr_return:
+ rcu_read_unlock();
+ return ret_val;
+}
+
+/**
+ * cipso_v4_sock_getattr - Get the security attributes from a sock
+ * @sk: the sock
+ * @secattr: the security attributes
+ *
+ * Description:
+ * Query @sk to see if there is a CIPSO option attached to the sock and if
+ * there is return the CIPSO security attributes in @secattr. This function
+ * requires that @sk be locked, or privately held, but it does not do any
+ * locking itself. Returns zero on success and negative values on failure.
+ *
+ */
+int cipso_v4_sock_getattr(struct sock *sk, struct netlbl_lsm_secattr *secattr)
+{
+ struct ip_options_rcu *opt;
+ int res = -ENOMSG;
+
+ rcu_read_lock();
+ opt = rcu_dereference(inet_sk(sk)->inet_opt);
+ if (opt && opt->opt.cipso)
+ res = cipso_v4_getattr(opt->opt.__data +
+ opt->opt.cipso -
+ sizeof(struct iphdr),
+ secattr);
+ rcu_read_unlock();
+ return res;
+}
+
+/**
+ * cipso_v4_skbuff_setattr - Set the CIPSO option on a packet
+ * @skb: the packet
+ * @secattr: the security attributes
+ *
+ * Description:
+ * Set the CIPSO option on the given packet based on the security attributes.
+ * Returns a pointer to the IP header on success and NULL on failure.
+ *
+ */
+int cipso_v4_skbuff_setattr(struct sk_buff *skb,
+ const struct cipso_v4_doi *doi_def,
+ const struct netlbl_lsm_secattr *secattr)
+{
+ int ret_val;
+ struct iphdr *iph;
+ struct ip_options *opt = &IPCB(skb)->opt;
+ unsigned char buf[CIPSO_V4_OPT_LEN_MAX];
+ u32 buf_len = CIPSO_V4_OPT_LEN_MAX;
+ u32 opt_len;
+ int len_delta;
+
+ ret_val = cipso_v4_genopt(buf, buf_len, doi_def, secattr);
+ if (ret_val < 0)
+ return ret_val;
+ buf_len = ret_val;
+ opt_len = (buf_len + 3) & ~3;
+
+ /* we overwrite any existing options to ensure that we have enough
+ * room for the CIPSO option, the reason is that we _need_ to guarantee
+ * that the security label is applied to the packet - we do the same
+ * thing when using the socket options and it hasn't caused a problem,
+ * if we need to we can always revisit this choice later */
+
+ len_delta = opt_len - opt->optlen;
+ /* if we don't ensure enough headroom we could panic on the skb_push()
+ * call below so make sure we have enough, we are also "mangling" the
+ * packet so we should probably do a copy-on-write call anyway */
+ ret_val = skb_cow(skb, skb_headroom(skb) + len_delta);
+ if (ret_val < 0)
+ return ret_val;
+
+ if (len_delta > 0) {
+ /* we assume that the header + opt->optlen have already been
+ * "pushed" in ip_options_build() or similar */
+ iph = ip_hdr(skb);
+ skb_push(skb, len_delta);
+ memmove((char *)iph - len_delta, iph, iph->ihl << 2);
+ skb_reset_network_header(skb);
+ iph = ip_hdr(skb);
+ } else if (len_delta < 0) {
+ iph = ip_hdr(skb);
+ memset(iph + 1, IPOPT_NOP, opt->optlen);
+ } else
+ iph = ip_hdr(skb);
+
+ if (opt->optlen > 0)
+ memset(opt, 0, sizeof(*opt));
+ opt->optlen = opt_len;
+ opt->cipso = sizeof(struct iphdr);
+ opt->is_changed = 1;
+
+ /* we have to do the following because we are being called from a
+ * netfilter hook which means the packet already has had the header
+ * fields populated and the checksum calculated - yes this means we
+ * are doing more work than needed but we do it to keep the core
+ * stack clean and tidy */
+ memcpy(iph + 1, buf, buf_len);
+ if (opt_len > buf_len)
+ memset((char *)(iph + 1) + buf_len, 0, opt_len - buf_len);
+ if (len_delta != 0) {
+ iph->ihl = 5 + (opt_len >> 2);
+ iph->tot_len = htons(skb->len);
+ }
+ ip_send_check(iph);
+
+ return 0;
+}
+
+/**
+ * cipso_v4_skbuff_delattr - Delete any CIPSO options from a packet
+ * @skb: the packet
+ *
+ * Description:
+ * Removes any and all CIPSO options from the given packet. Returns zero on
+ * success, negative values on failure.
+ *
+ */
+int cipso_v4_skbuff_delattr(struct sk_buff *skb)
+{
+ int ret_val;
+ struct iphdr *iph;
+ struct ip_options *opt = &IPCB(skb)->opt;
+ unsigned char *cipso_ptr;
+
+ if (opt->cipso == 0)
+ return 0;
+
+ /* since we are changing the packet we should make a copy */
+ ret_val = skb_cow(skb, skb_headroom(skb));
+ if (ret_val < 0)
+ return ret_val;
+
+ /* the easiest thing to do is just replace the cipso option with noop
+ * options since we don't change the size of the packet, although we
+ * still need to recalculate the checksum */
+
+ iph = ip_hdr(skb);
+ cipso_ptr = (unsigned char *)iph + opt->cipso;
+ memset(cipso_ptr, IPOPT_NOOP, cipso_ptr[1]);
+ opt->cipso = 0;
+ opt->is_changed = 1;
+
+ ip_send_check(iph);
+
+ return 0;
+}
+
+/*
+ * Setup Functions
+ */
+
+/**
+ * cipso_v4_init - Initialize the CIPSO module
+ *
+ * Description:
+ * Initialize the CIPSO module and prepare it for use. Returns zero on success
+ * and negative values on failure.
+ *
+ */
+static int __init cipso_v4_init(void)
+{
+ int ret_val;
+
+ ret_val = cipso_v4_cache_init();
+ if (ret_val != 0)
+ panic("Failed to initialize the CIPSO/IPv4 cache (%d)\n",
+ ret_val);
+
+ return 0;
+}
+
+subsys_initcall(cipso_v4_init);
diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c
new file mode 100644
index 000000000..90c0e8386
--- /dev/null
+++ b/net/ipv4/datagram.c
@@ -0,0 +1,123 @@
+/*
+ * common UDP/RAW code
+ * Linux INET implementation
+ *
+ * Authors:
+ * Hideaki YOSHIFUJI <yoshfuji@linux-ipv6.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/types.h>
+#include <linux/module.h>
+#include <linux/ip.h>
+#include <linux/in.h>
+#include <net/ip.h>
+#include <net/sock.h>
+#include <net/route.h>
+#include <net/tcp_states.h>
+
+int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ struct sockaddr_in *usin = (struct sockaddr_in *) uaddr;
+ struct flowi4 *fl4;
+ struct rtable *rt;
+ __be32 saddr;
+ int oif;
+ int err;
+
+
+ if (addr_len < sizeof(*usin))
+ return -EINVAL;
+
+ if (usin->sin_family != AF_INET)
+ return -EAFNOSUPPORT;
+
+ sk_dst_reset(sk);
+
+ lock_sock(sk);
+
+ oif = sk->sk_bound_dev_if;
+ saddr = inet->inet_saddr;
+ if (ipv4_is_multicast(usin->sin_addr.s_addr)) {
+ if (!oif)
+ oif = inet->mc_index;
+ if (!saddr)
+ saddr = inet->mc_addr;
+ }
+ fl4 = &inet->cork.fl.u.ip4;
+ rt = ip_route_connect(fl4, usin->sin_addr.s_addr, saddr,
+ RT_CONN_FLAGS(sk), oif,
+ sk->sk_protocol,
+ inet->inet_sport, usin->sin_port, sk);
+ if (IS_ERR(rt)) {
+ err = PTR_ERR(rt);
+ if (err == -ENETUNREACH)
+ IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
+ goto out;
+ }
+
+ if ((rt->rt_flags & RTCF_BROADCAST) && !sock_flag(sk, SOCK_BROADCAST)) {
+ ip_rt_put(rt);
+ err = -EACCES;
+ goto out;
+ }
+ if (!inet->inet_saddr)
+ inet->inet_saddr = fl4->saddr; /* Update source address */
+ if (!inet->inet_rcv_saddr) {
+ inet->inet_rcv_saddr = fl4->saddr;
+ if (sk->sk_prot->rehash)
+ sk->sk_prot->rehash(sk);
+ }
+ inet->inet_daddr = fl4->daddr;
+ inet->inet_dport = usin->sin_port;
+ sk->sk_state = TCP_ESTABLISHED;
+ inet_set_txhash(sk);
+ inet->inet_id = jiffies;
+
+ sk_dst_set(sk, &rt->dst);
+ err = 0;
+out:
+ release_sock(sk);
+ return err;
+}
+EXPORT_SYMBOL(ip4_datagram_connect);
+
+/* Because UDP xmit path can manipulate sk_dst_cache without holding
+ * socket lock, we need to use sk_dst_set() here,
+ * even if we own the socket lock.
+ */
+void ip4_datagram_release_cb(struct sock *sk)
+{
+ const struct inet_sock *inet = inet_sk(sk);
+ const struct ip_options_rcu *inet_opt;
+ __be32 daddr = inet->inet_daddr;
+ struct dst_entry *dst;
+ struct flowi4 fl4;
+ struct rtable *rt;
+
+ rcu_read_lock();
+
+ dst = __sk_dst_get(sk);
+ if (!dst || !dst->obsolete || dst->ops->check(dst, 0)) {
+ rcu_read_unlock();
+ return;
+ }
+ inet_opt = rcu_dereference(inet->inet_opt);
+ if (inet_opt && inet_opt->opt.srr)
+ daddr = inet_opt->opt.faddr;
+ rt = ip_route_output_ports(sock_net(sk), &fl4, sk, daddr,
+ inet->inet_saddr, inet->inet_dport,
+ inet->inet_sport, sk->sk_protocol,
+ RT_CONN_FLAGS(sk), sk->sk_bound_dev_if);
+
+ dst = !IS_ERR(rt) ? &rt->dst : NULL;
+ sk_dst_set(sk, dst);
+
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(ip4_datagram_release_cb);
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
new file mode 100644
index 000000000..419d23c53
--- /dev/null
+++ b/net/ipv4/devinet.c
@@ -0,0 +1,2386 @@
+/*
+ * NET3 IP device support routines.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Derived from the IP parts of dev.c 1.0.19
+ * Authors: Ross Biro
+ * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ * Mark Evans, <evansmp@uhura.aston.ac.uk>
+ *
+ * Additional Authors:
+ * Alan Cox, <gw4pts@gw4pts.ampr.org>
+ * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ * Changes:
+ * Alexey Kuznetsov: pa_* fields are replaced with ifaddr
+ * lists.
+ * Cyrus Durgin: updated for kmod
+ * Matthias Andree: in devinet_ioctl, compare label and
+ * address (4.4BSD alias style support),
+ * fall back to comparing just the label
+ * if no match found.
+ */
+
+
+#include <asm/uaccess.h>
+#include <linux/bitops.h>
+#include <linux/capability.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/in.h>
+#include <linux/errno.h>
+#include <linux/interrupt.h>
+#include <linux/if_addr.h>
+#include <linux/if_ether.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/skbuff.h>
+#include <linux/init.h>
+#include <linux/notifier.h>
+#include <linux/inetdevice.h>
+#include <linux/igmp.h>
+#include <linux/slab.h>
+#include <linux/hash.h>
+#ifdef CONFIG_SYSCTL
+#include <linux/sysctl.h>
+#endif
+#include <linux/kmod.h>
+#include <linux/netconf.h>
+
+#include <net/arp.h>
+#include <net/ip.h>
+#include <net/route.h>
+#include <net/ip_fib.h>
+#include <net/rtnetlink.h>
+#include <net/net_namespace.h>
+#include <net/addrconf.h>
+
+#include "fib_lookup.h"
+
+static struct ipv4_devconf ipv4_devconf = {
+ .data = {
+ [IPV4_DEVCONF_ACCEPT_REDIRECTS - 1] = 1,
+ [IPV4_DEVCONF_SEND_REDIRECTS - 1] = 1,
+ [IPV4_DEVCONF_SECURE_REDIRECTS - 1] = 1,
+ [IPV4_DEVCONF_SHARED_MEDIA - 1] = 1,
+ [IPV4_DEVCONF_IGMPV2_UNSOLICITED_REPORT_INTERVAL - 1] = 10000 /*ms*/,
+ [IPV4_DEVCONF_IGMPV3_UNSOLICITED_REPORT_INTERVAL - 1] = 1000 /*ms*/,
+ },
+};
+
+static struct ipv4_devconf ipv4_devconf_dflt = {
+ .data = {
+ [IPV4_DEVCONF_ACCEPT_REDIRECTS - 1] = 1,
+ [IPV4_DEVCONF_SEND_REDIRECTS - 1] = 1,
+ [IPV4_DEVCONF_SECURE_REDIRECTS - 1] = 1,
+ [IPV4_DEVCONF_SHARED_MEDIA - 1] = 1,
+ [IPV4_DEVCONF_ACCEPT_SOURCE_ROUTE - 1] = 1,
+ [IPV4_DEVCONF_IGMPV2_UNSOLICITED_REPORT_INTERVAL - 1] = 10000 /*ms*/,
+ [IPV4_DEVCONF_IGMPV3_UNSOLICITED_REPORT_INTERVAL - 1] = 1000 /*ms*/,
+ },
+};
+
+#define IPV4_DEVCONF_DFLT(net, attr) \
+ IPV4_DEVCONF((*net->ipv4.devconf_dflt), attr)
+
+static const struct nla_policy ifa_ipv4_policy[IFA_MAX+1] = {
+ [IFA_LOCAL] = { .type = NLA_U32 },
+ [IFA_ADDRESS] = { .type = NLA_U32 },
+ [IFA_BROADCAST] = { .type = NLA_U32 },
+ [IFA_LABEL] = { .type = NLA_STRING, .len = IFNAMSIZ - 1 },
+ [IFA_CACHEINFO] = { .len = sizeof(struct ifa_cacheinfo) },
+ [IFA_FLAGS] = { .type = NLA_U32 },
+};
+
+#define IN4_ADDR_HSIZE_SHIFT 8
+#define IN4_ADDR_HSIZE (1U << IN4_ADDR_HSIZE_SHIFT)
+
+static struct hlist_head inet_addr_lst[IN4_ADDR_HSIZE];
+
+static u32 inet_addr_hash(const struct net *net, __be32 addr)
+{
+ u32 val = (__force u32) addr ^ net_hash_mix(net);
+
+ return hash_32(val, IN4_ADDR_HSIZE_SHIFT);
+}
+
+static void inet_hash_insert(struct net *net, struct in_ifaddr *ifa)
+{
+ u32 hash = inet_addr_hash(net, ifa->ifa_local);
+
+ ASSERT_RTNL();
+ hlist_add_head_rcu(&ifa->hash, &inet_addr_lst[hash]);
+}
+
+static void inet_hash_remove(struct in_ifaddr *ifa)
+{
+ ASSERT_RTNL();
+ hlist_del_init_rcu(&ifa->hash);
+}
+
+/**
+ * __ip_dev_find - find the first device with a given source address.
+ * @net: the net namespace
+ * @addr: the source address
+ * @devref: if true, take a reference on the found device
+ *
+ * If a caller uses devref=false, it should be protected by RCU, or RTNL
+ */
+struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref)
+{
+ u32 hash = inet_addr_hash(net, addr);
+ struct net_device *result = NULL;
+ struct in_ifaddr *ifa;
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(ifa, &inet_addr_lst[hash], hash) {
+ if (ifa->ifa_local == addr) {
+ struct net_device *dev = ifa->ifa_dev->dev;
+
+ if (!net_eq(dev_net(dev), net))
+ continue;
+ result = dev;
+ break;
+ }
+ }
+ if (!result) {
+ struct flowi4 fl4 = { .daddr = addr };
+ struct fib_result res = { 0 };
+ struct fib_table *local;
+
+ /* Fallback to FIB local table so that communication
+ * over loopback subnets work.
+ */
+ local = fib_get_table(net, RT_TABLE_LOCAL);
+ if (local &&
+ !fib_table_lookup(local, &fl4, &res, FIB_LOOKUP_NOREF) &&
+ res.type == RTN_LOCAL)
+ result = FIB_RES_DEV(res);
+ }
+ if (result && devref)
+ dev_hold(result);
+ rcu_read_unlock();
+ return result;
+}
+EXPORT_SYMBOL(__ip_dev_find);
+
+static void rtmsg_ifa(int event, struct in_ifaddr *, struct nlmsghdr *, u32);
+
+static BLOCKING_NOTIFIER_HEAD(inetaddr_chain);
+static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
+ int destroy);
+#ifdef CONFIG_SYSCTL
+static int devinet_sysctl_register(struct in_device *idev);
+static void devinet_sysctl_unregister(struct in_device *idev);
+#else
+static int devinet_sysctl_register(struct in_device *idev)
+{
+ return 0;
+}
+static void devinet_sysctl_unregister(struct in_device *idev)
+{
+}
+#endif
+
+/* Locks all the inet devices. */
+
+static struct in_ifaddr *inet_alloc_ifa(void)
+{
+ return kzalloc(sizeof(struct in_ifaddr), GFP_KERNEL);
+}
+
+static void inet_rcu_free_ifa(struct rcu_head *head)
+{
+ struct in_ifaddr *ifa = container_of(head, struct in_ifaddr, rcu_head);
+ if (ifa->ifa_dev)
+ in_dev_put(ifa->ifa_dev);
+ kfree(ifa);
+}
+
+static void inet_free_ifa(struct in_ifaddr *ifa)
+{
+ call_rcu(&ifa->rcu_head, inet_rcu_free_ifa);
+}
+
+void in_dev_finish_destroy(struct in_device *idev)
+{
+ struct net_device *dev = idev->dev;
+
+ WARN_ON(idev->ifa_list);
+ WARN_ON(idev->mc_list);
+ kfree(rcu_dereference_protected(idev->mc_hash, 1));
+#ifdef NET_REFCNT_DEBUG
+ pr_debug("%s: %p=%s\n", __func__, idev, dev ? dev->name : "NIL");
+#endif
+ dev_put(dev);
+ if (!idev->dead)
+ pr_err("Freeing alive in_device %p\n", idev);
+ else
+ kfree(idev);
+}
+EXPORT_SYMBOL(in_dev_finish_destroy);
+
+static struct in_device *inetdev_init(struct net_device *dev)
+{
+ struct in_device *in_dev;
+ int err = -ENOMEM;
+
+ ASSERT_RTNL();
+
+ in_dev = kzalloc(sizeof(*in_dev), GFP_KERNEL);
+ if (!in_dev)
+ goto out;
+ memcpy(&in_dev->cnf, dev_net(dev)->ipv4.devconf_dflt,
+ sizeof(in_dev->cnf));
+ in_dev->cnf.sysctl = NULL;
+ in_dev->dev = dev;
+ in_dev->arp_parms = neigh_parms_alloc(dev, &arp_tbl);
+ if (!in_dev->arp_parms)
+ goto out_kfree;
+ if (IPV4_DEVCONF(in_dev->cnf, FORWARDING))
+ dev_disable_lro(dev);
+ /* Reference in_dev->dev */
+ dev_hold(dev);
+ /* Account for reference dev->ip_ptr (below) */
+ in_dev_hold(in_dev);
+
+ err = devinet_sysctl_register(in_dev);
+ if (err) {
+ in_dev->dead = 1;
+ in_dev_put(in_dev);
+ in_dev = NULL;
+ goto out;
+ }
+ ip_mc_init_dev(in_dev);
+ if (dev->flags & IFF_UP)
+ ip_mc_up(in_dev);
+
+ /* we can receive as soon as ip_ptr is set -- do this last */
+ rcu_assign_pointer(dev->ip_ptr, in_dev);
+out:
+ return in_dev ?: ERR_PTR(err);
+out_kfree:
+ kfree(in_dev);
+ in_dev = NULL;
+ goto out;
+}
+
+static void in_dev_rcu_put(struct rcu_head *head)
+{
+ struct in_device *idev = container_of(head, struct in_device, rcu_head);
+ in_dev_put(idev);
+}
+
+static void inetdev_destroy(struct in_device *in_dev)
+{
+ struct in_ifaddr *ifa;
+ struct net_device *dev;
+
+ ASSERT_RTNL();
+
+ dev = in_dev->dev;
+
+ in_dev->dead = 1;
+
+ ip_mc_destroy_dev(in_dev);
+
+ while ((ifa = in_dev->ifa_list) != NULL) {
+ inet_del_ifa(in_dev, &in_dev->ifa_list, 0);
+ inet_free_ifa(ifa);
+ }
+
+ RCU_INIT_POINTER(dev->ip_ptr, NULL);
+
+ devinet_sysctl_unregister(in_dev);
+ neigh_parms_release(&arp_tbl, in_dev->arp_parms);
+ arp_ifdown(dev);
+
+ call_rcu(&in_dev->rcu_head, in_dev_rcu_put);
+}
+
+int inet_addr_onlink(struct in_device *in_dev, __be32 a, __be32 b)
+{
+ rcu_read_lock();
+ for_primary_ifa(in_dev) {
+ if (inet_ifa_match(a, ifa)) {
+ if (!b || inet_ifa_match(b, ifa)) {
+ rcu_read_unlock();
+ return 1;
+ }
+ }
+ } endfor_ifa(in_dev);
+ rcu_read_unlock();
+ return 0;
+}
+
+static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
+ int destroy, struct nlmsghdr *nlh, u32 portid)
+{
+ struct in_ifaddr *promote = NULL;
+ struct in_ifaddr *ifa, *ifa1 = *ifap;
+ struct in_ifaddr *last_prim = in_dev->ifa_list;
+ struct in_ifaddr *prev_prom = NULL;
+ int do_promote = IN_DEV_PROMOTE_SECONDARIES(in_dev);
+
+ ASSERT_RTNL();
+
+ /* 1. Deleting primary ifaddr forces deletion all secondaries
+ * unless alias promotion is set
+ **/
+
+ if (!(ifa1->ifa_flags & IFA_F_SECONDARY)) {
+ struct in_ifaddr **ifap1 = &ifa1->ifa_next;
+
+ while ((ifa = *ifap1) != NULL) {
+ if (!(ifa->ifa_flags & IFA_F_SECONDARY) &&
+ ifa1->ifa_scope <= ifa->ifa_scope)
+ last_prim = ifa;
+
+ if (!(ifa->ifa_flags & IFA_F_SECONDARY) ||
+ ifa1->ifa_mask != ifa->ifa_mask ||
+ !inet_ifa_match(ifa1->ifa_address, ifa)) {
+ ifap1 = &ifa->ifa_next;
+ prev_prom = ifa;
+ continue;
+ }
+
+ if (!do_promote) {
+ inet_hash_remove(ifa);
+ *ifap1 = ifa->ifa_next;
+
+ rtmsg_ifa(RTM_DELADDR, ifa, nlh, portid);
+ blocking_notifier_call_chain(&inetaddr_chain,
+ NETDEV_DOWN, ifa);
+ inet_free_ifa(ifa);
+ } else {
+ promote = ifa;
+ break;
+ }
+ }
+ }
+
+ /* On promotion all secondaries from subnet are changing
+ * the primary IP, we must remove all their routes silently
+ * and later to add them back with new prefsrc. Do this
+ * while all addresses are on the device list.
+ */
+ for (ifa = promote; ifa; ifa = ifa->ifa_next) {
+ if (ifa1->ifa_mask == ifa->ifa_mask &&
+ inet_ifa_match(ifa1->ifa_address, ifa))
+ fib_del_ifaddr(ifa, ifa1);
+ }
+
+ /* 2. Unlink it */
+
+ *ifap = ifa1->ifa_next;
+ inet_hash_remove(ifa1);
+
+ /* 3. Announce address deletion */
+
+ /* Send message first, then call notifier.
+ At first sight, FIB update triggered by notifier
+ will refer to already deleted ifaddr, that could confuse
+ netlink listeners. It is not true: look, gated sees
+ that route deleted and if it still thinks that ifaddr
+ is valid, it will try to restore deleted routes... Grr.
+ So that, this order is correct.
+ */
+ rtmsg_ifa(RTM_DELADDR, ifa1, nlh, portid);
+ blocking_notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa1);
+
+ if (promote) {
+ struct in_ifaddr *next_sec = promote->ifa_next;
+
+ if (prev_prom) {
+ prev_prom->ifa_next = promote->ifa_next;
+ promote->ifa_next = last_prim->ifa_next;
+ last_prim->ifa_next = promote;
+ }
+
+ promote->ifa_flags &= ~IFA_F_SECONDARY;
+ rtmsg_ifa(RTM_NEWADDR, promote, nlh, portid);
+ blocking_notifier_call_chain(&inetaddr_chain,
+ NETDEV_UP, promote);
+ for (ifa = next_sec; ifa; ifa = ifa->ifa_next) {
+ if (ifa1->ifa_mask != ifa->ifa_mask ||
+ !inet_ifa_match(ifa1->ifa_address, ifa))
+ continue;
+ fib_add_ifaddr(ifa);
+ }
+
+ }
+ if (destroy)
+ inet_free_ifa(ifa1);
+}
+
+static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
+ int destroy)
+{
+ __inet_del_ifa(in_dev, ifap, destroy, NULL, 0);
+}
+
+static void check_lifetime(struct work_struct *work);
+
+static DECLARE_DELAYED_WORK(check_lifetime_work, check_lifetime);
+
+static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh,
+ u32 portid)
+{
+ struct in_device *in_dev = ifa->ifa_dev;
+ struct in_ifaddr *ifa1, **ifap, **last_primary;
+
+ ASSERT_RTNL();
+
+ if (!ifa->ifa_local) {
+ inet_free_ifa(ifa);
+ return 0;
+ }
+
+ ifa->ifa_flags &= ~IFA_F_SECONDARY;
+ last_primary = &in_dev->ifa_list;
+
+ for (ifap = &in_dev->ifa_list; (ifa1 = *ifap) != NULL;
+ ifap = &ifa1->ifa_next) {
+ if (!(ifa1->ifa_flags & IFA_F_SECONDARY) &&
+ ifa->ifa_scope <= ifa1->ifa_scope)
+ last_primary = &ifa1->ifa_next;
+ if (ifa1->ifa_mask == ifa->ifa_mask &&
+ inet_ifa_match(ifa1->ifa_address, ifa)) {
+ if (ifa1->ifa_local == ifa->ifa_local) {
+ inet_free_ifa(ifa);
+ return -EEXIST;
+ }
+ if (ifa1->ifa_scope != ifa->ifa_scope) {
+ inet_free_ifa(ifa);
+ return -EINVAL;
+ }
+ ifa->ifa_flags |= IFA_F_SECONDARY;
+ }
+ }
+
+ if (!(ifa->ifa_flags & IFA_F_SECONDARY)) {
+ prandom_seed((__force u32) ifa->ifa_local);
+ ifap = last_primary;
+ }
+
+ ifa->ifa_next = *ifap;
+ *ifap = ifa;
+
+ inet_hash_insert(dev_net(in_dev->dev), ifa);
+
+ cancel_delayed_work(&check_lifetime_work);
+ queue_delayed_work(system_power_efficient_wq, &check_lifetime_work, 0);
+
+ /* Send message first, then call notifier.
+ Notifier will trigger FIB update, so that
+ listeners of netlink will know about new ifaddr */
+ rtmsg_ifa(RTM_NEWADDR, ifa, nlh, portid);
+ blocking_notifier_call_chain(&inetaddr_chain, NETDEV_UP, ifa);
+
+ return 0;
+}
+
+static int inet_insert_ifa(struct in_ifaddr *ifa)
+{
+ return __inet_insert_ifa(ifa, NULL, 0);
+}
+
+static int inet_set_ifa(struct net_device *dev, struct in_ifaddr *ifa)
+{
+ struct in_device *in_dev = __in_dev_get_rtnl(dev);
+
+ ASSERT_RTNL();
+
+ if (!in_dev) {
+ inet_free_ifa(ifa);
+ return -ENOBUFS;
+ }
+ ipv4_devconf_setall(in_dev);
+ neigh_parms_data_state_setall(in_dev->arp_parms);
+ if (ifa->ifa_dev != in_dev) {
+ WARN_ON(ifa->ifa_dev);
+ in_dev_hold(in_dev);
+ ifa->ifa_dev = in_dev;
+ }
+ if (ipv4_is_loopback(ifa->ifa_local))
+ ifa->ifa_scope = RT_SCOPE_HOST;
+ return inet_insert_ifa(ifa);
+}
+
+/* Caller must hold RCU or RTNL :
+ * We dont take a reference on found in_device
+ */
+struct in_device *inetdev_by_index(struct net *net, int ifindex)
+{
+ struct net_device *dev;
+ struct in_device *in_dev = NULL;
+
+ rcu_read_lock();
+ dev = dev_get_by_index_rcu(net, ifindex);
+ if (dev)
+ in_dev = rcu_dereference_rtnl(dev->ip_ptr);
+ rcu_read_unlock();
+ return in_dev;
+}
+EXPORT_SYMBOL(inetdev_by_index);
+
+/* Called only from RTNL semaphored context. No locks. */
+
+struct in_ifaddr *inet_ifa_byprefix(struct in_device *in_dev, __be32 prefix,
+ __be32 mask)
+{
+ ASSERT_RTNL();
+
+ for_primary_ifa(in_dev) {
+ if (ifa->ifa_mask == mask && inet_ifa_match(prefix, ifa))
+ return ifa;
+ } endfor_ifa(in_dev);
+ return NULL;
+}
+
+static int ip_mc_config(struct sock *sk, bool join, const struct in_ifaddr *ifa)
+{
+ struct ip_mreqn mreq = {
+ .imr_multiaddr.s_addr = ifa->ifa_address,
+ .imr_ifindex = ifa->ifa_dev->dev->ifindex,
+ };
+ int ret;
+
+ ASSERT_RTNL();
+
+ lock_sock(sk);
+ if (join)
+ ret = ip_mc_join_group(sk, &mreq);
+ else
+ ret = ip_mc_leave_group(sk, &mreq);
+ release_sock(sk);
+
+ return ret;
+}
+
+static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+ struct net *net = sock_net(skb->sk);
+ struct nlattr *tb[IFA_MAX+1];
+ struct in_device *in_dev;
+ struct ifaddrmsg *ifm;
+ struct in_ifaddr *ifa, **ifap;
+ int err = -EINVAL;
+
+ ASSERT_RTNL();
+
+ err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv4_policy);
+ if (err < 0)
+ goto errout;
+
+ ifm = nlmsg_data(nlh);
+ in_dev = inetdev_by_index(net, ifm->ifa_index);
+ if (!in_dev) {
+ err = -ENODEV;
+ goto errout;
+ }
+
+ for (ifap = &in_dev->ifa_list; (ifa = *ifap) != NULL;
+ ifap = &ifa->ifa_next) {
+ if (tb[IFA_LOCAL] &&
+ ifa->ifa_local != nla_get_in_addr(tb[IFA_LOCAL]))
+ continue;
+
+ if (tb[IFA_LABEL] && nla_strcmp(tb[IFA_LABEL], ifa->ifa_label))
+ continue;
+
+ if (tb[IFA_ADDRESS] &&
+ (ifm->ifa_prefixlen != ifa->ifa_prefixlen ||
+ !inet_ifa_match(nla_get_in_addr(tb[IFA_ADDRESS]), ifa)))
+ continue;
+
+ if (ipv4_is_multicast(ifa->ifa_address))
+ ip_mc_config(net->ipv4.mc_autojoin_sk, false, ifa);
+ __inet_del_ifa(in_dev, ifap, 1, nlh, NETLINK_CB(skb).portid);
+ return 0;
+ }
+
+ err = -EADDRNOTAVAIL;
+errout:
+ return err;
+}
+
+#define INFINITY_LIFE_TIME 0xFFFFFFFF
+
+static void check_lifetime(struct work_struct *work)
+{
+ unsigned long now, next, next_sec, next_sched;
+ struct in_ifaddr *ifa;
+ struct hlist_node *n;
+ int i;
+
+ now = jiffies;
+ next = round_jiffies_up(now + ADDR_CHECK_FREQUENCY);
+
+ for (i = 0; i < IN4_ADDR_HSIZE; i++) {
+ bool change_needed = false;
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(ifa, &inet_addr_lst[i], hash) {
+ unsigned long age;
+
+ if (ifa->ifa_flags & IFA_F_PERMANENT)
+ continue;
+
+ /* We try to batch several events at once. */
+ age = (now - ifa->ifa_tstamp +
+ ADDRCONF_TIMER_FUZZ_MINUS) / HZ;
+
+ if (ifa->ifa_valid_lft != INFINITY_LIFE_TIME &&
+ age >= ifa->ifa_valid_lft) {
+ change_needed = true;
+ } else if (ifa->ifa_preferred_lft ==
+ INFINITY_LIFE_TIME) {
+ continue;
+ } else if (age >= ifa->ifa_preferred_lft) {
+ if (time_before(ifa->ifa_tstamp +
+ ifa->ifa_valid_lft * HZ, next))
+ next = ifa->ifa_tstamp +
+ ifa->ifa_valid_lft * HZ;
+
+ if (!(ifa->ifa_flags & IFA_F_DEPRECATED))
+ change_needed = true;
+ } else if (time_before(ifa->ifa_tstamp +
+ ifa->ifa_preferred_lft * HZ,
+ next)) {
+ next = ifa->ifa_tstamp +
+ ifa->ifa_preferred_lft * HZ;
+ }
+ }
+ rcu_read_unlock();
+ if (!change_needed)
+ continue;
+ rtnl_lock();
+ hlist_for_each_entry_safe(ifa, n, &inet_addr_lst[i], hash) {
+ unsigned long age;
+
+ if (ifa->ifa_flags & IFA_F_PERMANENT)
+ continue;
+
+ /* We try to batch several events at once. */
+ age = (now - ifa->ifa_tstamp +
+ ADDRCONF_TIMER_FUZZ_MINUS) / HZ;
+
+ if (ifa->ifa_valid_lft != INFINITY_LIFE_TIME &&
+ age >= ifa->ifa_valid_lft) {
+ struct in_ifaddr **ifap;
+
+ for (ifap = &ifa->ifa_dev->ifa_list;
+ *ifap != NULL; ifap = &(*ifap)->ifa_next) {
+ if (*ifap == ifa) {
+ inet_del_ifa(ifa->ifa_dev,
+ ifap, 1);
+ break;
+ }
+ }
+ } else if (ifa->ifa_preferred_lft !=
+ INFINITY_LIFE_TIME &&
+ age >= ifa->ifa_preferred_lft &&
+ !(ifa->ifa_flags & IFA_F_DEPRECATED)) {
+ ifa->ifa_flags |= IFA_F_DEPRECATED;
+ rtmsg_ifa(RTM_NEWADDR, ifa, NULL, 0);
+ }
+ }
+ rtnl_unlock();
+ }
+
+ next_sec = round_jiffies_up(next);
+ next_sched = next;
+
+ /* If rounded timeout is accurate enough, accept it. */
+ if (time_before(next_sec, next + ADDRCONF_TIMER_FUZZ))
+ next_sched = next_sec;
+
+ now = jiffies;
+ /* And minimum interval is ADDRCONF_TIMER_FUZZ_MAX. */
+ if (time_before(next_sched, now + ADDRCONF_TIMER_FUZZ_MAX))
+ next_sched = now + ADDRCONF_TIMER_FUZZ_MAX;
+
+ queue_delayed_work(system_power_efficient_wq, &check_lifetime_work,
+ next_sched - now);
+}
+
+static void set_ifa_lifetime(struct in_ifaddr *ifa, __u32 valid_lft,
+ __u32 prefered_lft)
+{
+ unsigned long timeout;
+
+ ifa->ifa_flags &= ~(IFA_F_PERMANENT | IFA_F_DEPRECATED);
+
+ timeout = addrconf_timeout_fixup(valid_lft, HZ);
+ if (addrconf_finite_timeout(timeout))
+ ifa->ifa_valid_lft = timeout;
+ else
+ ifa->ifa_flags |= IFA_F_PERMANENT;
+
+ timeout = addrconf_timeout_fixup(prefered_lft, HZ);
+ if (addrconf_finite_timeout(timeout)) {
+ if (timeout == 0)
+ ifa->ifa_flags |= IFA_F_DEPRECATED;
+ ifa->ifa_preferred_lft = timeout;
+ }
+ ifa->ifa_tstamp = jiffies;
+ if (!ifa->ifa_cstamp)
+ ifa->ifa_cstamp = ifa->ifa_tstamp;
+}
+
+static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh,
+ __u32 *pvalid_lft, __u32 *pprefered_lft)
+{
+ struct nlattr *tb[IFA_MAX+1];
+ struct in_ifaddr *ifa;
+ struct ifaddrmsg *ifm;
+ struct net_device *dev;
+ struct in_device *in_dev;
+ int err;
+
+ err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv4_policy);
+ if (err < 0)
+ goto errout;
+
+ ifm = nlmsg_data(nlh);
+ err = -EINVAL;
+ if (ifm->ifa_prefixlen > 32 || !tb[IFA_LOCAL])
+ goto errout;
+
+ dev = __dev_get_by_index(net, ifm->ifa_index);
+ err = -ENODEV;
+ if (!dev)
+ goto errout;
+
+ in_dev = __in_dev_get_rtnl(dev);
+ err = -ENOBUFS;
+ if (!in_dev)
+ goto errout;
+
+ ifa = inet_alloc_ifa();
+ if (!ifa)
+ /*
+ * A potential indev allocation can be left alive, it stays
+ * assigned to its device and is destroy with it.
+ */
+ goto errout;
+
+ ipv4_devconf_setall(in_dev);
+ neigh_parms_data_state_setall(in_dev->arp_parms);
+ in_dev_hold(in_dev);
+
+ if (!tb[IFA_ADDRESS])
+ tb[IFA_ADDRESS] = tb[IFA_LOCAL];
+
+ INIT_HLIST_NODE(&ifa->hash);
+ ifa->ifa_prefixlen = ifm->ifa_prefixlen;
+ ifa->ifa_mask = inet_make_mask(ifm->ifa_prefixlen);
+ ifa->ifa_flags = tb[IFA_FLAGS] ? nla_get_u32(tb[IFA_FLAGS]) :
+ ifm->ifa_flags;
+ ifa->ifa_scope = ifm->ifa_scope;
+ ifa->ifa_dev = in_dev;
+
+ ifa->ifa_local = nla_get_in_addr(tb[IFA_LOCAL]);
+ ifa->ifa_address = nla_get_in_addr(tb[IFA_ADDRESS]);
+
+ if (tb[IFA_BROADCAST])
+ ifa->ifa_broadcast = nla_get_in_addr(tb[IFA_BROADCAST]);
+
+ if (tb[IFA_LABEL])
+ nla_strlcpy(ifa->ifa_label, tb[IFA_LABEL], IFNAMSIZ);
+ else
+ memcpy(ifa->ifa_label, dev->name, IFNAMSIZ);
+
+ if (tb[IFA_CACHEINFO]) {
+ struct ifa_cacheinfo *ci;
+
+ ci = nla_data(tb[IFA_CACHEINFO]);
+ if (!ci->ifa_valid || ci->ifa_prefered > ci->ifa_valid) {
+ err = -EINVAL;
+ goto errout_free;
+ }
+ *pvalid_lft = ci->ifa_valid;
+ *pprefered_lft = ci->ifa_prefered;
+ }
+
+ return ifa;
+
+errout_free:
+ inet_free_ifa(ifa);
+errout:
+ return ERR_PTR(err);
+}
+
+static struct in_ifaddr *find_matching_ifa(struct in_ifaddr *ifa)
+{
+ struct in_device *in_dev = ifa->ifa_dev;
+ struct in_ifaddr *ifa1, **ifap;
+
+ if (!ifa->ifa_local)
+ return NULL;
+
+ for (ifap = &in_dev->ifa_list; (ifa1 = *ifap) != NULL;
+ ifap = &ifa1->ifa_next) {
+ if (ifa1->ifa_mask == ifa->ifa_mask &&
+ inet_ifa_match(ifa1->ifa_address, ifa) &&
+ ifa1->ifa_local == ifa->ifa_local)
+ return ifa1;
+ }
+ return NULL;
+}
+
+static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+ struct net *net = sock_net(skb->sk);
+ struct in_ifaddr *ifa;
+ struct in_ifaddr *ifa_existing;
+ __u32 valid_lft = INFINITY_LIFE_TIME;
+ __u32 prefered_lft = INFINITY_LIFE_TIME;
+
+ ASSERT_RTNL();
+
+ ifa = rtm_to_ifaddr(net, nlh, &valid_lft, &prefered_lft);
+ if (IS_ERR(ifa))
+ return PTR_ERR(ifa);
+
+ ifa_existing = find_matching_ifa(ifa);
+ if (!ifa_existing) {
+ /* It would be best to check for !NLM_F_CREATE here but
+ * userspace already relies on not having to provide this.
+ */
+ set_ifa_lifetime(ifa, valid_lft, prefered_lft);
+ if (ifa->ifa_flags & IFA_F_MCAUTOJOIN) {
+ int ret = ip_mc_config(net->ipv4.mc_autojoin_sk,
+ true, ifa);
+
+ if (ret < 0) {
+ inet_free_ifa(ifa);
+ return ret;
+ }
+ }
+ return __inet_insert_ifa(ifa, nlh, NETLINK_CB(skb).portid);
+ } else {
+ inet_free_ifa(ifa);
+
+ if (nlh->nlmsg_flags & NLM_F_EXCL ||
+ !(nlh->nlmsg_flags & NLM_F_REPLACE))
+ return -EEXIST;
+ ifa = ifa_existing;
+ set_ifa_lifetime(ifa, valid_lft, prefered_lft);
+ cancel_delayed_work(&check_lifetime_work);
+ queue_delayed_work(system_power_efficient_wq,
+ &check_lifetime_work, 0);
+ rtmsg_ifa(RTM_NEWADDR, ifa, nlh, NETLINK_CB(skb).portid);
+ blocking_notifier_call_chain(&inetaddr_chain, NETDEV_UP, ifa);
+ }
+ return 0;
+}
+
+/*
+ * Determine a default network mask, based on the IP address.
+ */
+
+static int inet_abc_len(__be32 addr)
+{
+ int rc = -1; /* Something else, probably a multicast. */
+
+ if (ipv4_is_zeronet(addr))
+ rc = 0;
+ else {
+ __u32 haddr = ntohl(addr);
+
+ if (IN_CLASSA(haddr))
+ rc = 8;
+ else if (IN_CLASSB(haddr))
+ rc = 16;
+ else if (IN_CLASSC(haddr))
+ rc = 24;
+ }
+
+ return rc;
+}
+
+
+int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg)
+{
+ struct ifreq ifr;
+ struct sockaddr_in sin_orig;
+ struct sockaddr_in *sin = (struct sockaddr_in *)&ifr.ifr_addr;
+ struct in_device *in_dev;
+ struct in_ifaddr **ifap = NULL;
+ struct in_ifaddr *ifa = NULL;
+ struct net_device *dev;
+ char *colon;
+ int ret = -EFAULT;
+ int tryaddrmatch = 0;
+
+ /*
+ * Fetch the caller's info block into kernel space
+ */
+
+ if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
+ goto out;
+ ifr.ifr_name[IFNAMSIZ - 1] = 0;
+
+ /* save original address for comparison */
+ memcpy(&sin_orig, sin, sizeof(*sin));
+
+ colon = strchr(ifr.ifr_name, ':');
+ if (colon)
+ *colon = 0;
+
+ dev_load(net, ifr.ifr_name);
+
+ switch (cmd) {
+ case SIOCGIFADDR: /* Get interface address */
+ case SIOCGIFBRDADDR: /* Get the broadcast address */
+ case SIOCGIFDSTADDR: /* Get the destination address */
+ case SIOCGIFNETMASK: /* Get the netmask for the interface */
+ /* Note that these ioctls will not sleep,
+ so that we do not impose a lock.
+ One day we will be forced to put shlock here (I mean SMP)
+ */
+ tryaddrmatch = (sin_orig.sin_family == AF_INET);
+ memset(sin, 0, sizeof(*sin));
+ sin->sin_family = AF_INET;
+ break;
+
+ case SIOCSIFFLAGS:
+ ret = -EPERM;
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+ goto out;
+ break;
+ case SIOCSIFADDR: /* Set interface address (and family) */
+ case SIOCSIFBRDADDR: /* Set the broadcast address */
+ case SIOCSIFDSTADDR: /* Set the destination address */
+ case SIOCSIFNETMASK: /* Set the netmask for the interface */
+ ret = -EPERM;
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+ goto out;
+ ret = -EINVAL;
+ if (sin->sin_family != AF_INET)
+ goto out;
+ break;
+ default:
+ ret = -EINVAL;
+ goto out;
+ }
+
+ rtnl_lock();
+
+ ret = -ENODEV;
+ dev = __dev_get_by_name(net, ifr.ifr_name);
+ if (!dev)
+ goto done;
+
+ if (colon)
+ *colon = ':';
+
+ in_dev = __in_dev_get_rtnl(dev);
+ if (in_dev) {
+ if (tryaddrmatch) {
+ /* Matthias Andree */
+ /* compare label and address (4.4BSD style) */
+ /* note: we only do this for a limited set of ioctls
+ and only if the original address family was AF_INET.
+ This is checked above. */
+ for (ifap = &in_dev->ifa_list; (ifa = *ifap) != NULL;
+ ifap = &ifa->ifa_next) {
+ if (!strcmp(ifr.ifr_name, ifa->ifa_label) &&
+ sin_orig.sin_addr.s_addr ==
+ ifa->ifa_local) {
+ break; /* found */
+ }
+ }
+ }
+ /* we didn't get a match, maybe the application is
+ 4.3BSD-style and passed in junk so we fall back to
+ comparing just the label */
+ if (!ifa) {
+ for (ifap = &in_dev->ifa_list; (ifa = *ifap) != NULL;
+ ifap = &ifa->ifa_next)
+ if (!strcmp(ifr.ifr_name, ifa->ifa_label))
+ break;
+ }
+ }
+
+ ret = -EADDRNOTAVAIL;
+ if (!ifa && cmd != SIOCSIFADDR && cmd != SIOCSIFFLAGS)
+ goto done;
+
+ switch (cmd) {
+ case SIOCGIFADDR: /* Get interface address */
+ sin->sin_addr.s_addr = ifa->ifa_local;
+ goto rarok;
+
+ case SIOCGIFBRDADDR: /* Get the broadcast address */
+ sin->sin_addr.s_addr = ifa->ifa_broadcast;
+ goto rarok;
+
+ case SIOCGIFDSTADDR: /* Get the destination address */
+ sin->sin_addr.s_addr = ifa->ifa_address;
+ goto rarok;
+
+ case SIOCGIFNETMASK: /* Get the netmask for the interface */
+ sin->sin_addr.s_addr = ifa->ifa_mask;
+ goto rarok;
+
+ case SIOCSIFFLAGS:
+ if (colon) {
+ ret = -EADDRNOTAVAIL;
+ if (!ifa)
+ break;
+ ret = 0;
+ if (!(ifr.ifr_flags & IFF_UP))
+ inet_del_ifa(in_dev, ifap, 1);
+ break;
+ }
+ ret = dev_change_flags(dev, ifr.ifr_flags);
+ break;
+
+ case SIOCSIFADDR: /* Set interface address (and family) */
+ ret = -EINVAL;
+ if (inet_abc_len(sin->sin_addr.s_addr) < 0)
+ break;
+
+ if (!ifa) {
+ ret = -ENOBUFS;
+ ifa = inet_alloc_ifa();
+ if (!ifa)
+ break;
+ INIT_HLIST_NODE(&ifa->hash);
+ if (colon)
+ memcpy(ifa->ifa_label, ifr.ifr_name, IFNAMSIZ);
+ else
+ memcpy(ifa->ifa_label, dev->name, IFNAMSIZ);
+ } else {
+ ret = 0;
+ if (ifa->ifa_local == sin->sin_addr.s_addr)
+ break;
+ inet_del_ifa(in_dev, ifap, 0);
+ ifa->ifa_broadcast = 0;
+ ifa->ifa_scope = 0;
+ }
+
+ ifa->ifa_address = ifa->ifa_local = sin->sin_addr.s_addr;
+
+ if (!(dev->flags & IFF_POINTOPOINT)) {
+ ifa->ifa_prefixlen = inet_abc_len(ifa->ifa_address);
+ ifa->ifa_mask = inet_make_mask(ifa->ifa_prefixlen);
+ if ((dev->flags & IFF_BROADCAST) &&
+ ifa->ifa_prefixlen < 31)
+ ifa->ifa_broadcast = ifa->ifa_address |
+ ~ifa->ifa_mask;
+ } else {
+ ifa->ifa_prefixlen = 32;
+ ifa->ifa_mask = inet_make_mask(32);
+ }
+ set_ifa_lifetime(ifa, INFINITY_LIFE_TIME, INFINITY_LIFE_TIME);
+ ret = inet_set_ifa(dev, ifa);
+ break;
+
+ case SIOCSIFBRDADDR: /* Set the broadcast address */
+ ret = 0;
+ if (ifa->ifa_broadcast != sin->sin_addr.s_addr) {
+ inet_del_ifa(in_dev, ifap, 0);
+ ifa->ifa_broadcast = sin->sin_addr.s_addr;
+ inet_insert_ifa(ifa);
+ }
+ break;
+
+ case SIOCSIFDSTADDR: /* Set the destination address */
+ ret = 0;
+ if (ifa->ifa_address == sin->sin_addr.s_addr)
+ break;
+ ret = -EINVAL;
+ if (inet_abc_len(sin->sin_addr.s_addr) < 0)
+ break;
+ ret = 0;
+ inet_del_ifa(in_dev, ifap, 0);
+ ifa->ifa_address = sin->sin_addr.s_addr;
+ inet_insert_ifa(ifa);
+ break;
+
+ case SIOCSIFNETMASK: /* Set the netmask for the interface */
+
+ /*
+ * The mask we set must be legal.
+ */
+ ret = -EINVAL;
+ if (bad_mask(sin->sin_addr.s_addr, 0))
+ break;
+ ret = 0;
+ if (ifa->ifa_mask != sin->sin_addr.s_addr) {
+ __be32 old_mask = ifa->ifa_mask;
+ inet_del_ifa(in_dev, ifap, 0);
+ ifa->ifa_mask = sin->sin_addr.s_addr;
+ ifa->ifa_prefixlen = inet_mask_len(ifa->ifa_mask);
+
+ /* See if current broadcast address matches
+ * with current netmask, then recalculate
+ * the broadcast address. Otherwise it's a
+ * funny address, so don't touch it since
+ * the user seems to know what (s)he's doing...
+ */
+ if ((dev->flags & IFF_BROADCAST) &&
+ (ifa->ifa_prefixlen < 31) &&
+ (ifa->ifa_broadcast ==
+ (ifa->ifa_local|~old_mask))) {
+ ifa->ifa_broadcast = (ifa->ifa_local |
+ ~sin->sin_addr.s_addr);
+ }
+ inet_insert_ifa(ifa);
+ }
+ break;
+ }
+done:
+ rtnl_unlock();
+out:
+ return ret;
+rarok:
+ rtnl_unlock();
+ ret = copy_to_user(arg, &ifr, sizeof(struct ifreq)) ? -EFAULT : 0;
+ goto out;
+}
+
+static int inet_gifconf(struct net_device *dev, char __user *buf, int len)
+{
+ struct in_device *in_dev = __in_dev_get_rtnl(dev);
+ struct in_ifaddr *ifa;
+ struct ifreq ifr;
+ int done = 0;
+
+ if (!in_dev)
+ goto out;
+
+ for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) {
+ if (!buf) {
+ done += sizeof(ifr);
+ continue;
+ }
+ if (len < (int) sizeof(ifr))
+ break;
+ memset(&ifr, 0, sizeof(struct ifreq));
+ strcpy(ifr.ifr_name, ifa->ifa_label);
+
+ (*(struct sockaddr_in *)&ifr.ifr_addr).sin_family = AF_INET;
+ (*(struct sockaddr_in *)&ifr.ifr_addr).sin_addr.s_addr =
+ ifa->ifa_local;
+
+ if (copy_to_user(buf, &ifr, sizeof(struct ifreq))) {
+ done = -EFAULT;
+ break;
+ }
+ buf += sizeof(struct ifreq);
+ len -= sizeof(struct ifreq);
+ done += sizeof(struct ifreq);
+ }
+out:
+ return done;
+}
+
+__be32 inet_select_addr(const struct net_device *dev, __be32 dst, int scope)
+{
+ __be32 addr = 0;
+ struct in_device *in_dev;
+ struct net *net = dev_net(dev);
+
+ rcu_read_lock();
+ in_dev = __in_dev_get_rcu(dev);
+ if (!in_dev)
+ goto no_in_dev;
+
+ for_primary_ifa(in_dev) {
+ if (ifa->ifa_scope > scope)
+ continue;
+ if (!dst || inet_ifa_match(dst, ifa)) {
+ addr = ifa->ifa_local;
+ break;
+ }
+ if (!addr)
+ addr = ifa->ifa_local;
+ } endfor_ifa(in_dev);
+
+ if (addr)
+ goto out_unlock;
+no_in_dev:
+
+ /* Not loopback addresses on loopback should be preferred
+ in this case. It is important that lo is the first interface
+ in dev_base list.
+ */
+ for_each_netdev_rcu(net, dev) {
+ in_dev = __in_dev_get_rcu(dev);
+ if (!in_dev)
+ continue;
+
+ for_primary_ifa(in_dev) {
+ if (ifa->ifa_scope != RT_SCOPE_LINK &&
+ ifa->ifa_scope <= scope) {
+ addr = ifa->ifa_local;
+ goto out_unlock;
+ }
+ } endfor_ifa(in_dev);
+ }
+out_unlock:
+ rcu_read_unlock();
+ return addr;
+}
+EXPORT_SYMBOL(inet_select_addr);
+
+static __be32 confirm_addr_indev(struct in_device *in_dev, __be32 dst,
+ __be32 local, int scope)
+{
+ int same = 0;
+ __be32 addr = 0;
+
+ for_ifa(in_dev) {
+ if (!addr &&
+ (local == ifa->ifa_local || !local) &&
+ ifa->ifa_scope <= scope) {
+ addr = ifa->ifa_local;
+ if (same)
+ break;
+ }
+ if (!same) {
+ same = (!local || inet_ifa_match(local, ifa)) &&
+ (!dst || inet_ifa_match(dst, ifa));
+ if (same && addr) {
+ if (local || !dst)
+ break;
+ /* Is the selected addr into dst subnet? */
+ if (inet_ifa_match(addr, ifa))
+ break;
+ /* No, then can we use new local src? */
+ if (ifa->ifa_scope <= scope) {
+ addr = ifa->ifa_local;
+ break;
+ }
+ /* search for large dst subnet for addr */
+ same = 0;
+ }
+ }
+ } endfor_ifa(in_dev);
+
+ return same ? addr : 0;
+}
+
+/*
+ * Confirm that local IP address exists using wildcards:
+ * - net: netns to check, cannot be NULL
+ * - in_dev: only on this interface, NULL=any interface
+ * - dst: only in the same subnet as dst, 0=any dst
+ * - local: address, 0=autoselect the local address
+ * - scope: maximum allowed scope value for the local address
+ */
+__be32 inet_confirm_addr(struct net *net, struct in_device *in_dev,
+ __be32 dst, __be32 local, int scope)
+{
+ __be32 addr = 0;
+ struct net_device *dev;
+
+ if (in_dev)
+ return confirm_addr_indev(in_dev, dst, local, scope);
+
+ rcu_read_lock();
+ for_each_netdev_rcu(net, dev) {
+ in_dev = __in_dev_get_rcu(dev);
+ if (in_dev) {
+ addr = confirm_addr_indev(in_dev, dst, local, scope);
+ if (addr)
+ break;
+ }
+ }
+ rcu_read_unlock();
+
+ return addr;
+}
+EXPORT_SYMBOL(inet_confirm_addr);
+
+/*
+ * Device notifier
+ */
+
+int register_inetaddr_notifier(struct notifier_block *nb)
+{
+ return blocking_notifier_chain_register(&inetaddr_chain, nb);
+}
+EXPORT_SYMBOL(register_inetaddr_notifier);
+
+int unregister_inetaddr_notifier(struct notifier_block *nb)
+{
+ return blocking_notifier_chain_unregister(&inetaddr_chain, nb);
+}
+EXPORT_SYMBOL(unregister_inetaddr_notifier);
+
+/* Rename ifa_labels for a device name change. Make some effort to preserve
+ * existing alias numbering and to create unique labels if possible.
+*/
+static void inetdev_changename(struct net_device *dev, struct in_device *in_dev)
+{
+ struct in_ifaddr *ifa;
+ int named = 0;
+
+ for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) {
+ char old[IFNAMSIZ], *dot;
+
+ memcpy(old, ifa->ifa_label, IFNAMSIZ);
+ memcpy(ifa->ifa_label, dev->name, IFNAMSIZ);
+ if (named++ == 0)
+ goto skip;
+ dot = strchr(old, ':');
+ if (!dot) {
+ sprintf(old, ":%d", named);
+ dot = old;
+ }
+ if (strlen(dot) + strlen(dev->name) < IFNAMSIZ)
+ strcat(ifa->ifa_label, dot);
+ else
+ strcpy(ifa->ifa_label + (IFNAMSIZ - strlen(dot) - 1), dot);
+skip:
+ rtmsg_ifa(RTM_NEWADDR, ifa, NULL, 0);
+ }
+}
+
+static bool inetdev_valid_mtu(unsigned int mtu)
+{
+ return mtu >= 68;
+}
+
+static void inetdev_send_gratuitous_arp(struct net_device *dev,
+ struct in_device *in_dev)
+
+{
+ struct in_ifaddr *ifa;
+
+ for (ifa = in_dev->ifa_list; ifa;
+ ifa = ifa->ifa_next) {
+ arp_send(ARPOP_REQUEST, ETH_P_ARP,
+ ifa->ifa_local, dev,
+ ifa->ifa_local, NULL,
+ dev->dev_addr, NULL);
+ }
+}
+
+/* Called only under RTNL semaphore */
+
+static int inetdev_event(struct notifier_block *this, unsigned long event,
+ void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct in_device *in_dev = __in_dev_get_rtnl(dev);
+
+ ASSERT_RTNL();
+
+ if (!in_dev) {
+ if (event == NETDEV_REGISTER) {
+ in_dev = inetdev_init(dev);
+ if (IS_ERR(in_dev))
+ return notifier_from_errno(PTR_ERR(in_dev));
+ if (dev->flags & IFF_LOOPBACK) {
+ IN_DEV_CONF_SET(in_dev, NOXFRM, 1);
+ IN_DEV_CONF_SET(in_dev, NOPOLICY, 1);
+ }
+ } else if (event == NETDEV_CHANGEMTU) {
+ /* Re-enabling IP */
+ if (inetdev_valid_mtu(dev->mtu))
+ in_dev = inetdev_init(dev);
+ }
+ goto out;
+ }
+
+ switch (event) {
+ case NETDEV_REGISTER:
+ pr_debug("%s: bug\n", __func__);
+ RCU_INIT_POINTER(dev->ip_ptr, NULL);
+ break;
+ case NETDEV_UP:
+ if (!inetdev_valid_mtu(dev->mtu))
+ break;
+ if (dev->flags & IFF_LOOPBACK) {
+ struct in_ifaddr *ifa = inet_alloc_ifa();
+
+ if (ifa) {
+ INIT_HLIST_NODE(&ifa->hash);
+ ifa->ifa_local =
+ ifa->ifa_address = htonl(INADDR_LOOPBACK);
+ ifa->ifa_prefixlen = 8;
+ ifa->ifa_mask = inet_make_mask(8);
+ in_dev_hold(in_dev);
+ ifa->ifa_dev = in_dev;
+ ifa->ifa_scope = RT_SCOPE_HOST;
+ memcpy(ifa->ifa_label, dev->name, IFNAMSIZ);
+ set_ifa_lifetime(ifa, INFINITY_LIFE_TIME,
+ INFINITY_LIFE_TIME);
+ ipv4_devconf_setall(in_dev);
+ neigh_parms_data_state_setall(in_dev->arp_parms);
+ inet_insert_ifa(ifa);
+ }
+ }
+ ip_mc_up(in_dev);
+ /* fall through */
+ case NETDEV_CHANGEADDR:
+ if (!IN_DEV_ARP_NOTIFY(in_dev))
+ break;
+ /* fall through */
+ case NETDEV_NOTIFY_PEERS:
+ /* Send gratuitous ARP to notify of link change */
+ inetdev_send_gratuitous_arp(dev, in_dev);
+ break;
+ case NETDEV_DOWN:
+ ip_mc_down(in_dev);
+ break;
+ case NETDEV_PRE_TYPE_CHANGE:
+ ip_mc_unmap(in_dev);
+ break;
+ case NETDEV_POST_TYPE_CHANGE:
+ ip_mc_remap(in_dev);
+ break;
+ case NETDEV_CHANGEMTU:
+ if (inetdev_valid_mtu(dev->mtu))
+ break;
+ /* disable IP when MTU is not enough */
+ case NETDEV_UNREGISTER:
+ inetdev_destroy(in_dev);
+ break;
+ case NETDEV_CHANGENAME:
+ /* Do not notify about label change, this event is
+ * not interesting to applications using netlink.
+ */
+ inetdev_changename(dev, in_dev);
+
+ devinet_sysctl_unregister(in_dev);
+ devinet_sysctl_register(in_dev);
+ break;
+ }
+out:
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block ip_netdev_notifier = {
+ .notifier_call = inetdev_event,
+};
+
+static size_t inet_nlmsg_size(void)
+{
+ return NLMSG_ALIGN(sizeof(struct ifaddrmsg))
+ + nla_total_size(4) /* IFA_ADDRESS */
+ + nla_total_size(4) /* IFA_LOCAL */
+ + nla_total_size(4) /* IFA_BROADCAST */
+ + nla_total_size(IFNAMSIZ) /* IFA_LABEL */
+ + nla_total_size(4) /* IFA_FLAGS */
+ + nla_total_size(sizeof(struct ifa_cacheinfo)); /* IFA_CACHEINFO */
+}
+
+static inline u32 cstamp_delta(unsigned long cstamp)
+{
+ return (cstamp - INITIAL_JIFFIES) * 100UL / HZ;
+}
+
+static int put_cacheinfo(struct sk_buff *skb, unsigned long cstamp,
+ unsigned long tstamp, u32 preferred, u32 valid)
+{
+ struct ifa_cacheinfo ci;
+
+ ci.cstamp = cstamp_delta(cstamp);
+ ci.tstamp = cstamp_delta(tstamp);
+ ci.ifa_prefered = preferred;
+ ci.ifa_valid = valid;
+
+ return nla_put(skb, IFA_CACHEINFO, sizeof(ci), &ci);
+}
+
+static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa,
+ u32 portid, u32 seq, int event, unsigned int flags)
+{
+ struct ifaddrmsg *ifm;
+ struct nlmsghdr *nlh;
+ u32 preferred, valid;
+
+ nlh = nlmsg_put(skb, portid, seq, event, sizeof(*ifm), flags);
+ if (!nlh)
+ return -EMSGSIZE;
+
+ ifm = nlmsg_data(nlh);
+ ifm->ifa_family = AF_INET;
+ ifm->ifa_prefixlen = ifa->ifa_prefixlen;
+ ifm->ifa_flags = ifa->ifa_flags;
+ ifm->ifa_scope = ifa->ifa_scope;
+ ifm->ifa_index = ifa->ifa_dev->dev->ifindex;
+
+ if (!(ifm->ifa_flags & IFA_F_PERMANENT)) {
+ preferred = ifa->ifa_preferred_lft;
+ valid = ifa->ifa_valid_lft;
+ if (preferred != INFINITY_LIFE_TIME) {
+ long tval = (jiffies - ifa->ifa_tstamp) / HZ;
+
+ if (preferred > tval)
+ preferred -= tval;
+ else
+ preferred = 0;
+ if (valid != INFINITY_LIFE_TIME) {
+ if (valid > tval)
+ valid -= tval;
+ else
+ valid = 0;
+ }
+ }
+ } else {
+ preferred = INFINITY_LIFE_TIME;
+ valid = INFINITY_LIFE_TIME;
+ }
+ if ((ifa->ifa_address &&
+ nla_put_in_addr(skb, IFA_ADDRESS, ifa->ifa_address)) ||
+ (ifa->ifa_local &&
+ nla_put_in_addr(skb, IFA_LOCAL, ifa->ifa_local)) ||
+ (ifa->ifa_broadcast &&
+ nla_put_in_addr(skb, IFA_BROADCAST, ifa->ifa_broadcast)) ||
+ (ifa->ifa_label[0] &&
+ nla_put_string(skb, IFA_LABEL, ifa->ifa_label)) ||
+ nla_put_u32(skb, IFA_FLAGS, ifa->ifa_flags) ||
+ put_cacheinfo(skb, ifa->ifa_cstamp, ifa->ifa_tstamp,
+ preferred, valid))
+ goto nla_put_failure;
+
+ nlmsg_end(skb, nlh);
+ return 0;
+
+nla_put_failure:
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+}
+
+static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct net *net = sock_net(skb->sk);
+ int h, s_h;
+ int idx, s_idx;
+ int ip_idx, s_ip_idx;
+ struct net_device *dev;
+ struct in_device *in_dev;
+ struct in_ifaddr *ifa;
+ struct hlist_head *head;
+
+ s_h = cb->args[0];
+ s_idx = idx = cb->args[1];
+ s_ip_idx = ip_idx = cb->args[2];
+
+ for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) {
+ idx = 0;
+ head = &net->dev_index_head[h];
+ rcu_read_lock();
+ cb->seq = atomic_read(&net->ipv4.dev_addr_genid) ^
+ net->dev_base_seq;
+ hlist_for_each_entry_rcu(dev, head, index_hlist) {
+ if (idx < s_idx)
+ goto cont;
+ if (h > s_h || idx > s_idx)
+ s_ip_idx = 0;
+ in_dev = __in_dev_get_rcu(dev);
+ if (!in_dev)
+ goto cont;
+
+ for (ifa = in_dev->ifa_list, ip_idx = 0; ifa;
+ ifa = ifa->ifa_next, ip_idx++) {
+ if (ip_idx < s_ip_idx)
+ continue;
+ if (inet_fill_ifaddr(skb, ifa,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ RTM_NEWADDR, NLM_F_MULTI) < 0) {
+ rcu_read_unlock();
+ goto done;
+ }
+ nl_dump_check_consistent(cb, nlmsg_hdr(skb));
+ }
+cont:
+ idx++;
+ }
+ rcu_read_unlock();
+ }
+
+done:
+ cb->args[0] = h;
+ cb->args[1] = idx;
+ cb->args[2] = ip_idx;
+
+ return skb->len;
+}
+
+static void rtmsg_ifa(int event, struct in_ifaddr *ifa, struct nlmsghdr *nlh,
+ u32 portid)
+{
+ struct sk_buff *skb;
+ u32 seq = nlh ? nlh->nlmsg_seq : 0;
+ int err = -ENOBUFS;
+ struct net *net;
+
+ net = dev_net(ifa->ifa_dev->dev);
+ skb = nlmsg_new(inet_nlmsg_size(), GFP_KERNEL);
+ if (!skb)
+ goto errout;
+
+ err = inet_fill_ifaddr(skb, ifa, portid, seq, event, 0);
+ if (err < 0) {
+ /* -EMSGSIZE implies BUG in inet_nlmsg_size() */
+ WARN_ON(err == -EMSGSIZE);
+ kfree_skb(skb);
+ goto errout;
+ }
+ rtnl_notify(skb, net, portid, RTNLGRP_IPV4_IFADDR, nlh, GFP_KERNEL);
+ return;
+errout:
+ if (err < 0)
+ rtnl_set_sk_err(net, RTNLGRP_IPV4_IFADDR, err);
+}
+
+static size_t inet_get_link_af_size(const struct net_device *dev)
+{
+ struct in_device *in_dev = rcu_dereference_rtnl(dev->ip_ptr);
+
+ if (!in_dev)
+ return 0;
+
+ return nla_total_size(IPV4_DEVCONF_MAX * 4); /* IFLA_INET_CONF */
+}
+
+static int inet_fill_link_af(struct sk_buff *skb, const struct net_device *dev)
+{
+ struct in_device *in_dev = rcu_dereference_rtnl(dev->ip_ptr);
+ struct nlattr *nla;
+ int i;
+
+ if (!in_dev)
+ return -ENODATA;
+
+ nla = nla_reserve(skb, IFLA_INET_CONF, IPV4_DEVCONF_MAX * 4);
+ if (!nla)
+ return -EMSGSIZE;
+
+ for (i = 0; i < IPV4_DEVCONF_MAX; i++)
+ ((u32 *) nla_data(nla))[i] = in_dev->cnf.data[i];
+
+ return 0;
+}
+
+static const struct nla_policy inet_af_policy[IFLA_INET_MAX+1] = {
+ [IFLA_INET_CONF] = { .type = NLA_NESTED },
+};
+
+static int inet_validate_link_af(const struct net_device *dev,
+ const struct nlattr *nla)
+{
+ struct nlattr *a, *tb[IFLA_INET_MAX+1];
+ int err, rem;
+
+ if (dev && !__in_dev_get_rtnl(dev))
+ return -EAFNOSUPPORT;
+
+ err = nla_parse_nested(tb, IFLA_INET_MAX, nla, inet_af_policy);
+ if (err < 0)
+ return err;
+
+ if (tb[IFLA_INET_CONF]) {
+ nla_for_each_nested(a, tb[IFLA_INET_CONF], rem) {
+ int cfgid = nla_type(a);
+
+ if (nla_len(a) < 4)
+ return -EINVAL;
+
+ if (cfgid <= 0 || cfgid > IPV4_DEVCONF_MAX)
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
+static int inet_set_link_af(struct net_device *dev, const struct nlattr *nla)
+{
+ struct in_device *in_dev = __in_dev_get_rtnl(dev);
+ struct nlattr *a, *tb[IFLA_INET_MAX+1];
+ int rem;
+
+ if (!in_dev)
+ return -EAFNOSUPPORT;
+
+ if (nla_parse_nested(tb, IFLA_INET_MAX, nla, NULL) < 0)
+ BUG();
+
+ if (tb[IFLA_INET_CONF]) {
+ nla_for_each_nested(a, tb[IFLA_INET_CONF], rem)
+ ipv4_devconf_set(in_dev, nla_type(a), nla_get_u32(a));
+ }
+
+ return 0;
+}
+
+static int inet_netconf_msgsize_devconf(int type)
+{
+ int size = NLMSG_ALIGN(sizeof(struct netconfmsg))
+ + nla_total_size(4); /* NETCONFA_IFINDEX */
+
+ /* type -1 is used for ALL */
+ if (type == -1 || type == NETCONFA_FORWARDING)
+ size += nla_total_size(4);
+ if (type == -1 || type == NETCONFA_RP_FILTER)
+ size += nla_total_size(4);
+ if (type == -1 || type == NETCONFA_MC_FORWARDING)
+ size += nla_total_size(4);
+ if (type == -1 || type == NETCONFA_PROXY_NEIGH)
+ size += nla_total_size(4);
+
+ return size;
+}
+
+static int inet_netconf_fill_devconf(struct sk_buff *skb, int ifindex,
+ struct ipv4_devconf *devconf, u32 portid,
+ u32 seq, int event, unsigned int flags,
+ int type)
+{
+ struct nlmsghdr *nlh;
+ struct netconfmsg *ncm;
+
+ nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct netconfmsg),
+ flags);
+ if (!nlh)
+ return -EMSGSIZE;
+
+ ncm = nlmsg_data(nlh);
+ ncm->ncm_family = AF_INET;
+
+ if (nla_put_s32(skb, NETCONFA_IFINDEX, ifindex) < 0)
+ goto nla_put_failure;
+
+ /* type -1 is used for ALL */
+ if ((type == -1 || type == NETCONFA_FORWARDING) &&
+ nla_put_s32(skb, NETCONFA_FORWARDING,
+ IPV4_DEVCONF(*devconf, FORWARDING)) < 0)
+ goto nla_put_failure;
+ if ((type == -1 || type == NETCONFA_RP_FILTER) &&
+ nla_put_s32(skb, NETCONFA_RP_FILTER,
+ IPV4_DEVCONF(*devconf, RP_FILTER)) < 0)
+ goto nla_put_failure;
+ if ((type == -1 || type == NETCONFA_MC_FORWARDING) &&
+ nla_put_s32(skb, NETCONFA_MC_FORWARDING,
+ IPV4_DEVCONF(*devconf, MC_FORWARDING)) < 0)
+ goto nla_put_failure;
+ if ((type == -1 || type == NETCONFA_PROXY_NEIGH) &&
+ nla_put_s32(skb, NETCONFA_PROXY_NEIGH,
+ IPV4_DEVCONF(*devconf, PROXY_ARP)) < 0)
+ goto nla_put_failure;
+
+ nlmsg_end(skb, nlh);
+ return 0;
+
+nla_put_failure:
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+}
+
+void inet_netconf_notify_devconf(struct net *net, int type, int ifindex,
+ struct ipv4_devconf *devconf)
+{
+ struct sk_buff *skb;
+ int err = -ENOBUFS;
+
+ skb = nlmsg_new(inet_netconf_msgsize_devconf(type), GFP_ATOMIC);
+ if (!skb)
+ goto errout;
+
+ err = inet_netconf_fill_devconf(skb, ifindex, devconf, 0, 0,
+ RTM_NEWNETCONF, 0, type);
+ if (err < 0) {
+ /* -EMSGSIZE implies BUG in inet_netconf_msgsize_devconf() */
+ WARN_ON(err == -EMSGSIZE);
+ kfree_skb(skb);
+ goto errout;
+ }
+ rtnl_notify(skb, net, 0, RTNLGRP_IPV4_NETCONF, NULL, GFP_ATOMIC);
+ return;
+errout:
+ if (err < 0)
+ rtnl_set_sk_err(net, RTNLGRP_IPV4_NETCONF, err);
+}
+
+static const struct nla_policy devconf_ipv4_policy[NETCONFA_MAX+1] = {
+ [NETCONFA_IFINDEX] = { .len = sizeof(int) },
+ [NETCONFA_FORWARDING] = { .len = sizeof(int) },
+ [NETCONFA_RP_FILTER] = { .len = sizeof(int) },
+ [NETCONFA_PROXY_NEIGH] = { .len = sizeof(int) },
+};
+
+static int inet_netconf_get_devconf(struct sk_buff *in_skb,
+ struct nlmsghdr *nlh)
+{
+ struct net *net = sock_net(in_skb->sk);
+ struct nlattr *tb[NETCONFA_MAX+1];
+ struct netconfmsg *ncm;
+ struct sk_buff *skb;
+ struct ipv4_devconf *devconf;
+ struct in_device *in_dev;
+ struct net_device *dev;
+ int ifindex;
+ int err;
+
+ err = nlmsg_parse(nlh, sizeof(*ncm), tb, NETCONFA_MAX,
+ devconf_ipv4_policy);
+ if (err < 0)
+ goto errout;
+
+ err = EINVAL;
+ if (!tb[NETCONFA_IFINDEX])
+ goto errout;
+
+ ifindex = nla_get_s32(tb[NETCONFA_IFINDEX]);
+ switch (ifindex) {
+ case NETCONFA_IFINDEX_ALL:
+ devconf = net->ipv4.devconf_all;
+ break;
+ case NETCONFA_IFINDEX_DEFAULT:
+ devconf = net->ipv4.devconf_dflt;
+ break;
+ default:
+ dev = __dev_get_by_index(net, ifindex);
+ if (!dev)
+ goto errout;
+ in_dev = __in_dev_get_rtnl(dev);
+ if (!in_dev)
+ goto errout;
+ devconf = &in_dev->cnf;
+ break;
+ }
+
+ err = -ENOBUFS;
+ skb = nlmsg_new(inet_netconf_msgsize_devconf(-1), GFP_ATOMIC);
+ if (!skb)
+ goto errout;
+
+ err = inet_netconf_fill_devconf(skb, ifindex, devconf,
+ NETLINK_CB(in_skb).portid,
+ nlh->nlmsg_seq, RTM_NEWNETCONF, 0,
+ -1);
+ if (err < 0) {
+ /* -EMSGSIZE implies BUG in inet_netconf_msgsize_devconf() */
+ WARN_ON(err == -EMSGSIZE);
+ kfree_skb(skb);
+ goto errout;
+ }
+ err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
+errout:
+ return err;
+}
+
+static int inet_netconf_dump_devconf(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ struct net *net = sock_net(skb->sk);
+ int h, s_h;
+ int idx, s_idx;
+ struct net_device *dev;
+ struct in_device *in_dev;
+ struct hlist_head *head;
+
+ s_h = cb->args[0];
+ s_idx = idx = cb->args[1];
+
+ for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) {
+ idx = 0;
+ head = &net->dev_index_head[h];
+ rcu_read_lock();
+ cb->seq = atomic_read(&net->ipv4.dev_addr_genid) ^
+ net->dev_base_seq;
+ hlist_for_each_entry_rcu(dev, head, index_hlist) {
+ if (idx < s_idx)
+ goto cont;
+ in_dev = __in_dev_get_rcu(dev);
+ if (!in_dev)
+ goto cont;
+
+ if (inet_netconf_fill_devconf(skb, dev->ifindex,
+ &in_dev->cnf,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ RTM_NEWNETCONF,
+ NLM_F_MULTI,
+ -1) < 0) {
+ rcu_read_unlock();
+ goto done;
+ }
+ nl_dump_check_consistent(cb, nlmsg_hdr(skb));
+cont:
+ idx++;
+ }
+ rcu_read_unlock();
+ }
+ if (h == NETDEV_HASHENTRIES) {
+ if (inet_netconf_fill_devconf(skb, NETCONFA_IFINDEX_ALL,
+ net->ipv4.devconf_all,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ RTM_NEWNETCONF, NLM_F_MULTI,
+ -1) < 0)
+ goto done;
+ else
+ h++;
+ }
+ if (h == NETDEV_HASHENTRIES + 1) {
+ if (inet_netconf_fill_devconf(skb, NETCONFA_IFINDEX_DEFAULT,
+ net->ipv4.devconf_dflt,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ RTM_NEWNETCONF, NLM_F_MULTI,
+ -1) < 0)
+ goto done;
+ else
+ h++;
+ }
+done:
+ cb->args[0] = h;
+ cb->args[1] = idx;
+
+ return skb->len;
+}
+
+#ifdef CONFIG_SYSCTL
+
+static void devinet_copy_dflt_conf(struct net *net, int i)
+{
+ struct net_device *dev;
+
+ rcu_read_lock();
+ for_each_netdev_rcu(net, dev) {
+ struct in_device *in_dev;
+
+ in_dev = __in_dev_get_rcu(dev);
+ if (in_dev && !test_bit(i, in_dev->cnf.state))
+ in_dev->cnf.data[i] = net->ipv4.devconf_dflt->data[i];
+ }
+ rcu_read_unlock();
+}
+
+/* called with RTNL locked */
+static void inet_forward_change(struct net *net)
+{
+ struct net_device *dev;
+ int on = IPV4_DEVCONF_ALL(net, FORWARDING);
+
+ IPV4_DEVCONF_ALL(net, ACCEPT_REDIRECTS) = !on;
+ IPV4_DEVCONF_DFLT(net, FORWARDING) = on;
+ inet_netconf_notify_devconf(net, NETCONFA_FORWARDING,
+ NETCONFA_IFINDEX_ALL,
+ net->ipv4.devconf_all);
+ inet_netconf_notify_devconf(net, NETCONFA_FORWARDING,
+ NETCONFA_IFINDEX_DEFAULT,
+ net->ipv4.devconf_dflt);
+
+ for_each_netdev(net, dev) {
+ struct in_device *in_dev;
+ if (on)
+ dev_disable_lro(dev);
+ rcu_read_lock();
+ in_dev = __in_dev_get_rcu(dev);
+ if (in_dev) {
+ IN_DEV_CONF_SET(in_dev, FORWARDING, on);
+ inet_netconf_notify_devconf(net, NETCONFA_FORWARDING,
+ dev->ifindex, &in_dev->cnf);
+ }
+ rcu_read_unlock();
+ }
+}
+
+static int devinet_conf_ifindex(struct net *net, struct ipv4_devconf *cnf)
+{
+ if (cnf == net->ipv4.devconf_dflt)
+ return NETCONFA_IFINDEX_DEFAULT;
+ else if (cnf == net->ipv4.devconf_all)
+ return NETCONFA_IFINDEX_ALL;
+ else {
+ struct in_device *idev
+ = container_of(cnf, struct in_device, cnf);
+ return idev->dev->ifindex;
+ }
+}
+
+static int devinet_conf_proc(struct ctl_table *ctl, int write,
+ void __user *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ int old_value = *(int *)ctl->data;
+ int ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
+ int new_value = *(int *)ctl->data;
+
+ if (write) {
+ struct ipv4_devconf *cnf = ctl->extra1;
+ struct net *net = ctl->extra2;
+ int i = (int *)ctl->data - cnf->data;
+ int ifindex;
+
+ set_bit(i, cnf->state);
+
+ if (cnf == net->ipv4.devconf_dflt)
+ devinet_copy_dflt_conf(net, i);
+ if (i == IPV4_DEVCONF_ACCEPT_LOCAL - 1 ||
+ i == IPV4_DEVCONF_ROUTE_LOCALNET - 1)
+ if ((new_value == 0) && (old_value != 0))
+ rt_cache_flush(net);
+
+ if (i == IPV4_DEVCONF_RP_FILTER - 1 &&
+ new_value != old_value) {
+ ifindex = devinet_conf_ifindex(net, cnf);
+ inet_netconf_notify_devconf(net, NETCONFA_RP_FILTER,
+ ifindex, cnf);
+ }
+ if (i == IPV4_DEVCONF_PROXY_ARP - 1 &&
+ new_value != old_value) {
+ ifindex = devinet_conf_ifindex(net, cnf);
+ inet_netconf_notify_devconf(net, NETCONFA_PROXY_NEIGH,
+ ifindex, cnf);
+ }
+ }
+
+ return ret;
+}
+
+static int devinet_sysctl_forward(struct ctl_table *ctl, int write,
+ void __user *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ int *valp = ctl->data;
+ int val = *valp;
+ loff_t pos = *ppos;
+ int ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
+
+ if (write && *valp != val) {
+ struct net *net = ctl->extra2;
+
+ if (valp != &IPV4_DEVCONF_DFLT(net, FORWARDING)) {
+ if (!rtnl_trylock()) {
+ /* Restore the original values before restarting */
+ *valp = val;
+ *ppos = pos;
+ return restart_syscall();
+ }
+ if (valp == &IPV4_DEVCONF_ALL(net, FORWARDING)) {
+ inet_forward_change(net);
+ } else {
+ struct ipv4_devconf *cnf = ctl->extra1;
+ struct in_device *idev =
+ container_of(cnf, struct in_device, cnf);
+ if (*valp)
+ dev_disable_lro(idev->dev);
+ inet_netconf_notify_devconf(net,
+ NETCONFA_FORWARDING,
+ idev->dev->ifindex,
+ cnf);
+ }
+ rtnl_unlock();
+ rt_cache_flush(net);
+ } else
+ inet_netconf_notify_devconf(net, NETCONFA_FORWARDING,
+ NETCONFA_IFINDEX_DEFAULT,
+ net->ipv4.devconf_dflt);
+ }
+
+ return ret;
+}
+
+static int ipv4_doint_and_flush(struct ctl_table *ctl, int write,
+ void __user *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ int *valp = ctl->data;
+ int val = *valp;
+ int ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
+ struct net *net = ctl->extra2;
+
+ if (write && *valp != val)
+ rt_cache_flush(net);
+
+ return ret;
+}
+
+#define DEVINET_SYSCTL_ENTRY(attr, name, mval, proc) \
+ { \
+ .procname = name, \
+ .data = ipv4_devconf.data + \
+ IPV4_DEVCONF_ ## attr - 1, \
+ .maxlen = sizeof(int), \
+ .mode = mval, \
+ .proc_handler = proc, \
+ .extra1 = &ipv4_devconf, \
+ }
+
+#define DEVINET_SYSCTL_RW_ENTRY(attr, name) \
+ DEVINET_SYSCTL_ENTRY(attr, name, 0644, devinet_conf_proc)
+
+#define DEVINET_SYSCTL_RO_ENTRY(attr, name) \
+ DEVINET_SYSCTL_ENTRY(attr, name, 0444, devinet_conf_proc)
+
+#define DEVINET_SYSCTL_COMPLEX_ENTRY(attr, name, proc) \
+ DEVINET_SYSCTL_ENTRY(attr, name, 0644, proc)
+
+#define DEVINET_SYSCTL_FLUSHING_ENTRY(attr, name) \
+ DEVINET_SYSCTL_COMPLEX_ENTRY(attr, name, ipv4_doint_and_flush)
+
+static struct devinet_sysctl_table {
+ struct ctl_table_header *sysctl_header;
+ struct ctl_table devinet_vars[__IPV4_DEVCONF_MAX];
+} devinet_sysctl = {
+ .devinet_vars = {
+ DEVINET_SYSCTL_COMPLEX_ENTRY(FORWARDING, "forwarding",
+ devinet_sysctl_forward),
+ DEVINET_SYSCTL_RO_ENTRY(MC_FORWARDING, "mc_forwarding"),
+
+ DEVINET_SYSCTL_RW_ENTRY(ACCEPT_REDIRECTS, "accept_redirects"),
+ DEVINET_SYSCTL_RW_ENTRY(SECURE_REDIRECTS, "secure_redirects"),
+ DEVINET_SYSCTL_RW_ENTRY(SHARED_MEDIA, "shared_media"),
+ DEVINET_SYSCTL_RW_ENTRY(RP_FILTER, "rp_filter"),
+ DEVINET_SYSCTL_RW_ENTRY(SEND_REDIRECTS, "send_redirects"),
+ DEVINET_SYSCTL_RW_ENTRY(ACCEPT_SOURCE_ROUTE,
+ "accept_source_route"),
+ DEVINET_SYSCTL_RW_ENTRY(ACCEPT_LOCAL, "accept_local"),
+ DEVINET_SYSCTL_RW_ENTRY(SRC_VMARK, "src_valid_mark"),
+ DEVINET_SYSCTL_RW_ENTRY(PROXY_ARP, "proxy_arp"),
+ DEVINET_SYSCTL_RW_ENTRY(MEDIUM_ID, "medium_id"),
+ DEVINET_SYSCTL_RW_ENTRY(BOOTP_RELAY, "bootp_relay"),
+ DEVINET_SYSCTL_RW_ENTRY(LOG_MARTIANS, "log_martians"),
+ DEVINET_SYSCTL_RW_ENTRY(TAG, "tag"),
+ DEVINET_SYSCTL_RW_ENTRY(ARPFILTER, "arp_filter"),
+ DEVINET_SYSCTL_RW_ENTRY(ARP_ANNOUNCE, "arp_announce"),
+ DEVINET_SYSCTL_RW_ENTRY(ARP_IGNORE, "arp_ignore"),
+ DEVINET_SYSCTL_RW_ENTRY(ARP_ACCEPT, "arp_accept"),
+ DEVINET_SYSCTL_RW_ENTRY(ARP_NOTIFY, "arp_notify"),
+ DEVINET_SYSCTL_RW_ENTRY(PROXY_ARP_PVLAN, "proxy_arp_pvlan"),
+ DEVINET_SYSCTL_RW_ENTRY(FORCE_IGMP_VERSION,
+ "force_igmp_version"),
+ DEVINET_SYSCTL_RW_ENTRY(IGMPV2_UNSOLICITED_REPORT_INTERVAL,
+ "igmpv2_unsolicited_report_interval"),
+ DEVINET_SYSCTL_RW_ENTRY(IGMPV3_UNSOLICITED_REPORT_INTERVAL,
+ "igmpv3_unsolicited_report_interval"),
+
+ DEVINET_SYSCTL_FLUSHING_ENTRY(NOXFRM, "disable_xfrm"),
+ DEVINET_SYSCTL_FLUSHING_ENTRY(NOPOLICY, "disable_policy"),
+ DEVINET_SYSCTL_FLUSHING_ENTRY(PROMOTE_SECONDARIES,
+ "promote_secondaries"),
+ DEVINET_SYSCTL_FLUSHING_ENTRY(ROUTE_LOCALNET,
+ "route_localnet"),
+ },
+};
+
+static int __devinet_sysctl_register(struct net *net, char *dev_name,
+ struct ipv4_devconf *p)
+{
+ int i;
+ struct devinet_sysctl_table *t;
+ char path[sizeof("net/ipv4/conf/") + IFNAMSIZ];
+
+ t = kmemdup(&devinet_sysctl, sizeof(*t), GFP_KERNEL);
+ if (!t)
+ goto out;
+
+ for (i = 0; i < ARRAY_SIZE(t->devinet_vars) - 1; i++) {
+ t->devinet_vars[i].data += (char *)p - (char *)&ipv4_devconf;
+ t->devinet_vars[i].extra1 = p;
+ t->devinet_vars[i].extra2 = net;
+ }
+
+ snprintf(path, sizeof(path), "net/ipv4/conf/%s", dev_name);
+
+ t->sysctl_header = register_net_sysctl(net, path, t->devinet_vars);
+ if (!t->sysctl_header)
+ goto free;
+
+ p->sysctl = t;
+ return 0;
+
+free:
+ kfree(t);
+out:
+ return -ENOBUFS;
+}
+
+static void __devinet_sysctl_unregister(struct ipv4_devconf *cnf)
+{
+ struct devinet_sysctl_table *t = cnf->sysctl;
+
+ if (!t)
+ return;
+
+ cnf->sysctl = NULL;
+ unregister_net_sysctl_table(t->sysctl_header);
+ kfree(t);
+}
+
+static int devinet_sysctl_register(struct in_device *idev)
+{
+ int err;
+
+ if (!sysctl_dev_name_is_allowed(idev->dev->name))
+ return -EINVAL;
+
+ err = neigh_sysctl_register(idev->dev, idev->arp_parms, NULL);
+ if (err)
+ return err;
+ err = __devinet_sysctl_register(dev_net(idev->dev), idev->dev->name,
+ &idev->cnf);
+ if (err)
+ neigh_sysctl_unregister(idev->arp_parms);
+ return err;
+}
+
+static void devinet_sysctl_unregister(struct in_device *idev)
+{
+ __devinet_sysctl_unregister(&idev->cnf);
+ neigh_sysctl_unregister(idev->arp_parms);
+}
+
+static struct ctl_table ctl_forward_entry[] = {
+ {
+ .procname = "ip_forward",
+ .data = &ipv4_devconf.data[
+ IPV4_DEVCONF_FORWARDING - 1],
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = devinet_sysctl_forward,
+ .extra1 = &ipv4_devconf,
+ .extra2 = &init_net,
+ },
+ { },
+};
+#endif
+
+static __net_init int devinet_init_net(struct net *net)
+{
+ int err;
+ struct ipv4_devconf *all, *dflt;
+#ifdef CONFIG_SYSCTL
+ struct ctl_table *tbl = ctl_forward_entry;
+ struct ctl_table_header *forw_hdr;
+#endif
+
+ err = -ENOMEM;
+ all = &ipv4_devconf;
+ dflt = &ipv4_devconf_dflt;
+
+ if (!net_eq(net, &init_net)) {
+ all = kmemdup(all, sizeof(ipv4_devconf), GFP_KERNEL);
+ if (!all)
+ goto err_alloc_all;
+
+ dflt = kmemdup(dflt, sizeof(ipv4_devconf_dflt), GFP_KERNEL);
+ if (!dflt)
+ goto err_alloc_dflt;
+
+#ifdef CONFIG_SYSCTL
+ tbl = kmemdup(tbl, sizeof(ctl_forward_entry), GFP_KERNEL);
+ if (!tbl)
+ goto err_alloc_ctl;
+
+ tbl[0].data = &all->data[IPV4_DEVCONF_FORWARDING - 1];
+ tbl[0].extra1 = all;
+ tbl[0].extra2 = net;
+#endif
+ }
+
+#ifdef CONFIG_SYSCTL
+ err = __devinet_sysctl_register(net, "all", all);
+ if (err < 0)
+ goto err_reg_all;
+
+ err = __devinet_sysctl_register(net, "default", dflt);
+ if (err < 0)
+ goto err_reg_dflt;
+
+ err = -ENOMEM;
+ forw_hdr = register_net_sysctl(net, "net/ipv4", tbl);
+ if (!forw_hdr)
+ goto err_reg_ctl;
+ net->ipv4.forw_hdr = forw_hdr;
+#endif
+
+ net->ipv4.devconf_all = all;
+ net->ipv4.devconf_dflt = dflt;
+ return 0;
+
+#ifdef CONFIG_SYSCTL
+err_reg_ctl:
+ __devinet_sysctl_unregister(dflt);
+err_reg_dflt:
+ __devinet_sysctl_unregister(all);
+err_reg_all:
+ if (tbl != ctl_forward_entry)
+ kfree(tbl);
+err_alloc_ctl:
+#endif
+ if (dflt != &ipv4_devconf_dflt)
+ kfree(dflt);
+err_alloc_dflt:
+ if (all != &ipv4_devconf)
+ kfree(all);
+err_alloc_all:
+ return err;
+}
+
+static __net_exit void devinet_exit_net(struct net *net)
+{
+#ifdef CONFIG_SYSCTL
+ struct ctl_table *tbl;
+
+ tbl = net->ipv4.forw_hdr->ctl_table_arg;
+ unregister_net_sysctl_table(net->ipv4.forw_hdr);
+ __devinet_sysctl_unregister(net->ipv4.devconf_dflt);
+ __devinet_sysctl_unregister(net->ipv4.devconf_all);
+ kfree(tbl);
+#endif
+ kfree(net->ipv4.devconf_dflt);
+ kfree(net->ipv4.devconf_all);
+}
+
+static __net_initdata struct pernet_operations devinet_ops = {
+ .init = devinet_init_net,
+ .exit = devinet_exit_net,
+};
+
+static struct rtnl_af_ops inet_af_ops __read_mostly = {
+ .family = AF_INET,
+ .fill_link_af = inet_fill_link_af,
+ .get_link_af_size = inet_get_link_af_size,
+ .validate_link_af = inet_validate_link_af,
+ .set_link_af = inet_set_link_af,
+};
+
+void __init devinet_init(void)
+{
+ int i;
+
+ for (i = 0; i < IN4_ADDR_HSIZE; i++)
+ INIT_HLIST_HEAD(&inet_addr_lst[i]);
+
+ register_pernet_subsys(&devinet_ops);
+
+ register_gifconf(PF_INET, inet_gifconf);
+ register_netdevice_notifier(&ip_netdev_notifier);
+
+ queue_delayed_work(system_power_efficient_wq, &check_lifetime_work, 0);
+
+ rtnl_af_register(&inet_af_ops);
+
+ rtnl_register(PF_INET, RTM_NEWADDR, inet_rtm_newaddr, NULL, NULL);
+ rtnl_register(PF_INET, RTM_DELADDR, inet_rtm_deladdr, NULL, NULL);
+ rtnl_register(PF_INET, RTM_GETADDR, NULL, inet_dump_ifaddr, NULL);
+ rtnl_register(PF_INET, RTM_GETNETCONF, inet_netconf_get_devconf,
+ inet_netconf_dump_devconf, NULL);
+}
+
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
new file mode 100644
index 000000000..30b544f02
--- /dev/null
+++ b/net/ipv4/esp4.c
@@ -0,0 +1,731 @@
+#define pr_fmt(fmt) "IPsec: " fmt
+
+#include <crypto/aead.h>
+#include <crypto/authenc.h>
+#include <linux/err.h>
+#include <linux/module.h>
+#include <net/ip.h>
+#include <net/xfrm.h>
+#include <net/esp.h>
+#include <linux/scatterlist.h>
+#include <linux/kernel.h>
+#include <linux/pfkeyv2.h>
+#include <linux/rtnetlink.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/in6.h>
+#include <net/icmp.h>
+#include <net/protocol.h>
+#include <net/udp.h>
+
+struct esp_skb_cb {
+ struct xfrm_skb_cb xfrm;
+ void *tmp;
+};
+
+#define ESP_SKB_CB(__skb) ((struct esp_skb_cb *)&((__skb)->cb[0]))
+
+static u32 esp4_get_mtu(struct xfrm_state *x, int mtu);
+
+/*
+ * Allocate an AEAD request structure with extra space for SG and IV.
+ *
+ * For alignment considerations the IV is placed at the front, followed
+ * by the request and finally the SG list.
+ *
+ * TODO: Use spare space in skb for this where possible.
+ */
+static void *esp_alloc_tmp(struct crypto_aead *aead, int nfrags, int seqhilen)
+{
+ unsigned int len;
+
+ len = seqhilen;
+
+ len += crypto_aead_ivsize(aead);
+
+ if (len) {
+ len += crypto_aead_alignmask(aead) &
+ ~(crypto_tfm_ctx_alignment() - 1);
+ len = ALIGN(len, crypto_tfm_ctx_alignment());
+ }
+
+ len += sizeof(struct aead_givcrypt_request) + crypto_aead_reqsize(aead);
+ len = ALIGN(len, __alignof__(struct scatterlist));
+
+ len += sizeof(struct scatterlist) * nfrags;
+
+ return kmalloc(len, GFP_ATOMIC);
+}
+
+static inline __be32 *esp_tmp_seqhi(void *tmp)
+{
+ return PTR_ALIGN((__be32 *)tmp, __alignof__(__be32));
+}
+static inline u8 *esp_tmp_iv(struct crypto_aead *aead, void *tmp, int seqhilen)
+{
+ return crypto_aead_ivsize(aead) ?
+ PTR_ALIGN((u8 *)tmp + seqhilen,
+ crypto_aead_alignmask(aead) + 1) : tmp + seqhilen;
+}
+
+static inline struct aead_givcrypt_request *esp_tmp_givreq(
+ struct crypto_aead *aead, u8 *iv)
+{
+ struct aead_givcrypt_request *req;
+
+ req = (void *)PTR_ALIGN(iv + crypto_aead_ivsize(aead),
+ crypto_tfm_ctx_alignment());
+ aead_givcrypt_set_tfm(req, aead);
+ return req;
+}
+
+static inline struct aead_request *esp_tmp_req(struct crypto_aead *aead, u8 *iv)
+{
+ struct aead_request *req;
+
+ req = (void *)PTR_ALIGN(iv + crypto_aead_ivsize(aead),
+ crypto_tfm_ctx_alignment());
+ aead_request_set_tfm(req, aead);
+ return req;
+}
+
+static inline struct scatterlist *esp_req_sg(struct crypto_aead *aead,
+ struct aead_request *req)
+{
+ return (void *)ALIGN((unsigned long)(req + 1) +
+ crypto_aead_reqsize(aead),
+ __alignof__(struct scatterlist));
+}
+
+static inline struct scatterlist *esp_givreq_sg(
+ struct crypto_aead *aead, struct aead_givcrypt_request *req)
+{
+ return (void *)ALIGN((unsigned long)(req + 1) +
+ crypto_aead_reqsize(aead),
+ __alignof__(struct scatterlist));
+}
+
+static void esp_output_done(struct crypto_async_request *base, int err)
+{
+ struct sk_buff *skb = base->data;
+
+ kfree(ESP_SKB_CB(skb)->tmp);
+ xfrm_output_resume(skb, err);
+}
+
+static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
+{
+ int err;
+ struct ip_esp_hdr *esph;
+ struct crypto_aead *aead;
+ struct aead_givcrypt_request *req;
+ struct scatterlist *sg;
+ struct scatterlist *asg;
+ struct sk_buff *trailer;
+ void *tmp;
+ u8 *iv;
+ u8 *tail;
+ int blksize;
+ int clen;
+ int alen;
+ int plen;
+ int tfclen;
+ int nfrags;
+ int assoclen;
+ int sglists;
+ int seqhilen;
+ __be32 *seqhi;
+
+ /* skb is pure payload to encrypt */
+
+ aead = x->data;
+ alen = crypto_aead_authsize(aead);
+
+ tfclen = 0;
+ if (x->tfcpad) {
+ struct xfrm_dst *dst = (struct xfrm_dst *)skb_dst(skb);
+ u32 padto;
+
+ padto = min(x->tfcpad, esp4_get_mtu(x, dst->child_mtu_cached));
+ if (skb->len < padto)
+ tfclen = padto - skb->len;
+ }
+ blksize = ALIGN(crypto_aead_blocksize(aead), 4);
+ clen = ALIGN(skb->len + 2 + tfclen, blksize);
+ plen = clen - skb->len - tfclen;
+
+ err = skb_cow_data(skb, tfclen + plen + alen, &trailer);
+ if (err < 0)
+ goto error;
+ nfrags = err;
+
+ assoclen = sizeof(*esph);
+ sglists = 1;
+ seqhilen = 0;
+
+ if (x->props.flags & XFRM_STATE_ESN) {
+ sglists += 2;
+ seqhilen += sizeof(__be32);
+ assoclen += seqhilen;
+ }
+
+ tmp = esp_alloc_tmp(aead, nfrags + sglists, seqhilen);
+ if (!tmp) {
+ err = -ENOMEM;
+ goto error;
+ }
+
+ seqhi = esp_tmp_seqhi(tmp);
+ iv = esp_tmp_iv(aead, tmp, seqhilen);
+ req = esp_tmp_givreq(aead, iv);
+ asg = esp_givreq_sg(aead, req);
+ sg = asg + sglists;
+
+ /* Fill padding... */
+ tail = skb_tail_pointer(trailer);
+ if (tfclen) {
+ memset(tail, 0, tfclen);
+ tail += tfclen;
+ }
+ do {
+ int i;
+ for (i = 0; i < plen - 2; i++)
+ tail[i] = i + 1;
+ } while (0);
+ tail[plen - 2] = plen - 2;
+ tail[plen - 1] = *skb_mac_header(skb);
+ pskb_put(skb, trailer, clen - skb->len + alen);
+
+ skb_push(skb, -skb_network_offset(skb));
+ esph = ip_esp_hdr(skb);
+ *skb_mac_header(skb) = IPPROTO_ESP;
+
+ /* this is non-NULL only with UDP Encapsulation */
+ if (x->encap) {
+ struct xfrm_encap_tmpl *encap = x->encap;
+ struct udphdr *uh;
+ __be32 *udpdata32;
+ __be16 sport, dport;
+ int encap_type;
+
+ spin_lock_bh(&x->lock);
+ sport = encap->encap_sport;
+ dport = encap->encap_dport;
+ encap_type = encap->encap_type;
+ spin_unlock_bh(&x->lock);
+
+ uh = (struct udphdr *)esph;
+ uh->source = sport;
+ uh->dest = dport;
+ uh->len = htons(skb->len - skb_transport_offset(skb));
+ uh->check = 0;
+
+ switch (encap_type) {
+ default:
+ case UDP_ENCAP_ESPINUDP:
+ esph = (struct ip_esp_hdr *)(uh + 1);
+ break;
+ case UDP_ENCAP_ESPINUDP_NON_IKE:
+ udpdata32 = (__be32 *)(uh + 1);
+ udpdata32[0] = udpdata32[1] = 0;
+ esph = (struct ip_esp_hdr *)(udpdata32 + 2);
+ break;
+ }
+
+ *skb_mac_header(skb) = IPPROTO_UDP;
+ }
+
+ esph->spi = x->id.spi;
+ esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low);
+
+ sg_init_table(sg, nfrags);
+ skb_to_sgvec(skb, sg,
+ esph->enc_data + crypto_aead_ivsize(aead) - skb->data,
+ clen + alen);
+
+ if ((x->props.flags & XFRM_STATE_ESN)) {
+ sg_init_table(asg, 3);
+ sg_set_buf(asg, &esph->spi, sizeof(__be32));
+ *seqhi = htonl(XFRM_SKB_CB(skb)->seq.output.hi);
+ sg_set_buf(asg + 1, seqhi, seqhilen);
+ sg_set_buf(asg + 2, &esph->seq_no, sizeof(__be32));
+ } else
+ sg_init_one(asg, esph, sizeof(*esph));
+
+ aead_givcrypt_set_callback(req, 0, esp_output_done, skb);
+ aead_givcrypt_set_crypt(req, sg, sg, clen, iv);
+ aead_givcrypt_set_assoc(req, asg, assoclen);
+ aead_givcrypt_set_giv(req, esph->enc_data,
+ XFRM_SKB_CB(skb)->seq.output.low +
+ ((u64)XFRM_SKB_CB(skb)->seq.output.hi << 32));
+
+ ESP_SKB_CB(skb)->tmp = tmp;
+ err = crypto_aead_givencrypt(req);
+ if (err == -EINPROGRESS)
+ goto error;
+
+ if (err == -EBUSY)
+ err = NET_XMIT_DROP;
+
+ kfree(tmp);
+
+error:
+ return err;
+}
+
+static int esp_input_done2(struct sk_buff *skb, int err)
+{
+ const struct iphdr *iph;
+ struct xfrm_state *x = xfrm_input_state(skb);
+ struct crypto_aead *aead = x->data;
+ int alen = crypto_aead_authsize(aead);
+ int hlen = sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead);
+ int elen = skb->len - hlen;
+ int ihl;
+ u8 nexthdr[2];
+ int padlen;
+
+ kfree(ESP_SKB_CB(skb)->tmp);
+
+ if (unlikely(err))
+ goto out;
+
+ if (skb_copy_bits(skb, skb->len-alen-2, nexthdr, 2))
+ BUG();
+
+ err = -EINVAL;
+ padlen = nexthdr[0];
+ if (padlen + 2 + alen >= elen)
+ goto out;
+
+ /* ... check padding bits here. Silly. :-) */
+
+ iph = ip_hdr(skb);
+ ihl = iph->ihl * 4;
+
+ if (x->encap) {
+ struct xfrm_encap_tmpl *encap = x->encap;
+ struct udphdr *uh = (void *)(skb_network_header(skb) + ihl);
+
+ /*
+ * 1) if the NAT-T peer's IP or port changed then
+ * advertize the change to the keying daemon.
+ * This is an inbound SA, so just compare
+ * SRC ports.
+ */
+ if (iph->saddr != x->props.saddr.a4 ||
+ uh->source != encap->encap_sport) {
+ xfrm_address_t ipaddr;
+
+ ipaddr.a4 = iph->saddr;
+ km_new_mapping(x, &ipaddr, uh->source);
+
+ /* XXX: perhaps add an extra
+ * policy check here, to see
+ * if we should allow or
+ * reject a packet from a
+ * different source
+ * address/port.
+ */
+ }
+
+ /*
+ * 2) ignore UDP/TCP checksums in case
+ * of NAT-T in Transport Mode, or
+ * perform other post-processing fixes
+ * as per draft-ietf-ipsec-udp-encaps-06,
+ * section 3.1.2
+ */
+ if (x->props.mode == XFRM_MODE_TRANSPORT)
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ }
+
+ pskb_trim(skb, skb->len - alen - padlen - 2);
+ __skb_pull(skb, hlen);
+ if (x->props.mode == XFRM_MODE_TUNNEL)
+ skb_reset_transport_header(skb);
+ else
+ skb_set_transport_header(skb, -ihl);
+
+ err = nexthdr[1];
+
+ /* RFC4303: Drop dummy packets without any error */
+ if (err == IPPROTO_NONE)
+ err = -EINVAL;
+
+out:
+ return err;
+}
+
+static void esp_input_done(struct crypto_async_request *base, int err)
+{
+ struct sk_buff *skb = base->data;
+
+ xfrm_input_resume(skb, esp_input_done2(skb, err));
+}
+
+/*
+ * Note: detecting truncated vs. non-truncated authentication data is very
+ * expensive, so we only support truncated data, which is the recommended
+ * and common case.
+ */
+static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
+{
+ struct ip_esp_hdr *esph;
+ struct crypto_aead *aead = x->data;
+ struct aead_request *req;
+ struct sk_buff *trailer;
+ int elen = skb->len - sizeof(*esph) - crypto_aead_ivsize(aead);
+ int nfrags;
+ int assoclen;
+ int sglists;
+ int seqhilen;
+ __be32 *seqhi;
+ void *tmp;
+ u8 *iv;
+ struct scatterlist *sg;
+ struct scatterlist *asg;
+ int err = -EINVAL;
+
+ if (!pskb_may_pull(skb, sizeof(*esph) + crypto_aead_ivsize(aead)))
+ goto out;
+
+ if (elen <= 0)
+ goto out;
+
+ err = skb_cow_data(skb, 0, &trailer);
+ if (err < 0)
+ goto out;
+
+ nfrags = err;
+
+ assoclen = sizeof(*esph);
+ sglists = 1;
+ seqhilen = 0;
+
+ if (x->props.flags & XFRM_STATE_ESN) {
+ sglists += 2;
+ seqhilen += sizeof(__be32);
+ assoclen += seqhilen;
+ }
+
+ err = -ENOMEM;
+ tmp = esp_alloc_tmp(aead, nfrags + sglists, seqhilen);
+ if (!tmp)
+ goto out;
+
+ ESP_SKB_CB(skb)->tmp = tmp;
+ seqhi = esp_tmp_seqhi(tmp);
+ iv = esp_tmp_iv(aead, tmp, seqhilen);
+ req = esp_tmp_req(aead, iv);
+ asg = esp_req_sg(aead, req);
+ sg = asg + sglists;
+
+ skb->ip_summed = CHECKSUM_NONE;
+
+ esph = (struct ip_esp_hdr *)skb->data;
+
+ /* Get ivec. This can be wrong, check against another impls. */
+ iv = esph->enc_data;
+
+ sg_init_table(sg, nfrags);
+ skb_to_sgvec(skb, sg, sizeof(*esph) + crypto_aead_ivsize(aead), elen);
+
+ if ((x->props.flags & XFRM_STATE_ESN)) {
+ sg_init_table(asg, 3);
+ sg_set_buf(asg, &esph->spi, sizeof(__be32));
+ *seqhi = XFRM_SKB_CB(skb)->seq.input.hi;
+ sg_set_buf(asg + 1, seqhi, seqhilen);
+ sg_set_buf(asg + 2, &esph->seq_no, sizeof(__be32));
+ } else
+ sg_init_one(asg, esph, sizeof(*esph));
+
+ aead_request_set_callback(req, 0, esp_input_done, skb);
+ aead_request_set_crypt(req, sg, sg, elen, iv);
+ aead_request_set_assoc(req, asg, assoclen);
+
+ err = crypto_aead_decrypt(req);
+ if (err == -EINPROGRESS)
+ goto out;
+
+ err = esp_input_done2(skb, err);
+
+out:
+ return err;
+}
+
+static u32 esp4_get_mtu(struct xfrm_state *x, int mtu)
+{
+ struct crypto_aead *aead = x->data;
+ u32 blksize = ALIGN(crypto_aead_blocksize(aead), 4);
+ unsigned int net_adj;
+
+ switch (x->props.mode) {
+ case XFRM_MODE_TRANSPORT:
+ case XFRM_MODE_BEET:
+ net_adj = sizeof(struct iphdr);
+ break;
+ case XFRM_MODE_TUNNEL:
+ net_adj = 0;
+ break;
+ default:
+ BUG();
+ }
+
+ return ((mtu - x->props.header_len - crypto_aead_authsize(aead) -
+ net_adj) & ~(blksize - 1)) + net_adj - 2;
+}
+
+static int esp4_err(struct sk_buff *skb, u32 info)
+{
+ struct net *net = dev_net(skb->dev);
+ const struct iphdr *iph = (const struct iphdr *)skb->data;
+ struct ip_esp_hdr *esph = (struct ip_esp_hdr *)(skb->data+(iph->ihl<<2));
+ struct xfrm_state *x;
+
+ switch (icmp_hdr(skb)->type) {
+ case ICMP_DEST_UNREACH:
+ if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED)
+ return 0;
+ case ICMP_REDIRECT:
+ break;
+ default:
+ return 0;
+ }
+
+ x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr,
+ esph->spi, IPPROTO_ESP, AF_INET);
+ if (!x)
+ return 0;
+
+ if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH)
+ ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_ESP, 0);
+ else
+ ipv4_redirect(skb, net, 0, 0, IPPROTO_ESP, 0);
+ xfrm_state_put(x);
+
+ return 0;
+}
+
+static void esp_destroy(struct xfrm_state *x)
+{
+ struct crypto_aead *aead = x->data;
+
+ if (!aead)
+ return;
+
+ crypto_free_aead(aead);
+}
+
+static int esp_init_aead(struct xfrm_state *x)
+{
+ struct crypto_aead *aead;
+ int err;
+
+ aead = crypto_alloc_aead(x->aead->alg_name, 0, 0);
+ err = PTR_ERR(aead);
+ if (IS_ERR(aead))
+ goto error;
+
+ x->data = aead;
+
+ err = crypto_aead_setkey(aead, x->aead->alg_key,
+ (x->aead->alg_key_len + 7) / 8);
+ if (err)
+ goto error;
+
+ err = crypto_aead_setauthsize(aead, x->aead->alg_icv_len / 8);
+ if (err)
+ goto error;
+
+error:
+ return err;
+}
+
+static int esp_init_authenc(struct xfrm_state *x)
+{
+ struct crypto_aead *aead;
+ struct crypto_authenc_key_param *param;
+ struct rtattr *rta;
+ char *key;
+ char *p;
+ char authenc_name[CRYPTO_MAX_ALG_NAME];
+ unsigned int keylen;
+ int err;
+
+ err = -EINVAL;
+ if (!x->ealg)
+ goto error;
+
+ err = -ENAMETOOLONG;
+
+ if ((x->props.flags & XFRM_STATE_ESN)) {
+ if (snprintf(authenc_name, CRYPTO_MAX_ALG_NAME,
+ "authencesn(%s,%s)",
+ x->aalg ? x->aalg->alg_name : "digest_null",
+ x->ealg->alg_name) >= CRYPTO_MAX_ALG_NAME)
+ goto error;
+ } else {
+ if (snprintf(authenc_name, CRYPTO_MAX_ALG_NAME,
+ "authenc(%s,%s)",
+ x->aalg ? x->aalg->alg_name : "digest_null",
+ x->ealg->alg_name) >= CRYPTO_MAX_ALG_NAME)
+ goto error;
+ }
+
+ aead = crypto_alloc_aead(authenc_name, 0, 0);
+ err = PTR_ERR(aead);
+ if (IS_ERR(aead))
+ goto error;
+
+ x->data = aead;
+
+ keylen = (x->aalg ? (x->aalg->alg_key_len + 7) / 8 : 0) +
+ (x->ealg->alg_key_len + 7) / 8 + RTA_SPACE(sizeof(*param));
+ err = -ENOMEM;
+ key = kmalloc(keylen, GFP_KERNEL);
+ if (!key)
+ goto error;
+
+ p = key;
+ rta = (void *)p;
+ rta->rta_type = CRYPTO_AUTHENC_KEYA_PARAM;
+ rta->rta_len = RTA_LENGTH(sizeof(*param));
+ param = RTA_DATA(rta);
+ p += RTA_SPACE(sizeof(*param));
+
+ if (x->aalg) {
+ struct xfrm_algo_desc *aalg_desc;
+
+ memcpy(p, x->aalg->alg_key, (x->aalg->alg_key_len + 7) / 8);
+ p += (x->aalg->alg_key_len + 7) / 8;
+
+ aalg_desc = xfrm_aalg_get_byname(x->aalg->alg_name, 0);
+ BUG_ON(!aalg_desc);
+
+ err = -EINVAL;
+ if (aalg_desc->uinfo.auth.icv_fullbits / 8 !=
+ crypto_aead_authsize(aead)) {
+ pr_info("ESP: %s digestsize %u != %hu\n",
+ x->aalg->alg_name,
+ crypto_aead_authsize(aead),
+ aalg_desc->uinfo.auth.icv_fullbits / 8);
+ goto free_key;
+ }
+
+ err = crypto_aead_setauthsize(
+ aead, x->aalg->alg_trunc_len / 8);
+ if (err)
+ goto free_key;
+ }
+
+ param->enckeylen = cpu_to_be32((x->ealg->alg_key_len + 7) / 8);
+ memcpy(p, x->ealg->alg_key, (x->ealg->alg_key_len + 7) / 8);
+
+ err = crypto_aead_setkey(aead, key, keylen);
+
+free_key:
+ kfree(key);
+
+error:
+ return err;
+}
+
+static int esp_init_state(struct xfrm_state *x)
+{
+ struct crypto_aead *aead;
+ u32 align;
+ int err;
+
+ x->data = NULL;
+
+ if (x->aead)
+ err = esp_init_aead(x);
+ else
+ err = esp_init_authenc(x);
+
+ if (err)
+ goto error;
+
+ aead = x->data;
+
+ x->props.header_len = sizeof(struct ip_esp_hdr) +
+ crypto_aead_ivsize(aead);
+ if (x->props.mode == XFRM_MODE_TUNNEL)
+ x->props.header_len += sizeof(struct iphdr);
+ else if (x->props.mode == XFRM_MODE_BEET && x->sel.family != AF_INET6)
+ x->props.header_len += IPV4_BEET_PHMAXLEN;
+ if (x->encap) {
+ struct xfrm_encap_tmpl *encap = x->encap;
+
+ switch (encap->encap_type) {
+ default:
+ goto error;
+ case UDP_ENCAP_ESPINUDP:
+ x->props.header_len += sizeof(struct udphdr);
+ break;
+ case UDP_ENCAP_ESPINUDP_NON_IKE:
+ x->props.header_len += sizeof(struct udphdr) + 2 * sizeof(u32);
+ break;
+ }
+ }
+
+ align = ALIGN(crypto_aead_blocksize(aead), 4);
+ x->props.trailer_len = align + 1 + crypto_aead_authsize(aead);
+
+error:
+ return err;
+}
+
+static int esp4_rcv_cb(struct sk_buff *skb, int err)
+{
+ return 0;
+}
+
+static const struct xfrm_type esp_type =
+{
+ .description = "ESP4",
+ .owner = THIS_MODULE,
+ .proto = IPPROTO_ESP,
+ .flags = XFRM_TYPE_REPLAY_PROT,
+ .init_state = esp_init_state,
+ .destructor = esp_destroy,
+ .get_mtu = esp4_get_mtu,
+ .input = esp_input,
+ .output = esp_output
+};
+
+static struct xfrm4_protocol esp4_protocol = {
+ .handler = xfrm4_rcv,
+ .input_handler = xfrm_input,
+ .cb_handler = esp4_rcv_cb,
+ .err_handler = esp4_err,
+ .priority = 0,
+};
+
+static int __init esp4_init(void)
+{
+ if (xfrm_register_type(&esp_type, AF_INET) < 0) {
+ pr_info("%s: can't add xfrm type\n", __func__);
+ return -EAGAIN;
+ }
+ if (xfrm4_protocol_register(&esp4_protocol, IPPROTO_ESP) < 0) {
+ pr_info("%s: can't add protocol\n", __func__);
+ xfrm_unregister_type(&esp_type, AF_INET);
+ return -EAGAIN;
+ }
+ return 0;
+}
+
+static void __exit esp4_fini(void)
+{
+ if (xfrm4_protocol_deregister(&esp4_protocol, IPPROTO_ESP) < 0)
+ pr_info("%s: can't remove protocol\n", __func__);
+ if (xfrm_unregister_type(&esp_type, AF_INET) < 0)
+ pr_info("%s: can't remove xfrm type\n", __func__);
+}
+
+module_init(esp4_init);
+module_exit(esp4_fini);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_XFRM_TYPE(AF_INET, XFRM_PROTO_ESP);
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
new file mode 100644
index 000000000..872494e6e
--- /dev/null
+++ b/net/ipv4/fib_frontend.c
@@ -0,0 +1,1251 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * IPv4 Forwarding Information Base: FIB frontend.
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <asm/uaccess.h>
+#include <linux/bitops.h>
+#include <linux/capability.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/errno.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/inetdevice.h>
+#include <linux/netdevice.h>
+#include <linux/if_addr.h>
+#include <linux/if_arp.h>
+#include <linux/skbuff.h>
+#include <linux/cache.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <net/route.h>
+#include <net/tcp.h>
+#include <net/sock.h>
+#include <net/arp.h>
+#include <net/ip_fib.h>
+#include <net/rtnetlink.h>
+#include <net/xfrm.h>
+
+#ifndef CONFIG_IP_MULTIPLE_TABLES
+
+static int __net_init fib4_rules_init(struct net *net)
+{
+ struct fib_table *local_table, *main_table;
+
+ main_table = fib_trie_table(RT_TABLE_MAIN, NULL);
+ if (!main_table)
+ return -ENOMEM;
+
+ local_table = fib_trie_table(RT_TABLE_LOCAL, main_table);
+ if (!local_table)
+ goto fail;
+
+ hlist_add_head_rcu(&local_table->tb_hlist,
+ &net->ipv4.fib_table_hash[TABLE_LOCAL_INDEX]);
+ hlist_add_head_rcu(&main_table->tb_hlist,
+ &net->ipv4.fib_table_hash[TABLE_MAIN_INDEX]);
+ return 0;
+
+fail:
+ fib_free_table(main_table);
+ return -ENOMEM;
+}
+#else
+
+struct fib_table *fib_new_table(struct net *net, u32 id)
+{
+ struct fib_table *tb, *alias = NULL;
+ unsigned int h;
+
+ if (id == 0)
+ id = RT_TABLE_MAIN;
+ tb = fib_get_table(net, id);
+ if (tb)
+ return tb;
+
+ if (id == RT_TABLE_LOCAL)
+ alias = fib_new_table(net, RT_TABLE_MAIN);
+
+ tb = fib_trie_table(id, alias);
+ if (!tb)
+ return NULL;
+
+ switch (id) {
+ case RT_TABLE_LOCAL:
+ rcu_assign_pointer(net->ipv4.fib_local, tb);
+ break;
+ case RT_TABLE_MAIN:
+ rcu_assign_pointer(net->ipv4.fib_main, tb);
+ break;
+ case RT_TABLE_DEFAULT:
+ rcu_assign_pointer(net->ipv4.fib_default, tb);
+ break;
+ default:
+ break;
+ }
+
+ h = id & (FIB_TABLE_HASHSZ - 1);
+ hlist_add_head_rcu(&tb->tb_hlist, &net->ipv4.fib_table_hash[h]);
+ return tb;
+}
+
+/* caller must hold either rtnl or rcu read lock */
+struct fib_table *fib_get_table(struct net *net, u32 id)
+{
+ struct fib_table *tb;
+ struct hlist_head *head;
+ unsigned int h;
+
+ if (id == 0)
+ id = RT_TABLE_MAIN;
+ h = id & (FIB_TABLE_HASHSZ - 1);
+
+ head = &net->ipv4.fib_table_hash[h];
+ hlist_for_each_entry_rcu(tb, head, tb_hlist) {
+ if (tb->tb_id == id)
+ return tb;
+ }
+ return NULL;
+}
+#endif /* CONFIG_IP_MULTIPLE_TABLES */
+
+static void fib_replace_table(struct net *net, struct fib_table *old,
+ struct fib_table *new)
+{
+#ifdef CONFIG_IP_MULTIPLE_TABLES
+ switch (new->tb_id) {
+ case RT_TABLE_LOCAL:
+ rcu_assign_pointer(net->ipv4.fib_local, new);
+ break;
+ case RT_TABLE_MAIN:
+ rcu_assign_pointer(net->ipv4.fib_main, new);
+ break;
+ case RT_TABLE_DEFAULT:
+ rcu_assign_pointer(net->ipv4.fib_default, new);
+ break;
+ default:
+ break;
+ }
+
+#endif
+ /* replace the old table in the hlist */
+ hlist_replace_rcu(&old->tb_hlist, &new->tb_hlist);
+}
+
+int fib_unmerge(struct net *net)
+{
+ struct fib_table *old, *new;
+
+ /* attempt to fetch local table if it has been allocated */
+ old = fib_get_table(net, RT_TABLE_LOCAL);
+ if (!old)
+ return 0;
+
+ new = fib_trie_unmerge(old);
+ if (!new)
+ return -ENOMEM;
+
+ /* replace merged table with clean table */
+ if (new != old) {
+ fib_replace_table(net, old, new);
+ fib_free_table(old);
+ }
+
+ return 0;
+}
+
+static void fib_flush(struct net *net)
+{
+ int flushed = 0;
+ unsigned int h;
+
+ for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
+ struct hlist_head *head = &net->ipv4.fib_table_hash[h];
+ struct hlist_node *tmp;
+ struct fib_table *tb;
+
+ hlist_for_each_entry_safe(tb, tmp, head, tb_hlist)
+ flushed += fib_table_flush(tb);
+ }
+
+ if (flushed)
+ rt_cache_flush(net);
+}
+
+void fib_flush_external(struct net *net)
+{
+ struct fib_table *tb;
+ struct hlist_head *head;
+ unsigned int h;
+
+ for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
+ head = &net->ipv4.fib_table_hash[h];
+ hlist_for_each_entry(tb, head, tb_hlist)
+ fib_table_flush_external(tb);
+ }
+}
+
+/*
+ * Find address type as if only "dev" was present in the system. If
+ * on_dev is NULL then all interfaces are taken into consideration.
+ */
+static inline unsigned int __inet_dev_addr_type(struct net *net,
+ const struct net_device *dev,
+ __be32 addr)
+{
+ struct flowi4 fl4 = { .daddr = addr };
+ struct fib_result res;
+ unsigned int ret = RTN_BROADCAST;
+ struct fib_table *local_table;
+
+ if (ipv4_is_zeronet(addr) || ipv4_is_lbcast(addr))
+ return RTN_BROADCAST;
+ if (ipv4_is_multicast(addr))
+ return RTN_MULTICAST;
+
+ rcu_read_lock();
+
+ local_table = fib_get_table(net, RT_TABLE_LOCAL);
+ if (local_table) {
+ ret = RTN_UNICAST;
+ if (!fib_table_lookup(local_table, &fl4, &res, FIB_LOOKUP_NOREF)) {
+ if (!dev || dev == res.fi->fib_dev)
+ ret = res.type;
+ }
+ }
+
+ rcu_read_unlock();
+ return ret;
+}
+
+unsigned int inet_addr_type(struct net *net, __be32 addr)
+{
+ return __inet_dev_addr_type(net, NULL, addr);
+}
+EXPORT_SYMBOL(inet_addr_type);
+
+unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev,
+ __be32 addr)
+{
+ return __inet_dev_addr_type(net, dev, addr);
+}
+EXPORT_SYMBOL(inet_dev_addr_type);
+
+__be32 fib_compute_spec_dst(struct sk_buff *skb)
+{
+ struct net_device *dev = skb->dev;
+ struct in_device *in_dev;
+ struct fib_result res;
+ struct rtable *rt;
+ struct flowi4 fl4;
+ struct net *net;
+ int scope;
+
+ rt = skb_rtable(skb);
+ if ((rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST | RTCF_LOCAL)) ==
+ RTCF_LOCAL)
+ return ip_hdr(skb)->daddr;
+
+ in_dev = __in_dev_get_rcu(dev);
+ BUG_ON(!in_dev);
+
+ net = dev_net(dev);
+
+ scope = RT_SCOPE_UNIVERSE;
+ if (!ipv4_is_zeronet(ip_hdr(skb)->saddr)) {
+ fl4.flowi4_oif = 0;
+ fl4.flowi4_iif = LOOPBACK_IFINDEX;
+ fl4.daddr = ip_hdr(skb)->saddr;
+ fl4.saddr = 0;
+ fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos);
+ fl4.flowi4_scope = scope;
+ fl4.flowi4_mark = IN_DEV_SRC_VMARK(in_dev) ? skb->mark : 0;
+ if (!fib_lookup(net, &fl4, &res))
+ return FIB_RES_PREFSRC(net, res);
+ } else {
+ scope = RT_SCOPE_LINK;
+ }
+
+ return inet_select_addr(dev, ip_hdr(skb)->saddr, scope);
+}
+
+/* Given (packet source, input interface) and optional (dst, oif, tos):
+ * - (main) check, that source is valid i.e. not broadcast or our local
+ * address.
+ * - figure out what "logical" interface this packet arrived
+ * and calculate "specific destination" address.
+ * - check, that packet arrived from expected physical interface.
+ * called with rcu_read_lock()
+ */
+static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
+ u8 tos, int oif, struct net_device *dev,
+ int rpf, struct in_device *idev, u32 *itag)
+{
+ int ret, no_addr;
+ struct fib_result res;
+ struct flowi4 fl4;
+ struct net *net;
+ bool dev_match;
+
+ fl4.flowi4_oif = 0;
+ fl4.flowi4_iif = oif ? : LOOPBACK_IFINDEX;
+ fl4.daddr = src;
+ fl4.saddr = dst;
+ fl4.flowi4_tos = tos;
+ fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
+
+ no_addr = idev->ifa_list == NULL;
+
+ fl4.flowi4_mark = IN_DEV_SRC_VMARK(idev) ? skb->mark : 0;
+
+ net = dev_net(dev);
+ if (fib_lookup(net, &fl4, &res))
+ goto last_resort;
+ if (res.type != RTN_UNICAST &&
+ (res.type != RTN_LOCAL || !IN_DEV_ACCEPT_LOCAL(idev)))
+ goto e_inval;
+ if (!rpf && !fib_num_tclassid_users(dev_net(dev)) &&
+ (dev->ifindex != oif || !IN_DEV_TX_REDIRECTS(idev)))
+ goto last_resort;
+ fib_combine_itag(itag, &res);
+ dev_match = false;
+
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+ for (ret = 0; ret < res.fi->fib_nhs; ret++) {
+ struct fib_nh *nh = &res.fi->fib_nh[ret];
+
+ if (nh->nh_dev == dev) {
+ dev_match = true;
+ break;
+ }
+ }
+#else
+ if (FIB_RES_DEV(res) == dev)
+ dev_match = true;
+#endif
+ if (dev_match) {
+ ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST;
+ return ret;
+ }
+ if (no_addr)
+ goto last_resort;
+ if (rpf == 1)
+ goto e_rpf;
+ fl4.flowi4_oif = dev->ifindex;
+
+ ret = 0;
+ if (fib_lookup(net, &fl4, &res) == 0) {
+ if (res.type == RTN_UNICAST)
+ ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST;
+ }
+ return ret;
+
+last_resort:
+ if (rpf)
+ goto e_rpf;
+ *itag = 0;
+ return 0;
+
+e_inval:
+ return -EINVAL;
+e_rpf:
+ return -EXDEV;
+}
+
+/* Ignore rp_filter for packets protected by IPsec. */
+int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
+ u8 tos, int oif, struct net_device *dev,
+ struct in_device *idev, u32 *itag)
+{
+ int r = secpath_exists(skb) ? 0 : IN_DEV_RPFILTER(idev);
+
+ if (!r && !fib_num_tclassid_users(dev_net(dev)) &&
+ IN_DEV_ACCEPT_LOCAL(idev) &&
+ (dev->ifindex != oif || !IN_DEV_TX_REDIRECTS(idev))) {
+ *itag = 0;
+ return 0;
+ }
+ return __fib_validate_source(skb, src, dst, tos, oif, dev, r, idev, itag);
+}
+
+static inline __be32 sk_extract_addr(struct sockaddr *addr)
+{
+ return ((struct sockaddr_in *) addr)->sin_addr.s_addr;
+}
+
+static int put_rtax(struct nlattr *mx, int len, int type, u32 value)
+{
+ struct nlattr *nla;
+
+ nla = (struct nlattr *) ((char *) mx + len);
+ nla->nla_type = type;
+ nla->nla_len = nla_attr_size(4);
+ *(u32 *) nla_data(nla) = value;
+
+ return len + nla_total_size(4);
+}
+
+static int rtentry_to_fib_config(struct net *net, int cmd, struct rtentry *rt,
+ struct fib_config *cfg)
+{
+ __be32 addr;
+ int plen;
+
+ memset(cfg, 0, sizeof(*cfg));
+ cfg->fc_nlinfo.nl_net = net;
+
+ if (rt->rt_dst.sa_family != AF_INET)
+ return -EAFNOSUPPORT;
+
+ /*
+ * Check mask for validity:
+ * a) it must be contiguous.
+ * b) destination must have all host bits clear.
+ * c) if application forgot to set correct family (AF_INET),
+ * reject request unless it is absolutely clear i.e.
+ * both family and mask are zero.
+ */
+ plen = 32;
+ addr = sk_extract_addr(&rt->rt_dst);
+ if (!(rt->rt_flags & RTF_HOST)) {
+ __be32 mask = sk_extract_addr(&rt->rt_genmask);
+
+ if (rt->rt_genmask.sa_family != AF_INET) {
+ if (mask || rt->rt_genmask.sa_family)
+ return -EAFNOSUPPORT;
+ }
+
+ if (bad_mask(mask, addr))
+ return -EINVAL;
+
+ plen = inet_mask_len(mask);
+ }
+
+ cfg->fc_dst_len = plen;
+ cfg->fc_dst = addr;
+
+ if (cmd != SIOCDELRT) {
+ cfg->fc_nlflags = NLM_F_CREATE;
+ cfg->fc_protocol = RTPROT_BOOT;
+ }
+
+ if (rt->rt_metric)
+ cfg->fc_priority = rt->rt_metric - 1;
+
+ if (rt->rt_flags & RTF_REJECT) {
+ cfg->fc_scope = RT_SCOPE_HOST;
+ cfg->fc_type = RTN_UNREACHABLE;
+ return 0;
+ }
+
+ cfg->fc_scope = RT_SCOPE_NOWHERE;
+ cfg->fc_type = RTN_UNICAST;
+
+ if (rt->rt_dev) {
+ char *colon;
+ struct net_device *dev;
+ char devname[IFNAMSIZ];
+
+ if (copy_from_user(devname, rt->rt_dev, IFNAMSIZ-1))
+ return -EFAULT;
+
+ devname[IFNAMSIZ-1] = 0;
+ colon = strchr(devname, ':');
+ if (colon)
+ *colon = 0;
+ dev = __dev_get_by_name(net, devname);
+ if (!dev)
+ return -ENODEV;
+ cfg->fc_oif = dev->ifindex;
+ if (colon) {
+ struct in_ifaddr *ifa;
+ struct in_device *in_dev = __in_dev_get_rtnl(dev);
+ if (!in_dev)
+ return -ENODEV;
+ *colon = ':';
+ for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next)
+ if (strcmp(ifa->ifa_label, devname) == 0)
+ break;
+ if (!ifa)
+ return -ENODEV;
+ cfg->fc_prefsrc = ifa->ifa_local;
+ }
+ }
+
+ addr = sk_extract_addr(&rt->rt_gateway);
+ if (rt->rt_gateway.sa_family == AF_INET && addr) {
+ cfg->fc_gw = addr;
+ if (rt->rt_flags & RTF_GATEWAY &&
+ inet_addr_type(net, addr) == RTN_UNICAST)
+ cfg->fc_scope = RT_SCOPE_UNIVERSE;
+ }
+
+ if (cmd == SIOCDELRT)
+ return 0;
+
+ if (rt->rt_flags & RTF_GATEWAY && !cfg->fc_gw)
+ return -EINVAL;
+
+ if (cfg->fc_scope == RT_SCOPE_NOWHERE)
+ cfg->fc_scope = RT_SCOPE_LINK;
+
+ if (rt->rt_flags & (RTF_MTU | RTF_WINDOW | RTF_IRTT)) {
+ struct nlattr *mx;
+ int len = 0;
+
+ mx = kzalloc(3 * nla_total_size(4), GFP_KERNEL);
+ if (!mx)
+ return -ENOMEM;
+
+ if (rt->rt_flags & RTF_MTU)
+ len = put_rtax(mx, len, RTAX_ADVMSS, rt->rt_mtu - 40);
+
+ if (rt->rt_flags & RTF_WINDOW)
+ len = put_rtax(mx, len, RTAX_WINDOW, rt->rt_window);
+
+ if (rt->rt_flags & RTF_IRTT)
+ len = put_rtax(mx, len, RTAX_RTT, rt->rt_irtt << 3);
+
+ cfg->fc_mx = mx;
+ cfg->fc_mx_len = len;
+ }
+
+ return 0;
+}
+
+/*
+ * Handle IP routing ioctl calls.
+ * These are used to manipulate the routing tables
+ */
+int ip_rt_ioctl(struct net *net, unsigned int cmd, void __user *arg)
+{
+ struct fib_config cfg;
+ struct rtentry rt;
+ int err;
+
+ switch (cmd) {
+ case SIOCADDRT: /* Add a route */
+ case SIOCDELRT: /* Delete a route */
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+
+ if (copy_from_user(&rt, arg, sizeof(rt)))
+ return -EFAULT;
+
+ rtnl_lock();
+ err = rtentry_to_fib_config(net, cmd, &rt, &cfg);
+ if (err == 0) {
+ struct fib_table *tb;
+
+ if (cmd == SIOCDELRT) {
+ tb = fib_get_table(net, cfg.fc_table);
+ if (tb)
+ err = fib_table_delete(tb, &cfg);
+ else
+ err = -ESRCH;
+ } else {
+ tb = fib_new_table(net, cfg.fc_table);
+ if (tb)
+ err = fib_table_insert(tb, &cfg);
+ else
+ err = -ENOBUFS;
+ }
+
+ /* allocated by rtentry_to_fib_config() */
+ kfree(cfg.fc_mx);
+ }
+ rtnl_unlock();
+ return err;
+ }
+ return -EINVAL;
+}
+
+const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = {
+ [RTA_DST] = { .type = NLA_U32 },
+ [RTA_SRC] = { .type = NLA_U32 },
+ [RTA_IIF] = { .type = NLA_U32 },
+ [RTA_OIF] = { .type = NLA_U32 },
+ [RTA_GATEWAY] = { .type = NLA_U32 },
+ [RTA_PRIORITY] = { .type = NLA_U32 },
+ [RTA_PREFSRC] = { .type = NLA_U32 },
+ [RTA_METRICS] = { .type = NLA_NESTED },
+ [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) },
+ [RTA_FLOW] = { .type = NLA_U32 },
+};
+
+static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
+ struct nlmsghdr *nlh, struct fib_config *cfg)
+{
+ struct nlattr *attr;
+ int err, remaining;
+ struct rtmsg *rtm;
+
+ err = nlmsg_validate(nlh, sizeof(*rtm), RTA_MAX, rtm_ipv4_policy);
+ if (err < 0)
+ goto errout;
+
+ memset(cfg, 0, sizeof(*cfg));
+
+ rtm = nlmsg_data(nlh);
+ cfg->fc_dst_len = rtm->rtm_dst_len;
+ cfg->fc_tos = rtm->rtm_tos;
+ cfg->fc_table = rtm->rtm_table;
+ cfg->fc_protocol = rtm->rtm_protocol;
+ cfg->fc_scope = rtm->rtm_scope;
+ cfg->fc_type = rtm->rtm_type;
+ cfg->fc_flags = rtm->rtm_flags;
+ cfg->fc_nlflags = nlh->nlmsg_flags;
+
+ cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
+ cfg->fc_nlinfo.nlh = nlh;
+ cfg->fc_nlinfo.nl_net = net;
+
+ if (cfg->fc_type > RTN_MAX) {
+ err = -EINVAL;
+ goto errout;
+ }
+
+ nlmsg_for_each_attr(attr, nlh, sizeof(struct rtmsg), remaining) {
+ switch (nla_type(attr)) {
+ case RTA_DST:
+ cfg->fc_dst = nla_get_be32(attr);
+ break;
+ case RTA_OIF:
+ cfg->fc_oif = nla_get_u32(attr);
+ break;
+ case RTA_GATEWAY:
+ cfg->fc_gw = nla_get_be32(attr);
+ break;
+ case RTA_PRIORITY:
+ cfg->fc_priority = nla_get_u32(attr);
+ break;
+ case RTA_PREFSRC:
+ cfg->fc_prefsrc = nla_get_be32(attr);
+ break;
+ case RTA_METRICS:
+ cfg->fc_mx = nla_data(attr);
+ cfg->fc_mx_len = nla_len(attr);
+ break;
+ case RTA_MULTIPATH:
+ cfg->fc_mp = nla_data(attr);
+ cfg->fc_mp_len = nla_len(attr);
+ break;
+ case RTA_FLOW:
+ cfg->fc_flow = nla_get_u32(attr);
+ break;
+ case RTA_TABLE:
+ cfg->fc_table = nla_get_u32(attr);
+ break;
+ }
+ }
+
+ return 0;
+errout:
+ return err;
+}
+
+static int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+ struct net *net = sock_net(skb->sk);
+ struct fib_config cfg;
+ struct fib_table *tb;
+ int err;
+
+ err = rtm_to_fib_config(net, skb, nlh, &cfg);
+ if (err < 0)
+ goto errout;
+
+ tb = fib_get_table(net, cfg.fc_table);
+ if (!tb) {
+ err = -ESRCH;
+ goto errout;
+ }
+
+ err = fib_table_delete(tb, &cfg);
+errout:
+ return err;
+}
+
+static int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+ struct net *net = sock_net(skb->sk);
+ struct fib_config cfg;
+ struct fib_table *tb;
+ int err;
+
+ err = rtm_to_fib_config(net, skb, nlh, &cfg);
+ if (err < 0)
+ goto errout;
+
+ tb = fib_new_table(net, cfg.fc_table);
+ if (!tb) {
+ err = -ENOBUFS;
+ goto errout;
+ }
+
+ err = fib_table_insert(tb, &cfg);
+errout:
+ return err;
+}
+
+static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct net *net = sock_net(skb->sk);
+ unsigned int h, s_h;
+ unsigned int e = 0, s_e;
+ struct fib_table *tb;
+ struct hlist_head *head;
+ int dumped = 0;
+
+ if (nlmsg_len(cb->nlh) >= sizeof(struct rtmsg) &&
+ ((struct rtmsg *) nlmsg_data(cb->nlh))->rtm_flags & RTM_F_CLONED)
+ return skb->len;
+
+ s_h = cb->args[0];
+ s_e = cb->args[1];
+
+ rcu_read_lock();
+
+ for (h = s_h; h < FIB_TABLE_HASHSZ; h++, s_e = 0) {
+ e = 0;
+ head = &net->ipv4.fib_table_hash[h];
+ hlist_for_each_entry_rcu(tb, head, tb_hlist) {
+ if (e < s_e)
+ goto next;
+ if (dumped)
+ memset(&cb->args[2], 0, sizeof(cb->args) -
+ 2 * sizeof(cb->args[0]));
+ if (fib_table_dump(tb, skb, cb) < 0)
+ goto out;
+ dumped = 1;
+next:
+ e++;
+ }
+ }
+out:
+ rcu_read_unlock();
+
+ cb->args[1] = e;
+ cb->args[0] = h;
+
+ return skb->len;
+}
+
+/* Prepare and feed intra-kernel routing request.
+ * Really, it should be netlink message, but :-( netlink
+ * can be not configured, so that we feed it directly
+ * to fib engine. It is legal, because all events occur
+ * only when netlink is already locked.
+ */
+static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifaddr *ifa)
+{
+ struct net *net = dev_net(ifa->ifa_dev->dev);
+ struct fib_table *tb;
+ struct fib_config cfg = {
+ .fc_protocol = RTPROT_KERNEL,
+ .fc_type = type,
+ .fc_dst = dst,
+ .fc_dst_len = dst_len,
+ .fc_prefsrc = ifa->ifa_local,
+ .fc_oif = ifa->ifa_dev->dev->ifindex,
+ .fc_nlflags = NLM_F_CREATE | NLM_F_APPEND,
+ .fc_nlinfo = {
+ .nl_net = net,
+ },
+ };
+
+ if (type == RTN_UNICAST)
+ tb = fib_new_table(net, RT_TABLE_MAIN);
+ else
+ tb = fib_new_table(net, RT_TABLE_LOCAL);
+
+ if (!tb)
+ return;
+
+ cfg.fc_table = tb->tb_id;
+
+ if (type != RTN_LOCAL)
+ cfg.fc_scope = RT_SCOPE_LINK;
+ else
+ cfg.fc_scope = RT_SCOPE_HOST;
+
+ if (cmd == RTM_NEWROUTE)
+ fib_table_insert(tb, &cfg);
+ else
+ fib_table_delete(tb, &cfg);
+}
+
+void fib_add_ifaddr(struct in_ifaddr *ifa)
+{
+ struct in_device *in_dev = ifa->ifa_dev;
+ struct net_device *dev = in_dev->dev;
+ struct in_ifaddr *prim = ifa;
+ __be32 mask = ifa->ifa_mask;
+ __be32 addr = ifa->ifa_local;
+ __be32 prefix = ifa->ifa_address & mask;
+
+ if (ifa->ifa_flags & IFA_F_SECONDARY) {
+ prim = inet_ifa_byprefix(in_dev, prefix, mask);
+ if (!prim) {
+ pr_warn("%s: bug: prim == NULL\n", __func__);
+ return;
+ }
+ }
+
+ fib_magic(RTM_NEWROUTE, RTN_LOCAL, addr, 32, prim);
+
+ if (!(dev->flags & IFF_UP))
+ return;
+
+ /* Add broadcast address, if it is explicitly assigned. */
+ if (ifa->ifa_broadcast && ifa->ifa_broadcast != htonl(0xFFFFFFFF))
+ fib_magic(RTM_NEWROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim);
+
+ if (!ipv4_is_zeronet(prefix) && !(ifa->ifa_flags & IFA_F_SECONDARY) &&
+ (prefix != addr || ifa->ifa_prefixlen < 32)) {
+ fib_magic(RTM_NEWROUTE,
+ dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
+ prefix, ifa->ifa_prefixlen, prim);
+
+ /* Add network specific broadcasts, when it takes a sense */
+ if (ifa->ifa_prefixlen < 31) {
+ fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix, 32, prim);
+ fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix | ~mask,
+ 32, prim);
+ }
+ }
+}
+
+/* Delete primary or secondary address.
+ * Optionally, on secondary address promotion consider the addresses
+ * from subnet iprim as deleted, even if they are in device list.
+ * In this case the secondary ifa can be in device list.
+ */
+void fib_del_ifaddr(struct in_ifaddr *ifa, struct in_ifaddr *iprim)
+{
+ struct in_device *in_dev = ifa->ifa_dev;
+ struct net_device *dev = in_dev->dev;
+ struct in_ifaddr *ifa1;
+ struct in_ifaddr *prim = ifa, *prim1 = NULL;
+ __be32 brd = ifa->ifa_address | ~ifa->ifa_mask;
+ __be32 any = ifa->ifa_address & ifa->ifa_mask;
+#define LOCAL_OK 1
+#define BRD_OK 2
+#define BRD0_OK 4
+#define BRD1_OK 8
+ unsigned int ok = 0;
+ int subnet = 0; /* Primary network */
+ int gone = 1; /* Address is missing */
+ int same_prefsrc = 0; /* Another primary with same IP */
+
+ if (ifa->ifa_flags & IFA_F_SECONDARY) {
+ prim = inet_ifa_byprefix(in_dev, any, ifa->ifa_mask);
+ if (!prim) {
+ pr_warn("%s: bug: prim == NULL\n", __func__);
+ return;
+ }
+ if (iprim && iprim != prim) {
+ pr_warn("%s: bug: iprim != prim\n", __func__);
+ return;
+ }
+ } else if (!ipv4_is_zeronet(any) &&
+ (any != ifa->ifa_local || ifa->ifa_prefixlen < 32)) {
+ fib_magic(RTM_DELROUTE,
+ dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
+ any, ifa->ifa_prefixlen, prim);
+ subnet = 1;
+ }
+
+ /* Deletion is more complicated than add.
+ * We should take care of not to delete too much :-)
+ *
+ * Scan address list to be sure that addresses are really gone.
+ */
+
+ for (ifa1 = in_dev->ifa_list; ifa1; ifa1 = ifa1->ifa_next) {
+ if (ifa1 == ifa) {
+ /* promotion, keep the IP */
+ gone = 0;
+ continue;
+ }
+ /* Ignore IFAs from our subnet */
+ if (iprim && ifa1->ifa_mask == iprim->ifa_mask &&
+ inet_ifa_match(ifa1->ifa_address, iprim))
+ continue;
+
+ /* Ignore ifa1 if it uses different primary IP (prefsrc) */
+ if (ifa1->ifa_flags & IFA_F_SECONDARY) {
+ /* Another address from our subnet? */
+ if (ifa1->ifa_mask == prim->ifa_mask &&
+ inet_ifa_match(ifa1->ifa_address, prim))
+ prim1 = prim;
+ else {
+ /* We reached the secondaries, so
+ * same_prefsrc should be determined.
+ */
+ if (!same_prefsrc)
+ continue;
+ /* Search new prim1 if ifa1 is not
+ * using the current prim1
+ */
+ if (!prim1 ||
+ ifa1->ifa_mask != prim1->ifa_mask ||
+ !inet_ifa_match(ifa1->ifa_address, prim1))
+ prim1 = inet_ifa_byprefix(in_dev,
+ ifa1->ifa_address,
+ ifa1->ifa_mask);
+ if (!prim1)
+ continue;
+ if (prim1->ifa_local != prim->ifa_local)
+ continue;
+ }
+ } else {
+ if (prim->ifa_local != ifa1->ifa_local)
+ continue;
+ prim1 = ifa1;
+ if (prim != prim1)
+ same_prefsrc = 1;
+ }
+ if (ifa->ifa_local == ifa1->ifa_local)
+ ok |= LOCAL_OK;
+ if (ifa->ifa_broadcast == ifa1->ifa_broadcast)
+ ok |= BRD_OK;
+ if (brd == ifa1->ifa_broadcast)
+ ok |= BRD1_OK;
+ if (any == ifa1->ifa_broadcast)
+ ok |= BRD0_OK;
+ /* primary has network specific broadcasts */
+ if (prim1 == ifa1 && ifa1->ifa_prefixlen < 31) {
+ __be32 brd1 = ifa1->ifa_address | ~ifa1->ifa_mask;
+ __be32 any1 = ifa1->ifa_address & ifa1->ifa_mask;
+
+ if (!ipv4_is_zeronet(any1)) {
+ if (ifa->ifa_broadcast == brd1 ||
+ ifa->ifa_broadcast == any1)
+ ok |= BRD_OK;
+ if (brd == brd1 || brd == any1)
+ ok |= BRD1_OK;
+ if (any == brd1 || any == any1)
+ ok |= BRD0_OK;
+ }
+ }
+ }
+
+ if (!(ok & BRD_OK))
+ fib_magic(RTM_DELROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim);
+ if (subnet && ifa->ifa_prefixlen < 31) {
+ if (!(ok & BRD1_OK))
+ fib_magic(RTM_DELROUTE, RTN_BROADCAST, brd, 32, prim);
+ if (!(ok & BRD0_OK))
+ fib_magic(RTM_DELROUTE, RTN_BROADCAST, any, 32, prim);
+ }
+ if (!(ok & LOCAL_OK)) {
+ fib_magic(RTM_DELROUTE, RTN_LOCAL, ifa->ifa_local, 32, prim);
+
+ /* Check, that this local address finally disappeared. */
+ if (gone &&
+ inet_addr_type(dev_net(dev), ifa->ifa_local) != RTN_LOCAL) {
+ /* And the last, but not the least thing.
+ * We must flush stray FIB entries.
+ *
+ * First of all, we scan fib_info list searching
+ * for stray nexthop entries, then ignite fib_flush.
+ */
+ if (fib_sync_down_addr(dev_net(dev), ifa->ifa_local))
+ fib_flush(dev_net(dev));
+ }
+ }
+#undef LOCAL_OK
+#undef BRD_OK
+#undef BRD0_OK
+#undef BRD1_OK
+}
+
+static void nl_fib_lookup(struct net *net, struct fib_result_nl *frn)
+{
+
+ struct fib_result res;
+ struct flowi4 fl4 = {
+ .flowi4_mark = frn->fl_mark,
+ .daddr = frn->fl_addr,
+ .flowi4_tos = frn->fl_tos,
+ .flowi4_scope = frn->fl_scope,
+ };
+ struct fib_table *tb;
+
+ rcu_read_lock();
+
+ tb = fib_get_table(net, frn->tb_id_in);
+
+ frn->err = -ENOENT;
+ if (tb) {
+ local_bh_disable();
+
+ frn->tb_id = tb->tb_id;
+ frn->err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF);
+
+ if (!frn->err) {
+ frn->prefixlen = res.prefixlen;
+ frn->nh_sel = res.nh_sel;
+ frn->type = res.type;
+ frn->scope = res.scope;
+ }
+ local_bh_enable();
+ }
+
+ rcu_read_unlock();
+}
+
+static void nl_fib_input(struct sk_buff *skb)
+{
+ struct net *net;
+ struct fib_result_nl *frn;
+ struct nlmsghdr *nlh;
+ u32 portid;
+
+ net = sock_net(skb->sk);
+ nlh = nlmsg_hdr(skb);
+ if (skb->len < NLMSG_HDRLEN || skb->len < nlh->nlmsg_len ||
+ nlmsg_len(nlh) < sizeof(*frn))
+ return;
+
+ skb = netlink_skb_clone(skb, GFP_KERNEL);
+ if (!skb)
+ return;
+ nlh = nlmsg_hdr(skb);
+
+ frn = (struct fib_result_nl *) nlmsg_data(nlh);
+ nl_fib_lookup(net, frn);
+
+ portid = NETLINK_CB(skb).portid; /* netlink portid */
+ NETLINK_CB(skb).portid = 0; /* from kernel */
+ NETLINK_CB(skb).dst_group = 0; /* unicast */
+ netlink_unicast(net->ipv4.fibnl, skb, portid, MSG_DONTWAIT);
+}
+
+static int __net_init nl_fib_lookup_init(struct net *net)
+{
+ struct sock *sk;
+ struct netlink_kernel_cfg cfg = {
+ .input = nl_fib_input,
+ };
+
+ sk = netlink_kernel_create(net, NETLINK_FIB_LOOKUP, &cfg);
+ if (!sk)
+ return -EAFNOSUPPORT;
+ net->ipv4.fibnl = sk;
+ return 0;
+}
+
+static void nl_fib_lookup_exit(struct net *net)
+{
+ netlink_kernel_release(net->ipv4.fibnl);
+ net->ipv4.fibnl = NULL;
+}
+
+static void fib_disable_ip(struct net_device *dev, int force)
+{
+ if (fib_sync_down_dev(dev, force))
+ fib_flush(dev_net(dev));
+ rt_cache_flush(dev_net(dev));
+ arp_ifdown(dev);
+}
+
+static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, void *ptr)
+{
+ struct in_ifaddr *ifa = (struct in_ifaddr *)ptr;
+ struct net_device *dev = ifa->ifa_dev->dev;
+ struct net *net = dev_net(dev);
+
+ switch (event) {
+ case NETDEV_UP:
+ fib_add_ifaddr(ifa);
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+ fib_sync_up(dev);
+#endif
+ atomic_inc(&net->ipv4.dev_addr_genid);
+ rt_cache_flush(dev_net(dev));
+ break;
+ case NETDEV_DOWN:
+ fib_del_ifaddr(ifa, NULL);
+ atomic_inc(&net->ipv4.dev_addr_genid);
+ if (!ifa->ifa_dev->ifa_list) {
+ /* Last address was deleted from this interface.
+ * Disable IP.
+ */
+ fib_disable_ip(dev, 1);
+ } else {
+ rt_cache_flush(dev_net(dev));
+ }
+ break;
+ }
+ return NOTIFY_DONE;
+}
+
+static int fib_netdev_event(struct notifier_block *this, unsigned long event, void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct in_device *in_dev;
+ struct net *net = dev_net(dev);
+
+ if (event == NETDEV_UNREGISTER) {
+ fib_disable_ip(dev, 2);
+ rt_flush_dev(dev);
+ return NOTIFY_DONE;
+ }
+
+ in_dev = __in_dev_get_rtnl(dev);
+ if (!in_dev)
+ return NOTIFY_DONE;
+
+ switch (event) {
+ case NETDEV_UP:
+ for_ifa(in_dev) {
+ fib_add_ifaddr(ifa);
+ } endfor_ifa(in_dev);
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+ fib_sync_up(dev);
+#endif
+ atomic_inc(&net->ipv4.dev_addr_genid);
+ rt_cache_flush(net);
+ break;
+ case NETDEV_DOWN:
+ fib_disable_ip(dev, 0);
+ break;
+ case NETDEV_CHANGEMTU:
+ case NETDEV_CHANGE:
+ rt_cache_flush(net);
+ break;
+ }
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block fib_inetaddr_notifier = {
+ .notifier_call = fib_inetaddr_event,
+};
+
+static struct notifier_block fib_netdev_notifier = {
+ .notifier_call = fib_netdev_event,
+};
+
+static int __net_init ip_fib_net_init(struct net *net)
+{
+ int err;
+ size_t size = sizeof(struct hlist_head) * FIB_TABLE_HASHSZ;
+
+ /* Avoid false sharing : Use at least a full cache line */
+ size = max_t(size_t, size, L1_CACHE_BYTES);
+
+ net->ipv4.fib_table_hash = kzalloc(size, GFP_KERNEL);
+ if (!net->ipv4.fib_table_hash)
+ return -ENOMEM;
+
+ err = fib4_rules_init(net);
+ if (err < 0)
+ goto fail;
+ return 0;
+
+fail:
+ kfree(net->ipv4.fib_table_hash);
+ return err;
+}
+
+static void ip_fib_net_exit(struct net *net)
+{
+ unsigned int i;
+
+ rtnl_lock();
+#ifdef CONFIG_IP_MULTIPLE_TABLES
+ RCU_INIT_POINTER(net->ipv4.fib_local, NULL);
+ RCU_INIT_POINTER(net->ipv4.fib_main, NULL);
+ RCU_INIT_POINTER(net->ipv4.fib_default, NULL);
+#endif
+ for (i = 0; i < FIB_TABLE_HASHSZ; i++) {
+ struct hlist_head *head = &net->ipv4.fib_table_hash[i];
+ struct hlist_node *tmp;
+ struct fib_table *tb;
+
+ hlist_for_each_entry_safe(tb, tmp, head, tb_hlist) {
+ hlist_del(&tb->tb_hlist);
+ fib_table_flush(tb);
+ fib_free_table(tb);
+ }
+ }
+
+#ifdef CONFIG_IP_MULTIPLE_TABLES
+ fib4_rules_exit(net);
+#endif
+ rtnl_unlock();
+ kfree(net->ipv4.fib_table_hash);
+}
+
+static int __net_init fib_net_init(struct net *net)
+{
+ int error;
+
+#ifdef CONFIG_IP_ROUTE_CLASSID
+ net->ipv4.fib_num_tclassid_users = 0;
+#endif
+ error = ip_fib_net_init(net);
+ if (error < 0)
+ goto out;
+ error = nl_fib_lookup_init(net);
+ if (error < 0)
+ goto out_nlfl;
+ error = fib_proc_init(net);
+ if (error < 0)
+ goto out_proc;
+out:
+ return error;
+
+out_proc:
+ nl_fib_lookup_exit(net);
+out_nlfl:
+ ip_fib_net_exit(net);
+ goto out;
+}
+
+static void __net_exit fib_net_exit(struct net *net)
+{
+ fib_proc_exit(net);
+ nl_fib_lookup_exit(net);
+ ip_fib_net_exit(net);
+}
+
+static struct pernet_operations fib_net_ops = {
+ .init = fib_net_init,
+ .exit = fib_net_exit,
+};
+
+void __init ip_fib_init(void)
+{
+ rtnl_register(PF_INET, RTM_NEWROUTE, inet_rtm_newroute, NULL, NULL);
+ rtnl_register(PF_INET, RTM_DELROUTE, inet_rtm_delroute, NULL, NULL);
+ rtnl_register(PF_INET, RTM_GETROUTE, NULL, inet_dump_fib, NULL);
+
+ register_pernet_subsys(&fib_net_ops);
+ register_netdevice_notifier(&fib_netdev_notifier);
+ register_inetaddr_notifier(&fib_inetaddr_notifier);
+
+ fib_trie_init();
+}
diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h
new file mode 100644
index 000000000..c6211ed60
--- /dev/null
+++ b/net/ipv4/fib_lookup.h
@@ -0,0 +1,52 @@
+#ifndef _FIB_LOOKUP_H
+#define _FIB_LOOKUP_H
+
+#include <linux/types.h>
+#include <linux/list.h>
+#include <net/ip_fib.h>
+
+struct fib_alias {
+ struct hlist_node fa_list;
+ struct fib_info *fa_info;
+ u8 fa_tos;
+ u8 fa_type;
+ u8 fa_state;
+ u8 fa_slen;
+ u32 tb_id;
+ struct rcu_head rcu;
+};
+
+#define FA_S_ACCESSED 0x01
+
+/* Dont write on fa_state unless needed, to keep it shared on all cpus */
+static inline void fib_alias_accessed(struct fib_alias *fa)
+{
+ if (!(fa->fa_state & FA_S_ACCESSED))
+ fa->fa_state |= FA_S_ACCESSED;
+}
+
+/* Exported by fib_semantics.c */
+void fib_release_info(struct fib_info *);
+struct fib_info *fib_create_info(struct fib_config *cfg);
+int fib_nh_match(struct fib_config *cfg, struct fib_info *fi);
+int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, u32 tb_id,
+ u8 type, __be32 dst, int dst_len, u8 tos, struct fib_info *fi,
+ unsigned int);
+void rtmsg_fib(int event, __be32 key, struct fib_alias *fa, int dst_len,
+ u32 tb_id, const struct nl_info *info, unsigned int nlm_flags);
+
+static inline void fib_result_assign(struct fib_result *res,
+ struct fib_info *fi)
+{
+ /* we used to play games with refcounts, but we now use RCU */
+ res->fi = fi;
+}
+
+struct fib_prop {
+ int error;
+ u8 scope;
+};
+
+extern const struct fib_prop fib_props[RTN_MAX + 1];
+
+#endif /* _FIB_LOOKUP_H */
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
new file mode 100644
index 000000000..56151982f
--- /dev/null
+++ b/net/ipv4/fib_rules.c
@@ -0,0 +1,369 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * IPv4 Forwarding Information Base: policy rules.
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ * Thomas Graf <tgraf@suug.ch>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Fixes:
+ * Rani Assaf : local_rule cannot be deleted
+ * Marc Boucher : routing by fwmark
+ */
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/netdevice.h>
+#include <linux/netlink.h>
+#include <linux/inetdevice.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/rcupdate.h>
+#include <linux/export.h>
+#include <net/ip.h>
+#include <net/route.h>
+#include <net/tcp.h>
+#include <net/ip_fib.h>
+#include <net/fib_rules.h>
+
+struct fib4_rule {
+ struct fib_rule common;
+ u8 dst_len;
+ u8 src_len;
+ u8 tos;
+ __be32 src;
+ __be32 srcmask;
+ __be32 dst;
+ __be32 dstmask;
+#ifdef CONFIG_IP_ROUTE_CLASSID
+ u32 tclassid;
+#endif
+};
+
+int __fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res)
+{
+ struct fib_lookup_arg arg = {
+ .result = res,
+ .flags = FIB_LOOKUP_NOREF,
+ };
+ int err;
+
+ err = fib_rules_lookup(net->ipv4.rules_ops, flowi4_to_flowi(flp), 0, &arg);
+#ifdef CONFIG_IP_ROUTE_CLASSID
+ if (arg.rule)
+ res->tclassid = ((struct fib4_rule *)arg.rule)->tclassid;
+ else
+ res->tclassid = 0;
+#endif
+
+ if (err == -ESRCH)
+ err = -ENETUNREACH;
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(__fib_lookup);
+
+static int fib4_rule_action(struct fib_rule *rule, struct flowi *flp,
+ int flags, struct fib_lookup_arg *arg)
+{
+ int err = -EAGAIN;
+ struct fib_table *tbl;
+
+ switch (rule->action) {
+ case FR_ACT_TO_TBL:
+ break;
+
+ case FR_ACT_UNREACHABLE:
+ return -ENETUNREACH;
+
+ case FR_ACT_PROHIBIT:
+ return -EACCES;
+
+ case FR_ACT_BLACKHOLE:
+ default:
+ return -EINVAL;
+ }
+
+ rcu_read_lock();
+
+ tbl = fib_get_table(rule->fr_net, rule->table);
+ if (tbl)
+ err = fib_table_lookup(tbl, &flp->u.ip4,
+ (struct fib_result *)arg->result,
+ arg->flags);
+
+ rcu_read_unlock();
+ return err;
+}
+
+static bool fib4_rule_suppress(struct fib_rule *rule, struct fib_lookup_arg *arg)
+{
+ struct fib_result *result = (struct fib_result *) arg->result;
+ struct net_device *dev = NULL;
+
+ if (result->fi)
+ dev = result->fi->fib_dev;
+
+ /* do not accept result if the route does
+ * not meet the required prefix length
+ */
+ if (result->prefixlen <= rule->suppress_prefixlen)
+ goto suppress_route;
+
+ /* do not accept result if the route uses a device
+ * belonging to a forbidden interface group
+ */
+ if (rule->suppress_ifgroup != -1 && dev && dev->group == rule->suppress_ifgroup)
+ goto suppress_route;
+
+ return false;
+
+suppress_route:
+ if (!(arg->flags & FIB_LOOKUP_NOREF))
+ fib_info_put(result->fi);
+ return true;
+}
+
+static int fib4_rule_match(struct fib_rule *rule, struct flowi *fl, int flags)
+{
+ struct fib4_rule *r = (struct fib4_rule *) rule;
+ struct flowi4 *fl4 = &fl->u.ip4;
+ __be32 daddr = fl4->daddr;
+ __be32 saddr = fl4->saddr;
+
+ if (((saddr ^ r->src) & r->srcmask) ||
+ ((daddr ^ r->dst) & r->dstmask))
+ return 0;
+
+ if (r->tos && (r->tos != fl4->flowi4_tos))
+ return 0;
+
+ return 1;
+}
+
+static struct fib_table *fib_empty_table(struct net *net)
+{
+ u32 id;
+
+ for (id = 1; id <= RT_TABLE_MAX; id++)
+ if (!fib_get_table(net, id))
+ return fib_new_table(net, id);
+ return NULL;
+}
+
+static const struct nla_policy fib4_rule_policy[FRA_MAX+1] = {
+ FRA_GENERIC_POLICY,
+ [FRA_FLOW] = { .type = NLA_U32 },
+};
+
+static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
+ struct fib_rule_hdr *frh,
+ struct nlattr **tb)
+{
+ struct net *net = sock_net(skb->sk);
+ int err = -EINVAL;
+ struct fib4_rule *rule4 = (struct fib4_rule *) rule;
+
+ if (frh->tos & ~IPTOS_TOS_MASK)
+ goto errout;
+
+ /* split local/main if they are not already split */
+ err = fib_unmerge(net);
+ if (err)
+ goto errout;
+
+ if (rule->table == RT_TABLE_UNSPEC) {
+ if (rule->action == FR_ACT_TO_TBL) {
+ struct fib_table *table;
+
+ table = fib_empty_table(net);
+ if (!table) {
+ err = -ENOBUFS;
+ goto errout;
+ }
+
+ rule->table = table->tb_id;
+ }
+ }
+
+ if (frh->src_len)
+ rule4->src = nla_get_in_addr(tb[FRA_SRC]);
+
+ if (frh->dst_len)
+ rule4->dst = nla_get_in_addr(tb[FRA_DST]);
+
+#ifdef CONFIG_IP_ROUTE_CLASSID
+ if (tb[FRA_FLOW]) {
+ rule4->tclassid = nla_get_u32(tb[FRA_FLOW]);
+ if (rule4->tclassid)
+ net->ipv4.fib_num_tclassid_users++;
+ }
+#endif
+
+ rule4->src_len = frh->src_len;
+ rule4->srcmask = inet_make_mask(rule4->src_len);
+ rule4->dst_len = frh->dst_len;
+ rule4->dstmask = inet_make_mask(rule4->dst_len);
+ rule4->tos = frh->tos;
+
+ net->ipv4.fib_has_custom_rules = true;
+ fib_flush_external(rule->fr_net);
+
+ err = 0;
+errout:
+ return err;
+}
+
+static int fib4_rule_delete(struct fib_rule *rule)
+{
+ struct net *net = rule->fr_net;
+ int err;
+
+ /* split local/main if they are not already split */
+ err = fib_unmerge(net);
+ if (err)
+ goto errout;
+
+#ifdef CONFIG_IP_ROUTE_CLASSID
+ if (((struct fib4_rule *)rule)->tclassid)
+ net->ipv4.fib_num_tclassid_users--;
+#endif
+ net->ipv4.fib_has_custom_rules = true;
+ fib_flush_external(rule->fr_net);
+errout:
+ return err;
+}
+
+static int fib4_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh,
+ struct nlattr **tb)
+{
+ struct fib4_rule *rule4 = (struct fib4_rule *) rule;
+
+ if (frh->src_len && (rule4->src_len != frh->src_len))
+ return 0;
+
+ if (frh->dst_len && (rule4->dst_len != frh->dst_len))
+ return 0;
+
+ if (frh->tos && (rule4->tos != frh->tos))
+ return 0;
+
+#ifdef CONFIG_IP_ROUTE_CLASSID
+ if (tb[FRA_FLOW] && (rule4->tclassid != nla_get_u32(tb[FRA_FLOW])))
+ return 0;
+#endif
+
+ if (frh->src_len && (rule4->src != nla_get_in_addr(tb[FRA_SRC])))
+ return 0;
+
+ if (frh->dst_len && (rule4->dst != nla_get_in_addr(tb[FRA_DST])))
+ return 0;
+
+ return 1;
+}
+
+static int fib4_rule_fill(struct fib_rule *rule, struct sk_buff *skb,
+ struct fib_rule_hdr *frh)
+{
+ struct fib4_rule *rule4 = (struct fib4_rule *) rule;
+
+ frh->dst_len = rule4->dst_len;
+ frh->src_len = rule4->src_len;
+ frh->tos = rule4->tos;
+
+ if ((rule4->dst_len &&
+ nla_put_in_addr(skb, FRA_DST, rule4->dst)) ||
+ (rule4->src_len &&
+ nla_put_in_addr(skb, FRA_SRC, rule4->src)))
+ goto nla_put_failure;
+#ifdef CONFIG_IP_ROUTE_CLASSID
+ if (rule4->tclassid &&
+ nla_put_u32(skb, FRA_FLOW, rule4->tclassid))
+ goto nla_put_failure;
+#endif
+ return 0;
+
+nla_put_failure:
+ return -ENOBUFS;
+}
+
+static size_t fib4_rule_nlmsg_payload(struct fib_rule *rule)
+{
+ return nla_total_size(4) /* dst */
+ + nla_total_size(4) /* src */
+ + nla_total_size(4); /* flow */
+}
+
+static void fib4_rule_flush_cache(struct fib_rules_ops *ops)
+{
+ rt_cache_flush(ops->fro_net);
+}
+
+static const struct fib_rules_ops __net_initconst fib4_rules_ops_template = {
+ .family = AF_INET,
+ .rule_size = sizeof(struct fib4_rule),
+ .addr_size = sizeof(u32),
+ .action = fib4_rule_action,
+ .suppress = fib4_rule_suppress,
+ .match = fib4_rule_match,
+ .configure = fib4_rule_configure,
+ .delete = fib4_rule_delete,
+ .compare = fib4_rule_compare,
+ .fill = fib4_rule_fill,
+ .default_pref = fib_default_rule_pref,
+ .nlmsg_payload = fib4_rule_nlmsg_payload,
+ .flush_cache = fib4_rule_flush_cache,
+ .nlgroup = RTNLGRP_IPV4_RULE,
+ .policy = fib4_rule_policy,
+ .owner = THIS_MODULE,
+};
+
+static int fib_default_rules_init(struct fib_rules_ops *ops)
+{
+ int err;
+
+ err = fib_default_rule_add(ops, 0, RT_TABLE_LOCAL, 0);
+ if (err < 0)
+ return err;
+ err = fib_default_rule_add(ops, 0x7FFE, RT_TABLE_MAIN, 0);
+ if (err < 0)
+ return err;
+ err = fib_default_rule_add(ops, 0x7FFF, RT_TABLE_DEFAULT, 0);
+ if (err < 0)
+ return err;
+ return 0;
+}
+
+int __net_init fib4_rules_init(struct net *net)
+{
+ int err;
+ struct fib_rules_ops *ops;
+
+ ops = fib_rules_register(&fib4_rules_ops_template, net);
+ if (IS_ERR(ops))
+ return PTR_ERR(ops);
+
+ err = fib_default_rules_init(ops);
+ if (err < 0)
+ goto fail;
+ net->ipv4.rules_ops = ops;
+ net->ipv4.fib_has_custom_rules = false;
+ return 0;
+
+fail:
+ /* also cleans all rules already added */
+ fib_rules_unregister(ops);
+ return err;
+}
+
+void __net_exit fib4_rules_exit(struct net *net)
+{
+ fib_rules_unregister(net->ipv4.rules_ops);
+}
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
new file mode 100644
index 000000000..8d695b665
--- /dev/null
+++ b/net/ipv4/fib_semantics.c
@@ -0,0 +1,1328 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * IPv4 Forwarding Information Base: semantics.
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <asm/uaccess.h>
+#include <linux/bitops.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/jiffies.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/errno.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/inetdevice.h>
+#include <linux/netdevice.h>
+#include <linux/if_arp.h>
+#include <linux/proc_fs.h>
+#include <linux/skbuff.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+
+#include <net/arp.h>
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <net/route.h>
+#include <net/tcp.h>
+#include <net/sock.h>
+#include <net/ip_fib.h>
+#include <net/netlink.h>
+#include <net/nexthop.h>
+
+#include "fib_lookup.h"
+
+static DEFINE_SPINLOCK(fib_info_lock);
+static struct hlist_head *fib_info_hash;
+static struct hlist_head *fib_info_laddrhash;
+static unsigned int fib_info_hash_size;
+static unsigned int fib_info_cnt;
+
+#define DEVINDEX_HASHBITS 8
+#define DEVINDEX_HASHSIZE (1U << DEVINDEX_HASHBITS)
+static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE];
+
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+
+static DEFINE_SPINLOCK(fib_multipath_lock);
+
+#define for_nexthops(fi) { \
+ int nhsel; const struct fib_nh *nh; \
+ for (nhsel = 0, nh = (fi)->fib_nh; \
+ nhsel < (fi)->fib_nhs; \
+ nh++, nhsel++)
+
+#define change_nexthops(fi) { \
+ int nhsel; struct fib_nh *nexthop_nh; \
+ for (nhsel = 0, nexthop_nh = (struct fib_nh *)((fi)->fib_nh); \
+ nhsel < (fi)->fib_nhs; \
+ nexthop_nh++, nhsel++)
+
+#else /* CONFIG_IP_ROUTE_MULTIPATH */
+
+/* Hope, that gcc will optimize it to get rid of dummy loop */
+
+#define for_nexthops(fi) { \
+ int nhsel; const struct fib_nh *nh = (fi)->fib_nh; \
+ for (nhsel = 0; nhsel < 1; nhsel++)
+
+#define change_nexthops(fi) { \
+ int nhsel; \
+ struct fib_nh *nexthop_nh = (struct fib_nh *)((fi)->fib_nh); \
+ for (nhsel = 0; nhsel < 1; nhsel++)
+
+#endif /* CONFIG_IP_ROUTE_MULTIPATH */
+
+#define endfor_nexthops(fi) }
+
+
+const struct fib_prop fib_props[RTN_MAX + 1] = {
+ [RTN_UNSPEC] = {
+ .error = 0,
+ .scope = RT_SCOPE_NOWHERE,
+ },
+ [RTN_UNICAST] = {
+ .error = 0,
+ .scope = RT_SCOPE_UNIVERSE,
+ },
+ [RTN_LOCAL] = {
+ .error = 0,
+ .scope = RT_SCOPE_HOST,
+ },
+ [RTN_BROADCAST] = {
+ .error = 0,
+ .scope = RT_SCOPE_LINK,
+ },
+ [RTN_ANYCAST] = {
+ .error = 0,
+ .scope = RT_SCOPE_LINK,
+ },
+ [RTN_MULTICAST] = {
+ .error = 0,
+ .scope = RT_SCOPE_UNIVERSE,
+ },
+ [RTN_BLACKHOLE] = {
+ .error = -EINVAL,
+ .scope = RT_SCOPE_UNIVERSE,
+ },
+ [RTN_UNREACHABLE] = {
+ .error = -EHOSTUNREACH,
+ .scope = RT_SCOPE_UNIVERSE,
+ },
+ [RTN_PROHIBIT] = {
+ .error = -EACCES,
+ .scope = RT_SCOPE_UNIVERSE,
+ },
+ [RTN_THROW] = {
+ .error = -EAGAIN,
+ .scope = RT_SCOPE_UNIVERSE,
+ },
+ [RTN_NAT] = {
+ .error = -EINVAL,
+ .scope = RT_SCOPE_NOWHERE,
+ },
+ [RTN_XRESOLVE] = {
+ .error = -EINVAL,
+ .scope = RT_SCOPE_NOWHERE,
+ },
+};
+
+static void rt_fibinfo_free(struct rtable __rcu **rtp)
+{
+ struct rtable *rt = rcu_dereference_protected(*rtp, 1);
+
+ if (!rt)
+ return;
+
+ /* Not even needed : RCU_INIT_POINTER(*rtp, NULL);
+ * because we waited an RCU grace period before calling
+ * free_fib_info_rcu()
+ */
+
+ dst_free(&rt->dst);
+}
+
+static void free_nh_exceptions(struct fib_nh *nh)
+{
+ struct fnhe_hash_bucket *hash;
+ int i;
+
+ hash = rcu_dereference_protected(nh->nh_exceptions, 1);
+ if (!hash)
+ return;
+ for (i = 0; i < FNHE_HASH_SIZE; i++) {
+ struct fib_nh_exception *fnhe;
+
+ fnhe = rcu_dereference_protected(hash[i].chain, 1);
+ while (fnhe) {
+ struct fib_nh_exception *next;
+
+ next = rcu_dereference_protected(fnhe->fnhe_next, 1);
+
+ rt_fibinfo_free(&fnhe->fnhe_rth_input);
+ rt_fibinfo_free(&fnhe->fnhe_rth_output);
+
+ kfree(fnhe);
+
+ fnhe = next;
+ }
+ }
+ kfree(hash);
+}
+
+static void rt_fibinfo_free_cpus(struct rtable __rcu * __percpu *rtp)
+{
+ int cpu;
+
+ if (!rtp)
+ return;
+
+ for_each_possible_cpu(cpu) {
+ struct rtable *rt;
+
+ rt = rcu_dereference_protected(*per_cpu_ptr(rtp, cpu), 1);
+ if (rt)
+ dst_free(&rt->dst);
+ }
+ free_percpu(rtp);
+}
+
+/* Release a nexthop info record */
+static void free_fib_info_rcu(struct rcu_head *head)
+{
+ struct fib_info *fi = container_of(head, struct fib_info, rcu);
+
+ change_nexthops(fi) {
+ if (nexthop_nh->nh_dev)
+ dev_put(nexthop_nh->nh_dev);
+ free_nh_exceptions(nexthop_nh);
+ rt_fibinfo_free_cpus(nexthop_nh->nh_pcpu_rth_output);
+ rt_fibinfo_free(&nexthop_nh->nh_rth_input);
+ } endfor_nexthops(fi);
+
+ if (fi->fib_metrics != (u32 *) dst_default_metrics)
+ kfree(fi->fib_metrics);
+ kfree(fi);
+}
+
+void free_fib_info(struct fib_info *fi)
+{
+ if (fi->fib_dead == 0) {
+ pr_warn("Freeing alive fib_info %p\n", fi);
+ return;
+ }
+ fib_info_cnt--;
+#ifdef CONFIG_IP_ROUTE_CLASSID
+ change_nexthops(fi) {
+ if (nexthop_nh->nh_tclassid)
+ fi->fib_net->ipv4.fib_num_tclassid_users--;
+ } endfor_nexthops(fi);
+#endif
+ call_rcu(&fi->rcu, free_fib_info_rcu);
+}
+
+void fib_release_info(struct fib_info *fi)
+{
+ spin_lock_bh(&fib_info_lock);
+ if (fi && --fi->fib_treeref == 0) {
+ hlist_del(&fi->fib_hash);
+ if (fi->fib_prefsrc)
+ hlist_del(&fi->fib_lhash);
+ change_nexthops(fi) {
+ if (!nexthop_nh->nh_dev)
+ continue;
+ hlist_del(&nexthop_nh->nh_hash);
+ } endfor_nexthops(fi)
+ fi->fib_dead = 1;
+ fib_info_put(fi);
+ }
+ spin_unlock_bh(&fib_info_lock);
+}
+
+static inline int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
+{
+ const struct fib_nh *onh = ofi->fib_nh;
+
+ for_nexthops(fi) {
+ if (nh->nh_oif != onh->nh_oif ||
+ nh->nh_gw != onh->nh_gw ||
+ nh->nh_scope != onh->nh_scope ||
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+ nh->nh_weight != onh->nh_weight ||
+#endif
+#ifdef CONFIG_IP_ROUTE_CLASSID
+ nh->nh_tclassid != onh->nh_tclassid ||
+#endif
+ ((nh->nh_flags ^ onh->nh_flags) & ~RTNH_F_DEAD))
+ return -1;
+ onh++;
+ } endfor_nexthops(fi);
+ return 0;
+}
+
+static inline unsigned int fib_devindex_hashfn(unsigned int val)
+{
+ unsigned int mask = DEVINDEX_HASHSIZE - 1;
+
+ return (val ^
+ (val >> DEVINDEX_HASHBITS) ^
+ (val >> (DEVINDEX_HASHBITS * 2))) & mask;
+}
+
+static inline unsigned int fib_info_hashfn(const struct fib_info *fi)
+{
+ unsigned int mask = (fib_info_hash_size - 1);
+ unsigned int val = fi->fib_nhs;
+
+ val ^= (fi->fib_protocol << 8) | fi->fib_scope;
+ val ^= (__force u32)fi->fib_prefsrc;
+ val ^= fi->fib_priority;
+ for_nexthops(fi) {
+ val ^= fib_devindex_hashfn(nh->nh_oif);
+ } endfor_nexthops(fi)
+
+ return (val ^ (val >> 7) ^ (val >> 12)) & mask;
+}
+
+static struct fib_info *fib_find_info(const struct fib_info *nfi)
+{
+ struct hlist_head *head;
+ struct fib_info *fi;
+ unsigned int hash;
+
+ hash = fib_info_hashfn(nfi);
+ head = &fib_info_hash[hash];
+
+ hlist_for_each_entry(fi, head, fib_hash) {
+ if (!net_eq(fi->fib_net, nfi->fib_net))
+ continue;
+ if (fi->fib_nhs != nfi->fib_nhs)
+ continue;
+ if (nfi->fib_protocol == fi->fib_protocol &&
+ nfi->fib_scope == fi->fib_scope &&
+ nfi->fib_prefsrc == fi->fib_prefsrc &&
+ nfi->fib_priority == fi->fib_priority &&
+ nfi->fib_type == fi->fib_type &&
+ memcmp(nfi->fib_metrics, fi->fib_metrics,
+ sizeof(u32) * RTAX_MAX) == 0 &&
+ ((nfi->fib_flags ^ fi->fib_flags) & ~RTNH_F_DEAD) == 0 &&
+ (nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0))
+ return fi;
+ }
+
+ return NULL;
+}
+
+/* Check, that the gateway is already configured.
+ * Used only by redirect accept routine.
+ */
+int ip_fib_check_default(__be32 gw, struct net_device *dev)
+{
+ struct hlist_head *head;
+ struct fib_nh *nh;
+ unsigned int hash;
+
+ spin_lock(&fib_info_lock);
+
+ hash = fib_devindex_hashfn(dev->ifindex);
+ head = &fib_info_devhash[hash];
+ hlist_for_each_entry(nh, head, nh_hash) {
+ if (nh->nh_dev == dev &&
+ nh->nh_gw == gw &&
+ !(nh->nh_flags & RTNH_F_DEAD)) {
+ spin_unlock(&fib_info_lock);
+ return 0;
+ }
+ }
+
+ spin_unlock(&fib_info_lock);
+
+ return -1;
+}
+
+static inline size_t fib_nlmsg_size(struct fib_info *fi)
+{
+ size_t payload = NLMSG_ALIGN(sizeof(struct rtmsg))
+ + nla_total_size(4) /* RTA_TABLE */
+ + nla_total_size(4) /* RTA_DST */
+ + nla_total_size(4) /* RTA_PRIORITY */
+ + nla_total_size(4) /* RTA_PREFSRC */
+ + nla_total_size(TCP_CA_NAME_MAX); /* RTAX_CC_ALGO */
+
+ /* space for nested metrics */
+ payload += nla_total_size((RTAX_MAX * nla_total_size(4)));
+
+ if (fi->fib_nhs) {
+ /* Also handles the special case fib_nhs == 1 */
+
+ /* each nexthop is packed in an attribute */
+ size_t nhsize = nla_total_size(sizeof(struct rtnexthop));
+
+ /* may contain flow and gateway attribute */
+ nhsize += 2 * nla_total_size(4);
+
+ /* all nexthops are packed in a nested attribute */
+ payload += nla_total_size(fi->fib_nhs * nhsize);
+ }
+
+ return payload;
+}
+
+void rtmsg_fib(int event, __be32 key, struct fib_alias *fa,
+ int dst_len, u32 tb_id, const struct nl_info *info,
+ unsigned int nlm_flags)
+{
+ struct sk_buff *skb;
+ u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
+ int err = -ENOBUFS;
+
+ skb = nlmsg_new(fib_nlmsg_size(fa->fa_info), GFP_KERNEL);
+ if (!skb)
+ goto errout;
+
+ err = fib_dump_info(skb, info->portid, seq, event, tb_id,
+ fa->fa_type, key, dst_len,
+ fa->fa_tos, fa->fa_info, nlm_flags);
+ if (err < 0) {
+ /* -EMSGSIZE implies BUG in fib_nlmsg_size() */
+ WARN_ON(err == -EMSGSIZE);
+ kfree_skb(skb);
+ goto errout;
+ }
+ rtnl_notify(skb, info->nl_net, info->portid, RTNLGRP_IPV4_ROUTE,
+ info->nlh, GFP_KERNEL);
+ return;
+errout:
+ if (err < 0)
+ rtnl_set_sk_err(info->nl_net, RTNLGRP_IPV4_ROUTE, err);
+}
+
+static int fib_detect_death(struct fib_info *fi, int order,
+ struct fib_info **last_resort, int *last_idx,
+ int dflt)
+{
+ struct neighbour *n;
+ int state = NUD_NONE;
+
+ n = neigh_lookup(&arp_tbl, &fi->fib_nh[0].nh_gw, fi->fib_dev);
+ if (n) {
+ state = n->nud_state;
+ neigh_release(n);
+ }
+ if (state == NUD_REACHABLE)
+ return 0;
+ if ((state & NUD_VALID) && order != dflt)
+ return 0;
+ if ((state & NUD_VALID) ||
+ (*last_idx < 0 && order > dflt)) {
+ *last_resort = fi;
+ *last_idx = order;
+ }
+ return 1;
+}
+
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+
+static int fib_count_nexthops(struct rtnexthop *rtnh, int remaining)
+{
+ int nhs = 0;
+
+ while (rtnh_ok(rtnh, remaining)) {
+ nhs++;
+ rtnh = rtnh_next(rtnh, &remaining);
+ }
+
+ /* leftover implies invalid nexthop configuration, discard it */
+ return remaining > 0 ? 0 : nhs;
+}
+
+static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
+ int remaining, struct fib_config *cfg)
+{
+ change_nexthops(fi) {
+ int attrlen;
+
+ if (!rtnh_ok(rtnh, remaining))
+ return -EINVAL;
+
+ nexthop_nh->nh_flags =
+ (cfg->fc_flags & ~0xFF) | rtnh->rtnh_flags;
+ nexthop_nh->nh_oif = rtnh->rtnh_ifindex;
+ nexthop_nh->nh_weight = rtnh->rtnh_hops + 1;
+
+ attrlen = rtnh_attrlen(rtnh);
+ if (attrlen > 0) {
+ struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
+
+ nla = nla_find(attrs, attrlen, RTA_GATEWAY);
+ nexthop_nh->nh_gw = nla ? nla_get_in_addr(nla) : 0;
+#ifdef CONFIG_IP_ROUTE_CLASSID
+ nla = nla_find(attrs, attrlen, RTA_FLOW);
+ nexthop_nh->nh_tclassid = nla ? nla_get_u32(nla) : 0;
+ if (nexthop_nh->nh_tclassid)
+ fi->fib_net->ipv4.fib_num_tclassid_users++;
+#endif
+ }
+
+ rtnh = rtnh_next(rtnh, &remaining);
+ } endfor_nexthops(fi);
+
+ return 0;
+}
+
+#endif
+
+int fib_nh_match(struct fib_config *cfg, struct fib_info *fi)
+{
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+ struct rtnexthop *rtnh;
+ int remaining;
+#endif
+
+ if (cfg->fc_priority && cfg->fc_priority != fi->fib_priority)
+ return 1;
+
+ if (cfg->fc_oif || cfg->fc_gw) {
+ if ((!cfg->fc_oif || cfg->fc_oif == fi->fib_nh->nh_oif) &&
+ (!cfg->fc_gw || cfg->fc_gw == fi->fib_nh->nh_gw))
+ return 0;
+ return 1;
+ }
+
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+ if (!cfg->fc_mp)
+ return 0;
+
+ rtnh = cfg->fc_mp;
+ remaining = cfg->fc_mp_len;
+
+ for_nexthops(fi) {
+ int attrlen;
+
+ if (!rtnh_ok(rtnh, remaining))
+ return -EINVAL;
+
+ if (rtnh->rtnh_ifindex && rtnh->rtnh_ifindex != nh->nh_oif)
+ return 1;
+
+ attrlen = rtnh_attrlen(rtnh);
+ if (attrlen > 0) {
+ struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
+
+ nla = nla_find(attrs, attrlen, RTA_GATEWAY);
+ if (nla && nla_get_in_addr(nla) != nh->nh_gw)
+ return 1;
+#ifdef CONFIG_IP_ROUTE_CLASSID
+ nla = nla_find(attrs, attrlen, RTA_FLOW);
+ if (nla && nla_get_u32(nla) != nh->nh_tclassid)
+ return 1;
+#endif
+ }
+
+ rtnh = rtnh_next(rtnh, &remaining);
+ } endfor_nexthops(fi);
+#endif
+ return 0;
+}
+
+
+/*
+ * Picture
+ * -------
+ *
+ * Semantics of nexthop is very messy by historical reasons.
+ * We have to take into account, that:
+ * a) gateway can be actually local interface address,
+ * so that gatewayed route is direct.
+ * b) gateway must be on-link address, possibly
+ * described not by an ifaddr, but also by a direct route.
+ * c) If both gateway and interface are specified, they should not
+ * contradict.
+ * d) If we use tunnel routes, gateway could be not on-link.
+ *
+ * Attempt to reconcile all of these (alas, self-contradictory) conditions
+ * results in pretty ugly and hairy code with obscure logic.
+ *
+ * I chose to generalized it instead, so that the size
+ * of code does not increase practically, but it becomes
+ * much more general.
+ * Every prefix is assigned a "scope" value: "host" is local address,
+ * "link" is direct route,
+ * [ ... "site" ... "interior" ... ]
+ * and "universe" is true gateway route with global meaning.
+ *
+ * Every prefix refers to a set of "nexthop"s (gw, oif),
+ * where gw must have narrower scope. This recursion stops
+ * when gw has LOCAL scope or if "nexthop" is declared ONLINK,
+ * which means that gw is forced to be on link.
+ *
+ * Code is still hairy, but now it is apparently logically
+ * consistent and very flexible. F.e. as by-product it allows
+ * to co-exists in peace independent exterior and interior
+ * routing processes.
+ *
+ * Normally it looks as following.
+ *
+ * {universe prefix} -> (gw, oif) [scope link]
+ * |
+ * |-> {link prefix} -> (gw, oif) [scope local]
+ * |
+ * |-> {local prefix} (terminal node)
+ */
+static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
+ struct fib_nh *nh)
+{
+ int err;
+ struct net *net;
+ struct net_device *dev;
+
+ net = cfg->fc_nlinfo.nl_net;
+ if (nh->nh_gw) {
+ struct fib_result res;
+
+ if (nh->nh_flags & RTNH_F_ONLINK) {
+
+ if (cfg->fc_scope >= RT_SCOPE_LINK)
+ return -EINVAL;
+ if (inet_addr_type(net, nh->nh_gw) != RTN_UNICAST)
+ return -EINVAL;
+ dev = __dev_get_by_index(net, nh->nh_oif);
+ if (!dev)
+ return -ENODEV;
+ if (!(dev->flags & IFF_UP))
+ return -ENETDOWN;
+ nh->nh_dev = dev;
+ dev_hold(dev);
+ nh->nh_scope = RT_SCOPE_LINK;
+ return 0;
+ }
+ rcu_read_lock();
+ {
+ struct flowi4 fl4 = {
+ .daddr = nh->nh_gw,
+ .flowi4_scope = cfg->fc_scope + 1,
+ .flowi4_oif = nh->nh_oif,
+ .flowi4_iif = LOOPBACK_IFINDEX,
+ };
+
+ /* It is not necessary, but requires a bit of thinking */
+ if (fl4.flowi4_scope < RT_SCOPE_LINK)
+ fl4.flowi4_scope = RT_SCOPE_LINK;
+ err = fib_lookup(net, &fl4, &res);
+ if (err) {
+ rcu_read_unlock();
+ return err;
+ }
+ }
+ err = -EINVAL;
+ if (res.type != RTN_UNICAST && res.type != RTN_LOCAL)
+ goto out;
+ nh->nh_scope = res.scope;
+ nh->nh_oif = FIB_RES_OIF(res);
+ nh->nh_dev = dev = FIB_RES_DEV(res);
+ if (!dev)
+ goto out;
+ dev_hold(dev);
+ err = (dev->flags & IFF_UP) ? 0 : -ENETDOWN;
+ } else {
+ struct in_device *in_dev;
+
+ if (nh->nh_flags & (RTNH_F_PERVASIVE | RTNH_F_ONLINK))
+ return -EINVAL;
+
+ rcu_read_lock();
+ err = -ENODEV;
+ in_dev = inetdev_by_index(net, nh->nh_oif);
+ if (!in_dev)
+ goto out;
+ err = -ENETDOWN;
+ if (!(in_dev->dev->flags & IFF_UP))
+ goto out;
+ nh->nh_dev = in_dev->dev;
+ dev_hold(nh->nh_dev);
+ nh->nh_scope = RT_SCOPE_HOST;
+ err = 0;
+ }
+out:
+ rcu_read_unlock();
+ return err;
+}
+
+static inline unsigned int fib_laddr_hashfn(__be32 val)
+{
+ unsigned int mask = (fib_info_hash_size - 1);
+
+ return ((__force u32)val ^
+ ((__force u32)val >> 7) ^
+ ((__force u32)val >> 14)) & mask;
+}
+
+static struct hlist_head *fib_info_hash_alloc(int bytes)
+{
+ if (bytes <= PAGE_SIZE)
+ return kzalloc(bytes, GFP_KERNEL);
+ else
+ return (struct hlist_head *)
+ __get_free_pages(GFP_KERNEL | __GFP_ZERO,
+ get_order(bytes));
+}
+
+static void fib_info_hash_free(struct hlist_head *hash, int bytes)
+{
+ if (!hash)
+ return;
+
+ if (bytes <= PAGE_SIZE)
+ kfree(hash);
+ else
+ free_pages((unsigned long) hash, get_order(bytes));
+}
+
+static void fib_info_hash_move(struct hlist_head *new_info_hash,
+ struct hlist_head *new_laddrhash,
+ unsigned int new_size)
+{
+ struct hlist_head *old_info_hash, *old_laddrhash;
+ unsigned int old_size = fib_info_hash_size;
+ unsigned int i, bytes;
+
+ spin_lock_bh(&fib_info_lock);
+ old_info_hash = fib_info_hash;
+ old_laddrhash = fib_info_laddrhash;
+ fib_info_hash_size = new_size;
+
+ for (i = 0; i < old_size; i++) {
+ struct hlist_head *head = &fib_info_hash[i];
+ struct hlist_node *n;
+ struct fib_info *fi;
+
+ hlist_for_each_entry_safe(fi, n, head, fib_hash) {
+ struct hlist_head *dest;
+ unsigned int new_hash;
+
+ hlist_del(&fi->fib_hash);
+
+ new_hash = fib_info_hashfn(fi);
+ dest = &new_info_hash[new_hash];
+ hlist_add_head(&fi->fib_hash, dest);
+ }
+ }
+ fib_info_hash = new_info_hash;
+
+ for (i = 0; i < old_size; i++) {
+ struct hlist_head *lhead = &fib_info_laddrhash[i];
+ struct hlist_node *n;
+ struct fib_info *fi;
+
+ hlist_for_each_entry_safe(fi, n, lhead, fib_lhash) {
+ struct hlist_head *ldest;
+ unsigned int new_hash;
+
+ hlist_del(&fi->fib_lhash);
+
+ new_hash = fib_laddr_hashfn(fi->fib_prefsrc);
+ ldest = &new_laddrhash[new_hash];
+ hlist_add_head(&fi->fib_lhash, ldest);
+ }
+ }
+ fib_info_laddrhash = new_laddrhash;
+
+ spin_unlock_bh(&fib_info_lock);
+
+ bytes = old_size * sizeof(struct hlist_head *);
+ fib_info_hash_free(old_info_hash, bytes);
+ fib_info_hash_free(old_laddrhash, bytes);
+}
+
+__be32 fib_info_update_nh_saddr(struct net *net, struct fib_nh *nh)
+{
+ nh->nh_saddr = inet_select_addr(nh->nh_dev,
+ nh->nh_gw,
+ nh->nh_parent->fib_scope);
+ nh->nh_saddr_genid = atomic_read(&net->ipv4.dev_addr_genid);
+
+ return nh->nh_saddr;
+}
+
+struct fib_info *fib_create_info(struct fib_config *cfg)
+{
+ int err;
+ struct fib_info *fi = NULL;
+ struct fib_info *ofi;
+ int nhs = 1;
+ struct net *net = cfg->fc_nlinfo.nl_net;
+
+ if (cfg->fc_type > RTN_MAX)
+ goto err_inval;
+
+ /* Fast check to catch the most weird cases */
+ if (fib_props[cfg->fc_type].scope > cfg->fc_scope)
+ goto err_inval;
+
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+ if (cfg->fc_mp) {
+ nhs = fib_count_nexthops(cfg->fc_mp, cfg->fc_mp_len);
+ if (nhs == 0)
+ goto err_inval;
+ }
+#endif
+
+ err = -ENOBUFS;
+ if (fib_info_cnt >= fib_info_hash_size) {
+ unsigned int new_size = fib_info_hash_size << 1;
+ struct hlist_head *new_info_hash;
+ struct hlist_head *new_laddrhash;
+ unsigned int bytes;
+
+ if (!new_size)
+ new_size = 16;
+ bytes = new_size * sizeof(struct hlist_head *);
+ new_info_hash = fib_info_hash_alloc(bytes);
+ new_laddrhash = fib_info_hash_alloc(bytes);
+ if (!new_info_hash || !new_laddrhash) {
+ fib_info_hash_free(new_info_hash, bytes);
+ fib_info_hash_free(new_laddrhash, bytes);
+ } else
+ fib_info_hash_move(new_info_hash, new_laddrhash, new_size);
+
+ if (!fib_info_hash_size)
+ goto failure;
+ }
+
+ fi = kzalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL);
+ if (!fi)
+ goto failure;
+ fib_info_cnt++;
+ if (cfg->fc_mx) {
+ fi->fib_metrics = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
+ if (!fi->fib_metrics)
+ goto failure;
+ } else
+ fi->fib_metrics = (u32 *) dst_default_metrics;
+
+ fi->fib_net = net;
+ fi->fib_protocol = cfg->fc_protocol;
+ fi->fib_scope = cfg->fc_scope;
+ fi->fib_flags = cfg->fc_flags;
+ fi->fib_priority = cfg->fc_priority;
+ fi->fib_prefsrc = cfg->fc_prefsrc;
+ fi->fib_type = cfg->fc_type;
+
+ fi->fib_nhs = nhs;
+ change_nexthops(fi) {
+ nexthop_nh->nh_parent = fi;
+ nexthop_nh->nh_pcpu_rth_output = alloc_percpu(struct rtable __rcu *);
+ if (!nexthop_nh->nh_pcpu_rth_output)
+ goto failure;
+ } endfor_nexthops(fi)
+
+ if (cfg->fc_mx) {
+ struct nlattr *nla;
+ int remaining;
+
+ nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
+ int type = nla_type(nla);
+
+ if (type) {
+ u32 val;
+
+ if (type > RTAX_MAX)
+ goto err_inval;
+ if (type == RTAX_CC_ALGO) {
+ char tmp[TCP_CA_NAME_MAX];
+
+ nla_strlcpy(tmp, nla, sizeof(tmp));
+ val = tcp_ca_get_key_by_name(tmp);
+ if (val == TCP_CA_UNSPEC)
+ goto err_inval;
+ } else {
+ val = nla_get_u32(nla);
+ }
+ if (type == RTAX_ADVMSS && val > 65535 - 40)
+ val = 65535 - 40;
+ if (type == RTAX_MTU && val > 65535 - 15)
+ val = 65535 - 15;
+ fi->fib_metrics[type - 1] = val;
+ }
+ }
+ }
+
+ if (cfg->fc_mp) {
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+ err = fib_get_nhs(fi, cfg->fc_mp, cfg->fc_mp_len, cfg);
+ if (err != 0)
+ goto failure;
+ if (cfg->fc_oif && fi->fib_nh->nh_oif != cfg->fc_oif)
+ goto err_inval;
+ if (cfg->fc_gw && fi->fib_nh->nh_gw != cfg->fc_gw)
+ goto err_inval;
+#ifdef CONFIG_IP_ROUTE_CLASSID
+ if (cfg->fc_flow && fi->fib_nh->nh_tclassid != cfg->fc_flow)
+ goto err_inval;
+#endif
+#else
+ goto err_inval;
+#endif
+ } else {
+ struct fib_nh *nh = fi->fib_nh;
+
+ nh->nh_oif = cfg->fc_oif;
+ nh->nh_gw = cfg->fc_gw;
+ nh->nh_flags = cfg->fc_flags;
+#ifdef CONFIG_IP_ROUTE_CLASSID
+ nh->nh_tclassid = cfg->fc_flow;
+ if (nh->nh_tclassid)
+ fi->fib_net->ipv4.fib_num_tclassid_users++;
+#endif
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+ nh->nh_weight = 1;
+#endif
+ }
+
+ if (fib_props[cfg->fc_type].error) {
+ if (cfg->fc_gw || cfg->fc_oif || cfg->fc_mp)
+ goto err_inval;
+ goto link_it;
+ } else {
+ switch (cfg->fc_type) {
+ case RTN_UNICAST:
+ case RTN_LOCAL:
+ case RTN_BROADCAST:
+ case RTN_ANYCAST:
+ case RTN_MULTICAST:
+ break;
+ default:
+ goto err_inval;
+ }
+ }
+
+ if (cfg->fc_scope > RT_SCOPE_HOST)
+ goto err_inval;
+
+ if (cfg->fc_scope == RT_SCOPE_HOST) {
+ struct fib_nh *nh = fi->fib_nh;
+
+ /* Local address is added. */
+ if (nhs != 1 || nh->nh_gw)
+ goto err_inval;
+ nh->nh_scope = RT_SCOPE_NOWHERE;
+ nh->nh_dev = dev_get_by_index(net, fi->fib_nh->nh_oif);
+ err = -ENODEV;
+ if (!nh->nh_dev)
+ goto failure;
+ } else {
+ change_nexthops(fi) {
+ err = fib_check_nh(cfg, fi, nexthop_nh);
+ if (err != 0)
+ goto failure;
+ } endfor_nexthops(fi)
+ }
+
+ if (fi->fib_prefsrc) {
+ if (cfg->fc_type != RTN_LOCAL || !cfg->fc_dst ||
+ fi->fib_prefsrc != cfg->fc_dst)
+ if (inet_addr_type(net, fi->fib_prefsrc) != RTN_LOCAL)
+ goto err_inval;
+ }
+
+ change_nexthops(fi) {
+ fib_info_update_nh_saddr(net, nexthop_nh);
+ } endfor_nexthops(fi)
+
+link_it:
+ ofi = fib_find_info(fi);
+ if (ofi) {
+ fi->fib_dead = 1;
+ free_fib_info(fi);
+ ofi->fib_treeref++;
+ return ofi;
+ }
+
+ fi->fib_treeref++;
+ atomic_inc(&fi->fib_clntref);
+ spin_lock_bh(&fib_info_lock);
+ hlist_add_head(&fi->fib_hash,
+ &fib_info_hash[fib_info_hashfn(fi)]);
+ if (fi->fib_prefsrc) {
+ struct hlist_head *head;
+
+ head = &fib_info_laddrhash[fib_laddr_hashfn(fi->fib_prefsrc)];
+ hlist_add_head(&fi->fib_lhash, head);
+ }
+ change_nexthops(fi) {
+ struct hlist_head *head;
+ unsigned int hash;
+
+ if (!nexthop_nh->nh_dev)
+ continue;
+ hash = fib_devindex_hashfn(nexthop_nh->nh_dev->ifindex);
+ head = &fib_info_devhash[hash];
+ hlist_add_head(&nexthop_nh->nh_hash, head);
+ } endfor_nexthops(fi)
+ spin_unlock_bh(&fib_info_lock);
+ return fi;
+
+err_inval:
+ err = -EINVAL;
+
+failure:
+ if (fi) {
+ fi->fib_dead = 1;
+ free_fib_info(fi);
+ }
+
+ return ERR_PTR(err);
+}
+
+int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
+ u32 tb_id, u8 type, __be32 dst, int dst_len, u8 tos,
+ struct fib_info *fi, unsigned int flags)
+{
+ struct nlmsghdr *nlh;
+ struct rtmsg *rtm;
+
+ nlh = nlmsg_put(skb, portid, seq, event, sizeof(*rtm), flags);
+ if (!nlh)
+ return -EMSGSIZE;
+
+ rtm = nlmsg_data(nlh);
+ rtm->rtm_family = AF_INET;
+ rtm->rtm_dst_len = dst_len;
+ rtm->rtm_src_len = 0;
+ rtm->rtm_tos = tos;
+ if (tb_id < 256)
+ rtm->rtm_table = tb_id;
+ else
+ rtm->rtm_table = RT_TABLE_COMPAT;
+ if (nla_put_u32(skb, RTA_TABLE, tb_id))
+ goto nla_put_failure;
+ rtm->rtm_type = type;
+ rtm->rtm_flags = fi->fib_flags;
+ rtm->rtm_scope = fi->fib_scope;
+ rtm->rtm_protocol = fi->fib_protocol;
+
+ if (rtm->rtm_dst_len &&
+ nla_put_in_addr(skb, RTA_DST, dst))
+ goto nla_put_failure;
+ if (fi->fib_priority &&
+ nla_put_u32(skb, RTA_PRIORITY, fi->fib_priority))
+ goto nla_put_failure;
+ if (rtnetlink_put_metrics(skb, fi->fib_metrics) < 0)
+ goto nla_put_failure;
+
+ if (fi->fib_prefsrc &&
+ nla_put_in_addr(skb, RTA_PREFSRC, fi->fib_prefsrc))
+ goto nla_put_failure;
+ if (fi->fib_nhs == 1) {
+ if (fi->fib_nh->nh_gw &&
+ nla_put_in_addr(skb, RTA_GATEWAY, fi->fib_nh->nh_gw))
+ goto nla_put_failure;
+ if (fi->fib_nh->nh_oif &&
+ nla_put_u32(skb, RTA_OIF, fi->fib_nh->nh_oif))
+ goto nla_put_failure;
+#ifdef CONFIG_IP_ROUTE_CLASSID
+ if (fi->fib_nh[0].nh_tclassid &&
+ nla_put_u32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid))
+ goto nla_put_failure;
+#endif
+ }
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+ if (fi->fib_nhs > 1) {
+ struct rtnexthop *rtnh;
+ struct nlattr *mp;
+
+ mp = nla_nest_start(skb, RTA_MULTIPATH);
+ if (!mp)
+ goto nla_put_failure;
+
+ for_nexthops(fi) {
+ rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
+ if (!rtnh)
+ goto nla_put_failure;
+
+ rtnh->rtnh_flags = nh->nh_flags & 0xFF;
+ rtnh->rtnh_hops = nh->nh_weight - 1;
+ rtnh->rtnh_ifindex = nh->nh_oif;
+
+ if (nh->nh_gw &&
+ nla_put_in_addr(skb, RTA_GATEWAY, nh->nh_gw))
+ goto nla_put_failure;
+#ifdef CONFIG_IP_ROUTE_CLASSID
+ if (nh->nh_tclassid &&
+ nla_put_u32(skb, RTA_FLOW, nh->nh_tclassid))
+ goto nla_put_failure;
+#endif
+ /* length of rtnetlink header + attributes */
+ rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *) rtnh;
+ } endfor_nexthops(fi);
+
+ nla_nest_end(skb, mp);
+ }
+#endif
+ nlmsg_end(skb, nlh);
+ return 0;
+
+nla_put_failure:
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+}
+
+/*
+ * Update FIB if:
+ * - local address disappeared -> we must delete all the entries
+ * referring to it.
+ * - device went down -> we must shutdown all nexthops going via it.
+ */
+int fib_sync_down_addr(struct net *net, __be32 local)
+{
+ int ret = 0;
+ unsigned int hash = fib_laddr_hashfn(local);
+ struct hlist_head *head = &fib_info_laddrhash[hash];
+ struct fib_info *fi;
+
+ if (!fib_info_laddrhash || local == 0)
+ return 0;
+
+ hlist_for_each_entry(fi, head, fib_lhash) {
+ if (!net_eq(fi->fib_net, net))
+ continue;
+ if (fi->fib_prefsrc == local) {
+ fi->fib_flags |= RTNH_F_DEAD;
+ ret++;
+ }
+ }
+ return ret;
+}
+
+int fib_sync_down_dev(struct net_device *dev, int force)
+{
+ int ret = 0;
+ int scope = RT_SCOPE_NOWHERE;
+ struct fib_info *prev_fi = NULL;
+ unsigned int hash = fib_devindex_hashfn(dev->ifindex);
+ struct hlist_head *head = &fib_info_devhash[hash];
+ struct fib_nh *nh;
+
+ if (force)
+ scope = -1;
+
+ hlist_for_each_entry(nh, head, nh_hash) {
+ struct fib_info *fi = nh->nh_parent;
+ int dead;
+
+ BUG_ON(!fi->fib_nhs);
+ if (nh->nh_dev != dev || fi == prev_fi)
+ continue;
+ prev_fi = fi;
+ dead = 0;
+ change_nexthops(fi) {
+ if (nexthop_nh->nh_flags & RTNH_F_DEAD)
+ dead++;
+ else if (nexthop_nh->nh_dev == dev &&
+ nexthop_nh->nh_scope != scope) {
+ nexthop_nh->nh_flags |= RTNH_F_DEAD;
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+ spin_lock_bh(&fib_multipath_lock);
+ fi->fib_power -= nexthop_nh->nh_power;
+ nexthop_nh->nh_power = 0;
+ spin_unlock_bh(&fib_multipath_lock);
+#endif
+ dead++;
+ }
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+ if (force > 1 && nexthop_nh->nh_dev == dev) {
+ dead = fi->fib_nhs;
+ break;
+ }
+#endif
+ } endfor_nexthops(fi)
+ if (dead == fi->fib_nhs) {
+ fi->fib_flags |= RTNH_F_DEAD;
+ ret++;
+ }
+ }
+
+ return ret;
+}
+
+/* Must be invoked inside of an RCU protected region. */
+void fib_select_default(struct fib_result *res)
+{
+ struct fib_info *fi = NULL, *last_resort = NULL;
+ struct hlist_head *fa_head = res->fa_head;
+ struct fib_table *tb = res->table;
+ int order = -1, last_idx = -1;
+ struct fib_alias *fa;
+
+ hlist_for_each_entry_rcu(fa, fa_head, fa_list) {
+ struct fib_info *next_fi = fa->fa_info;
+
+ if (next_fi->fib_scope != res->scope ||
+ fa->fa_type != RTN_UNICAST)
+ continue;
+
+ if (next_fi->fib_priority > res->fi->fib_priority)
+ break;
+ if (!next_fi->fib_nh[0].nh_gw ||
+ next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
+ continue;
+
+ fib_alias_accessed(fa);
+
+ if (!fi) {
+ if (next_fi != res->fi)
+ break;
+ } else if (!fib_detect_death(fi, order, &last_resort,
+ &last_idx, tb->tb_default)) {
+ fib_result_assign(res, fi);
+ tb->tb_default = order;
+ goto out;
+ }
+ fi = next_fi;
+ order++;
+ }
+
+ if (order <= 0 || !fi) {
+ tb->tb_default = -1;
+ goto out;
+ }
+
+ if (!fib_detect_death(fi, order, &last_resort, &last_idx,
+ tb->tb_default)) {
+ fib_result_assign(res, fi);
+ tb->tb_default = order;
+ goto out;
+ }
+
+ if (last_idx >= 0)
+ fib_result_assign(res, last_resort);
+ tb->tb_default = last_idx;
+out:
+ return;
+}
+
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+
+/*
+ * Dead device goes up. We wake up dead nexthops.
+ * It takes sense only on multipath routes.
+ */
+int fib_sync_up(struct net_device *dev)
+{
+ struct fib_info *prev_fi;
+ unsigned int hash;
+ struct hlist_head *head;
+ struct fib_nh *nh;
+ int ret;
+
+ if (!(dev->flags & IFF_UP))
+ return 0;
+
+ prev_fi = NULL;
+ hash = fib_devindex_hashfn(dev->ifindex);
+ head = &fib_info_devhash[hash];
+ ret = 0;
+
+ hlist_for_each_entry(nh, head, nh_hash) {
+ struct fib_info *fi = nh->nh_parent;
+ int alive;
+
+ BUG_ON(!fi->fib_nhs);
+ if (nh->nh_dev != dev || fi == prev_fi)
+ continue;
+
+ prev_fi = fi;
+ alive = 0;
+ change_nexthops(fi) {
+ if (!(nexthop_nh->nh_flags & RTNH_F_DEAD)) {
+ alive++;
+ continue;
+ }
+ if (!nexthop_nh->nh_dev ||
+ !(nexthop_nh->nh_dev->flags & IFF_UP))
+ continue;
+ if (nexthop_nh->nh_dev != dev ||
+ !__in_dev_get_rtnl(dev))
+ continue;
+ alive++;
+ spin_lock_bh(&fib_multipath_lock);
+ nexthop_nh->nh_power = 0;
+ nexthop_nh->nh_flags &= ~RTNH_F_DEAD;
+ spin_unlock_bh(&fib_multipath_lock);
+ } endfor_nexthops(fi)
+
+ if (alive > 0) {
+ fi->fib_flags &= ~RTNH_F_DEAD;
+ ret++;
+ }
+ }
+
+ return ret;
+}
+
+/*
+ * The algorithm is suboptimal, but it provides really
+ * fair weighted route distribution.
+ */
+void fib_select_multipath(struct fib_result *res)
+{
+ struct fib_info *fi = res->fi;
+ int w;
+
+ spin_lock_bh(&fib_multipath_lock);
+ if (fi->fib_power <= 0) {
+ int power = 0;
+ change_nexthops(fi) {
+ if (!(nexthop_nh->nh_flags & RTNH_F_DEAD)) {
+ power += nexthop_nh->nh_weight;
+ nexthop_nh->nh_power = nexthop_nh->nh_weight;
+ }
+ } endfor_nexthops(fi);
+ fi->fib_power = power;
+ if (power <= 0) {
+ spin_unlock_bh(&fib_multipath_lock);
+ /* Race condition: route has just become dead. */
+ res->nh_sel = 0;
+ return;
+ }
+ }
+
+
+ /* w should be random number [0..fi->fib_power-1],
+ * it is pretty bad approximation.
+ */
+
+ w = jiffies % fi->fib_power;
+
+ change_nexthops(fi) {
+ if (!(nexthop_nh->nh_flags & RTNH_F_DEAD) &&
+ nexthop_nh->nh_power) {
+ w -= nexthop_nh->nh_power;
+ if (w <= 0) {
+ nexthop_nh->nh_power--;
+ fi->fib_power--;
+ res->nh_sel = nhsel;
+ spin_unlock_bh(&fib_multipath_lock);
+ return;
+ }
+ }
+ } endfor_nexthops(fi);
+
+ /* Race condition: route has just become dead. */
+ res->nh_sel = 0;
+ spin_unlock_bh(&fib_multipath_lock);
+}
+#endif
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
new file mode 100644
index 000000000..09b62e17d
--- /dev/null
+++ b/net/ipv4/fib_trie.c
@@ -0,0 +1,2659 @@
+/*
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
+ * & Swedish University of Agricultural Sciences.
+ *
+ * Jens Laas <jens.laas@data.slu.se> Swedish University of
+ * Agricultural Sciences.
+ *
+ * Hans Liss <hans.liss@its.uu.se> Uppsala Universitet
+ *
+ * This work is based on the LPC-trie which is originally described in:
+ *
+ * An experimental study of compression methods for dynamic tries
+ * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
+ * http://www.csc.kth.se/~snilsson/software/dyntrie2/
+ *
+ *
+ * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
+ * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
+ *
+ *
+ * Code from fib_hash has been reused which includes the following header:
+ *
+ *
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * IPv4 FIB: lookup engine and maintenance routines.
+ *
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Substantial contributions to this work comes from:
+ *
+ * David S. Miller, <davem@davemloft.net>
+ * Stephen Hemminger <shemminger@osdl.org>
+ * Paul E. McKenney <paulmck@us.ibm.com>
+ * Patrick McHardy <kaber@trash.net>
+ */
+
+#define VERSION "0.409"
+
+#include <asm/uaccess.h>
+#include <linux/bitops.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/errno.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/inetdevice.h>
+#include <linux/netdevice.h>
+#include <linux/if_arp.h>
+#include <linux/proc_fs.h>
+#include <linux/rcupdate.h>
+#include <linux/skbuff.h>
+#include <linux/netlink.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+#include <net/net_namespace.h>
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <net/route.h>
+#include <net/tcp.h>
+#include <net/sock.h>
+#include <net/ip_fib.h>
+#include <net/switchdev.h>
+#include "fib_lookup.h"
+
+#define MAX_STAT_DEPTH 32
+
+#define KEYLENGTH (8*sizeof(t_key))
+#define KEY_MAX ((t_key)~0)
+
+typedef unsigned int t_key;
+
+#define IS_TRIE(n) ((n)->pos >= KEYLENGTH)
+#define IS_TNODE(n) ((n)->bits)
+#define IS_LEAF(n) (!(n)->bits)
+
+struct key_vector {
+ t_key key;
+ unsigned char pos; /* 2log(KEYLENGTH) bits needed */
+ unsigned char bits; /* 2log(KEYLENGTH) bits needed */
+ unsigned char slen;
+ union {
+ /* This list pointer if valid if (pos | bits) == 0 (LEAF) */
+ struct hlist_head leaf;
+ /* This array is valid if (pos | bits) > 0 (TNODE) */
+ struct key_vector __rcu *tnode[0];
+ };
+};
+
+struct tnode {
+ struct rcu_head rcu;
+ t_key empty_children; /* KEYLENGTH bits needed */
+ t_key full_children; /* KEYLENGTH bits needed */
+ struct key_vector __rcu *parent;
+ struct key_vector kv[1];
+#define tn_bits kv[0].bits
+};
+
+#define TNODE_SIZE(n) offsetof(struct tnode, kv[0].tnode[n])
+#define LEAF_SIZE TNODE_SIZE(1)
+
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+struct trie_use_stats {
+ unsigned int gets;
+ unsigned int backtrack;
+ unsigned int semantic_match_passed;
+ unsigned int semantic_match_miss;
+ unsigned int null_node_hit;
+ unsigned int resize_node_skipped;
+};
+#endif
+
+struct trie_stat {
+ unsigned int totdepth;
+ unsigned int maxdepth;
+ unsigned int tnodes;
+ unsigned int leaves;
+ unsigned int nullpointers;
+ unsigned int prefixes;
+ unsigned int nodesizes[MAX_STAT_DEPTH];
+};
+
+struct trie {
+ struct key_vector kv[1];
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+ struct trie_use_stats __percpu *stats;
+#endif
+};
+
+static struct key_vector *resize(struct trie *t, struct key_vector *tn);
+static size_t tnode_free_size;
+
+/*
+ * synchronize_rcu after call_rcu for that many pages; it should be especially
+ * useful before resizing the root node with PREEMPT_NONE configs; the value was
+ * obtained experimentally, aiming to avoid visible slowdown.
+ */
+static const int sync_pages = 128;
+
+static struct kmem_cache *fn_alias_kmem __read_mostly;
+static struct kmem_cache *trie_leaf_kmem __read_mostly;
+
+static inline struct tnode *tn_info(struct key_vector *kv)
+{
+ return container_of(kv, struct tnode, kv[0]);
+}
+
+/* caller must hold RTNL */
+#define node_parent(tn) rtnl_dereference(tn_info(tn)->parent)
+#define get_child(tn, i) rtnl_dereference((tn)->tnode[i])
+
+/* caller must hold RCU read lock or RTNL */
+#define node_parent_rcu(tn) rcu_dereference_rtnl(tn_info(tn)->parent)
+#define get_child_rcu(tn, i) rcu_dereference_rtnl((tn)->tnode[i])
+
+/* wrapper for rcu_assign_pointer */
+static inline void node_set_parent(struct key_vector *n, struct key_vector *tp)
+{
+ if (n)
+ rcu_assign_pointer(tn_info(n)->parent, tp);
+}
+
+#define NODE_INIT_PARENT(n, p) RCU_INIT_POINTER(tn_info(n)->parent, p)
+
+/* This provides us with the number of children in this node, in the case of a
+ * leaf this will return 0 meaning none of the children are accessible.
+ */
+static inline unsigned long child_length(const struct key_vector *tn)
+{
+ return (1ul << tn->bits) & ~(1ul);
+}
+
+#define get_cindex(key, kv) (((key) ^ (kv)->key) >> (kv)->pos)
+
+static inline unsigned long get_index(t_key key, struct key_vector *kv)
+{
+ unsigned long index = key ^ kv->key;
+
+ if ((BITS_PER_LONG <= KEYLENGTH) && (KEYLENGTH == kv->pos))
+ return 0;
+
+ return index >> kv->pos;
+}
+
+/* To understand this stuff, an understanding of keys and all their bits is
+ * necessary. Every node in the trie has a key associated with it, but not
+ * all of the bits in that key are significant.
+ *
+ * Consider a node 'n' and its parent 'tp'.
+ *
+ * If n is a leaf, every bit in its key is significant. Its presence is
+ * necessitated by path compression, since during a tree traversal (when
+ * searching for a leaf - unless we are doing an insertion) we will completely
+ * ignore all skipped bits we encounter. Thus we need to verify, at the end of
+ * a potentially successful search, that we have indeed been walking the
+ * correct key path.
+ *
+ * Note that we can never "miss" the correct key in the tree if present by
+ * following the wrong path. Path compression ensures that segments of the key
+ * that are the same for all keys with a given prefix are skipped, but the
+ * skipped part *is* identical for each node in the subtrie below the skipped
+ * bit! trie_insert() in this implementation takes care of that.
+ *
+ * if n is an internal node - a 'tnode' here, the various parts of its key
+ * have many different meanings.
+ *
+ * Example:
+ * _________________________________________________________________
+ * | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
+ * -----------------------------------------------------------------
+ * 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16
+ *
+ * _________________________________________________________________
+ * | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
+ * -----------------------------------------------------------------
+ * 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0
+ *
+ * tp->pos = 22
+ * tp->bits = 3
+ * n->pos = 13
+ * n->bits = 4
+ *
+ * First, let's just ignore the bits that come before the parent tp, that is
+ * the bits from (tp->pos + tp->bits) to 31. They are *known* but at this
+ * point we do not use them for anything.
+ *
+ * The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
+ * index into the parent's child array. That is, they will be used to find
+ * 'n' among tp's children.
+ *
+ * The bits from (n->pos + n->bits) to (tn->pos - 1) - "S" - are skipped bits
+ * for the node n.
+ *
+ * All the bits we have seen so far are significant to the node n. The rest
+ * of the bits are really not needed or indeed known in n->key.
+ *
+ * The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
+ * n's child array, and will of course be different for each child.
+ *
+ * The rest of the bits, from 0 to (n->pos + n->bits), are completely unknown
+ * at this point.
+ */
+
+static const int halve_threshold = 25;
+static const int inflate_threshold = 50;
+static const int halve_threshold_root = 15;
+static const int inflate_threshold_root = 30;
+
+static void __alias_free_mem(struct rcu_head *head)
+{
+ struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
+ kmem_cache_free(fn_alias_kmem, fa);
+}
+
+static inline void alias_free_mem_rcu(struct fib_alias *fa)
+{
+ call_rcu(&fa->rcu, __alias_free_mem);
+}
+
+#define TNODE_KMALLOC_MAX \
+ ilog2((PAGE_SIZE - TNODE_SIZE(0)) / sizeof(struct key_vector *))
+#define TNODE_VMALLOC_MAX \
+ ilog2((SIZE_MAX - TNODE_SIZE(0)) / sizeof(struct key_vector *))
+
+static void __node_free_rcu(struct rcu_head *head)
+{
+ struct tnode *n = container_of(head, struct tnode, rcu);
+
+ if (!n->tn_bits)
+ kmem_cache_free(trie_leaf_kmem, n);
+ else if (n->tn_bits <= TNODE_KMALLOC_MAX)
+ kfree(n);
+ else
+ vfree(n);
+}
+
+#define node_free(n) call_rcu(&tn_info(n)->rcu, __node_free_rcu)
+
+static struct tnode *tnode_alloc(int bits)
+{
+ size_t size;
+
+ /* verify bits is within bounds */
+ if (bits > TNODE_VMALLOC_MAX)
+ return NULL;
+
+ /* determine size and verify it is non-zero and didn't overflow */
+ size = TNODE_SIZE(1ul << bits);
+
+ if (size <= PAGE_SIZE)
+ return kzalloc(size, GFP_KERNEL);
+ else
+ return vzalloc(size);
+}
+
+static inline void empty_child_inc(struct key_vector *n)
+{
+ ++tn_info(n)->empty_children ? : ++tn_info(n)->full_children;
+}
+
+static inline void empty_child_dec(struct key_vector *n)
+{
+ tn_info(n)->empty_children-- ? : tn_info(n)->full_children--;
+}
+
+static struct key_vector *leaf_new(t_key key, struct fib_alias *fa)
+{
+ struct tnode *kv = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
+ struct key_vector *l = kv->kv;
+
+ if (!kv)
+ return NULL;
+
+ /* initialize key vector */
+ l->key = key;
+ l->pos = 0;
+ l->bits = 0;
+ l->slen = fa->fa_slen;
+
+ /* link leaf to fib alias */
+ INIT_HLIST_HEAD(&l->leaf);
+ hlist_add_head(&fa->fa_list, &l->leaf);
+
+ return l;
+}
+
+static struct key_vector *tnode_new(t_key key, int pos, int bits)
+{
+ struct tnode *tnode = tnode_alloc(bits);
+ unsigned int shift = pos + bits;
+ struct key_vector *tn = tnode->kv;
+
+ /* verify bits and pos their msb bits clear and values are valid */
+ BUG_ON(!bits || (shift > KEYLENGTH));
+
+ pr_debug("AT %p s=%zu %zu\n", tnode, TNODE_SIZE(0),
+ sizeof(struct key_vector *) << bits);
+
+ if (!tnode)
+ return NULL;
+
+ if (bits == KEYLENGTH)
+ tnode->full_children = 1;
+ else
+ tnode->empty_children = 1ul << bits;
+
+ tn->key = (shift < KEYLENGTH) ? (key >> shift) << shift : 0;
+ tn->pos = pos;
+ tn->bits = bits;
+ tn->slen = pos;
+
+ return tn;
+}
+
+/* Check whether a tnode 'n' is "full", i.e. it is an internal node
+ * and no bits are skipped. See discussion in dyntree paper p. 6
+ */
+static inline int tnode_full(struct key_vector *tn, struct key_vector *n)
+{
+ return n && ((n->pos + n->bits) == tn->pos) && IS_TNODE(n);
+}
+
+/* Add a child at position i overwriting the old value.
+ * Update the value of full_children and empty_children.
+ */
+static void put_child(struct key_vector *tn, unsigned long i,
+ struct key_vector *n)
+{
+ struct key_vector *chi = get_child(tn, i);
+ int isfull, wasfull;
+
+ BUG_ON(i >= child_length(tn));
+
+ /* update emptyChildren, overflow into fullChildren */
+ if (!n && chi)
+ empty_child_inc(tn);
+ if (n && !chi)
+ empty_child_dec(tn);
+
+ /* update fullChildren */
+ wasfull = tnode_full(tn, chi);
+ isfull = tnode_full(tn, n);
+
+ if (wasfull && !isfull)
+ tn_info(tn)->full_children--;
+ else if (!wasfull && isfull)
+ tn_info(tn)->full_children++;
+
+ if (n && (tn->slen < n->slen))
+ tn->slen = n->slen;
+
+ rcu_assign_pointer(tn->tnode[i], n);
+}
+
+static void update_children(struct key_vector *tn)
+{
+ unsigned long i;
+
+ /* update all of the child parent pointers */
+ for (i = child_length(tn); i;) {
+ struct key_vector *inode = get_child(tn, --i);
+
+ if (!inode)
+ continue;
+
+ /* Either update the children of a tnode that
+ * already belongs to us or update the child
+ * to point to ourselves.
+ */
+ if (node_parent(inode) == tn)
+ update_children(inode);
+ else
+ node_set_parent(inode, tn);
+ }
+}
+
+static inline void put_child_root(struct key_vector *tp, t_key key,
+ struct key_vector *n)
+{
+ if (IS_TRIE(tp))
+ rcu_assign_pointer(tp->tnode[0], n);
+ else
+ put_child(tp, get_index(key, tp), n);
+}
+
+static inline void tnode_free_init(struct key_vector *tn)
+{
+ tn_info(tn)->rcu.next = NULL;
+}
+
+static inline void tnode_free_append(struct key_vector *tn,
+ struct key_vector *n)
+{
+ tn_info(n)->rcu.next = tn_info(tn)->rcu.next;
+ tn_info(tn)->rcu.next = &tn_info(n)->rcu;
+}
+
+static void tnode_free(struct key_vector *tn)
+{
+ struct callback_head *head = &tn_info(tn)->rcu;
+
+ while (head) {
+ head = head->next;
+ tnode_free_size += TNODE_SIZE(1ul << tn->bits);
+ node_free(tn);
+
+ tn = container_of(head, struct tnode, rcu)->kv;
+ }
+
+ if (tnode_free_size >= PAGE_SIZE * sync_pages) {
+ tnode_free_size = 0;
+ synchronize_rcu();
+ }
+}
+
+static struct key_vector *replace(struct trie *t,
+ struct key_vector *oldtnode,
+ struct key_vector *tn)
+{
+ struct key_vector *tp = node_parent(oldtnode);
+ unsigned long i;
+
+ /* setup the parent pointer out of and back into this node */
+ NODE_INIT_PARENT(tn, tp);
+ put_child_root(tp, tn->key, tn);
+
+ /* update all of the child parent pointers */
+ update_children(tn);
+
+ /* all pointers should be clean so we are done */
+ tnode_free(oldtnode);
+
+ /* resize children now that oldtnode is freed */
+ for (i = child_length(tn); i;) {
+ struct key_vector *inode = get_child(tn, --i);
+
+ /* resize child node */
+ if (tnode_full(tn, inode))
+ tn = resize(t, inode);
+ }
+
+ return tp;
+}
+
+static struct key_vector *inflate(struct trie *t,
+ struct key_vector *oldtnode)
+{
+ struct key_vector *tn;
+ unsigned long i;
+ t_key m;
+
+ pr_debug("In inflate\n");
+
+ tn = tnode_new(oldtnode->key, oldtnode->pos - 1, oldtnode->bits + 1);
+ if (!tn)
+ goto notnode;
+
+ /* prepare oldtnode to be freed */
+ tnode_free_init(oldtnode);
+
+ /* Assemble all of the pointers in our cluster, in this case that
+ * represents all of the pointers out of our allocated nodes that
+ * point to existing tnodes and the links between our allocated
+ * nodes.
+ */
+ for (i = child_length(oldtnode), m = 1u << tn->pos; i;) {
+ struct key_vector *inode = get_child(oldtnode, --i);
+ struct key_vector *node0, *node1;
+ unsigned long j, k;
+
+ /* An empty child */
+ if (!inode)
+ continue;
+
+ /* A leaf or an internal node with skipped bits */
+ if (!tnode_full(oldtnode, inode)) {
+ put_child(tn, get_index(inode->key, tn), inode);
+ continue;
+ }
+
+ /* drop the node in the old tnode free list */
+ tnode_free_append(oldtnode, inode);
+
+ /* An internal node with two children */
+ if (inode->bits == 1) {
+ put_child(tn, 2 * i + 1, get_child(inode, 1));
+ put_child(tn, 2 * i, get_child(inode, 0));
+ continue;
+ }
+
+ /* We will replace this node 'inode' with two new
+ * ones, 'node0' and 'node1', each with half of the
+ * original children. The two new nodes will have
+ * a position one bit further down the key and this
+ * means that the "significant" part of their keys
+ * (see the discussion near the top of this file)
+ * will differ by one bit, which will be "0" in
+ * node0's key and "1" in node1's key. Since we are
+ * moving the key position by one step, the bit that
+ * we are moving away from - the bit at position
+ * (tn->pos) - is the one that will differ between
+ * node0 and node1. So... we synthesize that bit in the
+ * two new keys.
+ */
+ node1 = tnode_new(inode->key | m, inode->pos, inode->bits - 1);
+ if (!node1)
+ goto nomem;
+ node0 = tnode_new(inode->key, inode->pos, inode->bits - 1);
+
+ tnode_free_append(tn, node1);
+ if (!node0)
+ goto nomem;
+ tnode_free_append(tn, node0);
+
+ /* populate child pointers in new nodes */
+ for (k = child_length(inode), j = k / 2; j;) {
+ put_child(node1, --j, get_child(inode, --k));
+ put_child(node0, j, get_child(inode, j));
+ put_child(node1, --j, get_child(inode, --k));
+ put_child(node0, j, get_child(inode, j));
+ }
+
+ /* link new nodes to parent */
+ NODE_INIT_PARENT(node1, tn);
+ NODE_INIT_PARENT(node0, tn);
+
+ /* link parent to nodes */
+ put_child(tn, 2 * i + 1, node1);
+ put_child(tn, 2 * i, node0);
+ }
+
+ /* setup the parent pointers into and out of this node */
+ return replace(t, oldtnode, tn);
+nomem:
+ /* all pointers should be clean so we are done */
+ tnode_free(tn);
+notnode:
+ return NULL;
+}
+
+static struct key_vector *halve(struct trie *t,
+ struct key_vector *oldtnode)
+{
+ struct key_vector *tn;
+ unsigned long i;
+
+ pr_debug("In halve\n");
+
+ tn = tnode_new(oldtnode->key, oldtnode->pos + 1, oldtnode->bits - 1);
+ if (!tn)
+ goto notnode;
+
+ /* prepare oldtnode to be freed */
+ tnode_free_init(oldtnode);
+
+ /* Assemble all of the pointers in our cluster, in this case that
+ * represents all of the pointers out of our allocated nodes that
+ * point to existing tnodes and the links between our allocated
+ * nodes.
+ */
+ for (i = child_length(oldtnode); i;) {
+ struct key_vector *node1 = get_child(oldtnode, --i);
+ struct key_vector *node0 = get_child(oldtnode, --i);
+ struct key_vector *inode;
+
+ /* At least one of the children is empty */
+ if (!node1 || !node0) {
+ put_child(tn, i / 2, node1 ? : node0);
+ continue;
+ }
+
+ /* Two nonempty children */
+ inode = tnode_new(node0->key, oldtnode->pos, 1);
+ if (!inode)
+ goto nomem;
+ tnode_free_append(tn, inode);
+
+ /* initialize pointers out of node */
+ put_child(inode, 1, node1);
+ put_child(inode, 0, node0);
+ NODE_INIT_PARENT(inode, tn);
+
+ /* link parent to node */
+ put_child(tn, i / 2, inode);
+ }
+
+ /* setup the parent pointers into and out of this node */
+ return replace(t, oldtnode, tn);
+nomem:
+ /* all pointers should be clean so we are done */
+ tnode_free(tn);
+notnode:
+ return NULL;
+}
+
+static struct key_vector *collapse(struct trie *t,
+ struct key_vector *oldtnode)
+{
+ struct key_vector *n, *tp;
+ unsigned long i;
+
+ /* scan the tnode looking for that one child that might still exist */
+ for (n = NULL, i = child_length(oldtnode); !n && i;)
+ n = get_child(oldtnode, --i);
+
+ /* compress one level */
+ tp = node_parent(oldtnode);
+ put_child_root(tp, oldtnode->key, n);
+ node_set_parent(n, tp);
+
+ /* drop dead node */
+ node_free(oldtnode);
+
+ return tp;
+}
+
+static unsigned char update_suffix(struct key_vector *tn)
+{
+ unsigned char slen = tn->pos;
+ unsigned long stride, i;
+
+ /* search though the list of children looking for nodes that might
+ * have a suffix greater than the one we currently have. This is
+ * why we start with a stride of 2 since a stride of 1 would
+ * represent the nodes with suffix length equal to tn->pos
+ */
+ for (i = 0, stride = 0x2ul ; i < child_length(tn); i += stride) {
+ struct key_vector *n = get_child(tn, i);
+
+ if (!n || (n->slen <= slen))
+ continue;
+
+ /* update stride and slen based on new value */
+ stride <<= (n->slen - slen);
+ slen = n->slen;
+ i &= ~(stride - 1);
+
+ /* if slen covers all but the last bit we can stop here
+ * there will be nothing longer than that since only node
+ * 0 and 1 << (bits - 1) could have that as their suffix
+ * length.
+ */
+ if ((slen + 1) >= (tn->pos + tn->bits))
+ break;
+ }
+
+ tn->slen = slen;
+
+ return slen;
+}
+
+/* From "Implementing a dynamic compressed trie" by Stefan Nilsson of
+ * the Helsinki University of Technology and Matti Tikkanen of Nokia
+ * Telecommunications, page 6:
+ * "A node is doubled if the ratio of non-empty children to all
+ * children in the *doubled* node is at least 'high'."
+ *
+ * 'high' in this instance is the variable 'inflate_threshold'. It
+ * is expressed as a percentage, so we multiply it with
+ * child_length() and instead of multiplying by 2 (since the
+ * child array will be doubled by inflate()) and multiplying
+ * the left-hand side by 100 (to handle the percentage thing) we
+ * multiply the left-hand side by 50.
+ *
+ * The left-hand side may look a bit weird: child_length(tn)
+ * - tn->empty_children is of course the number of non-null children
+ * in the current node. tn->full_children is the number of "full"
+ * children, that is non-null tnodes with a skip value of 0.
+ * All of those will be doubled in the resulting inflated tnode, so
+ * we just count them one extra time here.
+ *
+ * A clearer way to write this would be:
+ *
+ * to_be_doubled = tn->full_children;
+ * not_to_be_doubled = child_length(tn) - tn->empty_children -
+ * tn->full_children;
+ *
+ * new_child_length = child_length(tn) * 2;
+ *
+ * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
+ * new_child_length;
+ * if (new_fill_factor >= inflate_threshold)
+ *
+ * ...and so on, tho it would mess up the while () loop.
+ *
+ * anyway,
+ * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
+ * inflate_threshold
+ *
+ * avoid a division:
+ * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
+ * inflate_threshold * new_child_length
+ *
+ * expand not_to_be_doubled and to_be_doubled, and shorten:
+ * 100 * (child_length(tn) - tn->empty_children +
+ * tn->full_children) >= inflate_threshold * new_child_length
+ *
+ * expand new_child_length:
+ * 100 * (child_length(tn) - tn->empty_children +
+ * tn->full_children) >=
+ * inflate_threshold * child_length(tn) * 2
+ *
+ * shorten again:
+ * 50 * (tn->full_children + child_length(tn) -
+ * tn->empty_children) >= inflate_threshold *
+ * child_length(tn)
+ *
+ */
+static inline bool should_inflate(struct key_vector *tp, struct key_vector *tn)
+{
+ unsigned long used = child_length(tn);
+ unsigned long threshold = used;
+
+ /* Keep root node larger */
+ threshold *= IS_TRIE(tp) ? inflate_threshold_root : inflate_threshold;
+ used -= tn_info(tn)->empty_children;
+ used += tn_info(tn)->full_children;
+
+ /* if bits == KEYLENGTH then pos = 0, and will fail below */
+
+ return (used > 1) && tn->pos && ((50 * used) >= threshold);
+}
+
+static inline bool should_halve(struct key_vector *tp, struct key_vector *tn)
+{
+ unsigned long used = child_length(tn);
+ unsigned long threshold = used;
+
+ /* Keep root node larger */
+ threshold *= IS_TRIE(tp) ? halve_threshold_root : halve_threshold;
+ used -= tn_info(tn)->empty_children;
+
+ /* if bits == KEYLENGTH then used = 100% on wrap, and will fail below */
+
+ return (used > 1) && (tn->bits > 1) && ((100 * used) < threshold);
+}
+
+static inline bool should_collapse(struct key_vector *tn)
+{
+ unsigned long used = child_length(tn);
+
+ used -= tn_info(tn)->empty_children;
+
+ /* account for bits == KEYLENGTH case */
+ if ((tn->bits == KEYLENGTH) && tn_info(tn)->full_children)
+ used -= KEY_MAX;
+
+ /* One child or none, time to drop us from the trie */
+ return used < 2;
+}
+
+#define MAX_WORK 10
+static struct key_vector *resize(struct trie *t, struct key_vector *tn)
+{
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+ struct trie_use_stats __percpu *stats = t->stats;
+#endif
+ struct key_vector *tp = node_parent(tn);
+ unsigned long cindex = get_index(tn->key, tp);
+ int max_work = MAX_WORK;
+
+ pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
+ tn, inflate_threshold, halve_threshold);
+
+ /* track the tnode via the pointer from the parent instead of
+ * doing it ourselves. This way we can let RCU fully do its
+ * thing without us interfering
+ */
+ BUG_ON(tn != get_child(tp, cindex));
+
+ /* Double as long as the resulting node has a number of
+ * nonempty nodes that are above the threshold.
+ */
+ while (should_inflate(tp, tn) && max_work) {
+ tp = inflate(t, tn);
+ if (!tp) {
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+ this_cpu_inc(stats->resize_node_skipped);
+#endif
+ break;
+ }
+
+ max_work--;
+ tn = get_child(tp, cindex);
+ }
+
+ /* update parent in case inflate failed */
+ tp = node_parent(tn);
+
+ /* Return if at least one inflate is run */
+ if (max_work != MAX_WORK)
+ return tp;
+
+ /* Halve as long as the number of empty children in this
+ * node is above threshold.
+ */
+ while (should_halve(tp, tn) && max_work) {
+ tp = halve(t, tn);
+ if (!tp) {
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+ this_cpu_inc(stats->resize_node_skipped);
+#endif
+ break;
+ }
+
+ max_work--;
+ tn = get_child(tp, cindex);
+ }
+
+ /* Only one child remains */
+ if (should_collapse(tn))
+ return collapse(t, tn);
+
+ /* update parent in case halve failed */
+ tp = node_parent(tn);
+
+ /* Return if at least one deflate was run */
+ if (max_work != MAX_WORK)
+ return tp;
+
+ /* push the suffix length to the parent node */
+ if (tn->slen > tn->pos) {
+ unsigned char slen = update_suffix(tn);
+
+ if (slen > tp->slen)
+ tp->slen = slen;
+ }
+
+ return tp;
+}
+
+static void leaf_pull_suffix(struct key_vector *tp, struct key_vector *l)
+{
+ while ((tp->slen > tp->pos) && (tp->slen > l->slen)) {
+ if (update_suffix(tp) > l->slen)
+ break;
+ tp = node_parent(tp);
+ }
+}
+
+static void leaf_push_suffix(struct key_vector *tn, struct key_vector *l)
+{
+ /* if this is a new leaf then tn will be NULL and we can sort
+ * out parent suffix lengths as a part of trie_rebalance
+ */
+ while (tn->slen < l->slen) {
+ tn->slen = l->slen;
+ tn = node_parent(tn);
+ }
+}
+
+/* rcu_read_lock needs to be hold by caller from readside */
+static struct key_vector *fib_find_node(struct trie *t,
+ struct key_vector **tp, u32 key)
+{
+ struct key_vector *pn, *n = t->kv;
+ unsigned long index = 0;
+
+ do {
+ pn = n;
+ n = get_child_rcu(n, index);
+
+ if (!n)
+ break;
+
+ index = get_cindex(key, n);
+
+ /* This bit of code is a bit tricky but it combines multiple
+ * checks into a single check. The prefix consists of the
+ * prefix plus zeros for the bits in the cindex. The index
+ * is the difference between the key and this value. From
+ * this we can actually derive several pieces of data.
+ * if (index >= (1ul << bits))
+ * we have a mismatch in skip bits and failed
+ * else
+ * we know the value is cindex
+ *
+ * This check is safe even if bits == KEYLENGTH due to the
+ * fact that we can only allocate a node with 32 bits if a
+ * long is greater than 32 bits.
+ */
+ if (index >= (1ul << n->bits)) {
+ n = NULL;
+ break;
+ }
+
+ /* keep searching until we find a perfect match leaf or NULL */
+ } while (IS_TNODE(n));
+
+ *tp = pn;
+
+ return n;
+}
+
+/* Return the first fib alias matching TOS with
+ * priority less than or equal to PRIO.
+ */
+static struct fib_alias *fib_find_alias(struct hlist_head *fah, u8 slen,
+ u8 tos, u32 prio, u32 tb_id)
+{
+ struct fib_alias *fa;
+
+ if (!fah)
+ return NULL;
+
+ hlist_for_each_entry(fa, fah, fa_list) {
+ if (fa->fa_slen < slen)
+ continue;
+ if (fa->fa_slen != slen)
+ break;
+ if (fa->tb_id > tb_id)
+ continue;
+ if (fa->tb_id != tb_id)
+ break;
+ if (fa->fa_tos > tos)
+ continue;
+ if (fa->fa_info->fib_priority >= prio || fa->fa_tos < tos)
+ return fa;
+ }
+
+ return NULL;
+}
+
+static void trie_rebalance(struct trie *t, struct key_vector *tn)
+{
+ while (!IS_TRIE(tn))
+ tn = resize(t, tn);
+}
+
+static int fib_insert_node(struct trie *t, struct key_vector *tp,
+ struct fib_alias *new, t_key key)
+{
+ struct key_vector *n, *l;
+
+ l = leaf_new(key, new);
+ if (!l)
+ goto noleaf;
+
+ /* retrieve child from parent node */
+ n = get_child(tp, get_index(key, tp));
+
+ /* Case 2: n is a LEAF or a TNODE and the key doesn't match.
+ *
+ * Add a new tnode here
+ * first tnode need some special handling
+ * leaves us in position for handling as case 3
+ */
+ if (n) {
+ struct key_vector *tn;
+
+ tn = tnode_new(key, __fls(key ^ n->key), 1);
+ if (!tn)
+ goto notnode;
+
+ /* initialize routes out of node */
+ NODE_INIT_PARENT(tn, tp);
+ put_child(tn, get_index(key, tn) ^ 1, n);
+
+ /* start adding routes into the node */
+ put_child_root(tp, key, tn);
+ node_set_parent(n, tn);
+
+ /* parent now has a NULL spot where the leaf can go */
+ tp = tn;
+ }
+
+ /* Case 3: n is NULL, and will just insert a new leaf */
+ NODE_INIT_PARENT(l, tp);
+ put_child_root(tp, key, l);
+ trie_rebalance(t, tp);
+
+ return 0;
+notnode:
+ node_free(l);
+noleaf:
+ return -ENOMEM;
+}
+
+static int fib_insert_alias(struct trie *t, struct key_vector *tp,
+ struct key_vector *l, struct fib_alias *new,
+ struct fib_alias *fa, t_key key)
+{
+ if (!l)
+ return fib_insert_node(t, tp, new, key);
+
+ if (fa) {
+ hlist_add_before_rcu(&new->fa_list, &fa->fa_list);
+ } else {
+ struct fib_alias *last;
+
+ hlist_for_each_entry(last, &l->leaf, fa_list) {
+ if (new->fa_slen < last->fa_slen)
+ break;
+ if ((new->fa_slen == last->fa_slen) &&
+ (new->tb_id > last->tb_id))
+ break;
+ fa = last;
+ }
+
+ if (fa)
+ hlist_add_behind_rcu(&new->fa_list, &fa->fa_list);
+ else
+ hlist_add_head_rcu(&new->fa_list, &l->leaf);
+ }
+
+ /* if we added to the tail node then we need to update slen */
+ if (l->slen < new->fa_slen) {
+ l->slen = new->fa_slen;
+ leaf_push_suffix(tp, l);
+ }
+
+ return 0;
+}
+
+/* Caller must hold RTNL. */
+int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
+{
+ struct trie *t = (struct trie *)tb->tb_data;
+ struct fib_alias *fa, *new_fa;
+ struct key_vector *l, *tp;
+ struct fib_info *fi;
+ u8 plen = cfg->fc_dst_len;
+ u8 slen = KEYLENGTH - plen;
+ u8 tos = cfg->fc_tos;
+ u32 key;
+ int err;
+
+ if (plen > KEYLENGTH)
+ return -EINVAL;
+
+ key = ntohl(cfg->fc_dst);
+
+ pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);
+
+ if ((plen < KEYLENGTH) && (key << plen))
+ return -EINVAL;
+
+ fi = fib_create_info(cfg);
+ if (IS_ERR(fi)) {
+ err = PTR_ERR(fi);
+ goto err;
+ }
+
+ l = fib_find_node(t, &tp, key);
+ fa = l ? fib_find_alias(&l->leaf, slen, tos, fi->fib_priority,
+ tb->tb_id) : NULL;
+
+ /* Now fa, if non-NULL, points to the first fib alias
+ * with the same keys [prefix,tos,priority], if such key already
+ * exists or to the node before which we will insert new one.
+ *
+ * If fa is NULL, we will need to allocate a new one and
+ * insert to the tail of the section matching the suffix length
+ * of the new alias.
+ */
+
+ if (fa && fa->fa_tos == tos &&
+ fa->fa_info->fib_priority == fi->fib_priority) {
+ struct fib_alias *fa_first, *fa_match;
+
+ err = -EEXIST;
+ if (cfg->fc_nlflags & NLM_F_EXCL)
+ goto out;
+
+ /* We have 2 goals:
+ * 1. Find exact match for type, scope, fib_info to avoid
+ * duplicate routes
+ * 2. Find next 'fa' (or head), NLM_F_APPEND inserts before it
+ */
+ fa_match = NULL;
+ fa_first = fa;
+ hlist_for_each_entry_from(fa, fa_list) {
+ if ((fa->fa_slen != slen) ||
+ (fa->tb_id != tb->tb_id) ||
+ (fa->fa_tos != tos))
+ break;
+ if (fa->fa_info->fib_priority != fi->fib_priority)
+ break;
+ if (fa->fa_type == cfg->fc_type &&
+ fa->fa_info == fi) {
+ fa_match = fa;
+ break;
+ }
+ }
+
+ if (cfg->fc_nlflags & NLM_F_REPLACE) {
+ struct fib_info *fi_drop;
+ u8 state;
+
+ fa = fa_first;
+ if (fa_match) {
+ if (fa == fa_match)
+ err = 0;
+ goto out;
+ }
+ err = -ENOBUFS;
+ new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
+ if (!new_fa)
+ goto out;
+
+ fi_drop = fa->fa_info;
+ new_fa->fa_tos = fa->fa_tos;
+ new_fa->fa_info = fi;
+ new_fa->fa_type = cfg->fc_type;
+ state = fa->fa_state;
+ new_fa->fa_state = state & ~FA_S_ACCESSED;
+ new_fa->fa_slen = fa->fa_slen;
+ new_fa->tb_id = tb->tb_id;
+
+ err = netdev_switch_fib_ipv4_add(key, plen, fi,
+ new_fa->fa_tos,
+ cfg->fc_type,
+ cfg->fc_nlflags,
+ tb->tb_id);
+ if (err) {
+ netdev_switch_fib_ipv4_abort(fi);
+ kmem_cache_free(fn_alias_kmem, new_fa);
+ goto out;
+ }
+
+ hlist_replace_rcu(&fa->fa_list, &new_fa->fa_list);
+
+ alias_free_mem_rcu(fa);
+
+ fib_release_info(fi_drop);
+ if (state & FA_S_ACCESSED)
+ rt_cache_flush(cfg->fc_nlinfo.nl_net);
+ rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
+ tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);
+
+ goto succeeded;
+ }
+ /* Error if we find a perfect match which
+ * uses the same scope, type, and nexthop
+ * information.
+ */
+ if (fa_match)
+ goto out;
+
+ if (!(cfg->fc_nlflags & NLM_F_APPEND))
+ fa = fa_first;
+ }
+ err = -ENOENT;
+ if (!(cfg->fc_nlflags & NLM_F_CREATE))
+ goto out;
+
+ err = -ENOBUFS;
+ new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
+ if (!new_fa)
+ goto out;
+
+ new_fa->fa_info = fi;
+ new_fa->fa_tos = tos;
+ new_fa->fa_type = cfg->fc_type;
+ new_fa->fa_state = 0;
+ new_fa->fa_slen = slen;
+ new_fa->tb_id = tb->tb_id;
+
+ /* (Optionally) offload fib entry to switch hardware. */
+ err = netdev_switch_fib_ipv4_add(key, plen, fi, tos,
+ cfg->fc_type,
+ cfg->fc_nlflags,
+ tb->tb_id);
+ if (err) {
+ netdev_switch_fib_ipv4_abort(fi);
+ goto out_free_new_fa;
+ }
+
+ /* Insert new entry to the list. */
+ err = fib_insert_alias(t, tp, l, new_fa, fa, key);
+ if (err)
+ goto out_sw_fib_del;
+
+ if (!plen)
+ tb->tb_num_default++;
+
+ rt_cache_flush(cfg->fc_nlinfo.nl_net);
+ rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, new_fa->tb_id,
+ &cfg->fc_nlinfo, 0);
+succeeded:
+ return 0;
+
+out_sw_fib_del:
+ netdev_switch_fib_ipv4_del(key, plen, fi, tos, cfg->fc_type, tb->tb_id);
+out_free_new_fa:
+ kmem_cache_free(fn_alias_kmem, new_fa);
+out:
+ fib_release_info(fi);
+err:
+ return err;
+}
+
+static inline t_key prefix_mismatch(t_key key, struct key_vector *n)
+{
+ t_key prefix = n->key;
+
+ return (key ^ prefix) & (prefix | -prefix);
+}
+
+/* should be called with rcu_read_lock */
+int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
+ struct fib_result *res, int fib_flags)
+{
+ struct trie *t = (struct trie *) tb->tb_data;
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+ struct trie_use_stats __percpu *stats = t->stats;
+#endif
+ const t_key key = ntohl(flp->daddr);
+ struct key_vector *n, *pn;
+ struct fib_alias *fa;
+ unsigned long index;
+ t_key cindex;
+
+ pn = t->kv;
+ cindex = 0;
+
+ n = get_child_rcu(pn, cindex);
+ if (!n)
+ return -EAGAIN;
+
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+ this_cpu_inc(stats->gets);
+#endif
+
+ /* Step 1: Travel to the longest prefix match in the trie */
+ for (;;) {
+ index = get_cindex(key, n);
+
+ /* This bit of code is a bit tricky but it combines multiple
+ * checks into a single check. The prefix consists of the
+ * prefix plus zeros for the "bits" in the prefix. The index
+ * is the difference between the key and this value. From
+ * this we can actually derive several pieces of data.
+ * if (index >= (1ul << bits))
+ * we have a mismatch in skip bits and failed
+ * else
+ * we know the value is cindex
+ *
+ * This check is safe even if bits == KEYLENGTH due to the
+ * fact that we can only allocate a node with 32 bits if a
+ * long is greater than 32 bits.
+ */
+ if (index >= (1ul << n->bits))
+ break;
+
+ /* we have found a leaf. Prefixes have already been compared */
+ if (IS_LEAF(n))
+ goto found;
+
+ /* only record pn and cindex if we are going to be chopping
+ * bits later. Otherwise we are just wasting cycles.
+ */
+ if (n->slen > n->pos) {
+ pn = n;
+ cindex = index;
+ }
+
+ n = get_child_rcu(n, index);
+ if (unlikely(!n))
+ goto backtrace;
+ }
+
+ /* Step 2: Sort out leaves and begin backtracing for longest prefix */
+ for (;;) {
+ /* record the pointer where our next node pointer is stored */
+ struct key_vector __rcu **cptr = n->tnode;
+
+ /* This test verifies that none of the bits that differ
+ * between the key and the prefix exist in the region of
+ * the lsb and higher in the prefix.
+ */
+ if (unlikely(prefix_mismatch(key, n)) || (n->slen == n->pos))
+ goto backtrace;
+
+ /* exit out and process leaf */
+ if (unlikely(IS_LEAF(n)))
+ break;
+
+ /* Don't bother recording parent info. Since we are in
+ * prefix match mode we will have to come back to wherever
+ * we started this traversal anyway
+ */
+
+ while ((n = rcu_dereference(*cptr)) == NULL) {
+backtrace:
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+ if (!n)
+ this_cpu_inc(stats->null_node_hit);
+#endif
+ /* If we are at cindex 0 there are no more bits for
+ * us to strip at this level so we must ascend back
+ * up one level to see if there are any more bits to
+ * be stripped there.
+ */
+ while (!cindex) {
+ t_key pkey = pn->key;
+
+ /* If we don't have a parent then there is
+ * nothing for us to do as we do not have any
+ * further nodes to parse.
+ */
+ if (IS_TRIE(pn))
+ return -EAGAIN;
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+ this_cpu_inc(stats->backtrack);
+#endif
+ /* Get Child's index */
+ pn = node_parent_rcu(pn);
+ cindex = get_index(pkey, pn);
+ }
+
+ /* strip the least significant bit from the cindex */
+ cindex &= cindex - 1;
+
+ /* grab pointer for next child node */
+ cptr = &pn->tnode[cindex];
+ }
+ }
+
+found:
+ /* this line carries forward the xor from earlier in the function */
+ index = key ^ n->key;
+
+ /* Step 3: Process the leaf, if that fails fall back to backtracing */
+ hlist_for_each_entry_rcu(fa, &n->leaf, fa_list) {
+ struct fib_info *fi = fa->fa_info;
+ int nhsel, err;
+
+ if ((index >= (1ul << fa->fa_slen)) &&
+ ((BITS_PER_LONG > KEYLENGTH) || (fa->fa_slen != KEYLENGTH)))
+ continue;
+ if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos)
+ continue;
+ if (fi->fib_dead)
+ continue;
+ if (fa->fa_info->fib_scope < flp->flowi4_scope)
+ continue;
+ fib_alias_accessed(fa);
+ err = fib_props[fa->fa_type].error;
+ if (unlikely(err < 0)) {
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+ this_cpu_inc(stats->semantic_match_passed);
+#endif
+ return err;
+ }
+ if (fi->fib_flags & RTNH_F_DEAD)
+ continue;
+ for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) {
+ const struct fib_nh *nh = &fi->fib_nh[nhsel];
+
+ if (nh->nh_flags & RTNH_F_DEAD)
+ continue;
+ if (flp->flowi4_oif && flp->flowi4_oif != nh->nh_oif)
+ continue;
+
+ if (!(fib_flags & FIB_LOOKUP_NOREF))
+ atomic_inc(&fi->fib_clntref);
+
+ res->prefixlen = KEYLENGTH - fa->fa_slen;
+ res->nh_sel = nhsel;
+ res->type = fa->fa_type;
+ res->scope = fi->fib_scope;
+ res->fi = fi;
+ res->table = tb;
+ res->fa_head = &n->leaf;
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+ this_cpu_inc(stats->semantic_match_passed);
+#endif
+ return err;
+ }
+ }
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+ this_cpu_inc(stats->semantic_match_miss);
+#endif
+ goto backtrace;
+}
+EXPORT_SYMBOL_GPL(fib_table_lookup);
+
+static void fib_remove_alias(struct trie *t, struct key_vector *tp,
+ struct key_vector *l, struct fib_alias *old)
+{
+ /* record the location of the previous list_info entry */
+ struct hlist_node **pprev = old->fa_list.pprev;
+ struct fib_alias *fa = hlist_entry(pprev, typeof(*fa), fa_list.next);
+
+ /* remove the fib_alias from the list */
+ hlist_del_rcu(&old->fa_list);
+
+ /* if we emptied the list this leaf will be freed and we can sort
+ * out parent suffix lengths as a part of trie_rebalance
+ */
+ if (hlist_empty(&l->leaf)) {
+ put_child_root(tp, l->key, NULL);
+ node_free(l);
+ trie_rebalance(t, tp);
+ return;
+ }
+
+ /* only access fa if it is pointing at the last valid hlist_node */
+ if (*pprev)
+ return;
+
+ /* update the trie with the latest suffix length */
+ l->slen = fa->fa_slen;
+ leaf_pull_suffix(tp, l);
+}
+
+/* Caller must hold RTNL. */
+int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
+{
+ struct trie *t = (struct trie *) tb->tb_data;
+ struct fib_alias *fa, *fa_to_delete;
+ struct key_vector *l, *tp;
+ u8 plen = cfg->fc_dst_len;
+ u8 slen = KEYLENGTH - plen;
+ u8 tos = cfg->fc_tos;
+ u32 key;
+
+ if (plen > KEYLENGTH)
+ return -EINVAL;
+
+ key = ntohl(cfg->fc_dst);
+
+ if ((plen < KEYLENGTH) && (key << plen))
+ return -EINVAL;
+
+ l = fib_find_node(t, &tp, key);
+ if (!l)
+ return -ESRCH;
+
+ fa = fib_find_alias(&l->leaf, slen, tos, 0, tb->tb_id);
+ if (!fa)
+ return -ESRCH;
+
+ pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
+
+ fa_to_delete = NULL;
+ hlist_for_each_entry_from(fa, fa_list) {
+ struct fib_info *fi = fa->fa_info;
+
+ if ((fa->fa_slen != slen) ||
+ (fa->tb_id != tb->tb_id) ||
+ (fa->fa_tos != tos))
+ break;
+
+ if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
+ (cfg->fc_scope == RT_SCOPE_NOWHERE ||
+ fa->fa_info->fib_scope == cfg->fc_scope) &&
+ (!cfg->fc_prefsrc ||
+ fi->fib_prefsrc == cfg->fc_prefsrc) &&
+ (!cfg->fc_protocol ||
+ fi->fib_protocol == cfg->fc_protocol) &&
+ fib_nh_match(cfg, fi) == 0) {
+ fa_to_delete = fa;
+ break;
+ }
+ }
+
+ if (!fa_to_delete)
+ return -ESRCH;
+
+ netdev_switch_fib_ipv4_del(key, plen, fa_to_delete->fa_info, tos,
+ cfg->fc_type, tb->tb_id);
+
+ rtmsg_fib(RTM_DELROUTE, htonl(key), fa_to_delete, plen, tb->tb_id,
+ &cfg->fc_nlinfo, 0);
+
+ if (!plen)
+ tb->tb_num_default--;
+
+ fib_remove_alias(t, tp, l, fa_to_delete);
+
+ if (fa_to_delete->fa_state & FA_S_ACCESSED)
+ rt_cache_flush(cfg->fc_nlinfo.nl_net);
+
+ fib_release_info(fa_to_delete->fa_info);
+ alias_free_mem_rcu(fa_to_delete);
+ return 0;
+}
+
+/* Scan for the next leaf starting at the provided key value */
+static struct key_vector *leaf_walk_rcu(struct key_vector **tn, t_key key)
+{
+ struct key_vector *pn, *n = *tn;
+ unsigned long cindex;
+
+ /* this loop is meant to try and find the key in the trie */
+ do {
+ /* record parent and next child index */
+ pn = n;
+ cindex = key ? get_index(key, pn) : 0;
+
+ if (cindex >> pn->bits)
+ break;
+
+ /* descend into the next child */
+ n = get_child_rcu(pn, cindex++);
+ if (!n)
+ break;
+
+ /* guarantee forward progress on the keys */
+ if (IS_LEAF(n) && (n->key >= key))
+ goto found;
+ } while (IS_TNODE(n));
+
+ /* this loop will search for the next leaf with a greater key */
+ while (!IS_TRIE(pn)) {
+ /* if we exhausted the parent node we will need to climb */
+ if (cindex >= (1ul << pn->bits)) {
+ t_key pkey = pn->key;
+
+ pn = node_parent_rcu(pn);
+ cindex = get_index(pkey, pn) + 1;
+ continue;
+ }
+
+ /* grab the next available node */
+ n = get_child_rcu(pn, cindex++);
+ if (!n)
+ continue;
+
+ /* no need to compare keys since we bumped the index */
+ if (IS_LEAF(n))
+ goto found;
+
+ /* Rescan start scanning in new node */
+ pn = n;
+ cindex = 0;
+ }
+
+ *tn = pn;
+ return NULL; /* Root of trie */
+found:
+ /* if we are at the limit for keys just return NULL for the tnode */
+ *tn = pn;
+ return n;
+}
+
+static void fib_trie_free(struct fib_table *tb)
+{
+ struct trie *t = (struct trie *)tb->tb_data;
+ struct key_vector *pn = t->kv;
+ unsigned long cindex = 1;
+ struct hlist_node *tmp;
+ struct fib_alias *fa;
+
+ /* walk trie in reverse order and free everything */
+ for (;;) {
+ struct key_vector *n;
+
+ if (!(cindex--)) {
+ t_key pkey = pn->key;
+
+ if (IS_TRIE(pn))
+ break;
+
+ n = pn;
+ pn = node_parent(pn);
+
+ /* drop emptied tnode */
+ put_child_root(pn, n->key, NULL);
+ node_free(n);
+
+ cindex = get_index(pkey, pn);
+
+ continue;
+ }
+
+ /* grab the next available node */
+ n = get_child(pn, cindex);
+ if (!n)
+ continue;
+
+ if (IS_TNODE(n)) {
+ /* record pn and cindex for leaf walking */
+ pn = n;
+ cindex = 1ul << n->bits;
+
+ continue;
+ }
+
+ hlist_for_each_entry_safe(fa, tmp, &n->leaf, fa_list) {
+ hlist_del_rcu(&fa->fa_list);
+ alias_free_mem_rcu(fa);
+ }
+
+ put_child_root(pn, n->key, NULL);
+ node_free(n);
+ }
+
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+ free_percpu(t->stats);
+#endif
+ kfree(tb);
+}
+
+struct fib_table *fib_trie_unmerge(struct fib_table *oldtb)
+{
+ struct trie *ot = (struct trie *)oldtb->tb_data;
+ struct key_vector *l, *tp = ot->kv;
+ struct fib_table *local_tb;
+ struct fib_alias *fa;
+ struct trie *lt;
+ t_key key = 0;
+
+ if (oldtb->tb_data == oldtb->__data)
+ return oldtb;
+
+ local_tb = fib_trie_table(RT_TABLE_LOCAL, NULL);
+ if (!local_tb)
+ return NULL;
+
+ lt = (struct trie *)local_tb->tb_data;
+
+ while ((l = leaf_walk_rcu(&tp, key)) != NULL) {
+ struct key_vector *local_l = NULL, *local_tp;
+
+ hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) {
+ struct fib_alias *new_fa;
+
+ if (local_tb->tb_id != fa->tb_id)
+ continue;
+
+ /* clone fa for new local table */
+ new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
+ if (!new_fa)
+ goto out;
+
+ memcpy(new_fa, fa, sizeof(*fa));
+
+ /* insert clone into table */
+ if (!local_l)
+ local_l = fib_find_node(lt, &local_tp, l->key);
+
+ if (fib_insert_alias(lt, local_tp, local_l, new_fa,
+ NULL, l->key))
+ goto out;
+ }
+
+ /* stop loop if key wrapped back to 0 */
+ key = l->key + 1;
+ if (key < l->key)
+ break;
+ }
+
+ return local_tb;
+out:
+ fib_trie_free(local_tb);
+
+ return NULL;
+}
+
+/* Caller must hold RTNL */
+void fib_table_flush_external(struct fib_table *tb)
+{
+ struct trie *t = (struct trie *)tb->tb_data;
+ struct key_vector *pn = t->kv;
+ unsigned long cindex = 1;
+ struct hlist_node *tmp;
+ struct fib_alias *fa;
+
+ /* walk trie in reverse order */
+ for (;;) {
+ unsigned char slen = 0;
+ struct key_vector *n;
+
+ if (!(cindex--)) {
+ t_key pkey = pn->key;
+
+ /* cannot resize the trie vector */
+ if (IS_TRIE(pn))
+ break;
+
+ /* resize completed node */
+ pn = resize(t, pn);
+ cindex = get_index(pkey, pn);
+
+ continue;
+ }
+
+ /* grab the next available node */
+ n = get_child(pn, cindex);
+ if (!n)
+ continue;
+
+ if (IS_TNODE(n)) {
+ /* record pn and cindex for leaf walking */
+ pn = n;
+ cindex = 1ul << n->bits;
+
+ continue;
+ }
+
+ hlist_for_each_entry_safe(fa, tmp, &n->leaf, fa_list) {
+ struct fib_info *fi = fa->fa_info;
+
+ /* if alias was cloned to local then we just
+ * need to remove the local copy from main
+ */
+ if (tb->tb_id != fa->tb_id) {
+ hlist_del_rcu(&fa->fa_list);
+ alias_free_mem_rcu(fa);
+ continue;
+ }
+
+ /* record local slen */
+ slen = fa->fa_slen;
+
+ if (!fi || !(fi->fib_flags & RTNH_F_OFFLOAD))
+ continue;
+
+ netdev_switch_fib_ipv4_del(n->key,
+ KEYLENGTH - fa->fa_slen,
+ fi, fa->fa_tos,
+ fa->fa_type, tb->tb_id);
+ }
+
+ /* update leaf slen */
+ n->slen = slen;
+
+ if (hlist_empty(&n->leaf)) {
+ put_child_root(pn, n->key, NULL);
+ node_free(n);
+ } else {
+ leaf_pull_suffix(pn, n);
+ }
+ }
+}
+
+/* Caller must hold RTNL. */
+int fib_table_flush(struct fib_table *tb)
+{
+ struct trie *t = (struct trie *)tb->tb_data;
+ struct key_vector *pn = t->kv;
+ unsigned long cindex = 1;
+ struct hlist_node *tmp;
+ struct fib_alias *fa;
+ int found = 0;
+
+ /* walk trie in reverse order */
+ for (;;) {
+ unsigned char slen = 0;
+ struct key_vector *n;
+
+ if (!(cindex--)) {
+ t_key pkey = pn->key;
+
+ /* cannot resize the trie vector */
+ if (IS_TRIE(pn))
+ break;
+
+ /* resize completed node */
+ pn = resize(t, pn);
+ cindex = get_index(pkey, pn);
+
+ continue;
+ }
+
+ /* grab the next available node */
+ n = get_child(pn, cindex);
+ if (!n)
+ continue;
+
+ if (IS_TNODE(n)) {
+ /* record pn and cindex for leaf walking */
+ pn = n;
+ cindex = 1ul << n->bits;
+
+ continue;
+ }
+
+ hlist_for_each_entry_safe(fa, tmp, &n->leaf, fa_list) {
+ struct fib_info *fi = fa->fa_info;
+
+ if (!fi || !(fi->fib_flags & RTNH_F_DEAD)) {
+ slen = fa->fa_slen;
+ continue;
+ }
+
+ netdev_switch_fib_ipv4_del(n->key,
+ KEYLENGTH - fa->fa_slen,
+ fi, fa->fa_tos,
+ fa->fa_type, tb->tb_id);
+ hlist_del_rcu(&fa->fa_list);
+ fib_release_info(fa->fa_info);
+ alias_free_mem_rcu(fa);
+ found++;
+ }
+
+ /* update leaf slen */
+ n->slen = slen;
+
+ if (hlist_empty(&n->leaf)) {
+ put_child_root(pn, n->key, NULL);
+ node_free(n);
+ } else {
+ leaf_pull_suffix(pn, n);
+ }
+ }
+
+ pr_debug("trie_flush found=%d\n", found);
+ return found;
+}
+
+static void __trie_free_rcu(struct rcu_head *head)
+{
+ struct fib_table *tb = container_of(head, struct fib_table, rcu);
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+ struct trie *t = (struct trie *)tb->tb_data;
+
+ if (tb->tb_data == tb->__data)
+ free_percpu(t->stats);
+#endif /* CONFIG_IP_FIB_TRIE_STATS */
+ kfree(tb);
+}
+
+void fib_free_table(struct fib_table *tb)
+{
+ call_rcu(&tb->rcu, __trie_free_rcu);
+}
+
+static int fn_trie_dump_leaf(struct key_vector *l, struct fib_table *tb,
+ struct sk_buff *skb, struct netlink_callback *cb)
+{
+ __be32 xkey = htonl(l->key);
+ struct fib_alias *fa;
+ int i, s_i;
+
+ s_i = cb->args[4];
+ i = 0;
+
+ /* rcu_read_lock is hold by caller */
+ hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) {
+ if (i < s_i) {
+ i++;
+ continue;
+ }
+
+ if (tb->tb_id != fa->tb_id) {
+ i++;
+ continue;
+ }
+
+ if (fib_dump_info(skb, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ RTM_NEWROUTE,
+ tb->tb_id,
+ fa->fa_type,
+ xkey,
+ KEYLENGTH - fa->fa_slen,
+ fa->fa_tos,
+ fa->fa_info, NLM_F_MULTI) < 0) {
+ cb->args[4] = i;
+ return -1;
+ }
+ i++;
+ }
+
+ cb->args[4] = i;
+ return skb->len;
+}
+
+/* rcu_read_lock needs to be hold by caller from readside */
+int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ struct trie *t = (struct trie *)tb->tb_data;
+ struct key_vector *l, *tp = t->kv;
+ /* Dump starting at last key.
+ * Note: 0.0.0.0/0 (ie default) is first key.
+ */
+ int count = cb->args[2];
+ t_key key = cb->args[3];
+
+ while ((l = leaf_walk_rcu(&tp, key)) != NULL) {
+ if (fn_trie_dump_leaf(l, tb, skb, cb) < 0) {
+ cb->args[3] = key;
+ cb->args[2] = count;
+ return -1;
+ }
+
+ ++count;
+ key = l->key + 1;
+
+ memset(&cb->args[4], 0,
+ sizeof(cb->args) - 4*sizeof(cb->args[0]));
+
+ /* stop loop if key wrapped back to 0 */
+ if (key < l->key)
+ break;
+ }
+
+ cb->args[3] = key;
+ cb->args[2] = count;
+
+ return skb->len;
+}
+
+void __init fib_trie_init(void)
+{
+ fn_alias_kmem = kmem_cache_create("ip_fib_alias",
+ sizeof(struct fib_alias),
+ 0, SLAB_PANIC, NULL);
+
+ trie_leaf_kmem = kmem_cache_create("ip_fib_trie",
+ LEAF_SIZE,
+ 0, SLAB_PANIC, NULL);
+}
+
+struct fib_table *fib_trie_table(u32 id, struct fib_table *alias)
+{
+ struct fib_table *tb;
+ struct trie *t;
+ size_t sz = sizeof(*tb);
+
+ if (!alias)
+ sz += sizeof(struct trie);
+
+ tb = kzalloc(sz, GFP_KERNEL);
+ if (!tb)
+ return NULL;
+
+ tb->tb_id = id;
+ tb->tb_default = -1;
+ tb->tb_num_default = 0;
+ tb->tb_data = (alias ? alias->__data : tb->__data);
+
+ if (alias)
+ return tb;
+
+ t = (struct trie *) tb->tb_data;
+ t->kv[0].pos = KEYLENGTH;
+ t->kv[0].slen = KEYLENGTH;
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+ t->stats = alloc_percpu(struct trie_use_stats);
+ if (!t->stats) {
+ kfree(tb);
+ tb = NULL;
+ }
+#endif
+
+ return tb;
+}
+
+#ifdef CONFIG_PROC_FS
+/* Depth first Trie walk iterator */
+struct fib_trie_iter {
+ struct seq_net_private p;
+ struct fib_table *tb;
+ struct key_vector *tnode;
+ unsigned int index;
+ unsigned int depth;
+};
+
+static struct key_vector *fib_trie_get_next(struct fib_trie_iter *iter)
+{
+ unsigned long cindex = iter->index;
+ struct key_vector *pn = iter->tnode;
+ t_key pkey;
+
+ pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
+ iter->tnode, iter->index, iter->depth);
+
+ while (!IS_TRIE(pn)) {
+ while (cindex < child_length(pn)) {
+ struct key_vector *n = get_child_rcu(pn, cindex++);
+
+ if (!n)
+ continue;
+
+ if (IS_LEAF(n)) {
+ iter->tnode = pn;
+ iter->index = cindex;
+ } else {
+ /* push down one level */
+ iter->tnode = n;
+ iter->index = 0;
+ ++iter->depth;
+ }
+
+ return n;
+ }
+
+ /* Current node exhausted, pop back up */
+ pkey = pn->key;
+ pn = node_parent_rcu(pn);
+ cindex = get_index(pkey, pn) + 1;
+ --iter->depth;
+ }
+
+ /* record root node so further searches know we are done */
+ iter->tnode = pn;
+ iter->index = 0;
+
+ return NULL;
+}
+
+static struct key_vector *fib_trie_get_first(struct fib_trie_iter *iter,
+ struct trie *t)
+{
+ struct key_vector *n, *pn = t->kv;
+
+ if (!t)
+ return NULL;
+
+ n = rcu_dereference(pn->tnode[0]);
+ if (!n)
+ return NULL;
+
+ if (IS_TNODE(n)) {
+ iter->tnode = n;
+ iter->index = 0;
+ iter->depth = 1;
+ } else {
+ iter->tnode = pn;
+ iter->index = 0;
+ iter->depth = 0;
+ }
+
+ return n;
+}
+
+static void trie_collect_stats(struct trie *t, struct trie_stat *s)
+{
+ struct key_vector *n;
+ struct fib_trie_iter iter;
+
+ memset(s, 0, sizeof(*s));
+
+ rcu_read_lock();
+ for (n = fib_trie_get_first(&iter, t); n; n = fib_trie_get_next(&iter)) {
+ if (IS_LEAF(n)) {
+ struct fib_alias *fa;
+
+ s->leaves++;
+ s->totdepth += iter.depth;
+ if (iter.depth > s->maxdepth)
+ s->maxdepth = iter.depth;
+
+ hlist_for_each_entry_rcu(fa, &n->leaf, fa_list)
+ ++s->prefixes;
+ } else {
+ s->tnodes++;
+ if (n->bits < MAX_STAT_DEPTH)
+ s->nodesizes[n->bits]++;
+ s->nullpointers += tn_info(n)->empty_children;
+ }
+ }
+ rcu_read_unlock();
+}
+
+/*
+ * This outputs /proc/net/fib_triestats
+ */
+static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
+{
+ unsigned int i, max, pointers, bytes, avdepth;
+
+ if (stat->leaves)
+ avdepth = stat->totdepth*100 / stat->leaves;
+ else
+ avdepth = 0;
+
+ seq_printf(seq, "\tAver depth: %u.%02d\n",
+ avdepth / 100, avdepth % 100);
+ seq_printf(seq, "\tMax depth: %u\n", stat->maxdepth);
+
+ seq_printf(seq, "\tLeaves: %u\n", stat->leaves);
+ bytes = LEAF_SIZE * stat->leaves;
+
+ seq_printf(seq, "\tPrefixes: %u\n", stat->prefixes);
+ bytes += sizeof(struct fib_alias) * stat->prefixes;
+
+ seq_printf(seq, "\tInternal nodes: %u\n\t", stat->tnodes);
+ bytes += TNODE_SIZE(0) * stat->tnodes;
+
+ max = MAX_STAT_DEPTH;
+ while (max > 0 && stat->nodesizes[max-1] == 0)
+ max--;
+
+ pointers = 0;
+ for (i = 1; i < max; i++)
+ if (stat->nodesizes[i] != 0) {
+ seq_printf(seq, " %u: %u", i, stat->nodesizes[i]);
+ pointers += (1<<i) * stat->nodesizes[i];
+ }
+ seq_putc(seq, '\n');
+ seq_printf(seq, "\tPointers: %u\n", pointers);
+
+ bytes += sizeof(struct key_vector *) * pointers;
+ seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers);
+ seq_printf(seq, "Total size: %u kB\n", (bytes + 1023) / 1024);
+}
+
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+static void trie_show_usage(struct seq_file *seq,
+ const struct trie_use_stats __percpu *stats)
+{
+ struct trie_use_stats s = { 0 };
+ int cpu;
+
+ /* loop through all of the CPUs and gather up the stats */
+ for_each_possible_cpu(cpu) {
+ const struct trie_use_stats *pcpu = per_cpu_ptr(stats, cpu);
+
+ s.gets += pcpu->gets;
+ s.backtrack += pcpu->backtrack;
+ s.semantic_match_passed += pcpu->semantic_match_passed;
+ s.semantic_match_miss += pcpu->semantic_match_miss;
+ s.null_node_hit += pcpu->null_node_hit;
+ s.resize_node_skipped += pcpu->resize_node_skipped;
+ }
+
+ seq_printf(seq, "\nCounters:\n---------\n");
+ seq_printf(seq, "gets = %u\n", s.gets);
+ seq_printf(seq, "backtracks = %u\n", s.backtrack);
+ seq_printf(seq, "semantic match passed = %u\n",
+ s.semantic_match_passed);
+ seq_printf(seq, "semantic match miss = %u\n", s.semantic_match_miss);
+ seq_printf(seq, "null node hit= %u\n", s.null_node_hit);
+ seq_printf(seq, "skipped node resize = %u\n\n", s.resize_node_skipped);
+}
+#endif /* CONFIG_IP_FIB_TRIE_STATS */
+
+static void fib_table_print(struct seq_file *seq, struct fib_table *tb)
+{
+ if (tb->tb_id == RT_TABLE_LOCAL)
+ seq_puts(seq, "Local:\n");
+ else if (tb->tb_id == RT_TABLE_MAIN)
+ seq_puts(seq, "Main:\n");
+ else
+ seq_printf(seq, "Id %d:\n", tb->tb_id);
+}
+
+
+static int fib_triestat_seq_show(struct seq_file *seq, void *v)
+{
+ struct net *net = (struct net *)seq->private;
+ unsigned int h;
+
+ seq_printf(seq,
+ "Basic info: size of leaf:"
+ " %Zd bytes, size of tnode: %Zd bytes.\n",
+ LEAF_SIZE, TNODE_SIZE(0));
+
+ for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
+ struct hlist_head *head = &net->ipv4.fib_table_hash[h];
+ struct fib_table *tb;
+
+ hlist_for_each_entry_rcu(tb, head, tb_hlist) {
+ struct trie *t = (struct trie *) tb->tb_data;
+ struct trie_stat stat;
+
+ if (!t)
+ continue;
+
+ fib_table_print(seq, tb);
+
+ trie_collect_stats(t, &stat);
+ trie_show_stats(seq, &stat);
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+ trie_show_usage(seq, t->stats);
+#endif
+ }
+ }
+
+ return 0;
+}
+
+static int fib_triestat_seq_open(struct inode *inode, struct file *file)
+{
+ return single_open_net(inode, file, fib_triestat_seq_show);
+}
+
+static const struct file_operations fib_triestat_fops = {
+ .owner = THIS_MODULE,
+ .open = fib_triestat_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release_net,
+};
+
+static struct key_vector *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
+{
+ struct fib_trie_iter *iter = seq->private;
+ struct net *net = seq_file_net(seq);
+ loff_t idx = 0;
+ unsigned int h;
+
+ for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
+ struct hlist_head *head = &net->ipv4.fib_table_hash[h];
+ struct fib_table *tb;
+
+ hlist_for_each_entry_rcu(tb, head, tb_hlist) {
+ struct key_vector *n;
+
+ for (n = fib_trie_get_first(iter,
+ (struct trie *) tb->tb_data);
+ n; n = fib_trie_get_next(iter))
+ if (pos == idx++) {
+ iter->tb = tb;
+ return n;
+ }
+ }
+ }
+
+ return NULL;
+}
+
+static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
+ __acquires(RCU)
+{
+ rcu_read_lock();
+ return fib_trie_get_idx(seq, *pos);
+}
+
+static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct fib_trie_iter *iter = seq->private;
+ struct net *net = seq_file_net(seq);
+ struct fib_table *tb = iter->tb;
+ struct hlist_node *tb_node;
+ unsigned int h;
+ struct key_vector *n;
+
+ ++*pos;
+ /* next node in same table */
+ n = fib_trie_get_next(iter);
+ if (n)
+ return n;
+
+ /* walk rest of this hash chain */
+ h = tb->tb_id & (FIB_TABLE_HASHSZ - 1);
+ while ((tb_node = rcu_dereference(hlist_next_rcu(&tb->tb_hlist)))) {
+ tb = hlist_entry(tb_node, struct fib_table, tb_hlist);
+ n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
+ if (n)
+ goto found;
+ }
+
+ /* new hash chain */
+ while (++h < FIB_TABLE_HASHSZ) {
+ struct hlist_head *head = &net->ipv4.fib_table_hash[h];
+ hlist_for_each_entry_rcu(tb, head, tb_hlist) {
+ n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
+ if (n)
+ goto found;
+ }
+ }
+ return NULL;
+
+found:
+ iter->tb = tb;
+ return n;
+}
+
+static void fib_trie_seq_stop(struct seq_file *seq, void *v)
+ __releases(RCU)
+{
+ rcu_read_unlock();
+}
+
+static void seq_indent(struct seq_file *seq, int n)
+{
+ while (n-- > 0)
+ seq_puts(seq, " ");
+}
+
+static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s)
+{
+ switch (s) {
+ case RT_SCOPE_UNIVERSE: return "universe";
+ case RT_SCOPE_SITE: return "site";
+ case RT_SCOPE_LINK: return "link";
+ case RT_SCOPE_HOST: return "host";
+ case RT_SCOPE_NOWHERE: return "nowhere";
+ default:
+ snprintf(buf, len, "scope=%d", s);
+ return buf;
+ }
+}
+
+static const char *const rtn_type_names[__RTN_MAX] = {
+ [RTN_UNSPEC] = "UNSPEC",
+ [RTN_UNICAST] = "UNICAST",
+ [RTN_LOCAL] = "LOCAL",
+ [RTN_BROADCAST] = "BROADCAST",
+ [RTN_ANYCAST] = "ANYCAST",
+ [RTN_MULTICAST] = "MULTICAST",
+ [RTN_BLACKHOLE] = "BLACKHOLE",
+ [RTN_UNREACHABLE] = "UNREACHABLE",
+ [RTN_PROHIBIT] = "PROHIBIT",
+ [RTN_THROW] = "THROW",
+ [RTN_NAT] = "NAT",
+ [RTN_XRESOLVE] = "XRESOLVE",
+};
+
+static inline const char *rtn_type(char *buf, size_t len, unsigned int t)
+{
+ if (t < __RTN_MAX && rtn_type_names[t])
+ return rtn_type_names[t];
+ snprintf(buf, len, "type %u", t);
+ return buf;
+}
+
+/* Pretty print the trie */
+static int fib_trie_seq_show(struct seq_file *seq, void *v)
+{
+ const struct fib_trie_iter *iter = seq->private;
+ struct key_vector *n = v;
+
+ if (IS_TRIE(node_parent_rcu(n)))
+ fib_table_print(seq, iter->tb);
+
+ if (IS_TNODE(n)) {
+ __be32 prf = htonl(n->key);
+
+ seq_indent(seq, iter->depth-1);
+ seq_printf(seq, " +-- %pI4/%zu %u %u %u\n",
+ &prf, KEYLENGTH - n->pos - n->bits, n->bits,
+ tn_info(n)->full_children,
+ tn_info(n)->empty_children);
+ } else {
+ __be32 val = htonl(n->key);
+ struct fib_alias *fa;
+
+ seq_indent(seq, iter->depth);
+ seq_printf(seq, " |-- %pI4\n", &val);
+
+ hlist_for_each_entry_rcu(fa, &n->leaf, fa_list) {
+ char buf1[32], buf2[32];
+
+ seq_indent(seq, iter->depth + 1);
+ seq_printf(seq, " /%zu %s %s",
+ KEYLENGTH - fa->fa_slen,
+ rtn_scope(buf1, sizeof(buf1),
+ fa->fa_info->fib_scope),
+ rtn_type(buf2, sizeof(buf2),
+ fa->fa_type));
+ if (fa->fa_tos)
+ seq_printf(seq, " tos=%d", fa->fa_tos);
+ seq_putc(seq, '\n');
+ }
+ }
+
+ return 0;
+}
+
+static const struct seq_operations fib_trie_seq_ops = {
+ .start = fib_trie_seq_start,
+ .next = fib_trie_seq_next,
+ .stop = fib_trie_seq_stop,
+ .show = fib_trie_seq_show,
+};
+
+static int fib_trie_seq_open(struct inode *inode, struct file *file)
+{
+ return seq_open_net(inode, file, &fib_trie_seq_ops,
+ sizeof(struct fib_trie_iter));
+}
+
+static const struct file_operations fib_trie_fops = {
+ .owner = THIS_MODULE,
+ .open = fib_trie_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_net,
+};
+
+struct fib_route_iter {
+ struct seq_net_private p;
+ struct fib_table *main_tb;
+ struct key_vector *tnode;
+ loff_t pos;
+ t_key key;
+};
+
+static struct key_vector *fib_route_get_idx(struct fib_route_iter *iter,
+ loff_t pos)
+{
+ struct fib_table *tb = iter->main_tb;
+ struct key_vector *l, **tp = &iter->tnode;
+ struct trie *t;
+ t_key key;
+
+ /* use cache location of next-to-find key */
+ if (iter->pos > 0 && pos >= iter->pos) {
+ pos -= iter->pos;
+ key = iter->key;
+ } else {
+ t = (struct trie *)tb->tb_data;
+ iter->tnode = t->kv;
+ iter->pos = 0;
+ key = 0;
+ }
+
+ while ((l = leaf_walk_rcu(tp, key)) != NULL) {
+ key = l->key + 1;
+ iter->pos++;
+
+ if (pos-- <= 0)
+ break;
+
+ l = NULL;
+
+ /* handle unlikely case of a key wrap */
+ if (!key)
+ break;
+ }
+
+ if (l)
+ iter->key = key; /* remember it */
+ else
+ iter->pos = 0; /* forget it */
+
+ return l;
+}
+
+static void *fib_route_seq_start(struct seq_file *seq, loff_t *pos)
+ __acquires(RCU)
+{
+ struct fib_route_iter *iter = seq->private;
+ struct fib_table *tb;
+ struct trie *t;
+
+ rcu_read_lock();
+
+ tb = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN);
+ if (!tb)
+ return NULL;
+
+ iter->main_tb = tb;
+
+ if (*pos != 0)
+ return fib_route_get_idx(iter, *pos);
+
+ t = (struct trie *)tb->tb_data;
+ iter->tnode = t->kv;
+ iter->pos = 0;
+ iter->key = 0;
+
+ return SEQ_START_TOKEN;
+}
+
+static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct fib_route_iter *iter = seq->private;
+ struct key_vector *l = NULL;
+ t_key key = iter->key;
+
+ ++*pos;
+
+ /* only allow key of 0 for start of sequence */
+ if ((v == SEQ_START_TOKEN) || key)
+ l = leaf_walk_rcu(&iter->tnode, key);
+
+ if (l) {
+ iter->key = l->key + 1;
+ iter->pos++;
+ } else {
+ iter->pos = 0;
+ }
+
+ return l;
+}
+
+static void fib_route_seq_stop(struct seq_file *seq, void *v)
+ __releases(RCU)
+{
+ rcu_read_unlock();
+}
+
+static unsigned int fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
+{
+ unsigned int flags = 0;
+
+ if (type == RTN_UNREACHABLE || type == RTN_PROHIBIT)
+ flags = RTF_REJECT;
+ if (fi && fi->fib_nh->nh_gw)
+ flags |= RTF_GATEWAY;
+ if (mask == htonl(0xFFFFFFFF))
+ flags |= RTF_HOST;
+ flags |= RTF_UP;
+ return flags;
+}
+
+/*
+ * This outputs /proc/net/route.
+ * The format of the file is not supposed to be changed
+ * and needs to be same as fib_hash output to avoid breaking
+ * legacy utilities
+ */
+static int fib_route_seq_show(struct seq_file *seq, void *v)
+{
+ struct fib_route_iter *iter = seq->private;
+ struct fib_table *tb = iter->main_tb;
+ struct fib_alias *fa;
+ struct key_vector *l = v;
+ __be32 prefix;
+
+ if (v == SEQ_START_TOKEN) {
+ seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
+ "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
+ "\tWindow\tIRTT");
+ return 0;
+ }
+
+ prefix = htonl(l->key);
+
+ hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) {
+ const struct fib_info *fi = fa->fa_info;
+ __be32 mask = inet_make_mask(KEYLENGTH - fa->fa_slen);
+ unsigned int flags = fib_flag_trans(fa->fa_type, mask, fi);
+
+ if ((fa->fa_type == RTN_BROADCAST) ||
+ (fa->fa_type == RTN_MULTICAST))
+ continue;
+
+ if (fa->tb_id != tb->tb_id)
+ continue;
+
+ seq_setwidth(seq, 127);
+
+ if (fi)
+ seq_printf(seq,
+ "%s\t%08X\t%08X\t%04X\t%d\t%u\t"
+ "%d\t%08X\t%d\t%u\t%u",
+ fi->fib_dev ? fi->fib_dev->name : "*",
+ prefix,
+ fi->fib_nh->nh_gw, flags, 0, 0,
+ fi->fib_priority,
+ mask,
+ (fi->fib_advmss ?
+ fi->fib_advmss + 40 : 0),
+ fi->fib_window,
+ fi->fib_rtt >> 3);
+ else
+ seq_printf(seq,
+ "*\t%08X\t%08X\t%04X\t%d\t%u\t"
+ "%d\t%08X\t%d\t%u\t%u",
+ prefix, 0, flags, 0, 0, 0,
+ mask, 0, 0, 0);
+
+ seq_pad(seq, '\n');
+ }
+
+ return 0;
+}
+
+static const struct seq_operations fib_route_seq_ops = {
+ .start = fib_route_seq_start,
+ .next = fib_route_seq_next,
+ .stop = fib_route_seq_stop,
+ .show = fib_route_seq_show,
+};
+
+static int fib_route_seq_open(struct inode *inode, struct file *file)
+{
+ return seq_open_net(inode, file, &fib_route_seq_ops,
+ sizeof(struct fib_route_iter));
+}
+
+static const struct file_operations fib_route_fops = {
+ .owner = THIS_MODULE,
+ .open = fib_route_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_net,
+};
+
+int __net_init fib_proc_init(struct net *net)
+{
+ if (!proc_create("fib_trie", S_IRUGO, net->proc_net, &fib_trie_fops))
+ goto out1;
+
+ if (!proc_create("fib_triestat", S_IRUGO, net->proc_net,
+ &fib_triestat_fops))
+ goto out2;
+
+ if (!proc_create("route", S_IRUGO, net->proc_net, &fib_route_fops))
+ goto out3;
+
+ return 0;
+
+out3:
+ remove_proc_entry("fib_triestat", net->proc_net);
+out2:
+ remove_proc_entry("fib_trie", net->proc_net);
+out1:
+ return -ENOMEM;
+}
+
+void __net_exit fib_proc_exit(struct net *net)
+{
+ remove_proc_entry("fib_trie", net->proc_net);
+ remove_proc_entry("fib_triestat", net->proc_net);
+ remove_proc_entry("route", net->proc_net);
+}
+
+#endif /* CONFIG_PROC_FS */
diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c
new file mode 100644
index 000000000..34968cd5c
--- /dev/null
+++ b/net/ipv4/fou.c
@@ -0,0 +1,999 @@
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/socket.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <linux/udp.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <net/genetlink.h>
+#include <net/gue.h>
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <net/udp.h>
+#include <net/udp_tunnel.h>
+#include <net/xfrm.h>
+#include <uapi/linux/fou.h>
+#include <uapi/linux/genetlink.h>
+
+struct fou {
+ struct socket *sock;
+ u8 protocol;
+ u8 flags;
+ __be16 port;
+ u16 type;
+ struct udp_offload udp_offloads;
+ struct list_head list;
+};
+
+#define FOU_F_REMCSUM_NOPARTIAL BIT(0)
+
+struct fou_cfg {
+ u16 type;
+ u8 protocol;
+ u8 flags;
+ struct udp_port_cfg udp_config;
+};
+
+static unsigned int fou_net_id;
+
+struct fou_net {
+ struct list_head fou_list;
+ struct mutex fou_lock;
+};
+
+static inline struct fou *fou_from_sock(struct sock *sk)
+{
+ return sk->sk_user_data;
+}
+
+static void fou_recv_pull(struct sk_buff *skb, size_t len)
+{
+ struct iphdr *iph = ip_hdr(skb);
+
+ /* Remove 'len' bytes from the packet (UDP header and
+ * FOU header if present).
+ */
+ iph->tot_len = htons(ntohs(iph->tot_len) - len);
+ __skb_pull(skb, len);
+ skb_postpull_rcsum(skb, udp_hdr(skb), len);
+ skb_reset_transport_header(skb);
+}
+
+static int fou_udp_recv(struct sock *sk, struct sk_buff *skb)
+{
+ struct fou *fou = fou_from_sock(sk);
+
+ if (!fou)
+ return 1;
+
+ fou_recv_pull(skb, sizeof(struct udphdr));
+
+ return -fou->protocol;
+}
+
+static struct guehdr *gue_remcsum(struct sk_buff *skb, struct guehdr *guehdr,
+ void *data, size_t hdrlen, u8 ipproto,
+ bool nopartial)
+{
+ __be16 *pd = data;
+ size_t start = ntohs(pd[0]);
+ size_t offset = ntohs(pd[1]);
+ size_t plen = hdrlen + max_t(size_t, offset + sizeof(u16), start);
+
+ if (!pskb_may_pull(skb, plen))
+ return NULL;
+ guehdr = (struct guehdr *)&udp_hdr(skb)[1];
+
+ skb_remcsum_process(skb, (void *)guehdr + hdrlen,
+ start, offset, nopartial);
+
+ return guehdr;
+}
+
+static int gue_control_message(struct sk_buff *skb, struct guehdr *guehdr)
+{
+ /* No support yet */
+ kfree_skb(skb);
+ return 0;
+}
+
+static int gue_udp_recv(struct sock *sk, struct sk_buff *skb)
+{
+ struct fou *fou = fou_from_sock(sk);
+ size_t len, optlen, hdrlen;
+ struct guehdr *guehdr;
+ void *data;
+ u16 doffset = 0;
+
+ if (!fou)
+ return 1;
+
+ len = sizeof(struct udphdr) + sizeof(struct guehdr);
+ if (!pskb_may_pull(skb, len))
+ goto drop;
+
+ guehdr = (struct guehdr *)&udp_hdr(skb)[1];
+
+ optlen = guehdr->hlen << 2;
+ len += optlen;
+
+ if (!pskb_may_pull(skb, len))
+ goto drop;
+
+ /* guehdr may change after pull */
+ guehdr = (struct guehdr *)&udp_hdr(skb)[1];
+
+ hdrlen = sizeof(struct guehdr) + optlen;
+
+ if (guehdr->version != 0 || validate_gue_flags(guehdr, optlen))
+ goto drop;
+
+ hdrlen = sizeof(struct guehdr) + optlen;
+
+ ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(skb)->tot_len) - len);
+
+ /* Pull csum through the guehdr now . This can be used if
+ * there is a remote checksum offload.
+ */
+ skb_postpull_rcsum(skb, udp_hdr(skb), len);
+
+ data = &guehdr[1];
+
+ if (guehdr->flags & GUE_FLAG_PRIV) {
+ __be32 flags = *(__be32 *)(data + doffset);
+
+ doffset += GUE_LEN_PRIV;
+
+ if (flags & GUE_PFLAG_REMCSUM) {
+ guehdr = gue_remcsum(skb, guehdr, data + doffset,
+ hdrlen, guehdr->proto_ctype,
+ !!(fou->flags &
+ FOU_F_REMCSUM_NOPARTIAL));
+ if (!guehdr)
+ goto drop;
+
+ data = &guehdr[1];
+
+ doffset += GUE_PLEN_REMCSUM;
+ }
+ }
+
+ if (unlikely(guehdr->control))
+ return gue_control_message(skb, guehdr);
+
+ __skb_pull(skb, sizeof(struct udphdr) + hdrlen);
+ skb_reset_transport_header(skb);
+
+ return -guehdr->proto_ctype;
+
+drop:
+ kfree_skb(skb);
+ return 0;
+}
+
+static struct sk_buff **fou_gro_receive(struct sk_buff **head,
+ struct sk_buff *skb,
+ struct udp_offload *uoff)
+{
+ const struct net_offload *ops;
+ struct sk_buff **pp = NULL;
+ u8 proto = NAPI_GRO_CB(skb)->proto;
+ const struct net_offload **offloads;
+
+ rcu_read_lock();
+ offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads;
+ ops = rcu_dereference(offloads[proto]);
+ if (!ops || !ops->callbacks.gro_receive)
+ goto out_unlock;
+
+ pp = ops->callbacks.gro_receive(head, skb);
+
+out_unlock:
+ rcu_read_unlock();
+
+ return pp;
+}
+
+static int fou_gro_complete(struct sk_buff *skb, int nhoff,
+ struct udp_offload *uoff)
+{
+ const struct net_offload *ops;
+ u8 proto = NAPI_GRO_CB(skb)->proto;
+ int err = -ENOSYS;
+ const struct net_offload **offloads;
+
+ udp_tunnel_gro_complete(skb, nhoff);
+
+ rcu_read_lock();
+ offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads;
+ ops = rcu_dereference(offloads[proto]);
+ if (WARN_ON(!ops || !ops->callbacks.gro_complete))
+ goto out_unlock;
+
+ err = ops->callbacks.gro_complete(skb, nhoff);
+
+out_unlock:
+ rcu_read_unlock();
+
+ return err;
+}
+
+static struct guehdr *gue_gro_remcsum(struct sk_buff *skb, unsigned int off,
+ struct guehdr *guehdr, void *data,
+ size_t hdrlen, u8 ipproto,
+ struct gro_remcsum *grc, bool nopartial)
+{
+ __be16 *pd = data;
+ size_t start = ntohs(pd[0]);
+ size_t offset = ntohs(pd[1]);
+ size_t plen = hdrlen + max_t(size_t, offset + sizeof(u16), start);
+
+ if (skb->remcsum_offload)
+ return NULL;
+
+ if (!NAPI_GRO_CB(skb)->csum_valid)
+ return NULL;
+
+ /* Pull checksum that will be written */
+ if (skb_gro_header_hard(skb, off + plen)) {
+ guehdr = skb_gro_header_slow(skb, off + plen, off);
+ if (!guehdr)
+ return NULL;
+ }
+
+ skb_gro_remcsum_process(skb, (void *)guehdr + hdrlen,
+ start, offset, grc, nopartial);
+
+ skb->remcsum_offload = 1;
+
+ return guehdr;
+}
+
+static struct sk_buff **gue_gro_receive(struct sk_buff **head,
+ struct sk_buff *skb,
+ struct udp_offload *uoff)
+{
+ const struct net_offload **offloads;
+ const struct net_offload *ops;
+ struct sk_buff **pp = NULL;
+ struct sk_buff *p;
+ struct guehdr *guehdr;
+ size_t len, optlen, hdrlen, off;
+ void *data;
+ u16 doffset = 0;
+ int flush = 1;
+ struct fou *fou = container_of(uoff, struct fou, udp_offloads);
+ struct gro_remcsum grc;
+
+ skb_gro_remcsum_init(&grc);
+
+ off = skb_gro_offset(skb);
+ len = off + sizeof(*guehdr);
+
+ guehdr = skb_gro_header_fast(skb, off);
+ if (skb_gro_header_hard(skb, len)) {
+ guehdr = skb_gro_header_slow(skb, len, off);
+ if (unlikely(!guehdr))
+ goto out;
+ }
+
+ optlen = guehdr->hlen << 2;
+ len += optlen;
+
+ if (skb_gro_header_hard(skb, len)) {
+ guehdr = skb_gro_header_slow(skb, len, off);
+ if (unlikely(!guehdr))
+ goto out;
+ }
+
+ if (unlikely(guehdr->control) || guehdr->version != 0 ||
+ validate_gue_flags(guehdr, optlen))
+ goto out;
+
+ hdrlen = sizeof(*guehdr) + optlen;
+
+ /* Adjust NAPI_GRO_CB(skb)->csum to account for guehdr,
+ * this is needed if there is a remote checkcsum offload.
+ */
+ skb_gro_postpull_rcsum(skb, guehdr, hdrlen);
+
+ data = &guehdr[1];
+
+ if (guehdr->flags & GUE_FLAG_PRIV) {
+ __be32 flags = *(__be32 *)(data + doffset);
+
+ doffset += GUE_LEN_PRIV;
+
+ if (flags & GUE_PFLAG_REMCSUM) {
+ guehdr = gue_gro_remcsum(skb, off, guehdr,
+ data + doffset, hdrlen,
+ guehdr->proto_ctype, &grc,
+ !!(fou->flags &
+ FOU_F_REMCSUM_NOPARTIAL));
+ if (!guehdr)
+ goto out;
+
+ data = &guehdr[1];
+
+ doffset += GUE_PLEN_REMCSUM;
+ }
+ }
+
+ skb_gro_pull(skb, hdrlen);
+
+ flush = 0;
+
+ for (p = *head; p; p = p->next) {
+ const struct guehdr *guehdr2;
+
+ if (!NAPI_GRO_CB(p)->same_flow)
+ continue;
+
+ guehdr2 = (struct guehdr *)(p->data + off);
+
+ /* Compare base GUE header to be equal (covers
+ * hlen, version, proto_ctype, and flags.
+ */
+ if (guehdr->word != guehdr2->word) {
+ NAPI_GRO_CB(p)->same_flow = 0;
+ continue;
+ }
+
+ /* Compare optional fields are the same. */
+ if (guehdr->hlen && memcmp(&guehdr[1], &guehdr2[1],
+ guehdr->hlen << 2)) {
+ NAPI_GRO_CB(p)->same_flow = 0;
+ continue;
+ }
+ }
+
+ rcu_read_lock();
+ offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads;
+ ops = rcu_dereference(offloads[guehdr->proto_ctype]);
+ if (WARN_ON(!ops || !ops->callbacks.gro_receive))
+ goto out_unlock;
+
+ pp = ops->callbacks.gro_receive(head, skb);
+
+out_unlock:
+ rcu_read_unlock();
+out:
+ NAPI_GRO_CB(skb)->flush |= flush;
+ skb_gro_remcsum_cleanup(skb, &grc);
+
+ return pp;
+}
+
+static int gue_gro_complete(struct sk_buff *skb, int nhoff,
+ struct udp_offload *uoff)
+{
+ const struct net_offload **offloads;
+ struct guehdr *guehdr = (struct guehdr *)(skb->data + nhoff);
+ const struct net_offload *ops;
+ unsigned int guehlen;
+ u8 proto;
+ int err = -ENOENT;
+
+ proto = guehdr->proto_ctype;
+
+ guehlen = sizeof(*guehdr) + (guehdr->hlen << 2);
+
+ rcu_read_lock();
+ offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads;
+ ops = rcu_dereference(offloads[proto]);
+ if (WARN_ON(!ops || !ops->callbacks.gro_complete))
+ goto out_unlock;
+
+ err = ops->callbacks.gro_complete(skb, nhoff + guehlen);
+
+out_unlock:
+ rcu_read_unlock();
+ return err;
+}
+
+static int fou_add_to_port_list(struct net *net, struct fou *fou)
+{
+ struct fou_net *fn = net_generic(net, fou_net_id);
+ struct fou *fout;
+
+ mutex_lock(&fn->fou_lock);
+ list_for_each_entry(fout, &fn->fou_list, list) {
+ if (fou->port == fout->port) {
+ mutex_unlock(&fn->fou_lock);
+ return -EALREADY;
+ }
+ }
+
+ list_add(&fou->list, &fn->fou_list);
+ mutex_unlock(&fn->fou_lock);
+
+ return 0;
+}
+
+static void fou_release(struct fou *fou)
+{
+ struct socket *sock = fou->sock;
+ struct sock *sk = sock->sk;
+
+ if (sk->sk_family == AF_INET)
+ udp_del_offload(&fou->udp_offloads);
+ list_del(&fou->list);
+ udp_tunnel_sock_release(sock);
+
+ kfree(fou);
+}
+
+static int fou_encap_init(struct sock *sk, struct fou *fou, struct fou_cfg *cfg)
+{
+ udp_sk(sk)->encap_rcv = fou_udp_recv;
+ fou->protocol = cfg->protocol;
+ fou->udp_offloads.callbacks.gro_receive = fou_gro_receive;
+ fou->udp_offloads.callbacks.gro_complete = fou_gro_complete;
+ fou->udp_offloads.port = cfg->udp_config.local_udp_port;
+ fou->udp_offloads.ipproto = cfg->protocol;
+
+ return 0;
+}
+
+static int gue_encap_init(struct sock *sk, struct fou *fou, struct fou_cfg *cfg)
+{
+ udp_sk(sk)->encap_rcv = gue_udp_recv;
+ fou->udp_offloads.callbacks.gro_receive = gue_gro_receive;
+ fou->udp_offloads.callbacks.gro_complete = gue_gro_complete;
+ fou->udp_offloads.port = cfg->udp_config.local_udp_port;
+
+ return 0;
+}
+
+static int fou_create(struct net *net, struct fou_cfg *cfg,
+ struct socket **sockp)
+{
+ struct socket *sock = NULL;
+ struct fou *fou = NULL;
+ struct sock *sk;
+ int err;
+
+ /* Open UDP socket */
+ err = udp_sock_create(net, &cfg->udp_config, &sock);
+ if (err < 0)
+ goto error;
+
+ /* Allocate FOU port structure */
+ fou = kzalloc(sizeof(*fou), GFP_KERNEL);
+ if (!fou) {
+ err = -ENOMEM;
+ goto error;
+ }
+
+ sk = sock->sk;
+
+ fou->flags = cfg->flags;
+ fou->port = cfg->udp_config.local_udp_port;
+
+ /* Initial for fou type */
+ switch (cfg->type) {
+ case FOU_ENCAP_DIRECT:
+ err = fou_encap_init(sk, fou, cfg);
+ if (err)
+ goto error;
+ break;
+ case FOU_ENCAP_GUE:
+ err = gue_encap_init(sk, fou, cfg);
+ if (err)
+ goto error;
+ break;
+ default:
+ err = -EINVAL;
+ goto error;
+ }
+
+ fou->type = cfg->type;
+
+ udp_sk(sk)->encap_type = 1;
+ udp_encap_enable();
+
+ sk->sk_user_data = fou;
+ fou->sock = sock;
+
+ inet_inc_convert_csum(sk);
+
+ sk->sk_allocation = GFP_ATOMIC;
+
+ if (cfg->udp_config.family == AF_INET) {
+ err = udp_add_offload(&fou->udp_offloads);
+ if (err)
+ goto error;
+ }
+
+ err = fou_add_to_port_list(net, fou);
+ if (err)
+ goto error;
+
+ if (sockp)
+ *sockp = sock;
+
+ return 0;
+
+error:
+ kfree(fou);
+ if (sock)
+ udp_tunnel_sock_release(sock);
+
+ return err;
+}
+
+static int fou_destroy(struct net *net, struct fou_cfg *cfg)
+{
+ struct fou_net *fn = net_generic(net, fou_net_id);
+ __be16 port = cfg->udp_config.local_udp_port;
+ int err = -EINVAL;
+ struct fou *fou;
+
+ mutex_lock(&fn->fou_lock);
+ list_for_each_entry(fou, &fn->fou_list, list) {
+ if (fou->port == port) {
+ fou_release(fou);
+ err = 0;
+ break;
+ }
+ }
+ mutex_unlock(&fn->fou_lock);
+
+ return err;
+}
+
+static struct genl_family fou_nl_family = {
+ .id = GENL_ID_GENERATE,
+ .hdrsize = 0,
+ .name = FOU_GENL_NAME,
+ .version = FOU_GENL_VERSION,
+ .maxattr = FOU_ATTR_MAX,
+ .netnsok = true,
+};
+
+static struct nla_policy fou_nl_policy[FOU_ATTR_MAX + 1] = {
+ [FOU_ATTR_PORT] = { .type = NLA_U16, },
+ [FOU_ATTR_AF] = { .type = NLA_U8, },
+ [FOU_ATTR_IPPROTO] = { .type = NLA_U8, },
+ [FOU_ATTR_TYPE] = { .type = NLA_U8, },
+ [FOU_ATTR_REMCSUM_NOPARTIAL] = { .type = NLA_FLAG, },
+};
+
+static int parse_nl_config(struct genl_info *info,
+ struct fou_cfg *cfg)
+{
+ memset(cfg, 0, sizeof(*cfg));
+
+ cfg->udp_config.family = AF_INET;
+
+ if (info->attrs[FOU_ATTR_AF]) {
+ u8 family = nla_get_u8(info->attrs[FOU_ATTR_AF]);
+
+ if (family != AF_INET && family != AF_INET6)
+ return -EINVAL;
+
+ cfg->udp_config.family = family;
+ }
+
+ if (info->attrs[FOU_ATTR_PORT]) {
+ __be16 port = nla_get_be16(info->attrs[FOU_ATTR_PORT]);
+
+ cfg->udp_config.local_udp_port = port;
+ }
+
+ if (info->attrs[FOU_ATTR_IPPROTO])
+ cfg->protocol = nla_get_u8(info->attrs[FOU_ATTR_IPPROTO]);
+
+ if (info->attrs[FOU_ATTR_TYPE])
+ cfg->type = nla_get_u8(info->attrs[FOU_ATTR_TYPE]);
+
+ if (info->attrs[FOU_ATTR_REMCSUM_NOPARTIAL])
+ cfg->flags |= FOU_F_REMCSUM_NOPARTIAL;
+
+ return 0;
+}
+
+static int fou_nl_cmd_add_port(struct sk_buff *skb, struct genl_info *info)
+{
+ struct net *net = genl_info_net(info);
+ struct fou_cfg cfg;
+ int err;
+
+ err = parse_nl_config(info, &cfg);
+ if (err)
+ return err;
+
+ return fou_create(net, &cfg, NULL);
+}
+
+static int fou_nl_cmd_rm_port(struct sk_buff *skb, struct genl_info *info)
+{
+ struct net *net = genl_info_net(info);
+ struct fou_cfg cfg;
+ int err;
+
+ err = parse_nl_config(info, &cfg);
+ if (err)
+ return err;
+
+ return fou_destroy(net, &cfg);
+}
+
+static int fou_fill_info(struct fou *fou, struct sk_buff *msg)
+{
+ if (nla_put_u8(msg, FOU_ATTR_AF, fou->sock->sk->sk_family) ||
+ nla_put_be16(msg, FOU_ATTR_PORT, fou->port) ||
+ nla_put_u8(msg, FOU_ATTR_IPPROTO, fou->protocol) ||
+ nla_put_u8(msg, FOU_ATTR_TYPE, fou->type))
+ return -1;
+
+ if (fou->flags & FOU_F_REMCSUM_NOPARTIAL)
+ if (nla_put_flag(msg, FOU_ATTR_REMCSUM_NOPARTIAL))
+ return -1;
+ return 0;
+}
+
+static int fou_dump_info(struct fou *fou, u32 portid, u32 seq,
+ u32 flags, struct sk_buff *skb, u8 cmd)
+{
+ void *hdr;
+
+ hdr = genlmsg_put(skb, portid, seq, &fou_nl_family, flags, cmd);
+ if (!hdr)
+ return -ENOMEM;
+
+ if (fou_fill_info(fou, skb) < 0)
+ goto nla_put_failure;
+
+ genlmsg_end(skb, hdr);
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(skb, hdr);
+ return -EMSGSIZE;
+}
+
+static int fou_nl_cmd_get_port(struct sk_buff *skb, struct genl_info *info)
+{
+ struct net *net = genl_info_net(info);
+ struct fou_net *fn = net_generic(net, fou_net_id);
+ struct sk_buff *msg;
+ struct fou_cfg cfg;
+ struct fou *fout;
+ __be16 port;
+ int ret;
+
+ ret = parse_nl_config(info, &cfg);
+ if (ret)
+ return ret;
+ port = cfg.udp_config.local_udp_port;
+ if (port == 0)
+ return -EINVAL;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ ret = -ESRCH;
+ mutex_lock(&fn->fou_lock);
+ list_for_each_entry(fout, &fn->fou_list, list) {
+ if (port == fout->port) {
+ ret = fou_dump_info(fout, info->snd_portid,
+ info->snd_seq, 0, msg,
+ info->genlhdr->cmd);
+ break;
+ }
+ }
+ mutex_unlock(&fn->fou_lock);
+ if (ret < 0)
+ goto out_free;
+
+ return genlmsg_reply(msg, info);
+
+out_free:
+ nlmsg_free(msg);
+ return ret;
+}
+
+static int fou_nl_dump(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct net *net = sock_net(skb->sk);
+ struct fou_net *fn = net_generic(net, fou_net_id);
+ struct fou *fout;
+ int idx = 0, ret;
+
+ mutex_lock(&fn->fou_lock);
+ list_for_each_entry(fout, &fn->fou_list, list) {
+ if (idx++ < cb->args[0])
+ continue;
+ ret = fou_dump_info(fout, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, NLM_F_MULTI,
+ skb, FOU_CMD_GET);
+ if (ret)
+ break;
+ }
+ mutex_unlock(&fn->fou_lock);
+
+ cb->args[0] = idx;
+ return skb->len;
+}
+
+static const struct genl_ops fou_nl_ops[] = {
+ {
+ .cmd = FOU_CMD_ADD,
+ .doit = fou_nl_cmd_add_port,
+ .policy = fou_nl_policy,
+ .flags = GENL_ADMIN_PERM,
+ },
+ {
+ .cmd = FOU_CMD_DEL,
+ .doit = fou_nl_cmd_rm_port,
+ .policy = fou_nl_policy,
+ .flags = GENL_ADMIN_PERM,
+ },
+ {
+ .cmd = FOU_CMD_GET,
+ .doit = fou_nl_cmd_get_port,
+ .dumpit = fou_nl_dump,
+ .policy = fou_nl_policy,
+ },
+};
+
+size_t fou_encap_hlen(struct ip_tunnel_encap *e)
+{
+ return sizeof(struct udphdr);
+}
+EXPORT_SYMBOL(fou_encap_hlen);
+
+size_t gue_encap_hlen(struct ip_tunnel_encap *e)
+{
+ size_t len;
+ bool need_priv = false;
+
+ len = sizeof(struct udphdr) + sizeof(struct guehdr);
+
+ if (e->flags & TUNNEL_ENCAP_FLAG_REMCSUM) {
+ len += GUE_PLEN_REMCSUM;
+ need_priv = true;
+ }
+
+ len += need_priv ? GUE_LEN_PRIV : 0;
+
+ return len;
+}
+EXPORT_SYMBOL(gue_encap_hlen);
+
+static void fou_build_udp(struct sk_buff *skb, struct ip_tunnel_encap *e,
+ struct flowi4 *fl4, u8 *protocol, __be16 sport)
+{
+ struct udphdr *uh;
+
+ skb_push(skb, sizeof(struct udphdr));
+ skb_reset_transport_header(skb);
+
+ uh = udp_hdr(skb);
+
+ uh->dest = e->dport;
+ uh->source = sport;
+ uh->len = htons(skb->len);
+ uh->check = 0;
+ udp_set_csum(!(e->flags & TUNNEL_ENCAP_FLAG_CSUM), skb,
+ fl4->saddr, fl4->daddr, skb->len);
+
+ *protocol = IPPROTO_UDP;
+}
+
+int fou_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e,
+ u8 *protocol, struct flowi4 *fl4)
+{
+ bool csum = !!(e->flags & TUNNEL_ENCAP_FLAG_CSUM);
+ int type = csum ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL;
+ __be16 sport;
+
+ skb = iptunnel_handle_offloads(skb, csum, type);
+
+ if (IS_ERR(skb))
+ return PTR_ERR(skb);
+
+ sport = e->sport ? : udp_flow_src_port(dev_net(skb->dev),
+ skb, 0, 0, false);
+ fou_build_udp(skb, e, fl4, protocol, sport);
+
+ return 0;
+}
+EXPORT_SYMBOL(fou_build_header);
+
+int gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e,
+ u8 *protocol, struct flowi4 *fl4)
+{
+ bool csum = !!(e->flags & TUNNEL_ENCAP_FLAG_CSUM);
+ int type = csum ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL;
+ struct guehdr *guehdr;
+ size_t hdrlen, optlen = 0;
+ __be16 sport;
+ void *data;
+ bool need_priv = false;
+
+ if ((e->flags & TUNNEL_ENCAP_FLAG_REMCSUM) &&
+ skb->ip_summed == CHECKSUM_PARTIAL) {
+ csum = false;
+ optlen += GUE_PLEN_REMCSUM;
+ type |= SKB_GSO_TUNNEL_REMCSUM;
+ need_priv = true;
+ }
+
+ optlen += need_priv ? GUE_LEN_PRIV : 0;
+
+ skb = iptunnel_handle_offloads(skb, csum, type);
+
+ if (IS_ERR(skb))
+ return PTR_ERR(skb);
+
+ /* Get source port (based on flow hash) before skb_push */
+ sport = e->sport ? : udp_flow_src_port(dev_net(skb->dev),
+ skb, 0, 0, false);
+
+ hdrlen = sizeof(struct guehdr) + optlen;
+
+ skb_push(skb, hdrlen);
+
+ guehdr = (struct guehdr *)skb->data;
+
+ guehdr->control = 0;
+ guehdr->version = 0;
+ guehdr->hlen = optlen >> 2;
+ guehdr->flags = 0;
+ guehdr->proto_ctype = *protocol;
+
+ data = &guehdr[1];
+
+ if (need_priv) {
+ __be32 *flags = data;
+
+ guehdr->flags |= GUE_FLAG_PRIV;
+ *flags = 0;
+ data += GUE_LEN_PRIV;
+
+ if (type & SKB_GSO_TUNNEL_REMCSUM) {
+ u16 csum_start = skb_checksum_start_offset(skb);
+ __be16 *pd = data;
+
+ if (csum_start < hdrlen)
+ return -EINVAL;
+
+ csum_start -= hdrlen;
+ pd[0] = htons(csum_start);
+ pd[1] = htons(csum_start + skb->csum_offset);
+
+ if (!skb_is_gso(skb)) {
+ skb->ip_summed = CHECKSUM_NONE;
+ skb->encapsulation = 0;
+ }
+
+ *flags |= GUE_PFLAG_REMCSUM;
+ data += GUE_PLEN_REMCSUM;
+ }
+
+ }
+
+ fou_build_udp(skb, e, fl4, protocol, sport);
+
+ return 0;
+}
+EXPORT_SYMBOL(gue_build_header);
+
+#ifdef CONFIG_NET_FOU_IP_TUNNELS
+
+static const struct ip_tunnel_encap_ops fou_iptun_ops = {
+ .encap_hlen = fou_encap_hlen,
+ .build_header = fou_build_header,
+};
+
+static const struct ip_tunnel_encap_ops gue_iptun_ops = {
+ .encap_hlen = gue_encap_hlen,
+ .build_header = gue_build_header,
+};
+
+static int ip_tunnel_encap_add_fou_ops(void)
+{
+ int ret;
+
+ ret = ip_tunnel_encap_add_ops(&fou_iptun_ops, TUNNEL_ENCAP_FOU);
+ if (ret < 0) {
+ pr_err("can't add fou ops\n");
+ return ret;
+ }
+
+ ret = ip_tunnel_encap_add_ops(&gue_iptun_ops, TUNNEL_ENCAP_GUE);
+ if (ret < 0) {
+ pr_err("can't add gue ops\n");
+ ip_tunnel_encap_del_ops(&fou_iptun_ops, TUNNEL_ENCAP_FOU);
+ return ret;
+ }
+
+ return 0;
+}
+
+static void ip_tunnel_encap_del_fou_ops(void)
+{
+ ip_tunnel_encap_del_ops(&fou_iptun_ops, TUNNEL_ENCAP_FOU);
+ ip_tunnel_encap_del_ops(&gue_iptun_ops, TUNNEL_ENCAP_GUE);
+}
+
+#else
+
+static int ip_tunnel_encap_add_fou_ops(void)
+{
+ return 0;
+}
+
+static void ip_tunnel_encap_del_fou_ops(void)
+{
+}
+
+#endif
+
+static __net_init int fou_init_net(struct net *net)
+{
+ struct fou_net *fn = net_generic(net, fou_net_id);
+
+ INIT_LIST_HEAD(&fn->fou_list);
+ mutex_init(&fn->fou_lock);
+ return 0;
+}
+
+static __net_exit void fou_exit_net(struct net *net)
+{
+ struct fou_net *fn = net_generic(net, fou_net_id);
+ struct fou *fou, *next;
+
+ /* Close all the FOU sockets */
+ mutex_lock(&fn->fou_lock);
+ list_for_each_entry_safe(fou, next, &fn->fou_list, list)
+ fou_release(fou);
+ mutex_unlock(&fn->fou_lock);
+}
+
+static struct pernet_operations fou_net_ops = {
+ .init = fou_init_net,
+ .exit = fou_exit_net,
+ .id = &fou_net_id,
+ .size = sizeof(struct fou_net),
+};
+
+static int __init fou_init(void)
+{
+ int ret;
+
+ ret = register_pernet_device(&fou_net_ops);
+ if (ret)
+ goto exit;
+
+ ret = genl_register_family_with_ops(&fou_nl_family,
+ fou_nl_ops);
+ if (ret < 0)
+ goto unregister;
+
+ ret = ip_tunnel_encap_add_fou_ops();
+ if (ret == 0)
+ return 0;
+
+ genl_unregister_family(&fou_nl_family);
+unregister:
+ unregister_pernet_device(&fou_net_ops);
+exit:
+ return ret;
+}
+
+static void __exit fou_fini(void)
+{
+ ip_tunnel_encap_del_fou_ops();
+ genl_unregister_family(&fou_nl_family);
+ unregister_pernet_device(&fou_net_ops);
+}
+
+module_init(fou_init);
+module_exit(fou_fini);
+MODULE_AUTHOR("Tom Herbert <therbert@google.com>");
+MODULE_LICENSE("GPL");
diff --git a/net/ipv4/geneve.c b/net/ipv4/geneve.c
new file mode 100644
index 000000000..8986e63f3
--- /dev/null
+++ b/net/ipv4/geneve.c
@@ -0,0 +1,453 @@
+/*
+ * Geneve: Generic Network Virtualization Encapsulation
+ *
+ * Copyright (c) 2014 Nicira, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/skbuff.h>
+#include <linux/list.h>
+#include <linux/netdevice.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/udp.h>
+#include <linux/igmp.h>
+#include <linux/etherdevice.h>
+#include <linux/if_ether.h>
+#include <linux/if_vlan.h>
+#include <linux/ethtool.h>
+#include <linux/mutex.h>
+#include <net/arp.h>
+#include <net/ndisc.h>
+#include <net/ip.h>
+#include <net/ip_tunnels.h>
+#include <net/icmp.h>
+#include <net/udp.h>
+#include <net/rtnetlink.h>
+#include <net/route.h>
+#include <net/dsfield.h>
+#include <net/inet_ecn.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+#include <net/geneve.h>
+#include <net/protocol.h>
+#include <net/udp_tunnel.h>
+#if IS_ENABLED(CONFIG_IPV6)
+#include <net/ipv6.h>
+#include <net/addrconf.h>
+#include <net/ip6_tunnel.h>
+#include <net/ip6_checksum.h>
+#endif
+
+/* Protects sock_list and refcounts. */
+static DEFINE_MUTEX(geneve_mutex);
+
+/* per-network namespace private data for this module */
+struct geneve_net {
+ struct list_head sock_list;
+};
+
+static int geneve_net_id;
+
+static inline struct genevehdr *geneve_hdr(const struct sk_buff *skb)
+{
+ return (struct genevehdr *)(udp_hdr(skb) + 1);
+}
+
+static struct geneve_sock *geneve_find_sock(struct net *net,
+ sa_family_t family, __be16 port)
+{
+ struct geneve_net *gn = net_generic(net, geneve_net_id);
+ struct geneve_sock *gs;
+
+ list_for_each_entry(gs, &gn->sock_list, list) {
+ if (inet_sk(gs->sock->sk)->inet_sport == port &&
+ inet_sk(gs->sock->sk)->sk.sk_family == family)
+ return gs;
+ }
+
+ return NULL;
+}
+
+static void geneve_build_header(struct genevehdr *geneveh,
+ __be16 tun_flags, u8 vni[3],
+ u8 options_len, u8 *options)
+{
+ geneveh->ver = GENEVE_VER;
+ geneveh->opt_len = options_len / 4;
+ geneveh->oam = !!(tun_flags & TUNNEL_OAM);
+ geneveh->critical = !!(tun_flags & TUNNEL_CRIT_OPT);
+ geneveh->rsvd1 = 0;
+ memcpy(geneveh->vni, vni, 3);
+ geneveh->proto_type = htons(ETH_P_TEB);
+ geneveh->rsvd2 = 0;
+
+ memcpy(geneveh->options, options, options_len);
+}
+
+/* Transmit a fully formatted Geneve frame.
+ *
+ * When calling this function. The skb->data should point
+ * to the geneve header which is fully formed.
+ *
+ * This function will add other UDP tunnel headers.
+ */
+int geneve_xmit_skb(struct geneve_sock *gs, struct rtable *rt,
+ struct sk_buff *skb, __be32 src, __be32 dst, __u8 tos,
+ __u8 ttl, __be16 df, __be16 src_port, __be16 dst_port,
+ __be16 tun_flags, u8 vni[3], u8 opt_len, u8 *opt,
+ bool csum, bool xnet)
+{
+ struct genevehdr *gnvh;
+ int min_headroom;
+ int err;
+
+ min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len
+ + GENEVE_BASE_HLEN + opt_len + sizeof(struct iphdr)
+ + (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0);
+
+ err = skb_cow_head(skb, min_headroom);
+ if (unlikely(err)) {
+ kfree_skb(skb);
+ return err;
+ }
+
+ skb = vlan_hwaccel_push_inside(skb);
+ if (unlikely(!skb))
+ return -ENOMEM;
+
+ skb = udp_tunnel_handle_offloads(skb, csum);
+ if (IS_ERR(skb))
+ return PTR_ERR(skb);
+
+ gnvh = (struct genevehdr *)__skb_push(skb, sizeof(*gnvh) + opt_len);
+ geneve_build_header(gnvh, tun_flags, vni, opt_len, opt);
+
+ skb_set_inner_protocol(skb, htons(ETH_P_TEB));
+
+ return udp_tunnel_xmit_skb(rt, gs->sock->sk, skb, src, dst,
+ tos, ttl, df, src_port, dst_port, xnet,
+ !csum);
+}
+EXPORT_SYMBOL_GPL(geneve_xmit_skb);
+
+static int geneve_hlen(struct genevehdr *gh)
+{
+ return sizeof(*gh) + gh->opt_len * 4;
+}
+
+static struct sk_buff **geneve_gro_receive(struct sk_buff **head,
+ struct sk_buff *skb,
+ struct udp_offload *uoff)
+{
+ struct sk_buff *p, **pp = NULL;
+ struct genevehdr *gh, *gh2;
+ unsigned int hlen, gh_len, off_gnv;
+ const struct packet_offload *ptype;
+ __be16 type;
+ int flush = 1;
+
+ off_gnv = skb_gro_offset(skb);
+ hlen = off_gnv + sizeof(*gh);
+ gh = skb_gro_header_fast(skb, off_gnv);
+ if (skb_gro_header_hard(skb, hlen)) {
+ gh = skb_gro_header_slow(skb, hlen, off_gnv);
+ if (unlikely(!gh))
+ goto out;
+ }
+
+ if (gh->ver != GENEVE_VER || gh->oam)
+ goto out;
+ gh_len = geneve_hlen(gh);
+
+ hlen = off_gnv + gh_len;
+ if (skb_gro_header_hard(skb, hlen)) {
+ gh = skb_gro_header_slow(skb, hlen, off_gnv);
+ if (unlikely(!gh))
+ goto out;
+ }
+
+ flush = 0;
+
+ for (p = *head; p; p = p->next) {
+ if (!NAPI_GRO_CB(p)->same_flow)
+ continue;
+
+ gh2 = (struct genevehdr *)(p->data + off_gnv);
+ if (gh->opt_len != gh2->opt_len ||
+ memcmp(gh, gh2, gh_len)) {
+ NAPI_GRO_CB(p)->same_flow = 0;
+ continue;
+ }
+ }
+
+ type = gh->proto_type;
+
+ rcu_read_lock();
+ ptype = gro_find_receive_by_type(type);
+ if (!ptype) {
+ flush = 1;
+ goto out_unlock;
+ }
+
+ skb_gro_pull(skb, gh_len);
+ skb_gro_postpull_rcsum(skb, gh, gh_len);
+ pp = ptype->callbacks.gro_receive(head, skb);
+
+out_unlock:
+ rcu_read_unlock();
+out:
+ NAPI_GRO_CB(skb)->flush |= flush;
+
+ return pp;
+}
+
+static int geneve_gro_complete(struct sk_buff *skb, int nhoff,
+ struct udp_offload *uoff)
+{
+ struct genevehdr *gh;
+ struct packet_offload *ptype;
+ __be16 type;
+ int gh_len;
+ int err = -ENOSYS;
+
+ udp_tunnel_gro_complete(skb, nhoff);
+
+ gh = (struct genevehdr *)(skb->data + nhoff);
+ gh_len = geneve_hlen(gh);
+ type = gh->proto_type;
+
+ rcu_read_lock();
+ ptype = gro_find_complete_by_type(type);
+ if (ptype)
+ err = ptype->callbacks.gro_complete(skb, nhoff + gh_len);
+
+ rcu_read_unlock();
+ return err;
+}
+
+static void geneve_notify_add_rx_port(struct geneve_sock *gs)
+{
+ struct sock *sk = gs->sock->sk;
+ sa_family_t sa_family = sk->sk_family;
+ int err;
+
+ if (sa_family == AF_INET) {
+ err = udp_add_offload(&gs->udp_offloads);
+ if (err)
+ pr_warn("geneve: udp_add_offload failed with status %d\n",
+ err);
+ }
+}
+
+static void geneve_notify_del_rx_port(struct geneve_sock *gs)
+{
+ struct sock *sk = gs->sock->sk;
+ sa_family_t sa_family = sk->sk_family;
+
+ if (sa_family == AF_INET)
+ udp_del_offload(&gs->udp_offloads);
+}
+
+/* Callback from net/ipv4/udp.c to receive packets */
+static int geneve_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
+{
+ struct genevehdr *geneveh;
+ struct geneve_sock *gs;
+ int opts_len;
+
+ /* Need Geneve and inner Ethernet header to be present */
+ if (unlikely(!pskb_may_pull(skb, GENEVE_BASE_HLEN)))
+ goto error;
+
+ /* Return packets with reserved bits set */
+ geneveh = geneve_hdr(skb);
+
+ if (unlikely(geneveh->ver != GENEVE_VER))
+ goto error;
+
+ if (unlikely(geneveh->proto_type != htons(ETH_P_TEB)))
+ goto error;
+
+ opts_len = geneveh->opt_len * 4;
+ if (iptunnel_pull_header(skb, GENEVE_BASE_HLEN + opts_len,
+ htons(ETH_P_TEB)))
+ goto drop;
+
+ gs = rcu_dereference_sk_user_data(sk);
+ if (!gs)
+ goto drop;
+
+ gs->rcv(gs, skb);
+ return 0;
+
+drop:
+ /* Consume bad packet */
+ kfree_skb(skb);
+ return 0;
+
+error:
+ /* Let the UDP layer deal with the skb */
+ return 1;
+}
+
+static struct socket *geneve_create_sock(struct net *net, bool ipv6,
+ __be16 port)
+{
+ struct socket *sock;
+ struct udp_port_cfg udp_conf;
+ int err;
+
+ memset(&udp_conf, 0, sizeof(udp_conf));
+
+ if (ipv6) {
+ udp_conf.family = AF_INET6;
+ } else {
+ udp_conf.family = AF_INET;
+ udp_conf.local_ip.s_addr = htonl(INADDR_ANY);
+ }
+
+ udp_conf.local_udp_port = port;
+
+ /* Open UDP socket */
+ err = udp_sock_create(net, &udp_conf, &sock);
+ if (err < 0)
+ return ERR_PTR(err);
+
+ return sock;
+}
+
+/* Create new listen socket if needed */
+static struct geneve_sock *geneve_socket_create(struct net *net, __be16 port,
+ geneve_rcv_t *rcv, void *data,
+ bool ipv6)
+{
+ struct geneve_net *gn = net_generic(net, geneve_net_id);
+ struct geneve_sock *gs;
+ struct socket *sock;
+ struct udp_tunnel_sock_cfg tunnel_cfg;
+
+ gs = kzalloc(sizeof(*gs), GFP_KERNEL);
+ if (!gs)
+ return ERR_PTR(-ENOMEM);
+
+ sock = geneve_create_sock(net, ipv6, port);
+ if (IS_ERR(sock)) {
+ kfree(gs);
+ return ERR_CAST(sock);
+ }
+
+ gs->sock = sock;
+ gs->refcnt = 1;
+ gs->rcv = rcv;
+ gs->rcv_data = data;
+
+ /* Initialize the geneve udp offloads structure */
+ gs->udp_offloads.port = port;
+ gs->udp_offloads.callbacks.gro_receive = geneve_gro_receive;
+ gs->udp_offloads.callbacks.gro_complete = geneve_gro_complete;
+ geneve_notify_add_rx_port(gs);
+
+ /* Mark socket as an encapsulation socket */
+ tunnel_cfg.sk_user_data = gs;
+ tunnel_cfg.encap_type = 1;
+ tunnel_cfg.encap_rcv = geneve_udp_encap_recv;
+ tunnel_cfg.encap_destroy = NULL;
+ setup_udp_tunnel_sock(net, sock, &tunnel_cfg);
+
+ list_add(&gs->list, &gn->sock_list);
+
+ return gs;
+}
+
+struct geneve_sock *geneve_sock_add(struct net *net, __be16 port,
+ geneve_rcv_t *rcv, void *data,
+ bool no_share, bool ipv6)
+{
+ struct geneve_sock *gs;
+
+ mutex_lock(&geneve_mutex);
+
+ gs = geneve_find_sock(net, ipv6 ? AF_INET6 : AF_INET, port);
+ if (gs) {
+ if (!no_share && gs->rcv == rcv)
+ gs->refcnt++;
+ else
+ gs = ERR_PTR(-EBUSY);
+ } else {
+ gs = geneve_socket_create(net, port, rcv, data, ipv6);
+ }
+
+ mutex_unlock(&geneve_mutex);
+
+ return gs;
+}
+EXPORT_SYMBOL_GPL(geneve_sock_add);
+
+void geneve_sock_release(struct geneve_sock *gs)
+{
+ mutex_lock(&geneve_mutex);
+
+ if (--gs->refcnt)
+ goto unlock;
+
+ list_del(&gs->list);
+ geneve_notify_del_rx_port(gs);
+ udp_tunnel_sock_release(gs->sock);
+ kfree_rcu(gs, rcu);
+
+unlock:
+ mutex_unlock(&geneve_mutex);
+}
+EXPORT_SYMBOL_GPL(geneve_sock_release);
+
+static __net_init int geneve_init_net(struct net *net)
+{
+ struct geneve_net *gn = net_generic(net, geneve_net_id);
+
+ INIT_LIST_HEAD(&gn->sock_list);
+
+ return 0;
+}
+
+static struct pernet_operations geneve_net_ops = {
+ .init = geneve_init_net,
+ .id = &geneve_net_id,
+ .size = sizeof(struct geneve_net),
+};
+
+static int __init geneve_init_module(void)
+{
+ int rc;
+
+ rc = register_pernet_subsys(&geneve_net_ops);
+ if (rc)
+ return rc;
+
+ pr_info("Geneve driver\n");
+
+ return 0;
+}
+module_init(geneve_init_module);
+
+static void __exit geneve_cleanup_module(void)
+{
+ unregister_pernet_subsys(&geneve_net_ops);
+}
+module_exit(geneve_cleanup_module);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Jesse Gross <jesse@nicira.com>");
+MODULE_DESCRIPTION("Driver for GENEVE encapsulated traffic");
+MODULE_ALIAS_RTNL_LINK("geneve");
diff --git a/net/ipv4/gre_demux.c b/net/ipv4/gre_demux.c
new file mode 100644
index 000000000..4a7b5b2a1
--- /dev/null
+++ b/net/ipv4/gre_demux.c
@@ -0,0 +1,367 @@
+/*
+ * GRE over IPv4 demultiplexer driver
+ *
+ * Authors: Dmitry Kozlov (xeb@mail.ru)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/if.h>
+#include <linux/icmp.h>
+#include <linux/kernel.h>
+#include <linux/kmod.h>
+#include <linux/skbuff.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/netdevice.h>
+#include <linux/if_tunnel.h>
+#include <linux/spinlock.h>
+#include <net/protocol.h>
+#include <net/gre.h>
+
+#include <net/icmp.h>
+#include <net/route.h>
+#include <net/xfrm.h>
+
+static const struct gre_protocol __rcu *gre_proto[GREPROTO_MAX] __read_mostly;
+static struct gre_cisco_protocol __rcu *gre_cisco_proto_list[GRE_IP_PROTO_MAX];
+
+int gre_add_protocol(const struct gre_protocol *proto, u8 version)
+{
+ if (version >= GREPROTO_MAX)
+ return -EINVAL;
+
+ return (cmpxchg((const struct gre_protocol **)&gre_proto[version], NULL, proto) == NULL) ?
+ 0 : -EBUSY;
+}
+EXPORT_SYMBOL_GPL(gre_add_protocol);
+
+int gre_del_protocol(const struct gre_protocol *proto, u8 version)
+{
+ int ret;
+
+ if (version >= GREPROTO_MAX)
+ return -EINVAL;
+
+ ret = (cmpxchg((const struct gre_protocol **)&gre_proto[version], proto, NULL) == proto) ?
+ 0 : -EBUSY;
+
+ if (ret)
+ return ret;
+
+ synchronize_rcu();
+ return 0;
+}
+EXPORT_SYMBOL_GPL(gre_del_protocol);
+
+void gre_build_header(struct sk_buff *skb, const struct tnl_ptk_info *tpi,
+ int hdr_len)
+{
+ struct gre_base_hdr *greh;
+
+ skb_push(skb, hdr_len);
+
+ skb_reset_transport_header(skb);
+ greh = (struct gre_base_hdr *)skb->data;
+ greh->flags = tnl_flags_to_gre_flags(tpi->flags);
+ greh->protocol = tpi->proto;
+
+ if (tpi->flags&(TUNNEL_KEY|TUNNEL_CSUM|TUNNEL_SEQ)) {
+ __be32 *ptr = (__be32 *)(((u8 *)greh) + hdr_len - 4);
+
+ if (tpi->flags&TUNNEL_SEQ) {
+ *ptr = tpi->seq;
+ ptr--;
+ }
+ if (tpi->flags&TUNNEL_KEY) {
+ *ptr = tpi->key;
+ ptr--;
+ }
+ if (tpi->flags&TUNNEL_CSUM &&
+ !(skb_shinfo(skb)->gso_type &
+ (SKB_GSO_GRE|SKB_GSO_GRE_CSUM))) {
+ *ptr = 0;
+ *(__sum16 *)ptr = csum_fold(skb_checksum(skb, 0,
+ skb->len, 0));
+ }
+ }
+}
+EXPORT_SYMBOL_GPL(gre_build_header);
+
+static int parse_gre_header(struct sk_buff *skb, struct tnl_ptk_info *tpi,
+ bool *csum_err)
+{
+ const struct gre_base_hdr *greh;
+ __be32 *options;
+ int hdr_len;
+
+ if (unlikely(!pskb_may_pull(skb, sizeof(struct gre_base_hdr))))
+ return -EINVAL;
+
+ greh = (struct gre_base_hdr *)skb_transport_header(skb);
+ if (unlikely(greh->flags & (GRE_VERSION | GRE_ROUTING)))
+ return -EINVAL;
+
+ tpi->flags = gre_flags_to_tnl_flags(greh->flags);
+ hdr_len = ip_gre_calc_hlen(tpi->flags);
+
+ if (!pskb_may_pull(skb, hdr_len))
+ return -EINVAL;
+
+ greh = (struct gre_base_hdr *)skb_transport_header(skb);
+ tpi->proto = greh->protocol;
+
+ options = (__be32 *)(greh + 1);
+ if (greh->flags & GRE_CSUM) {
+ if (skb_checksum_simple_validate(skb)) {
+ *csum_err = true;
+ return -EINVAL;
+ }
+
+ skb_checksum_try_convert(skb, IPPROTO_GRE, 0,
+ null_compute_pseudo);
+
+ options++;
+ }
+
+ if (greh->flags & GRE_KEY) {
+ tpi->key = *options;
+ options++;
+ } else
+ tpi->key = 0;
+
+ if (unlikely(greh->flags & GRE_SEQ)) {
+ tpi->seq = *options;
+ options++;
+ } else
+ tpi->seq = 0;
+
+ /* WCCP version 1 and 2 protocol decoding.
+ * - Change protocol to IP
+ * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
+ */
+ if (greh->flags == 0 && tpi->proto == htons(ETH_P_WCCP)) {
+ tpi->proto = htons(ETH_P_IP);
+ if ((*(u8 *)options & 0xF0) != 0x40) {
+ hdr_len += 4;
+ if (!pskb_may_pull(skb, hdr_len))
+ return -EINVAL;
+ }
+ }
+
+ return iptunnel_pull_header(skb, hdr_len, tpi->proto);
+}
+
+static int gre_cisco_rcv(struct sk_buff *skb)
+{
+ struct tnl_ptk_info tpi;
+ int i;
+ bool csum_err = false;
+
+#ifdef CONFIG_NET_IPGRE_BROADCAST
+ if (ipv4_is_multicast(ip_hdr(skb)->daddr)) {
+ /* Looped back packet, drop it! */
+ if (rt_is_output_route(skb_rtable(skb)))
+ goto drop;
+ }
+#endif
+
+ if (parse_gre_header(skb, &tpi, &csum_err) < 0)
+ goto drop;
+
+ rcu_read_lock();
+ for (i = 0; i < GRE_IP_PROTO_MAX; i++) {
+ struct gre_cisco_protocol *proto;
+ int ret;
+
+ proto = rcu_dereference(gre_cisco_proto_list[i]);
+ if (!proto)
+ continue;
+ ret = proto->handler(skb, &tpi);
+ if (ret == PACKET_RCVD) {
+ rcu_read_unlock();
+ return 0;
+ }
+ }
+ rcu_read_unlock();
+
+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
+drop:
+ kfree_skb(skb);
+ return 0;
+}
+
+static void gre_cisco_err(struct sk_buff *skb, u32 info)
+{
+ /* All the routers (except for Linux) return only
+ * 8 bytes of packet payload. It means, that precise relaying of
+ * ICMP in the real Internet is absolutely infeasible.
+ *
+ * Moreover, Cisco "wise men" put GRE key to the third word
+ * in GRE header. It makes impossible maintaining even soft
+ * state for keyed
+ * GRE tunnels with enabled checksum. Tell them "thank you".
+ *
+ * Well, I wonder, rfc1812 was written by Cisco employee,
+ * what the hell these idiots break standards established
+ * by themselves???
+ */
+
+ const int type = icmp_hdr(skb)->type;
+ const int code = icmp_hdr(skb)->code;
+ struct tnl_ptk_info tpi;
+ bool csum_err = false;
+ int i;
+
+ if (parse_gre_header(skb, &tpi, &csum_err)) {
+ if (!csum_err) /* ignore csum errors. */
+ return;
+ }
+
+ if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
+ ipv4_update_pmtu(skb, dev_net(skb->dev), info,
+ skb->dev->ifindex, 0, IPPROTO_GRE, 0);
+ return;
+ }
+ if (type == ICMP_REDIRECT) {
+ ipv4_redirect(skb, dev_net(skb->dev), skb->dev->ifindex, 0,
+ IPPROTO_GRE, 0);
+ return;
+ }
+
+ rcu_read_lock();
+ for (i = 0; i < GRE_IP_PROTO_MAX; i++) {
+ struct gre_cisco_protocol *proto;
+
+ proto = rcu_dereference(gre_cisco_proto_list[i]);
+ if (!proto)
+ continue;
+
+ if (proto->err_handler(skb, info, &tpi) == PACKET_RCVD)
+ goto out;
+
+ }
+out:
+ rcu_read_unlock();
+}
+
+static int gre_rcv(struct sk_buff *skb)
+{
+ const struct gre_protocol *proto;
+ u8 ver;
+ int ret;
+
+ if (!pskb_may_pull(skb, 12))
+ goto drop;
+
+ ver = skb->data[1]&0x7f;
+ if (ver >= GREPROTO_MAX)
+ goto drop;
+
+ rcu_read_lock();
+ proto = rcu_dereference(gre_proto[ver]);
+ if (!proto || !proto->handler)
+ goto drop_unlock;
+ ret = proto->handler(skb);
+ rcu_read_unlock();
+ return ret;
+
+drop_unlock:
+ rcu_read_unlock();
+drop:
+ kfree_skb(skb);
+ return NET_RX_DROP;
+}
+
+static void gre_err(struct sk_buff *skb, u32 info)
+{
+ const struct gre_protocol *proto;
+ const struct iphdr *iph = (const struct iphdr *)skb->data;
+ u8 ver = skb->data[(iph->ihl<<2) + 1]&0x7f;
+
+ if (ver >= GREPROTO_MAX)
+ return;
+
+ rcu_read_lock();
+ proto = rcu_dereference(gre_proto[ver]);
+ if (proto && proto->err_handler)
+ proto->err_handler(skb, info);
+ rcu_read_unlock();
+}
+
+static const struct net_protocol net_gre_protocol = {
+ .handler = gre_rcv,
+ .err_handler = gre_err,
+ .netns_ok = 1,
+};
+
+static const struct gre_protocol ipgre_protocol = {
+ .handler = gre_cisco_rcv,
+ .err_handler = gre_cisco_err,
+};
+
+int gre_cisco_register(struct gre_cisco_protocol *newp)
+{
+ struct gre_cisco_protocol **proto = (struct gre_cisco_protocol **)
+ &gre_cisco_proto_list[newp->priority];
+
+ return (cmpxchg(proto, NULL, newp) == NULL) ? 0 : -EBUSY;
+}
+EXPORT_SYMBOL_GPL(gre_cisco_register);
+
+int gre_cisco_unregister(struct gre_cisco_protocol *del_proto)
+{
+ struct gre_cisco_protocol **proto = (struct gre_cisco_protocol **)
+ &gre_cisco_proto_list[del_proto->priority];
+ int ret;
+
+ ret = (cmpxchg(proto, del_proto, NULL) == del_proto) ? 0 : -EINVAL;
+
+ if (ret)
+ return ret;
+
+ synchronize_net();
+ return 0;
+}
+EXPORT_SYMBOL_GPL(gre_cisco_unregister);
+
+static int __init gre_init(void)
+{
+ pr_info("GRE over IPv4 demultiplexor driver\n");
+
+ if (inet_add_protocol(&net_gre_protocol, IPPROTO_GRE) < 0) {
+ pr_err("can't add protocol\n");
+ goto err;
+ }
+
+ if (gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO) < 0) {
+ pr_info("%s: can't add ipgre handler\n", __func__);
+ goto err_gre;
+ }
+
+ return 0;
+err_gre:
+ inet_del_protocol(&net_gre_protocol, IPPROTO_GRE);
+err:
+ return -EAGAIN;
+}
+
+static void __exit gre_exit(void)
+{
+ gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
+ inet_del_protocol(&net_gre_protocol, IPPROTO_GRE);
+}
+
+module_init(gre_init);
+module_exit(gre_exit);
+
+MODULE_DESCRIPTION("GRE over IPv4 demultiplexer driver");
+MODULE_AUTHOR("D. Kozlov (xeb@mail.ru)");
+MODULE_LICENSE("GPL");
diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c
new file mode 100644
index 000000000..5aa46d4b4
--- /dev/null
+++ b/net/ipv4/gre_offload.c
@@ -0,0 +1,268 @@
+/*
+ * IPV4 GSO/GRO offload support
+ * Linux INET implementation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * GRE GSO support
+ */
+
+#include <linux/skbuff.h>
+#include <linux/init.h>
+#include <net/protocol.h>
+#include <net/gre.h>
+
+static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
+ netdev_features_t features)
+{
+ struct sk_buff *segs = ERR_PTR(-EINVAL);
+ netdev_features_t enc_features;
+ int ghl;
+ struct gre_base_hdr *greh;
+ u16 mac_offset = skb->mac_header;
+ int mac_len = skb->mac_len;
+ __be16 protocol = skb->protocol;
+ int tnl_hlen;
+ bool csum;
+
+ if (unlikely(skb_shinfo(skb)->gso_type &
+ ~(SKB_GSO_TCPV4 |
+ SKB_GSO_TCPV6 |
+ SKB_GSO_UDP |
+ SKB_GSO_DODGY |
+ SKB_GSO_TCP_ECN |
+ SKB_GSO_GRE |
+ SKB_GSO_GRE_CSUM |
+ SKB_GSO_IPIP)))
+ goto out;
+
+ if (!skb->encapsulation)
+ goto out;
+
+ if (unlikely(!pskb_may_pull(skb, sizeof(*greh))))
+ goto out;
+
+ greh = (struct gre_base_hdr *)skb_transport_header(skb);
+
+ ghl = skb_inner_mac_header(skb) - skb_transport_header(skb);
+ if (unlikely(ghl < sizeof(*greh)))
+ goto out;
+
+ csum = !!(greh->flags & GRE_CSUM);
+ if (csum)
+ skb->encap_hdr_csum = 1;
+
+ /* setup inner skb. */
+ skb->protocol = greh->protocol;
+ skb->encapsulation = 0;
+
+ if (unlikely(!pskb_may_pull(skb, ghl)))
+ goto out;
+
+ __skb_pull(skb, ghl);
+ skb_reset_mac_header(skb);
+ skb_set_network_header(skb, skb_inner_network_offset(skb));
+ skb->mac_len = skb_inner_network_offset(skb);
+
+ /* segment inner packet. */
+ enc_features = skb->dev->hw_enc_features & features;
+ segs = skb_mac_gso_segment(skb, enc_features);
+ if (IS_ERR_OR_NULL(segs)) {
+ skb_gso_error_unwind(skb, protocol, ghl, mac_offset, mac_len);
+ goto out;
+ }
+
+ skb = segs;
+ tnl_hlen = skb_tnl_header_len(skb);
+ do {
+ __skb_push(skb, ghl);
+ if (csum) {
+ __be32 *pcsum;
+
+ if (skb_has_shared_frag(skb)) {
+ int err;
+
+ err = __skb_linearize(skb);
+ if (err) {
+ kfree_skb_list(segs);
+ segs = ERR_PTR(err);
+ goto out;
+ }
+ }
+
+ skb_reset_transport_header(skb);
+
+ greh = (struct gre_base_hdr *)
+ skb_transport_header(skb);
+ pcsum = (__be32 *)(greh + 1);
+ *pcsum = 0;
+ *(__sum16 *)pcsum = gso_make_checksum(skb, 0);
+ }
+ __skb_push(skb, tnl_hlen - ghl);
+
+ skb_reset_inner_headers(skb);
+ skb->encapsulation = 1;
+
+ skb_reset_mac_header(skb);
+ skb_set_network_header(skb, mac_len);
+ skb->mac_len = mac_len;
+ skb->protocol = protocol;
+ } while ((skb = skb->next));
+out:
+ return segs;
+}
+
+static struct sk_buff **gre_gro_receive(struct sk_buff **head,
+ struct sk_buff *skb)
+{
+ struct sk_buff **pp = NULL;
+ struct sk_buff *p;
+ const struct gre_base_hdr *greh;
+ unsigned int hlen, grehlen;
+ unsigned int off;
+ int flush = 1;
+ struct packet_offload *ptype;
+ __be16 type;
+
+ off = skb_gro_offset(skb);
+ hlen = off + sizeof(*greh);
+ greh = skb_gro_header_fast(skb, off);
+ if (skb_gro_header_hard(skb, hlen)) {
+ greh = skb_gro_header_slow(skb, hlen, off);
+ if (unlikely(!greh))
+ goto out;
+ }
+
+ /* Only support version 0 and K (key), C (csum) flags. Note that
+ * although the support for the S (seq#) flag can be added easily
+ * for GRO, this is problematic for GSO hence can not be enabled
+ * here because a GRO pkt may end up in the forwarding path, thus
+ * requiring GSO support to break it up correctly.
+ */
+ if ((greh->flags & ~(GRE_KEY|GRE_CSUM)) != 0)
+ goto out;
+
+ type = greh->protocol;
+
+ rcu_read_lock();
+ ptype = gro_find_receive_by_type(type);
+ if (!ptype)
+ goto out_unlock;
+
+ grehlen = GRE_HEADER_SECTION;
+
+ if (greh->flags & GRE_KEY)
+ grehlen += GRE_HEADER_SECTION;
+
+ if (greh->flags & GRE_CSUM)
+ grehlen += GRE_HEADER_SECTION;
+
+ hlen = off + grehlen;
+ if (skb_gro_header_hard(skb, hlen)) {
+ greh = skb_gro_header_slow(skb, hlen, off);
+ if (unlikely(!greh))
+ goto out_unlock;
+ }
+
+ /* Don't bother verifying checksum if we're going to flush anyway. */
+ if ((greh->flags & GRE_CSUM) && !NAPI_GRO_CB(skb)->flush) {
+ if (skb_gro_checksum_simple_validate(skb))
+ goto out_unlock;
+
+ skb_gro_checksum_try_convert(skb, IPPROTO_GRE, 0,
+ null_compute_pseudo);
+ }
+
+ flush = 0;
+
+ for (p = *head; p; p = p->next) {
+ const struct gre_base_hdr *greh2;
+
+ if (!NAPI_GRO_CB(p)->same_flow)
+ continue;
+
+ /* The following checks are needed to ensure only pkts
+ * from the same tunnel are considered for aggregation.
+ * The criteria for "the same tunnel" includes:
+ * 1) same version (we only support version 0 here)
+ * 2) same protocol (we only support ETH_P_IP for now)
+ * 3) same set of flags
+ * 4) same key if the key field is present.
+ */
+ greh2 = (struct gre_base_hdr *)(p->data + off);
+
+ if (greh2->flags != greh->flags ||
+ greh2->protocol != greh->protocol) {
+ NAPI_GRO_CB(p)->same_flow = 0;
+ continue;
+ }
+ if (greh->flags & GRE_KEY) {
+ /* compare keys */
+ if (*(__be32 *)(greh2+1) != *(__be32 *)(greh+1)) {
+ NAPI_GRO_CB(p)->same_flow = 0;
+ continue;
+ }
+ }
+ }
+
+ skb_gro_pull(skb, grehlen);
+
+ /* Adjusted NAPI_GRO_CB(skb)->csum after skb_gro_pull()*/
+ skb_gro_postpull_rcsum(skb, greh, grehlen);
+
+ pp = ptype->callbacks.gro_receive(head, skb);
+
+out_unlock:
+ rcu_read_unlock();
+out:
+ NAPI_GRO_CB(skb)->flush |= flush;
+
+ return pp;
+}
+
+static int gre_gro_complete(struct sk_buff *skb, int nhoff)
+{
+ struct gre_base_hdr *greh = (struct gre_base_hdr *)(skb->data + nhoff);
+ struct packet_offload *ptype;
+ unsigned int grehlen = sizeof(*greh);
+ int err = -ENOENT;
+ __be16 type;
+
+ skb->encapsulation = 1;
+ skb_shinfo(skb)->gso_type = SKB_GSO_GRE;
+
+ type = greh->protocol;
+ if (greh->flags & GRE_KEY)
+ grehlen += GRE_HEADER_SECTION;
+
+ if (greh->flags & GRE_CSUM)
+ grehlen += GRE_HEADER_SECTION;
+
+ rcu_read_lock();
+ ptype = gro_find_complete_by_type(type);
+ if (ptype)
+ err = ptype->callbacks.gro_complete(skb, nhoff + grehlen);
+
+ rcu_read_unlock();
+
+ skb_set_inner_mac_header(skb, nhoff + grehlen);
+
+ return err;
+}
+
+static const struct net_offload gre_offload = {
+ .callbacks = {
+ .gso_segment = gre_gso_segment,
+ .gro_receive = gre_gro_receive,
+ .gro_complete = gre_gro_complete,
+ },
+};
+
+static int __init gre_offload_init(void)
+{
+ return inet_add_offload(&gre_offload, IPPROTO_GRE);
+}
+device_initcall(gre_offload_init);
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
new file mode 100644
index 000000000..f5203fba6
--- /dev/null
+++ b/net/ipv4/icmp.c
@@ -0,0 +1,1218 @@
+/*
+ * NET3: Implementation of the ICMP protocol layer.
+ *
+ * Alan Cox, <alan@lxorguk.ukuu.org.uk>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Some of the function names and the icmp unreach table for this
+ * module were derived from [icmp.c 1.0.11 06/02/93] by
+ * Ross Biro, Fred N. van Kempen, Mark Evans, Alan Cox, Gerhard Koerting.
+ * Other than that this module is a complete rewrite.
+ *
+ * Fixes:
+ * Clemens Fruhwirth : introduce global icmp rate limiting
+ * with icmp type masking ability instead
+ * of broken per type icmp timeouts.
+ * Mike Shaver : RFC1122 checks.
+ * Alan Cox : Multicast ping reply as self.
+ * Alan Cox : Fix atomicity lockup in ip_build_xmit
+ * call.
+ * Alan Cox : Added 216,128 byte paths to the MTU
+ * code.
+ * Martin Mares : RFC1812 checks.
+ * Martin Mares : Can be configured to follow redirects
+ * if acting as a router _without_ a
+ * routing protocol (RFC 1812).
+ * Martin Mares : Echo requests may be configured to
+ * be ignored (RFC 1812).
+ * Martin Mares : Limitation of ICMP error message
+ * transmit rate (RFC 1812).
+ * Martin Mares : TOS and Precedence set correctly
+ * (RFC 1812).
+ * Martin Mares : Now copying as much data from the
+ * original packet as we can without
+ * exceeding 576 bytes (RFC 1812).
+ * Willy Konynenberg : Transparent proxying support.
+ * Keith Owens : RFC1191 correction for 4.2BSD based
+ * path MTU bug.
+ * Thomas Quinot : ICMP Dest Unreach codes up to 15 are
+ * valid (RFC 1812).
+ * Andi Kleen : Check all packet lengths properly
+ * and moved all kfree_skb() up to
+ * icmp_rcv.
+ * Andi Kleen : Move the rate limit bookkeeping
+ * into the dest entry and use a token
+ * bucket filter (thanks to ANK). Make
+ * the rates sysctl configurable.
+ * Yu Tianli : Fixed two ugly bugs in icmp_send
+ * - IP option length was accounted wrongly
+ * - ICMP header length was not accounted
+ * at all.
+ * Tristan Greaves : Added sysctl option to ignore bogus
+ * broadcast responses from broken routers.
+ *
+ * To Fix:
+ *
+ * - Should use skb_pull() instead of all the manual checking.
+ * This would also greatly simply some upper layer error handlers. --AK
+ *
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/jiffies.h>
+#include <linux/kernel.h>
+#include <linux/fcntl.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/inetdevice.h>
+#include <linux/netdevice.h>
+#include <linux/string.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/slab.h>
+#include <net/snmp.h>
+#include <net/ip.h>
+#include <net/route.h>
+#include <net/protocol.h>
+#include <net/icmp.h>
+#include <net/tcp.h>
+#include <net/udp.h>
+#include <net/raw.h>
+#include <net/ping.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <linux/errno.h>
+#include <linux/timer.h>
+#include <linux/init.h>
+#include <asm/uaccess.h>
+#include <net/checksum.h>
+#include <net/xfrm.h>
+#include <net/inet_common.h>
+#include <net/ip_fib.h>
+
+/*
+ * Build xmit assembly blocks
+ */
+
+struct icmp_bxm {
+ struct sk_buff *skb;
+ int offset;
+ int data_len;
+
+ struct {
+ struct icmphdr icmph;
+ __be32 times[3];
+ } data;
+ int head_len;
+ struct ip_options_data replyopts;
+};
+
+/* An array of errno for error messages from dest unreach. */
+/* RFC 1122: 3.2.2.1 States that NET_UNREACH, HOST_UNREACH and SR_FAILED MUST be considered 'transient errs'. */
+
+const struct icmp_err icmp_err_convert[] = {
+ {
+ .errno = ENETUNREACH, /* ICMP_NET_UNREACH */
+ .fatal = 0,
+ },
+ {
+ .errno = EHOSTUNREACH, /* ICMP_HOST_UNREACH */
+ .fatal = 0,
+ },
+ {
+ .errno = ENOPROTOOPT /* ICMP_PROT_UNREACH */,
+ .fatal = 1,
+ },
+ {
+ .errno = ECONNREFUSED, /* ICMP_PORT_UNREACH */
+ .fatal = 1,
+ },
+ {
+ .errno = EMSGSIZE, /* ICMP_FRAG_NEEDED */
+ .fatal = 0,
+ },
+ {
+ .errno = EOPNOTSUPP, /* ICMP_SR_FAILED */
+ .fatal = 0,
+ },
+ {
+ .errno = ENETUNREACH, /* ICMP_NET_UNKNOWN */
+ .fatal = 1,
+ },
+ {
+ .errno = EHOSTDOWN, /* ICMP_HOST_UNKNOWN */
+ .fatal = 1,
+ },
+ {
+ .errno = ENONET, /* ICMP_HOST_ISOLATED */
+ .fatal = 1,
+ },
+ {
+ .errno = ENETUNREACH, /* ICMP_NET_ANO */
+ .fatal = 1,
+ },
+ {
+ .errno = EHOSTUNREACH, /* ICMP_HOST_ANO */
+ .fatal = 1,
+ },
+ {
+ .errno = ENETUNREACH, /* ICMP_NET_UNR_TOS */
+ .fatal = 0,
+ },
+ {
+ .errno = EHOSTUNREACH, /* ICMP_HOST_UNR_TOS */
+ .fatal = 0,
+ },
+ {
+ .errno = EHOSTUNREACH, /* ICMP_PKT_FILTERED */
+ .fatal = 1,
+ },
+ {
+ .errno = EHOSTUNREACH, /* ICMP_PREC_VIOLATION */
+ .fatal = 1,
+ },
+ {
+ .errno = EHOSTUNREACH, /* ICMP_PREC_CUTOFF */
+ .fatal = 1,
+ },
+};
+EXPORT_SYMBOL(icmp_err_convert);
+
+/*
+ * ICMP control array. This specifies what to do with each ICMP.
+ */
+
+struct icmp_control {
+ bool (*handler)(struct sk_buff *skb);
+ short error; /* This ICMP is classed as an error message */
+};
+
+static const struct icmp_control icmp_pointers[NR_ICMP_TYPES+1];
+
+/*
+ * The ICMP socket(s). This is the most convenient way to flow control
+ * our ICMP output as well as maintain a clean interface throughout
+ * all layers. All Socketless IP sends will soon be gone.
+ *
+ * On SMP we have one ICMP socket per-cpu.
+ */
+static struct sock *icmp_sk(struct net *net)
+{
+ return *this_cpu_ptr(net->ipv4.icmp_sk);
+}
+
+static inline struct sock *icmp_xmit_lock(struct net *net)
+{
+ struct sock *sk;
+
+ local_bh_disable();
+
+ sk = icmp_sk(net);
+
+ if (unlikely(!spin_trylock(&sk->sk_lock.slock))) {
+ /* This can happen if the output path signals a
+ * dst_link_failure() for an outgoing ICMP packet.
+ */
+ local_bh_enable();
+ return NULL;
+ }
+ return sk;
+}
+
+static inline void icmp_xmit_unlock(struct sock *sk)
+{
+ spin_unlock_bh(&sk->sk_lock.slock);
+}
+
+int sysctl_icmp_msgs_per_sec __read_mostly = 1000;
+int sysctl_icmp_msgs_burst __read_mostly = 50;
+
+static struct {
+ spinlock_t lock;
+ u32 credit;
+ u32 stamp;
+} icmp_global = {
+ .lock = __SPIN_LOCK_UNLOCKED(icmp_global.lock),
+};
+
+/**
+ * icmp_global_allow - Are we allowed to send one more ICMP message ?
+ *
+ * Uses a token bucket to limit our ICMP messages to sysctl_icmp_msgs_per_sec.
+ * Returns false if we reached the limit and can not send another packet.
+ * Note: called with BH disabled
+ */
+bool icmp_global_allow(void)
+{
+ u32 credit, delta, incr = 0, now = (u32)jiffies;
+ bool rc = false;
+
+ /* Check if token bucket is empty and cannot be refilled
+ * without taking the spinlock.
+ */
+ if (!icmp_global.credit) {
+ delta = min_t(u32, now - icmp_global.stamp, HZ);
+ if (delta < HZ / 50)
+ return false;
+ }
+
+ spin_lock(&icmp_global.lock);
+ delta = min_t(u32, now - icmp_global.stamp, HZ);
+ if (delta >= HZ / 50) {
+ incr = sysctl_icmp_msgs_per_sec * delta / HZ ;
+ if (incr)
+ icmp_global.stamp = now;
+ }
+ credit = min_t(u32, icmp_global.credit + incr, sysctl_icmp_msgs_burst);
+ if (credit) {
+ credit--;
+ rc = true;
+ }
+ icmp_global.credit = credit;
+ spin_unlock(&icmp_global.lock);
+ return rc;
+}
+EXPORT_SYMBOL(icmp_global_allow);
+
+/*
+ * Send an ICMP frame.
+ */
+
+static bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt,
+ struct flowi4 *fl4, int type, int code)
+{
+ struct dst_entry *dst = &rt->dst;
+ bool rc = true;
+
+ if (type > NR_ICMP_TYPES)
+ goto out;
+
+ /* Don't limit PMTU discovery. */
+ if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
+ goto out;
+
+ /* No rate limit on loopback */
+ if (dst->dev && (dst->dev->flags&IFF_LOOPBACK))
+ goto out;
+
+ /* Limit if icmp type is enabled in ratemask. */
+ if (!((1 << type) & net->ipv4.sysctl_icmp_ratemask))
+ goto out;
+
+ rc = false;
+ if (icmp_global_allow()) {
+ struct inet_peer *peer;
+
+ peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, 1);
+ rc = inet_peer_xrlim_allow(peer,
+ net->ipv4.sysctl_icmp_ratelimit);
+ if (peer)
+ inet_putpeer(peer);
+ }
+out:
+ return rc;
+}
+
+/*
+ * Maintain the counters used in the SNMP statistics for outgoing ICMP
+ */
+void icmp_out_count(struct net *net, unsigned char type)
+{
+ ICMPMSGOUT_INC_STATS(net, type);
+ ICMP_INC_STATS(net, ICMP_MIB_OUTMSGS);
+}
+
+/*
+ * Checksum each fragment, and on the first include the headers and final
+ * checksum.
+ */
+static int icmp_glue_bits(void *from, char *to, int offset, int len, int odd,
+ struct sk_buff *skb)
+{
+ struct icmp_bxm *icmp_param = (struct icmp_bxm *)from;
+ __wsum csum;
+
+ csum = skb_copy_and_csum_bits(icmp_param->skb,
+ icmp_param->offset + offset,
+ to, len, 0);
+
+ skb->csum = csum_block_add(skb->csum, csum, odd);
+ if (icmp_pointers[icmp_param->data.icmph.type].error)
+ nf_ct_attach(skb, icmp_param->skb);
+ return 0;
+}
+
+static void icmp_push_reply(struct icmp_bxm *icmp_param,
+ struct flowi4 *fl4,
+ struct ipcm_cookie *ipc, struct rtable **rt)
+{
+ struct sock *sk;
+ struct sk_buff *skb;
+
+ sk = icmp_sk(dev_net((*rt)->dst.dev));
+ if (ip_append_data(sk, fl4, icmp_glue_bits, icmp_param,
+ icmp_param->data_len+icmp_param->head_len,
+ icmp_param->head_len,
+ ipc, rt, MSG_DONTWAIT) < 0) {
+ ICMP_INC_STATS_BH(sock_net(sk), ICMP_MIB_OUTERRORS);
+ ip_flush_pending_frames(sk);
+ } else if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
+ struct icmphdr *icmph = icmp_hdr(skb);
+ __wsum csum = 0;
+ struct sk_buff *skb1;
+
+ skb_queue_walk(&sk->sk_write_queue, skb1) {
+ csum = csum_add(csum, skb1->csum);
+ }
+ csum = csum_partial_copy_nocheck((void *)&icmp_param->data,
+ (char *)icmph,
+ icmp_param->head_len, csum);
+ icmph->checksum = csum_fold(csum);
+ skb->ip_summed = CHECKSUM_NONE;
+ ip_push_pending_frames(sk, fl4);
+ }
+}
+
+/*
+ * Driving logic for building and sending ICMP messages.
+ */
+
+static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
+{
+ struct ipcm_cookie ipc;
+ struct rtable *rt = skb_rtable(skb);
+ struct net *net = dev_net(rt->dst.dev);
+ struct flowi4 fl4;
+ struct sock *sk;
+ struct inet_sock *inet;
+ __be32 daddr, saddr;
+ u32 mark = IP4_REPLY_MARK(net, skb->mark);
+
+ if (ip_options_echo(&icmp_param->replyopts.opt.opt, skb))
+ return;
+
+ sk = icmp_xmit_lock(net);
+ if (!sk)
+ return;
+ inet = inet_sk(sk);
+
+ icmp_param->data.icmph.checksum = 0;
+
+ inet->tos = ip_hdr(skb)->tos;
+ sk->sk_mark = mark;
+ daddr = ipc.addr = ip_hdr(skb)->saddr;
+ saddr = fib_compute_spec_dst(skb);
+ ipc.opt = NULL;
+ ipc.tx_flags = 0;
+ ipc.ttl = 0;
+ ipc.tos = -1;
+
+ if (icmp_param->replyopts.opt.opt.optlen) {
+ ipc.opt = &icmp_param->replyopts.opt;
+ if (ipc.opt->opt.srr)
+ daddr = icmp_param->replyopts.opt.opt.faddr;
+ }
+ memset(&fl4, 0, sizeof(fl4));
+ fl4.daddr = daddr;
+ fl4.saddr = saddr;
+ fl4.flowi4_mark = mark;
+ fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos);
+ fl4.flowi4_proto = IPPROTO_ICMP;
+ security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
+ rt = ip_route_output_key(net, &fl4);
+ if (IS_ERR(rt))
+ goto out_unlock;
+ if (icmpv4_xrlim_allow(net, rt, &fl4, icmp_param->data.icmph.type,
+ icmp_param->data.icmph.code))
+ icmp_push_reply(icmp_param, &fl4, &ipc, &rt);
+ ip_rt_put(rt);
+out_unlock:
+ icmp_xmit_unlock(sk);
+}
+
+static struct rtable *icmp_route_lookup(struct net *net,
+ struct flowi4 *fl4,
+ struct sk_buff *skb_in,
+ const struct iphdr *iph,
+ __be32 saddr, u8 tos, u32 mark,
+ int type, int code,
+ struct icmp_bxm *param)
+{
+ struct rtable *rt, *rt2;
+ struct flowi4 fl4_dec;
+ int err;
+
+ memset(fl4, 0, sizeof(*fl4));
+ fl4->daddr = (param->replyopts.opt.opt.srr ?
+ param->replyopts.opt.opt.faddr : iph->saddr);
+ fl4->saddr = saddr;
+ fl4->flowi4_mark = mark;
+ fl4->flowi4_tos = RT_TOS(tos);
+ fl4->flowi4_proto = IPPROTO_ICMP;
+ fl4->fl4_icmp_type = type;
+ fl4->fl4_icmp_code = code;
+ security_skb_classify_flow(skb_in, flowi4_to_flowi(fl4));
+ rt = __ip_route_output_key(net, fl4);
+ if (IS_ERR(rt))
+ return rt;
+
+ /* No need to clone since we're just using its address. */
+ rt2 = rt;
+
+ rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
+ flowi4_to_flowi(fl4), NULL, 0);
+ if (!IS_ERR(rt)) {
+ if (rt != rt2)
+ return rt;
+ } else if (PTR_ERR(rt) == -EPERM) {
+ rt = NULL;
+ } else
+ return rt;
+
+ err = xfrm_decode_session_reverse(skb_in, flowi4_to_flowi(&fl4_dec), AF_INET);
+ if (err)
+ goto relookup_failed;
+
+ if (inet_addr_type(net, fl4_dec.saddr) == RTN_LOCAL) {
+ rt2 = __ip_route_output_key(net, &fl4_dec);
+ if (IS_ERR(rt2))
+ err = PTR_ERR(rt2);
+ } else {
+ struct flowi4 fl4_2 = {};
+ unsigned long orefdst;
+
+ fl4_2.daddr = fl4_dec.saddr;
+ rt2 = ip_route_output_key(net, &fl4_2);
+ if (IS_ERR(rt2)) {
+ err = PTR_ERR(rt2);
+ goto relookup_failed;
+ }
+ /* Ugh! */
+ orefdst = skb_in->_skb_refdst; /* save old refdst */
+ err = ip_route_input(skb_in, fl4_dec.daddr, fl4_dec.saddr,
+ RT_TOS(tos), rt2->dst.dev);
+
+ dst_release(&rt2->dst);
+ rt2 = skb_rtable(skb_in);
+ skb_in->_skb_refdst = orefdst; /* restore old refdst */
+ }
+
+ if (err)
+ goto relookup_failed;
+
+ rt2 = (struct rtable *) xfrm_lookup(net, &rt2->dst,
+ flowi4_to_flowi(&fl4_dec), NULL,
+ XFRM_LOOKUP_ICMP);
+ if (!IS_ERR(rt2)) {
+ dst_release(&rt->dst);
+ memcpy(fl4, &fl4_dec, sizeof(*fl4));
+ rt = rt2;
+ } else if (PTR_ERR(rt2) == -EPERM) {
+ if (rt)
+ dst_release(&rt->dst);
+ return rt2;
+ } else {
+ err = PTR_ERR(rt2);
+ goto relookup_failed;
+ }
+ return rt;
+
+relookup_failed:
+ if (rt)
+ return rt;
+ return ERR_PTR(err);
+}
+
+/*
+ * Send an ICMP message in response to a situation
+ *
+ * RFC 1122: 3.2.2 MUST send at least the IP header and 8 bytes of header.
+ * MAY send more (we do).
+ * MUST NOT change this header information.
+ * MUST NOT reply to a multicast/broadcast IP address.
+ * MUST NOT reply to a multicast/broadcast MAC address.
+ * MUST reply to only the first fragment.
+ */
+
+void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
+{
+ struct iphdr *iph;
+ int room;
+ struct icmp_bxm *icmp_param;
+ struct rtable *rt = skb_rtable(skb_in);
+ struct ipcm_cookie ipc;
+ struct flowi4 fl4;
+ __be32 saddr;
+ u8 tos;
+ u32 mark;
+ struct net *net;
+ struct sock *sk;
+
+ if (!rt)
+ goto out;
+ net = dev_net(rt->dst.dev);
+
+ /*
+ * Find the original header. It is expected to be valid, of course.
+ * Check this, icmp_send is called from the most obscure devices
+ * sometimes.
+ */
+ iph = ip_hdr(skb_in);
+
+ if ((u8 *)iph < skb_in->head ||
+ (skb_network_header(skb_in) + sizeof(*iph)) >
+ skb_tail_pointer(skb_in))
+ goto out;
+
+ /*
+ * No replies to physical multicast/broadcast
+ */
+ if (skb_in->pkt_type != PACKET_HOST)
+ goto out;
+
+ /*
+ * Now check at the protocol level
+ */
+ if (rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
+ goto out;
+
+ /*
+ * Only reply to fragment 0. We byte re-order the constant
+ * mask for efficiency.
+ */
+ if (iph->frag_off & htons(IP_OFFSET))
+ goto out;
+
+ /*
+ * If we send an ICMP error to an ICMP error a mess would result..
+ */
+ if (icmp_pointers[type].error) {
+ /*
+ * We are an error, check if we are replying to an
+ * ICMP error
+ */
+ if (iph->protocol == IPPROTO_ICMP) {
+ u8 _inner_type, *itp;
+
+ itp = skb_header_pointer(skb_in,
+ skb_network_header(skb_in) +
+ (iph->ihl << 2) +
+ offsetof(struct icmphdr,
+ type) -
+ skb_in->data,
+ sizeof(_inner_type),
+ &_inner_type);
+ if (!itp)
+ goto out;
+
+ /*
+ * Assume any unknown ICMP type is an error. This
+ * isn't specified by the RFC, but think about it..
+ */
+ if (*itp > NR_ICMP_TYPES ||
+ icmp_pointers[*itp].error)
+ goto out;
+ }
+ }
+
+ icmp_param = kmalloc(sizeof(*icmp_param), GFP_ATOMIC);
+ if (!icmp_param)
+ return;
+
+ sk = icmp_xmit_lock(net);
+ if (!sk)
+ goto out_free;
+
+ /*
+ * Construct source address and options.
+ */
+
+ saddr = iph->daddr;
+ if (!(rt->rt_flags & RTCF_LOCAL)) {
+ struct net_device *dev = NULL;
+
+ rcu_read_lock();
+ if (rt_is_input_route(rt) &&
+ net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr)
+ dev = dev_get_by_index_rcu(net, inet_iif(skb_in));
+
+ if (dev)
+ saddr = inet_select_addr(dev, 0, RT_SCOPE_LINK);
+ else
+ saddr = 0;
+ rcu_read_unlock();
+ }
+
+ tos = icmp_pointers[type].error ? ((iph->tos & IPTOS_TOS_MASK) |
+ IPTOS_PREC_INTERNETCONTROL) :
+ iph->tos;
+ mark = IP4_REPLY_MARK(net, skb_in->mark);
+
+ if (ip_options_echo(&icmp_param->replyopts.opt.opt, skb_in))
+ goto out_unlock;
+
+
+ /*
+ * Prepare data for ICMP header.
+ */
+
+ icmp_param->data.icmph.type = type;
+ icmp_param->data.icmph.code = code;
+ icmp_param->data.icmph.un.gateway = info;
+ icmp_param->data.icmph.checksum = 0;
+ icmp_param->skb = skb_in;
+ icmp_param->offset = skb_network_offset(skb_in);
+ inet_sk(sk)->tos = tos;
+ sk->sk_mark = mark;
+ ipc.addr = iph->saddr;
+ ipc.opt = &icmp_param->replyopts.opt;
+ ipc.tx_flags = 0;
+ ipc.ttl = 0;
+ ipc.tos = -1;
+
+ rt = icmp_route_lookup(net, &fl4, skb_in, iph, saddr, tos, mark,
+ type, code, icmp_param);
+ if (IS_ERR(rt))
+ goto out_unlock;
+
+ if (!icmpv4_xrlim_allow(net, rt, &fl4, type, code))
+ goto ende;
+
+ /* RFC says return as much as we can without exceeding 576 bytes. */
+
+ room = dst_mtu(&rt->dst);
+ if (room > 576)
+ room = 576;
+ room -= sizeof(struct iphdr) + icmp_param->replyopts.opt.opt.optlen;
+ room -= sizeof(struct icmphdr);
+
+ icmp_param->data_len = skb_in->len - icmp_param->offset;
+ if (icmp_param->data_len > room)
+ icmp_param->data_len = room;
+ icmp_param->head_len = sizeof(struct icmphdr);
+
+ icmp_push_reply(icmp_param, &fl4, &ipc, &rt);
+ende:
+ ip_rt_put(rt);
+out_unlock:
+ icmp_xmit_unlock(sk);
+out_free:
+ kfree(icmp_param);
+out:;
+}
+EXPORT_SYMBOL(icmp_send);
+
+
+static void icmp_socket_deliver(struct sk_buff *skb, u32 info)
+{
+ const struct iphdr *iph = (const struct iphdr *) skb->data;
+ const struct net_protocol *ipprot;
+ int protocol = iph->protocol;
+
+ /* Checkin full IP header plus 8 bytes of protocol to
+ * avoid additional coding at protocol handlers.
+ */
+ if (!pskb_may_pull(skb, iph->ihl * 4 + 8)) {
+ ICMP_INC_STATS_BH(dev_net(skb->dev), ICMP_MIB_INERRORS);
+ return;
+ }
+
+ raw_icmp_error(skb, protocol, info);
+
+ ipprot = rcu_dereference(inet_protos[protocol]);
+ if (ipprot && ipprot->err_handler)
+ ipprot->err_handler(skb, info);
+}
+
+static bool icmp_tag_validation(int proto)
+{
+ bool ok;
+
+ rcu_read_lock();
+ ok = rcu_dereference(inet_protos[proto])->icmp_strict_tag_validation;
+ rcu_read_unlock();
+ return ok;
+}
+
+/*
+ * Handle ICMP_DEST_UNREACH, ICMP_TIME_EXCEED, ICMP_QUENCH, and
+ * ICMP_PARAMETERPROB.
+ */
+
+static bool icmp_unreach(struct sk_buff *skb)
+{
+ const struct iphdr *iph;
+ struct icmphdr *icmph;
+ struct net *net;
+ u32 info = 0;
+
+ net = dev_net(skb_dst(skb)->dev);
+
+ /*
+ * Incomplete header ?
+ * Only checks for the IP header, there should be an
+ * additional check for longer headers in upper levels.
+ */
+
+ if (!pskb_may_pull(skb, sizeof(struct iphdr)))
+ goto out_err;
+
+ icmph = icmp_hdr(skb);
+ iph = (const struct iphdr *)skb->data;
+
+ if (iph->ihl < 5) /* Mangled header, drop. */
+ goto out_err;
+
+ if (icmph->type == ICMP_DEST_UNREACH) {
+ switch (icmph->code & 15) {
+ case ICMP_NET_UNREACH:
+ case ICMP_HOST_UNREACH:
+ case ICMP_PROT_UNREACH:
+ case ICMP_PORT_UNREACH:
+ break;
+ case ICMP_FRAG_NEEDED:
+ /* for documentation of the ip_no_pmtu_disc
+ * values please see
+ * Documentation/networking/ip-sysctl.txt
+ */
+ switch (net->ipv4.sysctl_ip_no_pmtu_disc) {
+ default:
+ net_dbg_ratelimited("%pI4: fragmentation needed and DF set\n",
+ &iph->daddr);
+ break;
+ case 2:
+ goto out;
+ case 3:
+ if (!icmp_tag_validation(iph->protocol))
+ goto out;
+ /* fall through */
+ case 0:
+ info = ntohs(icmph->un.frag.mtu);
+ }
+ break;
+ case ICMP_SR_FAILED:
+ net_dbg_ratelimited("%pI4: Source Route Failed\n",
+ &iph->daddr);
+ break;
+ default:
+ break;
+ }
+ if (icmph->code > NR_ICMP_UNREACH)
+ goto out;
+ } else if (icmph->type == ICMP_PARAMETERPROB)
+ info = ntohl(icmph->un.gateway) >> 24;
+
+ /*
+ * Throw it at our lower layers
+ *
+ * RFC 1122: 3.2.2 MUST extract the protocol ID from the passed
+ * header.
+ * RFC 1122: 3.2.2.1 MUST pass ICMP unreach messages to the
+ * transport layer.
+ * RFC 1122: 3.2.2.2 MUST pass ICMP time expired messages to
+ * transport layer.
+ */
+
+ /*
+ * Check the other end isn't violating RFC 1122. Some routers send
+ * bogus responses to broadcast frames. If you see this message
+ * first check your netmask matches at both ends, if it does then
+ * get the other vendor to fix their kit.
+ */
+
+ if (!net->ipv4.sysctl_icmp_ignore_bogus_error_responses &&
+ inet_addr_type(net, iph->daddr) == RTN_BROADCAST) {
+ net_warn_ratelimited("%pI4 sent an invalid ICMP type %u, code %u error to a broadcast: %pI4 on %s\n",
+ &ip_hdr(skb)->saddr,
+ icmph->type, icmph->code,
+ &iph->daddr, skb->dev->name);
+ goto out;
+ }
+
+ icmp_socket_deliver(skb, info);
+
+out:
+ return true;
+out_err:
+ ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
+ return false;
+}
+
+
+/*
+ * Handle ICMP_REDIRECT.
+ */
+
+static bool icmp_redirect(struct sk_buff *skb)
+{
+ if (skb->len < sizeof(struct iphdr)) {
+ ICMP_INC_STATS_BH(dev_net(skb->dev), ICMP_MIB_INERRORS);
+ return false;
+ }
+
+ if (!pskb_may_pull(skb, sizeof(struct iphdr))) {
+ /* there aught to be a stat */
+ return false;
+ }
+
+ icmp_socket_deliver(skb, icmp_hdr(skb)->un.gateway);
+ return true;
+}
+
+/*
+ * Handle ICMP_ECHO ("ping") requests.
+ *
+ * RFC 1122: 3.2.2.6 MUST have an echo server that answers ICMP echo
+ * requests.
+ * RFC 1122: 3.2.2.6 Data received in the ICMP_ECHO request MUST be
+ * included in the reply.
+ * RFC 1812: 4.3.3.6 SHOULD have a config option for silently ignoring
+ * echo requests, MUST have default=NOT.
+ * See also WRT handling of options once they are done and working.
+ */
+
+static bool icmp_echo(struct sk_buff *skb)
+{
+ struct net *net;
+
+ net = dev_net(skb_dst(skb)->dev);
+ if (!net->ipv4.sysctl_icmp_echo_ignore_all) {
+ struct icmp_bxm icmp_param;
+
+ icmp_param.data.icmph = *icmp_hdr(skb);
+ icmp_param.data.icmph.type = ICMP_ECHOREPLY;
+ icmp_param.skb = skb;
+ icmp_param.offset = 0;
+ icmp_param.data_len = skb->len;
+ icmp_param.head_len = sizeof(struct icmphdr);
+ icmp_reply(&icmp_param, skb);
+ }
+ /* should there be an ICMP stat for ignored echos? */
+ return true;
+}
+
+/*
+ * Handle ICMP Timestamp requests.
+ * RFC 1122: 3.2.2.8 MAY implement ICMP timestamp requests.
+ * SHOULD be in the kernel for minimum random latency.
+ * MUST be accurate to a few minutes.
+ * MUST be updated at least at 15Hz.
+ */
+static bool icmp_timestamp(struct sk_buff *skb)
+{
+ struct timespec tv;
+ struct icmp_bxm icmp_param;
+ /*
+ * Too short.
+ */
+ if (skb->len < 4)
+ goto out_err;
+
+ /*
+ * Fill in the current time as ms since midnight UT:
+ */
+ getnstimeofday(&tv);
+ icmp_param.data.times[1] = htonl((tv.tv_sec % 86400) * MSEC_PER_SEC +
+ tv.tv_nsec / NSEC_PER_MSEC);
+ icmp_param.data.times[2] = icmp_param.data.times[1];
+ if (skb_copy_bits(skb, 0, &icmp_param.data.times[0], 4))
+ BUG();
+ icmp_param.data.icmph = *icmp_hdr(skb);
+ icmp_param.data.icmph.type = ICMP_TIMESTAMPREPLY;
+ icmp_param.data.icmph.code = 0;
+ icmp_param.skb = skb;
+ icmp_param.offset = 0;
+ icmp_param.data_len = 0;
+ icmp_param.head_len = sizeof(struct icmphdr) + 12;
+ icmp_reply(&icmp_param, skb);
+ return true;
+
+out_err:
+ ICMP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), ICMP_MIB_INERRORS);
+ return false;
+}
+
+static bool icmp_discard(struct sk_buff *skb)
+{
+ /* pretend it was a success */
+ return true;
+}
+
+/*
+ * Deal with incoming ICMP packets.
+ */
+int icmp_rcv(struct sk_buff *skb)
+{
+ struct icmphdr *icmph;
+ struct rtable *rt = skb_rtable(skb);
+ struct net *net = dev_net(rt->dst.dev);
+ bool success;
+
+ if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
+ struct sec_path *sp = skb_sec_path(skb);
+ int nh;
+
+ if (!(sp && sp->xvec[sp->len - 1]->props.flags &
+ XFRM_STATE_ICMP))
+ goto drop;
+
+ if (!pskb_may_pull(skb, sizeof(*icmph) + sizeof(struct iphdr)))
+ goto drop;
+
+ nh = skb_network_offset(skb);
+ skb_set_network_header(skb, sizeof(*icmph));
+
+ if (!xfrm4_policy_check_reverse(NULL, XFRM_POLICY_IN, skb))
+ goto drop;
+
+ skb_set_network_header(skb, nh);
+ }
+
+ ICMP_INC_STATS_BH(net, ICMP_MIB_INMSGS);
+
+ if (skb_checksum_simple_validate(skb))
+ goto csum_error;
+
+ if (!pskb_pull(skb, sizeof(*icmph)))
+ goto error;
+
+ icmph = icmp_hdr(skb);
+
+ ICMPMSGIN_INC_STATS_BH(net, icmph->type);
+ /*
+ * 18 is the highest 'known' ICMP type. Anything else is a mystery
+ *
+ * RFC 1122: 3.2.2 Unknown ICMP messages types MUST be silently
+ * discarded.
+ */
+ if (icmph->type > NR_ICMP_TYPES)
+ goto error;
+
+
+ /*
+ * Parse the ICMP message
+ */
+
+ if (rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
+ /*
+ * RFC 1122: 3.2.2.6 An ICMP_ECHO to broadcast MAY be
+ * silently ignored (we let user decide with a sysctl).
+ * RFC 1122: 3.2.2.8 An ICMP_TIMESTAMP MAY be silently
+ * discarded if to broadcast/multicast.
+ */
+ if ((icmph->type == ICMP_ECHO ||
+ icmph->type == ICMP_TIMESTAMP) &&
+ net->ipv4.sysctl_icmp_echo_ignore_broadcasts) {
+ goto error;
+ }
+ if (icmph->type != ICMP_ECHO &&
+ icmph->type != ICMP_TIMESTAMP &&
+ icmph->type != ICMP_ADDRESS &&
+ icmph->type != ICMP_ADDRESSREPLY) {
+ goto error;
+ }
+ }
+
+ success = icmp_pointers[icmph->type].handler(skb);
+
+ if (success) {
+ consume_skb(skb);
+ return 0;
+ }
+
+drop:
+ kfree_skb(skb);
+ return 0;
+csum_error:
+ ICMP_INC_STATS_BH(net, ICMP_MIB_CSUMERRORS);
+error:
+ ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
+ goto drop;
+}
+
+void icmp_err(struct sk_buff *skb, u32 info)
+{
+ struct iphdr *iph = (struct iphdr *)skb->data;
+ int offset = iph->ihl<<2;
+ struct icmphdr *icmph = (struct icmphdr *)(skb->data + offset);
+ int type = icmp_hdr(skb)->type;
+ int code = icmp_hdr(skb)->code;
+ struct net *net = dev_net(skb->dev);
+
+ /*
+ * Use ping_err to handle all icmp errors except those
+ * triggered by ICMP_ECHOREPLY which sent from kernel.
+ */
+ if (icmph->type != ICMP_ECHOREPLY) {
+ ping_err(skb, offset, info);
+ return;
+ }
+
+ if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
+ ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_ICMP, 0);
+ else if (type == ICMP_REDIRECT)
+ ipv4_redirect(skb, net, 0, 0, IPPROTO_ICMP, 0);
+}
+
+/*
+ * This table is the definition of how we handle ICMP.
+ */
+static const struct icmp_control icmp_pointers[NR_ICMP_TYPES + 1] = {
+ [ICMP_ECHOREPLY] = {
+ .handler = ping_rcv,
+ },
+ [1] = {
+ .handler = icmp_discard,
+ .error = 1,
+ },
+ [2] = {
+ .handler = icmp_discard,
+ .error = 1,
+ },
+ [ICMP_DEST_UNREACH] = {
+ .handler = icmp_unreach,
+ .error = 1,
+ },
+ [ICMP_SOURCE_QUENCH] = {
+ .handler = icmp_unreach,
+ .error = 1,
+ },
+ [ICMP_REDIRECT] = {
+ .handler = icmp_redirect,
+ .error = 1,
+ },
+ [6] = {
+ .handler = icmp_discard,
+ .error = 1,
+ },
+ [7] = {
+ .handler = icmp_discard,
+ .error = 1,
+ },
+ [ICMP_ECHO] = {
+ .handler = icmp_echo,
+ },
+ [9] = {
+ .handler = icmp_discard,
+ .error = 1,
+ },
+ [10] = {
+ .handler = icmp_discard,
+ .error = 1,
+ },
+ [ICMP_TIME_EXCEEDED] = {
+ .handler = icmp_unreach,
+ .error = 1,
+ },
+ [ICMP_PARAMETERPROB] = {
+ .handler = icmp_unreach,
+ .error = 1,
+ },
+ [ICMP_TIMESTAMP] = {
+ .handler = icmp_timestamp,
+ },
+ [ICMP_TIMESTAMPREPLY] = {
+ .handler = icmp_discard,
+ },
+ [ICMP_INFO_REQUEST] = {
+ .handler = icmp_discard,
+ },
+ [ICMP_INFO_REPLY] = {
+ .handler = icmp_discard,
+ },
+ [ICMP_ADDRESS] = {
+ .handler = icmp_discard,
+ },
+ [ICMP_ADDRESSREPLY] = {
+ .handler = icmp_discard,
+ },
+};
+
+static void __net_exit icmp_sk_exit(struct net *net)
+{
+ int i;
+
+ for_each_possible_cpu(i)
+ inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.icmp_sk, i));
+ free_percpu(net->ipv4.icmp_sk);
+ net->ipv4.icmp_sk = NULL;
+}
+
+static int __net_init icmp_sk_init(struct net *net)
+{
+ int i, err;
+
+ net->ipv4.icmp_sk = alloc_percpu(struct sock *);
+ if (!net->ipv4.icmp_sk)
+ return -ENOMEM;
+
+ for_each_possible_cpu(i) {
+ struct sock *sk;
+
+ err = inet_ctl_sock_create(&sk, PF_INET,
+ SOCK_RAW, IPPROTO_ICMP, net);
+ if (err < 0)
+ goto fail;
+
+ *per_cpu_ptr(net->ipv4.icmp_sk, i) = sk;
+
+ /* Enough space for 2 64K ICMP packets, including
+ * sk_buff/skb_shared_info struct overhead.
+ */
+ sk->sk_sndbuf = 2 * SKB_TRUESIZE(64 * 1024);
+
+ /*
+ * Speedup sock_wfree()
+ */
+ sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
+ inet_sk(sk)->pmtudisc = IP_PMTUDISC_DONT;
+ }
+
+ /* Control parameters for ECHO replies. */
+ net->ipv4.sysctl_icmp_echo_ignore_all = 0;
+ net->ipv4.sysctl_icmp_echo_ignore_broadcasts = 1;
+
+ /* Control parameter - ignore bogus broadcast responses? */
+ net->ipv4.sysctl_icmp_ignore_bogus_error_responses = 1;
+
+ /*
+ * Configurable global rate limit.
+ *
+ * ratelimit defines tokens/packet consumed for dst->rate_token
+ * bucket ratemask defines which icmp types are ratelimited by
+ * setting it's bit position.
+ *
+ * default:
+ * dest unreachable (3), source quench (4),
+ * time exceeded (11), parameter problem (12)
+ */
+
+ net->ipv4.sysctl_icmp_ratelimit = 1 * HZ;
+ net->ipv4.sysctl_icmp_ratemask = 0x1818;
+ net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr = 0;
+
+ return 0;
+
+fail:
+ for_each_possible_cpu(i)
+ inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.icmp_sk, i));
+ free_percpu(net->ipv4.icmp_sk);
+ return err;
+}
+
+static struct pernet_operations __net_initdata icmp_sk_ops = {
+ .init = icmp_sk_init,
+ .exit = icmp_sk_exit,
+};
+
+int __init icmp_init(void)
+{
+ return register_pernet_subsys(&icmp_sk_ops);
+}
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
new file mode 100644
index 000000000..a3a697f5f
--- /dev/null
+++ b/net/ipv4/igmp.c
@@ -0,0 +1,2800 @@
+/*
+ * Linux NET3: Internet Group Management Protocol [IGMP]
+ *
+ * This code implements the IGMP protocol as defined in RFC1112. There has
+ * been a further revision of this protocol since which is now supported.
+ *
+ * If you have trouble with this module be careful what gcc you have used,
+ * the older version didn't come out right using gcc 2.5.8, the newer one
+ * seems to fall out with gcc 2.6.2.
+ *
+ * Authors:
+ * Alan Cox <alan@lxorguk.ukuu.org.uk>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Fixes:
+ *
+ * Alan Cox : Added lots of __inline__ to optimise
+ * the memory usage of all the tiny little
+ * functions.
+ * Alan Cox : Dumped the header building experiment.
+ * Alan Cox : Minor tweaks ready for multicast routing
+ * and extended IGMP protocol.
+ * Alan Cox : Removed a load of inline directives. Gcc 2.5.8
+ * writes utterly bogus code otherwise (sigh)
+ * fixed IGMP loopback to behave in the manner
+ * desired by mrouted, fixed the fact it has been
+ * broken since 1.3.6 and cleaned up a few minor
+ * points.
+ *
+ * Chih-Jen Chang : Tried to revise IGMP to Version 2
+ * Tsu-Sheng Tsao E-mail: chihjenc@scf.usc.edu and tsusheng@scf.usc.edu
+ * The enhancements are mainly based on Steve Deering's
+ * ipmulti-3.5 source code.
+ * Chih-Jen Chang : Added the igmp_get_mrouter_info and
+ * Tsu-Sheng Tsao igmp_set_mrouter_info to keep track of
+ * the mrouted version on that device.
+ * Chih-Jen Chang : Added the max_resp_time parameter to
+ * Tsu-Sheng Tsao igmp_heard_query(). Using this parameter
+ * to identify the multicast router version
+ * and do what the IGMP version 2 specified.
+ * Chih-Jen Chang : Added a timer to revert to IGMP V2 router
+ * Tsu-Sheng Tsao if the specified time expired.
+ * Alan Cox : Stop IGMP from 0.0.0.0 being accepted.
+ * Alan Cox : Use GFP_ATOMIC in the right places.
+ * Christian Daudt : igmp timer wasn't set for local group
+ * memberships but was being deleted,
+ * which caused a "del_timer() called
+ * from %p with timer not initialized\n"
+ * message (960131).
+ * Christian Daudt : removed del_timer from
+ * igmp_timer_expire function (960205).
+ * Christian Daudt : igmp_heard_report now only calls
+ * igmp_timer_expire if tm->running is
+ * true (960216).
+ * Malcolm Beattie : ttl comparison wrong in igmp_rcv made
+ * igmp_heard_query never trigger. Expiry
+ * miscalculation fixed in igmp_heard_query
+ * and random() made to return unsigned to
+ * prevent negative expiry times.
+ * Alexey Kuznetsov: Wrong group leaving behaviour, backport
+ * fix from pending 2.1.x patches.
+ * Alan Cox: Forget to enable FDDI support earlier.
+ * Alexey Kuznetsov: Fixed leaving groups on device down.
+ * Alexey Kuznetsov: Accordance to igmp-v2-06 draft.
+ * David L Stevens: IGMPv3 support, with help from
+ * Vinay Kulkarni
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <asm/uaccess.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/jiffies.h>
+#include <linux/string.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <linux/inetdevice.h>
+#include <linux/igmp.h>
+#include <linux/if_arp.h>
+#include <linux/rtnetlink.h>
+#include <linux/times.h>
+#include <linux/pkt_sched.h>
+
+#include <net/net_namespace.h>
+#include <net/arp.h>
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <net/route.h>
+#include <net/sock.h>
+#include <net/checksum.h>
+#include <net/inet_common.h>
+#include <linux/netfilter_ipv4.h>
+#ifdef CONFIG_IP_MROUTE
+#include <linux/mroute.h>
+#endif
+#ifdef CONFIG_PROC_FS
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#endif
+
+#define IP_MAX_MEMBERSHIPS 20
+#define IP_MAX_MSF 10
+
+#ifdef CONFIG_IP_MULTICAST
+/* Parameter names and values are taken from igmp-v2-06 draft */
+
+#define IGMP_V1_ROUTER_PRESENT_TIMEOUT (400*HZ)
+#define IGMP_V2_ROUTER_PRESENT_TIMEOUT (400*HZ)
+#define IGMP_V2_UNSOLICITED_REPORT_INTERVAL (10*HZ)
+#define IGMP_V3_UNSOLICITED_REPORT_INTERVAL (1*HZ)
+#define IGMP_QUERY_RESPONSE_INTERVAL (10*HZ)
+#define IGMP_QUERY_ROBUSTNESS_VARIABLE 2
+
+
+#define IGMP_INITIAL_REPORT_DELAY (1)
+
+/* IGMP_INITIAL_REPORT_DELAY is not from IGMP specs!
+ * IGMP specs require to report membership immediately after
+ * joining a group, but we delay the first report by a
+ * small interval. It seems more natural and still does not
+ * contradict to specs provided this delay is small enough.
+ */
+
+#define IGMP_V1_SEEN(in_dev) \
+ (IPV4_DEVCONF_ALL(dev_net(in_dev->dev), FORCE_IGMP_VERSION) == 1 || \
+ IN_DEV_CONF_GET((in_dev), FORCE_IGMP_VERSION) == 1 || \
+ ((in_dev)->mr_v1_seen && \
+ time_before(jiffies, (in_dev)->mr_v1_seen)))
+#define IGMP_V2_SEEN(in_dev) \
+ (IPV4_DEVCONF_ALL(dev_net(in_dev->dev), FORCE_IGMP_VERSION) == 2 || \
+ IN_DEV_CONF_GET((in_dev), FORCE_IGMP_VERSION) == 2 || \
+ ((in_dev)->mr_v2_seen && \
+ time_before(jiffies, (in_dev)->mr_v2_seen)))
+
+static int unsolicited_report_interval(struct in_device *in_dev)
+{
+ int interval_ms, interval_jiffies;
+
+ if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev))
+ interval_ms = IN_DEV_CONF_GET(
+ in_dev,
+ IGMPV2_UNSOLICITED_REPORT_INTERVAL);
+ else /* v3 */
+ interval_ms = IN_DEV_CONF_GET(
+ in_dev,
+ IGMPV3_UNSOLICITED_REPORT_INTERVAL);
+
+ interval_jiffies = msecs_to_jiffies(interval_ms);
+
+ /* _timer functions can't handle a delay of 0 jiffies so ensure
+ * we always return a positive value.
+ */
+ if (interval_jiffies <= 0)
+ interval_jiffies = 1;
+ return interval_jiffies;
+}
+
+static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im);
+static void igmpv3_del_delrec(struct in_device *in_dev, __be32 multiaddr);
+static void igmpv3_clear_delrec(struct in_device *in_dev);
+static int sf_setstate(struct ip_mc_list *pmc);
+static void sf_markstate(struct ip_mc_list *pmc);
+#endif
+static void ip_mc_clear_src(struct ip_mc_list *pmc);
+static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode,
+ int sfcount, __be32 *psfsrc, int delta);
+
+static void ip_ma_put(struct ip_mc_list *im)
+{
+ if (atomic_dec_and_test(&im->refcnt)) {
+ in_dev_put(im->interface);
+ kfree_rcu(im, rcu);
+ }
+}
+
+#define for_each_pmc_rcu(in_dev, pmc) \
+ for (pmc = rcu_dereference(in_dev->mc_list); \
+ pmc != NULL; \
+ pmc = rcu_dereference(pmc->next_rcu))
+
+#define for_each_pmc_rtnl(in_dev, pmc) \
+ for (pmc = rtnl_dereference(in_dev->mc_list); \
+ pmc != NULL; \
+ pmc = rtnl_dereference(pmc->next_rcu))
+
+#ifdef CONFIG_IP_MULTICAST
+
+/*
+ * Timer management
+ */
+
+static void igmp_stop_timer(struct ip_mc_list *im)
+{
+ spin_lock_bh(&im->lock);
+ if (del_timer(&im->timer))
+ atomic_dec(&im->refcnt);
+ im->tm_running = 0;
+ im->reporter = 0;
+ im->unsolicit_count = 0;
+ spin_unlock_bh(&im->lock);
+}
+
+/* It must be called with locked im->lock */
+static void igmp_start_timer(struct ip_mc_list *im, int max_delay)
+{
+ int tv = prandom_u32() % max_delay;
+
+ im->tm_running = 1;
+ if (!mod_timer(&im->timer, jiffies+tv+2))
+ atomic_inc(&im->refcnt);
+}
+
+static void igmp_gq_start_timer(struct in_device *in_dev)
+{
+ int tv = prandom_u32() % in_dev->mr_maxdelay;
+
+ in_dev->mr_gq_running = 1;
+ if (!mod_timer(&in_dev->mr_gq_timer, jiffies+tv+2))
+ in_dev_hold(in_dev);
+}
+
+static void igmp_ifc_start_timer(struct in_device *in_dev, int delay)
+{
+ int tv = prandom_u32() % delay;
+
+ if (!mod_timer(&in_dev->mr_ifc_timer, jiffies+tv+2))
+ in_dev_hold(in_dev);
+}
+
+static void igmp_mod_timer(struct ip_mc_list *im, int max_delay)
+{
+ spin_lock_bh(&im->lock);
+ im->unsolicit_count = 0;
+ if (del_timer(&im->timer)) {
+ if ((long)(im->timer.expires-jiffies) < max_delay) {
+ add_timer(&im->timer);
+ im->tm_running = 1;
+ spin_unlock_bh(&im->lock);
+ return;
+ }
+ atomic_dec(&im->refcnt);
+ }
+ igmp_start_timer(im, max_delay);
+ spin_unlock_bh(&im->lock);
+}
+
+
+/*
+ * Send an IGMP report.
+ */
+
+#define IGMP_SIZE (sizeof(struct igmphdr)+sizeof(struct iphdr)+4)
+
+
+static int is_in(struct ip_mc_list *pmc, struct ip_sf_list *psf, int type,
+ int gdeleted, int sdeleted)
+{
+ switch (type) {
+ case IGMPV3_MODE_IS_INCLUDE:
+ case IGMPV3_MODE_IS_EXCLUDE:
+ if (gdeleted || sdeleted)
+ return 0;
+ if (!(pmc->gsquery && !psf->sf_gsresp)) {
+ if (pmc->sfmode == MCAST_INCLUDE)
+ return 1;
+ /* don't include if this source is excluded
+ * in all filters
+ */
+ if (psf->sf_count[MCAST_INCLUDE])
+ return type == IGMPV3_MODE_IS_INCLUDE;
+ return pmc->sfcount[MCAST_EXCLUDE] ==
+ psf->sf_count[MCAST_EXCLUDE];
+ }
+ return 0;
+ case IGMPV3_CHANGE_TO_INCLUDE:
+ if (gdeleted || sdeleted)
+ return 0;
+ return psf->sf_count[MCAST_INCLUDE] != 0;
+ case IGMPV3_CHANGE_TO_EXCLUDE:
+ if (gdeleted || sdeleted)
+ return 0;
+ if (pmc->sfcount[MCAST_EXCLUDE] == 0 ||
+ psf->sf_count[MCAST_INCLUDE])
+ return 0;
+ return pmc->sfcount[MCAST_EXCLUDE] ==
+ psf->sf_count[MCAST_EXCLUDE];
+ case IGMPV3_ALLOW_NEW_SOURCES:
+ if (gdeleted || !psf->sf_crcount)
+ return 0;
+ return (pmc->sfmode == MCAST_INCLUDE) ^ sdeleted;
+ case IGMPV3_BLOCK_OLD_SOURCES:
+ if (pmc->sfmode == MCAST_INCLUDE)
+ return gdeleted || (psf->sf_crcount && sdeleted);
+ return psf->sf_crcount && !gdeleted && !sdeleted;
+ }
+ return 0;
+}
+
+static int
+igmp_scount(struct ip_mc_list *pmc, int type, int gdeleted, int sdeleted)
+{
+ struct ip_sf_list *psf;
+ int scount = 0;
+
+ for (psf = pmc->sources; psf; psf = psf->sf_next) {
+ if (!is_in(pmc, psf, type, gdeleted, sdeleted))
+ continue;
+ scount++;
+ }
+ return scount;
+}
+
+static struct sk_buff *igmpv3_newpack(struct net_device *dev, unsigned int mtu)
+{
+ struct sk_buff *skb;
+ struct rtable *rt;
+ struct iphdr *pip;
+ struct igmpv3_report *pig;
+ struct net *net = dev_net(dev);
+ struct flowi4 fl4;
+ int hlen = LL_RESERVED_SPACE(dev);
+ int tlen = dev->needed_tailroom;
+ unsigned int size = mtu;
+
+ while (1) {
+ skb = alloc_skb(size + hlen + tlen,
+ GFP_ATOMIC | __GFP_NOWARN);
+ if (skb)
+ break;
+ size >>= 1;
+ if (size < 256)
+ return NULL;
+ }
+ skb->priority = TC_PRIO_CONTROL;
+
+ rt = ip_route_output_ports(net, &fl4, NULL, IGMPV3_ALL_MCR, 0,
+ 0, 0,
+ IPPROTO_IGMP, 0, dev->ifindex);
+ if (IS_ERR(rt)) {
+ kfree_skb(skb);
+ return NULL;
+ }
+
+ skb_dst_set(skb, &rt->dst);
+ skb->dev = dev;
+
+ skb->reserved_tailroom = skb_end_offset(skb) -
+ min(mtu, skb_end_offset(skb));
+ skb_reserve(skb, hlen);
+
+ skb_reset_network_header(skb);
+ pip = ip_hdr(skb);
+ skb_put(skb, sizeof(struct iphdr) + 4);
+
+ pip->version = 4;
+ pip->ihl = (sizeof(struct iphdr)+4)>>2;
+ pip->tos = 0xc0;
+ pip->frag_off = htons(IP_DF);
+ pip->ttl = 1;
+ pip->daddr = fl4.daddr;
+ pip->saddr = fl4.saddr;
+ pip->protocol = IPPROTO_IGMP;
+ pip->tot_len = 0; /* filled in later */
+ ip_select_ident(net, skb, NULL);
+ ((u8 *)&pip[1])[0] = IPOPT_RA;
+ ((u8 *)&pip[1])[1] = 4;
+ ((u8 *)&pip[1])[2] = 0;
+ ((u8 *)&pip[1])[3] = 0;
+
+ skb->transport_header = skb->network_header + sizeof(struct iphdr) + 4;
+ skb_put(skb, sizeof(*pig));
+ pig = igmpv3_report_hdr(skb);
+ pig->type = IGMPV3_HOST_MEMBERSHIP_REPORT;
+ pig->resv1 = 0;
+ pig->csum = 0;
+ pig->resv2 = 0;
+ pig->ngrec = 0;
+ return skb;
+}
+
+static int igmpv3_sendpack(struct sk_buff *skb)
+{
+ struct igmphdr *pig = igmp_hdr(skb);
+ const int igmplen = skb_tail_pointer(skb) - skb_transport_header(skb);
+
+ pig->csum = ip_compute_csum(igmp_hdr(skb), igmplen);
+
+ return ip_local_out(skb);
+}
+
+static int grec_size(struct ip_mc_list *pmc, int type, int gdel, int sdel)
+{
+ return sizeof(struct igmpv3_grec) + 4*igmp_scount(pmc, type, gdel, sdel);
+}
+
+static struct sk_buff *add_grhead(struct sk_buff *skb, struct ip_mc_list *pmc,
+ int type, struct igmpv3_grec **ppgr)
+{
+ struct net_device *dev = pmc->interface->dev;
+ struct igmpv3_report *pih;
+ struct igmpv3_grec *pgr;
+
+ if (!skb)
+ skb = igmpv3_newpack(dev, dev->mtu);
+ if (!skb)
+ return NULL;
+ pgr = (struct igmpv3_grec *)skb_put(skb, sizeof(struct igmpv3_grec));
+ pgr->grec_type = type;
+ pgr->grec_auxwords = 0;
+ pgr->grec_nsrcs = 0;
+ pgr->grec_mca = pmc->multiaddr;
+ pih = igmpv3_report_hdr(skb);
+ pih->ngrec = htons(ntohs(pih->ngrec)+1);
+ *ppgr = pgr;
+ return skb;
+}
+
+#define AVAILABLE(skb) ((skb) ? skb_availroom(skb) : 0)
+
+static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc,
+ int type, int gdeleted, int sdeleted)
+{
+ struct net_device *dev = pmc->interface->dev;
+ struct igmpv3_report *pih;
+ struct igmpv3_grec *pgr = NULL;
+ struct ip_sf_list *psf, *psf_next, *psf_prev, **psf_list;
+ int scount, stotal, first, isquery, truncate;
+
+ if (pmc->multiaddr == IGMP_ALL_HOSTS)
+ return skb;
+
+ isquery = type == IGMPV3_MODE_IS_INCLUDE ||
+ type == IGMPV3_MODE_IS_EXCLUDE;
+ truncate = type == IGMPV3_MODE_IS_EXCLUDE ||
+ type == IGMPV3_CHANGE_TO_EXCLUDE;
+
+ stotal = scount = 0;
+
+ psf_list = sdeleted ? &pmc->tomb : &pmc->sources;
+
+ if (!*psf_list)
+ goto empty_source;
+
+ pih = skb ? igmpv3_report_hdr(skb) : NULL;
+
+ /* EX and TO_EX get a fresh packet, if needed */
+ if (truncate) {
+ if (pih && pih->ngrec &&
+ AVAILABLE(skb) < grec_size(pmc, type, gdeleted, sdeleted)) {
+ if (skb)
+ igmpv3_sendpack(skb);
+ skb = igmpv3_newpack(dev, dev->mtu);
+ }
+ }
+ first = 1;
+ psf_prev = NULL;
+ for (psf = *psf_list; psf; psf = psf_next) {
+ __be32 *psrc;
+
+ psf_next = psf->sf_next;
+
+ if (!is_in(pmc, psf, type, gdeleted, sdeleted)) {
+ psf_prev = psf;
+ continue;
+ }
+
+ /* clear marks on query responses */
+ if (isquery)
+ psf->sf_gsresp = 0;
+
+ if (AVAILABLE(skb) < sizeof(__be32) +
+ first*sizeof(struct igmpv3_grec)) {
+ if (truncate && !first)
+ break; /* truncate these */
+ if (pgr)
+ pgr->grec_nsrcs = htons(scount);
+ if (skb)
+ igmpv3_sendpack(skb);
+ skb = igmpv3_newpack(dev, dev->mtu);
+ first = 1;
+ scount = 0;
+ }
+ if (first) {
+ skb = add_grhead(skb, pmc, type, &pgr);
+ first = 0;
+ }
+ if (!skb)
+ return NULL;
+ psrc = (__be32 *)skb_put(skb, sizeof(__be32));
+ *psrc = psf->sf_inaddr;
+ scount++; stotal++;
+ if ((type == IGMPV3_ALLOW_NEW_SOURCES ||
+ type == IGMPV3_BLOCK_OLD_SOURCES) && psf->sf_crcount) {
+ psf->sf_crcount--;
+ if ((sdeleted || gdeleted) && psf->sf_crcount == 0) {
+ if (psf_prev)
+ psf_prev->sf_next = psf->sf_next;
+ else
+ *psf_list = psf->sf_next;
+ kfree(psf);
+ continue;
+ }
+ }
+ psf_prev = psf;
+ }
+
+empty_source:
+ if (!stotal) {
+ if (type == IGMPV3_ALLOW_NEW_SOURCES ||
+ type == IGMPV3_BLOCK_OLD_SOURCES)
+ return skb;
+ if (pmc->crcount || isquery) {
+ /* make sure we have room for group header */
+ if (skb && AVAILABLE(skb) < sizeof(struct igmpv3_grec)) {
+ igmpv3_sendpack(skb);
+ skb = NULL; /* add_grhead will get a new one */
+ }
+ skb = add_grhead(skb, pmc, type, &pgr);
+ }
+ }
+ if (pgr)
+ pgr->grec_nsrcs = htons(scount);
+
+ if (isquery)
+ pmc->gsquery = 0; /* clear query state on report */
+ return skb;
+}
+
+static int igmpv3_send_report(struct in_device *in_dev, struct ip_mc_list *pmc)
+{
+ struct sk_buff *skb = NULL;
+ int type;
+
+ if (!pmc) {
+ rcu_read_lock();
+ for_each_pmc_rcu(in_dev, pmc) {
+ if (pmc->multiaddr == IGMP_ALL_HOSTS)
+ continue;
+ spin_lock_bh(&pmc->lock);
+ if (pmc->sfcount[MCAST_EXCLUDE])
+ type = IGMPV3_MODE_IS_EXCLUDE;
+ else
+ type = IGMPV3_MODE_IS_INCLUDE;
+ skb = add_grec(skb, pmc, type, 0, 0);
+ spin_unlock_bh(&pmc->lock);
+ }
+ rcu_read_unlock();
+ } else {
+ spin_lock_bh(&pmc->lock);
+ if (pmc->sfcount[MCAST_EXCLUDE])
+ type = IGMPV3_MODE_IS_EXCLUDE;
+ else
+ type = IGMPV3_MODE_IS_INCLUDE;
+ skb = add_grec(skb, pmc, type, 0, 0);
+ spin_unlock_bh(&pmc->lock);
+ }
+ if (!skb)
+ return 0;
+ return igmpv3_sendpack(skb);
+}
+
+/*
+ * remove zero-count source records from a source filter list
+ */
+static void igmpv3_clear_zeros(struct ip_sf_list **ppsf)
+{
+ struct ip_sf_list *psf_prev, *psf_next, *psf;
+
+ psf_prev = NULL;
+ for (psf = *ppsf; psf; psf = psf_next) {
+ psf_next = psf->sf_next;
+ if (psf->sf_crcount == 0) {
+ if (psf_prev)
+ psf_prev->sf_next = psf->sf_next;
+ else
+ *ppsf = psf->sf_next;
+ kfree(psf);
+ } else
+ psf_prev = psf;
+ }
+}
+
+static void igmpv3_send_cr(struct in_device *in_dev)
+{
+ struct ip_mc_list *pmc, *pmc_prev, *pmc_next;
+ struct sk_buff *skb = NULL;
+ int type, dtype;
+
+ rcu_read_lock();
+ spin_lock_bh(&in_dev->mc_tomb_lock);
+
+ /* deleted MCA's */
+ pmc_prev = NULL;
+ for (pmc = in_dev->mc_tomb; pmc; pmc = pmc_next) {
+ pmc_next = pmc->next;
+ if (pmc->sfmode == MCAST_INCLUDE) {
+ type = IGMPV3_BLOCK_OLD_SOURCES;
+ dtype = IGMPV3_BLOCK_OLD_SOURCES;
+ skb = add_grec(skb, pmc, type, 1, 0);
+ skb = add_grec(skb, pmc, dtype, 1, 1);
+ }
+ if (pmc->crcount) {
+ if (pmc->sfmode == MCAST_EXCLUDE) {
+ type = IGMPV3_CHANGE_TO_INCLUDE;
+ skb = add_grec(skb, pmc, type, 1, 0);
+ }
+ pmc->crcount--;
+ if (pmc->crcount == 0) {
+ igmpv3_clear_zeros(&pmc->tomb);
+ igmpv3_clear_zeros(&pmc->sources);
+ }
+ }
+ if (pmc->crcount == 0 && !pmc->tomb && !pmc->sources) {
+ if (pmc_prev)
+ pmc_prev->next = pmc_next;
+ else
+ in_dev->mc_tomb = pmc_next;
+ in_dev_put(pmc->interface);
+ kfree(pmc);
+ } else
+ pmc_prev = pmc;
+ }
+ spin_unlock_bh(&in_dev->mc_tomb_lock);
+
+ /* change recs */
+ for_each_pmc_rcu(in_dev, pmc) {
+ spin_lock_bh(&pmc->lock);
+ if (pmc->sfcount[MCAST_EXCLUDE]) {
+ type = IGMPV3_BLOCK_OLD_SOURCES;
+ dtype = IGMPV3_ALLOW_NEW_SOURCES;
+ } else {
+ type = IGMPV3_ALLOW_NEW_SOURCES;
+ dtype = IGMPV3_BLOCK_OLD_SOURCES;
+ }
+ skb = add_grec(skb, pmc, type, 0, 0);
+ skb = add_grec(skb, pmc, dtype, 0, 1); /* deleted sources */
+
+ /* filter mode changes */
+ if (pmc->crcount) {
+ if (pmc->sfmode == MCAST_EXCLUDE)
+ type = IGMPV3_CHANGE_TO_EXCLUDE;
+ else
+ type = IGMPV3_CHANGE_TO_INCLUDE;
+ skb = add_grec(skb, pmc, type, 0, 0);
+ pmc->crcount--;
+ }
+ spin_unlock_bh(&pmc->lock);
+ }
+ rcu_read_unlock();
+
+ if (!skb)
+ return;
+ (void) igmpv3_sendpack(skb);
+}
+
+static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc,
+ int type)
+{
+ struct sk_buff *skb;
+ struct iphdr *iph;
+ struct igmphdr *ih;
+ struct rtable *rt;
+ struct net_device *dev = in_dev->dev;
+ struct net *net = dev_net(dev);
+ __be32 group = pmc ? pmc->multiaddr : 0;
+ struct flowi4 fl4;
+ __be32 dst;
+ int hlen, tlen;
+
+ if (type == IGMPV3_HOST_MEMBERSHIP_REPORT)
+ return igmpv3_send_report(in_dev, pmc);
+ else if (type == IGMP_HOST_LEAVE_MESSAGE)
+ dst = IGMP_ALL_ROUTER;
+ else
+ dst = group;
+
+ rt = ip_route_output_ports(net, &fl4, NULL, dst, 0,
+ 0, 0,
+ IPPROTO_IGMP, 0, dev->ifindex);
+ if (IS_ERR(rt))
+ return -1;
+
+ hlen = LL_RESERVED_SPACE(dev);
+ tlen = dev->needed_tailroom;
+ skb = alloc_skb(IGMP_SIZE + hlen + tlen, GFP_ATOMIC);
+ if (!skb) {
+ ip_rt_put(rt);
+ return -1;
+ }
+ skb->priority = TC_PRIO_CONTROL;
+
+ skb_dst_set(skb, &rt->dst);
+
+ skb_reserve(skb, hlen);
+
+ skb_reset_network_header(skb);
+ iph = ip_hdr(skb);
+ skb_put(skb, sizeof(struct iphdr) + 4);
+
+ iph->version = 4;
+ iph->ihl = (sizeof(struct iphdr)+4)>>2;
+ iph->tos = 0xc0;
+ iph->frag_off = htons(IP_DF);
+ iph->ttl = 1;
+ iph->daddr = dst;
+ iph->saddr = fl4.saddr;
+ iph->protocol = IPPROTO_IGMP;
+ ip_select_ident(net, skb, NULL);
+ ((u8 *)&iph[1])[0] = IPOPT_RA;
+ ((u8 *)&iph[1])[1] = 4;
+ ((u8 *)&iph[1])[2] = 0;
+ ((u8 *)&iph[1])[3] = 0;
+
+ ih = (struct igmphdr *)skb_put(skb, sizeof(struct igmphdr));
+ ih->type = type;
+ ih->code = 0;
+ ih->csum = 0;
+ ih->group = group;
+ ih->csum = ip_compute_csum((void *)ih, sizeof(struct igmphdr));
+
+ return ip_local_out(skb);
+}
+
+static void igmp_gq_timer_expire(unsigned long data)
+{
+ struct in_device *in_dev = (struct in_device *)data;
+
+ in_dev->mr_gq_running = 0;
+ igmpv3_send_report(in_dev, NULL);
+ in_dev_put(in_dev);
+}
+
+static void igmp_ifc_timer_expire(unsigned long data)
+{
+ struct in_device *in_dev = (struct in_device *)data;
+
+ igmpv3_send_cr(in_dev);
+ if (in_dev->mr_ifc_count) {
+ in_dev->mr_ifc_count--;
+ igmp_ifc_start_timer(in_dev,
+ unsolicited_report_interval(in_dev));
+ }
+ in_dev_put(in_dev);
+}
+
+static void igmp_ifc_event(struct in_device *in_dev)
+{
+ if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev))
+ return;
+ in_dev->mr_ifc_count = in_dev->mr_qrv ?: sysctl_igmp_qrv;
+ igmp_ifc_start_timer(in_dev, 1);
+}
+
+
+static void igmp_timer_expire(unsigned long data)
+{
+ struct ip_mc_list *im = (struct ip_mc_list *)data;
+ struct in_device *in_dev = im->interface;
+
+ spin_lock(&im->lock);
+ im->tm_running = 0;
+
+ if (im->unsolicit_count) {
+ im->unsolicit_count--;
+ igmp_start_timer(im, unsolicited_report_interval(in_dev));
+ }
+ im->reporter = 1;
+ spin_unlock(&im->lock);
+
+ if (IGMP_V1_SEEN(in_dev))
+ igmp_send_report(in_dev, im, IGMP_HOST_MEMBERSHIP_REPORT);
+ else if (IGMP_V2_SEEN(in_dev))
+ igmp_send_report(in_dev, im, IGMPV2_HOST_MEMBERSHIP_REPORT);
+ else
+ igmp_send_report(in_dev, im, IGMPV3_HOST_MEMBERSHIP_REPORT);
+
+ ip_ma_put(im);
+}
+
+/* mark EXCLUDE-mode sources */
+static int igmp_xmarksources(struct ip_mc_list *pmc, int nsrcs, __be32 *srcs)
+{
+ struct ip_sf_list *psf;
+ int i, scount;
+
+ scount = 0;
+ for (psf = pmc->sources; psf; psf = psf->sf_next) {
+ if (scount == nsrcs)
+ break;
+ for (i = 0; i < nsrcs; i++) {
+ /* skip inactive filters */
+ if (psf->sf_count[MCAST_INCLUDE] ||
+ pmc->sfcount[MCAST_EXCLUDE] !=
+ psf->sf_count[MCAST_EXCLUDE])
+ break;
+ if (srcs[i] == psf->sf_inaddr) {
+ scount++;
+ break;
+ }
+ }
+ }
+ pmc->gsquery = 0;
+ if (scount == nsrcs) /* all sources excluded */
+ return 0;
+ return 1;
+}
+
+static int igmp_marksources(struct ip_mc_list *pmc, int nsrcs, __be32 *srcs)
+{
+ struct ip_sf_list *psf;
+ int i, scount;
+
+ if (pmc->sfmode == MCAST_EXCLUDE)
+ return igmp_xmarksources(pmc, nsrcs, srcs);
+
+ /* mark INCLUDE-mode sources */
+ scount = 0;
+ for (psf = pmc->sources; psf; psf = psf->sf_next) {
+ if (scount == nsrcs)
+ break;
+ for (i = 0; i < nsrcs; i++)
+ if (srcs[i] == psf->sf_inaddr) {
+ psf->sf_gsresp = 1;
+ scount++;
+ break;
+ }
+ }
+ if (!scount) {
+ pmc->gsquery = 0;
+ return 0;
+ }
+ pmc->gsquery = 1;
+ return 1;
+}
+
+/* return true if packet was dropped */
+static bool igmp_heard_report(struct in_device *in_dev, __be32 group)
+{
+ struct ip_mc_list *im;
+
+ /* Timers are only set for non-local groups */
+
+ if (group == IGMP_ALL_HOSTS)
+ return false;
+
+ rcu_read_lock();
+ for_each_pmc_rcu(in_dev, im) {
+ if (im->multiaddr == group) {
+ igmp_stop_timer(im);
+ break;
+ }
+ }
+ rcu_read_unlock();
+ return false;
+}
+
+/* return true if packet was dropped */
+static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
+ int len)
+{
+ struct igmphdr *ih = igmp_hdr(skb);
+ struct igmpv3_query *ih3 = igmpv3_query_hdr(skb);
+ struct ip_mc_list *im;
+ __be32 group = ih->group;
+ int max_delay;
+ int mark = 0;
+
+
+ if (len == 8) {
+ if (ih->code == 0) {
+ /* Alas, old v1 router presents here. */
+
+ max_delay = IGMP_QUERY_RESPONSE_INTERVAL;
+ in_dev->mr_v1_seen = jiffies +
+ IGMP_V1_ROUTER_PRESENT_TIMEOUT;
+ group = 0;
+ } else {
+ /* v2 router present */
+ max_delay = ih->code*(HZ/IGMP_TIMER_SCALE);
+ in_dev->mr_v2_seen = jiffies +
+ IGMP_V2_ROUTER_PRESENT_TIMEOUT;
+ }
+ /* cancel the interface change timer */
+ in_dev->mr_ifc_count = 0;
+ if (del_timer(&in_dev->mr_ifc_timer))
+ __in_dev_put(in_dev);
+ /* clear deleted report items */
+ igmpv3_clear_delrec(in_dev);
+ } else if (len < 12) {
+ return true; /* ignore bogus packet; freed by caller */
+ } else if (IGMP_V1_SEEN(in_dev)) {
+ /* This is a v3 query with v1 queriers present */
+ max_delay = IGMP_QUERY_RESPONSE_INTERVAL;
+ group = 0;
+ } else if (IGMP_V2_SEEN(in_dev)) {
+ /* this is a v3 query with v2 queriers present;
+ * Interpretation of the max_delay code is problematic here.
+ * A real v2 host would use ih_code directly, while v3 has a
+ * different encoding. We use the v3 encoding as more likely
+ * to be intended in a v3 query.
+ */
+ max_delay = IGMPV3_MRC(ih3->code)*(HZ/IGMP_TIMER_SCALE);
+ if (!max_delay)
+ max_delay = 1; /* can't mod w/ 0 */
+ } else { /* v3 */
+ if (!pskb_may_pull(skb, sizeof(struct igmpv3_query)))
+ return true;
+
+ ih3 = igmpv3_query_hdr(skb);
+ if (ih3->nsrcs) {
+ if (!pskb_may_pull(skb, sizeof(struct igmpv3_query)
+ + ntohs(ih3->nsrcs)*sizeof(__be32)))
+ return true;
+ ih3 = igmpv3_query_hdr(skb);
+ }
+
+ max_delay = IGMPV3_MRC(ih3->code)*(HZ/IGMP_TIMER_SCALE);
+ if (!max_delay)
+ max_delay = 1; /* can't mod w/ 0 */
+ in_dev->mr_maxdelay = max_delay;
+ if (ih3->qrv)
+ in_dev->mr_qrv = ih3->qrv;
+ if (!group) { /* general query */
+ if (ih3->nsrcs)
+ return true; /* no sources allowed */
+ igmp_gq_start_timer(in_dev);
+ return false;
+ }
+ /* mark sources to include, if group & source-specific */
+ mark = ih3->nsrcs != 0;
+ }
+
+ /*
+ * - Start the timers in all of our membership records
+ * that the query applies to for the interface on
+ * which the query arrived excl. those that belong
+ * to a "local" group (224.0.0.X)
+ * - For timers already running check if they need to
+ * be reset.
+ * - Use the igmp->igmp_code field as the maximum
+ * delay possible
+ */
+ rcu_read_lock();
+ for_each_pmc_rcu(in_dev, im) {
+ int changed;
+
+ if (group && group != im->multiaddr)
+ continue;
+ if (im->multiaddr == IGMP_ALL_HOSTS)
+ continue;
+ spin_lock_bh(&im->lock);
+ if (im->tm_running)
+ im->gsquery = im->gsquery && mark;
+ else
+ im->gsquery = mark;
+ changed = !im->gsquery ||
+ igmp_marksources(im, ntohs(ih3->nsrcs), ih3->srcs);
+ spin_unlock_bh(&im->lock);
+ if (changed)
+ igmp_mod_timer(im, max_delay);
+ }
+ rcu_read_unlock();
+ return false;
+}
+
+/* called in rcu_read_lock() section */
+int igmp_rcv(struct sk_buff *skb)
+{
+ /* This basically follows the spec line by line -- see RFC1112 */
+ struct igmphdr *ih;
+ struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
+ int len = skb->len;
+ bool dropped = true;
+
+ if (!in_dev)
+ goto drop;
+
+ if (!pskb_may_pull(skb, sizeof(struct igmphdr)))
+ goto drop;
+
+ if (skb_checksum_simple_validate(skb))
+ goto drop;
+
+ ih = igmp_hdr(skb);
+ switch (ih->type) {
+ case IGMP_HOST_MEMBERSHIP_QUERY:
+ dropped = igmp_heard_query(in_dev, skb, len);
+ break;
+ case IGMP_HOST_MEMBERSHIP_REPORT:
+ case IGMPV2_HOST_MEMBERSHIP_REPORT:
+ /* Is it our report looped back? */
+ if (rt_is_output_route(skb_rtable(skb)))
+ break;
+ /* don't rely on MC router hearing unicast reports */
+ if (skb->pkt_type == PACKET_MULTICAST ||
+ skb->pkt_type == PACKET_BROADCAST)
+ dropped = igmp_heard_report(in_dev, ih->group);
+ break;
+ case IGMP_PIM:
+#ifdef CONFIG_IP_PIMSM_V1
+ return pim_rcv_v1(skb);
+#endif
+ case IGMPV3_HOST_MEMBERSHIP_REPORT:
+ case IGMP_DVMRP:
+ case IGMP_TRACE:
+ case IGMP_HOST_LEAVE_MESSAGE:
+ case IGMP_MTRACE:
+ case IGMP_MTRACE_RESP:
+ break;
+ default:
+ break;
+ }
+
+drop:
+ if (dropped)
+ kfree_skb(skb);
+ else
+ consume_skb(skb);
+ return 0;
+}
+
+#endif
+
+
+/*
+ * Add a filter to a device
+ */
+
+static void ip_mc_filter_add(struct in_device *in_dev, __be32 addr)
+{
+ char buf[MAX_ADDR_LEN];
+ struct net_device *dev = in_dev->dev;
+
+ /* Checking for IFF_MULTICAST here is WRONG-WRONG-WRONG.
+ We will get multicast token leakage, when IFF_MULTICAST
+ is changed. This check should be done in ndo_set_rx_mode
+ routine. Something sort of:
+ if (dev->mc_list && dev->flags&IFF_MULTICAST) { do it; }
+ --ANK
+ */
+ if (arp_mc_map(addr, buf, dev, 0) == 0)
+ dev_mc_add(dev, buf);
+}
+
+/*
+ * Remove a filter from a device
+ */
+
+static void ip_mc_filter_del(struct in_device *in_dev, __be32 addr)
+{
+ char buf[MAX_ADDR_LEN];
+ struct net_device *dev = in_dev->dev;
+
+ if (arp_mc_map(addr, buf, dev, 0) == 0)
+ dev_mc_del(dev, buf);
+}
+
+#ifdef CONFIG_IP_MULTICAST
+/*
+ * deleted ip_mc_list manipulation
+ */
+static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im)
+{
+ struct ip_mc_list *pmc;
+
+ /* this is an "ip_mc_list" for convenience; only the fields below
+ * are actually used. In particular, the refcnt and users are not
+ * used for management of the delete list. Using the same structure
+ * for deleted items allows change reports to use common code with
+ * non-deleted or query-response MCA's.
+ */
+ pmc = kzalloc(sizeof(*pmc), GFP_KERNEL);
+ if (!pmc)
+ return;
+ spin_lock_bh(&im->lock);
+ pmc->interface = im->interface;
+ in_dev_hold(in_dev);
+ pmc->multiaddr = im->multiaddr;
+ pmc->crcount = in_dev->mr_qrv ?: sysctl_igmp_qrv;
+ pmc->sfmode = im->sfmode;
+ if (pmc->sfmode == MCAST_INCLUDE) {
+ struct ip_sf_list *psf;
+
+ pmc->tomb = im->tomb;
+ pmc->sources = im->sources;
+ im->tomb = im->sources = NULL;
+ for (psf = pmc->sources; psf; psf = psf->sf_next)
+ psf->sf_crcount = pmc->crcount;
+ }
+ spin_unlock_bh(&im->lock);
+
+ spin_lock_bh(&in_dev->mc_tomb_lock);
+ pmc->next = in_dev->mc_tomb;
+ in_dev->mc_tomb = pmc;
+ spin_unlock_bh(&in_dev->mc_tomb_lock);
+}
+
+static void igmpv3_del_delrec(struct in_device *in_dev, __be32 multiaddr)
+{
+ struct ip_mc_list *pmc, *pmc_prev;
+ struct ip_sf_list *psf, *psf_next;
+
+ spin_lock_bh(&in_dev->mc_tomb_lock);
+ pmc_prev = NULL;
+ for (pmc = in_dev->mc_tomb; pmc; pmc = pmc->next) {
+ if (pmc->multiaddr == multiaddr)
+ break;
+ pmc_prev = pmc;
+ }
+ if (pmc) {
+ if (pmc_prev)
+ pmc_prev->next = pmc->next;
+ else
+ in_dev->mc_tomb = pmc->next;
+ }
+ spin_unlock_bh(&in_dev->mc_tomb_lock);
+ if (pmc) {
+ for (psf = pmc->tomb; psf; psf = psf_next) {
+ psf_next = psf->sf_next;
+ kfree(psf);
+ }
+ in_dev_put(pmc->interface);
+ kfree(pmc);
+ }
+}
+
+static void igmpv3_clear_delrec(struct in_device *in_dev)
+{
+ struct ip_mc_list *pmc, *nextpmc;
+
+ spin_lock_bh(&in_dev->mc_tomb_lock);
+ pmc = in_dev->mc_tomb;
+ in_dev->mc_tomb = NULL;
+ spin_unlock_bh(&in_dev->mc_tomb_lock);
+
+ for (; pmc; pmc = nextpmc) {
+ nextpmc = pmc->next;
+ ip_mc_clear_src(pmc);
+ in_dev_put(pmc->interface);
+ kfree(pmc);
+ }
+ /* clear dead sources, too */
+ rcu_read_lock();
+ for_each_pmc_rcu(in_dev, pmc) {
+ struct ip_sf_list *psf, *psf_next;
+
+ spin_lock_bh(&pmc->lock);
+ psf = pmc->tomb;
+ pmc->tomb = NULL;
+ spin_unlock_bh(&pmc->lock);
+ for (; psf; psf = psf_next) {
+ psf_next = psf->sf_next;
+ kfree(psf);
+ }
+ }
+ rcu_read_unlock();
+}
+#endif
+
+static void igmp_group_dropped(struct ip_mc_list *im)
+{
+ struct in_device *in_dev = im->interface;
+#ifdef CONFIG_IP_MULTICAST
+ int reporter;
+#endif
+
+ if (im->loaded) {
+ im->loaded = 0;
+ ip_mc_filter_del(in_dev, im->multiaddr);
+ }
+
+#ifdef CONFIG_IP_MULTICAST
+ if (im->multiaddr == IGMP_ALL_HOSTS)
+ return;
+
+ reporter = im->reporter;
+ igmp_stop_timer(im);
+
+ if (!in_dev->dead) {
+ if (IGMP_V1_SEEN(in_dev))
+ return;
+ if (IGMP_V2_SEEN(in_dev)) {
+ if (reporter)
+ igmp_send_report(in_dev, im, IGMP_HOST_LEAVE_MESSAGE);
+ return;
+ }
+ /* IGMPv3 */
+ igmpv3_add_delrec(in_dev, im);
+
+ igmp_ifc_event(in_dev);
+ }
+#endif
+}
+
+static void igmp_group_added(struct ip_mc_list *im)
+{
+ struct in_device *in_dev = im->interface;
+
+ if (im->loaded == 0) {
+ im->loaded = 1;
+ ip_mc_filter_add(in_dev, im->multiaddr);
+ }
+
+#ifdef CONFIG_IP_MULTICAST
+ if (im->multiaddr == IGMP_ALL_HOSTS)
+ return;
+
+ if (in_dev->dead)
+ return;
+ if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev)) {
+ spin_lock_bh(&im->lock);
+ igmp_start_timer(im, IGMP_INITIAL_REPORT_DELAY);
+ spin_unlock_bh(&im->lock);
+ return;
+ }
+ /* else, v3 */
+
+ im->crcount = in_dev->mr_qrv ?: sysctl_igmp_qrv;
+ igmp_ifc_event(in_dev);
+#endif
+}
+
+
+/*
+ * Multicast list managers
+ */
+
+static u32 ip_mc_hash(const struct ip_mc_list *im)
+{
+ return hash_32((__force u32)im->multiaddr, MC_HASH_SZ_LOG);
+}
+
+static void ip_mc_hash_add(struct in_device *in_dev,
+ struct ip_mc_list *im)
+{
+ struct ip_mc_list __rcu **mc_hash;
+ u32 hash;
+
+ mc_hash = rtnl_dereference(in_dev->mc_hash);
+ if (mc_hash) {
+ hash = ip_mc_hash(im);
+ im->next_hash = mc_hash[hash];
+ rcu_assign_pointer(mc_hash[hash], im);
+ return;
+ }
+
+ /* do not use a hash table for small number of items */
+ if (in_dev->mc_count < 4)
+ return;
+
+ mc_hash = kzalloc(sizeof(struct ip_mc_list *) << MC_HASH_SZ_LOG,
+ GFP_KERNEL);
+ if (!mc_hash)
+ return;
+
+ for_each_pmc_rtnl(in_dev, im) {
+ hash = ip_mc_hash(im);
+ im->next_hash = mc_hash[hash];
+ RCU_INIT_POINTER(mc_hash[hash], im);
+ }
+
+ rcu_assign_pointer(in_dev->mc_hash, mc_hash);
+}
+
+static void ip_mc_hash_remove(struct in_device *in_dev,
+ struct ip_mc_list *im)
+{
+ struct ip_mc_list __rcu **mc_hash = rtnl_dereference(in_dev->mc_hash);
+ struct ip_mc_list *aux;
+
+ if (!mc_hash)
+ return;
+ mc_hash += ip_mc_hash(im);
+ while ((aux = rtnl_dereference(*mc_hash)) != im)
+ mc_hash = &aux->next_hash;
+ *mc_hash = im->next_hash;
+}
+
+
+/*
+ * A socket has joined a multicast group on device dev.
+ */
+
+void ip_mc_inc_group(struct in_device *in_dev, __be32 addr)
+{
+ struct ip_mc_list *im;
+
+ ASSERT_RTNL();
+
+ for_each_pmc_rtnl(in_dev, im) {
+ if (im->multiaddr == addr) {
+ im->users++;
+ ip_mc_add_src(in_dev, &addr, MCAST_EXCLUDE, 0, NULL, 0);
+ goto out;
+ }
+ }
+
+ im = kzalloc(sizeof(*im), GFP_KERNEL);
+ if (!im)
+ goto out;
+
+ im->users = 1;
+ im->interface = in_dev;
+ in_dev_hold(in_dev);
+ im->multiaddr = addr;
+ /* initial mode is (EX, empty) */
+ im->sfmode = MCAST_EXCLUDE;
+ im->sfcount[MCAST_EXCLUDE] = 1;
+ atomic_set(&im->refcnt, 1);
+ spin_lock_init(&im->lock);
+#ifdef CONFIG_IP_MULTICAST
+ setup_timer(&im->timer, igmp_timer_expire, (unsigned long)im);
+ im->unsolicit_count = sysctl_igmp_qrv;
+#endif
+
+ im->next_rcu = in_dev->mc_list;
+ in_dev->mc_count++;
+ rcu_assign_pointer(in_dev->mc_list, im);
+
+ ip_mc_hash_add(in_dev, im);
+
+#ifdef CONFIG_IP_MULTICAST
+ igmpv3_del_delrec(in_dev, im->multiaddr);
+#endif
+ igmp_group_added(im);
+ if (!in_dev->dead)
+ ip_rt_multicast_event(in_dev);
+out:
+ return;
+}
+EXPORT_SYMBOL(ip_mc_inc_group);
+
+/*
+ * Resend IGMP JOIN report; used by netdev notifier.
+ */
+static void ip_mc_rejoin_groups(struct in_device *in_dev)
+{
+#ifdef CONFIG_IP_MULTICAST
+ struct ip_mc_list *im;
+ int type;
+
+ ASSERT_RTNL();
+
+ for_each_pmc_rtnl(in_dev, im) {
+ if (im->multiaddr == IGMP_ALL_HOSTS)
+ continue;
+
+ /* a failover is happening and switches
+ * must be notified immediately
+ */
+ if (IGMP_V1_SEEN(in_dev))
+ type = IGMP_HOST_MEMBERSHIP_REPORT;
+ else if (IGMP_V2_SEEN(in_dev))
+ type = IGMPV2_HOST_MEMBERSHIP_REPORT;
+ else
+ type = IGMPV3_HOST_MEMBERSHIP_REPORT;
+ igmp_send_report(in_dev, im, type);
+ }
+#endif
+}
+
+/*
+ * A socket has left a multicast group on device dev
+ */
+
+void ip_mc_dec_group(struct in_device *in_dev, __be32 addr)
+{
+ struct ip_mc_list *i;
+ struct ip_mc_list __rcu **ip;
+
+ ASSERT_RTNL();
+
+ for (ip = &in_dev->mc_list;
+ (i = rtnl_dereference(*ip)) != NULL;
+ ip = &i->next_rcu) {
+ if (i->multiaddr == addr) {
+ if (--i->users == 0) {
+ ip_mc_hash_remove(in_dev, i);
+ *ip = i->next_rcu;
+ in_dev->mc_count--;
+ igmp_group_dropped(i);
+ ip_mc_clear_src(i);
+
+ if (!in_dev->dead)
+ ip_rt_multicast_event(in_dev);
+
+ ip_ma_put(i);
+ return;
+ }
+ break;
+ }
+ }
+}
+EXPORT_SYMBOL(ip_mc_dec_group);
+
+/* Device changing type */
+
+void ip_mc_unmap(struct in_device *in_dev)
+{
+ struct ip_mc_list *pmc;
+
+ ASSERT_RTNL();
+
+ for_each_pmc_rtnl(in_dev, pmc)
+ igmp_group_dropped(pmc);
+}
+
+void ip_mc_remap(struct in_device *in_dev)
+{
+ struct ip_mc_list *pmc;
+
+ ASSERT_RTNL();
+
+ for_each_pmc_rtnl(in_dev, pmc)
+ igmp_group_added(pmc);
+}
+
+/* Device going down */
+
+void ip_mc_down(struct in_device *in_dev)
+{
+ struct ip_mc_list *pmc;
+
+ ASSERT_RTNL();
+
+ for_each_pmc_rtnl(in_dev, pmc)
+ igmp_group_dropped(pmc);
+
+#ifdef CONFIG_IP_MULTICAST
+ in_dev->mr_ifc_count = 0;
+ if (del_timer(&in_dev->mr_ifc_timer))
+ __in_dev_put(in_dev);
+ in_dev->mr_gq_running = 0;
+ if (del_timer(&in_dev->mr_gq_timer))
+ __in_dev_put(in_dev);
+ igmpv3_clear_delrec(in_dev);
+#endif
+
+ ip_mc_dec_group(in_dev, IGMP_ALL_HOSTS);
+}
+
+void ip_mc_init_dev(struct in_device *in_dev)
+{
+ ASSERT_RTNL();
+
+#ifdef CONFIG_IP_MULTICAST
+ setup_timer(&in_dev->mr_gq_timer, igmp_gq_timer_expire,
+ (unsigned long)in_dev);
+ setup_timer(&in_dev->mr_ifc_timer, igmp_ifc_timer_expire,
+ (unsigned long)in_dev);
+ in_dev->mr_qrv = sysctl_igmp_qrv;
+#endif
+
+ spin_lock_init(&in_dev->mc_tomb_lock);
+}
+
+/* Device going up */
+
+void ip_mc_up(struct in_device *in_dev)
+{
+ struct ip_mc_list *pmc;
+
+ ASSERT_RTNL();
+
+#ifdef CONFIG_IP_MULTICAST
+ in_dev->mr_qrv = sysctl_igmp_qrv;
+#endif
+ ip_mc_inc_group(in_dev, IGMP_ALL_HOSTS);
+
+ for_each_pmc_rtnl(in_dev, pmc)
+ igmp_group_added(pmc);
+}
+
+/*
+ * Device is about to be destroyed: clean up.
+ */
+
+void ip_mc_destroy_dev(struct in_device *in_dev)
+{
+ struct ip_mc_list *i;
+
+ ASSERT_RTNL();
+
+ /* Deactivate timers */
+ ip_mc_down(in_dev);
+
+ while ((i = rtnl_dereference(in_dev->mc_list)) != NULL) {
+ in_dev->mc_list = i->next_rcu;
+ in_dev->mc_count--;
+
+ /* We've dropped the groups in ip_mc_down already */
+ ip_mc_clear_src(i);
+ ip_ma_put(i);
+ }
+}
+
+/* RTNL is locked */
+static struct in_device *ip_mc_find_dev(struct net *net, struct ip_mreqn *imr)
+{
+ struct net_device *dev = NULL;
+ struct in_device *idev = NULL;
+
+ if (imr->imr_ifindex) {
+ idev = inetdev_by_index(net, imr->imr_ifindex);
+ return idev;
+ }
+ if (imr->imr_address.s_addr) {
+ dev = __ip_dev_find(net, imr->imr_address.s_addr, false);
+ if (!dev)
+ return NULL;
+ }
+
+ if (!dev) {
+ struct rtable *rt = ip_route_output(net,
+ imr->imr_multiaddr.s_addr,
+ 0, 0, 0);
+ if (!IS_ERR(rt)) {
+ dev = rt->dst.dev;
+ ip_rt_put(rt);
+ }
+ }
+ if (dev) {
+ imr->imr_ifindex = dev->ifindex;
+ idev = __in_dev_get_rtnl(dev);
+ }
+ return idev;
+}
+
+/*
+ * Join a socket to a group
+ */
+int sysctl_igmp_max_memberships __read_mostly = IP_MAX_MEMBERSHIPS;
+int sysctl_igmp_max_msf __read_mostly = IP_MAX_MSF;
+#ifdef CONFIG_IP_MULTICAST
+int sysctl_igmp_qrv __read_mostly = IGMP_QUERY_ROBUSTNESS_VARIABLE;
+#endif
+
+static int ip_mc_del1_src(struct ip_mc_list *pmc, int sfmode,
+ __be32 *psfsrc)
+{
+ struct ip_sf_list *psf, *psf_prev;
+ int rv = 0;
+
+ psf_prev = NULL;
+ for (psf = pmc->sources; psf; psf = psf->sf_next) {
+ if (psf->sf_inaddr == *psfsrc)
+ break;
+ psf_prev = psf;
+ }
+ if (!psf || psf->sf_count[sfmode] == 0) {
+ /* source filter not found, or count wrong => bug */
+ return -ESRCH;
+ }
+ psf->sf_count[sfmode]--;
+ if (psf->sf_count[sfmode] == 0) {
+ ip_rt_multicast_event(pmc->interface);
+ }
+ if (!psf->sf_count[MCAST_INCLUDE] && !psf->sf_count[MCAST_EXCLUDE]) {
+#ifdef CONFIG_IP_MULTICAST
+ struct in_device *in_dev = pmc->interface;
+#endif
+
+ /* no more filters for this source */
+ if (psf_prev)
+ psf_prev->sf_next = psf->sf_next;
+ else
+ pmc->sources = psf->sf_next;
+#ifdef CONFIG_IP_MULTICAST
+ if (psf->sf_oldin &&
+ !IGMP_V1_SEEN(in_dev) && !IGMP_V2_SEEN(in_dev)) {
+ psf->sf_crcount = in_dev->mr_qrv ?: sysctl_igmp_qrv;
+ psf->sf_next = pmc->tomb;
+ pmc->tomb = psf;
+ rv = 1;
+ } else
+#endif
+ kfree(psf);
+ }
+ return rv;
+}
+
+#ifndef CONFIG_IP_MULTICAST
+#define igmp_ifc_event(x) do { } while (0)
+#endif
+
+static int ip_mc_del_src(struct in_device *in_dev, __be32 *pmca, int sfmode,
+ int sfcount, __be32 *psfsrc, int delta)
+{
+ struct ip_mc_list *pmc;
+ int changerec = 0;
+ int i, err;
+
+ if (!in_dev)
+ return -ENODEV;
+ rcu_read_lock();
+ for_each_pmc_rcu(in_dev, pmc) {
+ if (*pmca == pmc->multiaddr)
+ break;
+ }
+ if (!pmc) {
+ /* MCA not found?? bug */
+ rcu_read_unlock();
+ return -ESRCH;
+ }
+ spin_lock_bh(&pmc->lock);
+ rcu_read_unlock();
+#ifdef CONFIG_IP_MULTICAST
+ sf_markstate(pmc);
+#endif
+ if (!delta) {
+ err = -EINVAL;
+ if (!pmc->sfcount[sfmode])
+ goto out_unlock;
+ pmc->sfcount[sfmode]--;
+ }
+ err = 0;
+ for (i = 0; i < sfcount; i++) {
+ int rv = ip_mc_del1_src(pmc, sfmode, &psfsrc[i]);
+
+ changerec |= rv > 0;
+ if (!err && rv < 0)
+ err = rv;
+ }
+ if (pmc->sfmode == MCAST_EXCLUDE &&
+ pmc->sfcount[MCAST_EXCLUDE] == 0 &&
+ pmc->sfcount[MCAST_INCLUDE]) {
+#ifdef CONFIG_IP_MULTICAST
+ struct ip_sf_list *psf;
+#endif
+
+ /* filter mode change */
+ pmc->sfmode = MCAST_INCLUDE;
+#ifdef CONFIG_IP_MULTICAST
+ pmc->crcount = in_dev->mr_qrv ?: sysctl_igmp_qrv;
+ in_dev->mr_ifc_count = pmc->crcount;
+ for (psf = pmc->sources; psf; psf = psf->sf_next)
+ psf->sf_crcount = 0;
+ igmp_ifc_event(pmc->interface);
+ } else if (sf_setstate(pmc) || changerec) {
+ igmp_ifc_event(pmc->interface);
+#endif
+ }
+out_unlock:
+ spin_unlock_bh(&pmc->lock);
+ return err;
+}
+
+/*
+ * Add multicast single-source filter to the interface list
+ */
+static int ip_mc_add1_src(struct ip_mc_list *pmc, int sfmode,
+ __be32 *psfsrc)
+{
+ struct ip_sf_list *psf, *psf_prev;
+
+ psf_prev = NULL;
+ for (psf = pmc->sources; psf; psf = psf->sf_next) {
+ if (psf->sf_inaddr == *psfsrc)
+ break;
+ psf_prev = psf;
+ }
+ if (!psf) {
+ psf = kzalloc(sizeof(*psf), GFP_ATOMIC);
+ if (!psf)
+ return -ENOBUFS;
+ psf->sf_inaddr = *psfsrc;
+ if (psf_prev) {
+ psf_prev->sf_next = psf;
+ } else
+ pmc->sources = psf;
+ }
+ psf->sf_count[sfmode]++;
+ if (psf->sf_count[sfmode] == 1) {
+ ip_rt_multicast_event(pmc->interface);
+ }
+ return 0;
+}
+
+#ifdef CONFIG_IP_MULTICAST
+static void sf_markstate(struct ip_mc_list *pmc)
+{
+ struct ip_sf_list *psf;
+ int mca_xcount = pmc->sfcount[MCAST_EXCLUDE];
+
+ for (psf = pmc->sources; psf; psf = psf->sf_next)
+ if (pmc->sfcount[MCAST_EXCLUDE]) {
+ psf->sf_oldin = mca_xcount ==
+ psf->sf_count[MCAST_EXCLUDE] &&
+ !psf->sf_count[MCAST_INCLUDE];
+ } else
+ psf->sf_oldin = psf->sf_count[MCAST_INCLUDE] != 0;
+}
+
+static int sf_setstate(struct ip_mc_list *pmc)
+{
+ struct ip_sf_list *psf, *dpsf;
+ int mca_xcount = pmc->sfcount[MCAST_EXCLUDE];
+ int qrv = pmc->interface->mr_qrv;
+ int new_in, rv;
+
+ rv = 0;
+ for (psf = pmc->sources; psf; psf = psf->sf_next) {
+ if (pmc->sfcount[MCAST_EXCLUDE]) {
+ new_in = mca_xcount == psf->sf_count[MCAST_EXCLUDE] &&
+ !psf->sf_count[MCAST_INCLUDE];
+ } else
+ new_in = psf->sf_count[MCAST_INCLUDE] != 0;
+ if (new_in) {
+ if (!psf->sf_oldin) {
+ struct ip_sf_list *prev = NULL;
+
+ for (dpsf = pmc->tomb; dpsf; dpsf = dpsf->sf_next) {
+ if (dpsf->sf_inaddr == psf->sf_inaddr)
+ break;
+ prev = dpsf;
+ }
+ if (dpsf) {
+ if (prev)
+ prev->sf_next = dpsf->sf_next;
+ else
+ pmc->tomb = dpsf->sf_next;
+ kfree(dpsf);
+ }
+ psf->sf_crcount = qrv;
+ rv++;
+ }
+ } else if (psf->sf_oldin) {
+
+ psf->sf_crcount = 0;
+ /*
+ * add or update "delete" records if an active filter
+ * is now inactive
+ */
+ for (dpsf = pmc->tomb; dpsf; dpsf = dpsf->sf_next)
+ if (dpsf->sf_inaddr == psf->sf_inaddr)
+ break;
+ if (!dpsf) {
+ dpsf = kmalloc(sizeof(*dpsf), GFP_ATOMIC);
+ if (!dpsf)
+ continue;
+ *dpsf = *psf;
+ /* pmc->lock held by callers */
+ dpsf->sf_next = pmc->tomb;
+ pmc->tomb = dpsf;
+ }
+ dpsf->sf_crcount = qrv;
+ rv++;
+ }
+ }
+ return rv;
+}
+#endif
+
+/*
+ * Add multicast source filter list to the interface list
+ */
+static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode,
+ int sfcount, __be32 *psfsrc, int delta)
+{
+ struct ip_mc_list *pmc;
+ int isexclude;
+ int i, err;
+
+ if (!in_dev)
+ return -ENODEV;
+ rcu_read_lock();
+ for_each_pmc_rcu(in_dev, pmc) {
+ if (*pmca == pmc->multiaddr)
+ break;
+ }
+ if (!pmc) {
+ /* MCA not found?? bug */
+ rcu_read_unlock();
+ return -ESRCH;
+ }
+ spin_lock_bh(&pmc->lock);
+ rcu_read_unlock();
+
+#ifdef CONFIG_IP_MULTICAST
+ sf_markstate(pmc);
+#endif
+ isexclude = pmc->sfmode == MCAST_EXCLUDE;
+ if (!delta)
+ pmc->sfcount[sfmode]++;
+ err = 0;
+ for (i = 0; i < sfcount; i++) {
+ err = ip_mc_add1_src(pmc, sfmode, &psfsrc[i]);
+ if (err)
+ break;
+ }
+ if (err) {
+ int j;
+
+ if (!delta)
+ pmc->sfcount[sfmode]--;
+ for (j = 0; j < i; j++)
+ (void) ip_mc_del1_src(pmc, sfmode, &psfsrc[j]);
+ } else if (isexclude != (pmc->sfcount[MCAST_EXCLUDE] != 0)) {
+#ifdef CONFIG_IP_MULTICAST
+ struct ip_sf_list *psf;
+ in_dev = pmc->interface;
+#endif
+
+ /* filter mode change */
+ if (pmc->sfcount[MCAST_EXCLUDE])
+ pmc->sfmode = MCAST_EXCLUDE;
+ else if (pmc->sfcount[MCAST_INCLUDE])
+ pmc->sfmode = MCAST_INCLUDE;
+#ifdef CONFIG_IP_MULTICAST
+ /* else no filters; keep old mode for reports */
+
+ pmc->crcount = in_dev->mr_qrv ?: sysctl_igmp_qrv;
+ in_dev->mr_ifc_count = pmc->crcount;
+ for (psf = pmc->sources; psf; psf = psf->sf_next)
+ psf->sf_crcount = 0;
+ igmp_ifc_event(in_dev);
+ } else if (sf_setstate(pmc)) {
+ igmp_ifc_event(in_dev);
+#endif
+ }
+ spin_unlock_bh(&pmc->lock);
+ return err;
+}
+
+static void ip_mc_clear_src(struct ip_mc_list *pmc)
+{
+ struct ip_sf_list *psf, *nextpsf;
+
+ for (psf = pmc->tomb; psf; psf = nextpsf) {
+ nextpsf = psf->sf_next;
+ kfree(psf);
+ }
+ pmc->tomb = NULL;
+ for (psf = pmc->sources; psf; psf = nextpsf) {
+ nextpsf = psf->sf_next;
+ kfree(psf);
+ }
+ pmc->sources = NULL;
+ pmc->sfmode = MCAST_EXCLUDE;
+ pmc->sfcount[MCAST_INCLUDE] = 0;
+ pmc->sfcount[MCAST_EXCLUDE] = 1;
+}
+
+/* Join a multicast group
+ */
+
+int ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr)
+{
+ __be32 addr = imr->imr_multiaddr.s_addr;
+ struct ip_mc_socklist *iml, *i;
+ struct in_device *in_dev;
+ struct inet_sock *inet = inet_sk(sk);
+ struct net *net = sock_net(sk);
+ int ifindex;
+ int count = 0;
+ int err;
+
+ ASSERT_RTNL();
+
+ if (!ipv4_is_multicast(addr))
+ return -EINVAL;
+
+ in_dev = ip_mc_find_dev(net, imr);
+
+ if (!in_dev) {
+ err = -ENODEV;
+ goto done;
+ }
+
+ err = -EADDRINUSE;
+ ifindex = imr->imr_ifindex;
+ for_each_pmc_rtnl(inet, i) {
+ if (i->multi.imr_multiaddr.s_addr == addr &&
+ i->multi.imr_ifindex == ifindex)
+ goto done;
+ count++;
+ }
+ err = -ENOBUFS;
+ if (count >= sysctl_igmp_max_memberships)
+ goto done;
+ iml = sock_kmalloc(sk, sizeof(*iml), GFP_KERNEL);
+ if (!iml)
+ goto done;
+
+ memcpy(&iml->multi, imr, sizeof(*imr));
+ iml->next_rcu = inet->mc_list;
+ iml->sflist = NULL;
+ iml->sfmode = MCAST_EXCLUDE;
+ rcu_assign_pointer(inet->mc_list, iml);
+ ip_mc_inc_group(in_dev, addr);
+ err = 0;
+done:
+ return err;
+}
+EXPORT_SYMBOL(ip_mc_join_group);
+
+static int ip_mc_leave_src(struct sock *sk, struct ip_mc_socklist *iml,
+ struct in_device *in_dev)
+{
+ struct ip_sf_socklist *psf = rtnl_dereference(iml->sflist);
+ int err;
+
+ if (!psf) {
+ /* any-source empty exclude case */
+ return ip_mc_del_src(in_dev, &iml->multi.imr_multiaddr.s_addr,
+ iml->sfmode, 0, NULL, 0);
+ }
+ err = ip_mc_del_src(in_dev, &iml->multi.imr_multiaddr.s_addr,
+ iml->sfmode, psf->sl_count, psf->sl_addr, 0);
+ RCU_INIT_POINTER(iml->sflist, NULL);
+ /* decrease mem now to avoid the memleak warning */
+ atomic_sub(IP_SFLSIZE(psf->sl_max), &sk->sk_omem_alloc);
+ kfree_rcu(psf, rcu);
+ return err;
+}
+
+int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ struct ip_mc_socklist *iml;
+ struct ip_mc_socklist __rcu **imlp;
+ struct in_device *in_dev;
+ struct net *net = sock_net(sk);
+ __be32 group = imr->imr_multiaddr.s_addr;
+ u32 ifindex;
+ int ret = -EADDRNOTAVAIL;
+
+ ASSERT_RTNL();
+
+ in_dev = ip_mc_find_dev(net, imr);
+ if (!in_dev) {
+ ret = -ENODEV;
+ goto out;
+ }
+ ifindex = imr->imr_ifindex;
+ for (imlp = &inet->mc_list;
+ (iml = rtnl_dereference(*imlp)) != NULL;
+ imlp = &iml->next_rcu) {
+ if (iml->multi.imr_multiaddr.s_addr != group)
+ continue;
+ if (ifindex) {
+ if (iml->multi.imr_ifindex != ifindex)
+ continue;
+ } else if (imr->imr_address.s_addr && imr->imr_address.s_addr !=
+ iml->multi.imr_address.s_addr)
+ continue;
+
+ (void) ip_mc_leave_src(sk, iml, in_dev);
+
+ *imlp = iml->next_rcu;
+
+ ip_mc_dec_group(in_dev, group);
+
+ /* decrease mem now to avoid the memleak warning */
+ atomic_sub(sizeof(*iml), &sk->sk_omem_alloc);
+ kfree_rcu(iml, rcu);
+ return 0;
+ }
+out:
+ return ret;
+}
+EXPORT_SYMBOL(ip_mc_leave_group);
+
+int ip_mc_source(int add, int omode, struct sock *sk, struct
+ ip_mreq_source *mreqs, int ifindex)
+{
+ int err;
+ struct ip_mreqn imr;
+ __be32 addr = mreqs->imr_multiaddr;
+ struct ip_mc_socklist *pmc;
+ struct in_device *in_dev = NULL;
+ struct inet_sock *inet = inet_sk(sk);
+ struct ip_sf_socklist *psl;
+ struct net *net = sock_net(sk);
+ int leavegroup = 0;
+ int i, j, rv;
+
+ if (!ipv4_is_multicast(addr))
+ return -EINVAL;
+
+ ASSERT_RTNL();
+
+ imr.imr_multiaddr.s_addr = mreqs->imr_multiaddr;
+ imr.imr_address.s_addr = mreqs->imr_interface;
+ imr.imr_ifindex = ifindex;
+ in_dev = ip_mc_find_dev(net, &imr);
+
+ if (!in_dev) {
+ err = -ENODEV;
+ goto done;
+ }
+ err = -EADDRNOTAVAIL;
+
+ for_each_pmc_rtnl(inet, pmc) {
+ if ((pmc->multi.imr_multiaddr.s_addr ==
+ imr.imr_multiaddr.s_addr) &&
+ (pmc->multi.imr_ifindex == imr.imr_ifindex))
+ break;
+ }
+ if (!pmc) { /* must have a prior join */
+ err = -EINVAL;
+ goto done;
+ }
+ /* if a source filter was set, must be the same mode as before */
+ if (pmc->sflist) {
+ if (pmc->sfmode != omode) {
+ err = -EINVAL;
+ goto done;
+ }
+ } else if (pmc->sfmode != omode) {
+ /* allow mode switches for empty-set filters */
+ ip_mc_add_src(in_dev, &mreqs->imr_multiaddr, omode, 0, NULL, 0);
+ ip_mc_del_src(in_dev, &mreqs->imr_multiaddr, pmc->sfmode, 0,
+ NULL, 0);
+ pmc->sfmode = omode;
+ }
+
+ psl = rtnl_dereference(pmc->sflist);
+ if (!add) {
+ if (!psl)
+ goto done; /* err = -EADDRNOTAVAIL */
+ rv = !0;
+ for (i = 0; i < psl->sl_count; i++) {
+ rv = memcmp(&psl->sl_addr[i], &mreqs->imr_sourceaddr,
+ sizeof(__be32));
+ if (rv == 0)
+ break;
+ }
+ if (rv) /* source not found */
+ goto done; /* err = -EADDRNOTAVAIL */
+
+ /* special case - (INCLUDE, empty) == LEAVE_GROUP */
+ if (psl->sl_count == 1 && omode == MCAST_INCLUDE) {
+ leavegroup = 1;
+ goto done;
+ }
+
+ /* update the interface filter */
+ ip_mc_del_src(in_dev, &mreqs->imr_multiaddr, omode, 1,
+ &mreqs->imr_sourceaddr, 1);
+
+ for (j = i+1; j < psl->sl_count; j++)
+ psl->sl_addr[j-1] = psl->sl_addr[j];
+ psl->sl_count--;
+ err = 0;
+ goto done;
+ }
+ /* else, add a new source to the filter */
+
+ if (psl && psl->sl_count >= sysctl_igmp_max_msf) {
+ err = -ENOBUFS;
+ goto done;
+ }
+ if (!psl || psl->sl_count == psl->sl_max) {
+ struct ip_sf_socklist *newpsl;
+ int count = IP_SFBLOCK;
+
+ if (psl)
+ count += psl->sl_max;
+ newpsl = sock_kmalloc(sk, IP_SFLSIZE(count), GFP_KERNEL);
+ if (!newpsl) {
+ err = -ENOBUFS;
+ goto done;
+ }
+ newpsl->sl_max = count;
+ newpsl->sl_count = count - IP_SFBLOCK;
+ if (psl) {
+ for (i = 0; i < psl->sl_count; i++)
+ newpsl->sl_addr[i] = psl->sl_addr[i];
+ /* decrease mem now to avoid the memleak warning */
+ atomic_sub(IP_SFLSIZE(psl->sl_max), &sk->sk_omem_alloc);
+ kfree_rcu(psl, rcu);
+ }
+ rcu_assign_pointer(pmc->sflist, newpsl);
+ psl = newpsl;
+ }
+ rv = 1; /* > 0 for insert logic below if sl_count is 0 */
+ for (i = 0; i < psl->sl_count; i++) {
+ rv = memcmp(&psl->sl_addr[i], &mreqs->imr_sourceaddr,
+ sizeof(__be32));
+ if (rv == 0)
+ break;
+ }
+ if (rv == 0) /* address already there is an error */
+ goto done;
+ for (j = psl->sl_count-1; j >= i; j--)
+ psl->sl_addr[j+1] = psl->sl_addr[j];
+ psl->sl_addr[i] = mreqs->imr_sourceaddr;
+ psl->sl_count++;
+ err = 0;
+ /* update the interface list */
+ ip_mc_add_src(in_dev, &mreqs->imr_multiaddr, omode, 1,
+ &mreqs->imr_sourceaddr, 1);
+done:
+ if (leavegroup)
+ err = ip_mc_leave_group(sk, &imr);
+ return err;
+}
+
+int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex)
+{
+ int err = 0;
+ struct ip_mreqn imr;
+ __be32 addr = msf->imsf_multiaddr;
+ struct ip_mc_socklist *pmc;
+ struct in_device *in_dev;
+ struct inet_sock *inet = inet_sk(sk);
+ struct ip_sf_socklist *newpsl, *psl;
+ struct net *net = sock_net(sk);
+ int leavegroup = 0;
+
+ if (!ipv4_is_multicast(addr))
+ return -EINVAL;
+ if (msf->imsf_fmode != MCAST_INCLUDE &&
+ msf->imsf_fmode != MCAST_EXCLUDE)
+ return -EINVAL;
+
+ ASSERT_RTNL();
+
+ imr.imr_multiaddr.s_addr = msf->imsf_multiaddr;
+ imr.imr_address.s_addr = msf->imsf_interface;
+ imr.imr_ifindex = ifindex;
+ in_dev = ip_mc_find_dev(net, &imr);
+
+ if (!in_dev) {
+ err = -ENODEV;
+ goto done;
+ }
+
+ /* special case - (INCLUDE, empty) == LEAVE_GROUP */
+ if (msf->imsf_fmode == MCAST_INCLUDE && msf->imsf_numsrc == 0) {
+ leavegroup = 1;
+ goto done;
+ }
+
+ for_each_pmc_rtnl(inet, pmc) {
+ if (pmc->multi.imr_multiaddr.s_addr == msf->imsf_multiaddr &&
+ pmc->multi.imr_ifindex == imr.imr_ifindex)
+ break;
+ }
+ if (!pmc) { /* must have a prior join */
+ err = -EINVAL;
+ goto done;
+ }
+ if (msf->imsf_numsrc) {
+ newpsl = sock_kmalloc(sk, IP_SFLSIZE(msf->imsf_numsrc),
+ GFP_KERNEL);
+ if (!newpsl) {
+ err = -ENOBUFS;
+ goto done;
+ }
+ newpsl->sl_max = newpsl->sl_count = msf->imsf_numsrc;
+ memcpy(newpsl->sl_addr, msf->imsf_slist,
+ msf->imsf_numsrc * sizeof(msf->imsf_slist[0]));
+ err = ip_mc_add_src(in_dev, &msf->imsf_multiaddr,
+ msf->imsf_fmode, newpsl->sl_count, newpsl->sl_addr, 0);
+ if (err) {
+ sock_kfree_s(sk, newpsl, IP_SFLSIZE(newpsl->sl_max));
+ goto done;
+ }
+ } else {
+ newpsl = NULL;
+ (void) ip_mc_add_src(in_dev, &msf->imsf_multiaddr,
+ msf->imsf_fmode, 0, NULL, 0);
+ }
+ psl = rtnl_dereference(pmc->sflist);
+ if (psl) {
+ (void) ip_mc_del_src(in_dev, &msf->imsf_multiaddr, pmc->sfmode,
+ psl->sl_count, psl->sl_addr, 0);
+ /* decrease mem now to avoid the memleak warning */
+ atomic_sub(IP_SFLSIZE(psl->sl_max), &sk->sk_omem_alloc);
+ kfree_rcu(psl, rcu);
+ } else
+ (void) ip_mc_del_src(in_dev, &msf->imsf_multiaddr, pmc->sfmode,
+ 0, NULL, 0);
+ rcu_assign_pointer(pmc->sflist, newpsl);
+ pmc->sfmode = msf->imsf_fmode;
+ err = 0;
+done:
+ if (leavegroup)
+ err = ip_mc_leave_group(sk, &imr);
+ return err;
+}
+
+int ip_mc_msfget(struct sock *sk, struct ip_msfilter *msf,
+ struct ip_msfilter __user *optval, int __user *optlen)
+{
+ int err, len, count, copycount;
+ struct ip_mreqn imr;
+ __be32 addr = msf->imsf_multiaddr;
+ struct ip_mc_socklist *pmc;
+ struct in_device *in_dev;
+ struct inet_sock *inet = inet_sk(sk);
+ struct ip_sf_socklist *psl;
+ struct net *net = sock_net(sk);
+
+ if (!ipv4_is_multicast(addr))
+ return -EINVAL;
+
+ rtnl_lock();
+
+ imr.imr_multiaddr.s_addr = msf->imsf_multiaddr;
+ imr.imr_address.s_addr = msf->imsf_interface;
+ imr.imr_ifindex = 0;
+ in_dev = ip_mc_find_dev(net, &imr);
+
+ if (!in_dev) {
+ err = -ENODEV;
+ goto done;
+ }
+ err = -EADDRNOTAVAIL;
+
+ for_each_pmc_rtnl(inet, pmc) {
+ if (pmc->multi.imr_multiaddr.s_addr == msf->imsf_multiaddr &&
+ pmc->multi.imr_ifindex == imr.imr_ifindex)
+ break;
+ }
+ if (!pmc) /* must have a prior join */
+ goto done;
+ msf->imsf_fmode = pmc->sfmode;
+ psl = rtnl_dereference(pmc->sflist);
+ rtnl_unlock();
+ if (!psl) {
+ len = 0;
+ count = 0;
+ } else {
+ count = psl->sl_count;
+ }
+ copycount = count < msf->imsf_numsrc ? count : msf->imsf_numsrc;
+ len = copycount * sizeof(psl->sl_addr[0]);
+ msf->imsf_numsrc = count;
+ if (put_user(IP_MSFILTER_SIZE(copycount), optlen) ||
+ copy_to_user(optval, msf, IP_MSFILTER_SIZE(0))) {
+ return -EFAULT;
+ }
+ if (len &&
+ copy_to_user(&optval->imsf_slist[0], psl->sl_addr, len))
+ return -EFAULT;
+ return 0;
+done:
+ rtnl_unlock();
+ return err;
+}
+
+int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf,
+ struct group_filter __user *optval, int __user *optlen)
+{
+ int err, i, count, copycount;
+ struct sockaddr_in *psin;
+ __be32 addr;
+ struct ip_mc_socklist *pmc;
+ struct inet_sock *inet = inet_sk(sk);
+ struct ip_sf_socklist *psl;
+
+ psin = (struct sockaddr_in *)&gsf->gf_group;
+ if (psin->sin_family != AF_INET)
+ return -EINVAL;
+ addr = psin->sin_addr.s_addr;
+ if (!ipv4_is_multicast(addr))
+ return -EINVAL;
+
+ rtnl_lock();
+
+ err = -EADDRNOTAVAIL;
+
+ for_each_pmc_rtnl(inet, pmc) {
+ if (pmc->multi.imr_multiaddr.s_addr == addr &&
+ pmc->multi.imr_ifindex == gsf->gf_interface)
+ break;
+ }
+ if (!pmc) /* must have a prior join */
+ goto done;
+ gsf->gf_fmode = pmc->sfmode;
+ psl = rtnl_dereference(pmc->sflist);
+ rtnl_unlock();
+ count = psl ? psl->sl_count : 0;
+ copycount = count < gsf->gf_numsrc ? count : gsf->gf_numsrc;
+ gsf->gf_numsrc = count;
+ if (put_user(GROUP_FILTER_SIZE(copycount), optlen) ||
+ copy_to_user(optval, gsf, GROUP_FILTER_SIZE(0))) {
+ return -EFAULT;
+ }
+ for (i = 0; i < copycount; i++) {
+ struct sockaddr_storage ss;
+
+ psin = (struct sockaddr_in *)&ss;
+ memset(&ss, 0, sizeof(ss));
+ psin->sin_family = AF_INET;
+ psin->sin_addr.s_addr = psl->sl_addr[i];
+ if (copy_to_user(&optval->gf_slist[i], &ss, sizeof(ss)))
+ return -EFAULT;
+ }
+ return 0;
+done:
+ rtnl_unlock();
+ return err;
+}
+
+/*
+ * check if a multicast source filter allows delivery for a given <src,dst,intf>
+ */
+int ip_mc_sf_allow(struct sock *sk, __be32 loc_addr, __be32 rmt_addr, int dif)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ struct ip_mc_socklist *pmc;
+ struct ip_sf_socklist *psl;
+ int i;
+ int ret;
+
+ ret = 1;
+ if (!ipv4_is_multicast(loc_addr))
+ goto out;
+
+ rcu_read_lock();
+ for_each_pmc_rcu(inet, pmc) {
+ if (pmc->multi.imr_multiaddr.s_addr == loc_addr &&
+ pmc->multi.imr_ifindex == dif)
+ break;
+ }
+ ret = inet->mc_all;
+ if (!pmc)
+ goto unlock;
+ psl = rcu_dereference(pmc->sflist);
+ ret = (pmc->sfmode == MCAST_EXCLUDE);
+ if (!psl)
+ goto unlock;
+
+ for (i = 0; i < psl->sl_count; i++) {
+ if (psl->sl_addr[i] == rmt_addr)
+ break;
+ }
+ ret = 0;
+ if (pmc->sfmode == MCAST_INCLUDE && i >= psl->sl_count)
+ goto unlock;
+ if (pmc->sfmode == MCAST_EXCLUDE && i < psl->sl_count)
+ goto unlock;
+ ret = 1;
+unlock:
+ rcu_read_unlock();
+out:
+ return ret;
+}
+
+/*
+ * A socket is closing.
+ */
+
+void ip_mc_drop_socket(struct sock *sk)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ struct ip_mc_socklist *iml;
+ struct net *net = sock_net(sk);
+
+ if (!inet->mc_list)
+ return;
+
+ rtnl_lock();
+ while ((iml = rtnl_dereference(inet->mc_list)) != NULL) {
+ struct in_device *in_dev;
+
+ inet->mc_list = iml->next_rcu;
+ in_dev = inetdev_by_index(net, iml->multi.imr_ifindex);
+ (void) ip_mc_leave_src(sk, iml, in_dev);
+ if (in_dev)
+ ip_mc_dec_group(in_dev, iml->multi.imr_multiaddr.s_addr);
+ /* decrease mem now to avoid the memleak warning */
+ atomic_sub(sizeof(*iml), &sk->sk_omem_alloc);
+ kfree_rcu(iml, rcu);
+ }
+ rtnl_unlock();
+}
+
+/* called with rcu_read_lock() */
+int ip_check_mc_rcu(struct in_device *in_dev, __be32 mc_addr, __be32 src_addr, u16 proto)
+{
+ struct ip_mc_list *im;
+ struct ip_mc_list __rcu **mc_hash;
+ struct ip_sf_list *psf;
+ int rv = 0;
+
+ mc_hash = rcu_dereference(in_dev->mc_hash);
+ if (mc_hash) {
+ u32 hash = hash_32((__force u32)mc_addr, MC_HASH_SZ_LOG);
+
+ for (im = rcu_dereference(mc_hash[hash]);
+ im != NULL;
+ im = rcu_dereference(im->next_hash)) {
+ if (im->multiaddr == mc_addr)
+ break;
+ }
+ } else {
+ for_each_pmc_rcu(in_dev, im) {
+ if (im->multiaddr == mc_addr)
+ break;
+ }
+ }
+ if (im && proto == IPPROTO_IGMP) {
+ rv = 1;
+ } else if (im) {
+ if (src_addr) {
+ for (psf = im->sources; psf; psf = psf->sf_next) {
+ if (psf->sf_inaddr == src_addr)
+ break;
+ }
+ if (psf)
+ rv = psf->sf_count[MCAST_INCLUDE] ||
+ psf->sf_count[MCAST_EXCLUDE] !=
+ im->sfcount[MCAST_EXCLUDE];
+ else
+ rv = im->sfcount[MCAST_EXCLUDE] != 0;
+ } else
+ rv = 1; /* unspecified source; tentatively allow */
+ }
+ return rv;
+}
+
+#if defined(CONFIG_PROC_FS)
+struct igmp_mc_iter_state {
+ struct seq_net_private p;
+ struct net_device *dev;
+ struct in_device *in_dev;
+};
+
+#define igmp_mc_seq_private(seq) ((struct igmp_mc_iter_state *)(seq)->private)
+
+static inline struct ip_mc_list *igmp_mc_get_first(struct seq_file *seq)
+{
+ struct net *net = seq_file_net(seq);
+ struct ip_mc_list *im = NULL;
+ struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq);
+
+ state->in_dev = NULL;
+ for_each_netdev_rcu(net, state->dev) {
+ struct in_device *in_dev;
+
+ in_dev = __in_dev_get_rcu(state->dev);
+ if (!in_dev)
+ continue;
+ im = rcu_dereference(in_dev->mc_list);
+ if (im) {
+ state->in_dev = in_dev;
+ break;
+ }
+ }
+ return im;
+}
+
+static struct ip_mc_list *igmp_mc_get_next(struct seq_file *seq, struct ip_mc_list *im)
+{
+ struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq);
+
+ im = rcu_dereference(im->next_rcu);
+ while (!im) {
+ state->dev = next_net_device_rcu(state->dev);
+ if (!state->dev) {
+ state->in_dev = NULL;
+ break;
+ }
+ state->in_dev = __in_dev_get_rcu(state->dev);
+ if (!state->in_dev)
+ continue;
+ im = rcu_dereference(state->in_dev->mc_list);
+ }
+ return im;
+}
+
+static struct ip_mc_list *igmp_mc_get_idx(struct seq_file *seq, loff_t pos)
+{
+ struct ip_mc_list *im = igmp_mc_get_first(seq);
+ if (im)
+ while (pos && (im = igmp_mc_get_next(seq, im)) != NULL)
+ --pos;
+ return pos ? NULL : im;
+}
+
+static void *igmp_mc_seq_start(struct seq_file *seq, loff_t *pos)
+ __acquires(rcu)
+{
+ rcu_read_lock();
+ return *pos ? igmp_mc_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
+}
+
+static void *igmp_mc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct ip_mc_list *im;
+ if (v == SEQ_START_TOKEN)
+ im = igmp_mc_get_first(seq);
+ else
+ im = igmp_mc_get_next(seq, v);
+ ++*pos;
+ return im;
+}
+
+static void igmp_mc_seq_stop(struct seq_file *seq, void *v)
+ __releases(rcu)
+{
+ struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq);
+
+ state->in_dev = NULL;
+ state->dev = NULL;
+ rcu_read_unlock();
+}
+
+static int igmp_mc_seq_show(struct seq_file *seq, void *v)
+{
+ if (v == SEQ_START_TOKEN)
+ seq_puts(seq,
+ "Idx\tDevice : Count Querier\tGroup Users Timer\tReporter\n");
+ else {
+ struct ip_mc_list *im = (struct ip_mc_list *)v;
+ struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq);
+ char *querier;
+ long delta;
+
+#ifdef CONFIG_IP_MULTICAST
+ querier = IGMP_V1_SEEN(state->in_dev) ? "V1" :
+ IGMP_V2_SEEN(state->in_dev) ? "V2" :
+ "V3";
+#else
+ querier = "NONE";
+#endif
+
+ if (rcu_access_pointer(state->in_dev->mc_list) == im) {
+ seq_printf(seq, "%d\t%-10s: %5d %7s\n",
+ state->dev->ifindex, state->dev->name, state->in_dev->mc_count, querier);
+ }
+
+ delta = im->timer.expires - jiffies;
+ seq_printf(seq,
+ "\t\t\t\t%08X %5d %d:%08lX\t\t%d\n",
+ im->multiaddr, im->users,
+ im->tm_running,
+ im->tm_running ? jiffies_delta_to_clock_t(delta) : 0,
+ im->reporter);
+ }
+ return 0;
+}
+
+static const struct seq_operations igmp_mc_seq_ops = {
+ .start = igmp_mc_seq_start,
+ .next = igmp_mc_seq_next,
+ .stop = igmp_mc_seq_stop,
+ .show = igmp_mc_seq_show,
+};
+
+static int igmp_mc_seq_open(struct inode *inode, struct file *file)
+{
+ return seq_open_net(inode, file, &igmp_mc_seq_ops,
+ sizeof(struct igmp_mc_iter_state));
+}
+
+static const struct file_operations igmp_mc_seq_fops = {
+ .owner = THIS_MODULE,
+ .open = igmp_mc_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_net,
+};
+
+struct igmp_mcf_iter_state {
+ struct seq_net_private p;
+ struct net_device *dev;
+ struct in_device *idev;
+ struct ip_mc_list *im;
+};
+
+#define igmp_mcf_seq_private(seq) ((struct igmp_mcf_iter_state *)(seq)->private)
+
+static inline struct ip_sf_list *igmp_mcf_get_first(struct seq_file *seq)
+{
+ struct net *net = seq_file_net(seq);
+ struct ip_sf_list *psf = NULL;
+ struct ip_mc_list *im = NULL;
+ struct igmp_mcf_iter_state *state = igmp_mcf_seq_private(seq);
+
+ state->idev = NULL;
+ state->im = NULL;
+ for_each_netdev_rcu(net, state->dev) {
+ struct in_device *idev;
+ idev = __in_dev_get_rcu(state->dev);
+ if (unlikely(!idev))
+ continue;
+ im = rcu_dereference(idev->mc_list);
+ if (likely(im)) {
+ spin_lock_bh(&im->lock);
+ psf = im->sources;
+ if (likely(psf)) {
+ state->im = im;
+ state->idev = idev;
+ break;
+ }
+ spin_unlock_bh(&im->lock);
+ }
+ }
+ return psf;
+}
+
+static struct ip_sf_list *igmp_mcf_get_next(struct seq_file *seq, struct ip_sf_list *psf)
+{
+ struct igmp_mcf_iter_state *state = igmp_mcf_seq_private(seq);
+
+ psf = psf->sf_next;
+ while (!psf) {
+ spin_unlock_bh(&state->im->lock);
+ state->im = state->im->next;
+ while (!state->im) {
+ state->dev = next_net_device_rcu(state->dev);
+ if (!state->dev) {
+ state->idev = NULL;
+ goto out;
+ }
+ state->idev = __in_dev_get_rcu(state->dev);
+ if (!state->idev)
+ continue;
+ state->im = rcu_dereference(state->idev->mc_list);
+ }
+ if (!state->im)
+ break;
+ spin_lock_bh(&state->im->lock);
+ psf = state->im->sources;
+ }
+out:
+ return psf;
+}
+
+static struct ip_sf_list *igmp_mcf_get_idx(struct seq_file *seq, loff_t pos)
+{
+ struct ip_sf_list *psf = igmp_mcf_get_first(seq);
+ if (psf)
+ while (pos && (psf = igmp_mcf_get_next(seq, psf)) != NULL)
+ --pos;
+ return pos ? NULL : psf;
+}
+
+static void *igmp_mcf_seq_start(struct seq_file *seq, loff_t *pos)
+ __acquires(rcu)
+{
+ rcu_read_lock();
+ return *pos ? igmp_mcf_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
+}
+
+static void *igmp_mcf_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct ip_sf_list *psf;
+ if (v == SEQ_START_TOKEN)
+ psf = igmp_mcf_get_first(seq);
+ else
+ psf = igmp_mcf_get_next(seq, v);
+ ++*pos;
+ return psf;
+}
+
+static void igmp_mcf_seq_stop(struct seq_file *seq, void *v)
+ __releases(rcu)
+{
+ struct igmp_mcf_iter_state *state = igmp_mcf_seq_private(seq);
+ if (likely(state->im)) {
+ spin_unlock_bh(&state->im->lock);
+ state->im = NULL;
+ }
+ state->idev = NULL;
+ state->dev = NULL;
+ rcu_read_unlock();
+}
+
+static int igmp_mcf_seq_show(struct seq_file *seq, void *v)
+{
+ struct ip_sf_list *psf = (struct ip_sf_list *)v;
+ struct igmp_mcf_iter_state *state = igmp_mcf_seq_private(seq);
+
+ if (v == SEQ_START_TOKEN) {
+ seq_puts(seq, "Idx Device MCA SRC INC EXC\n");
+ } else {
+ seq_printf(seq,
+ "%3d %6.6s 0x%08x "
+ "0x%08x %6lu %6lu\n",
+ state->dev->ifindex, state->dev->name,
+ ntohl(state->im->multiaddr),
+ ntohl(psf->sf_inaddr),
+ psf->sf_count[MCAST_INCLUDE],
+ psf->sf_count[MCAST_EXCLUDE]);
+ }
+ return 0;
+}
+
+static const struct seq_operations igmp_mcf_seq_ops = {
+ .start = igmp_mcf_seq_start,
+ .next = igmp_mcf_seq_next,
+ .stop = igmp_mcf_seq_stop,
+ .show = igmp_mcf_seq_show,
+};
+
+static int igmp_mcf_seq_open(struct inode *inode, struct file *file)
+{
+ return seq_open_net(inode, file, &igmp_mcf_seq_ops,
+ sizeof(struct igmp_mcf_iter_state));
+}
+
+static const struct file_operations igmp_mcf_seq_fops = {
+ .owner = THIS_MODULE,
+ .open = igmp_mcf_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_net,
+};
+
+static int __net_init igmp_net_init(struct net *net)
+{
+ struct proc_dir_entry *pde;
+ int err;
+
+ pde = proc_create("igmp", S_IRUGO, net->proc_net, &igmp_mc_seq_fops);
+ if (!pde)
+ goto out_igmp;
+ pde = proc_create("mcfilter", S_IRUGO, net->proc_net,
+ &igmp_mcf_seq_fops);
+ if (!pde)
+ goto out_mcfilter;
+ err = inet_ctl_sock_create(&net->ipv4.mc_autojoin_sk, AF_INET,
+ SOCK_DGRAM, 0, net);
+ if (err < 0) {
+ pr_err("Failed to initialize the IGMP autojoin socket (err %d)\n",
+ err);
+ goto out_sock;
+ }
+
+ return 0;
+
+out_sock:
+ remove_proc_entry("mcfilter", net->proc_net);
+out_mcfilter:
+ remove_proc_entry("igmp", net->proc_net);
+out_igmp:
+ return -ENOMEM;
+}
+
+static void __net_exit igmp_net_exit(struct net *net)
+{
+ remove_proc_entry("mcfilter", net->proc_net);
+ remove_proc_entry("igmp", net->proc_net);
+ inet_ctl_sock_destroy(net->ipv4.mc_autojoin_sk);
+}
+
+static struct pernet_operations igmp_net_ops = {
+ .init = igmp_net_init,
+ .exit = igmp_net_exit,
+};
+#endif
+
+static int igmp_netdev_event(struct notifier_block *this,
+ unsigned long event, void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct in_device *in_dev;
+
+ switch (event) {
+ case NETDEV_RESEND_IGMP:
+ in_dev = __in_dev_get_rtnl(dev);
+ if (in_dev)
+ ip_mc_rejoin_groups(in_dev);
+ break;
+ default:
+ break;
+ }
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block igmp_notifier = {
+ .notifier_call = igmp_netdev_event,
+};
+
+int __init igmp_mc_init(void)
+{
+#if defined(CONFIG_PROC_FS)
+ int err;
+
+ err = register_pernet_subsys(&igmp_net_ops);
+ if (err)
+ return err;
+ err = register_netdevice_notifier(&igmp_notifier);
+ if (err)
+ goto reg_notif_fail;
+ return 0;
+
+reg_notif_fail:
+ unregister_pernet_subsys(&igmp_net_ops);
+ return err;
+#else
+ return register_netdevice_notifier(&igmp_notifier);
+#endif
+}
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
new file mode 100644
index 000000000..8976ca423
--- /dev/null
+++ b/net/ipv4/inet_connection_sock.c
@@ -0,0 +1,978 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * Support for INET connection oriented protocols.
+ *
+ * Authors: See the TCP sources
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or(at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/jhash.h>
+
+#include <net/inet_connection_sock.h>
+#include <net/inet_hashtables.h>
+#include <net/inet_timewait_sock.h>
+#include <net/ip.h>
+#include <net/route.h>
+#include <net/tcp_states.h>
+#include <net/xfrm.h>
+#include <net/tcp.h>
+
+#ifdef INET_CSK_DEBUG
+const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n";
+EXPORT_SYMBOL(inet_csk_timer_bug_msg);
+#endif
+
+void inet_get_local_port_range(struct net *net, int *low, int *high)
+{
+ unsigned int seq;
+
+ do {
+ seq = read_seqbegin(&net->ipv4.ip_local_ports.lock);
+
+ *low = net->ipv4.ip_local_ports.range[0];
+ *high = net->ipv4.ip_local_ports.range[1];
+ } while (read_seqretry(&net->ipv4.ip_local_ports.lock, seq));
+}
+EXPORT_SYMBOL(inet_get_local_port_range);
+
+int inet_csk_bind_conflict(const struct sock *sk,
+ const struct inet_bind_bucket *tb, bool relax)
+{
+ struct sock *sk2;
+ int reuse = sk->sk_reuse;
+ int reuseport = sk->sk_reuseport;
+ kuid_t uid = sock_i_uid((struct sock *)sk);
+
+ /*
+ * Unlike other sk lookup places we do not check
+ * for sk_net here, since _all_ the socks listed
+ * in tb->owners list belong to the same net - the
+ * one this bucket belongs to.
+ */
+
+ sk_for_each_bound(sk2, &tb->owners) {
+ if (sk != sk2 &&
+ !inet_v6_ipv6only(sk2) &&
+ (!sk->sk_bound_dev_if ||
+ !sk2->sk_bound_dev_if ||
+ sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
+ if ((!reuse || !sk2->sk_reuse ||
+ sk2->sk_state == TCP_LISTEN) &&
+ (!reuseport || !sk2->sk_reuseport ||
+ (sk2->sk_state != TCP_TIME_WAIT &&
+ !uid_eq(uid, sock_i_uid(sk2))))) {
+
+ if (!sk2->sk_rcv_saddr || !sk->sk_rcv_saddr ||
+ sk2->sk_rcv_saddr == sk->sk_rcv_saddr)
+ break;
+ }
+ if (!relax && reuse && sk2->sk_reuse &&
+ sk2->sk_state != TCP_LISTEN) {
+
+ if (!sk2->sk_rcv_saddr || !sk->sk_rcv_saddr ||
+ sk2->sk_rcv_saddr == sk->sk_rcv_saddr)
+ break;
+ }
+ }
+ }
+ return sk2 != NULL;
+}
+EXPORT_SYMBOL_GPL(inet_csk_bind_conflict);
+
+/* Obtain a reference to a local port for the given sock,
+ * if snum is zero it means select any available local port.
+ */
+int inet_csk_get_port(struct sock *sk, unsigned short snum)
+{
+ struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
+ struct inet_bind_hashbucket *head;
+ struct inet_bind_bucket *tb;
+ int ret, attempts = 5;
+ struct net *net = sock_net(sk);
+ int smallest_size = -1, smallest_rover;
+ kuid_t uid = sock_i_uid(sk);
+
+ local_bh_disable();
+ if (!snum) {
+ int remaining, rover, low, high;
+
+again:
+ inet_get_local_port_range(net, &low, &high);
+ remaining = (high - low) + 1;
+ smallest_rover = rover = prandom_u32() % remaining + low;
+
+ smallest_size = -1;
+ do {
+ if (inet_is_local_reserved_port(net, rover))
+ goto next_nolock;
+ head = &hashinfo->bhash[inet_bhashfn(net, rover,
+ hashinfo->bhash_size)];
+ spin_lock(&head->lock);
+ inet_bind_bucket_for_each(tb, &head->chain)
+ if (net_eq(ib_net(tb), net) && tb->port == rover) {
+ if (((tb->fastreuse > 0 &&
+ sk->sk_reuse &&
+ sk->sk_state != TCP_LISTEN) ||
+ (tb->fastreuseport > 0 &&
+ sk->sk_reuseport &&
+ uid_eq(tb->fastuid, uid))) &&
+ (tb->num_owners < smallest_size || smallest_size == -1)) {
+ smallest_size = tb->num_owners;
+ smallest_rover = rover;
+ if (atomic_read(&hashinfo->bsockets) > (high - low) + 1 &&
+ !inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false)) {
+ snum = smallest_rover;
+ goto tb_found;
+ }
+ }
+ if (!inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false)) {
+ snum = rover;
+ goto tb_found;
+ }
+ goto next;
+ }
+ break;
+ next:
+ spin_unlock(&head->lock);
+ next_nolock:
+ if (++rover > high)
+ rover = low;
+ } while (--remaining > 0);
+
+ /* Exhausted local port range during search? It is not
+ * possible for us to be holding one of the bind hash
+ * locks if this test triggers, because if 'remaining'
+ * drops to zero, we broke out of the do/while loop at
+ * the top level, not from the 'break;' statement.
+ */
+ ret = 1;
+ if (remaining <= 0) {
+ if (smallest_size != -1) {
+ snum = smallest_rover;
+ goto have_snum;
+ }
+ goto fail;
+ }
+ /* OK, here is the one we will use. HEAD is
+ * non-NULL and we hold it's mutex.
+ */
+ snum = rover;
+ } else {
+have_snum:
+ head = &hashinfo->bhash[inet_bhashfn(net, snum,
+ hashinfo->bhash_size)];
+ spin_lock(&head->lock);
+ inet_bind_bucket_for_each(tb, &head->chain)
+ if (net_eq(ib_net(tb), net) && tb->port == snum)
+ goto tb_found;
+ }
+ tb = NULL;
+ goto tb_not_found;
+tb_found:
+ if (!hlist_empty(&tb->owners)) {
+ if (sk->sk_reuse == SK_FORCE_REUSE)
+ goto success;
+
+ if (((tb->fastreuse > 0 &&
+ sk->sk_reuse && sk->sk_state != TCP_LISTEN) ||
+ (tb->fastreuseport > 0 &&
+ sk->sk_reuseport && uid_eq(tb->fastuid, uid))) &&
+ smallest_size == -1) {
+ goto success;
+ } else {
+ ret = 1;
+ if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, true)) {
+ if (((sk->sk_reuse && sk->sk_state != TCP_LISTEN) ||
+ (tb->fastreuseport > 0 &&
+ sk->sk_reuseport && uid_eq(tb->fastuid, uid))) &&
+ smallest_size != -1 && --attempts >= 0) {
+ spin_unlock(&head->lock);
+ goto again;
+ }
+
+ goto fail_unlock;
+ }
+ }
+ }
+tb_not_found:
+ ret = 1;
+ if (!tb && (tb = inet_bind_bucket_create(hashinfo->bind_bucket_cachep,
+ net, head, snum)) == NULL)
+ goto fail_unlock;
+ if (hlist_empty(&tb->owners)) {
+ if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
+ tb->fastreuse = 1;
+ else
+ tb->fastreuse = 0;
+ if (sk->sk_reuseport) {
+ tb->fastreuseport = 1;
+ tb->fastuid = uid;
+ } else
+ tb->fastreuseport = 0;
+ } else {
+ if (tb->fastreuse &&
+ (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
+ tb->fastreuse = 0;
+ if (tb->fastreuseport &&
+ (!sk->sk_reuseport || !uid_eq(tb->fastuid, uid)))
+ tb->fastreuseport = 0;
+ }
+success:
+ if (!inet_csk(sk)->icsk_bind_hash)
+ inet_bind_hash(sk, tb, snum);
+ WARN_ON(inet_csk(sk)->icsk_bind_hash != tb);
+ ret = 0;
+
+fail_unlock:
+ spin_unlock(&head->lock);
+fail:
+ local_bh_enable();
+ return ret;
+}
+EXPORT_SYMBOL_GPL(inet_csk_get_port);
+
+/*
+ * Wait for an incoming connection, avoid race conditions. This must be called
+ * with the socket locked.
+ */
+static int inet_csk_wait_for_connect(struct sock *sk, long timeo)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ DEFINE_WAIT(wait);
+ int err;
+
+ /*
+ * True wake-one mechanism for incoming connections: only
+ * one process gets woken up, not the 'whole herd'.
+ * Since we do not 'race & poll' for established sockets
+ * anymore, the common case will execute the loop only once.
+ *
+ * Subtle issue: "add_wait_queue_exclusive()" will be added
+ * after any current non-exclusive waiters, and we know that
+ * it will always _stay_ after any new non-exclusive waiters
+ * because all non-exclusive waiters are added at the
+ * beginning of the wait-queue. As such, it's ok to "drop"
+ * our exclusiveness temporarily when we get woken up without
+ * having to remove and re-insert us on the wait queue.
+ */
+ for (;;) {
+ prepare_to_wait_exclusive(sk_sleep(sk), &wait,
+ TASK_INTERRUPTIBLE);
+ release_sock(sk);
+ if (reqsk_queue_empty(&icsk->icsk_accept_queue))
+ timeo = schedule_timeout(timeo);
+ sched_annotate_sleep();
+ lock_sock(sk);
+ err = 0;
+ if (!reqsk_queue_empty(&icsk->icsk_accept_queue))
+ break;
+ err = -EINVAL;
+ if (sk->sk_state != TCP_LISTEN)
+ break;
+ err = sock_intr_errno(timeo);
+ if (signal_pending(current))
+ break;
+ err = -EAGAIN;
+ if (!timeo)
+ break;
+ }
+ finish_wait(sk_sleep(sk), &wait);
+ return err;
+}
+
+/*
+ * This will accept the next outstanding connection.
+ */
+struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct request_sock_queue *queue = &icsk->icsk_accept_queue;
+ struct request_sock *req;
+ struct sock *newsk;
+ int error;
+
+ lock_sock(sk);
+
+ /* We need to make sure that this socket is listening,
+ * and that it has something pending.
+ */
+ error = -EINVAL;
+ if (sk->sk_state != TCP_LISTEN)
+ goto out_err;
+
+ /* Find already established connection */
+ if (reqsk_queue_empty(queue)) {
+ long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
+
+ /* If this is a non blocking socket don't sleep */
+ error = -EAGAIN;
+ if (!timeo)
+ goto out_err;
+
+ error = inet_csk_wait_for_connect(sk, timeo);
+ if (error)
+ goto out_err;
+ }
+ req = reqsk_queue_remove(queue);
+ newsk = req->sk;
+
+ sk_acceptq_removed(sk);
+ if (sk->sk_protocol == IPPROTO_TCP &&
+ tcp_rsk(req)->tfo_listener &&
+ queue->fastopenq) {
+ spin_lock_bh(&queue->fastopenq->lock);
+ if (tcp_rsk(req)->tfo_listener) {
+ /* We are still waiting for the final ACK from 3WHS
+ * so can't free req now. Instead, we set req->sk to
+ * NULL to signify that the child socket is taken
+ * so reqsk_fastopen_remove() will free the req
+ * when 3WHS finishes (or is aborted).
+ */
+ req->sk = NULL;
+ req = NULL;
+ }
+ spin_unlock_bh(&queue->fastopenq->lock);
+ }
+out:
+ release_sock(sk);
+ if (req)
+ reqsk_put(req);
+ return newsk;
+out_err:
+ newsk = NULL;
+ req = NULL;
+ *err = error;
+ goto out;
+}
+EXPORT_SYMBOL(inet_csk_accept);
+
+/*
+ * Using different timers for retransmit, delayed acks and probes
+ * We may wish use just one timer maintaining a list of expire jiffies
+ * to optimize.
+ */
+void inet_csk_init_xmit_timers(struct sock *sk,
+ void (*retransmit_handler)(unsigned long),
+ void (*delack_handler)(unsigned long),
+ void (*keepalive_handler)(unsigned long))
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
+ setup_timer(&icsk->icsk_retransmit_timer, retransmit_handler,
+ (unsigned long)sk);
+ setup_timer(&icsk->icsk_delack_timer, delack_handler,
+ (unsigned long)sk);
+ setup_timer(&sk->sk_timer, keepalive_handler, (unsigned long)sk);
+ icsk->icsk_pending = icsk->icsk_ack.pending = 0;
+}
+EXPORT_SYMBOL(inet_csk_init_xmit_timers);
+
+void inet_csk_clear_xmit_timers(struct sock *sk)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
+ icsk->icsk_pending = icsk->icsk_ack.pending = icsk->icsk_ack.blocked = 0;
+
+ sk_stop_timer(sk, &icsk->icsk_retransmit_timer);
+ sk_stop_timer(sk, &icsk->icsk_delack_timer);
+ sk_stop_timer(sk, &sk->sk_timer);
+}
+EXPORT_SYMBOL(inet_csk_clear_xmit_timers);
+
+void inet_csk_delete_keepalive_timer(struct sock *sk)
+{
+ sk_stop_timer(sk, &sk->sk_timer);
+}
+EXPORT_SYMBOL(inet_csk_delete_keepalive_timer);
+
+void inet_csk_reset_keepalive_timer(struct sock *sk, unsigned long len)
+{
+ sk_reset_timer(sk, &sk->sk_timer, jiffies + len);
+}
+EXPORT_SYMBOL(inet_csk_reset_keepalive_timer);
+
+struct dst_entry *inet_csk_route_req(struct sock *sk,
+ struct flowi4 *fl4,
+ const struct request_sock *req)
+{
+ const struct inet_request_sock *ireq = inet_rsk(req);
+ struct net *net = read_pnet(&ireq->ireq_net);
+ struct ip_options_rcu *opt = ireq->opt;
+ struct rtable *rt;
+
+ flowi4_init_output(fl4, ireq->ir_iif, ireq->ir_mark,
+ RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
+ sk->sk_protocol, inet_sk_flowi_flags(sk),
+ (opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr,
+ ireq->ir_loc_addr, ireq->ir_rmt_port,
+ htons(ireq->ir_num));
+ security_req_classify_flow(req, flowi4_to_flowi(fl4));
+ rt = ip_route_output_flow(net, fl4, sk);
+ if (IS_ERR(rt))
+ goto no_route;
+ if (opt && opt->opt.is_strictroute && rt->rt_uses_gateway)
+ goto route_err;
+ return &rt->dst;
+
+route_err:
+ ip_rt_put(rt);
+no_route:
+ IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES);
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(inet_csk_route_req);
+
+struct dst_entry *inet_csk_route_child_sock(struct sock *sk,
+ struct sock *newsk,
+ const struct request_sock *req)
+{
+ const struct inet_request_sock *ireq = inet_rsk(req);
+ struct net *net = read_pnet(&ireq->ireq_net);
+ struct inet_sock *newinet = inet_sk(newsk);
+ struct ip_options_rcu *opt;
+ struct flowi4 *fl4;
+ struct rtable *rt;
+
+ fl4 = &newinet->cork.fl.u.ip4;
+
+ rcu_read_lock();
+ opt = rcu_dereference(newinet->inet_opt);
+ flowi4_init_output(fl4, ireq->ir_iif, ireq->ir_mark,
+ RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
+ sk->sk_protocol, inet_sk_flowi_flags(sk),
+ (opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr,
+ ireq->ir_loc_addr, ireq->ir_rmt_port,
+ htons(ireq->ir_num));
+ security_req_classify_flow(req, flowi4_to_flowi(fl4));
+ rt = ip_route_output_flow(net, fl4, sk);
+ if (IS_ERR(rt))
+ goto no_route;
+ if (opt && opt->opt.is_strictroute && rt->rt_uses_gateway)
+ goto route_err;
+ rcu_read_unlock();
+ return &rt->dst;
+
+route_err:
+ ip_rt_put(rt);
+no_route:
+ rcu_read_unlock();
+ IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES);
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(inet_csk_route_child_sock);
+
+static inline u32 inet_synq_hash(const __be32 raddr, const __be16 rport,
+ const u32 rnd, const u32 synq_hsize)
+{
+ return jhash_2words((__force u32)raddr, (__force u32)rport, rnd) & (synq_hsize - 1);
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+#define AF_INET_FAMILY(fam) ((fam) == AF_INET)
+#else
+#define AF_INET_FAMILY(fam) true
+#endif
+
+/* Note: this is temporary :
+ * req sock will no longer be in listener hash table
+*/
+struct request_sock *inet_csk_search_req(struct sock *sk,
+ const __be16 rport,
+ const __be32 raddr,
+ const __be32 laddr)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
+ struct request_sock *req;
+ u32 hash = inet_synq_hash(raddr, rport, lopt->hash_rnd,
+ lopt->nr_table_entries);
+
+ spin_lock(&icsk->icsk_accept_queue.syn_wait_lock);
+ for (req = lopt->syn_table[hash]; req != NULL; req = req->dl_next) {
+ const struct inet_request_sock *ireq = inet_rsk(req);
+
+ if (ireq->ir_rmt_port == rport &&
+ ireq->ir_rmt_addr == raddr &&
+ ireq->ir_loc_addr == laddr &&
+ AF_INET_FAMILY(req->rsk_ops->family)) {
+ atomic_inc(&req->rsk_refcnt);
+ WARN_ON(req->sk);
+ break;
+ }
+ }
+ spin_unlock(&icsk->icsk_accept_queue.syn_wait_lock);
+
+ return req;
+}
+EXPORT_SYMBOL_GPL(inet_csk_search_req);
+
+void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,
+ unsigned long timeout)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
+ const u32 h = inet_synq_hash(inet_rsk(req)->ir_rmt_addr,
+ inet_rsk(req)->ir_rmt_port,
+ lopt->hash_rnd, lopt->nr_table_entries);
+
+ reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, timeout);
+ inet_csk_reqsk_queue_added(sk, timeout);
+}
+EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_hash_add);
+
+/* Only thing we need from tcp.h */
+extern int sysctl_tcp_synack_retries;
+
+
+/* Decide when to expire the request and when to resend SYN-ACK */
+static inline void syn_ack_recalc(struct request_sock *req, const int thresh,
+ const int max_retries,
+ const u8 rskq_defer_accept,
+ int *expire, int *resend)
+{
+ if (!rskq_defer_accept) {
+ *expire = req->num_timeout >= thresh;
+ *resend = 1;
+ return;
+ }
+ *expire = req->num_timeout >= thresh &&
+ (!inet_rsk(req)->acked || req->num_timeout >= max_retries);
+ /*
+ * Do not resend while waiting for data after ACK,
+ * start to resend on end of deferring period to give
+ * last chance for data or ACK to create established socket.
+ */
+ *resend = !inet_rsk(req)->acked ||
+ req->num_timeout >= rskq_defer_accept - 1;
+}
+
+int inet_rtx_syn_ack(struct sock *parent, struct request_sock *req)
+{
+ int err = req->rsk_ops->rtx_syn_ack(parent, req);
+
+ if (!err)
+ req->num_retrans++;
+ return err;
+}
+EXPORT_SYMBOL(inet_rtx_syn_ack);
+
+/* return true if req was found in the syn_table[] */
+static bool reqsk_queue_unlink(struct request_sock_queue *queue,
+ struct request_sock *req)
+{
+ struct listen_sock *lopt = queue->listen_opt;
+ struct request_sock **prev;
+ bool found = false;
+
+ spin_lock(&queue->syn_wait_lock);
+
+ for (prev = &lopt->syn_table[req->rsk_hash]; *prev != NULL;
+ prev = &(*prev)->dl_next) {
+ if (*prev == req) {
+ *prev = req->dl_next;
+ found = true;
+ break;
+ }
+ }
+
+ spin_unlock(&queue->syn_wait_lock);
+ if (del_timer(&req->rsk_timer))
+ reqsk_put(req);
+ return found;
+}
+
+void inet_csk_reqsk_queue_drop(struct sock *sk, struct request_sock *req)
+{
+ if (reqsk_queue_unlink(&inet_csk(sk)->icsk_accept_queue, req)) {
+ reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req);
+ reqsk_put(req);
+ }
+}
+EXPORT_SYMBOL(inet_csk_reqsk_queue_drop);
+
+static void reqsk_timer_handler(unsigned long data)
+{
+ struct request_sock *req = (struct request_sock *)data;
+ struct sock *sk_listener = req->rsk_listener;
+ struct inet_connection_sock *icsk = inet_csk(sk_listener);
+ struct request_sock_queue *queue = &icsk->icsk_accept_queue;
+ struct listen_sock *lopt = queue->listen_opt;
+ int qlen, expire = 0, resend = 0;
+ int max_retries, thresh;
+ u8 defer_accept;
+
+ if (sk_listener->sk_state != TCP_LISTEN || !lopt) {
+ reqsk_put(req);
+ return;
+ }
+
+ max_retries = icsk->icsk_syn_retries ? : sysctl_tcp_synack_retries;
+ thresh = max_retries;
+ /* Normally all the openreqs are young and become mature
+ * (i.e. converted to established socket) for first timeout.
+ * If synack was not acknowledged for 1 second, it means
+ * one of the following things: synack was lost, ack was lost,
+ * rtt is high or nobody planned to ack (i.e. synflood).
+ * When server is a bit loaded, queue is populated with old
+ * open requests, reducing effective size of queue.
+ * When server is well loaded, queue size reduces to zero
+ * after several minutes of work. It is not synflood,
+ * it is normal operation. The solution is pruning
+ * too old entries overriding normal timeout, when
+ * situation becomes dangerous.
+ *
+ * Essentially, we reserve half of room for young
+ * embrions; and abort old ones without pity, if old
+ * ones are about to clog our table.
+ */
+ qlen = listen_sock_qlen(lopt);
+ if (qlen >> (lopt->max_qlen_log - 1)) {
+ int young = listen_sock_young(lopt) << 1;
+
+ while (thresh > 2) {
+ if (qlen < young)
+ break;
+ thresh--;
+ young <<= 1;
+ }
+ }
+ defer_accept = READ_ONCE(queue->rskq_defer_accept);
+ if (defer_accept)
+ max_retries = defer_accept;
+ syn_ack_recalc(req, thresh, max_retries, defer_accept,
+ &expire, &resend);
+ req->rsk_ops->syn_ack_timeout(req);
+ if (!expire &&
+ (!resend ||
+ !inet_rtx_syn_ack(sk_listener, req) ||
+ inet_rsk(req)->acked)) {
+ unsigned long timeo;
+
+ if (req->num_timeout++ == 0)
+ atomic_inc(&lopt->young_dec);
+ timeo = min(TCP_TIMEOUT_INIT << req->num_timeout, TCP_RTO_MAX);
+ mod_timer_pinned(&req->rsk_timer, jiffies + timeo);
+ return;
+ }
+ inet_csk_reqsk_queue_drop(sk_listener, req);
+ reqsk_put(req);
+}
+
+void reqsk_queue_hash_req(struct request_sock_queue *queue,
+ u32 hash, struct request_sock *req,
+ unsigned long timeout)
+{
+ struct listen_sock *lopt = queue->listen_opt;
+
+ req->num_retrans = 0;
+ req->num_timeout = 0;
+ req->sk = NULL;
+
+ /* before letting lookups find us, make sure all req fields
+ * are committed to memory and refcnt initialized.
+ */
+ smp_wmb();
+ atomic_set(&req->rsk_refcnt, 2);
+ setup_timer(&req->rsk_timer, reqsk_timer_handler, (unsigned long)req);
+ req->rsk_hash = hash;
+
+ spin_lock(&queue->syn_wait_lock);
+ req->dl_next = lopt->syn_table[hash];
+ lopt->syn_table[hash] = req;
+ spin_unlock(&queue->syn_wait_lock);
+
+ mod_timer_pinned(&req->rsk_timer, jiffies + timeout);
+}
+EXPORT_SYMBOL(reqsk_queue_hash_req);
+
+/**
+ * inet_csk_clone_lock - clone an inet socket, and lock its clone
+ * @sk: the socket to clone
+ * @req: request_sock
+ * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
+ *
+ * Caller must unlock socket even in error path (bh_unlock_sock(newsk))
+ */
+struct sock *inet_csk_clone_lock(const struct sock *sk,
+ const struct request_sock *req,
+ const gfp_t priority)
+{
+ struct sock *newsk = sk_clone_lock(sk, priority);
+
+ if (newsk) {
+ struct inet_connection_sock *newicsk = inet_csk(newsk);
+
+ newsk->sk_state = TCP_SYN_RECV;
+ newicsk->icsk_bind_hash = NULL;
+
+ inet_sk(newsk)->inet_dport = inet_rsk(req)->ir_rmt_port;
+ inet_sk(newsk)->inet_num = inet_rsk(req)->ir_num;
+ inet_sk(newsk)->inet_sport = htons(inet_rsk(req)->ir_num);
+ newsk->sk_write_space = sk_stream_write_space;
+
+ newsk->sk_mark = inet_rsk(req)->ir_mark;
+ atomic64_set(&newsk->sk_cookie,
+ atomic64_read(&inet_rsk(req)->ir_cookie));
+
+ newicsk->icsk_retransmits = 0;
+ newicsk->icsk_backoff = 0;
+ newicsk->icsk_probes_out = 0;
+
+ /* Deinitialize accept_queue to trap illegal accesses. */
+ memset(&newicsk->icsk_accept_queue, 0, sizeof(newicsk->icsk_accept_queue));
+
+ security_inet_csk_clone(newsk, req);
+ }
+ return newsk;
+}
+EXPORT_SYMBOL_GPL(inet_csk_clone_lock);
+
+/*
+ * At this point, there should be no process reference to this
+ * socket, and thus no user references at all. Therefore we
+ * can assume the socket waitqueue is inactive and nobody will
+ * try to jump onto it.
+ */
+void inet_csk_destroy_sock(struct sock *sk)
+{
+ WARN_ON(sk->sk_state != TCP_CLOSE);
+ WARN_ON(!sock_flag(sk, SOCK_DEAD));
+
+ /* It cannot be in hash table! */
+ WARN_ON(!sk_unhashed(sk));
+
+ /* If it has not 0 inet_sk(sk)->inet_num, it must be bound */
+ WARN_ON(inet_sk(sk)->inet_num && !inet_csk(sk)->icsk_bind_hash);
+
+ sk->sk_prot->destroy(sk);
+
+ sk_stream_kill_queues(sk);
+
+ xfrm_sk_free_policy(sk);
+
+ sk_refcnt_debug_release(sk);
+
+ percpu_counter_dec(sk->sk_prot->orphan_count);
+ sock_put(sk);
+}
+EXPORT_SYMBOL(inet_csk_destroy_sock);
+
+/* This function allows to force a closure of a socket after the call to
+ * tcp/dccp_create_openreq_child().
+ */
+void inet_csk_prepare_forced_close(struct sock *sk)
+ __releases(&sk->sk_lock.slock)
+{
+ /* sk_clone_lock locked the socket and set refcnt to 2 */
+ bh_unlock_sock(sk);
+ sock_put(sk);
+
+ /* The below has to be done to allow calling inet_csk_destroy_sock */
+ sock_set_flag(sk, SOCK_DEAD);
+ percpu_counter_inc(sk->sk_prot->orphan_count);
+ inet_sk(sk)->inet_num = 0;
+}
+EXPORT_SYMBOL(inet_csk_prepare_forced_close);
+
+int inet_csk_listen_start(struct sock *sk, const int nr_table_entries)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ int rc = reqsk_queue_alloc(&icsk->icsk_accept_queue, nr_table_entries);
+
+ if (rc != 0)
+ return rc;
+
+ sk->sk_max_ack_backlog = 0;
+ sk->sk_ack_backlog = 0;
+ inet_csk_delack_init(sk);
+
+ /* There is race window here: we announce ourselves listening,
+ * but this transition is still not validated by get_port().
+ * It is OK, because this socket enters to hash table only
+ * after validation is complete.
+ */
+ sk->sk_state = TCP_LISTEN;
+ if (!sk->sk_prot->get_port(sk, inet->inet_num)) {
+ inet->inet_sport = htons(inet->inet_num);
+
+ sk_dst_reset(sk);
+ sk->sk_prot->hash(sk);
+
+ return 0;
+ }
+
+ sk->sk_state = TCP_CLOSE;
+ __reqsk_queue_destroy(&icsk->icsk_accept_queue);
+ return -EADDRINUSE;
+}
+EXPORT_SYMBOL_GPL(inet_csk_listen_start);
+
+/*
+ * This routine closes sockets which have been at least partially
+ * opened, but not yet accepted.
+ */
+void inet_csk_listen_stop(struct sock *sk)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct request_sock_queue *queue = &icsk->icsk_accept_queue;
+ struct request_sock *acc_req;
+ struct request_sock *req;
+
+ /* make all the listen_opt local to us */
+ acc_req = reqsk_queue_yank_acceptq(queue);
+
+ /* Following specs, it would be better either to send FIN
+ * (and enter FIN-WAIT-1, it is normal close)
+ * or to send active reset (abort).
+ * Certainly, it is pretty dangerous while synflood, but it is
+ * bad justification for our negligence 8)
+ * To be honest, we are not able to make either
+ * of the variants now. --ANK
+ */
+ reqsk_queue_destroy(queue);
+
+ while ((req = acc_req) != NULL) {
+ struct sock *child = req->sk;
+
+ acc_req = req->dl_next;
+
+ local_bh_disable();
+ bh_lock_sock(child);
+ WARN_ON(sock_owned_by_user(child));
+ sock_hold(child);
+
+ sk->sk_prot->disconnect(child, O_NONBLOCK);
+
+ sock_orphan(child);
+
+ percpu_counter_inc(sk->sk_prot->orphan_count);
+
+ if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(req)->tfo_listener) {
+ BUG_ON(tcp_sk(child)->fastopen_rsk != req);
+ BUG_ON(sk != req->rsk_listener);
+
+ /* Paranoid, to prevent race condition if
+ * an inbound pkt destined for child is
+ * blocked by sock lock in tcp_v4_rcv().
+ * Also to satisfy an assertion in
+ * tcp_v4_destroy_sock().
+ */
+ tcp_sk(child)->fastopen_rsk = NULL;
+ }
+ inet_csk_destroy_sock(child);
+
+ bh_unlock_sock(child);
+ local_bh_enable();
+ sock_put(child);
+
+ sk_acceptq_removed(sk);
+ reqsk_put(req);
+ }
+ if (queue->fastopenq) {
+ /* Free all the reqs queued in rskq_rst_head. */
+ spin_lock_bh(&queue->fastopenq->lock);
+ acc_req = queue->fastopenq->rskq_rst_head;
+ queue->fastopenq->rskq_rst_head = NULL;
+ spin_unlock_bh(&queue->fastopenq->lock);
+ while ((req = acc_req) != NULL) {
+ acc_req = req->dl_next;
+ reqsk_put(req);
+ }
+ }
+ WARN_ON(sk->sk_ack_backlog);
+}
+EXPORT_SYMBOL_GPL(inet_csk_listen_stop);
+
+void inet_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr)
+{
+ struct sockaddr_in *sin = (struct sockaddr_in *)uaddr;
+ const struct inet_sock *inet = inet_sk(sk);
+
+ sin->sin_family = AF_INET;
+ sin->sin_addr.s_addr = inet->inet_daddr;
+ sin->sin_port = inet->inet_dport;
+}
+EXPORT_SYMBOL_GPL(inet_csk_addr2sockaddr);
+
+#ifdef CONFIG_COMPAT
+int inet_csk_compat_getsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, int __user *optlen)
+{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+
+ if (icsk->icsk_af_ops->compat_getsockopt)
+ return icsk->icsk_af_ops->compat_getsockopt(sk, level, optname,
+ optval, optlen);
+ return icsk->icsk_af_ops->getsockopt(sk, level, optname,
+ optval, optlen);
+}
+EXPORT_SYMBOL_GPL(inet_csk_compat_getsockopt);
+
+int inet_csk_compat_setsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, unsigned int optlen)
+{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+
+ if (icsk->icsk_af_ops->compat_setsockopt)
+ return icsk->icsk_af_ops->compat_setsockopt(sk, level, optname,
+ optval, optlen);
+ return icsk->icsk_af_ops->setsockopt(sk, level, optname,
+ optval, optlen);
+}
+EXPORT_SYMBOL_GPL(inet_csk_compat_setsockopt);
+#endif
+
+static struct dst_entry *inet_csk_rebuild_route(struct sock *sk, struct flowi *fl)
+{
+ const struct inet_sock *inet = inet_sk(sk);
+ const struct ip_options_rcu *inet_opt;
+ __be32 daddr = inet->inet_daddr;
+ struct flowi4 *fl4;
+ struct rtable *rt;
+
+ rcu_read_lock();
+ inet_opt = rcu_dereference(inet->inet_opt);
+ if (inet_opt && inet_opt->opt.srr)
+ daddr = inet_opt->opt.faddr;
+ fl4 = &fl->u.ip4;
+ rt = ip_route_output_ports(sock_net(sk), fl4, sk, daddr,
+ inet->inet_saddr, inet->inet_dport,
+ inet->inet_sport, sk->sk_protocol,
+ RT_CONN_FLAGS(sk), sk->sk_bound_dev_if);
+ if (IS_ERR(rt))
+ rt = NULL;
+ if (rt)
+ sk_setup_caps(sk, &rt->dst);
+ rcu_read_unlock();
+
+ return &rt->dst;
+}
+
+struct dst_entry *inet_csk_update_pmtu(struct sock *sk, u32 mtu)
+{
+ struct dst_entry *dst = __sk_dst_check(sk, 0);
+ struct inet_sock *inet = inet_sk(sk);
+
+ if (!dst) {
+ dst = inet_csk_rebuild_route(sk, &inet->cork.fl);
+ if (!dst)
+ goto out;
+ }
+ dst->ops->update_pmtu(dst, sk, NULL, mtu);
+
+ dst = __sk_dst_check(sk, 0);
+ if (!dst)
+ dst = inet_csk_rebuild_route(sk, &inet->cork.fl);
+out:
+ return dst;
+}
+EXPORT_SYMBOL_GPL(inet_csk_update_pmtu);
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
new file mode 100644
index 000000000..4d32262c7
--- /dev/null
+++ b/net/ipv4/inet_diag.c
@@ -0,0 +1,1165 @@
+/*
+ * inet_diag.c Module for monitoring INET transport protocols sockets.
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/fcntl.h>
+#include <linux/random.h>
+#include <linux/slab.h>
+#include <linux/cache.h>
+#include <linux/init.h>
+#include <linux/time.h>
+
+#include <net/icmp.h>
+#include <net/tcp.h>
+#include <net/ipv6.h>
+#include <net/inet_common.h>
+#include <net/inet_connection_sock.h>
+#include <net/inet_hashtables.h>
+#include <net/inet_timewait_sock.h>
+#include <net/inet6_hashtables.h>
+#include <net/netlink.h>
+
+#include <linux/inet.h>
+#include <linux/stddef.h>
+
+#include <linux/inet_diag.h>
+#include <linux/sock_diag.h>
+
+static const struct inet_diag_handler **inet_diag_table;
+
+struct inet_diag_entry {
+ const __be32 *saddr;
+ const __be32 *daddr;
+ u16 sport;
+ u16 dport;
+ u16 family;
+ u16 userlocks;
+};
+
+static DEFINE_MUTEX(inet_diag_table_mutex);
+
+static const struct inet_diag_handler *inet_diag_lock_handler(int proto)
+{
+ if (!inet_diag_table[proto])
+ request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
+ NETLINK_SOCK_DIAG, AF_INET, proto);
+
+ mutex_lock(&inet_diag_table_mutex);
+ if (!inet_diag_table[proto])
+ return ERR_PTR(-ENOENT);
+
+ return inet_diag_table[proto];
+}
+
+static void inet_diag_unlock_handler(const struct inet_diag_handler *handler)
+{
+ mutex_unlock(&inet_diag_table_mutex);
+}
+
+static void inet_diag_msg_common_fill(struct inet_diag_msg *r, struct sock *sk)
+{
+ r->idiag_family = sk->sk_family;
+
+ r->id.idiag_sport = htons(sk->sk_num);
+ r->id.idiag_dport = sk->sk_dport;
+ r->id.idiag_if = sk->sk_bound_dev_if;
+ sock_diag_save_cookie(sk, r->id.idiag_cookie);
+
+#if IS_ENABLED(CONFIG_IPV6)
+ if (sk->sk_family == AF_INET6) {
+ *(struct in6_addr *)r->id.idiag_src = sk->sk_v6_rcv_saddr;
+ *(struct in6_addr *)r->id.idiag_dst = sk->sk_v6_daddr;
+ } else
+#endif
+ {
+ memset(&r->id.idiag_src, 0, sizeof(r->id.idiag_src));
+ memset(&r->id.idiag_dst, 0, sizeof(r->id.idiag_dst));
+
+ r->id.idiag_src[0] = sk->sk_rcv_saddr;
+ r->id.idiag_dst[0] = sk->sk_daddr;
+ }
+}
+
+static size_t inet_sk_attr_size(void)
+{
+ return nla_total_size(sizeof(struct tcp_info))
+ + nla_total_size(1) /* INET_DIAG_SHUTDOWN */
+ + nla_total_size(1) /* INET_DIAG_TOS */
+ + nla_total_size(1) /* INET_DIAG_TCLASS */
+ + nla_total_size(sizeof(struct inet_diag_meminfo))
+ + nla_total_size(sizeof(struct inet_diag_msg))
+ + nla_total_size(SK_MEMINFO_VARS * sizeof(u32))
+ + nla_total_size(TCP_CA_NAME_MAX)
+ + nla_total_size(sizeof(struct tcpvegas_info))
+ + 64;
+}
+
+int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
+ struct sk_buff *skb, const struct inet_diag_req_v2 *req,
+ struct user_namespace *user_ns,
+ u32 portid, u32 seq, u16 nlmsg_flags,
+ const struct nlmsghdr *unlh)
+{
+ const struct inet_sock *inet = inet_sk(sk);
+ const struct tcp_congestion_ops *ca_ops;
+ const struct inet_diag_handler *handler;
+ int ext = req->idiag_ext;
+ struct inet_diag_msg *r;
+ struct nlmsghdr *nlh;
+ struct nlattr *attr;
+ void *info = NULL;
+
+ handler = inet_diag_table[req->sdiag_protocol];
+ BUG_ON(!handler);
+
+ nlh = nlmsg_put(skb, portid, seq, unlh->nlmsg_type, sizeof(*r),
+ nlmsg_flags);
+ if (!nlh)
+ return -EMSGSIZE;
+
+ r = nlmsg_data(nlh);
+ BUG_ON(!sk_fullsock(sk));
+
+ inet_diag_msg_common_fill(r, sk);
+ r->idiag_state = sk->sk_state;
+ r->idiag_timer = 0;
+ r->idiag_retrans = 0;
+
+ if (nla_put_u8(skb, INET_DIAG_SHUTDOWN, sk->sk_shutdown))
+ goto errout;
+
+ /* IPv6 dual-stack sockets use inet->tos for IPv4 connections,
+ * hence this needs to be included regardless of socket family.
+ */
+ if (ext & (1 << (INET_DIAG_TOS - 1)))
+ if (nla_put_u8(skb, INET_DIAG_TOS, inet->tos) < 0)
+ goto errout;
+
+#if IS_ENABLED(CONFIG_IPV6)
+ if (r->idiag_family == AF_INET6) {
+ if (ext & (1 << (INET_DIAG_TCLASS - 1)))
+ if (nla_put_u8(skb, INET_DIAG_TCLASS,
+ inet6_sk(sk)->tclass) < 0)
+ goto errout;
+ }
+#endif
+
+ r->idiag_uid = from_kuid_munged(user_ns, sock_i_uid(sk));
+ r->idiag_inode = sock_i_ino(sk);
+
+ if (ext & (1 << (INET_DIAG_MEMINFO - 1))) {
+ struct inet_diag_meminfo minfo = {
+ .idiag_rmem = sk_rmem_alloc_get(sk),
+ .idiag_wmem = sk->sk_wmem_queued,
+ .idiag_fmem = sk->sk_forward_alloc,
+ .idiag_tmem = sk_wmem_alloc_get(sk),
+ };
+
+ if (nla_put(skb, INET_DIAG_MEMINFO, sizeof(minfo), &minfo) < 0)
+ goto errout;
+ }
+
+ if (ext & (1 << (INET_DIAG_SKMEMINFO - 1)))
+ if (sock_diag_put_meminfo(sk, skb, INET_DIAG_SKMEMINFO))
+ goto errout;
+
+ if (!icsk) {
+ handler->idiag_get_info(sk, r, NULL);
+ goto out;
+ }
+
+#define EXPIRES_IN_MS(tmo) DIV_ROUND_UP((tmo - jiffies) * 1000, HZ)
+
+ if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
+ icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
+ icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
+ r->idiag_timer = 1;
+ r->idiag_retrans = icsk->icsk_retransmits;
+ r->idiag_expires = EXPIRES_IN_MS(icsk->icsk_timeout);
+ } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
+ r->idiag_timer = 4;
+ r->idiag_retrans = icsk->icsk_probes_out;
+ r->idiag_expires = EXPIRES_IN_MS(icsk->icsk_timeout);
+ } else if (timer_pending(&sk->sk_timer)) {
+ r->idiag_timer = 2;
+ r->idiag_retrans = icsk->icsk_probes_out;
+ r->idiag_expires = EXPIRES_IN_MS(sk->sk_timer.expires);
+ } else {
+ r->idiag_timer = 0;
+ r->idiag_expires = 0;
+ }
+#undef EXPIRES_IN_MS
+
+ if (ext & (1 << (INET_DIAG_INFO - 1))) {
+ attr = nla_reserve(skb, INET_DIAG_INFO,
+ sizeof(struct tcp_info));
+ if (!attr)
+ goto errout;
+
+ info = nla_data(attr);
+ }
+
+ if (ext & (1 << (INET_DIAG_CONG - 1))) {
+ int err = 0;
+
+ rcu_read_lock();
+ ca_ops = READ_ONCE(icsk->icsk_ca_ops);
+ if (ca_ops)
+ err = nla_put_string(skb, INET_DIAG_CONG, ca_ops->name);
+ rcu_read_unlock();
+ if (err < 0)
+ goto errout;
+ }
+
+ handler->idiag_get_info(sk, r, info);
+
+ if (sk->sk_state < TCP_TIME_WAIT) {
+ union tcp_cc_info info;
+ size_t sz = 0;
+ int attr;
+
+ rcu_read_lock();
+ ca_ops = READ_ONCE(icsk->icsk_ca_ops);
+ if (ca_ops && ca_ops->get_info)
+ sz = ca_ops->get_info(sk, ext, &attr, &info);
+ rcu_read_unlock();
+ if (sz && nla_put(skb, attr, sz, &info) < 0)
+ goto errout;
+ }
+
+out:
+ nlmsg_end(skb, nlh);
+ return 0;
+
+errout:
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+}
+EXPORT_SYMBOL_GPL(inet_sk_diag_fill);
+
+static int inet_csk_diag_fill(struct sock *sk,
+ struct sk_buff *skb,
+ const struct inet_diag_req_v2 *req,
+ struct user_namespace *user_ns,
+ u32 portid, u32 seq, u16 nlmsg_flags,
+ const struct nlmsghdr *unlh)
+{
+ return inet_sk_diag_fill(sk, inet_csk(sk), skb, req,
+ user_ns, portid, seq, nlmsg_flags, unlh);
+}
+
+static int inet_twsk_diag_fill(struct sock *sk,
+ struct sk_buff *skb,
+ u32 portid, u32 seq, u16 nlmsg_flags,
+ const struct nlmsghdr *unlh)
+{
+ struct inet_timewait_sock *tw = inet_twsk(sk);
+ struct inet_diag_msg *r;
+ struct nlmsghdr *nlh;
+ long tmo;
+
+ nlh = nlmsg_put(skb, portid, seq, unlh->nlmsg_type, sizeof(*r),
+ nlmsg_flags);
+ if (!nlh)
+ return -EMSGSIZE;
+
+ r = nlmsg_data(nlh);
+ BUG_ON(tw->tw_state != TCP_TIME_WAIT);
+
+ tmo = tw->tw_timer.expires - jiffies;
+ if (tmo < 0)
+ tmo = 0;
+
+ inet_diag_msg_common_fill(r, sk);
+ r->idiag_retrans = 0;
+
+ r->idiag_state = tw->tw_substate;
+ r->idiag_timer = 3;
+ r->idiag_expires = jiffies_to_msecs(tmo);
+ r->idiag_rqueue = 0;
+ r->idiag_wqueue = 0;
+ r->idiag_uid = 0;
+ r->idiag_inode = 0;
+
+ nlmsg_end(skb, nlh);
+ return 0;
+}
+
+static int inet_req_diag_fill(struct sock *sk, struct sk_buff *skb,
+ u32 portid, u32 seq, u16 nlmsg_flags,
+ const struct nlmsghdr *unlh)
+{
+ struct inet_diag_msg *r;
+ struct nlmsghdr *nlh;
+ long tmo;
+
+ nlh = nlmsg_put(skb, portid, seq, unlh->nlmsg_type, sizeof(*r),
+ nlmsg_flags);
+ if (!nlh)
+ return -EMSGSIZE;
+
+ r = nlmsg_data(nlh);
+ inet_diag_msg_common_fill(r, sk);
+ r->idiag_state = TCP_SYN_RECV;
+ r->idiag_timer = 1;
+ r->idiag_retrans = inet_reqsk(sk)->num_retrans;
+
+ BUILD_BUG_ON(offsetof(struct inet_request_sock, ir_cookie) !=
+ offsetof(struct sock, sk_cookie));
+
+ tmo = inet_reqsk(sk)->rsk_timer.expires - jiffies;
+ r->idiag_expires = (tmo >= 0) ? jiffies_to_msecs(tmo) : 0;
+ r->idiag_rqueue = 0;
+ r->idiag_wqueue = 0;
+ r->idiag_uid = 0;
+ r->idiag_inode = 0;
+
+ nlmsg_end(skb, nlh);
+ return 0;
+}
+
+static int sk_diag_fill(struct sock *sk, struct sk_buff *skb,
+ const struct inet_diag_req_v2 *r,
+ struct user_namespace *user_ns,
+ u32 portid, u32 seq, u16 nlmsg_flags,
+ const struct nlmsghdr *unlh)
+{
+ if (sk->sk_state == TCP_TIME_WAIT)
+ return inet_twsk_diag_fill(sk, skb, portid, seq,
+ nlmsg_flags, unlh);
+
+ if (sk->sk_state == TCP_NEW_SYN_RECV)
+ return inet_req_diag_fill(sk, skb, portid, seq,
+ nlmsg_flags, unlh);
+
+ return inet_csk_diag_fill(sk, skb, r, user_ns, portid, seq,
+ nlmsg_flags, unlh);
+}
+
+int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo,
+ struct sk_buff *in_skb,
+ const struct nlmsghdr *nlh,
+ const struct inet_diag_req_v2 *req)
+{
+ struct net *net = sock_net(in_skb->sk);
+ struct sk_buff *rep;
+ struct sock *sk;
+ int err;
+
+ err = -EINVAL;
+ if (req->sdiag_family == AF_INET)
+ sk = inet_lookup(net, hashinfo, req->id.idiag_dst[0],
+ req->id.idiag_dport, req->id.idiag_src[0],
+ req->id.idiag_sport, req->id.idiag_if);
+#if IS_ENABLED(CONFIG_IPV6)
+ else if (req->sdiag_family == AF_INET6)
+ sk = inet6_lookup(net, hashinfo,
+ (struct in6_addr *)req->id.idiag_dst,
+ req->id.idiag_dport,
+ (struct in6_addr *)req->id.idiag_src,
+ req->id.idiag_sport,
+ req->id.idiag_if);
+#endif
+ else
+ goto out_nosk;
+
+ err = -ENOENT;
+ if (!sk)
+ goto out_nosk;
+
+ err = sock_diag_check_cookie(sk, req->id.idiag_cookie);
+ if (err)
+ goto out;
+
+ rep = nlmsg_new(inet_sk_attr_size(), GFP_KERNEL);
+ if (!rep) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ err = sk_diag_fill(sk, rep, req,
+ sk_user_ns(NETLINK_CB(in_skb).sk),
+ NETLINK_CB(in_skb).portid,
+ nlh->nlmsg_seq, 0, nlh);
+ if (err < 0) {
+ WARN_ON(err == -EMSGSIZE);
+ nlmsg_free(rep);
+ goto out;
+ }
+ err = netlink_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid,
+ MSG_DONTWAIT);
+ if (err > 0)
+ err = 0;
+
+out:
+ if (sk)
+ sock_gen_put(sk);
+
+out_nosk:
+ return err;
+}
+EXPORT_SYMBOL_GPL(inet_diag_dump_one_icsk);
+
+static int inet_diag_get_exact(struct sk_buff *in_skb,
+ const struct nlmsghdr *nlh,
+ const struct inet_diag_req_v2 *req)
+{
+ const struct inet_diag_handler *handler;
+ int err;
+
+ handler = inet_diag_lock_handler(req->sdiag_protocol);
+ if (IS_ERR(handler))
+ err = PTR_ERR(handler);
+ else
+ err = handler->dump_one(in_skb, nlh, req);
+ inet_diag_unlock_handler(handler);
+
+ return err;
+}
+
+static int bitstring_match(const __be32 *a1, const __be32 *a2, int bits)
+{
+ int words = bits >> 5;
+
+ bits &= 0x1f;
+
+ if (words) {
+ if (memcmp(a1, a2, words << 2))
+ return 0;
+ }
+ if (bits) {
+ __be32 w1, w2;
+ __be32 mask;
+
+ w1 = a1[words];
+ w2 = a2[words];
+
+ mask = htonl((0xffffffff) << (32 - bits));
+
+ if ((w1 ^ w2) & mask)
+ return 0;
+ }
+
+ return 1;
+}
+
+static int inet_diag_bc_run(const struct nlattr *_bc,
+ const struct inet_diag_entry *entry)
+{
+ const void *bc = nla_data(_bc);
+ int len = nla_len(_bc);
+
+ while (len > 0) {
+ int yes = 1;
+ const struct inet_diag_bc_op *op = bc;
+
+ switch (op->code) {
+ case INET_DIAG_BC_NOP:
+ break;
+ case INET_DIAG_BC_JMP:
+ yes = 0;
+ break;
+ case INET_DIAG_BC_S_GE:
+ yes = entry->sport >= op[1].no;
+ break;
+ case INET_DIAG_BC_S_LE:
+ yes = entry->sport <= op[1].no;
+ break;
+ case INET_DIAG_BC_D_GE:
+ yes = entry->dport >= op[1].no;
+ break;
+ case INET_DIAG_BC_D_LE:
+ yes = entry->dport <= op[1].no;
+ break;
+ case INET_DIAG_BC_AUTO:
+ yes = !(entry->userlocks & SOCK_BINDPORT_LOCK);
+ break;
+ case INET_DIAG_BC_S_COND:
+ case INET_DIAG_BC_D_COND: {
+ const struct inet_diag_hostcond *cond;
+ const __be32 *addr;
+
+ cond = (const struct inet_diag_hostcond *)(op + 1);
+ if (cond->port != -1 &&
+ cond->port != (op->code == INET_DIAG_BC_S_COND ?
+ entry->sport : entry->dport)) {
+ yes = 0;
+ break;
+ }
+
+ if (op->code == INET_DIAG_BC_S_COND)
+ addr = entry->saddr;
+ else
+ addr = entry->daddr;
+
+ if (cond->family != AF_UNSPEC &&
+ cond->family != entry->family) {
+ if (entry->family == AF_INET6 &&
+ cond->family == AF_INET) {
+ if (addr[0] == 0 && addr[1] == 0 &&
+ addr[2] == htonl(0xffff) &&
+ bitstring_match(addr + 3,
+ cond->addr,
+ cond->prefix_len))
+ break;
+ }
+ yes = 0;
+ break;
+ }
+
+ if (cond->prefix_len == 0)
+ break;
+ if (bitstring_match(addr, cond->addr,
+ cond->prefix_len))
+ break;
+ yes = 0;
+ break;
+ }
+ }
+
+ if (yes) {
+ len -= op->yes;
+ bc += op->yes;
+ } else {
+ len -= op->no;
+ bc += op->no;
+ }
+ }
+ return len == 0;
+}
+
+/* This helper is available for all sockets (ESTABLISH, TIMEWAIT, SYN_RECV)
+ */
+static void entry_fill_addrs(struct inet_diag_entry *entry,
+ const struct sock *sk)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+ if (sk->sk_family == AF_INET6) {
+ entry->saddr = sk->sk_v6_rcv_saddr.s6_addr32;
+ entry->daddr = sk->sk_v6_daddr.s6_addr32;
+ } else
+#endif
+ {
+ entry->saddr = &sk->sk_rcv_saddr;
+ entry->daddr = &sk->sk_daddr;
+ }
+}
+
+int inet_diag_bc_sk(const struct nlattr *bc, struct sock *sk)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ struct inet_diag_entry entry;
+
+ if (!bc)
+ return 1;
+
+ entry.family = sk->sk_family;
+ entry_fill_addrs(&entry, sk);
+ entry.sport = inet->inet_num;
+ entry.dport = ntohs(inet->inet_dport);
+ entry.userlocks = sk_fullsock(sk) ? sk->sk_userlocks : 0;
+
+ return inet_diag_bc_run(bc, &entry);
+}
+EXPORT_SYMBOL_GPL(inet_diag_bc_sk);
+
+static int valid_cc(const void *bc, int len, int cc)
+{
+ while (len >= 0) {
+ const struct inet_diag_bc_op *op = bc;
+
+ if (cc > len)
+ return 0;
+ if (cc == len)
+ return 1;
+ if (op->yes < 4 || op->yes & 3)
+ return 0;
+ len -= op->yes;
+ bc += op->yes;
+ }
+ return 0;
+}
+
+/* Validate an inet_diag_hostcond. */
+static bool valid_hostcond(const struct inet_diag_bc_op *op, int len,
+ int *min_len)
+{
+ struct inet_diag_hostcond *cond;
+ int addr_len;
+
+ /* Check hostcond space. */
+ *min_len += sizeof(struct inet_diag_hostcond);
+ if (len < *min_len)
+ return false;
+ cond = (struct inet_diag_hostcond *)(op + 1);
+
+ /* Check address family and address length. */
+ switch (cond->family) {
+ case AF_UNSPEC:
+ addr_len = 0;
+ break;
+ case AF_INET:
+ addr_len = sizeof(struct in_addr);
+ break;
+ case AF_INET6:
+ addr_len = sizeof(struct in6_addr);
+ break;
+ default:
+ return false;
+ }
+ *min_len += addr_len;
+ if (len < *min_len)
+ return false;
+
+ /* Check prefix length (in bits) vs address length (in bytes). */
+ if (cond->prefix_len > 8 * addr_len)
+ return false;
+
+ return true;
+}
+
+/* Validate a port comparison operator. */
+static bool valid_port_comparison(const struct inet_diag_bc_op *op,
+ int len, int *min_len)
+{
+ /* Port comparisons put the port in a follow-on inet_diag_bc_op. */
+ *min_len += sizeof(struct inet_diag_bc_op);
+ if (len < *min_len)
+ return false;
+ return true;
+}
+
+static int inet_diag_bc_audit(const void *bytecode, int bytecode_len)
+{
+ const void *bc = bytecode;
+ int len = bytecode_len;
+
+ while (len > 0) {
+ int min_len = sizeof(struct inet_diag_bc_op);
+ const struct inet_diag_bc_op *op = bc;
+
+ switch (op->code) {
+ case INET_DIAG_BC_S_COND:
+ case INET_DIAG_BC_D_COND:
+ if (!valid_hostcond(bc, len, &min_len))
+ return -EINVAL;
+ break;
+ case INET_DIAG_BC_S_GE:
+ case INET_DIAG_BC_S_LE:
+ case INET_DIAG_BC_D_GE:
+ case INET_DIAG_BC_D_LE:
+ if (!valid_port_comparison(bc, len, &min_len))
+ return -EINVAL;
+ break;
+ case INET_DIAG_BC_AUTO:
+ case INET_DIAG_BC_JMP:
+ case INET_DIAG_BC_NOP:
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ if (op->code != INET_DIAG_BC_NOP) {
+ if (op->no < min_len || op->no > len + 4 || op->no & 3)
+ return -EINVAL;
+ if (op->no < len &&
+ !valid_cc(bytecode, bytecode_len, len - op->no))
+ return -EINVAL;
+ }
+
+ if (op->yes < min_len || op->yes > len + 4 || op->yes & 3)
+ return -EINVAL;
+ bc += op->yes;
+ len -= op->yes;
+ }
+ return len == 0 ? 0 : -EINVAL;
+}
+
+static int inet_csk_diag_dump(struct sock *sk,
+ struct sk_buff *skb,
+ struct netlink_callback *cb,
+ const struct inet_diag_req_v2 *r,
+ const struct nlattr *bc)
+{
+ if (!inet_diag_bc_sk(bc, sk))
+ return 0;
+
+ return inet_csk_diag_fill(sk, skb, r,
+ sk_user_ns(NETLINK_CB(cb->skb).sk),
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh);
+}
+
+static void twsk_build_assert(void)
+{
+ BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_family) !=
+ offsetof(struct sock, sk_family));
+
+ BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_num) !=
+ offsetof(struct inet_sock, inet_num));
+
+ BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_dport) !=
+ offsetof(struct inet_sock, inet_dport));
+
+ BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_rcv_saddr) !=
+ offsetof(struct inet_sock, inet_rcv_saddr));
+
+ BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_daddr) !=
+ offsetof(struct inet_sock, inet_daddr));
+
+#if IS_ENABLED(CONFIG_IPV6)
+ BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_v6_rcv_saddr) !=
+ offsetof(struct sock, sk_v6_rcv_saddr));
+
+ BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_v6_daddr) !=
+ offsetof(struct sock, sk_v6_daddr));
+#endif
+}
+
+static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk,
+ struct netlink_callback *cb,
+ const struct inet_diag_req_v2 *r,
+ const struct nlattr *bc)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct inet_sock *inet = inet_sk(sk);
+ struct inet_diag_entry entry;
+ int j, s_j, reqnum, s_reqnum;
+ struct listen_sock *lopt;
+ int err = 0;
+
+ s_j = cb->args[3];
+ s_reqnum = cb->args[4];
+
+ if (s_j > 0)
+ s_j--;
+
+ entry.family = sk->sk_family;
+
+ spin_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
+
+ lopt = icsk->icsk_accept_queue.listen_opt;
+ if (!lopt || !listen_sock_qlen(lopt))
+ goto out;
+
+ if (bc) {
+ entry.sport = inet->inet_num;
+ entry.userlocks = sk->sk_userlocks;
+ }
+
+ for (j = s_j; j < lopt->nr_table_entries; j++) {
+ struct request_sock *req, *head = lopt->syn_table[j];
+
+ reqnum = 0;
+ for (req = head; req; reqnum++, req = req->dl_next) {
+ struct inet_request_sock *ireq = inet_rsk(req);
+
+ if (reqnum < s_reqnum)
+ continue;
+ if (r->id.idiag_dport != ireq->ir_rmt_port &&
+ r->id.idiag_dport)
+ continue;
+
+ if (bc) {
+ /* Note: entry.sport and entry.userlocks are already set */
+ entry_fill_addrs(&entry, req_to_sk(req));
+ entry.dport = ntohs(ireq->ir_rmt_port);
+
+ if (!inet_diag_bc_run(bc, &entry))
+ continue;
+ }
+
+ err = inet_req_diag_fill(req_to_sk(req), skb,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ NLM_F_MULTI, cb->nlh);
+ if (err < 0) {
+ cb->args[3] = j + 1;
+ cb->args[4] = reqnum;
+ goto out;
+ }
+ }
+
+ s_reqnum = 0;
+ }
+
+out:
+ spin_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
+
+ return err;
+}
+
+void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
+ struct netlink_callback *cb,
+ const struct inet_diag_req_v2 *r, struct nlattr *bc)
+{
+ struct net *net = sock_net(skb->sk);
+ int i, num, s_i, s_num;
+
+ s_i = cb->args[1];
+ s_num = num = cb->args[2];
+
+ if (cb->args[0] == 0) {
+ if (!(r->idiag_states & (TCPF_LISTEN | TCPF_SYN_RECV)))
+ goto skip_listen_ht;
+
+ for (i = s_i; i < INET_LHTABLE_SIZE; i++) {
+ struct inet_listen_hashbucket *ilb;
+ struct hlist_nulls_node *node;
+ struct sock *sk;
+
+ num = 0;
+ ilb = &hashinfo->listening_hash[i];
+ spin_lock_bh(&ilb->lock);
+ sk_nulls_for_each(sk, node, &ilb->head) {
+ struct inet_sock *inet = inet_sk(sk);
+
+ if (!net_eq(sock_net(sk), net))
+ continue;
+
+ if (num < s_num) {
+ num++;
+ continue;
+ }
+
+ if (r->sdiag_family != AF_UNSPEC &&
+ sk->sk_family != r->sdiag_family)
+ goto next_listen;
+
+ if (r->id.idiag_sport != inet->inet_sport &&
+ r->id.idiag_sport)
+ goto next_listen;
+
+ if (!(r->idiag_states & TCPF_LISTEN) ||
+ r->id.idiag_dport ||
+ cb->args[3] > 0)
+ goto syn_recv;
+
+ if (inet_csk_diag_dump(sk, skb, cb, r, bc) < 0) {
+ spin_unlock_bh(&ilb->lock);
+ goto done;
+ }
+
+syn_recv:
+ if (!(r->idiag_states & TCPF_SYN_RECV))
+ goto next_listen;
+
+ if (inet_diag_dump_reqs(skb, sk, cb, r, bc) < 0) {
+ spin_unlock_bh(&ilb->lock);
+ goto done;
+ }
+
+next_listen:
+ cb->args[3] = 0;
+ cb->args[4] = 0;
+ ++num;
+ }
+ spin_unlock_bh(&ilb->lock);
+
+ s_num = 0;
+ cb->args[3] = 0;
+ cb->args[4] = 0;
+ }
+skip_listen_ht:
+ cb->args[0] = 1;
+ s_i = num = s_num = 0;
+ }
+
+ if (!(r->idiag_states & ~(TCPF_LISTEN | TCPF_SYN_RECV)))
+ goto out;
+
+ for (i = s_i; i <= hashinfo->ehash_mask; i++) {
+ struct inet_ehash_bucket *head = &hashinfo->ehash[i];
+ spinlock_t *lock = inet_ehash_lockp(hashinfo, i);
+ struct hlist_nulls_node *node;
+ struct sock *sk;
+
+ num = 0;
+
+ if (hlist_nulls_empty(&head->chain))
+ continue;
+
+ if (i > s_i)
+ s_num = 0;
+
+ spin_lock_bh(lock);
+ sk_nulls_for_each(sk, node, &head->chain) {
+ int state, res;
+
+ if (!net_eq(sock_net(sk), net))
+ continue;
+ if (num < s_num)
+ goto next_normal;
+ state = (sk->sk_state == TCP_TIME_WAIT) ?
+ inet_twsk(sk)->tw_substate : sk->sk_state;
+ if (!(r->idiag_states & (1 << state)))
+ goto next_normal;
+ if (r->sdiag_family != AF_UNSPEC &&
+ sk->sk_family != r->sdiag_family)
+ goto next_normal;
+ if (r->id.idiag_sport != htons(sk->sk_num) &&
+ r->id.idiag_sport)
+ goto next_normal;
+ if (r->id.idiag_dport != sk->sk_dport &&
+ r->id.idiag_dport)
+ goto next_normal;
+ twsk_build_assert();
+
+ if (!inet_diag_bc_sk(bc, sk))
+ goto next_normal;
+
+ res = sk_diag_fill(sk, skb, r,
+ sk_user_ns(NETLINK_CB(cb->skb).sk),
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, NLM_F_MULTI,
+ cb->nlh);
+ if (res < 0) {
+ spin_unlock_bh(lock);
+ goto done;
+ }
+next_normal:
+ ++num;
+ }
+
+ spin_unlock_bh(lock);
+ }
+
+done:
+ cb->args[1] = i;
+ cb->args[2] = num;
+out:
+ ;
+}
+EXPORT_SYMBOL_GPL(inet_diag_dump_icsk);
+
+static int __inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
+ const struct inet_diag_req_v2 *r,
+ struct nlattr *bc)
+{
+ const struct inet_diag_handler *handler;
+ int err = 0;
+
+ handler = inet_diag_lock_handler(r->sdiag_protocol);
+ if (!IS_ERR(handler))
+ handler->dump(skb, cb, r, bc);
+ else
+ err = PTR_ERR(handler);
+ inet_diag_unlock_handler(handler);
+
+ return err ? : skb->len;
+}
+
+static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ int hdrlen = sizeof(struct inet_diag_req_v2);
+ struct nlattr *bc = NULL;
+
+ if (nlmsg_attrlen(cb->nlh, hdrlen))
+ bc = nlmsg_find_attr(cb->nlh, hdrlen, INET_DIAG_REQ_BYTECODE);
+
+ return __inet_diag_dump(skb, cb, nlmsg_data(cb->nlh), bc);
+}
+
+static int inet_diag_type2proto(int type)
+{
+ switch (type) {
+ case TCPDIAG_GETSOCK:
+ return IPPROTO_TCP;
+ case DCCPDIAG_GETSOCK:
+ return IPPROTO_DCCP;
+ default:
+ return 0;
+ }
+}
+
+static int inet_diag_dump_compat(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ struct inet_diag_req *rc = nlmsg_data(cb->nlh);
+ int hdrlen = sizeof(struct inet_diag_req);
+ struct inet_diag_req_v2 req;
+ struct nlattr *bc = NULL;
+
+ req.sdiag_family = AF_UNSPEC; /* compatibility */
+ req.sdiag_protocol = inet_diag_type2proto(cb->nlh->nlmsg_type);
+ req.idiag_ext = rc->idiag_ext;
+ req.idiag_states = rc->idiag_states;
+ req.id = rc->id;
+
+ if (nlmsg_attrlen(cb->nlh, hdrlen))
+ bc = nlmsg_find_attr(cb->nlh, hdrlen, INET_DIAG_REQ_BYTECODE);
+
+ return __inet_diag_dump(skb, cb, &req, bc);
+}
+
+static int inet_diag_get_exact_compat(struct sk_buff *in_skb,
+ const struct nlmsghdr *nlh)
+{
+ struct inet_diag_req *rc = nlmsg_data(nlh);
+ struct inet_diag_req_v2 req;
+
+ req.sdiag_family = rc->idiag_family;
+ req.sdiag_protocol = inet_diag_type2proto(nlh->nlmsg_type);
+ req.idiag_ext = rc->idiag_ext;
+ req.idiag_states = rc->idiag_states;
+ req.id = rc->id;
+
+ return inet_diag_get_exact(in_skb, nlh, &req);
+}
+
+static int inet_diag_rcv_msg_compat(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+ int hdrlen = sizeof(struct inet_diag_req);
+ struct net *net = sock_net(skb->sk);
+
+ if (nlh->nlmsg_type >= INET_DIAG_GETSOCK_MAX ||
+ nlmsg_len(nlh) < hdrlen)
+ return -EINVAL;
+
+ if (nlh->nlmsg_flags & NLM_F_DUMP) {
+ if (nlmsg_attrlen(nlh, hdrlen)) {
+ struct nlattr *attr;
+
+ attr = nlmsg_find_attr(nlh, hdrlen,
+ INET_DIAG_REQ_BYTECODE);
+ if (!attr ||
+ nla_len(attr) < sizeof(struct inet_diag_bc_op) ||
+ inet_diag_bc_audit(nla_data(attr), nla_len(attr)))
+ return -EINVAL;
+ }
+ {
+ struct netlink_dump_control c = {
+ .dump = inet_diag_dump_compat,
+ };
+ return netlink_dump_start(net->diag_nlsk, skb, nlh, &c);
+ }
+ }
+
+ return inet_diag_get_exact_compat(skb, nlh);
+}
+
+static int inet_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h)
+{
+ int hdrlen = sizeof(struct inet_diag_req_v2);
+ struct net *net = sock_net(skb->sk);
+
+ if (nlmsg_len(h) < hdrlen)
+ return -EINVAL;
+
+ if (h->nlmsg_flags & NLM_F_DUMP) {
+ if (nlmsg_attrlen(h, hdrlen)) {
+ struct nlattr *attr;
+
+ attr = nlmsg_find_attr(h, hdrlen,
+ INET_DIAG_REQ_BYTECODE);
+ if (!attr ||
+ nla_len(attr) < sizeof(struct inet_diag_bc_op) ||
+ inet_diag_bc_audit(nla_data(attr), nla_len(attr)))
+ return -EINVAL;
+ }
+ {
+ struct netlink_dump_control c = {
+ .dump = inet_diag_dump,
+ };
+ return netlink_dump_start(net->diag_nlsk, skb, h, &c);
+ }
+ }
+
+ return inet_diag_get_exact(skb, h, nlmsg_data(h));
+}
+
+static const struct sock_diag_handler inet_diag_handler = {
+ .family = AF_INET,
+ .dump = inet_diag_handler_dump,
+};
+
+static const struct sock_diag_handler inet6_diag_handler = {
+ .family = AF_INET6,
+ .dump = inet_diag_handler_dump,
+};
+
+int inet_diag_register(const struct inet_diag_handler *h)
+{
+ const __u16 type = h->idiag_type;
+ int err = -EINVAL;
+
+ if (type >= IPPROTO_MAX)
+ goto out;
+
+ mutex_lock(&inet_diag_table_mutex);
+ err = -EEXIST;
+ if (!inet_diag_table[type]) {
+ inet_diag_table[type] = h;
+ err = 0;
+ }
+ mutex_unlock(&inet_diag_table_mutex);
+out:
+ return err;
+}
+EXPORT_SYMBOL_GPL(inet_diag_register);
+
+void inet_diag_unregister(const struct inet_diag_handler *h)
+{
+ const __u16 type = h->idiag_type;
+
+ if (type >= IPPROTO_MAX)
+ return;
+
+ mutex_lock(&inet_diag_table_mutex);
+ inet_diag_table[type] = NULL;
+ mutex_unlock(&inet_diag_table_mutex);
+}
+EXPORT_SYMBOL_GPL(inet_diag_unregister);
+
+static int __init inet_diag_init(void)
+{
+ const int inet_diag_table_size = (IPPROTO_MAX *
+ sizeof(struct inet_diag_handler *));
+ int err = -ENOMEM;
+
+ inet_diag_table = kzalloc(inet_diag_table_size, GFP_KERNEL);
+ if (!inet_diag_table)
+ goto out;
+
+ err = sock_diag_register(&inet_diag_handler);
+ if (err)
+ goto out_free_nl;
+
+ err = sock_diag_register(&inet6_diag_handler);
+ if (err)
+ goto out_free_inet;
+
+ sock_diag_register_inet_compat(inet_diag_rcv_msg_compat);
+out:
+ return err;
+
+out_free_inet:
+ sock_diag_unregister(&inet_diag_handler);
+out_free_nl:
+ kfree(inet_diag_table);
+ goto out;
+}
+
+static void __exit inet_diag_exit(void)
+{
+ sock_diag_unregister(&inet6_diag_handler);
+ sock_diag_unregister(&inet_diag_handler);
+ sock_diag_unregister_inet_compat(inet_diag_rcv_msg_compat);
+ kfree(inet_diag_table);
+}
+
+module_init(inet_diag_init);
+module_exit(inet_diag_exit);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2 /* AF_INET */);
+MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 10 /* AF_INET6 */);
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
new file mode 100644
index 000000000..5e346a082
--- /dev/null
+++ b/net/ipv4/inet_fragment.c
@@ -0,0 +1,463 @@
+/*
+ * inet fragments management
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Pavel Emelyanov <xemul@openvz.org>
+ * Started as consolidation of ipv4/ip_fragment.c,
+ * ipv6/reassembly. and ipv6 nf conntrack reassembly
+ */
+
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/module.h>
+#include <linux/timer.h>
+#include <linux/mm.h>
+#include <linux/random.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <linux/slab.h>
+
+#include <net/sock.h>
+#include <net/inet_frag.h>
+#include <net/inet_ecn.h>
+
+#define INETFRAGS_EVICT_BUCKETS 128
+#define INETFRAGS_EVICT_MAX 512
+
+/* don't rebuild inetfrag table with new secret more often than this */
+#define INETFRAGS_MIN_REBUILD_INTERVAL (5 * HZ)
+
+/* Given the OR values of all fragments, apply RFC 3168 5.3 requirements
+ * Value : 0xff if frame should be dropped.
+ * 0 or INET_ECN_CE value, to be ORed in to final iph->tos field
+ */
+const u8 ip_frag_ecn_table[16] = {
+ /* at least one fragment had CE, and others ECT_0 or ECT_1 */
+ [IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0] = INET_ECN_CE,
+ [IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1] = INET_ECN_CE,
+ [IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = INET_ECN_CE,
+
+ /* invalid combinations : drop frame */
+ [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE] = 0xff,
+ [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0] = 0xff,
+ [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_1] = 0xff,
+ [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff,
+ [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0] = 0xff,
+ [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1] = 0xff,
+ [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff,
+};
+EXPORT_SYMBOL(ip_frag_ecn_table);
+
+static unsigned int
+inet_frag_hashfn(const struct inet_frags *f, const struct inet_frag_queue *q)
+{
+ return f->hashfn(q) & (INETFRAGS_HASHSZ - 1);
+}
+
+static bool inet_frag_may_rebuild(struct inet_frags *f)
+{
+ return time_after(jiffies,
+ f->last_rebuild_jiffies + INETFRAGS_MIN_REBUILD_INTERVAL);
+}
+
+static void inet_frag_secret_rebuild(struct inet_frags *f)
+{
+ int i;
+
+ write_seqlock_bh(&f->rnd_seqlock);
+
+ if (!inet_frag_may_rebuild(f))
+ goto out;
+
+ get_random_bytes(&f->rnd, sizeof(u32));
+
+ for (i = 0; i < INETFRAGS_HASHSZ; i++) {
+ struct inet_frag_bucket *hb;
+ struct inet_frag_queue *q;
+ struct hlist_node *n;
+
+ hb = &f->hash[i];
+ spin_lock(&hb->chain_lock);
+
+ hlist_for_each_entry_safe(q, n, &hb->chain, list) {
+ unsigned int hval = inet_frag_hashfn(f, q);
+
+ if (hval != i) {
+ struct inet_frag_bucket *hb_dest;
+
+ hlist_del(&q->list);
+
+ /* Relink to new hash chain. */
+ hb_dest = &f->hash[hval];
+
+ /* This is the only place where we take
+ * another chain_lock while already holding
+ * one. As this will not run concurrently,
+ * we cannot deadlock on hb_dest lock below, if its
+ * already locked it will be released soon since
+ * other caller cannot be waiting for hb lock
+ * that we've taken above.
+ */
+ spin_lock_nested(&hb_dest->chain_lock,
+ SINGLE_DEPTH_NESTING);
+ hlist_add_head(&q->list, &hb_dest->chain);
+ spin_unlock(&hb_dest->chain_lock);
+ }
+ }
+ spin_unlock(&hb->chain_lock);
+ }
+
+ f->rebuild = false;
+ f->last_rebuild_jiffies = jiffies;
+out:
+ write_sequnlock_bh(&f->rnd_seqlock);
+}
+
+static bool inet_fragq_should_evict(const struct inet_frag_queue *q)
+{
+ return q->net->low_thresh == 0 ||
+ frag_mem_limit(q->net) >= q->net->low_thresh;
+}
+
+static unsigned int
+inet_evict_bucket(struct inet_frags *f, struct inet_frag_bucket *hb)
+{
+ struct inet_frag_queue *fq;
+ struct hlist_node *n;
+ unsigned int evicted = 0;
+ HLIST_HEAD(expired);
+
+evict_again:
+ spin_lock(&hb->chain_lock);
+
+ hlist_for_each_entry_safe(fq, n, &hb->chain, list) {
+ if (!inet_fragq_should_evict(fq))
+ continue;
+
+ if (!del_timer(&fq->timer)) {
+ /* q expiring right now thus increment its refcount so
+ * it won't be freed under us and wait until the timer
+ * has finished executing then destroy it
+ */
+ atomic_inc(&fq->refcnt);
+ spin_unlock(&hb->chain_lock);
+ del_timer_sync(&fq->timer);
+ inet_frag_put(fq, f);
+ goto evict_again;
+ }
+
+ fq->flags |= INET_FRAG_EVICTED;
+ hlist_del(&fq->list);
+ hlist_add_head(&fq->list, &expired);
+ ++evicted;
+ }
+
+ spin_unlock(&hb->chain_lock);
+
+ hlist_for_each_entry_safe(fq, n, &expired, list)
+ f->frag_expire((unsigned long) fq);
+
+ return evicted;
+}
+
+static void inet_frag_worker(struct work_struct *work)
+{
+ unsigned int budget = INETFRAGS_EVICT_BUCKETS;
+ unsigned int i, evicted = 0;
+ struct inet_frags *f;
+
+ f = container_of(work, struct inet_frags, frags_work);
+
+ BUILD_BUG_ON(INETFRAGS_EVICT_BUCKETS >= INETFRAGS_HASHSZ);
+
+ local_bh_disable();
+
+ for (i = ACCESS_ONCE(f->next_bucket); budget; --budget) {
+ evicted += inet_evict_bucket(f, &f->hash[i]);
+ i = (i + 1) & (INETFRAGS_HASHSZ - 1);
+ if (evicted > INETFRAGS_EVICT_MAX)
+ break;
+ }
+
+ f->next_bucket = i;
+
+ local_bh_enable();
+
+ if (f->rebuild && inet_frag_may_rebuild(f))
+ inet_frag_secret_rebuild(f);
+}
+
+static void inet_frag_schedule_worker(struct inet_frags *f)
+{
+ if (unlikely(!work_pending(&f->frags_work)))
+ schedule_work(&f->frags_work);
+}
+
+int inet_frags_init(struct inet_frags *f)
+{
+ int i;
+
+ INIT_WORK(&f->frags_work, inet_frag_worker);
+
+ for (i = 0; i < INETFRAGS_HASHSZ; i++) {
+ struct inet_frag_bucket *hb = &f->hash[i];
+
+ spin_lock_init(&hb->chain_lock);
+ INIT_HLIST_HEAD(&hb->chain);
+ }
+
+ seqlock_init(&f->rnd_seqlock);
+ f->last_rebuild_jiffies = 0;
+ f->frags_cachep = kmem_cache_create(f->frags_cache_name, f->qsize, 0, 0,
+ NULL);
+ if (!f->frags_cachep)
+ return -ENOMEM;
+
+ return 0;
+}
+EXPORT_SYMBOL(inet_frags_init);
+
+void inet_frags_init_net(struct netns_frags *nf)
+{
+ init_frag_mem_limit(nf);
+}
+EXPORT_SYMBOL(inet_frags_init_net);
+
+void inet_frags_fini(struct inet_frags *f)
+{
+ cancel_work_sync(&f->frags_work);
+ kmem_cache_destroy(f->frags_cachep);
+}
+EXPORT_SYMBOL(inet_frags_fini);
+
+void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f)
+{
+ unsigned int seq;
+ int i;
+
+ nf->low_thresh = 0;
+ local_bh_disable();
+
+evict_again:
+ seq = read_seqbegin(&f->rnd_seqlock);
+
+ for (i = 0; i < INETFRAGS_HASHSZ ; i++)
+ inet_evict_bucket(f, &f->hash[i]);
+
+ if (read_seqretry(&f->rnd_seqlock, seq))
+ goto evict_again;
+
+ local_bh_enable();
+
+ percpu_counter_destroy(&nf->mem);
+}
+EXPORT_SYMBOL(inet_frags_exit_net);
+
+static struct inet_frag_bucket *
+get_frag_bucket_locked(struct inet_frag_queue *fq, struct inet_frags *f)
+__acquires(hb->chain_lock)
+{
+ struct inet_frag_bucket *hb;
+ unsigned int seq, hash;
+
+ restart:
+ seq = read_seqbegin(&f->rnd_seqlock);
+
+ hash = inet_frag_hashfn(f, fq);
+ hb = &f->hash[hash];
+
+ spin_lock(&hb->chain_lock);
+ if (read_seqretry(&f->rnd_seqlock, seq)) {
+ spin_unlock(&hb->chain_lock);
+ goto restart;
+ }
+
+ return hb;
+}
+
+static inline void fq_unlink(struct inet_frag_queue *fq, struct inet_frags *f)
+{
+ struct inet_frag_bucket *hb;
+
+ hb = get_frag_bucket_locked(fq, f);
+ if (!(fq->flags & INET_FRAG_EVICTED))
+ hlist_del(&fq->list);
+ spin_unlock(&hb->chain_lock);
+}
+
+void inet_frag_kill(struct inet_frag_queue *fq, struct inet_frags *f)
+{
+ if (del_timer(&fq->timer))
+ atomic_dec(&fq->refcnt);
+
+ if (!(fq->flags & INET_FRAG_COMPLETE)) {
+ fq_unlink(fq, f);
+ atomic_dec(&fq->refcnt);
+ fq->flags |= INET_FRAG_COMPLETE;
+ }
+}
+EXPORT_SYMBOL(inet_frag_kill);
+
+static inline void frag_kfree_skb(struct netns_frags *nf, struct inet_frags *f,
+ struct sk_buff *skb)
+{
+ if (f->skb_free)
+ f->skb_free(skb);
+ kfree_skb(skb);
+}
+
+void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f)
+{
+ struct sk_buff *fp;
+ struct netns_frags *nf;
+ unsigned int sum, sum_truesize = 0;
+
+ WARN_ON(!(q->flags & INET_FRAG_COMPLETE));
+ WARN_ON(del_timer(&q->timer) != 0);
+
+ /* Release all fragment data. */
+ fp = q->fragments;
+ nf = q->net;
+ while (fp) {
+ struct sk_buff *xp = fp->next;
+
+ sum_truesize += fp->truesize;
+ frag_kfree_skb(nf, f, fp);
+ fp = xp;
+ }
+ sum = sum_truesize + f->qsize;
+ sub_frag_mem_limit(q, sum);
+
+ if (f->destructor)
+ f->destructor(q);
+ kmem_cache_free(f->frags_cachep, q);
+}
+EXPORT_SYMBOL(inet_frag_destroy);
+
+static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf,
+ struct inet_frag_queue *qp_in,
+ struct inet_frags *f,
+ void *arg)
+{
+ struct inet_frag_bucket *hb = get_frag_bucket_locked(qp_in, f);
+ struct inet_frag_queue *qp;
+
+#ifdef CONFIG_SMP
+ /* With SMP race we have to recheck hash table, because
+ * such entry could have been created on other cpu before
+ * we acquired hash bucket lock.
+ */
+ hlist_for_each_entry(qp, &hb->chain, list) {
+ if (qp->net == nf && f->match(qp, arg)) {
+ atomic_inc(&qp->refcnt);
+ spin_unlock(&hb->chain_lock);
+ qp_in->flags |= INET_FRAG_COMPLETE;
+ inet_frag_put(qp_in, f);
+ return qp;
+ }
+ }
+#endif
+ qp = qp_in;
+ if (!mod_timer(&qp->timer, jiffies + nf->timeout))
+ atomic_inc(&qp->refcnt);
+
+ atomic_inc(&qp->refcnt);
+ hlist_add_head(&qp->list, &hb->chain);
+
+ spin_unlock(&hb->chain_lock);
+
+ return qp;
+}
+
+static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
+ struct inet_frags *f,
+ void *arg)
+{
+ struct inet_frag_queue *q;
+
+ if (frag_mem_limit(nf) > nf->high_thresh) {
+ inet_frag_schedule_worker(f);
+ return NULL;
+ }
+
+ q = kmem_cache_zalloc(f->frags_cachep, GFP_ATOMIC);
+ if (!q)
+ return NULL;
+
+ q->net = nf;
+ f->constructor(q, arg);
+ add_frag_mem_limit(q, f->qsize);
+
+ setup_timer(&q->timer, f->frag_expire, (unsigned long)q);
+ spin_lock_init(&q->lock);
+ atomic_set(&q->refcnt, 1);
+
+ return q;
+}
+
+static struct inet_frag_queue *inet_frag_create(struct netns_frags *nf,
+ struct inet_frags *f,
+ void *arg)
+{
+ struct inet_frag_queue *q;
+
+ q = inet_frag_alloc(nf, f, arg);
+ if (!q)
+ return NULL;
+
+ return inet_frag_intern(nf, q, f, arg);
+}
+
+struct inet_frag_queue *inet_frag_find(struct netns_frags *nf,
+ struct inet_frags *f, void *key,
+ unsigned int hash)
+{
+ struct inet_frag_bucket *hb;
+ struct inet_frag_queue *q;
+ int depth = 0;
+
+ if (frag_mem_limit(nf) > nf->low_thresh)
+ inet_frag_schedule_worker(f);
+
+ hash &= (INETFRAGS_HASHSZ - 1);
+ hb = &f->hash[hash];
+
+ spin_lock(&hb->chain_lock);
+ hlist_for_each_entry(q, &hb->chain, list) {
+ if (q->net == nf && f->match(q, key)) {
+ atomic_inc(&q->refcnt);
+ spin_unlock(&hb->chain_lock);
+ return q;
+ }
+ depth++;
+ }
+ spin_unlock(&hb->chain_lock);
+
+ if (depth <= INETFRAGS_MAXDEPTH)
+ return inet_frag_create(nf, f, key);
+
+ if (inet_frag_may_rebuild(f)) {
+ if (!f->rebuild)
+ f->rebuild = true;
+ inet_frag_schedule_worker(f);
+ }
+
+ return ERR_PTR(-ENOBUFS);
+}
+EXPORT_SYMBOL(inet_frag_find);
+
+void inet_frag_maybe_warn_overflow(struct inet_frag_queue *q,
+ const char *prefix)
+{
+ static const char msg[] = "inet_frag_find: Fragment hash bucket"
+ " list length grew over limit " __stringify(INETFRAGS_MAXDEPTH)
+ ". Dropping fragment.\n";
+
+ if (PTR_ERR(q) == -ENOBUFS)
+ net_dbg_ratelimited("%s%s", prefix, msg);
+}
+EXPORT_SYMBOL(inet_frag_maybe_warn_overflow);
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
new file mode 100644
index 000000000..c6fb80bd5
--- /dev/null
+++ b/net/ipv4/inet_hashtables.c
@@ -0,0 +1,618 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * Generic INET transport hashtables
+ *
+ * Authors: Lotsa people, from code originally in tcp
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/random.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/wait.h>
+
+#include <net/inet_connection_sock.h>
+#include <net/inet_hashtables.h>
+#include <net/secure_seq.h>
+#include <net/ip.h>
+
+static u32 inet_ehashfn(const struct net *net, const __be32 laddr,
+ const __u16 lport, const __be32 faddr,
+ const __be16 fport)
+{
+ static u32 inet_ehash_secret __read_mostly;
+
+ net_get_random_once(&inet_ehash_secret, sizeof(inet_ehash_secret));
+
+ return __inet_ehashfn(laddr, lport, faddr, fport,
+ inet_ehash_secret + net_hash_mix(net));
+}
+
+/* This function handles inet_sock, but also timewait and request sockets
+ * for IPv4/IPv6.
+ */
+u32 sk_ehashfn(const struct sock *sk)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+ if (sk->sk_family == AF_INET6 &&
+ !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
+ return inet6_ehashfn(sock_net(sk),
+ &sk->sk_v6_rcv_saddr, sk->sk_num,
+ &sk->sk_v6_daddr, sk->sk_dport);
+#endif
+ return inet_ehashfn(sock_net(sk),
+ sk->sk_rcv_saddr, sk->sk_num,
+ sk->sk_daddr, sk->sk_dport);
+}
+
+/*
+ * Allocate and initialize a new local port bind bucket.
+ * The bindhash mutex for snum's hash chain must be held here.
+ */
+struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
+ struct net *net,
+ struct inet_bind_hashbucket *head,
+ const unsigned short snum)
+{
+ struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC);
+
+ if (tb) {
+ write_pnet(&tb->ib_net, net);
+ tb->port = snum;
+ tb->fastreuse = 0;
+ tb->fastreuseport = 0;
+ tb->num_owners = 0;
+ INIT_HLIST_HEAD(&tb->owners);
+ hlist_add_head(&tb->node, &head->chain);
+ }
+ return tb;
+}
+
+/*
+ * Caller must hold hashbucket lock for this tb with local BH disabled
+ */
+void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket *tb)
+{
+ if (hlist_empty(&tb->owners)) {
+ __hlist_del(&tb->node);
+ kmem_cache_free(cachep, tb);
+ }
+}
+
+void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
+ const unsigned short snum)
+{
+ struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
+
+ atomic_inc(&hashinfo->bsockets);
+
+ inet_sk(sk)->inet_num = snum;
+ sk_add_bind_node(sk, &tb->owners);
+ tb->num_owners++;
+ inet_csk(sk)->icsk_bind_hash = tb;
+}
+
+/*
+ * Get rid of any references to a local port held by the given sock.
+ */
+static void __inet_put_port(struct sock *sk)
+{
+ struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
+ const int bhash = inet_bhashfn(sock_net(sk), inet_sk(sk)->inet_num,
+ hashinfo->bhash_size);
+ struct inet_bind_hashbucket *head = &hashinfo->bhash[bhash];
+ struct inet_bind_bucket *tb;
+
+ atomic_dec(&hashinfo->bsockets);
+
+ spin_lock(&head->lock);
+ tb = inet_csk(sk)->icsk_bind_hash;
+ __sk_del_bind_node(sk);
+ tb->num_owners--;
+ inet_csk(sk)->icsk_bind_hash = NULL;
+ inet_sk(sk)->inet_num = 0;
+ inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
+ spin_unlock(&head->lock);
+}
+
+void inet_put_port(struct sock *sk)
+{
+ local_bh_disable();
+ __inet_put_port(sk);
+ local_bh_enable();
+}
+EXPORT_SYMBOL(inet_put_port);
+
+int __inet_inherit_port(struct sock *sk, struct sock *child)
+{
+ struct inet_hashinfo *table = sk->sk_prot->h.hashinfo;
+ unsigned short port = inet_sk(child)->inet_num;
+ const int bhash = inet_bhashfn(sock_net(sk), port,
+ table->bhash_size);
+ struct inet_bind_hashbucket *head = &table->bhash[bhash];
+ struct inet_bind_bucket *tb;
+
+ spin_lock(&head->lock);
+ tb = inet_csk(sk)->icsk_bind_hash;
+ if (tb->port != port) {
+ /* NOTE: using tproxy and redirecting skbs to a proxy
+ * on a different listener port breaks the assumption
+ * that the listener socket's icsk_bind_hash is the same
+ * as that of the child socket. We have to look up or
+ * create a new bind bucket for the child here. */
+ inet_bind_bucket_for_each(tb, &head->chain) {
+ if (net_eq(ib_net(tb), sock_net(sk)) &&
+ tb->port == port)
+ break;
+ }
+ if (!tb) {
+ tb = inet_bind_bucket_create(table->bind_bucket_cachep,
+ sock_net(sk), head, port);
+ if (!tb) {
+ spin_unlock(&head->lock);
+ return -ENOMEM;
+ }
+ }
+ }
+ inet_bind_hash(child, tb, port);
+ spin_unlock(&head->lock);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(__inet_inherit_port);
+
+static inline int compute_score(struct sock *sk, struct net *net,
+ const unsigned short hnum, const __be32 daddr,
+ const int dif)
+{
+ int score = -1;
+ struct inet_sock *inet = inet_sk(sk);
+
+ if (net_eq(sock_net(sk), net) && inet->inet_num == hnum &&
+ !ipv6_only_sock(sk)) {
+ __be32 rcv_saddr = inet->inet_rcv_saddr;
+ score = sk->sk_family == PF_INET ? 2 : 1;
+ if (rcv_saddr) {
+ if (rcv_saddr != daddr)
+ return -1;
+ score += 4;
+ }
+ if (sk->sk_bound_dev_if) {
+ if (sk->sk_bound_dev_if != dif)
+ return -1;
+ score += 4;
+ }
+ }
+ return score;
+}
+
+/*
+ * Don't inline this cruft. Here are some nice properties to exploit here. The
+ * BSD API does not allow a listening sock to specify the remote port nor the
+ * remote address for the connection. So always assume those are both
+ * wildcarded during the search since they can never be otherwise.
+ */
+
+
+struct sock *__inet_lookup_listener(struct net *net,
+ struct inet_hashinfo *hashinfo,
+ const __be32 saddr, __be16 sport,
+ const __be32 daddr, const unsigned short hnum,
+ const int dif)
+{
+ struct sock *sk, *result;
+ struct hlist_nulls_node *node;
+ unsigned int hash = inet_lhashfn(net, hnum);
+ struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash];
+ int score, hiscore, matches = 0, reuseport = 0;
+ u32 phash = 0;
+
+ rcu_read_lock();
+begin:
+ result = NULL;
+ hiscore = 0;
+ sk_nulls_for_each_rcu(sk, node, &ilb->head) {
+ score = compute_score(sk, net, hnum, daddr, dif);
+ if (score > hiscore) {
+ result = sk;
+ hiscore = score;
+ reuseport = sk->sk_reuseport;
+ if (reuseport) {
+ phash = inet_ehashfn(net, daddr, hnum,
+ saddr, sport);
+ matches = 1;
+ }
+ } else if (score == hiscore && reuseport) {
+ matches++;
+ if (reciprocal_scale(phash, matches) == 0)
+ result = sk;
+ phash = next_pseudo_random32(phash);
+ }
+ }
+ /*
+ * if the nulls value we got at the end of this lookup is
+ * not the expected one, we must restart lookup.
+ * We probably met an item that was moved to another chain.
+ */
+ if (get_nulls_value(node) != hash + LISTENING_NULLS_BASE)
+ goto begin;
+ if (result) {
+ if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt)))
+ result = NULL;
+ else if (unlikely(compute_score(result, net, hnum, daddr,
+ dif) < hiscore)) {
+ sock_put(result);
+ goto begin;
+ }
+ }
+ rcu_read_unlock();
+ return result;
+}
+EXPORT_SYMBOL_GPL(__inet_lookup_listener);
+
+/* All sockets share common refcount, but have different destructors */
+void sock_gen_put(struct sock *sk)
+{
+ if (!atomic_dec_and_test(&sk->sk_refcnt))
+ return;
+
+ if (sk->sk_state == TCP_TIME_WAIT)
+ inet_twsk_free(inet_twsk(sk));
+ else if (sk->sk_state == TCP_NEW_SYN_RECV)
+ reqsk_free(inet_reqsk(sk));
+ else
+ sk_free(sk);
+}
+EXPORT_SYMBOL_GPL(sock_gen_put);
+
+void sock_edemux(struct sk_buff *skb)
+{
+ sock_gen_put(skb->sk);
+}
+EXPORT_SYMBOL(sock_edemux);
+
+struct sock *__inet_lookup_established(struct net *net,
+ struct inet_hashinfo *hashinfo,
+ const __be32 saddr, const __be16 sport,
+ const __be32 daddr, const u16 hnum,
+ const int dif)
+{
+ INET_ADDR_COOKIE(acookie, saddr, daddr);
+ const __portpair ports = INET_COMBINED_PORTS(sport, hnum);
+ struct sock *sk;
+ const struct hlist_nulls_node *node;
+ /* Optimize here for direct hit, only listening connections can
+ * have wildcards anyways.
+ */
+ unsigned int hash = inet_ehashfn(net, daddr, hnum, saddr, sport);
+ unsigned int slot = hash & hashinfo->ehash_mask;
+ struct inet_ehash_bucket *head = &hashinfo->ehash[slot];
+
+ rcu_read_lock();
+begin:
+ sk_nulls_for_each_rcu(sk, node, &head->chain) {
+ if (sk->sk_hash != hash)
+ continue;
+ if (likely(INET_MATCH(sk, net, acookie,
+ saddr, daddr, ports, dif))) {
+ if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt)))
+ goto out;
+ if (unlikely(!INET_MATCH(sk, net, acookie,
+ saddr, daddr, ports, dif))) {
+ sock_gen_put(sk);
+ goto begin;
+ }
+ goto found;
+ }
+ }
+ /*
+ * if the nulls value we got at the end of this lookup is
+ * not the expected one, we must restart lookup.
+ * We probably met an item that was moved to another chain.
+ */
+ if (get_nulls_value(node) != slot)
+ goto begin;
+out:
+ sk = NULL;
+found:
+ rcu_read_unlock();
+ return sk;
+}
+EXPORT_SYMBOL_GPL(__inet_lookup_established);
+
+/* called with local bh disabled */
+static int __inet_check_established(struct inet_timewait_death_row *death_row,
+ struct sock *sk, __u16 lport,
+ struct inet_timewait_sock **twp)
+{
+ struct inet_hashinfo *hinfo = death_row->hashinfo;
+ struct inet_sock *inet = inet_sk(sk);
+ __be32 daddr = inet->inet_rcv_saddr;
+ __be32 saddr = inet->inet_daddr;
+ int dif = sk->sk_bound_dev_if;
+ INET_ADDR_COOKIE(acookie, saddr, daddr);
+ const __portpair ports = INET_COMBINED_PORTS(inet->inet_dport, lport);
+ struct net *net = sock_net(sk);
+ unsigned int hash = inet_ehashfn(net, daddr, lport,
+ saddr, inet->inet_dport);
+ struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash);
+ spinlock_t *lock = inet_ehash_lockp(hinfo, hash);
+ struct sock *sk2;
+ const struct hlist_nulls_node *node;
+ struct inet_timewait_sock *tw = NULL;
+ int twrefcnt = 0;
+
+ spin_lock(lock);
+
+ sk_nulls_for_each(sk2, node, &head->chain) {
+ if (sk2->sk_hash != hash)
+ continue;
+
+ if (likely(INET_MATCH(sk2, net, acookie,
+ saddr, daddr, ports, dif))) {
+ if (sk2->sk_state == TCP_TIME_WAIT) {
+ tw = inet_twsk(sk2);
+ if (twsk_unique(sk, sk2, twp))
+ break;
+ }
+ goto not_unique;
+ }
+ }
+
+ /* Must record num and sport now. Otherwise we will see
+ * in hash table socket with a funny identity.
+ */
+ inet->inet_num = lport;
+ inet->inet_sport = htons(lport);
+ sk->sk_hash = hash;
+ WARN_ON(!sk_unhashed(sk));
+ __sk_nulls_add_node_rcu(sk, &head->chain);
+ if (tw) {
+ twrefcnt = inet_twsk_unhash(tw);
+ NET_INC_STATS_BH(net, LINUX_MIB_TIMEWAITRECYCLED);
+ }
+ spin_unlock(lock);
+ if (twrefcnt)
+ inet_twsk_put(tw);
+ sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
+
+ if (twp) {
+ *twp = tw;
+ } else if (tw) {
+ /* Silly. Should hash-dance instead... */
+ inet_twsk_deschedule(tw);
+
+ inet_twsk_put(tw);
+ }
+ return 0;
+
+not_unique:
+ spin_unlock(lock);
+ return -EADDRNOTAVAIL;
+}
+
+static inline u32 inet_sk_port_offset(const struct sock *sk)
+{
+ const struct inet_sock *inet = inet_sk(sk);
+ return secure_ipv4_port_ephemeral(inet->inet_rcv_saddr,
+ inet->inet_daddr,
+ inet->inet_dport);
+}
+
+int __inet_hash_nolisten(struct sock *sk, struct inet_timewait_sock *tw)
+{
+ struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
+ struct hlist_nulls_head *list;
+ struct inet_ehash_bucket *head;
+ spinlock_t *lock;
+ int twrefcnt = 0;
+
+ WARN_ON(!sk_unhashed(sk));
+
+ sk->sk_hash = sk_ehashfn(sk);
+ head = inet_ehash_bucket(hashinfo, sk->sk_hash);
+ list = &head->chain;
+ lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
+
+ spin_lock(lock);
+ __sk_nulls_add_node_rcu(sk, list);
+ if (tw) {
+ WARN_ON(sk->sk_hash != tw->tw_hash);
+ twrefcnt = inet_twsk_unhash(tw);
+ }
+ spin_unlock(lock);
+ sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
+ return twrefcnt;
+}
+EXPORT_SYMBOL_GPL(__inet_hash_nolisten);
+
+int __inet_hash(struct sock *sk, struct inet_timewait_sock *tw)
+{
+ struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
+ struct inet_listen_hashbucket *ilb;
+
+ if (sk->sk_state != TCP_LISTEN)
+ return __inet_hash_nolisten(sk, tw);
+
+ WARN_ON(!sk_unhashed(sk));
+ ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];
+
+ spin_lock(&ilb->lock);
+ __sk_nulls_add_node_rcu(sk, &ilb->head);
+ sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
+ spin_unlock(&ilb->lock);
+ return 0;
+}
+EXPORT_SYMBOL(__inet_hash);
+
+void inet_hash(struct sock *sk)
+{
+ if (sk->sk_state != TCP_CLOSE) {
+ local_bh_disable();
+ __inet_hash(sk, NULL);
+ local_bh_enable();
+ }
+}
+EXPORT_SYMBOL_GPL(inet_hash);
+
+void inet_unhash(struct sock *sk)
+{
+ struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
+ spinlock_t *lock;
+ int done;
+
+ if (sk_unhashed(sk))
+ return;
+
+ if (sk->sk_state == TCP_LISTEN)
+ lock = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)].lock;
+ else
+ lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
+
+ spin_lock_bh(lock);
+ done = __sk_nulls_del_node_init_rcu(sk);
+ if (done)
+ sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
+ spin_unlock_bh(lock);
+}
+EXPORT_SYMBOL_GPL(inet_unhash);
+
+int __inet_hash_connect(struct inet_timewait_death_row *death_row,
+ struct sock *sk, u32 port_offset,
+ int (*check_established)(struct inet_timewait_death_row *,
+ struct sock *, __u16, struct inet_timewait_sock **))
+{
+ struct inet_hashinfo *hinfo = death_row->hashinfo;
+ const unsigned short snum = inet_sk(sk)->inet_num;
+ struct inet_bind_hashbucket *head;
+ struct inet_bind_bucket *tb;
+ int ret;
+ struct net *net = sock_net(sk);
+ int twrefcnt = 1;
+
+ if (!snum) {
+ int i, remaining, low, high, port;
+ static u32 hint;
+ u32 offset = hint + port_offset;
+ struct inet_timewait_sock *tw = NULL;
+
+ inet_get_local_port_range(net, &low, &high);
+ remaining = (high - low) + 1;
+
+ local_bh_disable();
+ for (i = 1; i <= remaining; i++) {
+ port = low + (i + offset) % remaining;
+ if (inet_is_local_reserved_port(net, port))
+ continue;
+ head = &hinfo->bhash[inet_bhashfn(net, port,
+ hinfo->bhash_size)];
+ spin_lock(&head->lock);
+
+ /* Does not bother with rcv_saddr checks,
+ * because the established check is already
+ * unique enough.
+ */
+ inet_bind_bucket_for_each(tb, &head->chain) {
+ if (net_eq(ib_net(tb), net) &&
+ tb->port == port) {
+ if (tb->fastreuse >= 0 ||
+ tb->fastreuseport >= 0)
+ goto next_port;
+ WARN_ON(hlist_empty(&tb->owners));
+ if (!check_established(death_row, sk,
+ port, &tw))
+ goto ok;
+ goto next_port;
+ }
+ }
+
+ tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
+ net, head, port);
+ if (!tb) {
+ spin_unlock(&head->lock);
+ break;
+ }
+ tb->fastreuse = -1;
+ tb->fastreuseport = -1;
+ goto ok;
+
+ next_port:
+ spin_unlock(&head->lock);
+ }
+ local_bh_enable();
+
+ return -EADDRNOTAVAIL;
+
+ok:
+ hint += i;
+
+ /* Head lock still held and bh's disabled */
+ inet_bind_hash(sk, tb, port);
+ if (sk_unhashed(sk)) {
+ inet_sk(sk)->inet_sport = htons(port);
+ twrefcnt += __inet_hash_nolisten(sk, tw);
+ }
+ if (tw)
+ twrefcnt += inet_twsk_bind_unhash(tw, hinfo);
+ spin_unlock(&head->lock);
+
+ if (tw) {
+ inet_twsk_deschedule(tw);
+ while (twrefcnt) {
+ twrefcnt--;
+ inet_twsk_put(tw);
+ }
+ }
+
+ ret = 0;
+ goto out;
+ }
+
+ head = &hinfo->bhash[inet_bhashfn(net, snum, hinfo->bhash_size)];
+ tb = inet_csk(sk)->icsk_bind_hash;
+ spin_lock_bh(&head->lock);
+ if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
+ __inet_hash_nolisten(sk, NULL);
+ spin_unlock_bh(&head->lock);
+ return 0;
+ } else {
+ spin_unlock(&head->lock);
+ /* No definite answer... Walk to established hash table */
+ ret = check_established(death_row, sk, snum, NULL);
+out:
+ local_bh_enable();
+ return ret;
+ }
+}
+
+/*
+ * Bind a port for a connect operation and hash it.
+ */
+int inet_hash_connect(struct inet_timewait_death_row *death_row,
+ struct sock *sk)
+{
+ return __inet_hash_connect(death_row, sk, inet_sk_port_offset(sk),
+ __inet_check_established);
+}
+EXPORT_SYMBOL_GPL(inet_hash_connect);
+
+void inet_hashinfo_init(struct inet_hashinfo *h)
+{
+ int i;
+
+ atomic_set(&h->bsockets, 0);
+ for (i = 0; i < INET_LHTABLE_SIZE; i++) {
+ spin_lock_init(&h->listening_hash[i].lock);
+ INIT_HLIST_NULLS_HEAD(&h->listening_hash[i].head,
+ i + LISTENING_NULLS_BASE);
+ }
+}
+EXPORT_SYMBOL_GPL(inet_hashinfo_init);
diff --git a/net/ipv4/inet_lro.c b/net/ipv4/inet_lro.c
new file mode 100644
index 000000000..f17ea49b2
--- /dev/null
+++ b/net/ipv4/inet_lro.c
@@ -0,0 +1,374 @@
+/*
+ * linux/net/ipv4/inet_lro.c
+ *
+ * Large Receive Offload (ipv4 / tcp)
+ *
+ * (C) Copyright IBM Corp. 2007
+ *
+ * Authors:
+ * Jan-Bernd Themann <themann@de.ibm.com>
+ * Christoph Raisch <raisch@de.ibm.com>
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+
+#include <linux/module.h>
+#include <linux/if_vlan.h>
+#include <linux/inet_lro.h>
+#include <net/checksum.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Jan-Bernd Themann <themann@de.ibm.com>");
+MODULE_DESCRIPTION("Large Receive Offload (ipv4 / tcp)");
+
+#define TCP_HDR_LEN(tcph) (tcph->doff << 2)
+#define IP_HDR_LEN(iph) (iph->ihl << 2)
+#define TCP_PAYLOAD_LENGTH(iph, tcph) \
+ (ntohs(iph->tot_len) - IP_HDR_LEN(iph) - TCP_HDR_LEN(tcph))
+
+#define IPH_LEN_WO_OPTIONS 5
+#define TCPH_LEN_WO_OPTIONS 5
+#define TCPH_LEN_W_TIMESTAMP 8
+
+#define LRO_MAX_PG_HLEN 64
+
+#define LRO_INC_STATS(lro_mgr, attr) { lro_mgr->stats.attr++; }
+
+/*
+ * Basic tcp checks whether packet is suitable for LRO
+ */
+
+static int lro_tcp_ip_check(const struct iphdr *iph, const struct tcphdr *tcph,
+ int len, const struct net_lro_desc *lro_desc)
+{
+ /* check ip header: don't aggregate padded frames */
+ if (ntohs(iph->tot_len) != len)
+ return -1;
+
+ if (TCP_PAYLOAD_LENGTH(iph, tcph) == 0)
+ return -1;
+
+ if (iph->ihl != IPH_LEN_WO_OPTIONS)
+ return -1;
+
+ if (tcph->cwr || tcph->ece || tcph->urg || !tcph->ack ||
+ tcph->rst || tcph->syn || tcph->fin)
+ return -1;
+
+ if (INET_ECN_is_ce(ipv4_get_dsfield(iph)))
+ return -1;
+
+ if (tcph->doff != TCPH_LEN_WO_OPTIONS &&
+ tcph->doff != TCPH_LEN_W_TIMESTAMP)
+ return -1;
+
+ /* check tcp options (only timestamp allowed) */
+ if (tcph->doff == TCPH_LEN_W_TIMESTAMP) {
+ __be32 *topt = (__be32 *)(tcph + 1);
+
+ if (*topt != htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16)
+ | (TCPOPT_TIMESTAMP << 8)
+ | TCPOLEN_TIMESTAMP))
+ return -1;
+
+ /* timestamp should be in right order */
+ topt++;
+ if (lro_desc && after(ntohl(lro_desc->tcp_rcv_tsval),
+ ntohl(*topt)))
+ return -1;
+
+ /* timestamp reply should not be zero */
+ topt++;
+ if (*topt == 0)
+ return -1;
+ }
+
+ return 0;
+}
+
+static void lro_update_tcp_ip_header(struct net_lro_desc *lro_desc)
+{
+ struct iphdr *iph = lro_desc->iph;
+ struct tcphdr *tcph = lro_desc->tcph;
+ __be32 *p;
+ __wsum tcp_hdr_csum;
+
+ tcph->ack_seq = lro_desc->tcp_ack;
+ tcph->window = lro_desc->tcp_window;
+
+ if (lro_desc->tcp_saw_tstamp) {
+ p = (__be32 *)(tcph + 1);
+ *(p+2) = lro_desc->tcp_rcv_tsecr;
+ }
+
+ csum_replace2(&iph->check, iph->tot_len, htons(lro_desc->ip_tot_len));
+ iph->tot_len = htons(lro_desc->ip_tot_len);
+
+ tcph->check = 0;
+ tcp_hdr_csum = csum_partial(tcph, TCP_HDR_LEN(tcph), 0);
+ lro_desc->data_csum = csum_add(lro_desc->data_csum, tcp_hdr_csum);
+ tcph->check = csum_tcpudp_magic(iph->saddr, iph->daddr,
+ lro_desc->ip_tot_len -
+ IP_HDR_LEN(iph), IPPROTO_TCP,
+ lro_desc->data_csum);
+}
+
+static __wsum lro_tcp_data_csum(struct iphdr *iph, struct tcphdr *tcph, int len)
+{
+ __wsum tcp_csum;
+ __wsum tcp_hdr_csum;
+ __wsum tcp_ps_hdr_csum;
+
+ tcp_csum = ~csum_unfold(tcph->check);
+ tcp_hdr_csum = csum_partial(tcph, TCP_HDR_LEN(tcph), tcp_csum);
+
+ tcp_ps_hdr_csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
+ len + TCP_HDR_LEN(tcph),
+ IPPROTO_TCP, 0);
+
+ return csum_sub(csum_sub(tcp_csum, tcp_hdr_csum),
+ tcp_ps_hdr_csum);
+}
+
+static void lro_init_desc(struct net_lro_desc *lro_desc, struct sk_buff *skb,
+ struct iphdr *iph, struct tcphdr *tcph)
+{
+ int nr_frags;
+ __be32 *ptr;
+ u32 tcp_data_len = TCP_PAYLOAD_LENGTH(iph, tcph);
+
+ nr_frags = skb_shinfo(skb)->nr_frags;
+ lro_desc->parent = skb;
+ lro_desc->next_frag = &(skb_shinfo(skb)->frags[nr_frags]);
+ lro_desc->iph = iph;
+ lro_desc->tcph = tcph;
+ lro_desc->tcp_next_seq = ntohl(tcph->seq) + tcp_data_len;
+ lro_desc->tcp_ack = tcph->ack_seq;
+ lro_desc->tcp_window = tcph->window;
+
+ lro_desc->pkt_aggr_cnt = 1;
+ lro_desc->ip_tot_len = ntohs(iph->tot_len);
+
+ if (tcph->doff == 8) {
+ ptr = (__be32 *)(tcph+1);
+ lro_desc->tcp_saw_tstamp = 1;
+ lro_desc->tcp_rcv_tsval = *(ptr+1);
+ lro_desc->tcp_rcv_tsecr = *(ptr+2);
+ }
+
+ lro_desc->mss = tcp_data_len;
+ lro_desc->active = 1;
+
+ lro_desc->data_csum = lro_tcp_data_csum(iph, tcph,
+ tcp_data_len);
+}
+
+static inline void lro_clear_desc(struct net_lro_desc *lro_desc)
+{
+ memset(lro_desc, 0, sizeof(struct net_lro_desc));
+}
+
+static void lro_add_common(struct net_lro_desc *lro_desc, struct iphdr *iph,
+ struct tcphdr *tcph, int tcp_data_len)
+{
+ struct sk_buff *parent = lro_desc->parent;
+ __be32 *topt;
+
+ lro_desc->pkt_aggr_cnt++;
+ lro_desc->ip_tot_len += tcp_data_len;
+ lro_desc->tcp_next_seq += tcp_data_len;
+ lro_desc->tcp_window = tcph->window;
+ lro_desc->tcp_ack = tcph->ack_seq;
+
+ /* don't update tcp_rcv_tsval, would not work with PAWS */
+ if (lro_desc->tcp_saw_tstamp) {
+ topt = (__be32 *) (tcph + 1);
+ lro_desc->tcp_rcv_tsecr = *(topt + 2);
+ }
+
+ lro_desc->data_csum = csum_block_add(lro_desc->data_csum,
+ lro_tcp_data_csum(iph, tcph,
+ tcp_data_len),
+ parent->len);
+
+ parent->len += tcp_data_len;
+ parent->data_len += tcp_data_len;
+ if (tcp_data_len > lro_desc->mss)
+ lro_desc->mss = tcp_data_len;
+}
+
+static void lro_add_packet(struct net_lro_desc *lro_desc, struct sk_buff *skb,
+ struct iphdr *iph, struct tcphdr *tcph)
+{
+ struct sk_buff *parent = lro_desc->parent;
+ int tcp_data_len = TCP_PAYLOAD_LENGTH(iph, tcph);
+
+ lro_add_common(lro_desc, iph, tcph, tcp_data_len);
+
+ skb_pull(skb, (skb->len - tcp_data_len));
+ parent->truesize += skb->truesize;
+
+ if (lro_desc->last_skb)
+ lro_desc->last_skb->next = skb;
+ else
+ skb_shinfo(parent)->frag_list = skb;
+
+ lro_desc->last_skb = skb;
+}
+
+
+static int lro_check_tcp_conn(struct net_lro_desc *lro_desc,
+ struct iphdr *iph,
+ struct tcphdr *tcph)
+{
+ if ((lro_desc->iph->saddr != iph->saddr) ||
+ (lro_desc->iph->daddr != iph->daddr) ||
+ (lro_desc->tcph->source != tcph->source) ||
+ (lro_desc->tcph->dest != tcph->dest))
+ return -1;
+ return 0;
+}
+
+static struct net_lro_desc *lro_get_desc(struct net_lro_mgr *lro_mgr,
+ struct net_lro_desc *lro_arr,
+ struct iphdr *iph,
+ struct tcphdr *tcph)
+{
+ struct net_lro_desc *lro_desc = NULL;
+ struct net_lro_desc *tmp;
+ int max_desc = lro_mgr->max_desc;
+ int i;
+
+ for (i = 0; i < max_desc; i++) {
+ tmp = &lro_arr[i];
+ if (tmp->active)
+ if (!lro_check_tcp_conn(tmp, iph, tcph)) {
+ lro_desc = tmp;
+ goto out;
+ }
+ }
+
+ for (i = 0; i < max_desc; i++) {
+ if (!lro_arr[i].active) {
+ lro_desc = &lro_arr[i];
+ goto out;
+ }
+ }
+
+ LRO_INC_STATS(lro_mgr, no_desc);
+out:
+ return lro_desc;
+}
+
+static void lro_flush(struct net_lro_mgr *lro_mgr,
+ struct net_lro_desc *lro_desc)
+{
+ if (lro_desc->pkt_aggr_cnt > 1)
+ lro_update_tcp_ip_header(lro_desc);
+
+ skb_shinfo(lro_desc->parent)->gso_size = lro_desc->mss;
+
+ if (lro_mgr->features & LRO_F_NAPI)
+ netif_receive_skb(lro_desc->parent);
+ else
+ netif_rx(lro_desc->parent);
+
+ LRO_INC_STATS(lro_mgr, flushed);
+ lro_clear_desc(lro_desc);
+}
+
+static int __lro_proc_skb(struct net_lro_mgr *lro_mgr, struct sk_buff *skb,
+ void *priv)
+{
+ struct net_lro_desc *lro_desc;
+ struct iphdr *iph;
+ struct tcphdr *tcph;
+ u64 flags;
+ int vlan_hdr_len = 0;
+
+ if (!lro_mgr->get_skb_header ||
+ lro_mgr->get_skb_header(skb, (void *)&iph, (void *)&tcph,
+ &flags, priv))
+ goto out;
+
+ if (!(flags & LRO_IPV4) || !(flags & LRO_TCP))
+ goto out;
+
+ lro_desc = lro_get_desc(lro_mgr, lro_mgr->lro_arr, iph, tcph);
+ if (!lro_desc)
+ goto out;
+
+ if ((skb->protocol == htons(ETH_P_8021Q)) &&
+ !(lro_mgr->features & LRO_F_EXTRACT_VLAN_ID))
+ vlan_hdr_len = VLAN_HLEN;
+
+ if (!lro_desc->active) { /* start new lro session */
+ if (lro_tcp_ip_check(iph, tcph, skb->len - vlan_hdr_len, NULL))
+ goto out;
+
+ skb->ip_summed = lro_mgr->ip_summed_aggr;
+ lro_init_desc(lro_desc, skb, iph, tcph);
+ LRO_INC_STATS(lro_mgr, aggregated);
+ return 0;
+ }
+
+ if (lro_desc->tcp_next_seq != ntohl(tcph->seq))
+ goto out2;
+
+ if (lro_tcp_ip_check(iph, tcph, skb->len, lro_desc))
+ goto out2;
+
+ lro_add_packet(lro_desc, skb, iph, tcph);
+ LRO_INC_STATS(lro_mgr, aggregated);
+
+ if ((lro_desc->pkt_aggr_cnt >= lro_mgr->max_aggr) ||
+ lro_desc->parent->len > (0xFFFF - lro_mgr->dev->mtu))
+ lro_flush(lro_mgr, lro_desc);
+
+ return 0;
+
+out2: /* send aggregated SKBs to stack */
+ lro_flush(lro_mgr, lro_desc);
+
+out:
+ return 1;
+}
+
+void lro_receive_skb(struct net_lro_mgr *lro_mgr,
+ struct sk_buff *skb,
+ void *priv)
+{
+ if (__lro_proc_skb(lro_mgr, skb, priv)) {
+ if (lro_mgr->features & LRO_F_NAPI)
+ netif_receive_skb(skb);
+ else
+ netif_rx(skb);
+ }
+}
+EXPORT_SYMBOL(lro_receive_skb);
+
+void lro_flush_all(struct net_lro_mgr *lro_mgr)
+{
+ int i;
+ struct net_lro_desc *lro_desc = lro_mgr->lro_arr;
+
+ for (i = 0; i < lro_mgr->max_desc; i++) {
+ if (lro_desc[i].active)
+ lro_flush(lro_mgr, &lro_desc[i]);
+ }
+}
+EXPORT_SYMBOL(lro_flush_all);
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
new file mode 100644
index 000000000..00ec8d5d7
--- /dev/null
+++ b/net/ipv4/inet_timewait_sock.c
@@ -0,0 +1,328 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * Generic TIME_WAIT sockets functions
+ *
+ * From code orinally in TCP
+ */
+
+#include <linux/kernel.h>
+#include <linux/kmemcheck.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <net/inet_hashtables.h>
+#include <net/inet_timewait_sock.h>
+#include <net/ip.h>
+
+
+/**
+ * inet_twsk_unhash - unhash a timewait socket from established hash
+ * @tw: timewait socket
+ *
+ * unhash a timewait socket from established hash, if hashed.
+ * ehash lock must be held by caller.
+ * Returns 1 if caller should call inet_twsk_put() after lock release.
+ */
+int inet_twsk_unhash(struct inet_timewait_sock *tw)
+{
+ if (hlist_nulls_unhashed(&tw->tw_node))
+ return 0;
+
+ hlist_nulls_del_rcu(&tw->tw_node);
+ sk_nulls_node_init(&tw->tw_node);
+ /*
+ * We cannot call inet_twsk_put() ourself under lock,
+ * caller must call it for us.
+ */
+ return 1;
+}
+
+/**
+ * inet_twsk_bind_unhash - unhash a timewait socket from bind hash
+ * @tw: timewait socket
+ * @hashinfo: hashinfo pointer
+ *
+ * unhash a timewait socket from bind hash, if hashed.
+ * bind hash lock must be held by caller.
+ * Returns 1 if caller should call inet_twsk_put() after lock release.
+ */
+int inet_twsk_bind_unhash(struct inet_timewait_sock *tw,
+ struct inet_hashinfo *hashinfo)
+{
+ struct inet_bind_bucket *tb = tw->tw_tb;
+
+ if (!tb)
+ return 0;
+
+ __hlist_del(&tw->tw_bind_node);
+ tw->tw_tb = NULL;
+ inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
+ /*
+ * We cannot call inet_twsk_put() ourself under lock,
+ * caller must call it for us.
+ */
+ return 1;
+}
+
+/* Must be called with locally disabled BHs. */
+static void inet_twsk_kill(struct inet_timewait_sock *tw)
+{
+ struct inet_hashinfo *hashinfo = tw->tw_dr->hashinfo;
+ struct inet_bind_hashbucket *bhead;
+ int refcnt;
+ /* Unlink from established hashes. */
+ spinlock_t *lock = inet_ehash_lockp(hashinfo, tw->tw_hash);
+
+ spin_lock(lock);
+ refcnt = inet_twsk_unhash(tw);
+ spin_unlock(lock);
+
+ /* Disassociate with bind bucket. */
+ bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), tw->tw_num,
+ hashinfo->bhash_size)];
+
+ spin_lock(&bhead->lock);
+ refcnt += inet_twsk_bind_unhash(tw, hashinfo);
+ spin_unlock(&bhead->lock);
+
+ BUG_ON(refcnt >= atomic_read(&tw->tw_refcnt));
+ atomic_sub(refcnt, &tw->tw_refcnt);
+ atomic_dec(&tw->tw_dr->tw_count);
+ inet_twsk_put(tw);
+}
+
+void inet_twsk_free(struct inet_timewait_sock *tw)
+{
+ struct module *owner = tw->tw_prot->owner;
+ twsk_destructor((struct sock *)tw);
+#ifdef SOCK_REFCNT_DEBUG
+ pr_debug("%s timewait_sock %p released\n", tw->tw_prot->name, tw);
+#endif
+ kmem_cache_free(tw->tw_prot->twsk_prot->twsk_slab, tw);
+ module_put(owner);
+}
+
+void inet_twsk_put(struct inet_timewait_sock *tw)
+{
+ if (atomic_dec_and_test(&tw->tw_refcnt))
+ inet_twsk_free(tw);
+}
+EXPORT_SYMBOL_GPL(inet_twsk_put);
+
+static void inet_twsk_add_node_rcu(struct inet_timewait_sock *tw,
+ struct hlist_nulls_head *list)
+{
+ hlist_nulls_add_head_rcu(&tw->tw_node, list);
+}
+
+static void inet_twsk_add_bind_node(struct inet_timewait_sock *tw,
+ struct hlist_head *list)
+{
+ hlist_add_head(&tw->tw_bind_node, list);
+}
+
+/*
+ * Enter the time wait state. This is called with locally disabled BH.
+ * Essentially we whip up a timewait bucket, copy the relevant info into it
+ * from the SK, and mess with hash chains and list linkage.
+ */
+void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
+ struct inet_hashinfo *hashinfo)
+{
+ const struct inet_sock *inet = inet_sk(sk);
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+ struct inet_ehash_bucket *ehead = inet_ehash_bucket(hashinfo, sk->sk_hash);
+ spinlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
+ struct inet_bind_hashbucket *bhead;
+ /* Step 1: Put TW into bind hash. Original socket stays there too.
+ Note, that any socket with inet->num != 0 MUST be bound in
+ binding cache, even if it is closed.
+ */
+ bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), inet->inet_num,
+ hashinfo->bhash_size)];
+ spin_lock(&bhead->lock);
+ tw->tw_tb = icsk->icsk_bind_hash;
+ WARN_ON(!icsk->icsk_bind_hash);
+ inet_twsk_add_bind_node(tw, &tw->tw_tb->owners);
+ spin_unlock(&bhead->lock);
+
+ spin_lock(lock);
+
+ /*
+ * Step 2: Hash TW into tcp ehash chain.
+ * Notes :
+ * - tw_refcnt is set to 3 because :
+ * - We have one reference from bhash chain.
+ * - We have one reference from ehash chain.
+ * We can use atomic_set() because prior spin_lock()/spin_unlock()
+ * committed into memory all tw fields.
+ */
+ atomic_set(&tw->tw_refcnt, 1 + 1 + 1);
+ inet_twsk_add_node_rcu(tw, &ehead->chain);
+
+ /* Step 3: Remove SK from hash chain */
+ if (__sk_nulls_del_node_init_rcu(sk))
+ sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
+
+ spin_unlock(lock);
+}
+EXPORT_SYMBOL_GPL(__inet_twsk_hashdance);
+
+void tw_timer_handler(unsigned long data)
+{
+ struct inet_timewait_sock *tw = (struct inet_timewait_sock *)data;
+
+ if (tw->tw_kill)
+ NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_TIMEWAITKILLED);
+ else
+ NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_TIMEWAITED);
+ inet_twsk_kill(tw);
+}
+
+struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk,
+ struct inet_timewait_death_row *dr,
+ const int state)
+{
+ struct inet_timewait_sock *tw;
+
+ if (atomic_read(&dr->tw_count) >= dr->sysctl_max_tw_buckets)
+ return NULL;
+
+ tw = kmem_cache_alloc(sk->sk_prot_creator->twsk_prot->twsk_slab,
+ GFP_ATOMIC);
+ if (tw) {
+ const struct inet_sock *inet = inet_sk(sk);
+
+ kmemcheck_annotate_bitfield(tw, flags);
+
+ tw->tw_dr = dr;
+ /* Give us an identity. */
+ tw->tw_daddr = inet->inet_daddr;
+ tw->tw_rcv_saddr = inet->inet_rcv_saddr;
+ tw->tw_bound_dev_if = sk->sk_bound_dev_if;
+ tw->tw_tos = inet->tos;
+ tw->tw_num = inet->inet_num;
+ tw->tw_state = TCP_TIME_WAIT;
+ tw->tw_substate = state;
+ tw->tw_sport = inet->inet_sport;
+ tw->tw_dport = inet->inet_dport;
+ tw->tw_family = sk->sk_family;
+ tw->tw_reuse = sk->sk_reuse;
+ tw->tw_hash = sk->sk_hash;
+ tw->tw_ipv6only = 0;
+ tw->tw_transparent = inet->transparent;
+ tw->tw_prot = sk->sk_prot_creator;
+ atomic64_set(&tw->tw_cookie, atomic64_read(&sk->sk_cookie));
+ twsk_net_set(tw, sock_net(sk));
+ setup_timer(&tw->tw_timer, tw_timer_handler, (unsigned long)tw);
+ /*
+ * Because we use RCU lookups, we should not set tw_refcnt
+ * to a non null value before everything is setup for this
+ * timewait socket.
+ */
+ atomic_set(&tw->tw_refcnt, 0);
+
+ __module_get(tw->tw_prot->owner);
+ }
+
+ return tw;
+}
+EXPORT_SYMBOL_GPL(inet_twsk_alloc);
+
+/* These are always called from BH context. See callers in
+ * tcp_input.c to verify this.
+ */
+
+/* This is for handling early-kills of TIME_WAIT sockets. */
+void inet_twsk_deschedule(struct inet_timewait_sock *tw)
+{
+ if (del_timer_sync(&tw->tw_timer))
+ inet_twsk_kill(tw);
+}
+EXPORT_SYMBOL(inet_twsk_deschedule);
+
+void inet_twsk_schedule(struct inet_timewait_sock *tw, const int timeo)
+{
+ /* timeout := RTO * 3.5
+ *
+ * 3.5 = 1+2+0.5 to wait for two retransmits.
+ *
+ * RATIONALE: if FIN arrived and we entered TIME-WAIT state,
+ * our ACK acking that FIN can be lost. If N subsequent retransmitted
+ * FINs (or previous seqments) are lost (probability of such event
+ * is p^(N+1), where p is probability to lose single packet and
+ * time to detect the loss is about RTO*(2^N - 1) with exponential
+ * backoff). Normal timewait length is calculated so, that we
+ * waited at least for one retransmitted FIN (maximal RTO is 120sec).
+ * [ BTW Linux. following BSD, violates this requirement waiting
+ * only for 60sec, we should wait at least for 240 secs.
+ * Well, 240 consumes too much of resources 8)
+ * ]
+ * This interval is not reduced to catch old duplicate and
+ * responces to our wandering segments living for two MSLs.
+ * However, if we use PAWS to detect
+ * old duplicates, we can reduce the interval to bounds required
+ * by RTO, rather than MSL. So, if peer understands PAWS, we
+ * kill tw bucket after 3.5*RTO (it is important that this number
+ * is greater than TS tick!) and detect old duplicates with help
+ * of PAWS.
+ */
+
+ tw->tw_kill = timeo <= 4*HZ;
+ if (!mod_timer_pinned(&tw->tw_timer, jiffies + timeo)) {
+ atomic_inc(&tw->tw_refcnt);
+ atomic_inc(&tw->tw_dr->tw_count);
+ }
+}
+EXPORT_SYMBOL_GPL(inet_twsk_schedule);
+
+void inet_twsk_purge(struct inet_hashinfo *hashinfo,
+ struct inet_timewait_death_row *twdr, int family)
+{
+ struct inet_timewait_sock *tw;
+ struct sock *sk;
+ struct hlist_nulls_node *node;
+ unsigned int slot;
+
+ for (slot = 0; slot <= hashinfo->ehash_mask; slot++) {
+ struct inet_ehash_bucket *head = &hashinfo->ehash[slot];
+restart_rcu:
+ cond_resched();
+ rcu_read_lock();
+restart:
+ sk_nulls_for_each_rcu(sk, node, &head->chain) {
+ if (sk->sk_state != TCP_TIME_WAIT)
+ continue;
+ tw = inet_twsk(sk);
+ if ((tw->tw_family != family) ||
+ atomic_read(&twsk_net(tw)->count))
+ continue;
+
+ if (unlikely(!atomic_inc_not_zero(&tw->tw_refcnt)))
+ continue;
+
+ if (unlikely((tw->tw_family != family) ||
+ atomic_read(&twsk_net(tw)->count))) {
+ inet_twsk_put(tw);
+ goto restart;
+ }
+
+ rcu_read_unlock();
+ local_bh_disable();
+ inet_twsk_deschedule(tw);
+ local_bh_enable();
+ inet_twsk_put(tw);
+ goto restart_rcu;
+ }
+ /* If the nulls value we got at the end of this lookup is
+ * not the expected one, we must restart lookup.
+ * We probably met an item that was moved to another chain.
+ */
+ if (get_nulls_value(node) != slot)
+ goto restart;
+ rcu_read_unlock();
+ }
+}
+EXPORT_SYMBOL_GPL(inet_twsk_purge);
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
new file mode 100644
index 000000000..241afd743
--- /dev/null
+++ b/net/ipv4/inetpeer.c
@@ -0,0 +1,558 @@
+/*
+ * INETPEER - A storage for permanent information about peers
+ *
+ * This source is covered by the GNU GPL, the same as all kernel sources.
+ *
+ * Authors: Andrey V. Savochkin <saw@msu.ru>
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/interrupt.h>
+#include <linux/spinlock.h>
+#include <linux/random.h>
+#include <linux/timer.h>
+#include <linux/time.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/net.h>
+#include <linux/workqueue.h>
+#include <net/ip.h>
+#include <net/inetpeer.h>
+#include <net/secure_seq.h>
+
+/*
+ * Theory of operations.
+ * We keep one entry for each peer IP address. The nodes contains long-living
+ * information about the peer which doesn't depend on routes.
+ *
+ * Nodes are removed only when reference counter goes to 0.
+ * When it's happened the node may be removed when a sufficient amount of
+ * time has been passed since its last use. The less-recently-used entry can
+ * also be removed if the pool is overloaded i.e. if the total amount of
+ * entries is greater-or-equal than the threshold.
+ *
+ * Node pool is organised as an AVL tree.
+ * Such an implementation has been chosen not just for fun. It's a way to
+ * prevent easy and efficient DoS attacks by creating hash collisions. A huge
+ * amount of long living nodes in a single hash slot would significantly delay
+ * lookups performed with disabled BHs.
+ *
+ * Serialisation issues.
+ * 1. Nodes may appear in the tree only with the pool lock held.
+ * 2. Nodes may disappear from the tree only with the pool lock held
+ * AND reference count being 0.
+ * 3. Global variable peer_total is modified under the pool lock.
+ * 4. struct inet_peer fields modification:
+ * avl_left, avl_right, avl_parent, avl_height: pool lock
+ * refcnt: atomically against modifications on other CPU;
+ * usually under some other lock to prevent node disappearing
+ * daddr: unchangeable
+ */
+
+static struct kmem_cache *peer_cachep __read_mostly;
+
+static LIST_HEAD(gc_list);
+static const int gc_delay = 60 * HZ;
+static struct delayed_work gc_work;
+static DEFINE_SPINLOCK(gc_lock);
+
+#define node_height(x) x->avl_height
+
+#define peer_avl_empty ((struct inet_peer *)&peer_fake_node)
+#define peer_avl_empty_rcu ((struct inet_peer __rcu __force *)&peer_fake_node)
+static const struct inet_peer peer_fake_node = {
+ .avl_left = peer_avl_empty_rcu,
+ .avl_right = peer_avl_empty_rcu,
+ .avl_height = 0
+};
+
+void inet_peer_base_init(struct inet_peer_base *bp)
+{
+ bp->root = peer_avl_empty_rcu;
+ seqlock_init(&bp->lock);
+ bp->total = 0;
+}
+EXPORT_SYMBOL_GPL(inet_peer_base_init);
+
+#define PEER_MAXDEPTH 40 /* sufficient for about 2^27 nodes */
+
+/* Exported for sysctl_net_ipv4. */
+int inet_peer_threshold __read_mostly = 65536 + 128; /* start to throw entries more
+ * aggressively at this stage */
+int inet_peer_minttl __read_mostly = 120 * HZ; /* TTL under high load: 120 sec */
+int inet_peer_maxttl __read_mostly = 10 * 60 * HZ; /* usual time to live: 10 min */
+
+static void inetpeer_gc_worker(struct work_struct *work)
+{
+ struct inet_peer *p, *n, *c;
+ struct list_head list;
+
+ spin_lock_bh(&gc_lock);
+ list_replace_init(&gc_list, &list);
+ spin_unlock_bh(&gc_lock);
+
+ if (list_empty(&list))
+ return;
+
+ list_for_each_entry_safe(p, n, &list, gc_list) {
+
+ if (need_resched())
+ cond_resched();
+
+ c = rcu_dereference_protected(p->avl_left, 1);
+ if (c != peer_avl_empty) {
+ list_add_tail(&c->gc_list, &list);
+ p->avl_left = peer_avl_empty_rcu;
+ }
+
+ c = rcu_dereference_protected(p->avl_right, 1);
+ if (c != peer_avl_empty) {
+ list_add_tail(&c->gc_list, &list);
+ p->avl_right = peer_avl_empty_rcu;
+ }
+
+ n = list_entry(p->gc_list.next, struct inet_peer, gc_list);
+
+ if (!atomic_read(&p->refcnt)) {
+ list_del(&p->gc_list);
+ kmem_cache_free(peer_cachep, p);
+ }
+ }
+
+ if (list_empty(&list))
+ return;
+
+ spin_lock_bh(&gc_lock);
+ list_splice(&list, &gc_list);
+ spin_unlock_bh(&gc_lock);
+
+ schedule_delayed_work(&gc_work, gc_delay);
+}
+
+/* Called from ip_output.c:ip_init */
+void __init inet_initpeers(void)
+{
+ struct sysinfo si;
+
+ /* Use the straight interface to information about memory. */
+ si_meminfo(&si);
+ /* The values below were suggested by Alexey Kuznetsov
+ * <kuznet@ms2.inr.ac.ru>. I don't have any opinion about the values
+ * myself. --SAW
+ */
+ if (si.totalram <= (32768*1024)/PAGE_SIZE)
+ inet_peer_threshold >>= 1; /* max pool size about 1MB on IA32 */
+ if (si.totalram <= (16384*1024)/PAGE_SIZE)
+ inet_peer_threshold >>= 1; /* about 512KB */
+ if (si.totalram <= (8192*1024)/PAGE_SIZE)
+ inet_peer_threshold >>= 2; /* about 128KB */
+
+ peer_cachep = kmem_cache_create("inet_peer_cache",
+ sizeof(struct inet_peer),
+ 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC,
+ NULL);
+
+ INIT_DEFERRABLE_WORK(&gc_work, inetpeer_gc_worker);
+}
+
+static int addr_compare(const struct inetpeer_addr *a,
+ const struct inetpeer_addr *b)
+{
+ int i, n = (a->family == AF_INET ? 1 : 4);
+
+ for (i = 0; i < n; i++) {
+ if (a->addr.a6[i] == b->addr.a6[i])
+ continue;
+ if ((__force u32)a->addr.a6[i] < (__force u32)b->addr.a6[i])
+ return -1;
+ return 1;
+ }
+
+ return 0;
+}
+
+#define rcu_deref_locked(X, BASE) \
+ rcu_dereference_protected(X, lockdep_is_held(&(BASE)->lock.lock))
+
+/*
+ * Called with local BH disabled and the pool lock held.
+ */
+#define lookup(_daddr, _stack, _base) \
+({ \
+ struct inet_peer *u; \
+ struct inet_peer __rcu **v; \
+ \
+ stackptr = _stack; \
+ *stackptr++ = &_base->root; \
+ for (u = rcu_deref_locked(_base->root, _base); \
+ u != peer_avl_empty;) { \
+ int cmp = addr_compare(_daddr, &u->daddr); \
+ if (cmp == 0) \
+ break; \
+ if (cmp == -1) \
+ v = &u->avl_left; \
+ else \
+ v = &u->avl_right; \
+ *stackptr++ = v; \
+ u = rcu_deref_locked(*v, _base); \
+ } \
+ u; \
+})
+
+/*
+ * Called with rcu_read_lock()
+ * Because we hold no lock against a writer, its quite possible we fall
+ * in an endless loop.
+ * But every pointer we follow is guaranteed to be valid thanks to RCU.
+ * We exit from this function if number of links exceeds PEER_MAXDEPTH
+ */
+static struct inet_peer *lookup_rcu(const struct inetpeer_addr *daddr,
+ struct inet_peer_base *base)
+{
+ struct inet_peer *u = rcu_dereference(base->root);
+ int count = 0;
+
+ while (u != peer_avl_empty) {
+ int cmp = addr_compare(daddr, &u->daddr);
+ if (cmp == 0) {
+ /* Before taking a reference, check if this entry was
+ * deleted (refcnt=-1)
+ */
+ if (!atomic_add_unless(&u->refcnt, 1, -1))
+ u = NULL;
+ return u;
+ }
+ if (cmp == -1)
+ u = rcu_dereference(u->avl_left);
+ else
+ u = rcu_dereference(u->avl_right);
+ if (unlikely(++count == PEER_MAXDEPTH))
+ break;
+ }
+ return NULL;
+}
+
+/* Called with local BH disabled and the pool lock held. */
+#define lookup_rightempty(start, base) \
+({ \
+ struct inet_peer *u; \
+ struct inet_peer __rcu **v; \
+ *stackptr++ = &start->avl_left; \
+ v = &start->avl_left; \
+ for (u = rcu_deref_locked(*v, base); \
+ u->avl_right != peer_avl_empty_rcu;) { \
+ v = &u->avl_right; \
+ *stackptr++ = v; \
+ u = rcu_deref_locked(*v, base); \
+ } \
+ u; \
+})
+
+/* Called with local BH disabled and the pool lock held.
+ * Variable names are the proof of operation correctness.
+ * Look into mm/map_avl.c for more detail description of the ideas.
+ */
+static void peer_avl_rebalance(struct inet_peer __rcu **stack[],
+ struct inet_peer __rcu ***stackend,
+ struct inet_peer_base *base)
+{
+ struct inet_peer __rcu **nodep;
+ struct inet_peer *node, *l, *r;
+ int lh, rh;
+
+ while (stackend > stack) {
+ nodep = *--stackend;
+ node = rcu_deref_locked(*nodep, base);
+ l = rcu_deref_locked(node->avl_left, base);
+ r = rcu_deref_locked(node->avl_right, base);
+ lh = node_height(l);
+ rh = node_height(r);
+ if (lh > rh + 1) { /* l: RH+2 */
+ struct inet_peer *ll, *lr, *lrl, *lrr;
+ int lrh;
+ ll = rcu_deref_locked(l->avl_left, base);
+ lr = rcu_deref_locked(l->avl_right, base);
+ lrh = node_height(lr);
+ if (lrh <= node_height(ll)) { /* ll: RH+1 */
+ RCU_INIT_POINTER(node->avl_left, lr); /* lr: RH or RH+1 */
+ RCU_INIT_POINTER(node->avl_right, r); /* r: RH */
+ node->avl_height = lrh + 1; /* RH+1 or RH+2 */
+ RCU_INIT_POINTER(l->avl_left, ll); /* ll: RH+1 */
+ RCU_INIT_POINTER(l->avl_right, node); /* node: RH+1 or RH+2 */
+ l->avl_height = node->avl_height + 1;
+ RCU_INIT_POINTER(*nodep, l);
+ } else { /* ll: RH, lr: RH+1 */
+ lrl = rcu_deref_locked(lr->avl_left, base);/* lrl: RH or RH-1 */
+ lrr = rcu_deref_locked(lr->avl_right, base);/* lrr: RH or RH-1 */
+ RCU_INIT_POINTER(node->avl_left, lrr); /* lrr: RH or RH-1 */
+ RCU_INIT_POINTER(node->avl_right, r); /* r: RH */
+ node->avl_height = rh + 1; /* node: RH+1 */
+ RCU_INIT_POINTER(l->avl_left, ll); /* ll: RH */
+ RCU_INIT_POINTER(l->avl_right, lrl); /* lrl: RH or RH-1 */
+ l->avl_height = rh + 1; /* l: RH+1 */
+ RCU_INIT_POINTER(lr->avl_left, l); /* l: RH+1 */
+ RCU_INIT_POINTER(lr->avl_right, node); /* node: RH+1 */
+ lr->avl_height = rh + 2;
+ RCU_INIT_POINTER(*nodep, lr);
+ }
+ } else if (rh > lh + 1) { /* r: LH+2 */
+ struct inet_peer *rr, *rl, *rlr, *rll;
+ int rlh;
+ rr = rcu_deref_locked(r->avl_right, base);
+ rl = rcu_deref_locked(r->avl_left, base);
+ rlh = node_height(rl);
+ if (rlh <= node_height(rr)) { /* rr: LH+1 */
+ RCU_INIT_POINTER(node->avl_right, rl); /* rl: LH or LH+1 */
+ RCU_INIT_POINTER(node->avl_left, l); /* l: LH */
+ node->avl_height = rlh + 1; /* LH+1 or LH+2 */
+ RCU_INIT_POINTER(r->avl_right, rr); /* rr: LH+1 */
+ RCU_INIT_POINTER(r->avl_left, node); /* node: LH+1 or LH+2 */
+ r->avl_height = node->avl_height + 1;
+ RCU_INIT_POINTER(*nodep, r);
+ } else { /* rr: RH, rl: RH+1 */
+ rlr = rcu_deref_locked(rl->avl_right, base);/* rlr: LH or LH-1 */
+ rll = rcu_deref_locked(rl->avl_left, base);/* rll: LH or LH-1 */
+ RCU_INIT_POINTER(node->avl_right, rll); /* rll: LH or LH-1 */
+ RCU_INIT_POINTER(node->avl_left, l); /* l: LH */
+ node->avl_height = lh + 1; /* node: LH+1 */
+ RCU_INIT_POINTER(r->avl_right, rr); /* rr: LH */
+ RCU_INIT_POINTER(r->avl_left, rlr); /* rlr: LH or LH-1 */
+ r->avl_height = lh + 1; /* r: LH+1 */
+ RCU_INIT_POINTER(rl->avl_right, r); /* r: LH+1 */
+ RCU_INIT_POINTER(rl->avl_left, node); /* node: LH+1 */
+ rl->avl_height = lh + 2;
+ RCU_INIT_POINTER(*nodep, rl);
+ }
+ } else {
+ node->avl_height = (lh > rh ? lh : rh) + 1;
+ }
+ }
+}
+
+/* Called with local BH disabled and the pool lock held. */
+#define link_to_pool(n, base) \
+do { \
+ n->avl_height = 1; \
+ n->avl_left = peer_avl_empty_rcu; \
+ n->avl_right = peer_avl_empty_rcu; \
+ /* lockless readers can catch us now */ \
+ rcu_assign_pointer(**--stackptr, n); \
+ peer_avl_rebalance(stack, stackptr, base); \
+} while (0)
+
+static void inetpeer_free_rcu(struct rcu_head *head)
+{
+ kmem_cache_free(peer_cachep, container_of(head, struct inet_peer, rcu));
+}
+
+static void unlink_from_pool(struct inet_peer *p, struct inet_peer_base *base,
+ struct inet_peer __rcu **stack[PEER_MAXDEPTH])
+{
+ struct inet_peer __rcu ***stackptr, ***delp;
+
+ if (lookup(&p->daddr, stack, base) != p)
+ BUG();
+ delp = stackptr - 1; /* *delp[0] == p */
+ if (p->avl_left == peer_avl_empty_rcu) {
+ *delp[0] = p->avl_right;
+ --stackptr;
+ } else {
+ /* look for a node to insert instead of p */
+ struct inet_peer *t;
+ t = lookup_rightempty(p, base);
+ BUG_ON(rcu_deref_locked(*stackptr[-1], base) != t);
+ **--stackptr = t->avl_left;
+ /* t is removed, t->daddr > x->daddr for any
+ * x in p->avl_left subtree.
+ * Put t in the old place of p. */
+ RCU_INIT_POINTER(*delp[0], t);
+ t->avl_left = p->avl_left;
+ t->avl_right = p->avl_right;
+ t->avl_height = p->avl_height;
+ BUG_ON(delp[1] != &p->avl_left);
+ delp[1] = &t->avl_left; /* was &p->avl_left */
+ }
+ peer_avl_rebalance(stack, stackptr, base);
+ base->total--;
+ call_rcu(&p->rcu, inetpeer_free_rcu);
+}
+
+/* perform garbage collect on all items stacked during a lookup */
+static int inet_peer_gc(struct inet_peer_base *base,
+ struct inet_peer __rcu **stack[PEER_MAXDEPTH],
+ struct inet_peer __rcu ***stackptr)
+{
+ struct inet_peer *p, *gchead = NULL;
+ __u32 delta, ttl;
+ int cnt = 0;
+
+ if (base->total >= inet_peer_threshold)
+ ttl = 0; /* be aggressive */
+ else
+ ttl = inet_peer_maxttl
+ - (inet_peer_maxttl - inet_peer_minttl) / HZ *
+ base->total / inet_peer_threshold * HZ;
+ stackptr--; /* last stack slot is peer_avl_empty */
+ while (stackptr > stack) {
+ stackptr--;
+ p = rcu_deref_locked(**stackptr, base);
+ if (atomic_read(&p->refcnt) == 0) {
+ smp_rmb();
+ delta = (__u32)jiffies - p->dtime;
+ if (delta >= ttl &&
+ atomic_cmpxchg(&p->refcnt, 0, -1) == 0) {
+ p->gc_next = gchead;
+ gchead = p;
+ }
+ }
+ }
+ while ((p = gchead) != NULL) {
+ gchead = p->gc_next;
+ cnt++;
+ unlink_from_pool(p, base, stack);
+ }
+ return cnt;
+}
+
+struct inet_peer *inet_getpeer(struct inet_peer_base *base,
+ const struct inetpeer_addr *daddr,
+ int create)
+{
+ struct inet_peer __rcu **stack[PEER_MAXDEPTH], ***stackptr;
+ struct inet_peer *p;
+ unsigned int sequence;
+ int invalidated, gccnt = 0;
+
+ /* Attempt a lockless lookup first.
+ * Because of a concurrent writer, we might not find an existing entry.
+ */
+ rcu_read_lock();
+ sequence = read_seqbegin(&base->lock);
+ p = lookup_rcu(daddr, base);
+ invalidated = read_seqretry(&base->lock, sequence);
+ rcu_read_unlock();
+
+ if (p)
+ return p;
+
+ /* If no writer did a change during our lookup, we can return early. */
+ if (!create && !invalidated)
+ return NULL;
+
+ /* retry an exact lookup, taking the lock before.
+ * At least, nodes should be hot in our cache.
+ */
+ write_seqlock_bh(&base->lock);
+relookup:
+ p = lookup(daddr, stack, base);
+ if (p != peer_avl_empty) {
+ atomic_inc(&p->refcnt);
+ write_sequnlock_bh(&base->lock);
+ return p;
+ }
+ if (!gccnt) {
+ gccnt = inet_peer_gc(base, stack, stackptr);
+ if (gccnt && create)
+ goto relookup;
+ }
+ p = create ? kmem_cache_alloc(peer_cachep, GFP_ATOMIC) : NULL;
+ if (p) {
+ p->daddr = *daddr;
+ atomic_set(&p->refcnt, 1);
+ atomic_set(&p->rid, 0);
+ p->metrics[RTAX_LOCK-1] = INETPEER_METRICS_NEW;
+ p->rate_tokens = 0;
+ /* 60*HZ is arbitrary, but chosen enough high so that the first
+ * calculation of tokens is at its maximum.
+ */
+ p->rate_last = jiffies - 60*HZ;
+ INIT_LIST_HEAD(&p->gc_list);
+
+ /* Link the node. */
+ link_to_pool(p, base);
+ base->total++;
+ }
+ write_sequnlock_bh(&base->lock);
+
+ return p;
+}
+EXPORT_SYMBOL_GPL(inet_getpeer);
+
+void inet_putpeer(struct inet_peer *p)
+{
+ p->dtime = (__u32)jiffies;
+ smp_mb__before_atomic();
+ atomic_dec(&p->refcnt);
+}
+EXPORT_SYMBOL_GPL(inet_putpeer);
+
+/*
+ * Check transmit rate limitation for given message.
+ * The rate information is held in the inet_peer entries now.
+ * This function is generic and could be used for other purposes
+ * too. It uses a Token bucket filter as suggested by Alexey Kuznetsov.
+ *
+ * Note that the same inet_peer fields are modified by functions in
+ * route.c too, but these work for packet destinations while xrlim_allow
+ * works for icmp destinations. This means the rate limiting information
+ * for one "ip object" is shared - and these ICMPs are twice limited:
+ * by source and by destination.
+ *
+ * RFC 1812: 4.3.2.8 SHOULD be able to limit error message rate
+ * SHOULD allow setting of rate limits
+ *
+ * Shared between ICMPv4 and ICMPv6.
+ */
+#define XRLIM_BURST_FACTOR 6
+bool inet_peer_xrlim_allow(struct inet_peer *peer, int timeout)
+{
+ unsigned long now, token;
+ bool rc = false;
+
+ if (!peer)
+ return true;
+
+ token = peer->rate_tokens;
+ now = jiffies;
+ token += now - peer->rate_last;
+ peer->rate_last = now;
+ if (token > XRLIM_BURST_FACTOR * timeout)
+ token = XRLIM_BURST_FACTOR * timeout;
+ if (token >= timeout) {
+ token -= timeout;
+ rc = true;
+ }
+ peer->rate_tokens = token;
+ return rc;
+}
+EXPORT_SYMBOL(inet_peer_xrlim_allow);
+
+static void inetpeer_inval_rcu(struct rcu_head *head)
+{
+ struct inet_peer *p = container_of(head, struct inet_peer, gc_rcu);
+
+ spin_lock_bh(&gc_lock);
+ list_add_tail(&p->gc_list, &gc_list);
+ spin_unlock_bh(&gc_lock);
+
+ schedule_delayed_work(&gc_work, gc_delay);
+}
+
+void inetpeer_invalidate_tree(struct inet_peer_base *base)
+{
+ struct inet_peer *root;
+
+ write_seqlock_bh(&base->lock);
+
+ root = rcu_deref_locked(base->root, base);
+ if (root != peer_avl_empty) {
+ base->root = peer_avl_empty_rcu;
+ base->total = 0;
+ call_rcu(&root->gc_rcu, inetpeer_inval_rcu);
+ }
+
+ write_sequnlock_bh(&base->lock);
+}
+EXPORT_SYMBOL(inetpeer_invalidate_tree);
diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c
new file mode 100644
index 000000000..367448494
--- /dev/null
+++ b/net/ipv4/ip_forward.c
@@ -0,0 +1,159 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * The IP forwarding functionality.
+ *
+ * Authors: see ip.c
+ *
+ * Fixes:
+ * Many : Split from ip.c , see ip_input.c for
+ * history.
+ * Dave Gregorich : NULL ip_rt_put fix for multicast
+ * routing.
+ * Jos Vos : Add call_out_firewall before sending,
+ * use output device for accounting.
+ * Jos Vos : Call forward firewall after routing
+ * (always use output device).
+ * Mike McLagan : Routing by source
+ */
+
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <linux/icmp.h>
+#include <linux/netdevice.h>
+#include <linux/slab.h>
+#include <net/sock.h>
+#include <net/ip.h>
+#include <net/tcp.h>
+#include <net/udp.h>
+#include <net/icmp.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/netfilter_ipv4.h>
+#include <net/checksum.h>
+#include <linux/route.h>
+#include <net/route.h>
+#include <net/xfrm.h>
+
+static bool ip_may_fragment(const struct sk_buff *skb)
+{
+ return unlikely((ip_hdr(skb)->frag_off & htons(IP_DF)) == 0) ||
+ skb->ignore_df;
+}
+
+static bool ip_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu)
+{
+ if (skb->len <= mtu)
+ return false;
+
+ if (skb_is_gso(skb) && skb_gso_network_seglen(skb) <= mtu)
+ return false;
+
+ return true;
+}
+
+
+static int ip_forward_finish(struct sock *sk, struct sk_buff *skb)
+{
+ struct ip_options *opt = &(IPCB(skb)->opt);
+
+ IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTFORWDATAGRAMS);
+ IP_ADD_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTOCTETS, skb->len);
+
+ if (unlikely(opt->optlen))
+ ip_forward_options(skb);
+
+ skb_sender_cpu_clear(skb);
+ return dst_output_sk(sk, skb);
+}
+
+int ip_forward(struct sk_buff *skb)
+{
+ u32 mtu;
+ struct iphdr *iph; /* Our header */
+ struct rtable *rt; /* Route we use */
+ struct ip_options *opt = &(IPCB(skb)->opt);
+
+ /* that should never happen */
+ if (skb->pkt_type != PACKET_HOST)
+ goto drop;
+
+ if (unlikely(skb->sk))
+ goto drop;
+
+ if (skb_warn_if_lro(skb))
+ goto drop;
+
+ if (!xfrm4_policy_check(NULL, XFRM_POLICY_FWD, skb))
+ goto drop;
+
+ if (IPCB(skb)->opt.router_alert && ip_call_ra_chain(skb))
+ return NET_RX_SUCCESS;
+
+ skb_forward_csum(skb);
+
+ /*
+ * According to the RFC, we must first decrease the TTL field. If
+ * that reaches zero, we must reply an ICMP control message telling
+ * that the packet's lifetime expired.
+ */
+ if (ip_hdr(skb)->ttl <= 1)
+ goto too_many_hops;
+
+ if (!xfrm4_route_forward(skb))
+ goto drop;
+
+ rt = skb_rtable(skb);
+
+ if (opt->is_strictroute && rt->rt_uses_gateway)
+ goto sr_failed;
+
+ IPCB(skb)->flags |= IPSKB_FORWARDED;
+ mtu = ip_dst_mtu_maybe_forward(&rt->dst, true);
+ if (!ip_may_fragment(skb) && ip_exceeds_mtu(skb, mtu)) {
+ IP_INC_STATS(dev_net(rt->dst.dev), IPSTATS_MIB_FRAGFAILS);
+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
+ htonl(mtu));
+ goto drop;
+ }
+
+ /* We are about to mangle packet. Copy it! */
+ if (skb_cow(skb, LL_RESERVED_SPACE(rt->dst.dev)+rt->dst.header_len))
+ goto drop;
+ iph = ip_hdr(skb);
+
+ /* Decrease ttl after skb cow done */
+ ip_decrease_ttl(iph);
+
+ /*
+ * We now generate an ICMP HOST REDIRECT giving the route
+ * we calculated.
+ */
+ if (IPCB(skb)->flags & IPSKB_DOREDIRECT && !opt->srr &&
+ !skb_sec_path(skb))
+ ip_rt_send_redirect(skb);
+
+ skb->priority = rt_tos2priority(iph->tos);
+
+ return NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD, NULL, skb,
+ skb->dev, rt->dst.dev, ip_forward_finish);
+
+sr_failed:
+ /*
+ * Strict routing permits no gatewaying
+ */
+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_SR_FAILED, 0);
+ goto drop;
+
+too_many_hops:
+ /* Tell the sender its packet died... */
+ IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_INHDRERRORS);
+ icmp_send(skb, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0);
+drop:
+ kfree_skb(skb);
+ return NET_RX_DROP;
+}
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
new file mode 100644
index 000000000..cc1da6d9c
--- /dev/null
+++ b/net/ipv4/ip_fragment.c
@@ -0,0 +1,871 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * The IP fragmentation functionality.
+ *
+ * Authors: Fred N. van Kempen <waltje@uWalt.NL.Mugnet.ORG>
+ * Alan Cox <alan@lxorguk.ukuu.org.uk>
+ *
+ * Fixes:
+ * Alan Cox : Split from ip.c , see ip_input.c for history.
+ * David S. Miller : Begin massive cleanup...
+ * Andi Kleen : Add sysctls.
+ * xxxx : Overlapfrag bug.
+ * Ultima : ip_expire() kernel panic.
+ * Bill Hawes : Frag accounting and evictor fixes.
+ * John McDonald : 0 length frag bug.
+ * Alexey Kuznetsov: SMP races, threading, cleanup.
+ * Patrick McHardy : LRU queue of frag heads for evictor.
+ */
+
+#define pr_fmt(fmt) "IPv4: " fmt
+
+#include <linux/compiler.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/jiffies.h>
+#include <linux/skbuff.h>
+#include <linux/list.h>
+#include <linux/ip.h>
+#include <linux/icmp.h>
+#include <linux/netdevice.h>
+#include <linux/jhash.h>
+#include <linux/random.h>
+#include <linux/slab.h>
+#include <net/route.h>
+#include <net/dst.h>
+#include <net/sock.h>
+#include <net/ip.h>
+#include <net/icmp.h>
+#include <net/checksum.h>
+#include <net/inetpeer.h>
+#include <net/inet_frag.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/inet.h>
+#include <linux/netfilter_ipv4.h>
+#include <net/inet_ecn.h>
+
+/* NOTE. Logic of IP defragmentation is parallel to corresponding IPv6
+ * code now. If you change something here, _PLEASE_ update ipv6/reassembly.c
+ * as well. Or notify me, at least. --ANK
+ */
+
+static int sysctl_ipfrag_max_dist __read_mostly = 64;
+static const char ip_frag_cache_name[] = "ip4-frags";
+
+struct ipfrag_skb_cb
+{
+ struct inet_skb_parm h;
+ int offset;
+};
+
+#define FRAG_CB(skb) ((struct ipfrag_skb_cb *)((skb)->cb))
+
+/* Describe an entry in the "incomplete datagrams" queue. */
+struct ipq {
+ struct inet_frag_queue q;
+
+ u32 user;
+ __be32 saddr;
+ __be32 daddr;
+ __be16 id;
+ u8 protocol;
+ u8 ecn; /* RFC3168 support */
+ int iif;
+ unsigned int rid;
+ struct inet_peer *peer;
+};
+
+static u8 ip4_frag_ecn(u8 tos)
+{
+ return 1 << (tos & INET_ECN_MASK);
+}
+
+static struct inet_frags ip4_frags;
+
+int ip_frag_mem(struct net *net)
+{
+ return sum_frag_mem_limit(&net->ipv4.frags);
+}
+
+static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
+ struct net_device *dev);
+
+struct ip4_create_arg {
+ struct iphdr *iph;
+ u32 user;
+};
+
+static unsigned int ipqhashfn(__be16 id, __be32 saddr, __be32 daddr, u8 prot)
+{
+ net_get_random_once(&ip4_frags.rnd, sizeof(ip4_frags.rnd));
+ return jhash_3words((__force u32)id << 16 | prot,
+ (__force u32)saddr, (__force u32)daddr,
+ ip4_frags.rnd);
+}
+
+static unsigned int ip4_hashfn(const struct inet_frag_queue *q)
+{
+ const struct ipq *ipq;
+
+ ipq = container_of(q, struct ipq, q);
+ return ipqhashfn(ipq->id, ipq->saddr, ipq->daddr, ipq->protocol);
+}
+
+static bool ip4_frag_match(const struct inet_frag_queue *q, const void *a)
+{
+ const struct ipq *qp;
+ const struct ip4_create_arg *arg = a;
+
+ qp = container_of(q, struct ipq, q);
+ return qp->id == arg->iph->id &&
+ qp->saddr == arg->iph->saddr &&
+ qp->daddr == arg->iph->daddr &&
+ qp->protocol == arg->iph->protocol &&
+ qp->user == arg->user;
+}
+
+static void ip4_frag_init(struct inet_frag_queue *q, const void *a)
+{
+ struct ipq *qp = container_of(q, struct ipq, q);
+ struct netns_ipv4 *ipv4 = container_of(q->net, struct netns_ipv4,
+ frags);
+ struct net *net = container_of(ipv4, struct net, ipv4);
+
+ const struct ip4_create_arg *arg = a;
+
+ qp->protocol = arg->iph->protocol;
+ qp->id = arg->iph->id;
+ qp->ecn = ip4_frag_ecn(arg->iph->tos);
+ qp->saddr = arg->iph->saddr;
+ qp->daddr = arg->iph->daddr;
+ qp->user = arg->user;
+ qp->peer = sysctl_ipfrag_max_dist ?
+ inet_getpeer_v4(net->ipv4.peers, arg->iph->saddr, 1) : NULL;
+}
+
+static void ip4_frag_free(struct inet_frag_queue *q)
+{
+ struct ipq *qp;
+
+ qp = container_of(q, struct ipq, q);
+ if (qp->peer)
+ inet_putpeer(qp->peer);
+}
+
+
+/* Destruction primitives. */
+
+static void ipq_put(struct ipq *ipq)
+{
+ inet_frag_put(&ipq->q, &ip4_frags);
+}
+
+/* Kill ipq entry. It is not destroyed immediately,
+ * because caller (and someone more) holds reference count.
+ */
+static void ipq_kill(struct ipq *ipq)
+{
+ inet_frag_kill(&ipq->q, &ip4_frags);
+}
+
+/*
+ * Oops, a fragment queue timed out. Kill it and send an ICMP reply.
+ */
+static void ip_expire(unsigned long arg)
+{
+ struct ipq *qp;
+ struct net *net;
+
+ qp = container_of((struct inet_frag_queue *) arg, struct ipq, q);
+ net = container_of(qp->q.net, struct net, ipv4.frags);
+
+ spin_lock(&qp->q.lock);
+
+ if (qp->q.flags & INET_FRAG_COMPLETE)
+ goto out;
+
+ ipq_kill(qp);
+ IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS);
+
+ if (!(qp->q.flags & INET_FRAG_EVICTED)) {
+ struct sk_buff *head = qp->q.fragments;
+ const struct iphdr *iph;
+ int err;
+
+ IP_INC_STATS_BH(net, IPSTATS_MIB_REASMTIMEOUT);
+
+ if (!(qp->q.flags & INET_FRAG_FIRST_IN) || !qp->q.fragments)
+ goto out;
+
+ rcu_read_lock();
+ head->dev = dev_get_by_index_rcu(net, qp->iif);
+ if (!head->dev)
+ goto out_rcu_unlock;
+
+ /* skb has no dst, perform route lookup again */
+ iph = ip_hdr(head);
+ err = ip_route_input_noref(head, iph->daddr, iph->saddr,
+ iph->tos, head->dev);
+ if (err)
+ goto out_rcu_unlock;
+
+ /* Only an end host needs to send an ICMP
+ * "Fragment Reassembly Timeout" message, per RFC792.
+ */
+ if (qp->user == IP_DEFRAG_AF_PACKET ||
+ ((qp->user >= IP_DEFRAG_CONNTRACK_IN) &&
+ (qp->user <= __IP_DEFRAG_CONNTRACK_IN_END) &&
+ (skb_rtable(head)->rt_type != RTN_LOCAL)))
+ goto out_rcu_unlock;
+
+ /* Send an ICMP "Fragment Reassembly Timeout" message. */
+ icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0);
+out_rcu_unlock:
+ rcu_read_unlock();
+ }
+out:
+ spin_unlock(&qp->q.lock);
+ ipq_put(qp);
+}
+
+/* Find the correct entry in the "incomplete datagrams" queue for
+ * this IP datagram, and create new one, if nothing is found.
+ */
+static struct ipq *ip_find(struct net *net, struct iphdr *iph, u32 user)
+{
+ struct inet_frag_queue *q;
+ struct ip4_create_arg arg;
+ unsigned int hash;
+
+ arg.iph = iph;
+ arg.user = user;
+
+ hash = ipqhashfn(iph->id, iph->saddr, iph->daddr, iph->protocol);
+
+ q = inet_frag_find(&net->ipv4.frags, &ip4_frags, &arg, hash);
+ if (IS_ERR_OR_NULL(q)) {
+ inet_frag_maybe_warn_overflow(q, pr_fmt());
+ return NULL;
+ }
+ return container_of(q, struct ipq, q);
+}
+
+/* Is the fragment too far ahead to be part of ipq? */
+static int ip_frag_too_far(struct ipq *qp)
+{
+ struct inet_peer *peer = qp->peer;
+ unsigned int max = sysctl_ipfrag_max_dist;
+ unsigned int start, end;
+
+ int rc;
+
+ if (!peer || !max)
+ return 0;
+
+ start = qp->rid;
+ end = atomic_inc_return(&peer->rid);
+ qp->rid = end;
+
+ rc = qp->q.fragments && (end - start) > max;
+
+ if (rc) {
+ struct net *net;
+
+ net = container_of(qp->q.net, struct net, ipv4.frags);
+ IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS);
+ }
+
+ return rc;
+}
+
+static int ip_frag_reinit(struct ipq *qp)
+{
+ struct sk_buff *fp;
+ unsigned int sum_truesize = 0;
+
+ if (!mod_timer(&qp->q.timer, jiffies + qp->q.net->timeout)) {
+ atomic_inc(&qp->q.refcnt);
+ return -ETIMEDOUT;
+ }
+
+ fp = qp->q.fragments;
+ do {
+ struct sk_buff *xp = fp->next;
+
+ sum_truesize += fp->truesize;
+ kfree_skb(fp);
+ fp = xp;
+ } while (fp);
+ sub_frag_mem_limit(&qp->q, sum_truesize);
+
+ qp->q.flags = 0;
+ qp->q.len = 0;
+ qp->q.meat = 0;
+ qp->q.fragments = NULL;
+ qp->q.fragments_tail = NULL;
+ qp->iif = 0;
+ qp->ecn = 0;
+
+ return 0;
+}
+
+/* Add new segment to existing queue. */
+static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
+{
+ struct sk_buff *prev, *next;
+ struct net_device *dev;
+ int flags, offset;
+ int ihl, end;
+ int err = -ENOENT;
+ u8 ecn;
+
+ if (qp->q.flags & INET_FRAG_COMPLETE)
+ goto err;
+
+ if (!(IPCB(skb)->flags & IPSKB_FRAG_COMPLETE) &&
+ unlikely(ip_frag_too_far(qp)) &&
+ unlikely(err = ip_frag_reinit(qp))) {
+ ipq_kill(qp);
+ goto err;
+ }
+
+ ecn = ip4_frag_ecn(ip_hdr(skb)->tos);
+ offset = ntohs(ip_hdr(skb)->frag_off);
+ flags = offset & ~IP_OFFSET;
+ offset &= IP_OFFSET;
+ offset <<= 3; /* offset is in 8-byte chunks */
+ ihl = ip_hdrlen(skb);
+
+ /* Determine the position of this fragment. */
+ end = offset + skb->len - ihl;
+ err = -EINVAL;
+
+ /* Is this the final fragment? */
+ if ((flags & IP_MF) == 0) {
+ /* If we already have some bits beyond end
+ * or have different end, the segment is corrupted.
+ */
+ if (end < qp->q.len ||
+ ((qp->q.flags & INET_FRAG_LAST_IN) && end != qp->q.len))
+ goto err;
+ qp->q.flags |= INET_FRAG_LAST_IN;
+ qp->q.len = end;
+ } else {
+ if (end&7) {
+ end &= ~7;
+ if (skb->ip_summed != CHECKSUM_UNNECESSARY)
+ skb->ip_summed = CHECKSUM_NONE;
+ }
+ if (end > qp->q.len) {
+ /* Some bits beyond end -> corruption. */
+ if (qp->q.flags & INET_FRAG_LAST_IN)
+ goto err;
+ qp->q.len = end;
+ }
+ }
+ if (end == offset)
+ goto err;
+
+ err = -ENOMEM;
+ if (!pskb_pull(skb, ihl))
+ goto err;
+
+ err = pskb_trim_rcsum(skb, end - offset);
+ if (err)
+ goto err;
+
+ /* Find out which fragments are in front and at the back of us
+ * in the chain of fragments so far. We must know where to put
+ * this fragment, right?
+ */
+ prev = qp->q.fragments_tail;
+ if (!prev || FRAG_CB(prev)->offset < offset) {
+ next = NULL;
+ goto found;
+ }
+ prev = NULL;
+ for (next = qp->q.fragments; next != NULL; next = next->next) {
+ if (FRAG_CB(next)->offset >= offset)
+ break; /* bingo! */
+ prev = next;
+ }
+
+found:
+ /* We found where to put this one. Check for overlap with
+ * preceding fragment, and, if needed, align things so that
+ * any overlaps are eliminated.
+ */
+ if (prev) {
+ int i = (FRAG_CB(prev)->offset + prev->len) - offset;
+
+ if (i > 0) {
+ offset += i;
+ err = -EINVAL;
+ if (end <= offset)
+ goto err;
+ err = -ENOMEM;
+ if (!pskb_pull(skb, i))
+ goto err;
+ if (skb->ip_summed != CHECKSUM_UNNECESSARY)
+ skb->ip_summed = CHECKSUM_NONE;
+ }
+ }
+
+ err = -ENOMEM;
+
+ while (next && FRAG_CB(next)->offset < end) {
+ int i = end - FRAG_CB(next)->offset; /* overlap is 'i' bytes */
+
+ if (i < next->len) {
+ /* Eat head of the next overlapped fragment
+ * and leave the loop. The next ones cannot overlap.
+ */
+ if (!pskb_pull(next, i))
+ goto err;
+ FRAG_CB(next)->offset += i;
+ qp->q.meat -= i;
+ if (next->ip_summed != CHECKSUM_UNNECESSARY)
+ next->ip_summed = CHECKSUM_NONE;
+ break;
+ } else {
+ struct sk_buff *free_it = next;
+
+ /* Old fragment is completely overridden with
+ * new one drop it.
+ */
+ next = next->next;
+
+ if (prev)
+ prev->next = next;
+ else
+ qp->q.fragments = next;
+
+ qp->q.meat -= free_it->len;
+ sub_frag_mem_limit(&qp->q, free_it->truesize);
+ kfree_skb(free_it);
+ }
+ }
+
+ FRAG_CB(skb)->offset = offset;
+
+ /* Insert this fragment in the chain of fragments. */
+ skb->next = next;
+ if (!next)
+ qp->q.fragments_tail = skb;
+ if (prev)
+ prev->next = skb;
+ else
+ qp->q.fragments = skb;
+
+ dev = skb->dev;
+ if (dev) {
+ qp->iif = dev->ifindex;
+ skb->dev = NULL;
+ }
+ qp->q.stamp = skb->tstamp;
+ qp->q.meat += skb->len;
+ qp->ecn |= ecn;
+ add_frag_mem_limit(&qp->q, skb->truesize);
+ if (offset == 0)
+ qp->q.flags |= INET_FRAG_FIRST_IN;
+
+ if (ip_hdr(skb)->frag_off & htons(IP_DF) &&
+ skb->len + ihl > qp->q.max_size)
+ qp->q.max_size = skb->len + ihl;
+
+ if (qp->q.flags == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) &&
+ qp->q.meat == qp->q.len) {
+ unsigned long orefdst = skb->_skb_refdst;
+
+ skb->_skb_refdst = 0UL;
+ err = ip_frag_reasm(qp, prev, dev);
+ skb->_skb_refdst = orefdst;
+ return err;
+ }
+
+ skb_dst_drop(skb);
+ return -EINPROGRESS;
+
+err:
+ kfree_skb(skb);
+ return err;
+}
+
+
+/* Build a new IP datagram from all its fragments. */
+
+static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
+ struct net_device *dev)
+{
+ struct net *net = container_of(qp->q.net, struct net, ipv4.frags);
+ struct iphdr *iph;
+ struct sk_buff *fp, *head = qp->q.fragments;
+ int len;
+ int ihlen;
+ int err;
+ int sum_truesize;
+ u8 ecn;
+
+ ipq_kill(qp);
+
+ ecn = ip_frag_ecn_table[qp->ecn];
+ if (unlikely(ecn == 0xff)) {
+ err = -EINVAL;
+ goto out_fail;
+ }
+ /* Make the one we just received the head. */
+ if (prev) {
+ head = prev->next;
+ fp = skb_clone(head, GFP_ATOMIC);
+ if (!fp)
+ goto out_nomem;
+
+ fp->next = head->next;
+ if (!fp->next)
+ qp->q.fragments_tail = fp;
+ prev->next = fp;
+
+ skb_morph(head, qp->q.fragments);
+ head->next = qp->q.fragments->next;
+
+ consume_skb(qp->q.fragments);
+ qp->q.fragments = head;
+ }
+
+ WARN_ON(!head);
+ WARN_ON(FRAG_CB(head)->offset != 0);
+
+ /* Allocate a new buffer for the datagram. */
+ ihlen = ip_hdrlen(head);
+ len = ihlen + qp->q.len;
+
+ err = -E2BIG;
+ if (len > 65535)
+ goto out_oversize;
+
+ /* Head of list must not be cloned. */
+ if (skb_unclone(head, GFP_ATOMIC))
+ goto out_nomem;
+
+ /* If the first fragment is fragmented itself, we split
+ * it to two chunks: the first with data and paged part
+ * and the second, holding only fragments. */
+ if (skb_has_frag_list(head)) {
+ struct sk_buff *clone;
+ int i, plen = 0;
+
+ clone = alloc_skb(0, GFP_ATOMIC);
+ if (!clone)
+ goto out_nomem;
+ clone->next = head->next;
+ head->next = clone;
+ skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list;
+ skb_frag_list_init(head);
+ for (i = 0; i < skb_shinfo(head)->nr_frags; i++)
+ plen += skb_frag_size(&skb_shinfo(head)->frags[i]);
+ clone->len = clone->data_len = head->data_len - plen;
+ head->data_len -= clone->len;
+ head->len -= clone->len;
+ clone->csum = 0;
+ clone->ip_summed = head->ip_summed;
+ add_frag_mem_limit(&qp->q, clone->truesize);
+ }
+
+ skb_push(head, head->data - skb_network_header(head));
+
+ sum_truesize = head->truesize;
+ for (fp = head->next; fp;) {
+ bool headstolen;
+ int delta;
+ struct sk_buff *next = fp->next;
+
+ sum_truesize += fp->truesize;
+ if (head->ip_summed != fp->ip_summed)
+ head->ip_summed = CHECKSUM_NONE;
+ else if (head->ip_summed == CHECKSUM_COMPLETE)
+ head->csum = csum_add(head->csum, fp->csum);
+
+ if (skb_try_coalesce(head, fp, &headstolen, &delta)) {
+ kfree_skb_partial(fp, headstolen);
+ } else {
+ if (!skb_shinfo(head)->frag_list)
+ skb_shinfo(head)->frag_list = fp;
+ head->data_len += fp->len;
+ head->len += fp->len;
+ head->truesize += fp->truesize;
+ }
+ fp = next;
+ }
+ sub_frag_mem_limit(&qp->q, sum_truesize);
+
+ head->next = NULL;
+ head->dev = dev;
+ head->tstamp = qp->q.stamp;
+ IPCB(head)->frag_max_size = qp->q.max_size;
+
+ iph = ip_hdr(head);
+ /* max_size != 0 implies at least one fragment had IP_DF set */
+ iph->frag_off = qp->q.max_size ? htons(IP_DF) : 0;
+ iph->tot_len = htons(len);
+ iph->tos |= ecn;
+ IP_INC_STATS_BH(net, IPSTATS_MIB_REASMOKS);
+ qp->q.fragments = NULL;
+ qp->q.fragments_tail = NULL;
+ return 0;
+
+out_nomem:
+ net_dbg_ratelimited("queue_glue: no memory for gluing queue %p\n", qp);
+ err = -ENOMEM;
+ goto out_fail;
+out_oversize:
+ net_info_ratelimited("Oversized IP packet from %pI4\n", &qp->saddr);
+out_fail:
+ IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS);
+ return err;
+}
+
+/* Process an incoming IP datagram fragment. */
+int ip_defrag(struct sk_buff *skb, u32 user)
+{
+ struct ipq *qp;
+ struct net *net;
+
+ net = skb->dev ? dev_net(skb->dev) : dev_net(skb_dst(skb)->dev);
+ IP_INC_STATS_BH(net, IPSTATS_MIB_REASMREQDS);
+
+ /* Lookup (or create) queue header */
+ qp = ip_find(net, ip_hdr(skb), user);
+ if (qp) {
+ int ret;
+
+ spin_lock(&qp->q.lock);
+
+ ret = ip_frag_queue(qp, skb);
+
+ spin_unlock(&qp->q.lock);
+ ipq_put(qp);
+ return ret;
+ }
+
+ IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS);
+ kfree_skb(skb);
+ return -ENOMEM;
+}
+EXPORT_SYMBOL(ip_defrag);
+
+struct sk_buff *ip_check_defrag(struct sk_buff *skb, u32 user)
+{
+ struct iphdr iph;
+ int netoff;
+ u32 len;
+
+ if (skb->protocol != htons(ETH_P_IP))
+ return skb;
+
+ netoff = skb_network_offset(skb);
+
+ if (skb_copy_bits(skb, netoff, &iph, sizeof(iph)) < 0)
+ return skb;
+
+ if (iph.ihl < 5 || iph.version != 4)
+ return skb;
+
+ len = ntohs(iph.tot_len);
+ if (skb->len < netoff + len || len < (iph.ihl * 4))
+ return skb;
+
+ if (ip_is_fragment(&iph)) {
+ skb = skb_share_check(skb, GFP_ATOMIC);
+ if (skb) {
+ if (!pskb_may_pull(skb, netoff + iph.ihl * 4))
+ return skb;
+ if (pskb_trim_rcsum(skb, netoff + len))
+ return skb;
+ memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
+ if (ip_defrag(skb, user))
+ return NULL;
+ skb_clear_hash(skb);
+ }
+ }
+ return skb;
+}
+EXPORT_SYMBOL(ip_check_defrag);
+
+#ifdef CONFIG_SYSCTL
+static int zero;
+
+static struct ctl_table ip4_frags_ns_ctl_table[] = {
+ {
+ .procname = "ipfrag_high_thresh",
+ .data = &init_net.ipv4.frags.high_thresh,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &init_net.ipv4.frags.low_thresh
+ },
+ {
+ .procname = "ipfrag_low_thresh",
+ .data = &init_net.ipv4.frags.low_thresh,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &init_net.ipv4.frags.high_thresh
+ },
+ {
+ .procname = "ipfrag_time",
+ .data = &init_net.ipv4.frags.timeout,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ { }
+};
+
+/* secret interval has been deprecated */
+static int ip4_frags_secret_interval_unused;
+static struct ctl_table ip4_frags_ctl_table[] = {
+ {
+ .procname = "ipfrag_secret_interval",
+ .data = &ip4_frags_secret_interval_unused,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "ipfrag_max_dist",
+ .data = &sysctl_ipfrag_max_dist,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero
+ },
+ { }
+};
+
+static int __net_init ip4_frags_ns_ctl_register(struct net *net)
+{
+ struct ctl_table *table;
+ struct ctl_table_header *hdr;
+
+ table = ip4_frags_ns_ctl_table;
+ if (!net_eq(net, &init_net)) {
+ table = kmemdup(table, sizeof(ip4_frags_ns_ctl_table), GFP_KERNEL);
+ if (!table)
+ goto err_alloc;
+
+ table[0].data = &net->ipv4.frags.high_thresh;
+ table[0].extra1 = &net->ipv4.frags.low_thresh;
+ table[0].extra2 = &init_net.ipv4.frags.high_thresh;
+ table[1].data = &net->ipv4.frags.low_thresh;
+ table[1].extra2 = &net->ipv4.frags.high_thresh;
+ table[2].data = &net->ipv4.frags.timeout;
+
+ /* Don't export sysctls to unprivileged users */
+ if (net->user_ns != &init_user_ns)
+ table[0].procname = NULL;
+ }
+
+ hdr = register_net_sysctl(net, "net/ipv4", table);
+ if (!hdr)
+ goto err_reg;
+
+ net->ipv4.frags_hdr = hdr;
+ return 0;
+
+err_reg:
+ if (!net_eq(net, &init_net))
+ kfree(table);
+err_alloc:
+ return -ENOMEM;
+}
+
+static void __net_exit ip4_frags_ns_ctl_unregister(struct net *net)
+{
+ struct ctl_table *table;
+
+ table = net->ipv4.frags_hdr->ctl_table_arg;
+ unregister_net_sysctl_table(net->ipv4.frags_hdr);
+ kfree(table);
+}
+
+static void __init ip4_frags_ctl_register(void)
+{
+ register_net_sysctl(&init_net, "net/ipv4", ip4_frags_ctl_table);
+}
+#else
+static int ip4_frags_ns_ctl_register(struct net *net)
+{
+ return 0;
+}
+
+static void ip4_frags_ns_ctl_unregister(struct net *net)
+{
+}
+
+static void __init ip4_frags_ctl_register(void)
+{
+}
+#endif
+
+static int __net_init ipv4_frags_init_net(struct net *net)
+{
+ /* Fragment cache limits.
+ *
+ * The fragment memory accounting code, (tries to) account for
+ * the real memory usage, by measuring both the size of frag
+ * queue struct (inet_frag_queue (ipv4:ipq/ipv6:frag_queue))
+ * and the SKB's truesize.
+ *
+ * A 64K fragment consumes 129736 bytes (44*2944)+200
+ * (1500 truesize == 2944, sizeof(struct ipq) == 200)
+ *
+ * We will commit 4MB at one time. Should we cross that limit
+ * we will prune down to 3MB, making room for approx 8 big 64K
+ * fragments 8x128k.
+ */
+ net->ipv4.frags.high_thresh = 4 * 1024 * 1024;
+ net->ipv4.frags.low_thresh = 3 * 1024 * 1024;
+ /*
+ * Important NOTE! Fragment queue must be destroyed before MSL expires.
+ * RFC791 is wrong proposing to prolongate timer each fragment arrival
+ * by TTL.
+ */
+ net->ipv4.frags.timeout = IP_FRAG_TIME;
+
+ inet_frags_init_net(&net->ipv4.frags);
+
+ return ip4_frags_ns_ctl_register(net);
+}
+
+static void __net_exit ipv4_frags_exit_net(struct net *net)
+{
+ ip4_frags_ns_ctl_unregister(net);
+ inet_frags_exit_net(&net->ipv4.frags, &ip4_frags);
+}
+
+static struct pernet_operations ip4_frags_ops = {
+ .init = ipv4_frags_init_net,
+ .exit = ipv4_frags_exit_net,
+};
+
+void __init ipfrag_init(void)
+{
+ ip4_frags_ctl_register();
+ register_pernet_subsys(&ip4_frags_ops);
+ ip4_frags.hashfn = ip4_hashfn;
+ ip4_frags.constructor = ip4_frag_init;
+ ip4_frags.destructor = ip4_frag_free;
+ ip4_frags.skb_free = NULL;
+ ip4_frags.qsize = sizeof(struct ipq);
+ ip4_frags.match = ip4_frag_match;
+ ip4_frags.frag_expire = ip_expire;
+ ip4_frags.frags_cache_name = ip_frag_cache_name;
+ if (inet_frags_init(&ip4_frags))
+ panic("IP: failed to allocate ip4_frags cache\n");
+}
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
new file mode 100644
index 000000000..5fd706473
--- /dev/null
+++ b/net/ipv4/ip_gre.c
@@ -0,0 +1,926 @@
+/*
+ * Linux NET3: GRE over IP protocol decoder.
+ *
+ * Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/capability.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <asm/uaccess.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/in.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/if_arp.h>
+#include <linux/mroute.h>
+#include <linux/init.h>
+#include <linux/in6.h>
+#include <linux/inetdevice.h>
+#include <linux/igmp.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/etherdevice.h>
+#include <linux/if_ether.h>
+
+#include <net/sock.h>
+#include <net/ip.h>
+#include <net/icmp.h>
+#include <net/protocol.h>
+#include <net/ip_tunnels.h>
+#include <net/arp.h>
+#include <net/checksum.h>
+#include <net/dsfield.h>
+#include <net/inet_ecn.h>
+#include <net/xfrm.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+#include <net/rtnetlink.h>
+#include <net/gre.h>
+
+#if IS_ENABLED(CONFIG_IPV6)
+#include <net/ipv6.h>
+#include <net/ip6_fib.h>
+#include <net/ip6_route.h>
+#endif
+
+/*
+ Problems & solutions
+ --------------------
+
+ 1. The most important issue is detecting local dead loops.
+ They would cause complete host lockup in transmit, which
+ would be "resolved" by stack overflow or, if queueing is enabled,
+ with infinite looping in net_bh.
+
+ We cannot track such dead loops during route installation,
+ it is infeasible task. The most general solutions would be
+ to keep skb->encapsulation counter (sort of local ttl),
+ and silently drop packet when it expires. It is a good
+ solution, but it supposes maintaining new variable in ALL
+ skb, even if no tunneling is used.
+
+ Current solution: xmit_recursion breaks dead loops. This is a percpu
+ counter, since when we enter the first ndo_xmit(), cpu migration is
+ forbidden. We force an exit if this counter reaches RECURSION_LIMIT
+
+ 2. Networking dead loops would not kill routers, but would really
+ kill network. IP hop limit plays role of "t->recursion" in this case,
+ if we copy it from packet being encapsulated to upper header.
+ It is very good solution, but it introduces two problems:
+
+ - Routing protocols, using packets with ttl=1 (OSPF, RIP2),
+ do not work over tunnels.
+ - traceroute does not work. I planned to relay ICMP from tunnel,
+ so that this problem would be solved and traceroute output
+ would even more informative. This idea appeared to be wrong:
+ only Linux complies to rfc1812 now (yes, guys, Linux is the only
+ true router now :-)), all routers (at least, in neighbourhood of mine)
+ return only 8 bytes of payload. It is the end.
+
+ Hence, if we want that OSPF worked or traceroute said something reasonable,
+ we should search for another solution.
+
+ One of them is to parse packet trying to detect inner encapsulation
+ made by our node. It is difficult or even impossible, especially,
+ taking into account fragmentation. TO be short, ttl is not solution at all.
+
+ Current solution: The solution was UNEXPECTEDLY SIMPLE.
+ We force DF flag on tunnels with preconfigured hop limit,
+ that is ALL. :-) Well, it does not remove the problem completely,
+ but exponential growth of network traffic is changed to linear
+ (branches, that exceed pmtu are pruned) and tunnel mtu
+ rapidly degrades to value <68, where looping stops.
+ Yes, it is not good if there exists a router in the loop,
+ which does not force DF, even when encapsulating packets have DF set.
+ But it is not our problem! Nobody could accuse us, we made
+ all that we could make. Even if it is your gated who injected
+ fatal route to network, even if it were you who configured
+ fatal static route: you are innocent. :-)
+
+ Alexey Kuznetsov.
+ */
+
+static bool log_ecn_error = true;
+module_param(log_ecn_error, bool, 0644);
+MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");
+
+static struct rtnl_link_ops ipgre_link_ops __read_mostly;
+static int ipgre_tunnel_init(struct net_device *dev);
+
+static int ipgre_net_id __read_mostly;
+static int gre_tap_net_id __read_mostly;
+
+static int ipgre_err(struct sk_buff *skb, u32 info,
+ const struct tnl_ptk_info *tpi)
+{
+
+ /* All the routers (except for Linux) return only
+ 8 bytes of packet payload. It means, that precise relaying of
+ ICMP in the real Internet is absolutely infeasible.
+
+ Moreover, Cisco "wise men" put GRE key to the third word
+ in GRE header. It makes impossible maintaining even soft
+ state for keyed GRE tunnels with enabled checksum. Tell
+ them "thank you".
+
+ Well, I wonder, rfc1812 was written by Cisco employee,
+ what the hell these idiots break standards established
+ by themselves???
+ */
+ struct net *net = dev_net(skb->dev);
+ struct ip_tunnel_net *itn;
+ const struct iphdr *iph;
+ const int type = icmp_hdr(skb)->type;
+ const int code = icmp_hdr(skb)->code;
+ struct ip_tunnel *t;
+
+ switch (type) {
+ default:
+ case ICMP_PARAMETERPROB:
+ return PACKET_RCVD;
+
+ case ICMP_DEST_UNREACH:
+ switch (code) {
+ case ICMP_SR_FAILED:
+ case ICMP_PORT_UNREACH:
+ /* Impossible event. */
+ return PACKET_RCVD;
+ default:
+ /* All others are translated to HOST_UNREACH.
+ rfc2003 contains "deep thoughts" about NET_UNREACH,
+ I believe they are just ether pollution. --ANK
+ */
+ break;
+ }
+ break;
+ case ICMP_TIME_EXCEEDED:
+ if (code != ICMP_EXC_TTL)
+ return PACKET_RCVD;
+ break;
+
+ case ICMP_REDIRECT:
+ break;
+ }
+
+ if (tpi->proto == htons(ETH_P_TEB))
+ itn = net_generic(net, gre_tap_net_id);
+ else
+ itn = net_generic(net, ipgre_net_id);
+
+ iph = (const struct iphdr *)(icmp_hdr(skb) + 1);
+ t = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags,
+ iph->daddr, iph->saddr, tpi->key);
+
+ if (!t)
+ return PACKET_REJECT;
+
+ if (t->parms.iph.daddr == 0 ||
+ ipv4_is_multicast(t->parms.iph.daddr))
+ return PACKET_RCVD;
+
+ if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
+ return PACKET_RCVD;
+
+ if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
+ t->err_count++;
+ else
+ t->err_count = 1;
+ t->err_time = jiffies;
+ return PACKET_RCVD;
+}
+
+static int ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi)
+{
+ struct net *net = dev_net(skb->dev);
+ struct ip_tunnel_net *itn;
+ const struct iphdr *iph;
+ struct ip_tunnel *tunnel;
+
+ if (tpi->proto == htons(ETH_P_TEB))
+ itn = net_generic(net, gre_tap_net_id);
+ else
+ itn = net_generic(net, ipgre_net_id);
+
+ iph = ip_hdr(skb);
+ tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags,
+ iph->saddr, iph->daddr, tpi->key);
+
+ if (tunnel) {
+ skb_pop_mac_header(skb);
+ ip_tunnel_rcv(tunnel, skb, tpi, log_ecn_error);
+ return PACKET_RCVD;
+ }
+ return PACKET_REJECT;
+}
+
+static void __gre_xmit(struct sk_buff *skb, struct net_device *dev,
+ const struct iphdr *tnl_params,
+ __be16 proto)
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ struct tnl_ptk_info tpi;
+
+ tpi.flags = tunnel->parms.o_flags;
+ tpi.proto = proto;
+ tpi.key = tunnel->parms.o_key;
+ if (tunnel->parms.o_flags & TUNNEL_SEQ)
+ tunnel->o_seqno++;
+ tpi.seq = htonl(tunnel->o_seqno);
+
+ /* Push GRE header. */
+ gre_build_header(skb, &tpi, tunnel->tun_hlen);
+
+ skb_set_inner_protocol(skb, tpi.proto);
+
+ ip_tunnel_xmit(skb, dev, tnl_params, tnl_params->protocol);
+}
+
+static netdev_tx_t ipgre_xmit(struct sk_buff *skb,
+ struct net_device *dev)
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ const struct iphdr *tnl_params;
+
+ if (dev->header_ops) {
+ /* Need space for new headers */
+ if (skb_cow_head(skb, dev->needed_headroom -
+ (tunnel->hlen + sizeof(struct iphdr))))
+ goto free_skb;
+
+ tnl_params = (const struct iphdr *)skb->data;
+
+ /* Pull skb since ip_tunnel_xmit() needs skb->data pointing
+ * to gre header.
+ */
+ skb_pull(skb, tunnel->hlen + sizeof(struct iphdr));
+ skb_reset_mac_header(skb);
+ } else {
+ if (skb_cow_head(skb, dev->needed_headroom))
+ goto free_skb;
+
+ tnl_params = &tunnel->parms.iph;
+ }
+
+ skb = gre_handle_offloads(skb, !!(tunnel->parms.o_flags&TUNNEL_CSUM));
+ if (IS_ERR(skb))
+ goto out;
+
+ __gre_xmit(skb, dev, tnl_params, skb->protocol);
+
+ return NETDEV_TX_OK;
+
+free_skb:
+ kfree_skb(skb);
+out:
+ dev->stats.tx_dropped++;
+ return NETDEV_TX_OK;
+}
+
+static netdev_tx_t gre_tap_xmit(struct sk_buff *skb,
+ struct net_device *dev)
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+
+ skb = gre_handle_offloads(skb, !!(tunnel->parms.o_flags&TUNNEL_CSUM));
+ if (IS_ERR(skb))
+ goto out;
+
+ if (skb_cow_head(skb, dev->needed_headroom))
+ goto free_skb;
+
+ __gre_xmit(skb, dev, &tunnel->parms.iph, htons(ETH_P_TEB));
+
+ return NETDEV_TX_OK;
+
+free_skb:
+ kfree_skb(skb);
+out:
+ dev->stats.tx_dropped++;
+ return NETDEV_TX_OK;
+}
+
+static int ipgre_tunnel_ioctl(struct net_device *dev,
+ struct ifreq *ifr, int cmd)
+{
+ int err;
+ struct ip_tunnel_parm p;
+
+ if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
+ return -EFAULT;
+ if (cmd == SIOCADDTUNNEL || cmd == SIOCCHGTUNNEL) {
+ if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
+ p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
+ ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
+ return -EINVAL;
+ }
+ p.i_flags = gre_flags_to_tnl_flags(p.i_flags);
+ p.o_flags = gre_flags_to_tnl_flags(p.o_flags);
+
+ err = ip_tunnel_ioctl(dev, &p, cmd);
+ if (err)
+ return err;
+
+ p.i_flags = tnl_flags_to_gre_flags(p.i_flags);
+ p.o_flags = tnl_flags_to_gre_flags(p.o_flags);
+
+ if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
+ return -EFAULT;
+ return 0;
+}
+
+/* Nice toy. Unfortunately, useless in real life :-)
+ It allows to construct virtual multiprotocol broadcast "LAN"
+ over the Internet, provided multicast routing is tuned.
+
+
+ I have no idea was this bicycle invented before me,
+ so that I had to set ARPHRD_IPGRE to a random value.
+ I have an impression, that Cisco could make something similar,
+ but this feature is apparently missing in IOS<=11.2(8).
+
+ I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
+ with broadcast 224.66.66.66. If you have access to mbone, play with me :-)
+
+ ping -t 255 224.66.66.66
+
+ If nobody answers, mbone does not work.
+
+ ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
+ ip addr add 10.66.66.<somewhat>/24 dev Universe
+ ifconfig Universe up
+ ifconfig Universe add fe80::<Your_real_addr>/10
+ ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
+ ftp 10.66.66.66
+ ...
+ ftp fec0:6666:6666::193.233.7.65
+ ...
+ */
+static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
+ unsigned short type,
+ const void *daddr, const void *saddr, unsigned int len)
+{
+ struct ip_tunnel *t = netdev_priv(dev);
+ struct iphdr *iph;
+ struct gre_base_hdr *greh;
+
+ iph = (struct iphdr *)skb_push(skb, t->hlen + sizeof(*iph));
+ greh = (struct gre_base_hdr *)(iph+1);
+ greh->flags = tnl_flags_to_gre_flags(t->parms.o_flags);
+ greh->protocol = htons(type);
+
+ memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
+
+ /* Set the source hardware address. */
+ if (saddr)
+ memcpy(&iph->saddr, saddr, 4);
+ if (daddr)
+ memcpy(&iph->daddr, daddr, 4);
+ if (iph->daddr)
+ return t->hlen + sizeof(*iph);
+
+ return -(t->hlen + sizeof(*iph));
+}
+
+static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
+{
+ const struct iphdr *iph = (const struct iphdr *) skb_mac_header(skb);
+ memcpy(haddr, &iph->saddr, 4);
+ return 4;
+}
+
+static const struct header_ops ipgre_header_ops = {
+ .create = ipgre_header,
+ .parse = ipgre_header_parse,
+};
+
+#ifdef CONFIG_NET_IPGRE_BROADCAST
+static int ipgre_open(struct net_device *dev)
+{
+ struct ip_tunnel *t = netdev_priv(dev);
+
+ if (ipv4_is_multicast(t->parms.iph.daddr)) {
+ struct flowi4 fl4;
+ struct rtable *rt;
+
+ rt = ip_route_output_gre(t->net, &fl4,
+ t->parms.iph.daddr,
+ t->parms.iph.saddr,
+ t->parms.o_key,
+ RT_TOS(t->parms.iph.tos),
+ t->parms.link);
+ if (IS_ERR(rt))
+ return -EADDRNOTAVAIL;
+ dev = rt->dst.dev;
+ ip_rt_put(rt);
+ if (!__in_dev_get_rtnl(dev))
+ return -EADDRNOTAVAIL;
+ t->mlink = dev->ifindex;
+ ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
+ }
+ return 0;
+}
+
+static int ipgre_close(struct net_device *dev)
+{
+ struct ip_tunnel *t = netdev_priv(dev);
+
+ if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
+ struct in_device *in_dev;
+ in_dev = inetdev_by_index(t->net, t->mlink);
+ if (in_dev)
+ ip_mc_dec_group(in_dev, t->parms.iph.daddr);
+ }
+ return 0;
+}
+#endif
+
+static const struct net_device_ops ipgre_netdev_ops = {
+ .ndo_init = ipgre_tunnel_init,
+ .ndo_uninit = ip_tunnel_uninit,
+#ifdef CONFIG_NET_IPGRE_BROADCAST
+ .ndo_open = ipgre_open,
+ .ndo_stop = ipgre_close,
+#endif
+ .ndo_start_xmit = ipgre_xmit,
+ .ndo_do_ioctl = ipgre_tunnel_ioctl,
+ .ndo_change_mtu = ip_tunnel_change_mtu,
+ .ndo_get_stats64 = ip_tunnel_get_stats64,
+ .ndo_get_iflink = ip_tunnel_get_iflink,
+};
+
+#define GRE_FEATURES (NETIF_F_SG | \
+ NETIF_F_FRAGLIST | \
+ NETIF_F_HIGHDMA | \
+ NETIF_F_HW_CSUM)
+
+static void ipgre_tunnel_setup(struct net_device *dev)
+{
+ dev->netdev_ops = &ipgre_netdev_ops;
+ dev->type = ARPHRD_IPGRE;
+ ip_tunnel_setup(dev, ipgre_net_id);
+}
+
+static void __gre_tunnel_init(struct net_device *dev)
+{
+ struct ip_tunnel *tunnel;
+ int t_hlen;
+
+ tunnel = netdev_priv(dev);
+ tunnel->tun_hlen = ip_gre_calc_hlen(tunnel->parms.o_flags);
+ tunnel->parms.iph.protocol = IPPROTO_GRE;
+
+ tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen;
+
+ t_hlen = tunnel->hlen + sizeof(struct iphdr);
+
+ dev->needed_headroom = LL_MAX_HEADER + t_hlen + 4;
+ dev->mtu = ETH_DATA_LEN - t_hlen - 4;
+
+ dev->features |= GRE_FEATURES;
+ dev->hw_features |= GRE_FEATURES;
+
+ if (!(tunnel->parms.o_flags & TUNNEL_SEQ)) {
+ /* TCP offload with GRE SEQ is not supported. */
+ dev->features |= NETIF_F_GSO_SOFTWARE;
+ dev->hw_features |= NETIF_F_GSO_SOFTWARE;
+ /* Can use a lockless transmit, unless we generate
+ * output sequences
+ */
+ dev->features |= NETIF_F_LLTX;
+ }
+}
+
+static int ipgre_tunnel_init(struct net_device *dev)
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ struct iphdr *iph = &tunnel->parms.iph;
+
+ __gre_tunnel_init(dev);
+
+ memcpy(dev->dev_addr, &iph->saddr, 4);
+ memcpy(dev->broadcast, &iph->daddr, 4);
+
+ dev->flags = IFF_NOARP;
+ netif_keep_dst(dev);
+ dev->addr_len = 4;
+
+ if (iph->daddr) {
+#ifdef CONFIG_NET_IPGRE_BROADCAST
+ if (ipv4_is_multicast(iph->daddr)) {
+ if (!iph->saddr)
+ return -EINVAL;
+ dev->flags = IFF_BROADCAST;
+ dev->header_ops = &ipgre_header_ops;
+ }
+#endif
+ } else
+ dev->header_ops = &ipgre_header_ops;
+
+ return ip_tunnel_init(dev);
+}
+
+static struct gre_cisco_protocol ipgre_protocol = {
+ .handler = ipgre_rcv,
+ .err_handler = ipgre_err,
+ .priority = 0,
+};
+
+static int __net_init ipgre_init_net(struct net *net)
+{
+ return ip_tunnel_init_net(net, ipgre_net_id, &ipgre_link_ops, NULL);
+}
+
+static void __net_exit ipgre_exit_net(struct net *net)
+{
+ struct ip_tunnel_net *itn = net_generic(net, ipgre_net_id);
+ ip_tunnel_delete_net(itn, &ipgre_link_ops);
+}
+
+static struct pernet_operations ipgre_net_ops = {
+ .init = ipgre_init_net,
+ .exit = ipgre_exit_net,
+ .id = &ipgre_net_id,
+ .size = sizeof(struct ip_tunnel_net),
+};
+
+static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
+{
+ __be16 flags;
+
+ if (!data)
+ return 0;
+
+ flags = 0;
+ if (data[IFLA_GRE_IFLAGS])
+ flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
+ if (data[IFLA_GRE_OFLAGS])
+ flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
+ if (flags & (GRE_VERSION|GRE_ROUTING))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[])
+{
+ __be32 daddr;
+
+ if (tb[IFLA_ADDRESS]) {
+ if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
+ return -EINVAL;
+ if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
+ return -EADDRNOTAVAIL;
+ }
+
+ if (!data)
+ goto out;
+
+ if (data[IFLA_GRE_REMOTE]) {
+ memcpy(&daddr, nla_data(data[IFLA_GRE_REMOTE]), 4);
+ if (!daddr)
+ return -EINVAL;
+ }
+
+out:
+ return ipgre_tunnel_validate(tb, data);
+}
+
+static void ipgre_netlink_parms(struct nlattr *data[], struct nlattr *tb[],
+ struct ip_tunnel_parm *parms)
+{
+ memset(parms, 0, sizeof(*parms));
+
+ parms->iph.protocol = IPPROTO_GRE;
+
+ if (!data)
+ return;
+
+ if (data[IFLA_GRE_LINK])
+ parms->link = nla_get_u32(data[IFLA_GRE_LINK]);
+
+ if (data[IFLA_GRE_IFLAGS])
+ parms->i_flags = gre_flags_to_tnl_flags(nla_get_be16(data[IFLA_GRE_IFLAGS]));
+
+ if (data[IFLA_GRE_OFLAGS])
+ parms->o_flags = gre_flags_to_tnl_flags(nla_get_be16(data[IFLA_GRE_OFLAGS]));
+
+ if (data[IFLA_GRE_IKEY])
+ parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);
+
+ if (data[IFLA_GRE_OKEY])
+ parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]);
+
+ if (data[IFLA_GRE_LOCAL])
+ parms->iph.saddr = nla_get_in_addr(data[IFLA_GRE_LOCAL]);
+
+ if (data[IFLA_GRE_REMOTE])
+ parms->iph.daddr = nla_get_in_addr(data[IFLA_GRE_REMOTE]);
+
+ if (data[IFLA_GRE_TTL])
+ parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]);
+
+ if (data[IFLA_GRE_TOS])
+ parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]);
+
+ if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC]))
+ parms->iph.frag_off = htons(IP_DF);
+}
+
+/* This function returns true when ENCAP attributes are present in the nl msg */
+static bool ipgre_netlink_encap_parms(struct nlattr *data[],
+ struct ip_tunnel_encap *ipencap)
+{
+ bool ret = false;
+
+ memset(ipencap, 0, sizeof(*ipencap));
+
+ if (!data)
+ return ret;
+
+ if (data[IFLA_GRE_ENCAP_TYPE]) {
+ ret = true;
+ ipencap->type = nla_get_u16(data[IFLA_GRE_ENCAP_TYPE]);
+ }
+
+ if (data[IFLA_GRE_ENCAP_FLAGS]) {
+ ret = true;
+ ipencap->flags = nla_get_u16(data[IFLA_GRE_ENCAP_FLAGS]);
+ }
+
+ if (data[IFLA_GRE_ENCAP_SPORT]) {
+ ret = true;
+ ipencap->sport = nla_get_be16(data[IFLA_GRE_ENCAP_SPORT]);
+ }
+
+ if (data[IFLA_GRE_ENCAP_DPORT]) {
+ ret = true;
+ ipencap->dport = nla_get_be16(data[IFLA_GRE_ENCAP_DPORT]);
+ }
+
+ return ret;
+}
+
+static int gre_tap_init(struct net_device *dev)
+{
+ __gre_tunnel_init(dev);
+ dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
+
+ return ip_tunnel_init(dev);
+}
+
+static const struct net_device_ops gre_tap_netdev_ops = {
+ .ndo_init = gre_tap_init,
+ .ndo_uninit = ip_tunnel_uninit,
+ .ndo_start_xmit = gre_tap_xmit,
+ .ndo_set_mac_address = eth_mac_addr,
+ .ndo_validate_addr = eth_validate_addr,
+ .ndo_change_mtu = ip_tunnel_change_mtu,
+ .ndo_get_stats64 = ip_tunnel_get_stats64,
+ .ndo_get_iflink = ip_tunnel_get_iflink,
+};
+
+static void ipgre_tap_setup(struct net_device *dev)
+{
+ ether_setup(dev);
+ dev->netdev_ops = &gre_tap_netdev_ops;
+ dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
+ ip_tunnel_setup(dev, gre_tap_net_id);
+}
+
+static int ipgre_newlink(struct net *src_net, struct net_device *dev,
+ struct nlattr *tb[], struct nlattr *data[])
+{
+ struct ip_tunnel_parm p;
+ struct ip_tunnel_encap ipencap;
+
+ if (ipgre_netlink_encap_parms(data, &ipencap)) {
+ struct ip_tunnel *t = netdev_priv(dev);
+ int err = ip_tunnel_encap_setup(t, &ipencap);
+
+ if (err < 0)
+ return err;
+ }
+
+ ipgre_netlink_parms(data, tb, &p);
+ return ip_tunnel_newlink(dev, tb, &p);
+}
+
+static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
+ struct nlattr *data[])
+{
+ struct ip_tunnel_parm p;
+ struct ip_tunnel_encap ipencap;
+
+ if (ipgre_netlink_encap_parms(data, &ipencap)) {
+ struct ip_tunnel *t = netdev_priv(dev);
+ int err = ip_tunnel_encap_setup(t, &ipencap);
+
+ if (err < 0)
+ return err;
+ }
+
+ ipgre_netlink_parms(data, tb, &p);
+ return ip_tunnel_changelink(dev, tb, &p);
+}
+
+static size_t ipgre_get_size(const struct net_device *dev)
+{
+ return
+ /* IFLA_GRE_LINK */
+ nla_total_size(4) +
+ /* IFLA_GRE_IFLAGS */
+ nla_total_size(2) +
+ /* IFLA_GRE_OFLAGS */
+ nla_total_size(2) +
+ /* IFLA_GRE_IKEY */
+ nla_total_size(4) +
+ /* IFLA_GRE_OKEY */
+ nla_total_size(4) +
+ /* IFLA_GRE_LOCAL */
+ nla_total_size(4) +
+ /* IFLA_GRE_REMOTE */
+ nla_total_size(4) +
+ /* IFLA_GRE_TTL */
+ nla_total_size(1) +
+ /* IFLA_GRE_TOS */
+ nla_total_size(1) +
+ /* IFLA_GRE_PMTUDISC */
+ nla_total_size(1) +
+ /* IFLA_GRE_ENCAP_TYPE */
+ nla_total_size(2) +
+ /* IFLA_GRE_ENCAP_FLAGS */
+ nla_total_size(2) +
+ /* IFLA_GRE_ENCAP_SPORT */
+ nla_total_size(2) +
+ /* IFLA_GRE_ENCAP_DPORT */
+ nla_total_size(2) +
+ 0;
+}
+
+static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
+{
+ struct ip_tunnel *t = netdev_priv(dev);
+ struct ip_tunnel_parm *p = &t->parms;
+
+ if (nla_put_u32(skb, IFLA_GRE_LINK, p->link) ||
+ nla_put_be16(skb, IFLA_GRE_IFLAGS, tnl_flags_to_gre_flags(p->i_flags)) ||
+ nla_put_be16(skb, IFLA_GRE_OFLAGS, tnl_flags_to_gre_flags(p->o_flags)) ||
+ nla_put_be32(skb, IFLA_GRE_IKEY, p->i_key) ||
+ nla_put_be32(skb, IFLA_GRE_OKEY, p->o_key) ||
+ nla_put_in_addr(skb, IFLA_GRE_LOCAL, p->iph.saddr) ||
+ nla_put_in_addr(skb, IFLA_GRE_REMOTE, p->iph.daddr) ||
+ nla_put_u8(skb, IFLA_GRE_TTL, p->iph.ttl) ||
+ nla_put_u8(skb, IFLA_GRE_TOS, p->iph.tos) ||
+ nla_put_u8(skb, IFLA_GRE_PMTUDISC,
+ !!(p->iph.frag_off & htons(IP_DF))))
+ goto nla_put_failure;
+
+ if (nla_put_u16(skb, IFLA_GRE_ENCAP_TYPE,
+ t->encap.type) ||
+ nla_put_be16(skb, IFLA_GRE_ENCAP_SPORT,
+ t->encap.sport) ||
+ nla_put_be16(skb, IFLA_GRE_ENCAP_DPORT,
+ t->encap.dport) ||
+ nla_put_u16(skb, IFLA_GRE_ENCAP_FLAGS,
+ t->encap.flags))
+ goto nla_put_failure;
+
+ return 0;
+
+nla_put_failure:
+ return -EMSGSIZE;
+}
+
+static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
+ [IFLA_GRE_LINK] = { .type = NLA_U32 },
+ [IFLA_GRE_IFLAGS] = { .type = NLA_U16 },
+ [IFLA_GRE_OFLAGS] = { .type = NLA_U16 },
+ [IFLA_GRE_IKEY] = { .type = NLA_U32 },
+ [IFLA_GRE_OKEY] = { .type = NLA_U32 },
+ [IFLA_GRE_LOCAL] = { .len = FIELD_SIZEOF(struct iphdr, saddr) },
+ [IFLA_GRE_REMOTE] = { .len = FIELD_SIZEOF(struct iphdr, daddr) },
+ [IFLA_GRE_TTL] = { .type = NLA_U8 },
+ [IFLA_GRE_TOS] = { .type = NLA_U8 },
+ [IFLA_GRE_PMTUDISC] = { .type = NLA_U8 },
+ [IFLA_GRE_ENCAP_TYPE] = { .type = NLA_U16 },
+ [IFLA_GRE_ENCAP_FLAGS] = { .type = NLA_U16 },
+ [IFLA_GRE_ENCAP_SPORT] = { .type = NLA_U16 },
+ [IFLA_GRE_ENCAP_DPORT] = { .type = NLA_U16 },
+};
+
+static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
+ .kind = "gre",
+ .maxtype = IFLA_GRE_MAX,
+ .policy = ipgre_policy,
+ .priv_size = sizeof(struct ip_tunnel),
+ .setup = ipgre_tunnel_setup,
+ .validate = ipgre_tunnel_validate,
+ .newlink = ipgre_newlink,
+ .changelink = ipgre_changelink,
+ .dellink = ip_tunnel_dellink,
+ .get_size = ipgre_get_size,
+ .fill_info = ipgre_fill_info,
+ .get_link_net = ip_tunnel_get_link_net,
+};
+
+static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
+ .kind = "gretap",
+ .maxtype = IFLA_GRE_MAX,
+ .policy = ipgre_policy,
+ .priv_size = sizeof(struct ip_tunnel),
+ .setup = ipgre_tap_setup,
+ .validate = ipgre_tap_validate,
+ .newlink = ipgre_newlink,
+ .changelink = ipgre_changelink,
+ .dellink = ip_tunnel_dellink,
+ .get_size = ipgre_get_size,
+ .fill_info = ipgre_fill_info,
+ .get_link_net = ip_tunnel_get_link_net,
+};
+
+static int __net_init ipgre_tap_init_net(struct net *net)
+{
+ return ip_tunnel_init_net(net, gre_tap_net_id, &ipgre_tap_ops, NULL);
+}
+
+static void __net_exit ipgre_tap_exit_net(struct net *net)
+{
+ struct ip_tunnel_net *itn = net_generic(net, gre_tap_net_id);
+ ip_tunnel_delete_net(itn, &ipgre_tap_ops);
+}
+
+static struct pernet_operations ipgre_tap_net_ops = {
+ .init = ipgre_tap_init_net,
+ .exit = ipgre_tap_exit_net,
+ .id = &gre_tap_net_id,
+ .size = sizeof(struct ip_tunnel_net),
+};
+
+static int __init ipgre_init(void)
+{
+ int err;
+
+ pr_info("GRE over IPv4 tunneling driver\n");
+
+ err = register_pernet_device(&ipgre_net_ops);
+ if (err < 0)
+ return err;
+
+ err = register_pernet_device(&ipgre_tap_net_ops);
+ if (err < 0)
+ goto pnet_tap_faied;
+
+ err = gre_cisco_register(&ipgre_protocol);
+ if (err < 0) {
+ pr_info("%s: can't add protocol\n", __func__);
+ goto add_proto_failed;
+ }
+
+ err = rtnl_link_register(&ipgre_link_ops);
+ if (err < 0)
+ goto rtnl_link_failed;
+
+ err = rtnl_link_register(&ipgre_tap_ops);
+ if (err < 0)
+ goto tap_ops_failed;
+
+ return 0;
+
+tap_ops_failed:
+ rtnl_link_unregister(&ipgre_link_ops);
+rtnl_link_failed:
+ gre_cisco_unregister(&ipgre_protocol);
+add_proto_failed:
+ unregister_pernet_device(&ipgre_tap_net_ops);
+pnet_tap_faied:
+ unregister_pernet_device(&ipgre_net_ops);
+ return err;
+}
+
+static void __exit ipgre_fini(void)
+{
+ rtnl_link_unregister(&ipgre_tap_ops);
+ rtnl_link_unregister(&ipgre_link_ops);
+ gre_cisco_unregister(&ipgre_protocol);
+ unregister_pernet_device(&ipgre_tap_net_ops);
+ unregister_pernet_device(&ipgre_net_ops);
+}
+
+module_init(ipgre_init);
+module_exit(ipgre_fini);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_RTNL_LINK("gre");
+MODULE_ALIAS_RTNL_LINK("gretap");
+MODULE_ALIAS_NETDEV("gre0");
+MODULE_ALIAS_NETDEV("gretap0");
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
new file mode 100644
index 000000000..2db4c8773
--- /dev/null
+++ b/net/ipv4/ip_input.c
@@ -0,0 +1,467 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * The Internet Protocol (IP) module.
+ *
+ * Authors: Ross Biro
+ * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ * Donald Becker, <becker@super.org>
+ * Alan Cox, <alan@lxorguk.ukuu.org.uk>
+ * Richard Underwood
+ * Stefan Becker, <stefanb@yello.ping.de>
+ * Jorge Cwik, <jorge@laser.satlink.net>
+ * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
+ *
+ *
+ * Fixes:
+ * Alan Cox : Commented a couple of minor bits of surplus code
+ * Alan Cox : Undefining IP_FORWARD doesn't include the code
+ * (just stops a compiler warning).
+ * Alan Cox : Frames with >=MAX_ROUTE record routes, strict routes or loose routes
+ * are junked rather than corrupting things.
+ * Alan Cox : Frames to bad broadcast subnets are dumped
+ * We used to process them non broadcast and
+ * boy could that cause havoc.
+ * Alan Cox : ip_forward sets the free flag on the
+ * new frame it queues. Still crap because
+ * it copies the frame but at least it
+ * doesn't eat memory too.
+ * Alan Cox : Generic queue code and memory fixes.
+ * Fred Van Kempen : IP fragment support (borrowed from NET2E)
+ * Gerhard Koerting: Forward fragmented frames correctly.
+ * Gerhard Koerting: Fixes to my fix of the above 8-).
+ * Gerhard Koerting: IP interface addressing fix.
+ * Linus Torvalds : More robustness checks
+ * Alan Cox : Even more checks: Still not as robust as it ought to be
+ * Alan Cox : Save IP header pointer for later
+ * Alan Cox : ip option setting
+ * Alan Cox : Use ip_tos/ip_ttl settings
+ * Alan Cox : Fragmentation bogosity removed
+ * (Thanks to Mark.Bush@prg.ox.ac.uk)
+ * Dmitry Gorodchanin : Send of a raw packet crash fix.
+ * Alan Cox : Silly ip bug when an overlength
+ * fragment turns up. Now frees the
+ * queue.
+ * Linus Torvalds/ : Memory leakage on fragmentation
+ * Alan Cox : handling.
+ * Gerhard Koerting: Forwarding uses IP priority hints
+ * Teemu Rantanen : Fragment problems.
+ * Alan Cox : General cleanup, comments and reformat
+ * Alan Cox : SNMP statistics
+ * Alan Cox : BSD address rule semantics. Also see
+ * UDP as there is a nasty checksum issue
+ * if you do things the wrong way.
+ * Alan Cox : Always defrag, moved IP_FORWARD to the config.in file
+ * Alan Cox : IP options adjust sk->priority.
+ * Pedro Roque : Fix mtu/length error in ip_forward.
+ * Alan Cox : Avoid ip_chk_addr when possible.
+ * Richard Underwood : IP multicasting.
+ * Alan Cox : Cleaned up multicast handlers.
+ * Alan Cox : RAW sockets demultiplex in the BSD style.
+ * Gunther Mayer : Fix the SNMP reporting typo
+ * Alan Cox : Always in group 224.0.0.1
+ * Pauline Middelink : Fast ip_checksum update when forwarding
+ * Masquerading support.
+ * Alan Cox : Multicast loopback error for 224.0.0.1
+ * Alan Cox : IP_MULTICAST_LOOP option.
+ * Alan Cox : Use notifiers.
+ * Bjorn Ekwall : Removed ip_csum (from slhc.c too)
+ * Bjorn Ekwall : Moved ip_fast_csum to ip.h (inline!)
+ * Stefan Becker : Send out ICMP HOST REDIRECT
+ * Arnt Gulbrandsen : ip_build_xmit
+ * Alan Cox : Per socket routing cache
+ * Alan Cox : Fixed routing cache, added header cache.
+ * Alan Cox : Loopback didn't work right in original ip_build_xmit - fixed it.
+ * Alan Cox : Only send ICMP_REDIRECT if src/dest are the same net.
+ * Alan Cox : Incoming IP option handling.
+ * Alan Cox : Set saddr on raw output frames as per BSD.
+ * Alan Cox : Stopped broadcast source route explosions.
+ * Alan Cox : Can disable source routing
+ * Takeshi Sone : Masquerading didn't work.
+ * Dave Bonn,Alan Cox : Faster IP forwarding whenever possible.
+ * Alan Cox : Memory leaks, tramples, misc debugging.
+ * Alan Cox : Fixed multicast (by popular demand 8))
+ * Alan Cox : Fixed forwarding (by even more popular demand 8))
+ * Alan Cox : Fixed SNMP statistics [I think]
+ * Gerhard Koerting : IP fragmentation forwarding fix
+ * Alan Cox : Device lock against page fault.
+ * Alan Cox : IP_HDRINCL facility.
+ * Werner Almesberger : Zero fragment bug
+ * Alan Cox : RAW IP frame length bug
+ * Alan Cox : Outgoing firewall on build_xmit
+ * A.N.Kuznetsov : IP_OPTIONS support throughout the kernel
+ * Alan Cox : Multicast routing hooks
+ * Jos Vos : Do accounting *before* call_in_firewall
+ * Willy Konynenberg : Transparent proxying support
+ *
+ *
+ *
+ * To Fix:
+ * IP fragmentation wants rewriting cleanly. The RFC815 algorithm is much more efficient
+ * and could be made very efficient with the addition of some virtual memory hacks to permit
+ * the allocation of a buffer that can then be 'grown' by twiddling page tables.
+ * Output fragmentation wants updating along with the buffer management to use a single
+ * interleaved copy algorithm so that fragmenting has a one copy overhead. Actual packet
+ * output should probably do its own fragmentation at the UDP/RAW layer. TCP shouldn't cause
+ * fragmentation anyway.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define pr_fmt(fmt) "IPv4: " fmt
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+
+#include <linux/net.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/inetdevice.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+
+#include <net/snmp.h>
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <net/route.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/arp.h>
+#include <net/icmp.h>
+#include <net/raw.h>
+#include <net/checksum.h>
+#include <net/inet_ecn.h>
+#include <linux/netfilter_ipv4.h>
+#include <net/xfrm.h>
+#include <linux/mroute.h>
+#include <linux/netlink.h>
+
+/*
+ * Process Router Attention IP option (RFC 2113)
+ */
+bool ip_call_ra_chain(struct sk_buff *skb)
+{
+ struct ip_ra_chain *ra;
+ u8 protocol = ip_hdr(skb)->protocol;
+ struct sock *last = NULL;
+ struct net_device *dev = skb->dev;
+
+ for (ra = rcu_dereference(ip_ra_chain); ra; ra = rcu_dereference(ra->next)) {
+ struct sock *sk = ra->sk;
+
+ /* If socket is bound to an interface, only report
+ * the packet if it came from that interface.
+ */
+ if (sk && inet_sk(sk)->inet_num == protocol &&
+ (!sk->sk_bound_dev_if ||
+ sk->sk_bound_dev_if == dev->ifindex) &&
+ net_eq(sock_net(sk), dev_net(dev))) {
+ if (ip_is_fragment(ip_hdr(skb))) {
+ if (ip_defrag(skb, IP_DEFRAG_CALL_RA_CHAIN))
+ return true;
+ }
+ if (last) {
+ struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
+ if (skb2)
+ raw_rcv(last, skb2);
+ }
+ last = sk;
+ }
+ }
+
+ if (last) {
+ raw_rcv(last, skb);
+ return true;
+ }
+ return false;
+}
+
+static int ip_local_deliver_finish(struct sock *sk, struct sk_buff *skb)
+{
+ struct net *net = dev_net(skb->dev);
+
+ __skb_pull(skb, skb_network_header_len(skb));
+
+ rcu_read_lock();
+ {
+ int protocol = ip_hdr(skb)->protocol;
+ const struct net_protocol *ipprot;
+ int raw;
+
+ resubmit:
+ raw = raw_local_deliver(skb, protocol);
+
+ ipprot = rcu_dereference(inet_protos[protocol]);
+ if (ipprot) {
+ int ret;
+
+ if (!ipprot->no_policy) {
+ if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
+ kfree_skb(skb);
+ goto out;
+ }
+ nf_reset(skb);
+ }
+ ret = ipprot->handler(skb);
+ if (ret < 0) {
+ protocol = -ret;
+ goto resubmit;
+ }
+ IP_INC_STATS_BH(net, IPSTATS_MIB_INDELIVERS);
+ } else {
+ if (!raw) {
+ if (xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
+ IP_INC_STATS_BH(net, IPSTATS_MIB_INUNKNOWNPROTOS);
+ icmp_send(skb, ICMP_DEST_UNREACH,
+ ICMP_PROT_UNREACH, 0);
+ }
+ kfree_skb(skb);
+ } else {
+ IP_INC_STATS_BH(net, IPSTATS_MIB_INDELIVERS);
+ consume_skb(skb);
+ }
+ }
+ }
+ out:
+ rcu_read_unlock();
+
+ return 0;
+}
+
+/*
+ * Deliver IP Packets to the higher protocol layers.
+ */
+int ip_local_deliver(struct sk_buff *skb)
+{
+ /*
+ * Reassemble IP fragments.
+ */
+
+ if (ip_is_fragment(ip_hdr(skb))) {
+ if (ip_defrag(skb, IP_DEFRAG_LOCAL_DELIVER))
+ return 0;
+ }
+
+ return NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_IN, NULL, skb,
+ skb->dev, NULL,
+ ip_local_deliver_finish);
+}
+
+static inline bool ip_rcv_options(struct sk_buff *skb)
+{
+ struct ip_options *opt;
+ const struct iphdr *iph;
+ struct net_device *dev = skb->dev;
+
+ /* It looks as overkill, because not all
+ IP options require packet mangling.
+ But it is the easiest for now, especially taking
+ into account that combination of IP options
+ and running sniffer is extremely rare condition.
+ --ANK (980813)
+ */
+ if (skb_cow(skb, skb_headroom(skb))) {
+ IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INDISCARDS);
+ goto drop;
+ }
+
+ iph = ip_hdr(skb);
+ opt = &(IPCB(skb)->opt);
+ opt->optlen = iph->ihl*4 - sizeof(struct iphdr);
+
+ if (ip_options_compile(dev_net(dev), opt, skb)) {
+ IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INHDRERRORS);
+ goto drop;
+ }
+
+ if (unlikely(opt->srr)) {
+ struct in_device *in_dev = __in_dev_get_rcu(dev);
+
+ if (in_dev) {
+ if (!IN_DEV_SOURCE_ROUTE(in_dev)) {
+ if (IN_DEV_LOG_MARTIANS(in_dev))
+ net_info_ratelimited("source route option %pI4 -> %pI4\n",
+ &iph->saddr,
+ &iph->daddr);
+ goto drop;
+ }
+ }
+
+ if (ip_options_rcv_srr(skb))
+ goto drop;
+ }
+
+ return false;
+drop:
+ return true;
+}
+
+int sysctl_ip_early_demux __read_mostly = 1;
+EXPORT_SYMBOL(sysctl_ip_early_demux);
+
+static int ip_rcv_finish(struct sock *sk, struct sk_buff *skb)
+{
+ const struct iphdr *iph = ip_hdr(skb);
+ struct rtable *rt;
+
+ if (sysctl_ip_early_demux && !skb_dst(skb) && !skb->sk) {
+ const struct net_protocol *ipprot;
+ int protocol = iph->protocol;
+
+ ipprot = rcu_dereference(inet_protos[protocol]);
+ if (ipprot && ipprot->early_demux) {
+ ipprot->early_demux(skb);
+ /* must reload iph, skb->head might have changed */
+ iph = ip_hdr(skb);
+ }
+ }
+
+ /*
+ * Initialise the virtual path cache for the packet. It describes
+ * how the packet travels inside Linux networking.
+ */
+ if (!skb_dst(skb)) {
+ int err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
+ iph->tos, skb->dev);
+ if (unlikely(err)) {
+ if (err == -EXDEV)
+ NET_INC_STATS_BH(dev_net(skb->dev),
+ LINUX_MIB_IPRPFILTER);
+ goto drop;
+ }
+ }
+
+#ifdef CONFIG_IP_ROUTE_CLASSID
+ if (unlikely(skb_dst(skb)->tclassid)) {
+ struct ip_rt_acct *st = this_cpu_ptr(ip_rt_acct);
+ u32 idx = skb_dst(skb)->tclassid;
+ st[idx&0xFF].o_packets++;
+ st[idx&0xFF].o_bytes += skb->len;
+ st[(idx>>16)&0xFF].i_packets++;
+ st[(idx>>16)&0xFF].i_bytes += skb->len;
+ }
+#endif
+
+ if (iph->ihl > 5 && ip_rcv_options(skb))
+ goto drop;
+
+ rt = skb_rtable(skb);
+ if (rt->rt_type == RTN_MULTICAST) {
+ IP_UPD_PO_STATS_BH(dev_net(rt->dst.dev), IPSTATS_MIB_INMCAST,
+ skb->len);
+ } else if (rt->rt_type == RTN_BROADCAST)
+ IP_UPD_PO_STATS_BH(dev_net(rt->dst.dev), IPSTATS_MIB_INBCAST,
+ skb->len);
+
+ return dst_input(skb);
+
+drop:
+ kfree_skb(skb);
+ return NET_RX_DROP;
+}
+
+/*
+ * Main IP Receive routine.
+ */
+int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
+{
+ const struct iphdr *iph;
+ u32 len;
+
+ /* When the interface is in promisc. mode, drop all the crap
+ * that it receives, do not try to analyse it.
+ */
+ if (skb->pkt_type == PACKET_OTHERHOST)
+ goto drop;
+
+
+ IP_UPD_PO_STATS_BH(dev_net(dev), IPSTATS_MIB_IN, skb->len);
+
+ skb = skb_share_check(skb, GFP_ATOMIC);
+ if (!skb) {
+ IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INDISCARDS);
+ goto out;
+ }
+
+ if (!pskb_may_pull(skb, sizeof(struct iphdr)))
+ goto inhdr_error;
+
+ iph = ip_hdr(skb);
+
+ /*
+ * RFC1122: 3.2.1.2 MUST silently discard any IP frame that fails the checksum.
+ *
+ * Is the datagram acceptable?
+ *
+ * 1. Length at least the size of an ip header
+ * 2. Version of 4
+ * 3. Checksums correctly. [Speed optimisation for later, skip loopback checksums]
+ * 4. Doesn't have a bogus length
+ */
+
+ if (iph->ihl < 5 || iph->version != 4)
+ goto inhdr_error;
+
+ BUILD_BUG_ON(IPSTATS_MIB_ECT1PKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_ECT_1);
+ BUILD_BUG_ON(IPSTATS_MIB_ECT0PKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_ECT_0);
+ BUILD_BUG_ON(IPSTATS_MIB_CEPKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_CE);
+ IP_ADD_STATS_BH(dev_net(dev),
+ IPSTATS_MIB_NOECTPKTS + (iph->tos & INET_ECN_MASK),
+ max_t(unsigned short, 1, skb_shinfo(skb)->gso_segs));
+
+ if (!pskb_may_pull(skb, iph->ihl*4))
+ goto inhdr_error;
+
+ iph = ip_hdr(skb);
+
+ if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
+ goto csum_error;
+
+ len = ntohs(iph->tot_len);
+ if (skb->len < len) {
+ IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INTRUNCATEDPKTS);
+ goto drop;
+ } else if (len < (iph->ihl*4))
+ goto inhdr_error;
+
+ /* Our transport medium may have padded the buffer out. Now we know it
+ * is IP we can trim to the true length of the frame.
+ * Note this now means skb->len holds ntohs(iph->tot_len).
+ */
+ if (pskb_trim_rcsum(skb, len)) {
+ IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INDISCARDS);
+ goto drop;
+ }
+
+ skb->transport_header = skb->network_header + iph->ihl*4;
+
+ /* Remove any debris in the socket control block */
+ memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
+
+ /* Must drop socket now because of tproxy. */
+ skb_orphan(skb);
+
+ return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, NULL, skb,
+ dev, NULL,
+ ip_rcv_finish);
+
+csum_error:
+ IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_CSUMERRORS);
+inhdr_error:
+ IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INHDRERRORS);
+drop:
+ kfree_skb(skb);
+out:
+ return NET_RX_DROP;
+}
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
new file mode 100644
index 000000000..bd2467923
--- /dev/null
+++ b/net/ipv4/ip_options.c
@@ -0,0 +1,663 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * The options processing module for ip.c
+ *
+ * Authors: A.N.Kuznetsov
+ *
+ */
+
+#define pr_fmt(fmt) "IPv4: " fmt
+
+#include <linux/capability.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <asm/uaccess.h>
+#include <asm/unaligned.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <linux/icmp.h>
+#include <linux/netdevice.h>
+#include <linux/rtnetlink.h>
+#include <net/sock.h>
+#include <net/ip.h>
+#include <net/icmp.h>
+#include <net/route.h>
+#include <net/cipso_ipv4.h>
+#include <net/ip_fib.h>
+
+/*
+ * Write options to IP header, record destination address to
+ * source route option, address of outgoing interface
+ * (we should already know it, so that this function is allowed be
+ * called only after routing decision) and timestamp,
+ * if we originate this datagram.
+ *
+ * daddr is real destination address, next hop is recorded in IP header.
+ * saddr is address of outgoing interface.
+ */
+
+void ip_options_build(struct sk_buff *skb, struct ip_options *opt,
+ __be32 daddr, struct rtable *rt, int is_frag)
+{
+ unsigned char *iph = skb_network_header(skb);
+
+ memcpy(&(IPCB(skb)->opt), opt, sizeof(struct ip_options));
+ memcpy(iph+sizeof(struct iphdr), opt->__data, opt->optlen);
+ opt = &(IPCB(skb)->opt);
+
+ if (opt->srr)
+ memcpy(iph+opt->srr+iph[opt->srr+1]-4, &daddr, 4);
+
+ if (!is_frag) {
+ if (opt->rr_needaddr)
+ ip_rt_get_source(iph+opt->rr+iph[opt->rr+2]-5, skb, rt);
+ if (opt->ts_needaddr)
+ ip_rt_get_source(iph+opt->ts+iph[opt->ts+2]-9, skb, rt);
+ if (opt->ts_needtime) {
+ struct timespec tv;
+ __be32 midtime;
+ getnstimeofday(&tv);
+ midtime = htonl((tv.tv_sec % 86400) * MSEC_PER_SEC + tv.tv_nsec / NSEC_PER_MSEC);
+ memcpy(iph+opt->ts+iph[opt->ts+2]-5, &midtime, 4);
+ }
+ return;
+ }
+ if (opt->rr) {
+ memset(iph+opt->rr, IPOPT_NOP, iph[opt->rr+1]);
+ opt->rr = 0;
+ opt->rr_needaddr = 0;
+ }
+ if (opt->ts) {
+ memset(iph+opt->ts, IPOPT_NOP, iph[opt->ts+1]);
+ opt->ts = 0;
+ opt->ts_needaddr = opt->ts_needtime = 0;
+ }
+}
+
+/*
+ * Provided (sopt, skb) points to received options,
+ * build in dopt compiled option set appropriate for answering.
+ * i.e. invert SRR option, copy anothers,
+ * and grab room in RR/TS options.
+ *
+ * NOTE: dopt cannot point to skb.
+ */
+
+int __ip_options_echo(struct ip_options *dopt, struct sk_buff *skb,
+ const struct ip_options *sopt)
+{
+ unsigned char *sptr, *dptr;
+ int soffset, doffset;
+ int optlen;
+
+ memset(dopt, 0, sizeof(struct ip_options));
+
+ if (sopt->optlen == 0)
+ return 0;
+
+ sptr = skb_network_header(skb);
+ dptr = dopt->__data;
+
+ if (sopt->rr) {
+ optlen = sptr[sopt->rr+1];
+ soffset = sptr[sopt->rr+2];
+ dopt->rr = dopt->optlen + sizeof(struct iphdr);
+ memcpy(dptr, sptr+sopt->rr, optlen);
+ if (sopt->rr_needaddr && soffset <= optlen) {
+ if (soffset + 3 > optlen)
+ return -EINVAL;
+ dptr[2] = soffset + 4;
+ dopt->rr_needaddr = 1;
+ }
+ dptr += optlen;
+ dopt->optlen += optlen;
+ }
+ if (sopt->ts) {
+ optlen = sptr[sopt->ts+1];
+ soffset = sptr[sopt->ts+2];
+ dopt->ts = dopt->optlen + sizeof(struct iphdr);
+ memcpy(dptr, sptr+sopt->ts, optlen);
+ if (soffset <= optlen) {
+ if (sopt->ts_needaddr) {
+ if (soffset + 3 > optlen)
+ return -EINVAL;
+ dopt->ts_needaddr = 1;
+ soffset += 4;
+ }
+ if (sopt->ts_needtime) {
+ if (soffset + 3 > optlen)
+ return -EINVAL;
+ if ((dptr[3]&0xF) != IPOPT_TS_PRESPEC) {
+ dopt->ts_needtime = 1;
+ soffset += 4;
+ } else {
+ dopt->ts_needtime = 0;
+
+ if (soffset + 7 <= optlen) {
+ __be32 addr;
+
+ memcpy(&addr, dptr+soffset-1, 4);
+ if (inet_addr_type(dev_net(skb_dst(skb)->dev), addr) != RTN_UNICAST) {
+ dopt->ts_needtime = 1;
+ soffset += 8;
+ }
+ }
+ }
+ }
+ dptr[2] = soffset;
+ }
+ dptr += optlen;
+ dopt->optlen += optlen;
+ }
+ if (sopt->srr) {
+ unsigned char *start = sptr+sopt->srr;
+ __be32 faddr;
+
+ optlen = start[1];
+ soffset = start[2];
+ doffset = 0;
+ if (soffset > optlen)
+ soffset = optlen + 1;
+ soffset -= 4;
+ if (soffset > 3) {
+ memcpy(&faddr, &start[soffset-1], 4);
+ for (soffset -= 4, doffset = 4; soffset > 3; soffset -= 4, doffset += 4)
+ memcpy(&dptr[doffset-1], &start[soffset-1], 4);
+ /*
+ * RFC1812 requires to fix illegal source routes.
+ */
+ if (memcmp(&ip_hdr(skb)->saddr,
+ &start[soffset + 3], 4) == 0)
+ doffset -= 4;
+ }
+ if (doffset > 3) {
+ __be32 daddr = fib_compute_spec_dst(skb);
+
+ memcpy(&start[doffset-1], &daddr, 4);
+ dopt->faddr = faddr;
+ dptr[0] = start[0];
+ dptr[1] = doffset+3;
+ dptr[2] = 4;
+ dptr += doffset+3;
+ dopt->srr = dopt->optlen + sizeof(struct iphdr);
+ dopt->optlen += doffset+3;
+ dopt->is_strictroute = sopt->is_strictroute;
+ }
+ }
+ if (sopt->cipso) {
+ optlen = sptr[sopt->cipso+1];
+ dopt->cipso = dopt->optlen+sizeof(struct iphdr);
+ memcpy(dptr, sptr+sopt->cipso, optlen);
+ dptr += optlen;
+ dopt->optlen += optlen;
+ }
+ while (dopt->optlen & 3) {
+ *dptr++ = IPOPT_END;
+ dopt->optlen++;
+ }
+ return 0;
+}
+
+/*
+ * Options "fragmenting", just fill options not
+ * allowed in fragments with NOOPs.
+ * Simple and stupid 8), but the most efficient way.
+ */
+
+void ip_options_fragment(struct sk_buff *skb)
+{
+ unsigned char *optptr = skb_network_header(skb) + sizeof(struct iphdr);
+ struct ip_options *opt = &(IPCB(skb)->opt);
+ int l = opt->optlen;
+ int optlen;
+
+ while (l > 0) {
+ switch (*optptr) {
+ case IPOPT_END:
+ return;
+ case IPOPT_NOOP:
+ l--;
+ optptr++;
+ continue;
+ }
+ optlen = optptr[1];
+ if (optlen < 2 || optlen > l)
+ return;
+ if (!IPOPT_COPIED(*optptr))
+ memset(optptr, IPOPT_NOOP, optlen);
+ l -= optlen;
+ optptr += optlen;
+ }
+ opt->ts = 0;
+ opt->rr = 0;
+ opt->rr_needaddr = 0;
+ opt->ts_needaddr = 0;
+ opt->ts_needtime = 0;
+}
+
+/* helper used by ip_options_compile() to call fib_compute_spec_dst()
+ * at most one time.
+ */
+static void spec_dst_fill(__be32 *spec_dst, struct sk_buff *skb)
+{
+ if (*spec_dst == htonl(INADDR_ANY))
+ *spec_dst = fib_compute_spec_dst(skb);
+}
+
+/*
+ * Verify options and fill pointers in struct options.
+ * Caller should clear *opt, and set opt->data.
+ * If opt == NULL, then skb->data should point to IP header.
+ */
+
+int ip_options_compile(struct net *net,
+ struct ip_options *opt, struct sk_buff *skb)
+{
+ __be32 spec_dst = htonl(INADDR_ANY);
+ unsigned char *pp_ptr = NULL;
+ struct rtable *rt = NULL;
+ unsigned char *optptr;
+ unsigned char *iph;
+ int optlen, l;
+
+ if (skb) {
+ rt = skb_rtable(skb);
+ optptr = (unsigned char *)&(ip_hdr(skb)[1]);
+ } else
+ optptr = opt->__data;
+ iph = optptr - sizeof(struct iphdr);
+
+ for (l = opt->optlen; l > 0; ) {
+ switch (*optptr) {
+ case IPOPT_END:
+ for (optptr++, l--; l > 0; optptr++, l--) {
+ if (*optptr != IPOPT_END) {
+ *optptr = IPOPT_END;
+ opt->is_changed = 1;
+ }
+ }
+ goto eol;
+ case IPOPT_NOOP:
+ l--;
+ optptr++;
+ continue;
+ }
+ if (unlikely(l < 2)) {
+ pp_ptr = optptr;
+ goto error;
+ }
+ optlen = optptr[1];
+ if (optlen < 2 || optlen > l) {
+ pp_ptr = optptr;
+ goto error;
+ }
+ switch (*optptr) {
+ case IPOPT_SSRR:
+ case IPOPT_LSRR:
+ if (optlen < 3) {
+ pp_ptr = optptr + 1;
+ goto error;
+ }
+ if (optptr[2] < 4) {
+ pp_ptr = optptr + 2;
+ goto error;
+ }
+ /* NB: cf RFC-1812 5.2.4.1 */
+ if (opt->srr) {
+ pp_ptr = optptr;
+ goto error;
+ }
+ if (!skb) {
+ if (optptr[2] != 4 || optlen < 7 || ((optlen-3) & 3)) {
+ pp_ptr = optptr + 1;
+ goto error;
+ }
+ memcpy(&opt->faddr, &optptr[3], 4);
+ if (optlen > 7)
+ memmove(&optptr[3], &optptr[7], optlen-7);
+ }
+ opt->is_strictroute = (optptr[0] == IPOPT_SSRR);
+ opt->srr = optptr - iph;
+ break;
+ case IPOPT_RR:
+ if (opt->rr) {
+ pp_ptr = optptr;
+ goto error;
+ }
+ if (optlen < 3) {
+ pp_ptr = optptr + 1;
+ goto error;
+ }
+ if (optptr[2] < 4) {
+ pp_ptr = optptr + 2;
+ goto error;
+ }
+ if (optptr[2] <= optlen) {
+ if (optptr[2]+3 > optlen) {
+ pp_ptr = optptr + 2;
+ goto error;
+ }
+ if (rt) {
+ spec_dst_fill(&spec_dst, skb);
+ memcpy(&optptr[optptr[2]-1], &spec_dst, 4);
+ opt->is_changed = 1;
+ }
+ optptr[2] += 4;
+ opt->rr_needaddr = 1;
+ }
+ opt->rr = optptr - iph;
+ break;
+ case IPOPT_TIMESTAMP:
+ if (opt->ts) {
+ pp_ptr = optptr;
+ goto error;
+ }
+ if (optlen < 4) {
+ pp_ptr = optptr + 1;
+ goto error;
+ }
+ if (optptr[2] < 5) {
+ pp_ptr = optptr + 2;
+ goto error;
+ }
+ if (optptr[2] <= optlen) {
+ unsigned char *timeptr = NULL;
+ if (optptr[2]+3 > optlen) {
+ pp_ptr = optptr + 2;
+ goto error;
+ }
+ switch (optptr[3]&0xF) {
+ case IPOPT_TS_TSONLY:
+ if (skb)
+ timeptr = &optptr[optptr[2]-1];
+ opt->ts_needtime = 1;
+ optptr[2] += 4;
+ break;
+ case IPOPT_TS_TSANDADDR:
+ if (optptr[2]+7 > optlen) {
+ pp_ptr = optptr + 2;
+ goto error;
+ }
+ if (rt) {
+ spec_dst_fill(&spec_dst, skb);
+ memcpy(&optptr[optptr[2]-1], &spec_dst, 4);
+ timeptr = &optptr[optptr[2]+3];
+ }
+ opt->ts_needaddr = 1;
+ opt->ts_needtime = 1;
+ optptr[2] += 8;
+ break;
+ case IPOPT_TS_PRESPEC:
+ if (optptr[2]+7 > optlen) {
+ pp_ptr = optptr + 2;
+ goto error;
+ }
+ {
+ __be32 addr;
+ memcpy(&addr, &optptr[optptr[2]-1], 4);
+ if (inet_addr_type(net, addr) == RTN_UNICAST)
+ break;
+ if (skb)
+ timeptr = &optptr[optptr[2]+3];
+ }
+ opt->ts_needtime = 1;
+ optptr[2] += 8;
+ break;
+ default:
+ if (!skb && !ns_capable(net->user_ns, CAP_NET_RAW)) {
+ pp_ptr = optptr + 3;
+ goto error;
+ }
+ break;
+ }
+ if (timeptr) {
+ struct timespec tv;
+ u32 midtime;
+ getnstimeofday(&tv);
+ midtime = (tv.tv_sec % 86400) * MSEC_PER_SEC + tv.tv_nsec / NSEC_PER_MSEC;
+ put_unaligned_be32(midtime, timeptr);
+ opt->is_changed = 1;
+ }
+ } else if ((optptr[3]&0xF) != IPOPT_TS_PRESPEC) {
+ unsigned int overflow = optptr[3]>>4;
+ if (overflow == 15) {
+ pp_ptr = optptr + 3;
+ goto error;
+ }
+ if (skb) {
+ optptr[3] = (optptr[3]&0xF)|((overflow+1)<<4);
+ opt->is_changed = 1;
+ }
+ }
+ opt->ts = optptr - iph;
+ break;
+ case IPOPT_RA:
+ if (optlen < 4) {
+ pp_ptr = optptr + 1;
+ goto error;
+ }
+ if (optptr[2] == 0 && optptr[3] == 0)
+ opt->router_alert = optptr - iph;
+ break;
+ case IPOPT_CIPSO:
+ if ((!skb && !ns_capable(net->user_ns, CAP_NET_RAW)) || opt->cipso) {
+ pp_ptr = optptr;
+ goto error;
+ }
+ opt->cipso = optptr - iph;
+ if (cipso_v4_validate(skb, &optptr)) {
+ pp_ptr = optptr;
+ goto error;
+ }
+ break;
+ case IPOPT_SEC:
+ case IPOPT_SID:
+ default:
+ if (!skb && !ns_capable(net->user_ns, CAP_NET_RAW)) {
+ pp_ptr = optptr;
+ goto error;
+ }
+ break;
+ }
+ l -= optlen;
+ optptr += optlen;
+ }
+
+eol:
+ if (!pp_ptr)
+ return 0;
+
+error:
+ if (skb) {
+ icmp_send(skb, ICMP_PARAMETERPROB, 0, htonl((pp_ptr-iph)<<24));
+ }
+ return -EINVAL;
+}
+EXPORT_SYMBOL(ip_options_compile);
+
+/*
+ * Undo all the changes done by ip_options_compile().
+ */
+
+void ip_options_undo(struct ip_options *opt)
+{
+ if (opt->srr) {
+ unsigned char *optptr = opt->__data+opt->srr-sizeof(struct iphdr);
+ memmove(optptr+7, optptr+3, optptr[1]-7);
+ memcpy(optptr+3, &opt->faddr, 4);
+ }
+ if (opt->rr_needaddr) {
+ unsigned char *optptr = opt->__data+opt->rr-sizeof(struct iphdr);
+ optptr[2] -= 4;
+ memset(&optptr[optptr[2]-1], 0, 4);
+ }
+ if (opt->ts) {
+ unsigned char *optptr = opt->__data+opt->ts-sizeof(struct iphdr);
+ if (opt->ts_needtime) {
+ optptr[2] -= 4;
+ memset(&optptr[optptr[2]-1], 0, 4);
+ if ((optptr[3]&0xF) == IPOPT_TS_PRESPEC)
+ optptr[2] -= 4;
+ }
+ if (opt->ts_needaddr) {
+ optptr[2] -= 4;
+ memset(&optptr[optptr[2]-1], 0, 4);
+ }
+ }
+}
+
+static struct ip_options_rcu *ip_options_get_alloc(const int optlen)
+{
+ return kzalloc(sizeof(struct ip_options_rcu) + ((optlen + 3) & ~3),
+ GFP_KERNEL);
+}
+
+static int ip_options_get_finish(struct net *net, struct ip_options_rcu **optp,
+ struct ip_options_rcu *opt, int optlen)
+{
+ while (optlen & 3)
+ opt->opt.__data[optlen++] = IPOPT_END;
+ opt->opt.optlen = optlen;
+ if (optlen && ip_options_compile(net, &opt->opt, NULL)) {
+ kfree(opt);
+ return -EINVAL;
+ }
+ kfree(*optp);
+ *optp = opt;
+ return 0;
+}
+
+int ip_options_get_from_user(struct net *net, struct ip_options_rcu **optp,
+ unsigned char __user *data, int optlen)
+{
+ struct ip_options_rcu *opt = ip_options_get_alloc(optlen);
+
+ if (!opt)
+ return -ENOMEM;
+ if (optlen && copy_from_user(opt->opt.__data, data, optlen)) {
+ kfree(opt);
+ return -EFAULT;
+ }
+ return ip_options_get_finish(net, optp, opt, optlen);
+}
+
+int ip_options_get(struct net *net, struct ip_options_rcu **optp,
+ unsigned char *data, int optlen)
+{
+ struct ip_options_rcu *opt = ip_options_get_alloc(optlen);
+
+ if (!opt)
+ return -ENOMEM;
+ if (optlen)
+ memcpy(opt->opt.__data, data, optlen);
+ return ip_options_get_finish(net, optp, opt, optlen);
+}
+
+void ip_forward_options(struct sk_buff *skb)
+{
+ struct ip_options *opt = &(IPCB(skb)->opt);
+ unsigned char *optptr;
+ struct rtable *rt = skb_rtable(skb);
+ unsigned char *raw = skb_network_header(skb);
+
+ if (opt->rr_needaddr) {
+ optptr = (unsigned char *)raw + opt->rr;
+ ip_rt_get_source(&optptr[optptr[2]-5], skb, rt);
+ opt->is_changed = 1;
+ }
+ if (opt->srr_is_hit) {
+ int srrptr, srrspace;
+
+ optptr = raw + opt->srr;
+
+ for ( srrptr = optptr[2], srrspace = optptr[1];
+ srrptr <= srrspace;
+ srrptr += 4
+ ) {
+ if (srrptr + 3 > srrspace)
+ break;
+ if (memcmp(&opt->nexthop, &optptr[srrptr-1], 4) == 0)
+ break;
+ }
+ if (srrptr + 3 <= srrspace) {
+ opt->is_changed = 1;
+ ip_hdr(skb)->daddr = opt->nexthop;
+ ip_rt_get_source(&optptr[srrptr-1], skb, rt);
+ optptr[2] = srrptr+4;
+ } else {
+ net_crit_ratelimited("%s(): Argh! Destination lost!\n",
+ __func__);
+ }
+ if (opt->ts_needaddr) {
+ optptr = raw + opt->ts;
+ ip_rt_get_source(&optptr[optptr[2]-9], skb, rt);
+ opt->is_changed = 1;
+ }
+ }
+ if (opt->is_changed) {
+ opt->is_changed = 0;
+ ip_send_check(ip_hdr(skb));
+ }
+}
+
+int ip_options_rcv_srr(struct sk_buff *skb)
+{
+ struct ip_options *opt = &(IPCB(skb)->opt);
+ int srrspace, srrptr;
+ __be32 nexthop;
+ struct iphdr *iph = ip_hdr(skb);
+ unsigned char *optptr = skb_network_header(skb) + opt->srr;
+ struct rtable *rt = skb_rtable(skb);
+ struct rtable *rt2;
+ unsigned long orefdst;
+ int err;
+
+ if (!rt)
+ return 0;
+
+ if (skb->pkt_type != PACKET_HOST)
+ return -EINVAL;
+ if (rt->rt_type == RTN_UNICAST) {
+ if (!opt->is_strictroute)
+ return 0;
+ icmp_send(skb, ICMP_PARAMETERPROB, 0, htonl(16<<24));
+ return -EINVAL;
+ }
+ if (rt->rt_type != RTN_LOCAL)
+ return -EINVAL;
+
+ for (srrptr = optptr[2], srrspace = optptr[1]; srrptr <= srrspace; srrptr += 4) {
+ if (srrptr + 3 > srrspace) {
+ icmp_send(skb, ICMP_PARAMETERPROB, 0, htonl((opt->srr+2)<<24));
+ return -EINVAL;
+ }
+ memcpy(&nexthop, &optptr[srrptr-1], 4);
+
+ orefdst = skb->_skb_refdst;
+ skb_dst_set(skb, NULL);
+ err = ip_route_input(skb, nexthop, iph->saddr, iph->tos, skb->dev);
+ rt2 = skb_rtable(skb);
+ if (err || (rt2->rt_type != RTN_UNICAST && rt2->rt_type != RTN_LOCAL)) {
+ skb_dst_drop(skb);
+ skb->_skb_refdst = orefdst;
+ return -EINVAL;
+ }
+ refdst_drop(orefdst);
+ if (rt2->rt_type != RTN_LOCAL)
+ break;
+ /* Superfast 8) loopback forward */
+ iph->daddr = nexthop;
+ opt->is_changed = 1;
+ }
+ if (srrptr <= srrspace) {
+ opt->srr_is_hit = 1;
+ opt->nexthop = nexthop;
+ opt->is_changed = 1;
+ }
+ return 0;
+}
+EXPORT_SYMBOL(ip_options_rcv_srr);
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
new file mode 100644
index 000000000..c65b93a7b
--- /dev/null
+++ b/net/ipv4/ip_output.c
@@ -0,0 +1,1591 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * The Internet Protocol (IP) output module.
+ *
+ * Authors: Ross Biro
+ * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ * Donald Becker, <becker@super.org>
+ * Alan Cox, <Alan.Cox@linux.org>
+ * Richard Underwood
+ * Stefan Becker, <stefanb@yello.ping.de>
+ * Jorge Cwik, <jorge@laser.satlink.net>
+ * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
+ * Hirokazu Takahashi, <taka@valinux.co.jp>
+ *
+ * See ip_input.c for original log
+ *
+ * Fixes:
+ * Alan Cox : Missing nonblock feature in ip_build_xmit.
+ * Mike Kilburn : htons() missing in ip_build_xmit.
+ * Bradford Johnson: Fix faulty handling of some frames when
+ * no route is found.
+ * Alexander Demenshin: Missing sk/skb free in ip_queue_xmit
+ * (in case if packet not accepted by
+ * output firewall rules)
+ * Mike McLagan : Routing by source
+ * Alexey Kuznetsov: use new route cache
+ * Andi Kleen: Fix broken PMTU recovery and remove
+ * some redundant tests.
+ * Vitaly E. Lavrov : Transparent proxy revived after year coma.
+ * Andi Kleen : Replace ip_reply with ip_send_reply.
+ * Andi Kleen : Split fast and slow ip_build_xmit path
+ * for decreased register pressure on x86
+ * and more readibility.
+ * Marc Boucher : When call_out_firewall returns FW_QUEUE,
+ * silently drop skb instead of failing with -EPERM.
+ * Detlev Wengorz : Copy protocol for fragments.
+ * Hirokazu Takahashi: HW checksumming for outgoing UDP
+ * datagrams.
+ * Hirokazu Takahashi: sendfile() on UDP works now.
+ */
+
+#include <asm/uaccess.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/highmem.h>
+#include <linux/slab.h>
+
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/proc_fs.h>
+#include <linux/stat.h>
+#include <linux/init.h>
+
+#include <net/snmp.h>
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <net/route.h>
+#include <net/xfrm.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/arp.h>
+#include <net/icmp.h>
+#include <net/checksum.h>
+#include <net/inetpeer.h>
+#include <linux/igmp.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter_bridge.h>
+#include <linux/mroute.h>
+#include <linux/netlink.h>
+#include <linux/tcp.h>
+
+int sysctl_ip_default_ttl __read_mostly = IPDEFTTL;
+EXPORT_SYMBOL(sysctl_ip_default_ttl);
+
+/* Generate a checksum for an outgoing IP datagram. */
+void ip_send_check(struct iphdr *iph)
+{
+ iph->check = 0;
+ iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
+}
+EXPORT_SYMBOL(ip_send_check);
+
+int __ip_local_out_sk(struct sock *sk, struct sk_buff *skb)
+{
+ struct iphdr *iph = ip_hdr(skb);
+
+ iph->tot_len = htons(skb->len);
+ ip_send_check(iph);
+ return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, sk, skb, NULL,
+ skb_dst(skb)->dev, dst_output_sk);
+}
+
+int __ip_local_out(struct sk_buff *skb)
+{
+ return __ip_local_out_sk(skb->sk, skb);
+}
+
+int ip_local_out_sk(struct sock *sk, struct sk_buff *skb)
+{
+ int err;
+
+ err = __ip_local_out(skb);
+ if (likely(err == 1))
+ err = dst_output_sk(sk, skb);
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(ip_local_out_sk);
+
+static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
+{
+ int ttl = inet->uc_ttl;
+
+ if (ttl < 0)
+ ttl = ip4_dst_hoplimit(dst);
+ return ttl;
+}
+
+/*
+ * Add an ip header to a skbuff and send it out.
+ *
+ */
+int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
+ __be32 saddr, __be32 daddr, struct ip_options_rcu *opt)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ struct rtable *rt = skb_rtable(skb);
+ struct iphdr *iph;
+
+ /* Build the IP header. */
+ skb_push(skb, sizeof(struct iphdr) + (opt ? opt->opt.optlen : 0));
+ skb_reset_network_header(skb);
+ iph = ip_hdr(skb);
+ iph->version = 4;
+ iph->ihl = 5;
+ iph->tos = inet->tos;
+ if (ip_dont_fragment(sk, &rt->dst))
+ iph->frag_off = htons(IP_DF);
+ else
+ iph->frag_off = 0;
+ iph->ttl = ip_select_ttl(inet, &rt->dst);
+ iph->daddr = (opt && opt->opt.srr ? opt->opt.faddr : daddr);
+ iph->saddr = saddr;
+ iph->protocol = sk->sk_protocol;
+ ip_select_ident(sock_net(sk), skb, sk);
+
+ if (opt && opt->opt.optlen) {
+ iph->ihl += opt->opt.optlen>>2;
+ ip_options_build(skb, &opt->opt, daddr, rt, 0);
+ }
+
+ skb->priority = sk->sk_priority;
+ skb->mark = sk->sk_mark;
+
+ /* Send it out. */
+ return ip_local_out(skb);
+}
+EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);
+
+static inline int ip_finish_output2(struct sock *sk, struct sk_buff *skb)
+{
+ struct dst_entry *dst = skb_dst(skb);
+ struct rtable *rt = (struct rtable *)dst;
+ struct net_device *dev = dst->dev;
+ unsigned int hh_len = LL_RESERVED_SPACE(dev);
+ struct neighbour *neigh;
+ u32 nexthop;
+
+ if (rt->rt_type == RTN_MULTICAST) {
+ IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len);
+ } else if (rt->rt_type == RTN_BROADCAST)
+ IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTBCAST, skb->len);
+
+ /* Be paranoid, rather than too clever. */
+ if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
+ struct sk_buff *skb2;
+
+ skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
+ if (!skb2) {
+ kfree_skb(skb);
+ return -ENOMEM;
+ }
+ if (skb->sk)
+ skb_set_owner_w(skb2, skb->sk);
+ consume_skb(skb);
+ skb = skb2;
+ }
+
+ rcu_read_lock_bh();
+ nexthop = (__force u32) rt_nexthop(rt, ip_hdr(skb)->daddr);
+ neigh = __ipv4_neigh_lookup_noref(dev, nexthop);
+ if (unlikely(!neigh))
+ neigh = __neigh_create(&arp_tbl, &nexthop, dev, false);
+ if (!IS_ERR(neigh)) {
+ int res = dst_neigh_output(dst, neigh, skb);
+
+ rcu_read_unlock_bh();
+ return res;
+ }
+ rcu_read_unlock_bh();
+
+ net_dbg_ratelimited("%s: No header cache and no neighbour!\n",
+ __func__);
+ kfree_skb(skb);
+ return -EINVAL;
+}
+
+static int ip_finish_output_gso(struct sock *sk, struct sk_buff *skb)
+{
+ netdev_features_t features;
+ struct sk_buff *segs;
+ int ret = 0;
+
+ /* common case: locally created skb or seglen is <= mtu */
+ if (((IPCB(skb)->flags & IPSKB_FORWARDED) == 0) ||
+ skb_gso_network_seglen(skb) <= ip_skb_dst_mtu(skb))
+ return ip_finish_output2(sk, skb);
+
+ /* Slowpath - GSO segment length is exceeding the dst MTU.
+ *
+ * This can happen in two cases:
+ * 1) TCP GRO packet, DF bit not set
+ * 2) skb arrived via virtio-net, we thus get TSO/GSO skbs directly
+ * from host network stack.
+ */
+ features = netif_skb_features(skb);
+ segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
+ if (IS_ERR_OR_NULL(segs)) {
+ kfree_skb(skb);
+ return -ENOMEM;
+ }
+
+ consume_skb(skb);
+
+ do {
+ struct sk_buff *nskb = segs->next;
+ int err;
+
+ segs->next = NULL;
+ err = ip_fragment(sk, segs, ip_finish_output2);
+
+ if (err && ret == 0)
+ ret = err;
+ segs = nskb;
+ } while (segs);
+
+ return ret;
+}
+
+static int ip_finish_output(struct sock *sk, struct sk_buff *skb)
+{
+#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
+ /* Policy lookup after SNAT yielded a new policy */
+ if (skb_dst(skb)->xfrm) {
+ IPCB(skb)->flags |= IPSKB_REROUTED;
+ return dst_output_sk(sk, skb);
+ }
+#endif
+ if (skb_is_gso(skb))
+ return ip_finish_output_gso(sk, skb);
+
+ if (skb->len > ip_skb_dst_mtu(skb))
+ return ip_fragment(sk, skb, ip_finish_output2);
+
+ return ip_finish_output2(sk, skb);
+}
+
+int ip_mc_output(struct sock *sk, struct sk_buff *skb)
+{
+ struct rtable *rt = skb_rtable(skb);
+ struct net_device *dev = rt->dst.dev;
+
+ /*
+ * If the indicated interface is up and running, send the packet.
+ */
+ IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);
+
+ skb->dev = dev;
+ skb->protocol = htons(ETH_P_IP);
+
+ /*
+ * Multicasts are looped back for other local users
+ */
+
+ if (rt->rt_flags&RTCF_MULTICAST) {
+ if (sk_mc_loop(sk)
+#ifdef CONFIG_IP_MROUTE
+ /* Small optimization: do not loopback not local frames,
+ which returned after forwarding; they will be dropped
+ by ip_mr_input in any case.
+ Note, that local frames are looped back to be delivered
+ to local recipients.
+
+ This check is duplicated in ip_mr_input at the moment.
+ */
+ &&
+ ((rt->rt_flags & RTCF_LOCAL) ||
+ !(IPCB(skb)->flags & IPSKB_FORWARDED))
+#endif
+ ) {
+ struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
+ if (newskb)
+ NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
+ sk, newskb, NULL, newskb->dev,
+ dev_loopback_xmit);
+ }
+
+ /* Multicasts with ttl 0 must not go beyond the host */
+
+ if (ip_hdr(skb)->ttl == 0) {
+ kfree_skb(skb);
+ return 0;
+ }
+ }
+
+ if (rt->rt_flags&RTCF_BROADCAST) {
+ struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
+ if (newskb)
+ NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, sk, newskb,
+ NULL, newskb->dev, dev_loopback_xmit);
+ }
+
+ return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, sk, skb, NULL,
+ skb->dev, ip_finish_output,
+ !(IPCB(skb)->flags & IPSKB_REROUTED));
+}
+
+int ip_output(struct sock *sk, struct sk_buff *skb)
+{
+ struct net_device *dev = skb_dst(skb)->dev;
+
+ IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);
+
+ skb->dev = dev;
+ skb->protocol = htons(ETH_P_IP);
+
+ return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, sk, skb,
+ NULL, dev,
+ ip_finish_output,
+ !(IPCB(skb)->flags & IPSKB_REROUTED));
+}
+
+/*
+ * copy saddr and daddr, possibly using 64bit load/stores
+ * Equivalent to :
+ * iph->saddr = fl4->saddr;
+ * iph->daddr = fl4->daddr;
+ */
+static void ip_copy_addrs(struct iphdr *iph, const struct flowi4 *fl4)
+{
+ BUILD_BUG_ON(offsetof(typeof(*fl4), daddr) !=
+ offsetof(typeof(*fl4), saddr) + sizeof(fl4->saddr));
+ memcpy(&iph->saddr, &fl4->saddr,
+ sizeof(fl4->saddr) + sizeof(fl4->daddr));
+}
+
+/* Note: skb->sk can be different from sk, in case of tunnels */
+int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ struct ip_options_rcu *inet_opt;
+ struct flowi4 *fl4;
+ struct rtable *rt;
+ struct iphdr *iph;
+ int res;
+
+ /* Skip all of this if the packet is already routed,
+ * f.e. by something like SCTP.
+ */
+ rcu_read_lock();
+ inet_opt = rcu_dereference(inet->inet_opt);
+ fl4 = &fl->u.ip4;
+ rt = skb_rtable(skb);
+ if (rt)
+ goto packet_routed;
+
+ /* Make sure we can route this packet. */
+ rt = (struct rtable *)__sk_dst_check(sk, 0);
+ if (!rt) {
+ __be32 daddr;
+
+ /* Use correct destination address if we have options. */
+ daddr = inet->inet_daddr;
+ if (inet_opt && inet_opt->opt.srr)
+ daddr = inet_opt->opt.faddr;
+
+ /* If this fails, retransmit mechanism of transport layer will
+ * keep trying until route appears or the connection times
+ * itself out.
+ */
+ rt = ip_route_output_ports(sock_net(sk), fl4, sk,
+ daddr, inet->inet_saddr,
+ inet->inet_dport,
+ inet->inet_sport,
+ sk->sk_protocol,
+ RT_CONN_FLAGS(sk),
+ sk->sk_bound_dev_if);
+ if (IS_ERR(rt))
+ goto no_route;
+ sk_setup_caps(sk, &rt->dst);
+ }
+ skb_dst_set_noref(skb, &rt->dst);
+
+packet_routed:
+ if (inet_opt && inet_opt->opt.is_strictroute && rt->rt_uses_gateway)
+ goto no_route;
+
+ /* OK, we know where to send it, allocate and build IP header. */
+ skb_push(skb, sizeof(struct iphdr) + (inet_opt ? inet_opt->opt.optlen : 0));
+ skb_reset_network_header(skb);
+ iph = ip_hdr(skb);
+ *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
+ if (ip_dont_fragment(sk, &rt->dst) && !skb->ignore_df)
+ iph->frag_off = htons(IP_DF);
+ else
+ iph->frag_off = 0;
+ iph->ttl = ip_select_ttl(inet, &rt->dst);
+ iph->protocol = sk->sk_protocol;
+ ip_copy_addrs(iph, fl4);
+
+ /* Transport layer set skb->h.foo itself. */
+
+ if (inet_opt && inet_opt->opt.optlen) {
+ iph->ihl += inet_opt->opt.optlen >> 2;
+ ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0);
+ }
+
+ ip_select_ident_segs(sock_net(sk), skb, sk,
+ skb_shinfo(skb)->gso_segs ?: 1);
+
+ /* TODO : should we use skb->sk here instead of sk ? */
+ skb->priority = sk->sk_priority;
+ skb->mark = sk->sk_mark;
+
+ res = ip_local_out(skb);
+ rcu_read_unlock();
+ return res;
+
+no_route:
+ rcu_read_unlock();
+ IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
+ kfree_skb(skb);
+ return -EHOSTUNREACH;
+}
+EXPORT_SYMBOL(ip_queue_xmit);
+
+static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
+{
+ to->pkt_type = from->pkt_type;
+ to->priority = from->priority;
+ to->protocol = from->protocol;
+ skb_dst_drop(to);
+ skb_dst_copy(to, from);
+ to->dev = from->dev;
+ to->mark = from->mark;
+
+ /* Copy the flags to each fragment. */
+ IPCB(to)->flags = IPCB(from)->flags;
+
+#ifdef CONFIG_NET_SCHED
+ to->tc_index = from->tc_index;
+#endif
+ nf_copy(to, from);
+#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
+ to->ipvs_property = from->ipvs_property;
+#endif
+ skb_copy_secmark(to, from);
+}
+
+/*
+ * This IP datagram is too large to be sent in one piece. Break it up into
+ * smaller pieces (each of size equal to IP header plus
+ * a block of the data of the original IP data part) that will yet fit in a
+ * single device frame, and queue such a frame for sending.
+ */
+
+int ip_fragment(struct sock *sk, struct sk_buff *skb,
+ int (*output)(struct sock *, struct sk_buff *))
+{
+ struct iphdr *iph;
+ int ptr;
+ struct net_device *dev;
+ struct sk_buff *skb2;
+ unsigned int mtu, hlen, left, len, ll_rs;
+ int offset;
+ __be16 not_last_frag;
+ struct rtable *rt = skb_rtable(skb);
+ int err = 0;
+
+ dev = rt->dst.dev;
+
+ /*
+ * Point into the IP datagram header.
+ */
+
+ iph = ip_hdr(skb);
+
+ mtu = ip_skb_dst_mtu(skb);
+ if (unlikely(((iph->frag_off & htons(IP_DF)) && !skb->ignore_df) ||
+ (IPCB(skb)->frag_max_size &&
+ IPCB(skb)->frag_max_size > mtu))) {
+ IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
+ htonl(mtu));
+ kfree_skb(skb);
+ return -EMSGSIZE;
+ }
+
+ /*
+ * Setup starting values.
+ */
+
+ hlen = iph->ihl * 4;
+ mtu = mtu - hlen; /* Size of data space */
+#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
+ if (skb->nf_bridge)
+ mtu -= nf_bridge_mtu_reduction(skb);
+#endif
+ IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;
+
+ /* When frag_list is given, use it. First, check its validity:
+ * some transformers could create wrong frag_list or break existing
+ * one, it is not prohibited. In this case fall back to copying.
+ *
+ * LATER: this step can be merged to real generation of fragments,
+ * we can switch to copy when see the first bad fragment.
+ */
+ if (skb_has_frag_list(skb)) {
+ struct sk_buff *frag, *frag2;
+ int first_len = skb_pagelen(skb);
+
+ if (first_len - hlen > mtu ||
+ ((first_len - hlen) & 7) ||
+ ip_is_fragment(iph) ||
+ skb_cloned(skb))
+ goto slow_path;
+
+ skb_walk_frags(skb, frag) {
+ /* Correct geometry. */
+ if (frag->len > mtu ||
+ ((frag->len & 7) && frag->next) ||
+ skb_headroom(frag) < hlen)
+ goto slow_path_clean;
+
+ /* Partially cloned skb? */
+ if (skb_shared(frag))
+ goto slow_path_clean;
+
+ BUG_ON(frag->sk);
+ if (skb->sk) {
+ frag->sk = skb->sk;
+ frag->destructor = sock_wfree;
+ }
+ skb->truesize -= frag->truesize;
+ }
+
+ /* Everything is OK. Generate! */
+
+ err = 0;
+ offset = 0;
+ frag = skb_shinfo(skb)->frag_list;
+ skb_frag_list_init(skb);
+ skb->data_len = first_len - skb_headlen(skb);
+ skb->len = first_len;
+ iph->tot_len = htons(first_len);
+ iph->frag_off = htons(IP_MF);
+ ip_send_check(iph);
+
+ for (;;) {
+ /* Prepare header of the next frame,
+ * before previous one went down. */
+ if (frag) {
+ frag->ip_summed = CHECKSUM_NONE;
+ skb_reset_transport_header(frag);
+ __skb_push(frag, hlen);
+ skb_reset_network_header(frag);
+ memcpy(skb_network_header(frag), iph, hlen);
+ iph = ip_hdr(frag);
+ iph->tot_len = htons(frag->len);
+ ip_copy_metadata(frag, skb);
+ if (offset == 0)
+ ip_options_fragment(frag);
+ offset += skb->len - hlen;
+ iph->frag_off = htons(offset>>3);
+ if (frag->next)
+ iph->frag_off |= htons(IP_MF);
+ /* Ready, complete checksum */
+ ip_send_check(iph);
+ }
+
+ err = output(sk, skb);
+
+ if (!err)
+ IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
+ if (err || !frag)
+ break;
+
+ skb = frag;
+ frag = skb->next;
+ skb->next = NULL;
+ }
+
+ if (err == 0) {
+ IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
+ return 0;
+ }
+
+ while (frag) {
+ skb = frag->next;
+ kfree_skb(frag);
+ frag = skb;
+ }
+ IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
+ return err;
+
+slow_path_clean:
+ skb_walk_frags(skb, frag2) {
+ if (frag2 == frag)
+ break;
+ frag2->sk = NULL;
+ frag2->destructor = NULL;
+ skb->truesize += frag2->truesize;
+ }
+ }
+
+slow_path:
+ /* for offloaded checksums cleanup checksum before fragmentation */
+ if ((skb->ip_summed == CHECKSUM_PARTIAL) && skb_checksum_help(skb))
+ goto fail;
+ iph = ip_hdr(skb);
+
+ left = skb->len - hlen; /* Space per frame */
+ ptr = hlen; /* Where to start from */
+
+ ll_rs = LL_RESERVED_SPACE(rt->dst.dev);
+
+ /*
+ * Fragment the datagram.
+ */
+
+ offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
+ not_last_frag = iph->frag_off & htons(IP_MF);
+
+ /*
+ * Keep copying data until we run out.
+ */
+
+ while (left > 0) {
+ len = left;
+ /* IF: it doesn't fit, use 'mtu' - the data space left */
+ if (len > mtu)
+ len = mtu;
+ /* IF: we are not sending up to and including the packet end
+ then align the next start on an eight byte boundary */
+ if (len < left) {
+ len &= ~7;
+ }
+
+ /* Allocate buffer */
+ skb2 = alloc_skb(len + hlen + ll_rs, GFP_ATOMIC);
+ if (!skb2) {
+ err = -ENOMEM;
+ goto fail;
+ }
+
+ /*
+ * Set up data on packet
+ */
+
+ ip_copy_metadata(skb2, skb);
+ skb_reserve(skb2, ll_rs);
+ skb_put(skb2, len + hlen);
+ skb_reset_network_header(skb2);
+ skb2->transport_header = skb2->network_header + hlen;
+
+ /*
+ * Charge the memory for the fragment to any owner
+ * it might possess
+ */
+
+ if (skb->sk)
+ skb_set_owner_w(skb2, skb->sk);
+
+ /*
+ * Copy the packet header into the new buffer.
+ */
+
+ skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen);
+
+ /*
+ * Copy a block of the IP datagram.
+ */
+ if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len))
+ BUG();
+ left -= len;
+
+ /*
+ * Fill in the new header fields.
+ */
+ iph = ip_hdr(skb2);
+ iph->frag_off = htons((offset >> 3));
+
+ /* ANK: dirty, but effective trick. Upgrade options only if
+ * the segment to be fragmented was THE FIRST (otherwise,
+ * options are already fixed) and make it ONCE
+ * on the initial skb, so that all the following fragments
+ * will inherit fixed options.
+ */
+ if (offset == 0)
+ ip_options_fragment(skb);
+
+ /*
+ * Added AC : If we are fragmenting a fragment that's not the
+ * last fragment then keep MF on each bit
+ */
+ if (left > 0 || not_last_frag)
+ iph->frag_off |= htons(IP_MF);
+ ptr += len;
+ offset += len;
+
+ /*
+ * Put this fragment into the sending queue.
+ */
+ iph->tot_len = htons(len + hlen);
+
+ ip_send_check(iph);
+
+ err = output(sk, skb2);
+ if (err)
+ goto fail;
+
+ IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
+ }
+ consume_skb(skb);
+ IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
+ return err;
+
+fail:
+ kfree_skb(skb);
+ IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
+ return err;
+}
+EXPORT_SYMBOL(ip_fragment);
+
+int
+ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
+{
+ struct msghdr *msg = from;
+
+ if (skb->ip_summed == CHECKSUM_PARTIAL) {
+ if (copy_from_iter(to, len, &msg->msg_iter) != len)
+ return -EFAULT;
+ } else {
+ __wsum csum = 0;
+ if (csum_and_copy_from_iter(to, len, &csum, &msg->msg_iter) != len)
+ return -EFAULT;
+ skb->csum = csum_block_add(skb->csum, csum, odd);
+ }
+ return 0;
+}
+EXPORT_SYMBOL(ip_generic_getfrag);
+
+static inline __wsum
+csum_page(struct page *page, int offset, int copy)
+{
+ char *kaddr;
+ __wsum csum;
+ kaddr = kmap(page);
+ csum = csum_partial(kaddr + offset, copy, 0);
+ kunmap(page);
+ return csum;
+}
+
+static inline int ip_ufo_append_data(struct sock *sk,
+ struct sk_buff_head *queue,
+ int getfrag(void *from, char *to, int offset, int len,
+ int odd, struct sk_buff *skb),
+ void *from, int length, int hh_len, int fragheaderlen,
+ int transhdrlen, int maxfraglen, unsigned int flags)
+{
+ struct sk_buff *skb;
+ int err;
+
+ /* There is support for UDP fragmentation offload by network
+ * device, so create one single skb packet containing complete
+ * udp datagram
+ */
+ skb = skb_peek_tail(queue);
+ if (!skb) {
+ skb = sock_alloc_send_skb(sk,
+ hh_len + fragheaderlen + transhdrlen + 20,
+ (flags & MSG_DONTWAIT), &err);
+
+ if (!skb)
+ return err;
+
+ /* reserve space for Hardware header */
+ skb_reserve(skb, hh_len);
+
+ /* create space for UDP/IP header */
+ skb_put(skb, fragheaderlen + transhdrlen);
+
+ /* initialize network header pointer */
+ skb_reset_network_header(skb);
+
+ /* initialize protocol header pointer */
+ skb->transport_header = skb->network_header + fragheaderlen;
+
+ skb->csum = 0;
+
+ __skb_queue_tail(queue, skb);
+ } else if (skb_is_gso(skb)) {
+ goto append;
+ }
+
+ skb->ip_summed = CHECKSUM_PARTIAL;
+ /* specify the length of each IP datagram fragment */
+ skb_shinfo(skb)->gso_size = maxfraglen - fragheaderlen;
+ skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
+
+append:
+ return skb_append_datato_frags(sk, skb, getfrag, from,
+ (length - transhdrlen));
+}
+
+static int __ip_append_data(struct sock *sk,
+ struct flowi4 *fl4,
+ struct sk_buff_head *queue,
+ struct inet_cork *cork,
+ struct page_frag *pfrag,
+ int getfrag(void *from, char *to, int offset,
+ int len, int odd, struct sk_buff *skb),
+ void *from, int length, int transhdrlen,
+ unsigned int flags)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ struct sk_buff *skb;
+
+ struct ip_options *opt = cork->opt;
+ int hh_len;
+ int exthdrlen;
+ int mtu;
+ int copy;
+ int err;
+ int offset = 0;
+ unsigned int maxfraglen, fragheaderlen, maxnonfragsize;
+ int csummode = CHECKSUM_NONE;
+ struct rtable *rt = (struct rtable *)cork->dst;
+ u32 tskey = 0;
+
+ skb = skb_peek_tail(queue);
+
+ exthdrlen = !skb ? rt->dst.header_len : 0;
+ mtu = cork->fragsize;
+ if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP &&
+ sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
+ tskey = sk->sk_tskey++;
+
+ hh_len = LL_RESERVED_SPACE(rt->dst.dev);
+
+ fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
+ maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
+ maxnonfragsize = ip_sk_ignore_df(sk) ? 0xFFFF : mtu;
+
+ if (cork->length + length > maxnonfragsize - fragheaderlen) {
+ ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport,
+ mtu - (opt ? opt->optlen : 0));
+ return -EMSGSIZE;
+ }
+
+ /*
+ * transhdrlen > 0 means that this is the first fragment and we wish
+ * it won't be fragmented in the future.
+ */
+ if (transhdrlen &&
+ length + fragheaderlen <= mtu &&
+ rt->dst.dev->features & NETIF_F_V4_CSUM &&
+ !exthdrlen)
+ csummode = CHECKSUM_PARTIAL;
+
+ cork->length += length;
+ if (((length > mtu) || (skb && skb_is_gso(skb))) &&
+ (sk->sk_protocol == IPPROTO_UDP) &&
+ (rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len &&
+ (sk->sk_type == SOCK_DGRAM)) {
+ err = ip_ufo_append_data(sk, queue, getfrag, from, length,
+ hh_len, fragheaderlen, transhdrlen,
+ maxfraglen, flags);
+ if (err)
+ goto error;
+ return 0;
+ }
+
+ /* So, what's going on in the loop below?
+ *
+ * We use calculated fragment length to generate chained skb,
+ * each of segments is IP fragment ready for sending to network after
+ * adding appropriate IP header.
+ */
+
+ if (!skb)
+ goto alloc_new_skb;
+
+ while (length > 0) {
+ /* Check if the remaining data fits into current packet. */
+ copy = mtu - skb->len;
+ if (copy < length)
+ copy = maxfraglen - skb->len;
+ if (copy <= 0) {
+ char *data;
+ unsigned int datalen;
+ unsigned int fraglen;
+ unsigned int fraggap;
+ unsigned int alloclen;
+ struct sk_buff *skb_prev;
+alloc_new_skb:
+ skb_prev = skb;
+ if (skb_prev)
+ fraggap = skb_prev->len - maxfraglen;
+ else
+ fraggap = 0;
+
+ /*
+ * If remaining data exceeds the mtu,
+ * we know we need more fragment(s).
+ */
+ datalen = length + fraggap;
+ if (datalen > mtu - fragheaderlen)
+ datalen = maxfraglen - fragheaderlen;
+ fraglen = datalen + fragheaderlen;
+
+ if ((flags & MSG_MORE) &&
+ !(rt->dst.dev->features&NETIF_F_SG))
+ alloclen = mtu;
+ else
+ alloclen = fraglen;
+
+ alloclen += exthdrlen;
+
+ /* The last fragment gets additional space at tail.
+ * Note, with MSG_MORE we overallocate on fragments,
+ * because we have no idea what fragment will be
+ * the last.
+ */
+ if (datalen == length + fraggap)
+ alloclen += rt->dst.trailer_len;
+
+ if (transhdrlen) {
+ skb = sock_alloc_send_skb(sk,
+ alloclen + hh_len + 15,
+ (flags & MSG_DONTWAIT), &err);
+ } else {
+ skb = NULL;
+ if (atomic_read(&sk->sk_wmem_alloc) <=
+ 2 * sk->sk_sndbuf)
+ skb = sock_wmalloc(sk,
+ alloclen + hh_len + 15, 1,
+ sk->sk_allocation);
+ if (unlikely(!skb))
+ err = -ENOBUFS;
+ }
+ if (!skb)
+ goto error;
+
+ /*
+ * Fill in the control structures
+ */
+ skb->ip_summed = csummode;
+ skb->csum = 0;
+ skb_reserve(skb, hh_len);
+
+ /* only the initial fragment is time stamped */
+ skb_shinfo(skb)->tx_flags = cork->tx_flags;
+ cork->tx_flags = 0;
+ skb_shinfo(skb)->tskey = tskey;
+ tskey = 0;
+
+ /*
+ * Find where to start putting bytes.
+ */
+ data = skb_put(skb, fraglen + exthdrlen);
+ skb_set_network_header(skb, exthdrlen);
+ skb->transport_header = (skb->network_header +
+ fragheaderlen);
+ data += fragheaderlen + exthdrlen;
+
+ if (fraggap) {
+ skb->csum = skb_copy_and_csum_bits(
+ skb_prev, maxfraglen,
+ data + transhdrlen, fraggap, 0);
+ skb_prev->csum = csum_sub(skb_prev->csum,
+ skb->csum);
+ data += fraggap;
+ pskb_trim_unique(skb_prev, maxfraglen);
+ }
+
+ copy = datalen - transhdrlen - fraggap;
+ if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
+ err = -EFAULT;
+ kfree_skb(skb);
+ goto error;
+ }
+
+ offset += copy;
+ length -= datalen - fraggap;
+ transhdrlen = 0;
+ exthdrlen = 0;
+ csummode = CHECKSUM_NONE;
+
+ /*
+ * Put the packet on the pending queue.
+ */
+ __skb_queue_tail(queue, skb);
+ continue;
+ }
+
+ if (copy > length)
+ copy = length;
+
+ if (!(rt->dst.dev->features&NETIF_F_SG)) {
+ unsigned int off;
+
+ off = skb->len;
+ if (getfrag(from, skb_put(skb, copy),
+ offset, copy, off, skb) < 0) {
+ __skb_trim(skb, off);
+ err = -EFAULT;
+ goto error;
+ }
+ } else {
+ int i = skb_shinfo(skb)->nr_frags;
+
+ err = -ENOMEM;
+ if (!sk_page_frag_refill(sk, pfrag))
+ goto error;
+
+ if (!skb_can_coalesce(skb, i, pfrag->page,
+ pfrag->offset)) {
+ err = -EMSGSIZE;
+ if (i == MAX_SKB_FRAGS)
+ goto error;
+
+ __skb_fill_page_desc(skb, i, pfrag->page,
+ pfrag->offset, 0);
+ skb_shinfo(skb)->nr_frags = ++i;
+ get_page(pfrag->page);
+ }
+ copy = min_t(int, copy, pfrag->size - pfrag->offset);
+ if (getfrag(from,
+ page_address(pfrag->page) + pfrag->offset,
+ offset, copy, skb->len, skb) < 0)
+ goto error_efault;
+
+ pfrag->offset += copy;
+ skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
+ skb->len += copy;
+ skb->data_len += copy;
+ skb->truesize += copy;
+ atomic_add(copy, &sk->sk_wmem_alloc);
+ }
+ offset += copy;
+ length -= copy;
+ }
+
+ return 0;
+
+error_efault:
+ err = -EFAULT;
+error:
+ cork->length -= length;
+ IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
+ return err;
+}
+
+static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
+ struct ipcm_cookie *ipc, struct rtable **rtp)
+{
+ struct ip_options_rcu *opt;
+ struct rtable *rt;
+
+ /*
+ * setup for corking.
+ */
+ opt = ipc->opt;
+ if (opt) {
+ if (!cork->opt) {
+ cork->opt = kmalloc(sizeof(struct ip_options) + 40,
+ sk->sk_allocation);
+ if (unlikely(!cork->opt))
+ return -ENOBUFS;
+ }
+ memcpy(cork->opt, &opt->opt, sizeof(struct ip_options) + opt->opt.optlen);
+ cork->flags |= IPCORK_OPT;
+ cork->addr = ipc->addr;
+ }
+ rt = *rtp;
+ if (unlikely(!rt))
+ return -EFAULT;
+ /*
+ * We steal reference to this route, caller should not release it
+ */
+ *rtp = NULL;
+ cork->fragsize = ip_sk_use_pmtu(sk) ?
+ dst_mtu(&rt->dst) : rt->dst.dev->mtu;
+ cork->dst = &rt->dst;
+ cork->length = 0;
+ cork->ttl = ipc->ttl;
+ cork->tos = ipc->tos;
+ cork->priority = ipc->priority;
+ cork->tx_flags = ipc->tx_flags;
+
+ return 0;
+}
+
+/*
+ * ip_append_data() and ip_append_page() can make one large IP datagram
+ * from many pieces of data. Each pieces will be holded on the socket
+ * until ip_push_pending_frames() is called. Each piece can be a page
+ * or non-page data.
+ *
+ * Not only UDP, other transport protocols - e.g. raw sockets - can use
+ * this interface potentially.
+ *
+ * LATER: length must be adjusted by pad at tail, when it is required.
+ */
+int ip_append_data(struct sock *sk, struct flowi4 *fl4,
+ int getfrag(void *from, char *to, int offset, int len,
+ int odd, struct sk_buff *skb),
+ void *from, int length, int transhdrlen,
+ struct ipcm_cookie *ipc, struct rtable **rtp,
+ unsigned int flags)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ int err;
+
+ if (flags&MSG_PROBE)
+ return 0;
+
+ if (skb_queue_empty(&sk->sk_write_queue)) {
+ err = ip_setup_cork(sk, &inet->cork.base, ipc, rtp);
+ if (err)
+ return err;
+ } else {
+ transhdrlen = 0;
+ }
+
+ return __ip_append_data(sk, fl4, &sk->sk_write_queue, &inet->cork.base,
+ sk_page_frag(sk), getfrag,
+ from, length, transhdrlen, flags);
+}
+
+ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,
+ int offset, size_t size, int flags)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ struct sk_buff *skb;
+ struct rtable *rt;
+ struct ip_options *opt = NULL;
+ struct inet_cork *cork;
+ int hh_len;
+ int mtu;
+ int len;
+ int err;
+ unsigned int maxfraglen, fragheaderlen, fraggap, maxnonfragsize;
+
+ if (inet->hdrincl)
+ return -EPERM;
+
+ if (flags&MSG_PROBE)
+ return 0;
+
+ if (skb_queue_empty(&sk->sk_write_queue))
+ return -EINVAL;
+
+ cork = &inet->cork.base;
+ rt = (struct rtable *)cork->dst;
+ if (cork->flags & IPCORK_OPT)
+ opt = cork->opt;
+
+ if (!(rt->dst.dev->features&NETIF_F_SG))
+ return -EOPNOTSUPP;
+
+ hh_len = LL_RESERVED_SPACE(rt->dst.dev);
+ mtu = cork->fragsize;
+
+ fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
+ maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
+ maxnonfragsize = ip_sk_ignore_df(sk) ? 0xFFFF : mtu;
+
+ if (cork->length + size > maxnonfragsize - fragheaderlen) {
+ ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport,
+ mtu - (opt ? opt->optlen : 0));
+ return -EMSGSIZE;
+ }
+
+ skb = skb_peek_tail(&sk->sk_write_queue);
+ if (!skb)
+ return -EINVAL;
+
+ cork->length += size;
+ if ((size + skb->len > mtu) &&
+ (sk->sk_protocol == IPPROTO_UDP) &&
+ (rt->dst.dev->features & NETIF_F_UFO)) {
+ skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
+ skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
+ }
+
+ while (size > 0) {
+ int i;
+
+ if (skb_is_gso(skb))
+ len = size;
+ else {
+
+ /* Check if the remaining data fits into current packet. */
+ len = mtu - skb->len;
+ if (len < size)
+ len = maxfraglen - skb->len;
+ }
+ if (len <= 0) {
+ struct sk_buff *skb_prev;
+ int alloclen;
+
+ skb_prev = skb;
+ fraggap = skb_prev->len - maxfraglen;
+
+ alloclen = fragheaderlen + hh_len + fraggap + 15;
+ skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
+ if (unlikely(!skb)) {
+ err = -ENOBUFS;
+ goto error;
+ }
+
+ /*
+ * Fill in the control structures
+ */
+ skb->ip_summed = CHECKSUM_NONE;
+ skb->csum = 0;
+ skb_reserve(skb, hh_len);
+
+ /*
+ * Find where to start putting bytes.
+ */
+ skb_put(skb, fragheaderlen + fraggap);
+ skb_reset_network_header(skb);
+ skb->transport_header = (skb->network_header +
+ fragheaderlen);
+ if (fraggap) {
+ skb->csum = skb_copy_and_csum_bits(skb_prev,
+ maxfraglen,
+ skb_transport_header(skb),
+ fraggap, 0);
+ skb_prev->csum = csum_sub(skb_prev->csum,
+ skb->csum);
+ pskb_trim_unique(skb_prev, maxfraglen);
+ }
+
+ /*
+ * Put the packet on the pending queue.
+ */
+ __skb_queue_tail(&sk->sk_write_queue, skb);
+ continue;
+ }
+
+ i = skb_shinfo(skb)->nr_frags;
+ if (len > size)
+ len = size;
+ if (skb_can_coalesce(skb, i, page, offset)) {
+ skb_frag_size_add(&skb_shinfo(skb)->frags[i-1], len);
+ } else if (i < MAX_SKB_FRAGS) {
+ get_page(page);
+ skb_fill_page_desc(skb, i, page, offset, len);
+ } else {
+ err = -EMSGSIZE;
+ goto error;
+ }
+
+ if (skb->ip_summed == CHECKSUM_NONE) {
+ __wsum csum;
+ csum = csum_page(page, offset, len);
+ skb->csum = csum_block_add(skb->csum, csum, skb->len);
+ }
+
+ skb->len += len;
+ skb->data_len += len;
+ skb->truesize += len;
+ atomic_add(len, &sk->sk_wmem_alloc);
+ offset += len;
+ size -= len;
+ }
+ return 0;
+
+error:
+ cork->length -= size;
+ IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
+ return err;
+}
+
+static void ip_cork_release(struct inet_cork *cork)
+{
+ cork->flags &= ~IPCORK_OPT;
+ kfree(cork->opt);
+ cork->opt = NULL;
+ dst_release(cork->dst);
+ cork->dst = NULL;
+}
+
+/*
+ * Combined all pending IP fragments on the socket as one IP datagram
+ * and push them out.
+ */
+struct sk_buff *__ip_make_skb(struct sock *sk,
+ struct flowi4 *fl4,
+ struct sk_buff_head *queue,
+ struct inet_cork *cork)
+{
+ struct sk_buff *skb, *tmp_skb;
+ struct sk_buff **tail_skb;
+ struct inet_sock *inet = inet_sk(sk);
+ struct net *net = sock_net(sk);
+ struct ip_options *opt = NULL;
+ struct rtable *rt = (struct rtable *)cork->dst;
+ struct iphdr *iph;
+ __be16 df = 0;
+ __u8 ttl;
+
+ skb = __skb_dequeue(queue);
+ if (!skb)
+ goto out;
+ tail_skb = &(skb_shinfo(skb)->frag_list);
+
+ /* move skb->data to ip header from ext header */
+ if (skb->data < skb_network_header(skb))
+ __skb_pull(skb, skb_network_offset(skb));
+ while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
+ __skb_pull(tmp_skb, skb_network_header_len(skb));
+ *tail_skb = tmp_skb;
+ tail_skb = &(tmp_skb->next);
+ skb->len += tmp_skb->len;
+ skb->data_len += tmp_skb->len;
+ skb->truesize += tmp_skb->truesize;
+ tmp_skb->destructor = NULL;
+ tmp_skb->sk = NULL;
+ }
+
+ /* Unless user demanded real pmtu discovery (IP_PMTUDISC_DO), we allow
+ * to fragment the frame generated here. No matter, what transforms
+ * how transforms change size of the packet, it will come out.
+ */
+ skb->ignore_df = ip_sk_ignore_df(sk);
+
+ /* DF bit is set when we want to see DF on outgoing frames.
+ * If ignore_df is set too, we still allow to fragment this frame
+ * locally. */
+ if (inet->pmtudisc == IP_PMTUDISC_DO ||
+ inet->pmtudisc == IP_PMTUDISC_PROBE ||
+ (skb->len <= dst_mtu(&rt->dst) &&
+ ip_dont_fragment(sk, &rt->dst)))
+ df = htons(IP_DF);
+
+ if (cork->flags & IPCORK_OPT)
+ opt = cork->opt;
+
+ if (cork->ttl != 0)
+ ttl = cork->ttl;
+ else if (rt->rt_type == RTN_MULTICAST)
+ ttl = inet->mc_ttl;
+ else
+ ttl = ip_select_ttl(inet, &rt->dst);
+
+ iph = ip_hdr(skb);
+ iph->version = 4;
+ iph->ihl = 5;
+ iph->tos = (cork->tos != -1) ? cork->tos : inet->tos;
+ iph->frag_off = df;
+ iph->ttl = ttl;
+ iph->protocol = sk->sk_protocol;
+ ip_copy_addrs(iph, fl4);
+ ip_select_ident(net, skb, sk);
+
+ if (opt) {
+ iph->ihl += opt->optlen>>2;
+ ip_options_build(skb, opt, cork->addr, rt, 0);
+ }
+
+ skb->priority = (cork->tos != -1) ? cork->priority: sk->sk_priority;
+ skb->mark = sk->sk_mark;
+ /*
+ * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec
+ * on dst refcount
+ */
+ cork->dst = NULL;
+ skb_dst_set(skb, &rt->dst);
+
+ if (iph->protocol == IPPROTO_ICMP)
+ icmp_out_count(net, ((struct icmphdr *)
+ skb_transport_header(skb))->type);
+
+ ip_cork_release(cork);
+out:
+ return skb;
+}
+
+int ip_send_skb(struct net *net, struct sk_buff *skb)
+{
+ int err;
+
+ err = ip_local_out(skb);
+ if (err) {
+ if (err > 0)
+ err = net_xmit_errno(err);
+ if (err)
+ IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS);
+ }
+
+ return err;
+}
+
+int ip_push_pending_frames(struct sock *sk, struct flowi4 *fl4)
+{
+ struct sk_buff *skb;
+
+ skb = ip_finish_skb(sk, fl4);
+ if (!skb)
+ return 0;
+
+ /* Netfilter gets whole the not fragmented skb. */
+ return ip_send_skb(sock_net(sk), skb);
+}
+
+/*
+ * Throw away all pending data on the socket.
+ */
+static void __ip_flush_pending_frames(struct sock *sk,
+ struct sk_buff_head *queue,
+ struct inet_cork *cork)
+{
+ struct sk_buff *skb;
+
+ while ((skb = __skb_dequeue_tail(queue)) != NULL)
+ kfree_skb(skb);
+
+ ip_cork_release(cork);
+}
+
+void ip_flush_pending_frames(struct sock *sk)
+{
+ __ip_flush_pending_frames(sk, &sk->sk_write_queue, &inet_sk(sk)->cork.base);
+}
+
+struct sk_buff *ip_make_skb(struct sock *sk,
+ struct flowi4 *fl4,
+ int getfrag(void *from, char *to, int offset,
+ int len, int odd, struct sk_buff *skb),
+ void *from, int length, int transhdrlen,
+ struct ipcm_cookie *ipc, struct rtable **rtp,
+ unsigned int flags)
+{
+ struct inet_cork cork;
+ struct sk_buff_head queue;
+ int err;
+
+ if (flags & MSG_PROBE)
+ return NULL;
+
+ __skb_queue_head_init(&queue);
+
+ cork.flags = 0;
+ cork.addr = 0;
+ cork.opt = NULL;
+ err = ip_setup_cork(sk, &cork, ipc, rtp);
+ if (err)
+ return ERR_PTR(err);
+
+ err = __ip_append_data(sk, fl4, &queue, &cork,
+ &current->task_frag, getfrag,
+ from, length, transhdrlen, flags);
+ if (err) {
+ __ip_flush_pending_frames(sk, &queue, &cork);
+ return ERR_PTR(err);
+ }
+
+ return __ip_make_skb(sk, fl4, &queue, &cork);
+}
+
+/*
+ * Fetch data from kernel space and fill in checksum if needed.
+ */
+static int ip_reply_glue_bits(void *dptr, char *to, int offset,
+ int len, int odd, struct sk_buff *skb)
+{
+ __wsum csum;
+
+ csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0);
+ skb->csum = csum_block_add(skb->csum, csum, odd);
+ return 0;
+}
+
+/*
+ * Generic function to send a packet as reply to another packet.
+ * Used to send some TCP resets/acks so far.
+ */
+void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
+ const struct ip_options *sopt,
+ __be32 daddr, __be32 saddr,
+ const struct ip_reply_arg *arg,
+ unsigned int len)
+{
+ struct ip_options_data replyopts;
+ struct ipcm_cookie ipc;
+ struct flowi4 fl4;
+ struct rtable *rt = skb_rtable(skb);
+ struct net *net = sock_net(sk);
+ struct sk_buff *nskb;
+ int err;
+
+ if (__ip_options_echo(&replyopts.opt.opt, skb, sopt))
+ return;
+
+ ipc.addr = daddr;
+ ipc.opt = NULL;
+ ipc.tx_flags = 0;
+ ipc.ttl = 0;
+ ipc.tos = -1;
+
+ if (replyopts.opt.opt.optlen) {
+ ipc.opt = &replyopts.opt;
+
+ if (replyopts.opt.opt.srr)
+ daddr = replyopts.opt.opt.faddr;
+ }
+
+ flowi4_init_output(&fl4, arg->bound_dev_if,
+ IP4_REPLY_MARK(net, skb->mark),
+ RT_TOS(arg->tos),
+ RT_SCOPE_UNIVERSE, ip_hdr(skb)->protocol,
+ ip_reply_arg_flowi_flags(arg),
+ daddr, saddr,
+ tcp_hdr(skb)->source, tcp_hdr(skb)->dest);
+ security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
+ rt = ip_route_output_key(net, &fl4);
+ if (IS_ERR(rt))
+ return;
+
+ inet_sk(sk)->tos = arg->tos;
+
+ sk->sk_priority = skb->priority;
+ sk->sk_protocol = ip_hdr(skb)->protocol;
+ sk->sk_bound_dev_if = arg->bound_dev_if;
+ sk->sk_sndbuf = sysctl_wmem_default;
+ err = ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base,
+ len, 0, &ipc, &rt, MSG_DONTWAIT);
+ if (unlikely(err)) {
+ ip_flush_pending_frames(sk);
+ goto out;
+ }
+
+ nskb = skb_peek(&sk->sk_write_queue);
+ if (nskb) {
+ if (arg->csumoffset >= 0)
+ *((__sum16 *)skb_transport_header(nskb) +
+ arg->csumoffset) = csum_fold(csum_add(nskb->csum,
+ arg->csum));
+ nskb->ip_summed = CHECKSUM_NONE;
+ skb_set_queue_mapping(nskb, skb_get_queue_mapping(skb));
+ ip_push_pending_frames(sk, &fl4);
+ }
+out:
+ ip_rt_put(rt);
+}
+
+void __init ip_init(void)
+{
+ ip_rt_init();
+ inet_initpeers();
+
+#if defined(CONFIG_IP_MULTICAST)
+ igmp_mc_init();
+#endif
+}
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
new file mode 100644
index 000000000..6ddde8999
--- /dev/null
+++ b/net/ipv4/ip_sockglue.c
@@ -0,0 +1,1544 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * The IP to API glue.
+ *
+ * Authors: see ip.c
+ *
+ * Fixes:
+ * Many : Split from ip.c , see ip.c for history.
+ * Martin Mares : TOS setting fixed.
+ * Alan Cox : Fixed a couple of oopses in Martin's
+ * TOS tweaks.
+ * Mike McLagan : Routing by source
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <linux/icmp.h>
+#include <linux/inetdevice.h>
+#include <linux/netdevice.h>
+#include <linux/slab.h>
+#include <net/sock.h>
+#include <net/ip.h>
+#include <net/icmp.h>
+#include <net/tcp_states.h>
+#include <linux/udp.h>
+#include <linux/igmp.h>
+#include <linux/netfilter.h>
+#include <linux/route.h>
+#include <linux/mroute.h>
+#include <net/inet_ecn.h>
+#include <net/route.h>
+#include <net/xfrm.h>
+#include <net/compat.h>
+#include <net/checksum.h>
+#if IS_ENABLED(CONFIG_IPV6)
+#include <net/transp_v6.h>
+#endif
+#include <net/ip_fib.h>
+
+#include <linux/errqueue.h>
+#include <asm/uaccess.h>
+
+/*
+ * SOL_IP control messages.
+ */
+
+static void ip_cmsg_recv_pktinfo(struct msghdr *msg, struct sk_buff *skb)
+{
+ struct in_pktinfo info = *PKTINFO_SKB_CB(skb);
+
+ info.ipi_addr.s_addr = ip_hdr(skb)->daddr;
+
+ put_cmsg(msg, SOL_IP, IP_PKTINFO, sizeof(info), &info);
+}
+
+static void ip_cmsg_recv_ttl(struct msghdr *msg, struct sk_buff *skb)
+{
+ int ttl = ip_hdr(skb)->ttl;
+ put_cmsg(msg, SOL_IP, IP_TTL, sizeof(int), &ttl);
+}
+
+static void ip_cmsg_recv_tos(struct msghdr *msg, struct sk_buff *skb)
+{
+ put_cmsg(msg, SOL_IP, IP_TOS, 1, &ip_hdr(skb)->tos);
+}
+
+static void ip_cmsg_recv_opts(struct msghdr *msg, struct sk_buff *skb)
+{
+ if (IPCB(skb)->opt.optlen == 0)
+ return;
+
+ put_cmsg(msg, SOL_IP, IP_RECVOPTS, IPCB(skb)->opt.optlen,
+ ip_hdr(skb) + 1);
+}
+
+
+static void ip_cmsg_recv_retopts(struct msghdr *msg, struct sk_buff *skb)
+{
+ unsigned char optbuf[sizeof(struct ip_options) + 40];
+ struct ip_options *opt = (struct ip_options *)optbuf;
+
+ if (IPCB(skb)->opt.optlen == 0)
+ return;
+
+ if (ip_options_echo(opt, skb)) {
+ msg->msg_flags |= MSG_CTRUNC;
+ return;
+ }
+ ip_options_undo(opt);
+
+ put_cmsg(msg, SOL_IP, IP_RETOPTS, opt->optlen, opt->__data);
+}
+
+static void ip_cmsg_recv_checksum(struct msghdr *msg, struct sk_buff *skb,
+ int offset)
+{
+ __wsum csum = skb->csum;
+
+ if (skb->ip_summed != CHECKSUM_COMPLETE)
+ return;
+
+ if (offset != 0)
+ csum = csum_sub(csum, csum_partial(skb->data, offset, 0));
+
+ put_cmsg(msg, SOL_IP, IP_CHECKSUM, sizeof(__wsum), &csum);
+}
+
+static void ip_cmsg_recv_security(struct msghdr *msg, struct sk_buff *skb)
+{
+ char *secdata;
+ u32 seclen, secid;
+ int err;
+
+ err = security_socket_getpeersec_dgram(NULL, skb, &secid);
+ if (err)
+ return;
+
+ err = security_secid_to_secctx(secid, &secdata, &seclen);
+ if (err)
+ return;
+
+ put_cmsg(msg, SOL_IP, SCM_SECURITY, seclen, secdata);
+ security_release_secctx(secdata, seclen);
+}
+
+static void ip_cmsg_recv_dstaddr(struct msghdr *msg, struct sk_buff *skb)
+{
+ struct sockaddr_in sin;
+ const struct iphdr *iph = ip_hdr(skb);
+ __be16 *ports = (__be16 *)skb_transport_header(skb);
+
+ if (skb_transport_offset(skb) + 4 > skb->len)
+ return;
+
+ /* All current transport protocols have the port numbers in the
+ * first four bytes of the transport header and this function is
+ * written with this assumption in mind.
+ */
+
+ sin.sin_family = AF_INET;
+ sin.sin_addr.s_addr = iph->daddr;
+ sin.sin_port = ports[1];
+ memset(sin.sin_zero, 0, sizeof(sin.sin_zero));
+
+ put_cmsg(msg, SOL_IP, IP_ORIGDSTADDR, sizeof(sin), &sin);
+}
+
+void ip_cmsg_recv_offset(struct msghdr *msg, struct sk_buff *skb,
+ int offset)
+{
+ struct inet_sock *inet = inet_sk(skb->sk);
+ unsigned int flags = inet->cmsg_flags;
+
+ /* Ordered by supposed usage frequency */
+ if (flags & IP_CMSG_PKTINFO) {
+ ip_cmsg_recv_pktinfo(msg, skb);
+
+ flags &= ~IP_CMSG_PKTINFO;
+ if (!flags)
+ return;
+ }
+
+ if (flags & IP_CMSG_TTL) {
+ ip_cmsg_recv_ttl(msg, skb);
+
+ flags &= ~IP_CMSG_TTL;
+ if (!flags)
+ return;
+ }
+
+ if (flags & IP_CMSG_TOS) {
+ ip_cmsg_recv_tos(msg, skb);
+
+ flags &= ~IP_CMSG_TOS;
+ if (!flags)
+ return;
+ }
+
+ if (flags & IP_CMSG_RECVOPTS) {
+ ip_cmsg_recv_opts(msg, skb);
+
+ flags &= ~IP_CMSG_RECVOPTS;
+ if (!flags)
+ return;
+ }
+
+ if (flags & IP_CMSG_RETOPTS) {
+ ip_cmsg_recv_retopts(msg, skb);
+
+ flags &= ~IP_CMSG_RETOPTS;
+ if (!flags)
+ return;
+ }
+
+ if (flags & IP_CMSG_PASSSEC) {
+ ip_cmsg_recv_security(msg, skb);
+
+ flags &= ~IP_CMSG_PASSSEC;
+ if (!flags)
+ return;
+ }
+
+ if (flags & IP_CMSG_ORIGDSTADDR) {
+ ip_cmsg_recv_dstaddr(msg, skb);
+
+ flags &= ~IP_CMSG_ORIGDSTADDR;
+ if (!flags)
+ return;
+ }
+
+ if (flags & IP_CMSG_CHECKSUM)
+ ip_cmsg_recv_checksum(msg, skb, offset);
+}
+EXPORT_SYMBOL(ip_cmsg_recv_offset);
+
+int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc,
+ bool allow_ipv6)
+{
+ int err, val;
+ struct cmsghdr *cmsg;
+
+ for_each_cmsghdr(cmsg, msg) {
+ if (!CMSG_OK(msg, cmsg))
+ return -EINVAL;
+#if IS_ENABLED(CONFIG_IPV6)
+ if (allow_ipv6 &&
+ cmsg->cmsg_level == SOL_IPV6 &&
+ cmsg->cmsg_type == IPV6_PKTINFO) {
+ struct in6_pktinfo *src_info;
+
+ if (cmsg->cmsg_len < CMSG_LEN(sizeof(*src_info)))
+ return -EINVAL;
+ src_info = (struct in6_pktinfo *)CMSG_DATA(cmsg);
+ if (!ipv6_addr_v4mapped(&src_info->ipi6_addr))
+ return -EINVAL;
+ ipc->oif = src_info->ipi6_ifindex;
+ ipc->addr = src_info->ipi6_addr.s6_addr32[3];
+ continue;
+ }
+#endif
+ if (cmsg->cmsg_level != SOL_IP)
+ continue;
+ switch (cmsg->cmsg_type) {
+ case IP_RETOPTS:
+ err = cmsg->cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr));
+ err = ip_options_get(net, &ipc->opt, CMSG_DATA(cmsg),
+ err < 40 ? err : 40);
+ if (err)
+ return err;
+ break;
+ case IP_PKTINFO:
+ {
+ struct in_pktinfo *info;
+ if (cmsg->cmsg_len != CMSG_LEN(sizeof(struct in_pktinfo)))
+ return -EINVAL;
+ info = (struct in_pktinfo *)CMSG_DATA(cmsg);
+ ipc->oif = info->ipi_ifindex;
+ ipc->addr = info->ipi_spec_dst.s_addr;
+ break;
+ }
+ case IP_TTL:
+ if (cmsg->cmsg_len != CMSG_LEN(sizeof(int)))
+ return -EINVAL;
+ val = *(int *)CMSG_DATA(cmsg);
+ if (val < 1 || val > 255)
+ return -EINVAL;
+ ipc->ttl = val;
+ break;
+ case IP_TOS:
+ if (cmsg->cmsg_len != CMSG_LEN(sizeof(int)))
+ return -EINVAL;
+ val = *(int *)CMSG_DATA(cmsg);
+ if (val < 0 || val > 255)
+ return -EINVAL;
+ ipc->tos = val;
+ ipc->priority = rt_tos2priority(ipc->tos);
+ break;
+
+ default:
+ return -EINVAL;
+ }
+ }
+ return 0;
+}
+
+
+/* Special input handler for packets caught by router alert option.
+ They are selected only by protocol field, and then processed likely
+ local ones; but only if someone wants them! Otherwise, router
+ not running rsvpd will kill RSVP.
+
+ It is user level problem, what it will make with them.
+ I have no idea, how it will masquearde or NAT them (it is joke, joke :-)),
+ but receiver should be enough clever f.e. to forward mtrace requests,
+ sent to multicast group to reach destination designated router.
+ */
+struct ip_ra_chain __rcu *ip_ra_chain;
+static DEFINE_SPINLOCK(ip_ra_lock);
+
+
+static void ip_ra_destroy_rcu(struct rcu_head *head)
+{
+ struct ip_ra_chain *ra = container_of(head, struct ip_ra_chain, rcu);
+
+ sock_put(ra->saved_sk);
+ kfree(ra);
+}
+
+int ip_ra_control(struct sock *sk, unsigned char on,
+ void (*destructor)(struct sock *))
+{
+ struct ip_ra_chain *ra, *new_ra;
+ struct ip_ra_chain __rcu **rap;
+
+ if (sk->sk_type != SOCK_RAW || inet_sk(sk)->inet_num == IPPROTO_RAW)
+ return -EINVAL;
+
+ new_ra = on ? kmalloc(sizeof(*new_ra), GFP_KERNEL) : NULL;
+
+ spin_lock_bh(&ip_ra_lock);
+ for (rap = &ip_ra_chain;
+ (ra = rcu_dereference_protected(*rap,
+ lockdep_is_held(&ip_ra_lock))) != NULL;
+ rap = &ra->next) {
+ if (ra->sk == sk) {
+ if (on) {
+ spin_unlock_bh(&ip_ra_lock);
+ kfree(new_ra);
+ return -EADDRINUSE;
+ }
+ /* dont let ip_call_ra_chain() use sk again */
+ ra->sk = NULL;
+ RCU_INIT_POINTER(*rap, ra->next);
+ spin_unlock_bh(&ip_ra_lock);
+
+ if (ra->destructor)
+ ra->destructor(sk);
+ /*
+ * Delay sock_put(sk) and kfree(ra) after one rcu grace
+ * period. This guarantee ip_call_ra_chain() dont need
+ * to mess with socket refcounts.
+ */
+ ra->saved_sk = sk;
+ call_rcu(&ra->rcu, ip_ra_destroy_rcu);
+ return 0;
+ }
+ }
+ if (!new_ra) {
+ spin_unlock_bh(&ip_ra_lock);
+ return -ENOBUFS;
+ }
+ new_ra->sk = sk;
+ new_ra->destructor = destructor;
+
+ RCU_INIT_POINTER(new_ra->next, ra);
+ rcu_assign_pointer(*rap, new_ra);
+ sock_hold(sk);
+ spin_unlock_bh(&ip_ra_lock);
+
+ return 0;
+}
+
+void ip_icmp_error(struct sock *sk, struct sk_buff *skb, int err,
+ __be16 port, u32 info, u8 *payload)
+{
+ struct sock_exterr_skb *serr;
+
+ skb = skb_clone(skb, GFP_ATOMIC);
+ if (!skb)
+ return;
+
+ serr = SKB_EXT_ERR(skb);
+ serr->ee.ee_errno = err;
+ serr->ee.ee_origin = SO_EE_ORIGIN_ICMP;
+ serr->ee.ee_type = icmp_hdr(skb)->type;
+ serr->ee.ee_code = icmp_hdr(skb)->code;
+ serr->ee.ee_pad = 0;
+ serr->ee.ee_info = info;
+ serr->ee.ee_data = 0;
+ serr->addr_offset = (u8 *)&(((struct iphdr *)(icmp_hdr(skb) + 1))->daddr) -
+ skb_network_header(skb);
+ serr->port = port;
+
+ if (skb_pull(skb, payload - skb->data)) {
+ skb_reset_transport_header(skb);
+ if (sock_queue_err_skb(sk, skb) == 0)
+ return;
+ }
+ kfree_skb(skb);
+}
+
+void ip_local_error(struct sock *sk, int err, __be32 daddr, __be16 port, u32 info)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ struct sock_exterr_skb *serr;
+ struct iphdr *iph;
+ struct sk_buff *skb;
+
+ if (!inet->recverr)
+ return;
+
+ skb = alloc_skb(sizeof(struct iphdr), GFP_ATOMIC);
+ if (!skb)
+ return;
+
+ skb_put(skb, sizeof(struct iphdr));
+ skb_reset_network_header(skb);
+ iph = ip_hdr(skb);
+ iph->daddr = daddr;
+
+ serr = SKB_EXT_ERR(skb);
+ serr->ee.ee_errno = err;
+ serr->ee.ee_origin = SO_EE_ORIGIN_LOCAL;
+ serr->ee.ee_type = 0;
+ serr->ee.ee_code = 0;
+ serr->ee.ee_pad = 0;
+ serr->ee.ee_info = info;
+ serr->ee.ee_data = 0;
+ serr->addr_offset = (u8 *)&iph->daddr - skb_network_header(skb);
+ serr->port = port;
+
+ __skb_pull(skb, skb_tail_pointer(skb) - skb->data);
+ skb_reset_transport_header(skb);
+
+ if (sock_queue_err_skb(sk, skb))
+ kfree_skb(skb);
+}
+
+/* For some errors we have valid addr_offset even with zero payload and
+ * zero port. Also, addr_offset should be supported if port is set.
+ */
+static inline bool ipv4_datagram_support_addr(struct sock_exterr_skb *serr)
+{
+ return serr->ee.ee_origin == SO_EE_ORIGIN_ICMP ||
+ serr->ee.ee_origin == SO_EE_ORIGIN_LOCAL || serr->port;
+}
+
+/* IPv4 supports cmsg on all imcp errors and some timestamps
+ *
+ * Timestamp code paths do not initialize the fields expected by cmsg:
+ * the PKTINFO fields in skb->cb[]. Fill those in here.
+ */
+static bool ipv4_datagram_support_cmsg(const struct sock *sk,
+ struct sk_buff *skb,
+ int ee_origin)
+{
+ struct in_pktinfo *info;
+
+ if (ee_origin == SO_EE_ORIGIN_ICMP)
+ return true;
+
+ if (ee_origin == SO_EE_ORIGIN_LOCAL)
+ return false;
+
+ /* Support IP_PKTINFO on tstamp packets if requested, to correlate
+ * timestamp with egress dev. Not possible for packets without dev
+ * or without payload (SOF_TIMESTAMPING_OPT_TSONLY).
+ */
+ if ((!(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_CMSG)) ||
+ (!skb->dev))
+ return false;
+
+ info = PKTINFO_SKB_CB(skb);
+ info->ipi_spec_dst.s_addr = ip_hdr(skb)->saddr;
+ info->ipi_ifindex = skb->dev->ifindex;
+ return true;
+}
+
+/*
+ * Handle MSG_ERRQUEUE
+ */
+int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len)
+{
+ struct sock_exterr_skb *serr;
+ struct sk_buff *skb;
+ DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name);
+ struct {
+ struct sock_extended_err ee;
+ struct sockaddr_in offender;
+ } errhdr;
+ int err;
+ int copied;
+
+ WARN_ON_ONCE(sk->sk_family == AF_INET6);
+
+ err = -EAGAIN;
+ skb = sock_dequeue_err_skb(sk);
+ if (!skb)
+ goto out;
+
+ copied = skb->len;
+ if (copied > len) {
+ msg->msg_flags |= MSG_TRUNC;
+ copied = len;
+ }
+ err = skb_copy_datagram_msg(skb, 0, msg, copied);
+ if (err)
+ goto out_free_skb;
+
+ sock_recv_timestamp(msg, sk, skb);
+
+ serr = SKB_EXT_ERR(skb);
+
+ if (sin && ipv4_datagram_support_addr(serr)) {
+ sin->sin_family = AF_INET;
+ sin->sin_addr.s_addr = *(__be32 *)(skb_network_header(skb) +
+ serr->addr_offset);
+ sin->sin_port = serr->port;
+ memset(&sin->sin_zero, 0, sizeof(sin->sin_zero));
+ *addr_len = sizeof(*sin);
+ }
+
+ memcpy(&errhdr.ee, &serr->ee, sizeof(struct sock_extended_err));
+ sin = &errhdr.offender;
+ memset(sin, 0, sizeof(*sin));
+
+ if (ipv4_datagram_support_cmsg(sk, skb, serr->ee.ee_origin)) {
+ sin->sin_family = AF_INET;
+ sin->sin_addr.s_addr = ip_hdr(skb)->saddr;
+ if (inet_sk(sk)->cmsg_flags)
+ ip_cmsg_recv(msg, skb);
+ }
+
+ put_cmsg(msg, SOL_IP, IP_RECVERR, sizeof(errhdr), &errhdr);
+
+ /* Now we could try to dump offended packet options */
+
+ msg->msg_flags |= MSG_ERRQUEUE;
+ err = copied;
+
+out_free_skb:
+ kfree_skb(skb);
+out:
+ return err;
+}
+
+
+/*
+ * Socket option code for IP. This is the end of the line after any
+ * TCP,UDP etc options on an IP socket.
+ */
+static bool setsockopt_needs_rtnl(int optname)
+{
+ switch (optname) {
+ case IP_ADD_MEMBERSHIP:
+ case IP_ADD_SOURCE_MEMBERSHIP:
+ case IP_BLOCK_SOURCE:
+ case IP_DROP_MEMBERSHIP:
+ case IP_DROP_SOURCE_MEMBERSHIP:
+ case IP_MSFILTER:
+ case IP_UNBLOCK_SOURCE:
+ case MCAST_BLOCK_SOURCE:
+ case MCAST_MSFILTER:
+ case MCAST_JOIN_GROUP:
+ case MCAST_JOIN_SOURCE_GROUP:
+ case MCAST_LEAVE_GROUP:
+ case MCAST_LEAVE_SOURCE_GROUP:
+ case MCAST_UNBLOCK_SOURCE:
+ return true;
+ }
+ return false;
+}
+
+static int do_ip_setsockopt(struct sock *sk, int level,
+ int optname, char __user *optval, unsigned int optlen)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ int val = 0, err;
+ bool needs_rtnl = setsockopt_needs_rtnl(optname);
+
+ switch (optname) {
+ case IP_PKTINFO:
+ case IP_RECVTTL:
+ case IP_RECVOPTS:
+ case IP_RECVTOS:
+ case IP_RETOPTS:
+ case IP_TOS:
+ case IP_TTL:
+ case IP_HDRINCL:
+ case IP_MTU_DISCOVER:
+ case IP_RECVERR:
+ case IP_ROUTER_ALERT:
+ case IP_FREEBIND:
+ case IP_PASSSEC:
+ case IP_TRANSPARENT:
+ case IP_MINTTL:
+ case IP_NODEFRAG:
+ case IP_UNICAST_IF:
+ case IP_MULTICAST_TTL:
+ case IP_MULTICAST_ALL:
+ case IP_MULTICAST_LOOP:
+ case IP_RECVORIGDSTADDR:
+ case IP_CHECKSUM:
+ if (optlen >= sizeof(int)) {
+ if (get_user(val, (int __user *) optval))
+ return -EFAULT;
+ } else if (optlen >= sizeof(char)) {
+ unsigned char ucval;
+
+ if (get_user(ucval, (unsigned char __user *) optval))
+ return -EFAULT;
+ val = (int) ucval;
+ }
+ }
+
+ /* If optlen==0, it is equivalent to val == 0 */
+
+ if (ip_mroute_opt(optname))
+ return ip_mroute_setsockopt(sk, optname, optval, optlen);
+
+ err = 0;
+ if (needs_rtnl)
+ rtnl_lock();
+ lock_sock(sk);
+
+ switch (optname) {
+ case IP_OPTIONS:
+ {
+ struct ip_options_rcu *old, *opt = NULL;
+
+ if (optlen > 40)
+ goto e_inval;
+ err = ip_options_get_from_user(sock_net(sk), &opt,
+ optval, optlen);
+ if (err)
+ break;
+ old = rcu_dereference_protected(inet->inet_opt,
+ sock_owned_by_user(sk));
+ if (inet->is_icsk) {
+ struct inet_connection_sock *icsk = inet_csk(sk);
+#if IS_ENABLED(CONFIG_IPV6)
+ if (sk->sk_family == PF_INET ||
+ (!((1 << sk->sk_state) &
+ (TCPF_LISTEN | TCPF_CLOSE)) &&
+ inet->inet_daddr != LOOPBACK4_IPV6)) {
+#endif
+ if (old)
+ icsk->icsk_ext_hdr_len -= old->opt.optlen;
+ if (opt)
+ icsk->icsk_ext_hdr_len += opt->opt.optlen;
+ icsk->icsk_sync_mss(sk, icsk->icsk_pmtu_cookie);
+#if IS_ENABLED(CONFIG_IPV6)
+ }
+#endif
+ }
+ rcu_assign_pointer(inet->inet_opt, opt);
+ if (old)
+ kfree_rcu(old, rcu);
+ break;
+ }
+ case IP_PKTINFO:
+ if (val)
+ inet->cmsg_flags |= IP_CMSG_PKTINFO;
+ else
+ inet->cmsg_flags &= ~IP_CMSG_PKTINFO;
+ break;
+ case IP_RECVTTL:
+ if (val)
+ inet->cmsg_flags |= IP_CMSG_TTL;
+ else
+ inet->cmsg_flags &= ~IP_CMSG_TTL;
+ break;
+ case IP_RECVTOS:
+ if (val)
+ inet->cmsg_flags |= IP_CMSG_TOS;
+ else
+ inet->cmsg_flags &= ~IP_CMSG_TOS;
+ break;
+ case IP_RECVOPTS:
+ if (val)
+ inet->cmsg_flags |= IP_CMSG_RECVOPTS;
+ else
+ inet->cmsg_flags &= ~IP_CMSG_RECVOPTS;
+ break;
+ case IP_RETOPTS:
+ if (val)
+ inet->cmsg_flags |= IP_CMSG_RETOPTS;
+ else
+ inet->cmsg_flags &= ~IP_CMSG_RETOPTS;
+ break;
+ case IP_PASSSEC:
+ if (val)
+ inet->cmsg_flags |= IP_CMSG_PASSSEC;
+ else
+ inet->cmsg_flags &= ~IP_CMSG_PASSSEC;
+ break;
+ case IP_RECVORIGDSTADDR:
+ if (val)
+ inet->cmsg_flags |= IP_CMSG_ORIGDSTADDR;
+ else
+ inet->cmsg_flags &= ~IP_CMSG_ORIGDSTADDR;
+ break;
+ case IP_CHECKSUM:
+ if (val) {
+ if (!(inet->cmsg_flags & IP_CMSG_CHECKSUM)) {
+ inet_inc_convert_csum(sk);
+ inet->cmsg_flags |= IP_CMSG_CHECKSUM;
+ }
+ } else {
+ if (inet->cmsg_flags & IP_CMSG_CHECKSUM) {
+ inet_dec_convert_csum(sk);
+ inet->cmsg_flags &= ~IP_CMSG_CHECKSUM;
+ }
+ }
+ break;
+ case IP_TOS: /* This sets both TOS and Precedence */
+ if (sk->sk_type == SOCK_STREAM) {
+ val &= ~INET_ECN_MASK;
+ val |= inet->tos & INET_ECN_MASK;
+ }
+ if (inet->tos != val) {
+ inet->tos = val;
+ sk->sk_priority = rt_tos2priority(val);
+ sk_dst_reset(sk);
+ }
+ break;
+ case IP_TTL:
+ if (optlen < 1)
+ goto e_inval;
+ if (val != -1 && (val < 1 || val > 255))
+ goto e_inval;
+ inet->uc_ttl = val;
+ break;
+ case IP_HDRINCL:
+ if (sk->sk_type != SOCK_RAW) {
+ err = -ENOPROTOOPT;
+ break;
+ }
+ inet->hdrincl = val ? 1 : 0;
+ break;
+ case IP_NODEFRAG:
+ if (sk->sk_type != SOCK_RAW) {
+ err = -ENOPROTOOPT;
+ break;
+ }
+ inet->nodefrag = val ? 1 : 0;
+ break;
+ case IP_MTU_DISCOVER:
+ if (val < IP_PMTUDISC_DONT || val > IP_PMTUDISC_OMIT)
+ goto e_inval;
+ inet->pmtudisc = val;
+ break;
+ case IP_RECVERR:
+ inet->recverr = !!val;
+ if (!val)
+ skb_queue_purge(&sk->sk_error_queue);
+ break;
+ case IP_MULTICAST_TTL:
+ if (sk->sk_type == SOCK_STREAM)
+ goto e_inval;
+ if (optlen < 1)
+ goto e_inval;
+ if (val == -1)
+ val = 1;
+ if (val < 0 || val > 255)
+ goto e_inval;
+ inet->mc_ttl = val;
+ break;
+ case IP_MULTICAST_LOOP:
+ if (optlen < 1)
+ goto e_inval;
+ inet->mc_loop = !!val;
+ break;
+ case IP_UNICAST_IF:
+ {
+ struct net_device *dev = NULL;
+ int ifindex;
+
+ if (optlen != sizeof(int))
+ goto e_inval;
+
+ ifindex = (__force int)ntohl((__force __be32)val);
+ if (ifindex == 0) {
+ inet->uc_index = 0;
+ err = 0;
+ break;
+ }
+
+ dev = dev_get_by_index(sock_net(sk), ifindex);
+ err = -EADDRNOTAVAIL;
+ if (!dev)
+ break;
+ dev_put(dev);
+
+ err = -EINVAL;
+ if (sk->sk_bound_dev_if)
+ break;
+
+ inet->uc_index = ifindex;
+ err = 0;
+ break;
+ }
+ case IP_MULTICAST_IF:
+ {
+ struct ip_mreqn mreq;
+ struct net_device *dev = NULL;
+
+ if (sk->sk_type == SOCK_STREAM)
+ goto e_inval;
+ /*
+ * Check the arguments are allowable
+ */
+
+ if (optlen < sizeof(struct in_addr))
+ goto e_inval;
+
+ err = -EFAULT;
+ if (optlen >= sizeof(struct ip_mreqn)) {
+ if (copy_from_user(&mreq, optval, sizeof(mreq)))
+ break;
+ } else {
+ memset(&mreq, 0, sizeof(mreq));
+ if (optlen >= sizeof(struct ip_mreq)) {
+ if (copy_from_user(&mreq, optval,
+ sizeof(struct ip_mreq)))
+ break;
+ } else if (optlen >= sizeof(struct in_addr)) {
+ if (copy_from_user(&mreq.imr_address, optval,
+ sizeof(struct in_addr)))
+ break;
+ }
+ }
+
+ if (!mreq.imr_ifindex) {
+ if (mreq.imr_address.s_addr == htonl(INADDR_ANY)) {
+ inet->mc_index = 0;
+ inet->mc_addr = 0;
+ err = 0;
+ break;
+ }
+ dev = ip_dev_find(sock_net(sk), mreq.imr_address.s_addr);
+ if (dev)
+ mreq.imr_ifindex = dev->ifindex;
+ } else
+ dev = dev_get_by_index(sock_net(sk), mreq.imr_ifindex);
+
+
+ err = -EADDRNOTAVAIL;
+ if (!dev)
+ break;
+ dev_put(dev);
+
+ err = -EINVAL;
+ if (sk->sk_bound_dev_if &&
+ mreq.imr_ifindex != sk->sk_bound_dev_if)
+ break;
+
+ inet->mc_index = mreq.imr_ifindex;
+ inet->mc_addr = mreq.imr_address.s_addr;
+ err = 0;
+ break;
+ }
+
+ case IP_ADD_MEMBERSHIP:
+ case IP_DROP_MEMBERSHIP:
+ {
+ struct ip_mreqn mreq;
+
+ err = -EPROTO;
+ if (inet_sk(sk)->is_icsk)
+ break;
+
+ if (optlen < sizeof(struct ip_mreq))
+ goto e_inval;
+ err = -EFAULT;
+ if (optlen >= sizeof(struct ip_mreqn)) {
+ if (copy_from_user(&mreq, optval, sizeof(mreq)))
+ break;
+ } else {
+ memset(&mreq, 0, sizeof(mreq));
+ if (copy_from_user(&mreq, optval, sizeof(struct ip_mreq)))
+ break;
+ }
+
+ if (optname == IP_ADD_MEMBERSHIP)
+ err = ip_mc_join_group(sk, &mreq);
+ else
+ err = ip_mc_leave_group(sk, &mreq);
+ break;
+ }
+ case IP_MSFILTER:
+ {
+ struct ip_msfilter *msf;
+
+ if (optlen < IP_MSFILTER_SIZE(0))
+ goto e_inval;
+ if (optlen > sysctl_optmem_max) {
+ err = -ENOBUFS;
+ break;
+ }
+ msf = kmalloc(optlen, GFP_KERNEL);
+ if (!msf) {
+ err = -ENOBUFS;
+ break;
+ }
+ err = -EFAULT;
+ if (copy_from_user(msf, optval, optlen)) {
+ kfree(msf);
+ break;
+ }
+ /* numsrc >= (1G-4) overflow in 32 bits */
+ if (msf->imsf_numsrc >= 0x3ffffffcU ||
+ msf->imsf_numsrc > sysctl_igmp_max_msf) {
+ kfree(msf);
+ err = -ENOBUFS;
+ break;
+ }
+ if (IP_MSFILTER_SIZE(msf->imsf_numsrc) > optlen) {
+ kfree(msf);
+ err = -EINVAL;
+ break;
+ }
+ err = ip_mc_msfilter(sk, msf, 0);
+ kfree(msf);
+ break;
+ }
+ case IP_BLOCK_SOURCE:
+ case IP_UNBLOCK_SOURCE:
+ case IP_ADD_SOURCE_MEMBERSHIP:
+ case IP_DROP_SOURCE_MEMBERSHIP:
+ {
+ struct ip_mreq_source mreqs;
+ int omode, add;
+
+ if (optlen != sizeof(struct ip_mreq_source))
+ goto e_inval;
+ if (copy_from_user(&mreqs, optval, sizeof(mreqs))) {
+ err = -EFAULT;
+ break;
+ }
+ if (optname == IP_BLOCK_SOURCE) {
+ omode = MCAST_EXCLUDE;
+ add = 1;
+ } else if (optname == IP_UNBLOCK_SOURCE) {
+ omode = MCAST_EXCLUDE;
+ add = 0;
+ } else if (optname == IP_ADD_SOURCE_MEMBERSHIP) {
+ struct ip_mreqn mreq;
+
+ mreq.imr_multiaddr.s_addr = mreqs.imr_multiaddr;
+ mreq.imr_address.s_addr = mreqs.imr_interface;
+ mreq.imr_ifindex = 0;
+ err = ip_mc_join_group(sk, &mreq);
+ if (err && err != -EADDRINUSE)
+ break;
+ omode = MCAST_INCLUDE;
+ add = 1;
+ } else /* IP_DROP_SOURCE_MEMBERSHIP */ {
+ omode = MCAST_INCLUDE;
+ add = 0;
+ }
+ err = ip_mc_source(add, omode, sk, &mreqs, 0);
+ break;
+ }
+ case MCAST_JOIN_GROUP:
+ case MCAST_LEAVE_GROUP:
+ {
+ struct group_req greq;
+ struct sockaddr_in *psin;
+ struct ip_mreqn mreq;
+
+ if (optlen < sizeof(struct group_req))
+ goto e_inval;
+ err = -EFAULT;
+ if (copy_from_user(&greq, optval, sizeof(greq)))
+ break;
+ psin = (struct sockaddr_in *)&greq.gr_group;
+ if (psin->sin_family != AF_INET)
+ goto e_inval;
+ memset(&mreq, 0, sizeof(mreq));
+ mreq.imr_multiaddr = psin->sin_addr;
+ mreq.imr_ifindex = greq.gr_interface;
+
+ if (optname == MCAST_JOIN_GROUP)
+ err = ip_mc_join_group(sk, &mreq);
+ else
+ err = ip_mc_leave_group(sk, &mreq);
+ break;
+ }
+ case MCAST_JOIN_SOURCE_GROUP:
+ case MCAST_LEAVE_SOURCE_GROUP:
+ case MCAST_BLOCK_SOURCE:
+ case MCAST_UNBLOCK_SOURCE:
+ {
+ struct group_source_req greqs;
+ struct ip_mreq_source mreqs;
+ struct sockaddr_in *psin;
+ int omode, add;
+
+ if (optlen != sizeof(struct group_source_req))
+ goto e_inval;
+ if (copy_from_user(&greqs, optval, sizeof(greqs))) {
+ err = -EFAULT;
+ break;
+ }
+ if (greqs.gsr_group.ss_family != AF_INET ||
+ greqs.gsr_source.ss_family != AF_INET) {
+ err = -EADDRNOTAVAIL;
+ break;
+ }
+ psin = (struct sockaddr_in *)&greqs.gsr_group;
+ mreqs.imr_multiaddr = psin->sin_addr.s_addr;
+ psin = (struct sockaddr_in *)&greqs.gsr_source;
+ mreqs.imr_sourceaddr = psin->sin_addr.s_addr;
+ mreqs.imr_interface = 0; /* use index for mc_source */
+
+ if (optname == MCAST_BLOCK_SOURCE) {
+ omode = MCAST_EXCLUDE;
+ add = 1;
+ } else if (optname == MCAST_UNBLOCK_SOURCE) {
+ omode = MCAST_EXCLUDE;
+ add = 0;
+ } else if (optname == MCAST_JOIN_SOURCE_GROUP) {
+ struct ip_mreqn mreq;
+
+ psin = (struct sockaddr_in *)&greqs.gsr_group;
+ mreq.imr_multiaddr = psin->sin_addr;
+ mreq.imr_address.s_addr = 0;
+ mreq.imr_ifindex = greqs.gsr_interface;
+ err = ip_mc_join_group(sk, &mreq);
+ if (err && err != -EADDRINUSE)
+ break;
+ greqs.gsr_interface = mreq.imr_ifindex;
+ omode = MCAST_INCLUDE;
+ add = 1;
+ } else /* MCAST_LEAVE_SOURCE_GROUP */ {
+ omode = MCAST_INCLUDE;
+ add = 0;
+ }
+ err = ip_mc_source(add, omode, sk, &mreqs,
+ greqs.gsr_interface);
+ break;
+ }
+ case MCAST_MSFILTER:
+ {
+ struct sockaddr_in *psin;
+ struct ip_msfilter *msf = NULL;
+ struct group_filter *gsf = NULL;
+ int msize, i, ifindex;
+
+ if (optlen < GROUP_FILTER_SIZE(0))
+ goto e_inval;
+ if (optlen > sysctl_optmem_max) {
+ err = -ENOBUFS;
+ break;
+ }
+ gsf = kmalloc(optlen, GFP_KERNEL);
+ if (!gsf) {
+ err = -ENOBUFS;
+ break;
+ }
+ err = -EFAULT;
+ if (copy_from_user(gsf, optval, optlen))
+ goto mc_msf_out;
+
+ /* numsrc >= (4G-140)/128 overflow in 32 bits */
+ if (gsf->gf_numsrc >= 0x1ffffff ||
+ gsf->gf_numsrc > sysctl_igmp_max_msf) {
+ err = -ENOBUFS;
+ goto mc_msf_out;
+ }
+ if (GROUP_FILTER_SIZE(gsf->gf_numsrc) > optlen) {
+ err = -EINVAL;
+ goto mc_msf_out;
+ }
+ msize = IP_MSFILTER_SIZE(gsf->gf_numsrc);
+ msf = kmalloc(msize, GFP_KERNEL);
+ if (!msf) {
+ err = -ENOBUFS;
+ goto mc_msf_out;
+ }
+ ifindex = gsf->gf_interface;
+ psin = (struct sockaddr_in *)&gsf->gf_group;
+ if (psin->sin_family != AF_INET) {
+ err = -EADDRNOTAVAIL;
+ goto mc_msf_out;
+ }
+ msf->imsf_multiaddr = psin->sin_addr.s_addr;
+ msf->imsf_interface = 0;
+ msf->imsf_fmode = gsf->gf_fmode;
+ msf->imsf_numsrc = gsf->gf_numsrc;
+ err = -EADDRNOTAVAIL;
+ for (i = 0; i < gsf->gf_numsrc; ++i) {
+ psin = (struct sockaddr_in *)&gsf->gf_slist[i];
+
+ if (psin->sin_family != AF_INET)
+ goto mc_msf_out;
+ msf->imsf_slist[i] = psin->sin_addr.s_addr;
+ }
+ kfree(gsf);
+ gsf = NULL;
+
+ err = ip_mc_msfilter(sk, msf, ifindex);
+mc_msf_out:
+ kfree(msf);
+ kfree(gsf);
+ break;
+ }
+ case IP_MULTICAST_ALL:
+ if (optlen < 1)
+ goto e_inval;
+ if (val != 0 && val != 1)
+ goto e_inval;
+ inet->mc_all = val;
+ break;
+ case IP_ROUTER_ALERT:
+ err = ip_ra_control(sk, val ? 1 : 0, NULL);
+ break;
+
+ case IP_FREEBIND:
+ if (optlen < 1)
+ goto e_inval;
+ inet->freebind = !!val;
+ break;
+
+ case IP_IPSEC_POLICY:
+ case IP_XFRM_POLICY:
+ err = -EPERM;
+ if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
+ break;
+ err = xfrm_user_policy(sk, optname, optval, optlen);
+ break;
+
+ case IP_TRANSPARENT:
+ if (!!val && !ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
+ !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
+ err = -EPERM;
+ break;
+ }
+ if (optlen < 1)
+ goto e_inval;
+ inet->transparent = !!val;
+ break;
+
+ case IP_MINTTL:
+ if (optlen < 1)
+ goto e_inval;
+ if (val < 0 || val > 255)
+ goto e_inval;
+ inet->min_ttl = val;
+ break;
+
+ default:
+ err = -ENOPROTOOPT;
+ break;
+ }
+ release_sock(sk);
+ if (needs_rtnl)
+ rtnl_unlock();
+ return err;
+
+e_inval:
+ release_sock(sk);
+ if (needs_rtnl)
+ rtnl_unlock();
+ return -EINVAL;
+}
+
+/**
+ * ipv4_pktinfo_prepare - transfer some info from rtable to skb
+ * @sk: socket
+ * @skb: buffer
+ *
+ * To support IP_CMSG_PKTINFO option, we store rt_iif and specific
+ * destination in skb->cb[] before dst drop.
+ * This way, receiver doesn't make cache line misses to read rtable.
+ */
+void ipv4_pktinfo_prepare(const struct sock *sk, struct sk_buff *skb)
+{
+ struct in_pktinfo *pktinfo = PKTINFO_SKB_CB(skb);
+ bool prepare = (inet_sk(sk)->cmsg_flags & IP_CMSG_PKTINFO) ||
+ ipv6_sk_rxinfo(sk);
+
+ if (prepare && skb_rtable(skb)) {
+ pktinfo->ipi_ifindex = inet_iif(skb);
+ pktinfo->ipi_spec_dst.s_addr = fib_compute_spec_dst(skb);
+ } else {
+ pktinfo->ipi_ifindex = 0;
+ pktinfo->ipi_spec_dst.s_addr = 0;
+ }
+ skb_dst_drop(skb);
+}
+
+int ip_setsockopt(struct sock *sk, int level,
+ int optname, char __user *optval, unsigned int optlen)
+{
+ int err;
+
+ if (level != SOL_IP)
+ return -ENOPROTOOPT;
+
+ err = do_ip_setsockopt(sk, level, optname, optval, optlen);
+#ifdef CONFIG_NETFILTER
+ /* we need to exclude all possible ENOPROTOOPTs except default case */
+ if (err == -ENOPROTOOPT && optname != IP_HDRINCL &&
+ optname != IP_IPSEC_POLICY &&
+ optname != IP_XFRM_POLICY &&
+ !ip_mroute_opt(optname)) {
+ lock_sock(sk);
+ err = nf_setsockopt(sk, PF_INET, optname, optval, optlen);
+ release_sock(sk);
+ }
+#endif
+ return err;
+}
+EXPORT_SYMBOL(ip_setsockopt);
+
+#ifdef CONFIG_COMPAT
+int compat_ip_setsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, unsigned int optlen)
+{
+ int err;
+
+ if (level != SOL_IP)
+ return -ENOPROTOOPT;
+
+ if (optname >= MCAST_JOIN_GROUP && optname <= MCAST_MSFILTER)
+ return compat_mc_setsockopt(sk, level, optname, optval, optlen,
+ ip_setsockopt);
+
+ err = do_ip_setsockopt(sk, level, optname, optval, optlen);
+#ifdef CONFIG_NETFILTER
+ /* we need to exclude all possible ENOPROTOOPTs except default case */
+ if (err == -ENOPROTOOPT && optname != IP_HDRINCL &&
+ optname != IP_IPSEC_POLICY &&
+ optname != IP_XFRM_POLICY &&
+ !ip_mroute_opt(optname)) {
+ lock_sock(sk);
+ err = compat_nf_setsockopt(sk, PF_INET, optname,
+ optval, optlen);
+ release_sock(sk);
+ }
+#endif
+ return err;
+}
+EXPORT_SYMBOL(compat_ip_setsockopt);
+#endif
+
+/*
+ * Get the options. Note for future reference. The GET of IP options gets
+ * the _received_ ones. The set sets the _sent_ ones.
+ */
+
+static int do_ip_getsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, int __user *optlen, unsigned int flags)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ int val;
+ int len;
+
+ if (level != SOL_IP)
+ return -EOPNOTSUPP;
+
+ if (ip_mroute_opt(optname))
+ return ip_mroute_getsockopt(sk, optname, optval, optlen);
+
+ if (get_user(len, optlen))
+ return -EFAULT;
+ if (len < 0)
+ return -EINVAL;
+
+ lock_sock(sk);
+
+ switch (optname) {
+ case IP_OPTIONS:
+ {
+ unsigned char optbuf[sizeof(struct ip_options)+40];
+ struct ip_options *opt = (struct ip_options *)optbuf;
+ struct ip_options_rcu *inet_opt;
+
+ inet_opt = rcu_dereference_protected(inet->inet_opt,
+ sock_owned_by_user(sk));
+ opt->optlen = 0;
+ if (inet_opt)
+ memcpy(optbuf, &inet_opt->opt,
+ sizeof(struct ip_options) +
+ inet_opt->opt.optlen);
+ release_sock(sk);
+
+ if (opt->optlen == 0)
+ return put_user(0, optlen);
+
+ ip_options_undo(opt);
+
+ len = min_t(unsigned int, len, opt->optlen);
+ if (put_user(len, optlen))
+ return -EFAULT;
+ if (copy_to_user(optval, opt->__data, len))
+ return -EFAULT;
+ return 0;
+ }
+ case IP_PKTINFO:
+ val = (inet->cmsg_flags & IP_CMSG_PKTINFO) != 0;
+ break;
+ case IP_RECVTTL:
+ val = (inet->cmsg_flags & IP_CMSG_TTL) != 0;
+ break;
+ case IP_RECVTOS:
+ val = (inet->cmsg_flags & IP_CMSG_TOS) != 0;
+ break;
+ case IP_RECVOPTS:
+ val = (inet->cmsg_flags & IP_CMSG_RECVOPTS) != 0;
+ break;
+ case IP_RETOPTS:
+ val = (inet->cmsg_flags & IP_CMSG_RETOPTS) != 0;
+ break;
+ case IP_PASSSEC:
+ val = (inet->cmsg_flags & IP_CMSG_PASSSEC) != 0;
+ break;
+ case IP_RECVORIGDSTADDR:
+ val = (inet->cmsg_flags & IP_CMSG_ORIGDSTADDR) != 0;
+ break;
+ case IP_CHECKSUM:
+ val = (inet->cmsg_flags & IP_CMSG_CHECKSUM) != 0;
+ break;
+ case IP_TOS:
+ val = inet->tos;
+ break;
+ case IP_TTL:
+ val = (inet->uc_ttl == -1 ?
+ sysctl_ip_default_ttl :
+ inet->uc_ttl);
+ break;
+ case IP_HDRINCL:
+ val = inet->hdrincl;
+ break;
+ case IP_NODEFRAG:
+ val = inet->nodefrag;
+ break;
+ case IP_MTU_DISCOVER:
+ val = inet->pmtudisc;
+ break;
+ case IP_MTU:
+ {
+ struct dst_entry *dst;
+ val = 0;
+ dst = sk_dst_get(sk);
+ if (dst) {
+ val = dst_mtu(dst);
+ dst_release(dst);
+ }
+ if (!val) {
+ release_sock(sk);
+ return -ENOTCONN;
+ }
+ break;
+ }
+ case IP_RECVERR:
+ val = inet->recverr;
+ break;
+ case IP_MULTICAST_TTL:
+ val = inet->mc_ttl;
+ break;
+ case IP_MULTICAST_LOOP:
+ val = inet->mc_loop;
+ break;
+ case IP_UNICAST_IF:
+ val = (__force int)htonl((__u32) inet->uc_index);
+ break;
+ case IP_MULTICAST_IF:
+ {
+ struct in_addr addr;
+ len = min_t(unsigned int, len, sizeof(struct in_addr));
+ addr.s_addr = inet->mc_addr;
+ release_sock(sk);
+
+ if (put_user(len, optlen))
+ return -EFAULT;
+ if (copy_to_user(optval, &addr, len))
+ return -EFAULT;
+ return 0;
+ }
+ case IP_MSFILTER:
+ {
+ struct ip_msfilter msf;
+ int err;
+
+ if (len < IP_MSFILTER_SIZE(0)) {
+ release_sock(sk);
+ return -EINVAL;
+ }
+ if (copy_from_user(&msf, optval, IP_MSFILTER_SIZE(0))) {
+ release_sock(sk);
+ return -EFAULT;
+ }
+ err = ip_mc_msfget(sk, &msf,
+ (struct ip_msfilter __user *)optval, optlen);
+ release_sock(sk);
+ return err;
+ }
+ case MCAST_MSFILTER:
+ {
+ struct group_filter gsf;
+ int err;
+
+ if (len < GROUP_FILTER_SIZE(0)) {
+ release_sock(sk);
+ return -EINVAL;
+ }
+ if (copy_from_user(&gsf, optval, GROUP_FILTER_SIZE(0))) {
+ release_sock(sk);
+ return -EFAULT;
+ }
+ err = ip_mc_gsfget(sk, &gsf,
+ (struct group_filter __user *)optval,
+ optlen);
+ release_sock(sk);
+ return err;
+ }
+ case IP_MULTICAST_ALL:
+ val = inet->mc_all;
+ break;
+ case IP_PKTOPTIONS:
+ {
+ struct msghdr msg;
+
+ release_sock(sk);
+
+ if (sk->sk_type != SOCK_STREAM)
+ return -ENOPROTOOPT;
+
+ msg.msg_control = (__force void *) optval;
+ msg.msg_controllen = len;
+ msg.msg_flags = flags;
+
+ if (inet->cmsg_flags & IP_CMSG_PKTINFO) {
+ struct in_pktinfo info;
+
+ info.ipi_addr.s_addr = inet->inet_rcv_saddr;
+ info.ipi_spec_dst.s_addr = inet->inet_rcv_saddr;
+ info.ipi_ifindex = inet->mc_index;
+ put_cmsg(&msg, SOL_IP, IP_PKTINFO, sizeof(info), &info);
+ }
+ if (inet->cmsg_flags & IP_CMSG_TTL) {
+ int hlim = inet->mc_ttl;
+ put_cmsg(&msg, SOL_IP, IP_TTL, sizeof(hlim), &hlim);
+ }
+ if (inet->cmsg_flags & IP_CMSG_TOS) {
+ int tos = inet->rcv_tos;
+ put_cmsg(&msg, SOL_IP, IP_TOS, sizeof(tos), &tos);
+ }
+ len -= msg.msg_controllen;
+ return put_user(len, optlen);
+ }
+ case IP_FREEBIND:
+ val = inet->freebind;
+ break;
+ case IP_TRANSPARENT:
+ val = inet->transparent;
+ break;
+ case IP_MINTTL:
+ val = inet->min_ttl;
+ break;
+ default:
+ release_sock(sk);
+ return -ENOPROTOOPT;
+ }
+ release_sock(sk);
+
+ if (len < sizeof(int) && len > 0 && val >= 0 && val <= 255) {
+ unsigned char ucval = (unsigned char)val;
+ len = 1;
+ if (put_user(len, optlen))
+ return -EFAULT;
+ if (copy_to_user(optval, &ucval, 1))
+ return -EFAULT;
+ } else {
+ len = min_t(unsigned int, sizeof(int), len);
+ if (put_user(len, optlen))
+ return -EFAULT;
+ if (copy_to_user(optval, &val, len))
+ return -EFAULT;
+ }
+ return 0;
+}
+
+int ip_getsockopt(struct sock *sk, int level,
+ int optname, char __user *optval, int __user *optlen)
+{
+ int err;
+
+ err = do_ip_getsockopt(sk, level, optname, optval, optlen, 0);
+#ifdef CONFIG_NETFILTER
+ /* we need to exclude all possible ENOPROTOOPTs except default case */
+ if (err == -ENOPROTOOPT && optname != IP_PKTOPTIONS &&
+ !ip_mroute_opt(optname)) {
+ int len;
+
+ if (get_user(len, optlen))
+ return -EFAULT;
+
+ lock_sock(sk);
+ err = nf_getsockopt(sk, PF_INET, optname, optval,
+ &len);
+ release_sock(sk);
+ if (err >= 0)
+ err = put_user(len, optlen);
+ return err;
+ }
+#endif
+ return err;
+}
+EXPORT_SYMBOL(ip_getsockopt);
+
+#ifdef CONFIG_COMPAT
+int compat_ip_getsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, int __user *optlen)
+{
+ int err;
+
+ if (optname == MCAST_MSFILTER)
+ return compat_mc_getsockopt(sk, level, optname, optval, optlen,
+ ip_getsockopt);
+
+ err = do_ip_getsockopt(sk, level, optname, optval, optlen,
+ MSG_CMSG_COMPAT);
+
+#ifdef CONFIG_NETFILTER
+ /* we need to exclude all possible ENOPROTOOPTs except default case */
+ if (err == -ENOPROTOOPT && optname != IP_PKTOPTIONS &&
+ !ip_mroute_opt(optname)) {
+ int len;
+
+ if (get_user(len, optlen))
+ return -EFAULT;
+
+ lock_sock(sk);
+ err = compat_nf_getsockopt(sk, PF_INET, optname, optval, &len);
+ release_sock(sk);
+ if (err >= 0)
+ err = put_user(len, optlen);
+ return err;
+ }
+#endif
+ return err;
+}
+EXPORT_SYMBOL(compat_ip_getsockopt);
+#endif
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
new file mode 100644
index 000000000..4c2c3ba4b
--- /dev/null
+++ b/net/ipv4/ip_tunnel.c
@@ -0,0 +1,1191 @@
+/*
+ * Copyright (c) 2013 Nicira, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/capability.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/in.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/if_arp.h>
+#include <linux/mroute.h>
+#include <linux/init.h>
+#include <linux/in6.h>
+#include <linux/inetdevice.h>
+#include <linux/igmp.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/etherdevice.h>
+#include <linux/if_ether.h>
+#include <linux/if_vlan.h>
+#include <linux/rculist.h>
+#include <linux/err.h>
+
+#include <net/sock.h>
+#include <net/ip.h>
+#include <net/icmp.h>
+#include <net/protocol.h>
+#include <net/ip_tunnels.h>
+#include <net/arp.h>
+#include <net/checksum.h>
+#include <net/dsfield.h>
+#include <net/inet_ecn.h>
+#include <net/xfrm.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+#include <net/rtnetlink.h>
+#include <net/udp.h>
+
+#if IS_ENABLED(CONFIG_IPV6)
+#include <net/ipv6.h>
+#include <net/ip6_fib.h>
+#include <net/ip6_route.h>
+#endif
+
+static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
+{
+ return hash_32((__force u32)key ^ (__force u32)remote,
+ IP_TNL_HASH_BITS);
+}
+
+static void __tunnel_dst_set(struct ip_tunnel_dst *idst,
+ struct dst_entry *dst, __be32 saddr)
+{
+ struct dst_entry *old_dst;
+
+ dst_clone(dst);
+ old_dst = xchg((__force struct dst_entry **)&idst->dst, dst);
+ dst_release(old_dst);
+ idst->saddr = saddr;
+}
+
+static noinline void tunnel_dst_set(struct ip_tunnel *t,
+ struct dst_entry *dst, __be32 saddr)
+{
+ __tunnel_dst_set(raw_cpu_ptr(t->dst_cache), dst, saddr);
+}
+
+static void tunnel_dst_reset(struct ip_tunnel *t)
+{
+ tunnel_dst_set(t, NULL, 0);
+}
+
+void ip_tunnel_dst_reset_all(struct ip_tunnel *t)
+{
+ int i;
+
+ for_each_possible_cpu(i)
+ __tunnel_dst_set(per_cpu_ptr(t->dst_cache, i), NULL, 0);
+}
+EXPORT_SYMBOL(ip_tunnel_dst_reset_all);
+
+static struct rtable *tunnel_rtable_get(struct ip_tunnel *t,
+ u32 cookie, __be32 *saddr)
+{
+ struct ip_tunnel_dst *idst;
+ struct dst_entry *dst;
+
+ rcu_read_lock();
+ idst = raw_cpu_ptr(t->dst_cache);
+ dst = rcu_dereference(idst->dst);
+ if (dst && !atomic_inc_not_zero(&dst->__refcnt))
+ dst = NULL;
+ if (dst) {
+ if (!dst->obsolete || dst->ops->check(dst, cookie)) {
+ *saddr = idst->saddr;
+ } else {
+ tunnel_dst_reset(t);
+ dst_release(dst);
+ dst = NULL;
+ }
+ }
+ rcu_read_unlock();
+ return (struct rtable *)dst;
+}
+
+static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
+ __be16 flags, __be32 key)
+{
+ if (p->i_flags & TUNNEL_KEY) {
+ if (flags & TUNNEL_KEY)
+ return key == p->i_key;
+ else
+ /* key expected, none present */
+ return false;
+ } else
+ return !(flags & TUNNEL_KEY);
+}
+
+/* Fallback tunnel: no source, no destination, no key, no options
+
+ Tunnel hash table:
+ We require exact key match i.e. if a key is present in packet
+ it will match only tunnel with the same key; if it is not present,
+ it will match only keyless tunnel.
+
+ All keysless packets, if not matched configured keyless tunnels
+ will match fallback tunnel.
+ Given src, dst and key, find appropriate for input tunnel.
+*/
+struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
+ int link, __be16 flags,
+ __be32 remote, __be32 local,
+ __be32 key)
+{
+ unsigned int hash;
+ struct ip_tunnel *t, *cand = NULL;
+ struct hlist_head *head;
+
+ hash = ip_tunnel_hash(key, remote);
+ head = &itn->tunnels[hash];
+
+ hlist_for_each_entry_rcu(t, head, hash_node) {
+ if (local != t->parms.iph.saddr ||
+ remote != t->parms.iph.daddr ||
+ !(t->dev->flags & IFF_UP))
+ continue;
+
+ if (!ip_tunnel_key_match(&t->parms, flags, key))
+ continue;
+
+ if (t->parms.link == link)
+ return t;
+ else
+ cand = t;
+ }
+
+ hlist_for_each_entry_rcu(t, head, hash_node) {
+ if (remote != t->parms.iph.daddr ||
+ t->parms.iph.saddr != 0 ||
+ !(t->dev->flags & IFF_UP))
+ continue;
+
+ if (!ip_tunnel_key_match(&t->parms, flags, key))
+ continue;
+
+ if (t->parms.link == link)
+ return t;
+ else if (!cand)
+ cand = t;
+ }
+
+ hash = ip_tunnel_hash(key, 0);
+ head = &itn->tunnels[hash];
+
+ hlist_for_each_entry_rcu(t, head, hash_node) {
+ if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) &&
+ (local != t->parms.iph.daddr || !ipv4_is_multicast(local)))
+ continue;
+
+ if (!(t->dev->flags & IFF_UP))
+ continue;
+
+ if (!ip_tunnel_key_match(&t->parms, flags, key))
+ continue;
+
+ if (t->parms.link == link)
+ return t;
+ else if (!cand)
+ cand = t;
+ }
+
+ if (flags & TUNNEL_NO_KEY)
+ goto skip_key_lookup;
+
+ hlist_for_each_entry_rcu(t, head, hash_node) {
+ if (t->parms.i_key != key ||
+ t->parms.iph.saddr != 0 ||
+ t->parms.iph.daddr != 0 ||
+ !(t->dev->flags & IFF_UP))
+ continue;
+
+ if (t->parms.link == link)
+ return t;
+ else if (!cand)
+ cand = t;
+ }
+
+skip_key_lookup:
+ if (cand)
+ return cand;
+
+ if (itn->fb_tunnel_dev && itn->fb_tunnel_dev->flags & IFF_UP)
+ return netdev_priv(itn->fb_tunnel_dev);
+
+
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_lookup);
+
+static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
+ struct ip_tunnel_parm *parms)
+{
+ unsigned int h;
+ __be32 remote;
+ __be32 i_key = parms->i_key;
+
+ if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
+ remote = parms->iph.daddr;
+ else
+ remote = 0;
+
+ if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI))
+ i_key = 0;
+
+ h = ip_tunnel_hash(i_key, remote);
+ return &itn->tunnels[h];
+}
+
+static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
+{
+ struct hlist_head *head = ip_bucket(itn, &t->parms);
+
+ hlist_add_head_rcu(&t->hash_node, head);
+}
+
+static void ip_tunnel_del(struct ip_tunnel *t)
+{
+ hlist_del_init_rcu(&t->hash_node);
+}
+
+static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
+ struct ip_tunnel_parm *parms,
+ int type)
+{
+ __be32 remote = parms->iph.daddr;
+ __be32 local = parms->iph.saddr;
+ __be32 key = parms->i_key;
+ __be16 flags = parms->i_flags;
+ int link = parms->link;
+ struct ip_tunnel *t = NULL;
+ struct hlist_head *head = ip_bucket(itn, parms);
+
+ hlist_for_each_entry_rcu(t, head, hash_node) {
+ if (local == t->parms.iph.saddr &&
+ remote == t->parms.iph.daddr &&
+ link == t->parms.link &&
+ type == t->dev->type &&
+ ip_tunnel_key_match(&t->parms, flags, key))
+ break;
+ }
+ return t;
+}
+
+static struct net_device *__ip_tunnel_create(struct net *net,
+ const struct rtnl_link_ops *ops,
+ struct ip_tunnel_parm *parms)
+{
+ int err;
+ struct ip_tunnel *tunnel;
+ struct net_device *dev;
+ char name[IFNAMSIZ];
+
+ if (parms->name[0])
+ strlcpy(name, parms->name, IFNAMSIZ);
+ else {
+ if (strlen(ops->kind) > (IFNAMSIZ - 3)) {
+ err = -E2BIG;
+ goto failed;
+ }
+ strlcpy(name, ops->kind, IFNAMSIZ);
+ strncat(name, "%d", 2);
+ }
+
+ ASSERT_RTNL();
+ dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup);
+ if (!dev) {
+ err = -ENOMEM;
+ goto failed;
+ }
+ dev_net_set(dev, net);
+
+ dev->rtnl_link_ops = ops;
+
+ tunnel = netdev_priv(dev);
+ tunnel->parms = *parms;
+ tunnel->net = net;
+
+ err = register_netdevice(dev);
+ if (err)
+ goto failed_free;
+
+ return dev;
+
+failed_free:
+ free_netdev(dev);
+failed:
+ return ERR_PTR(err);
+}
+
+static inline void init_tunnel_flow(struct flowi4 *fl4,
+ int proto,
+ __be32 daddr, __be32 saddr,
+ __be32 key, __u8 tos, int oif)
+{
+ memset(fl4, 0, sizeof(*fl4));
+ fl4->flowi4_oif = oif;
+ fl4->daddr = daddr;
+ fl4->saddr = saddr;
+ fl4->flowi4_tos = tos;
+ fl4->flowi4_proto = proto;
+ fl4->fl4_gre_key = key;
+}
+
+static int ip_tunnel_bind_dev(struct net_device *dev)
+{
+ struct net_device *tdev = NULL;
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ const struct iphdr *iph;
+ int hlen = LL_MAX_HEADER;
+ int mtu = ETH_DATA_LEN;
+ int t_hlen = tunnel->hlen + sizeof(struct iphdr);
+
+ iph = &tunnel->parms.iph;
+
+ /* Guess output device to choose reasonable mtu and needed_headroom */
+ if (iph->daddr) {
+ struct flowi4 fl4;
+ struct rtable *rt;
+
+ init_tunnel_flow(&fl4, iph->protocol, iph->daddr,
+ iph->saddr, tunnel->parms.o_key,
+ RT_TOS(iph->tos), tunnel->parms.link);
+ rt = ip_route_output_key(tunnel->net, &fl4);
+
+ if (!IS_ERR(rt)) {
+ tdev = rt->dst.dev;
+ tunnel_dst_set(tunnel, &rt->dst, fl4.saddr);
+ ip_rt_put(rt);
+ }
+ if (dev->type != ARPHRD_ETHER)
+ dev->flags |= IFF_POINTOPOINT;
+ }
+
+ if (!tdev && tunnel->parms.link)
+ tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);
+
+ if (tdev) {
+ hlen = tdev->hard_header_len + tdev->needed_headroom;
+ mtu = tdev->mtu;
+ }
+
+ dev->needed_headroom = t_hlen + hlen;
+ mtu -= (dev->hard_header_len + t_hlen);
+
+ if (mtu < 68)
+ mtu = 68;
+
+ return mtu;
+}
+
+static struct ip_tunnel *ip_tunnel_create(struct net *net,
+ struct ip_tunnel_net *itn,
+ struct ip_tunnel_parm *parms)
+{
+ struct ip_tunnel *nt;
+ struct net_device *dev;
+
+ BUG_ON(!itn->fb_tunnel_dev);
+ dev = __ip_tunnel_create(net, itn->fb_tunnel_dev->rtnl_link_ops, parms);
+ if (IS_ERR(dev))
+ return ERR_CAST(dev);
+
+ dev->mtu = ip_tunnel_bind_dev(dev);
+
+ nt = netdev_priv(dev);
+ ip_tunnel_add(itn, nt);
+ return nt;
+}
+
+int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
+ const struct tnl_ptk_info *tpi, bool log_ecn_error)
+{
+ struct pcpu_sw_netstats *tstats;
+ const struct iphdr *iph = ip_hdr(skb);
+ int err;
+
+#ifdef CONFIG_NET_IPGRE_BROADCAST
+ if (ipv4_is_multicast(iph->daddr)) {
+ tunnel->dev->stats.multicast++;
+ skb->pkt_type = PACKET_BROADCAST;
+ }
+#endif
+
+ if ((!(tpi->flags&TUNNEL_CSUM) && (tunnel->parms.i_flags&TUNNEL_CSUM)) ||
+ ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) {
+ tunnel->dev->stats.rx_crc_errors++;
+ tunnel->dev->stats.rx_errors++;
+ goto drop;
+ }
+
+ if (tunnel->parms.i_flags&TUNNEL_SEQ) {
+ if (!(tpi->flags&TUNNEL_SEQ) ||
+ (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
+ tunnel->dev->stats.rx_fifo_errors++;
+ tunnel->dev->stats.rx_errors++;
+ goto drop;
+ }
+ tunnel->i_seqno = ntohl(tpi->seq) + 1;
+ }
+
+ skb_reset_network_header(skb);
+
+ err = IP_ECN_decapsulate(iph, skb);
+ if (unlikely(err)) {
+ if (log_ecn_error)
+ net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
+ &iph->saddr, iph->tos);
+ if (err > 1) {
+ ++tunnel->dev->stats.rx_frame_errors;
+ ++tunnel->dev->stats.rx_errors;
+ goto drop;
+ }
+ }
+
+ tstats = this_cpu_ptr(tunnel->dev->tstats);
+ u64_stats_update_begin(&tstats->syncp);
+ tstats->rx_packets++;
+ tstats->rx_bytes += skb->len;
+ u64_stats_update_end(&tstats->syncp);
+
+ skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));
+
+ if (tunnel->dev->type == ARPHRD_ETHER) {
+ skb->protocol = eth_type_trans(skb, tunnel->dev);
+ skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
+ } else {
+ skb->dev = tunnel->dev;
+ }
+
+ gro_cells_receive(&tunnel->gro_cells, skb);
+ return 0;
+
+drop:
+ kfree_skb(skb);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_rcv);
+
+static int ip_encap_hlen(struct ip_tunnel_encap *e)
+{
+ const struct ip_tunnel_encap_ops *ops;
+ int hlen = -EINVAL;
+
+ if (e->type == TUNNEL_ENCAP_NONE)
+ return 0;
+
+ if (e->type >= MAX_IPTUN_ENCAP_OPS)
+ return -EINVAL;
+
+ rcu_read_lock();
+ ops = rcu_dereference(iptun_encaps[e->type]);
+ if (likely(ops && ops->encap_hlen))
+ hlen = ops->encap_hlen(e);
+ rcu_read_unlock();
+
+ return hlen;
+}
+
+const struct ip_tunnel_encap_ops __rcu *
+ iptun_encaps[MAX_IPTUN_ENCAP_OPS] __read_mostly;
+
+int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *ops,
+ unsigned int num)
+{
+ if (num >= MAX_IPTUN_ENCAP_OPS)
+ return -ERANGE;
+
+ return !cmpxchg((const struct ip_tunnel_encap_ops **)
+ &iptun_encaps[num],
+ NULL, ops) ? 0 : -1;
+}
+EXPORT_SYMBOL(ip_tunnel_encap_add_ops);
+
+int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *ops,
+ unsigned int num)
+{
+ int ret;
+
+ if (num >= MAX_IPTUN_ENCAP_OPS)
+ return -ERANGE;
+
+ ret = (cmpxchg((const struct ip_tunnel_encap_ops **)
+ &iptun_encaps[num],
+ ops, NULL) == ops) ? 0 : -1;
+
+ synchronize_net();
+
+ return ret;
+}
+EXPORT_SYMBOL(ip_tunnel_encap_del_ops);
+
+int ip_tunnel_encap_setup(struct ip_tunnel *t,
+ struct ip_tunnel_encap *ipencap)
+{
+ int hlen;
+
+ memset(&t->encap, 0, sizeof(t->encap));
+
+ hlen = ip_encap_hlen(ipencap);
+ if (hlen < 0)
+ return hlen;
+
+ t->encap.type = ipencap->type;
+ t->encap.sport = ipencap->sport;
+ t->encap.dport = ipencap->dport;
+ t->encap.flags = ipencap->flags;
+
+ t->encap_hlen = hlen;
+ t->hlen = t->encap_hlen + t->tun_hlen;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup);
+
+int ip_tunnel_encap(struct sk_buff *skb, struct ip_tunnel *t,
+ u8 *protocol, struct flowi4 *fl4)
+{
+ const struct ip_tunnel_encap_ops *ops;
+ int ret = -EINVAL;
+
+ if (t->encap.type == TUNNEL_ENCAP_NONE)
+ return 0;
+
+ if (t->encap.type >= MAX_IPTUN_ENCAP_OPS)
+ return -EINVAL;
+
+ rcu_read_lock();
+ ops = rcu_dereference(iptun_encaps[t->encap.type]);
+ if (likely(ops && ops->build_header))
+ ret = ops->build_header(skb, &t->encap, protocol, fl4);
+ rcu_read_unlock();
+
+ return ret;
+}
+EXPORT_SYMBOL(ip_tunnel_encap);
+
+static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
+ struct rtable *rt, __be16 df)
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ int pkt_size = skb->len - tunnel->hlen - dev->hard_header_len;
+ int mtu;
+
+ if (df)
+ mtu = dst_mtu(&rt->dst) - dev->hard_header_len
+ - sizeof(struct iphdr) - tunnel->hlen;
+ else
+ mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
+
+ if (skb_dst(skb))
+ skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);
+
+ if (skb->protocol == htons(ETH_P_IP)) {
+ if (!skb_is_gso(skb) &&
+ (df & htons(IP_DF)) && mtu < pkt_size) {
+ memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
+ return -E2BIG;
+ }
+ }
+#if IS_ENABLED(CONFIG_IPV6)
+ else if (skb->protocol == htons(ETH_P_IPV6)) {
+ struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
+
+ if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
+ mtu >= IPV6_MIN_MTU) {
+ if ((tunnel->parms.iph.daddr &&
+ !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
+ rt6->rt6i_dst.plen == 128) {
+ rt6->rt6i_flags |= RTF_MODIFIED;
+ dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
+ }
+ }
+
+ if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
+ mtu < pkt_size) {
+ icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
+ return -E2BIG;
+ }
+ }
+#endif
+ return 0;
+}
+
+void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
+ const struct iphdr *tnl_params, u8 protocol)
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ const struct iphdr *inner_iph;
+ struct flowi4 fl4;
+ u8 tos, ttl;
+ __be16 df;
+ struct rtable *rt; /* Route to the other host */
+ unsigned int max_headroom; /* The extra header space needed */
+ __be32 dst;
+ int err;
+ bool connected;
+
+ inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
+ connected = (tunnel->parms.iph.daddr != 0);
+
+ dst = tnl_params->daddr;
+ if (dst == 0) {
+ /* NBMA tunnel */
+
+ if (!skb_dst(skb)) {
+ dev->stats.tx_fifo_errors++;
+ goto tx_error;
+ }
+
+ if (skb->protocol == htons(ETH_P_IP)) {
+ rt = skb_rtable(skb);
+ dst = rt_nexthop(rt, inner_iph->daddr);
+ }
+#if IS_ENABLED(CONFIG_IPV6)
+ else if (skb->protocol == htons(ETH_P_IPV6)) {
+ const struct in6_addr *addr6;
+ struct neighbour *neigh;
+ bool do_tx_error_icmp;
+ int addr_type;
+
+ neigh = dst_neigh_lookup(skb_dst(skb),
+ &ipv6_hdr(skb)->daddr);
+ if (!neigh)
+ goto tx_error;
+
+ addr6 = (const struct in6_addr *)&neigh->primary_key;
+ addr_type = ipv6_addr_type(addr6);
+
+ if (addr_type == IPV6_ADDR_ANY) {
+ addr6 = &ipv6_hdr(skb)->daddr;
+ addr_type = ipv6_addr_type(addr6);
+ }
+
+ if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
+ do_tx_error_icmp = true;
+ else {
+ do_tx_error_icmp = false;
+ dst = addr6->s6_addr32[3];
+ }
+ neigh_release(neigh);
+ if (do_tx_error_icmp)
+ goto tx_error_icmp;
+ }
+#endif
+ else
+ goto tx_error;
+
+ connected = false;
+ }
+
+ tos = tnl_params->tos;
+ if (tos & 0x1) {
+ tos &= ~0x1;
+ if (skb->protocol == htons(ETH_P_IP)) {
+ tos = inner_iph->tos;
+ connected = false;
+ } else if (skb->protocol == htons(ETH_P_IPV6)) {
+ tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
+ connected = false;
+ }
+ }
+
+ init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr,
+ tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link);
+
+ if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0)
+ goto tx_error;
+
+ rt = connected ? tunnel_rtable_get(tunnel, 0, &fl4.saddr) : NULL;
+
+ if (!rt) {
+ rt = ip_route_output_key(tunnel->net, &fl4);
+
+ if (IS_ERR(rt)) {
+ dev->stats.tx_carrier_errors++;
+ goto tx_error;
+ }
+ if (connected)
+ tunnel_dst_set(tunnel, &rt->dst, fl4.saddr);
+ }
+
+ if (rt->dst.dev == dev) {
+ ip_rt_put(rt);
+ dev->stats.collisions++;
+ goto tx_error;
+ }
+
+ if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off)) {
+ ip_rt_put(rt);
+ goto tx_error;
+ }
+
+ if (tunnel->err_count > 0) {
+ if (time_before(jiffies,
+ tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
+ tunnel->err_count--;
+
+ memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
+ dst_link_failure(skb);
+ } else
+ tunnel->err_count = 0;
+ }
+
+ tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
+ ttl = tnl_params->ttl;
+ if (ttl == 0) {
+ if (skb->protocol == htons(ETH_P_IP))
+ ttl = inner_iph->ttl;
+#if IS_ENABLED(CONFIG_IPV6)
+ else if (skb->protocol == htons(ETH_P_IPV6))
+ ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
+#endif
+ else
+ ttl = ip4_dst_hoplimit(&rt->dst);
+ }
+
+ df = tnl_params->frag_off;
+ if (skb->protocol == htons(ETH_P_IP))
+ df |= (inner_iph->frag_off&htons(IP_DF));
+
+ max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
+ + rt->dst.header_len + ip_encap_hlen(&tunnel->encap);
+ if (max_headroom > dev->needed_headroom)
+ dev->needed_headroom = max_headroom;
+
+ if (skb_cow_head(skb, dev->needed_headroom)) {
+ ip_rt_put(rt);
+ dev->stats.tx_dropped++;
+ kfree_skb(skb);
+ return;
+ }
+
+ err = iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol,
+ tos, ttl, df, !net_eq(tunnel->net, dev_net(dev)));
+ iptunnel_xmit_stats(err, &dev->stats, dev->tstats);
+
+ return;
+
+#if IS_ENABLED(CONFIG_IPV6)
+tx_error_icmp:
+ dst_link_failure(skb);
+#endif
+tx_error:
+ dev->stats.tx_errors++;
+ kfree_skb(skb);
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_xmit);
+
+static void ip_tunnel_update(struct ip_tunnel_net *itn,
+ struct ip_tunnel *t,
+ struct net_device *dev,
+ struct ip_tunnel_parm *p,
+ bool set_mtu)
+{
+ ip_tunnel_del(t);
+ t->parms.iph.saddr = p->iph.saddr;
+ t->parms.iph.daddr = p->iph.daddr;
+ t->parms.i_key = p->i_key;
+ t->parms.o_key = p->o_key;
+ if (dev->type != ARPHRD_ETHER) {
+ memcpy(dev->dev_addr, &p->iph.saddr, 4);
+ memcpy(dev->broadcast, &p->iph.daddr, 4);
+ }
+ ip_tunnel_add(itn, t);
+
+ t->parms.iph.ttl = p->iph.ttl;
+ t->parms.iph.tos = p->iph.tos;
+ t->parms.iph.frag_off = p->iph.frag_off;
+
+ if (t->parms.link != p->link) {
+ int mtu;
+
+ t->parms.link = p->link;
+ mtu = ip_tunnel_bind_dev(dev);
+ if (set_mtu)
+ dev->mtu = mtu;
+ }
+ ip_tunnel_dst_reset_all(t);
+ netdev_state_change(dev);
+}
+
+int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
+{
+ int err = 0;
+ struct ip_tunnel *t = netdev_priv(dev);
+ struct net *net = t->net;
+ struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);
+
+ BUG_ON(!itn->fb_tunnel_dev);
+ switch (cmd) {
+ case SIOCGETTUNNEL:
+ if (dev == itn->fb_tunnel_dev) {
+ t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
+ if (!t)
+ t = netdev_priv(dev);
+ }
+ memcpy(p, &t->parms, sizeof(*p));
+ break;
+
+ case SIOCADDTUNNEL:
+ case SIOCCHGTUNNEL:
+ err = -EPERM;
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+ goto done;
+ if (p->iph.ttl)
+ p->iph.frag_off |= htons(IP_DF);
+ if (!(p->i_flags & VTI_ISVTI)) {
+ if (!(p->i_flags & TUNNEL_KEY))
+ p->i_key = 0;
+ if (!(p->o_flags & TUNNEL_KEY))
+ p->o_key = 0;
+ }
+
+ t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
+
+ if (cmd == SIOCADDTUNNEL) {
+ if (!t) {
+ t = ip_tunnel_create(net, itn, p);
+ err = PTR_ERR_OR_ZERO(t);
+ break;
+ }
+
+ err = -EEXIST;
+ break;
+ }
+ if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
+ if (t) {
+ if (t->dev != dev) {
+ err = -EEXIST;
+ break;
+ }
+ } else {
+ unsigned int nflags = 0;
+
+ if (ipv4_is_multicast(p->iph.daddr))
+ nflags = IFF_BROADCAST;
+ else if (p->iph.daddr)
+ nflags = IFF_POINTOPOINT;
+
+ if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
+ err = -EINVAL;
+ break;
+ }
+
+ t = netdev_priv(dev);
+ }
+ }
+
+ if (t) {
+ err = 0;
+ ip_tunnel_update(itn, t, dev, p, true);
+ } else {
+ err = -ENOENT;
+ }
+ break;
+
+ case SIOCDELTUNNEL:
+ err = -EPERM;
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+ goto done;
+
+ if (dev == itn->fb_tunnel_dev) {
+ err = -ENOENT;
+ t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
+ if (!t)
+ goto done;
+ err = -EPERM;
+ if (t == netdev_priv(itn->fb_tunnel_dev))
+ goto done;
+ dev = t->dev;
+ }
+ unregister_netdevice(dev);
+ err = 0;
+ break;
+
+ default:
+ err = -EINVAL;
+ }
+
+done:
+ return err;
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_ioctl);
+
+int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ int t_hlen = tunnel->hlen + sizeof(struct iphdr);
+
+ if (new_mtu < 68 ||
+ new_mtu > 0xFFF8 - dev->hard_header_len - t_hlen)
+ return -EINVAL;
+ dev->mtu = new_mtu;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);
+
+static void ip_tunnel_dev_free(struct net_device *dev)
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+
+ gro_cells_destroy(&tunnel->gro_cells);
+ free_percpu(tunnel->dst_cache);
+ free_percpu(dev->tstats);
+ free_netdev(dev);
+}
+
+void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ struct ip_tunnel_net *itn;
+
+ itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);
+
+ if (itn->fb_tunnel_dev != dev) {
+ ip_tunnel_del(netdev_priv(dev));
+ unregister_netdevice_queue(dev, head);
+ }
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_dellink);
+
+struct net *ip_tunnel_get_link_net(const struct net_device *dev)
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+
+ return tunnel->net;
+}
+EXPORT_SYMBOL(ip_tunnel_get_link_net);
+
+int ip_tunnel_get_iflink(const struct net_device *dev)
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+
+ return tunnel->parms.link;
+}
+EXPORT_SYMBOL(ip_tunnel_get_iflink);
+
+int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id,
+ struct rtnl_link_ops *ops, char *devname)
+{
+ struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
+ struct ip_tunnel_parm parms;
+ unsigned int i;
+
+ for (i = 0; i < IP_TNL_HASH_SIZE; i++)
+ INIT_HLIST_HEAD(&itn->tunnels[i]);
+
+ if (!ops) {
+ itn->fb_tunnel_dev = NULL;
+ return 0;
+ }
+
+ memset(&parms, 0, sizeof(parms));
+ if (devname)
+ strlcpy(parms.name, devname, IFNAMSIZ);
+
+ rtnl_lock();
+ itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
+ /* FB netdevice is special: we have one, and only one per netns.
+ * Allowing to move it to another netns is clearly unsafe.
+ */
+ if (!IS_ERR(itn->fb_tunnel_dev)) {
+ itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
+ itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
+ ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
+ }
+ rtnl_unlock();
+
+ return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_init_net);
+
+static void ip_tunnel_destroy(struct ip_tunnel_net *itn, struct list_head *head,
+ struct rtnl_link_ops *ops)
+{
+ struct net *net = dev_net(itn->fb_tunnel_dev);
+ struct net_device *dev, *aux;
+ int h;
+
+ for_each_netdev_safe(net, dev, aux)
+ if (dev->rtnl_link_ops == ops)
+ unregister_netdevice_queue(dev, head);
+
+ for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
+ struct ip_tunnel *t;
+ struct hlist_node *n;
+ struct hlist_head *thead = &itn->tunnels[h];
+
+ hlist_for_each_entry_safe(t, n, thead, hash_node)
+ /* If dev is in the same netns, it has already
+ * been added to the list by the previous loop.
+ */
+ if (!net_eq(dev_net(t->dev), net))
+ unregister_netdevice_queue(t->dev, head);
+ }
+}
+
+void ip_tunnel_delete_net(struct ip_tunnel_net *itn, struct rtnl_link_ops *ops)
+{
+ LIST_HEAD(list);
+
+ rtnl_lock();
+ ip_tunnel_destroy(itn, &list, ops);
+ unregister_netdevice_many(&list);
+ rtnl_unlock();
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_delete_net);
+
+int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
+ struct ip_tunnel_parm *p)
+{
+ struct ip_tunnel *nt;
+ struct net *net = dev_net(dev);
+ struct ip_tunnel_net *itn;
+ int mtu;
+ int err;
+
+ nt = netdev_priv(dev);
+ itn = net_generic(net, nt->ip_tnl_net_id);
+
+ if (ip_tunnel_find(itn, p, dev->type))
+ return -EEXIST;
+
+ nt->net = net;
+ nt->parms = *p;
+ err = register_netdevice(dev);
+ if (err)
+ goto out;
+
+ if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
+ eth_hw_addr_random(dev);
+
+ mtu = ip_tunnel_bind_dev(dev);
+ if (!tb[IFLA_MTU])
+ dev->mtu = mtu;
+
+ ip_tunnel_add(itn, nt);
+
+out:
+ return err;
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_newlink);
+
+int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
+ struct ip_tunnel_parm *p)
+{
+ struct ip_tunnel *t;
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ struct net *net = tunnel->net;
+ struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);
+
+ if (dev == itn->fb_tunnel_dev)
+ return -EINVAL;
+
+ t = ip_tunnel_find(itn, p, dev->type);
+
+ if (t) {
+ if (t->dev != dev)
+ return -EEXIST;
+ } else {
+ t = tunnel;
+
+ if (dev->type != ARPHRD_ETHER) {
+ unsigned int nflags = 0;
+
+ if (ipv4_is_multicast(p->iph.daddr))
+ nflags = IFF_BROADCAST;
+ else if (p->iph.daddr)
+ nflags = IFF_POINTOPOINT;
+
+ if ((dev->flags ^ nflags) &
+ (IFF_POINTOPOINT | IFF_BROADCAST))
+ return -EINVAL;
+ }
+ }
+
+ ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU]);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_changelink);
+
+int ip_tunnel_init(struct net_device *dev)
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ struct iphdr *iph = &tunnel->parms.iph;
+ int err;
+
+ dev->destructor = ip_tunnel_dev_free;
+ dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
+ if (!dev->tstats)
+ return -ENOMEM;
+
+ tunnel->dst_cache = alloc_percpu(struct ip_tunnel_dst);
+ if (!tunnel->dst_cache) {
+ free_percpu(dev->tstats);
+ return -ENOMEM;
+ }
+
+ err = gro_cells_init(&tunnel->gro_cells, dev);
+ if (err) {
+ free_percpu(tunnel->dst_cache);
+ free_percpu(dev->tstats);
+ return err;
+ }
+
+ tunnel->dev = dev;
+ tunnel->net = dev_net(dev);
+ strcpy(tunnel->parms.name, dev->name);
+ iph->version = 4;
+ iph->ihl = 5;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_init);
+
+void ip_tunnel_uninit(struct net_device *dev)
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ struct net *net = tunnel->net;
+ struct ip_tunnel_net *itn;
+
+ itn = net_generic(net, tunnel->ip_tnl_net_id);
+ /* fb_tunnel_dev will be unregisted in net-exit call. */
+ if (itn->fb_tunnel_dev != dev)
+ ip_tunnel_del(netdev_priv(dev));
+
+ ip_tunnel_dst_reset_all(tunnel);
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_uninit);
+
+/* Do least required initialization, rest of init is done in tunnel_init call */
+void ip_tunnel_setup(struct net_device *dev, int net_id)
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ tunnel->ip_tnl_net_id = net_id;
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_setup);
+
+MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
new file mode 100644
index 000000000..ce63ab21b
--- /dev/null
+++ b/net/ipv4/ip_tunnel_core.c
@@ -0,0 +1,206 @@
+/*
+ * Copyright (c) 2013 Nicira, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/in.h>
+#include <linux/if_arp.h>
+#include <linux/mroute.h>
+#include <linux/init.h>
+#include <linux/in6.h>
+#include <linux/inetdevice.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/etherdevice.h>
+#include <linux/if_ether.h>
+#include <linux/if_vlan.h>
+
+#include <net/ip.h>
+#include <net/icmp.h>
+#include <net/protocol.h>
+#include <net/ip_tunnels.h>
+#include <net/arp.h>
+#include <net/checksum.h>
+#include <net/dsfield.h>
+#include <net/inet_ecn.h>
+#include <net/xfrm.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+#include <net/rtnetlink.h>
+
+int iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb,
+ __be32 src, __be32 dst, __u8 proto,
+ __u8 tos, __u8 ttl, __be16 df, bool xnet)
+{
+ int pkt_len = skb->len;
+ struct iphdr *iph;
+ int err;
+
+ skb_scrub_packet(skb, xnet);
+
+ skb_clear_hash(skb);
+ skb_dst_set(skb, &rt->dst);
+ memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
+
+ /* Push down and install the IP header. */
+ skb_push(skb, sizeof(struct iphdr));
+ skb_reset_network_header(skb);
+
+ iph = ip_hdr(skb);
+
+ iph->version = 4;
+ iph->ihl = sizeof(struct iphdr) >> 2;
+ iph->frag_off = df;
+ iph->protocol = proto;
+ iph->tos = tos;
+ iph->daddr = dst;
+ iph->saddr = src;
+ iph->ttl = ttl;
+ __ip_select_ident(dev_net(rt->dst.dev), iph,
+ skb_shinfo(skb)->gso_segs ?: 1);
+
+ err = ip_local_out_sk(sk, skb);
+ if (unlikely(net_xmit_eval(err)))
+ pkt_len = 0;
+ return pkt_len;
+}
+EXPORT_SYMBOL_GPL(iptunnel_xmit);
+
+int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto)
+{
+ if (unlikely(!pskb_may_pull(skb, hdr_len)))
+ return -ENOMEM;
+
+ skb_pull_rcsum(skb, hdr_len);
+
+ if (inner_proto == htons(ETH_P_TEB)) {
+ struct ethhdr *eh;
+
+ if (unlikely(!pskb_may_pull(skb, ETH_HLEN)))
+ return -ENOMEM;
+
+ eh = (struct ethhdr *)skb->data;
+ if (likely(ntohs(eh->h_proto) >= ETH_P_802_3_MIN))
+ skb->protocol = eh->h_proto;
+ else
+ skb->protocol = htons(ETH_P_802_2);
+
+ } else {
+ skb->protocol = inner_proto;
+ }
+
+ nf_reset(skb);
+ secpath_reset(skb);
+ skb_clear_hash_if_not_l4(skb);
+ skb_dst_drop(skb);
+ skb->vlan_tci = 0;
+ skb_set_queue_mapping(skb, 0);
+ skb->pkt_type = PACKET_HOST;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(iptunnel_pull_header);
+
+struct sk_buff *iptunnel_handle_offloads(struct sk_buff *skb,
+ bool csum_help,
+ int gso_type_mask)
+{
+ int err;
+
+ if (likely(!skb->encapsulation)) {
+ skb_reset_inner_headers(skb);
+ skb->encapsulation = 1;
+ }
+
+ if (skb_is_gso(skb)) {
+ err = skb_unclone(skb, GFP_ATOMIC);
+ if (unlikely(err))
+ goto error;
+ skb_shinfo(skb)->gso_type |= gso_type_mask;
+ return skb;
+ }
+
+ /* If packet is not gso and we are resolving any partial checksum,
+ * clear encapsulation flag. This allows setting CHECKSUM_PARTIAL
+ * on the outer header without confusing devices that implement
+ * NETIF_F_IP_CSUM with encapsulation.
+ */
+ if (csum_help)
+ skb->encapsulation = 0;
+
+ if (skb->ip_summed == CHECKSUM_PARTIAL && csum_help) {
+ err = skb_checksum_help(skb);
+ if (unlikely(err))
+ goto error;
+ } else if (skb->ip_summed != CHECKSUM_PARTIAL)
+ skb->ip_summed = CHECKSUM_NONE;
+
+ return skb;
+error:
+ kfree_skb(skb);
+ return ERR_PTR(err);
+}
+EXPORT_SYMBOL_GPL(iptunnel_handle_offloads);
+
+/* Often modified stats are per cpu, other are shared (netdev->stats) */
+struct rtnl_link_stats64 *ip_tunnel_get_stats64(struct net_device *dev,
+ struct rtnl_link_stats64 *tot)
+{
+ int i;
+
+ for_each_possible_cpu(i) {
+ const struct pcpu_sw_netstats *tstats =
+ per_cpu_ptr(dev->tstats, i);
+ u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
+ unsigned int start;
+
+ do {
+ start = u64_stats_fetch_begin_irq(&tstats->syncp);
+ rx_packets = tstats->rx_packets;
+ tx_packets = tstats->tx_packets;
+ rx_bytes = tstats->rx_bytes;
+ tx_bytes = tstats->tx_bytes;
+ } while (u64_stats_fetch_retry_irq(&tstats->syncp, start));
+
+ tot->rx_packets += rx_packets;
+ tot->tx_packets += tx_packets;
+ tot->rx_bytes += rx_bytes;
+ tot->tx_bytes += tx_bytes;
+ }
+
+ tot->multicast = dev->stats.multicast;
+
+ tot->rx_crc_errors = dev->stats.rx_crc_errors;
+ tot->rx_fifo_errors = dev->stats.rx_fifo_errors;
+ tot->rx_length_errors = dev->stats.rx_length_errors;
+ tot->rx_frame_errors = dev->stats.rx_frame_errors;
+ tot->rx_errors = dev->stats.rx_errors;
+
+ tot->tx_fifo_errors = dev->stats.tx_fifo_errors;
+ tot->tx_carrier_errors = dev->stats.tx_carrier_errors;
+ tot->tx_dropped = dev->stats.tx_dropped;
+ tot->tx_aborted_errors = dev->stats.tx_aborted_errors;
+ tot->tx_errors = dev->stats.tx_errors;
+
+ tot->collisions = dev->stats.collisions;
+
+ return tot;
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_get_stats64);
diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
new file mode 100644
index 000000000..0c152087c
--- /dev/null
+++ b/net/ipv4/ip_vti.c
@@ -0,0 +1,599 @@
+/*
+ * Linux NET3: IP/IP protocol decoder modified to support
+ * virtual tunnel interface
+ *
+ * Authors:
+ * Saurabh Mohan (saurabh.mohan@vyatta.com) 05/07/2012
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+
+/*
+ This version of net/ipv4/ip_vti.c is cloned of net/ipv4/ipip.c
+
+ For comments look at net/ipv4/ip_gre.c --ANK
+ */
+
+
+#include <linux/capability.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/uaccess.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/in.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/if_arp.h>
+#include <linux/mroute.h>
+#include <linux/init.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/if_ether.h>
+#include <linux/icmpv6.h>
+
+#include <net/sock.h>
+#include <net/ip.h>
+#include <net/icmp.h>
+#include <net/ip_tunnels.h>
+#include <net/inet_ecn.h>
+#include <net/xfrm.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+
+static struct rtnl_link_ops vti_link_ops __read_mostly;
+
+static int vti_net_id __read_mostly;
+static int vti_tunnel_init(struct net_device *dev);
+
+static int vti_input(struct sk_buff *skb, int nexthdr, __be32 spi,
+ int encap_type)
+{
+ struct ip_tunnel *tunnel;
+ const struct iphdr *iph = ip_hdr(skb);
+ struct net *net = dev_net(skb->dev);
+ struct ip_tunnel_net *itn = net_generic(net, vti_net_id);
+
+ tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY,
+ iph->saddr, iph->daddr, 0);
+ if (tunnel) {
+ if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
+ goto drop;
+
+ XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = tunnel;
+
+ return xfrm_input(skb, nexthdr, spi, encap_type);
+ }
+
+ return -EINVAL;
+drop:
+ kfree_skb(skb);
+ return 0;
+}
+
+static int vti_rcv(struct sk_buff *skb)
+{
+ XFRM_SPI_SKB_CB(skb)->family = AF_INET;
+ XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr);
+
+ return vti_input(skb, ip_hdr(skb)->protocol, 0, 0);
+}
+
+static int vti_rcv_cb(struct sk_buff *skb, int err)
+{
+ unsigned short family;
+ struct net_device *dev;
+ struct pcpu_sw_netstats *tstats;
+ struct xfrm_state *x;
+ struct ip_tunnel *tunnel = XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4;
+ u32 orig_mark = skb->mark;
+ int ret;
+
+ if (!tunnel)
+ return 1;
+
+ dev = tunnel->dev;
+
+ if (err) {
+ dev->stats.rx_errors++;
+ dev->stats.rx_dropped++;
+
+ return 0;
+ }
+
+ x = xfrm_input_state(skb);
+ family = x->inner_mode->afinfo->family;
+
+ skb->mark = be32_to_cpu(tunnel->parms.i_key);
+ ret = xfrm_policy_check(NULL, XFRM_POLICY_IN, skb, family);
+ skb->mark = orig_mark;
+
+ if (!ret)
+ return -EPERM;
+
+ skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(skb->dev)));
+ skb->dev = dev;
+
+ tstats = this_cpu_ptr(dev->tstats);
+
+ u64_stats_update_begin(&tstats->syncp);
+ tstats->rx_packets++;
+ tstats->rx_bytes += skb->len;
+ u64_stats_update_end(&tstats->syncp);
+
+ return 0;
+}
+
+static bool vti_state_check(const struct xfrm_state *x, __be32 dst, __be32 src)
+{
+ xfrm_address_t *daddr = (xfrm_address_t *)&dst;
+ xfrm_address_t *saddr = (xfrm_address_t *)&src;
+
+ /* if there is no transform then this tunnel is not functional.
+ * Or if the xfrm is not mode tunnel.
+ */
+ if (!x || x->props.mode != XFRM_MODE_TUNNEL ||
+ x->props.family != AF_INET)
+ return false;
+
+ if (!dst)
+ return xfrm_addr_equal(saddr, &x->props.saddr, AF_INET);
+
+ if (!xfrm_state_addr_check(x, daddr, saddr, AF_INET))
+ return false;
+
+ return true;
+}
+
+static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev,
+ struct flowi *fl)
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ struct ip_tunnel_parm *parms = &tunnel->parms;
+ struct dst_entry *dst = skb_dst(skb);
+ struct net_device *tdev; /* Device to other host */
+ int err;
+
+ if (!dst) {
+ dev->stats.tx_carrier_errors++;
+ goto tx_error_icmp;
+ }
+
+ dst_hold(dst);
+ dst = xfrm_lookup(tunnel->net, dst, fl, NULL, 0);
+ if (IS_ERR(dst)) {
+ dev->stats.tx_carrier_errors++;
+ goto tx_error_icmp;
+ }
+
+ if (!vti_state_check(dst->xfrm, parms->iph.daddr, parms->iph.saddr)) {
+ dev->stats.tx_carrier_errors++;
+ dst_release(dst);
+ goto tx_error_icmp;
+ }
+
+ tdev = dst->dev;
+
+ if (tdev == dev) {
+ dst_release(dst);
+ dev->stats.collisions++;
+ goto tx_error;
+ }
+
+ if (tunnel->err_count > 0) {
+ if (time_before(jiffies,
+ tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
+ tunnel->err_count--;
+ dst_link_failure(skb);
+ } else
+ tunnel->err_count = 0;
+ }
+
+ skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(dev)));
+ skb_dst_set(skb, dst);
+ skb->dev = skb_dst(skb)->dev;
+
+ err = dst_output(skb);
+ if (net_xmit_eval(err) == 0)
+ err = skb->len;
+ iptunnel_xmit_stats(err, &dev->stats, dev->tstats);
+ return NETDEV_TX_OK;
+
+tx_error_icmp:
+ dst_link_failure(skb);
+tx_error:
+ dev->stats.tx_errors++;
+ kfree_skb(skb);
+ return NETDEV_TX_OK;
+}
+
+/* This function assumes it is being called from dev_queue_xmit()
+ * and that skb is filled properly by that function.
+ */
+static netdev_tx_t vti_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ struct flowi fl;
+
+ memset(&fl, 0, sizeof(fl));
+
+ switch (skb->protocol) {
+ case htons(ETH_P_IP):
+ xfrm_decode_session(skb, &fl, AF_INET);
+ memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
+ break;
+ case htons(ETH_P_IPV6):
+ xfrm_decode_session(skb, &fl, AF_INET6);
+ memset(IP6CB(skb), 0, sizeof(*IP6CB(skb)));
+ break;
+ default:
+ dev->stats.tx_errors++;
+ dev_kfree_skb(skb);
+ return NETDEV_TX_OK;
+ }
+
+ /* override mark with tunnel output key */
+ fl.flowi_mark = be32_to_cpu(tunnel->parms.o_key);
+
+ return vti_xmit(skb, dev, &fl);
+}
+
+static int vti4_err(struct sk_buff *skb, u32 info)
+{
+ __be32 spi;
+ __u32 mark;
+ struct xfrm_state *x;
+ struct ip_tunnel *tunnel;
+ struct ip_esp_hdr *esph;
+ struct ip_auth_hdr *ah ;
+ struct ip_comp_hdr *ipch;
+ struct net *net = dev_net(skb->dev);
+ const struct iphdr *iph = (const struct iphdr *)skb->data;
+ int protocol = iph->protocol;
+ struct ip_tunnel_net *itn = net_generic(net, vti_net_id);
+
+ tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY,
+ iph->daddr, iph->saddr, 0);
+ if (!tunnel)
+ return -1;
+
+ mark = be32_to_cpu(tunnel->parms.o_key);
+
+ switch (protocol) {
+ case IPPROTO_ESP:
+ esph = (struct ip_esp_hdr *)(skb->data+(iph->ihl<<2));
+ spi = esph->spi;
+ break;
+ case IPPROTO_AH:
+ ah = (struct ip_auth_hdr *)(skb->data+(iph->ihl<<2));
+ spi = ah->spi;
+ break;
+ case IPPROTO_COMP:
+ ipch = (struct ip_comp_hdr *)(skb->data+(iph->ihl<<2));
+ spi = htonl(ntohs(ipch->cpi));
+ break;
+ default:
+ return 0;
+ }
+
+ switch (icmp_hdr(skb)->type) {
+ case ICMP_DEST_UNREACH:
+ if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED)
+ return 0;
+ case ICMP_REDIRECT:
+ break;
+ default:
+ return 0;
+ }
+
+ x = xfrm_state_lookup(net, mark, (const xfrm_address_t *)&iph->daddr,
+ spi, protocol, AF_INET);
+ if (!x)
+ return 0;
+
+ if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH)
+ ipv4_update_pmtu(skb, net, info, 0, 0, protocol, 0);
+ else
+ ipv4_redirect(skb, net, 0, 0, protocol, 0);
+ xfrm_state_put(x);
+
+ return 0;
+}
+
+static int
+vti_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
+{
+ int err = 0;
+ struct ip_tunnel_parm p;
+
+ if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
+ return -EFAULT;
+
+ if (cmd == SIOCADDTUNNEL || cmd == SIOCCHGTUNNEL) {
+ if (p.iph.version != 4 || p.iph.protocol != IPPROTO_IPIP ||
+ p.iph.ihl != 5)
+ return -EINVAL;
+ }
+
+ if (!(p.i_flags & GRE_KEY))
+ p.i_key = 0;
+ if (!(p.o_flags & GRE_KEY))
+ p.o_key = 0;
+
+ p.i_flags = VTI_ISVTI;
+
+ err = ip_tunnel_ioctl(dev, &p, cmd);
+ if (err)
+ return err;
+
+ if (cmd != SIOCDELTUNNEL) {
+ p.i_flags |= GRE_KEY;
+ p.o_flags |= GRE_KEY;
+ }
+
+ if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
+ return -EFAULT;
+ return 0;
+}
+
+static const struct net_device_ops vti_netdev_ops = {
+ .ndo_init = vti_tunnel_init,
+ .ndo_uninit = ip_tunnel_uninit,
+ .ndo_start_xmit = vti_tunnel_xmit,
+ .ndo_do_ioctl = vti_tunnel_ioctl,
+ .ndo_change_mtu = ip_tunnel_change_mtu,
+ .ndo_get_stats64 = ip_tunnel_get_stats64,
+ .ndo_get_iflink = ip_tunnel_get_iflink,
+};
+
+static void vti_tunnel_setup(struct net_device *dev)
+{
+ dev->netdev_ops = &vti_netdev_ops;
+ dev->type = ARPHRD_TUNNEL;
+ ip_tunnel_setup(dev, vti_net_id);
+}
+
+static int vti_tunnel_init(struct net_device *dev)
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ struct iphdr *iph = &tunnel->parms.iph;
+
+ memcpy(dev->dev_addr, &iph->saddr, 4);
+ memcpy(dev->broadcast, &iph->daddr, 4);
+
+ dev->hard_header_len = LL_MAX_HEADER + sizeof(struct iphdr);
+ dev->mtu = ETH_DATA_LEN;
+ dev->flags = IFF_NOARP;
+ dev->addr_len = 4;
+ dev->features |= NETIF_F_LLTX;
+ netif_keep_dst(dev);
+
+ return ip_tunnel_init(dev);
+}
+
+static void __net_init vti_fb_tunnel_init(struct net_device *dev)
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ struct iphdr *iph = &tunnel->parms.iph;
+
+ iph->version = 4;
+ iph->protocol = IPPROTO_IPIP;
+ iph->ihl = 5;
+}
+
+static struct xfrm4_protocol vti_esp4_protocol __read_mostly = {
+ .handler = vti_rcv,
+ .input_handler = vti_input,
+ .cb_handler = vti_rcv_cb,
+ .err_handler = vti4_err,
+ .priority = 100,
+};
+
+static struct xfrm4_protocol vti_ah4_protocol __read_mostly = {
+ .handler = vti_rcv,
+ .input_handler = vti_input,
+ .cb_handler = vti_rcv_cb,
+ .err_handler = vti4_err,
+ .priority = 100,
+};
+
+static struct xfrm4_protocol vti_ipcomp4_protocol __read_mostly = {
+ .handler = vti_rcv,
+ .input_handler = vti_input,
+ .cb_handler = vti_rcv_cb,
+ .err_handler = vti4_err,
+ .priority = 100,
+};
+
+static int __net_init vti_init_net(struct net *net)
+{
+ int err;
+ struct ip_tunnel_net *itn;
+
+ err = ip_tunnel_init_net(net, vti_net_id, &vti_link_ops, "ip_vti0");
+ if (err)
+ return err;
+ itn = net_generic(net, vti_net_id);
+ vti_fb_tunnel_init(itn->fb_tunnel_dev);
+ return 0;
+}
+
+static void __net_exit vti_exit_net(struct net *net)
+{
+ struct ip_tunnel_net *itn = net_generic(net, vti_net_id);
+ ip_tunnel_delete_net(itn, &vti_link_ops);
+}
+
+static struct pernet_operations vti_net_ops = {
+ .init = vti_init_net,
+ .exit = vti_exit_net,
+ .id = &vti_net_id,
+ .size = sizeof(struct ip_tunnel_net),
+};
+
+static int vti_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
+{
+ return 0;
+}
+
+static void vti_netlink_parms(struct nlattr *data[],
+ struct ip_tunnel_parm *parms)
+{
+ memset(parms, 0, sizeof(*parms));
+
+ parms->iph.protocol = IPPROTO_IPIP;
+
+ if (!data)
+ return;
+
+ parms->i_flags = VTI_ISVTI;
+
+ if (data[IFLA_VTI_LINK])
+ parms->link = nla_get_u32(data[IFLA_VTI_LINK]);
+
+ if (data[IFLA_VTI_IKEY])
+ parms->i_key = nla_get_be32(data[IFLA_VTI_IKEY]);
+
+ if (data[IFLA_VTI_OKEY])
+ parms->o_key = nla_get_be32(data[IFLA_VTI_OKEY]);
+
+ if (data[IFLA_VTI_LOCAL])
+ parms->iph.saddr = nla_get_in_addr(data[IFLA_VTI_LOCAL]);
+
+ if (data[IFLA_VTI_REMOTE])
+ parms->iph.daddr = nla_get_in_addr(data[IFLA_VTI_REMOTE]);
+
+}
+
+static int vti_newlink(struct net *src_net, struct net_device *dev,
+ struct nlattr *tb[], struct nlattr *data[])
+{
+ struct ip_tunnel_parm parms;
+
+ vti_netlink_parms(data, &parms);
+ return ip_tunnel_newlink(dev, tb, &parms);
+}
+
+static int vti_changelink(struct net_device *dev, struct nlattr *tb[],
+ struct nlattr *data[])
+{
+ struct ip_tunnel_parm p;
+
+ vti_netlink_parms(data, &p);
+ return ip_tunnel_changelink(dev, tb, &p);
+}
+
+static size_t vti_get_size(const struct net_device *dev)
+{
+ return
+ /* IFLA_VTI_LINK */
+ nla_total_size(4) +
+ /* IFLA_VTI_IKEY */
+ nla_total_size(4) +
+ /* IFLA_VTI_OKEY */
+ nla_total_size(4) +
+ /* IFLA_VTI_LOCAL */
+ nla_total_size(4) +
+ /* IFLA_VTI_REMOTE */
+ nla_total_size(4) +
+ 0;
+}
+
+static int vti_fill_info(struct sk_buff *skb, const struct net_device *dev)
+{
+ struct ip_tunnel *t = netdev_priv(dev);
+ struct ip_tunnel_parm *p = &t->parms;
+
+ nla_put_u32(skb, IFLA_VTI_LINK, p->link);
+ nla_put_be32(skb, IFLA_VTI_IKEY, p->i_key);
+ nla_put_be32(skb, IFLA_VTI_OKEY, p->o_key);
+ nla_put_in_addr(skb, IFLA_VTI_LOCAL, p->iph.saddr);
+ nla_put_in_addr(skb, IFLA_VTI_REMOTE, p->iph.daddr);
+
+ return 0;
+}
+
+static const struct nla_policy vti_policy[IFLA_VTI_MAX + 1] = {
+ [IFLA_VTI_LINK] = { .type = NLA_U32 },
+ [IFLA_VTI_IKEY] = { .type = NLA_U32 },
+ [IFLA_VTI_OKEY] = { .type = NLA_U32 },
+ [IFLA_VTI_LOCAL] = { .len = FIELD_SIZEOF(struct iphdr, saddr) },
+ [IFLA_VTI_REMOTE] = { .len = FIELD_SIZEOF(struct iphdr, daddr) },
+};
+
+static struct rtnl_link_ops vti_link_ops __read_mostly = {
+ .kind = "vti",
+ .maxtype = IFLA_VTI_MAX,
+ .policy = vti_policy,
+ .priv_size = sizeof(struct ip_tunnel),
+ .setup = vti_tunnel_setup,
+ .validate = vti_tunnel_validate,
+ .newlink = vti_newlink,
+ .changelink = vti_changelink,
+ .dellink = ip_tunnel_dellink,
+ .get_size = vti_get_size,
+ .fill_info = vti_fill_info,
+ .get_link_net = ip_tunnel_get_link_net,
+};
+
+static int __init vti_init(void)
+{
+ const char *msg;
+ int err;
+
+ pr_info("IPv4 over IPsec tunneling driver\n");
+
+ msg = "tunnel device";
+ err = register_pernet_device(&vti_net_ops);
+ if (err < 0)
+ goto pernet_dev_failed;
+
+ msg = "tunnel protocols";
+ err = xfrm4_protocol_register(&vti_esp4_protocol, IPPROTO_ESP);
+ if (err < 0)
+ goto xfrm_proto_esp_failed;
+ err = xfrm4_protocol_register(&vti_ah4_protocol, IPPROTO_AH);
+ if (err < 0)
+ goto xfrm_proto_ah_failed;
+ err = xfrm4_protocol_register(&vti_ipcomp4_protocol, IPPROTO_COMP);
+ if (err < 0)
+ goto xfrm_proto_comp_failed;
+
+ msg = "netlink interface";
+ err = rtnl_link_register(&vti_link_ops);
+ if (err < 0)
+ goto rtnl_link_failed;
+
+ return err;
+
+rtnl_link_failed:
+ xfrm4_protocol_deregister(&vti_ipcomp4_protocol, IPPROTO_COMP);
+xfrm_proto_comp_failed:
+ xfrm4_protocol_deregister(&vti_ah4_protocol, IPPROTO_AH);
+xfrm_proto_ah_failed:
+ xfrm4_protocol_deregister(&vti_esp4_protocol, IPPROTO_ESP);
+xfrm_proto_esp_failed:
+ unregister_pernet_device(&vti_net_ops);
+pernet_dev_failed:
+ pr_err("vti init: failed to register %s\n", msg);
+ return err;
+}
+
+static void __exit vti_fini(void)
+{
+ rtnl_link_unregister(&vti_link_ops);
+ xfrm4_protocol_deregister(&vti_ipcomp4_protocol, IPPROTO_COMP);
+ xfrm4_protocol_deregister(&vti_ah4_protocol, IPPROTO_AH);
+ xfrm4_protocol_deregister(&vti_esp4_protocol, IPPROTO_ESP);
+ unregister_pernet_device(&vti_net_ops);
+}
+
+module_init(vti_init);
+module_exit(vti_fini);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_RTNL_LINK("vti");
+MODULE_ALIAS_NETDEV("ip_vti0");
diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c
new file mode 100644
index 000000000..d97f4f278
--- /dev/null
+++ b/net/ipv4/ipcomp.c
@@ -0,0 +1,204 @@
+/*
+ * IP Payload Compression Protocol (IPComp) - RFC3173.
+ *
+ * Copyright (c) 2003 James Morris <jmorris@intercode.com.au>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * Todo:
+ * - Tunable compression parameters.
+ * - Compression stats.
+ * - Adaptive compression.
+ */
+#include <linux/module.h>
+#include <linux/err.h>
+#include <linux/rtnetlink.h>
+#include <net/ip.h>
+#include <net/xfrm.h>
+#include <net/icmp.h>
+#include <net/ipcomp.h>
+#include <net/protocol.h>
+#include <net/sock.h>
+
+static int ipcomp4_err(struct sk_buff *skb, u32 info)
+{
+ struct net *net = dev_net(skb->dev);
+ __be32 spi;
+ const struct iphdr *iph = (const struct iphdr *)skb->data;
+ struct ip_comp_hdr *ipch = (struct ip_comp_hdr *)(skb->data+(iph->ihl<<2));
+ struct xfrm_state *x;
+
+ switch (icmp_hdr(skb)->type) {
+ case ICMP_DEST_UNREACH:
+ if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED)
+ return 0;
+ case ICMP_REDIRECT:
+ break;
+ default:
+ return 0;
+ }
+
+ spi = htonl(ntohs(ipch->cpi));
+ x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr,
+ spi, IPPROTO_COMP, AF_INET);
+ if (!x)
+ return 0;
+
+ if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH)
+ ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_COMP, 0);
+ else
+ ipv4_redirect(skb, net, 0, 0, IPPROTO_COMP, 0);
+ xfrm_state_put(x);
+
+ return 0;
+}
+
+/* We always hold one tunnel user reference to indicate a tunnel */
+static struct xfrm_state *ipcomp_tunnel_create(struct xfrm_state *x)
+{
+ struct net *net = xs_net(x);
+ struct xfrm_state *t;
+
+ t = xfrm_state_alloc(net);
+ if (!t)
+ goto out;
+
+ t->id.proto = IPPROTO_IPIP;
+ t->id.spi = x->props.saddr.a4;
+ t->id.daddr.a4 = x->id.daddr.a4;
+ memcpy(&t->sel, &x->sel, sizeof(t->sel));
+ t->props.family = AF_INET;
+ t->props.mode = x->props.mode;
+ t->props.saddr.a4 = x->props.saddr.a4;
+ t->props.flags = x->props.flags;
+ t->props.extra_flags = x->props.extra_flags;
+ memcpy(&t->mark, &x->mark, sizeof(t->mark));
+
+ if (xfrm_init_state(t))
+ goto error;
+
+ atomic_set(&t->tunnel_users, 1);
+out:
+ return t;
+
+error:
+ t->km.state = XFRM_STATE_DEAD;
+ xfrm_state_put(t);
+ t = NULL;
+ goto out;
+}
+
+/*
+ * Must be protected by xfrm_cfg_mutex. State and tunnel user references are
+ * always incremented on success.
+ */
+static int ipcomp_tunnel_attach(struct xfrm_state *x)
+{
+ struct net *net = xs_net(x);
+ int err = 0;
+ struct xfrm_state *t;
+ u32 mark = x->mark.v & x->mark.m;
+
+ t = xfrm_state_lookup(net, mark, (xfrm_address_t *)&x->id.daddr.a4,
+ x->props.saddr.a4, IPPROTO_IPIP, AF_INET);
+ if (!t) {
+ t = ipcomp_tunnel_create(x);
+ if (!t) {
+ err = -EINVAL;
+ goto out;
+ }
+ xfrm_state_insert(t);
+ xfrm_state_hold(t);
+ }
+ x->tunnel = t;
+ atomic_inc(&t->tunnel_users);
+out:
+ return err;
+}
+
+static int ipcomp4_init_state(struct xfrm_state *x)
+{
+ int err = -EINVAL;
+
+ x->props.header_len = 0;
+ switch (x->props.mode) {
+ case XFRM_MODE_TRANSPORT:
+ break;
+ case XFRM_MODE_TUNNEL:
+ x->props.header_len += sizeof(struct iphdr);
+ break;
+ default:
+ goto out;
+ }
+
+ err = ipcomp_init_state(x);
+ if (err)
+ goto out;
+
+ if (x->props.mode == XFRM_MODE_TUNNEL) {
+ err = ipcomp_tunnel_attach(x);
+ if (err)
+ goto out;
+ }
+
+ err = 0;
+out:
+ return err;
+}
+
+static int ipcomp4_rcv_cb(struct sk_buff *skb, int err)
+{
+ return 0;
+}
+
+static const struct xfrm_type ipcomp_type = {
+ .description = "IPCOMP4",
+ .owner = THIS_MODULE,
+ .proto = IPPROTO_COMP,
+ .init_state = ipcomp4_init_state,
+ .destructor = ipcomp_destroy,
+ .input = ipcomp_input,
+ .output = ipcomp_output
+};
+
+static struct xfrm4_protocol ipcomp4_protocol = {
+ .handler = xfrm4_rcv,
+ .input_handler = xfrm_input,
+ .cb_handler = ipcomp4_rcv_cb,
+ .err_handler = ipcomp4_err,
+ .priority = 0,
+};
+
+static int __init ipcomp4_init(void)
+{
+ if (xfrm_register_type(&ipcomp_type, AF_INET) < 0) {
+ pr_info("%s: can't add xfrm type\n", __func__);
+ return -EAGAIN;
+ }
+ if (xfrm4_protocol_register(&ipcomp4_protocol, IPPROTO_COMP) < 0) {
+ pr_info("%s: can't add protocol\n", __func__);
+ xfrm_unregister_type(&ipcomp_type, AF_INET);
+ return -EAGAIN;
+ }
+ return 0;
+}
+
+static void __exit ipcomp4_fini(void)
+{
+ if (xfrm4_protocol_deregister(&ipcomp4_protocol, IPPROTO_COMP) < 0)
+ pr_info("%s: can't remove protocol\n", __func__);
+ if (xfrm_unregister_type(&ipcomp_type, AF_INET) < 0)
+ pr_info("%s: can't remove xfrm type\n", __func__);
+}
+
+module_init(ipcomp4_init);
+module_exit(ipcomp4_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("IP Payload Compression Protocol (IPComp/IPv4) - RFC3173");
+MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>");
+
+MODULE_ALIAS_XFRM_TYPE(AF_INET, XFRM_PROTO_COMP);
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
new file mode 100644
index 000000000..8e7328c6a
--- /dev/null
+++ b/net/ipv4/ipconfig.c
@@ -0,0 +1,1693 @@
+/*
+ * Automatic Configuration of IP -- use DHCP, BOOTP, RARP, or
+ * user-supplied information to configure own IP address and routes.
+ *
+ * Copyright (C) 1996-1998 Martin Mares <mj@atrey.karlin.mff.cuni.cz>
+ *
+ * Derived from network configuration code in fs/nfs/nfsroot.c,
+ * originally Copyright (C) 1995, 1996 Gero Kuhlmann and me.
+ *
+ * BOOTP rewritten to construct and analyse packets itself instead
+ * of misusing the IP layer. num_bugs_causing_wrong_arp_replies--;
+ * -- MJ, December 1998
+ *
+ * Fixed ip_auto_config_setup calling at startup in the new "Linker Magic"
+ * initialization scheme.
+ * - Arnaldo Carvalho de Melo <acme@conectiva.com.br>, 08/11/1999
+ *
+ * DHCP support added. To users this looks like a whole separate
+ * protocol, but we know it's just a bag on the side of BOOTP.
+ * -- Chip Salzenberg <chip@valinux.com>, May 2000
+ *
+ * Ported DHCP support from 2.2.16 to 2.4.0-test4
+ * -- Eric Biederman <ebiederman@lnxi.com>, 30 Aug 2000
+ *
+ * Merged changes from 2.2.19 into 2.4.3
+ * -- Eric Biederman <ebiederman@lnxi.com>, 22 April Aug 2001
+ *
+ * Multiple Nameservers in /proc/net/pnp
+ * -- Josef Siemes <jsiemes@web.de>, Aug 2002
+ */
+
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/jiffies.h>
+#include <linux/random.h>
+#include <linux/init.h>
+#include <linux/utsname.h>
+#include <linux/in.h>
+#include <linux/if.h>
+#include <linux/inet.h>
+#include <linux/inetdevice.h>
+#include <linux/netdevice.h>
+#include <linux/if_arp.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <linux/socket.h>
+#include <linux/route.h>
+#include <linux/udp.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/major.h>
+#include <linux/root_dev.h>
+#include <linux/delay.h>
+#include <linux/nfs_fs.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+#include <net/net_namespace.h>
+#include <net/arp.h>
+#include <net/ip.h>
+#include <net/ipconfig.h>
+#include <net/route.h>
+
+#include <asm/uaccess.h>
+#include <net/checksum.h>
+#include <asm/processor.h>
+
+/* Define this to allow debugging output */
+#undef IPCONFIG_DEBUG
+
+#ifdef IPCONFIG_DEBUG
+#define DBG(x) printk x
+#else
+#define DBG(x) do { } while(0)
+#endif
+
+#if defined(CONFIG_IP_PNP_DHCP)
+#define IPCONFIG_DHCP
+#endif
+#if defined(CONFIG_IP_PNP_BOOTP) || defined(CONFIG_IP_PNP_DHCP)
+#define IPCONFIG_BOOTP
+#endif
+#if defined(CONFIG_IP_PNP_RARP)
+#define IPCONFIG_RARP
+#endif
+#if defined(IPCONFIG_BOOTP) || defined(IPCONFIG_RARP)
+#define IPCONFIG_DYNAMIC
+#endif
+
+/* Define the friendly delay before and after opening net devices */
+#define CONF_POST_OPEN 10 /* After opening: 10 msecs */
+#define CONF_CARRIER_TIMEOUT 120000 /* Wait for carrier timeout */
+
+/* Define the timeout for waiting for a DHCP/BOOTP/RARP reply */
+#define CONF_OPEN_RETRIES 2 /* (Re)open devices twice */
+#define CONF_SEND_RETRIES 6 /* Send six requests per open */
+#define CONF_INTER_TIMEOUT (HZ/2) /* Inter-device timeout: 1/2 second */
+#define CONF_BASE_TIMEOUT (HZ*2) /* Initial timeout: 2 seconds */
+#define CONF_TIMEOUT_RANDOM (HZ) /* Maximum amount of randomization */
+#define CONF_TIMEOUT_MULT *7/4 /* Rate of timeout growth */
+#define CONF_TIMEOUT_MAX (HZ*30) /* Maximum allowed timeout */
+#define CONF_NAMESERVERS_MAX 3 /* Maximum number of nameservers
+ - '3' from resolv.h */
+
+#define NONE cpu_to_be32(INADDR_NONE)
+#define ANY cpu_to_be32(INADDR_ANY)
+
+/*
+ * Public IP configuration
+ */
+
+/* This is used by platforms which might be able to set the ipconfig
+ * variables using firmware environment vars. If this is set, it will
+ * ignore such firmware variables.
+ */
+int ic_set_manually __initdata = 0; /* IPconfig parameters set manually */
+
+static int ic_enable __initdata; /* IP config enabled? */
+
+/* Protocol choice */
+int ic_proto_enabled __initdata = 0
+#ifdef IPCONFIG_BOOTP
+ | IC_BOOTP
+#endif
+#ifdef CONFIG_IP_PNP_DHCP
+ | IC_USE_DHCP
+#endif
+#ifdef IPCONFIG_RARP
+ | IC_RARP
+#endif
+ ;
+
+static int ic_host_name_set __initdata; /* Host name set by us? */
+
+__be32 ic_myaddr = NONE; /* My IP address */
+static __be32 ic_netmask = NONE; /* Netmask for local subnet */
+__be32 ic_gateway = NONE; /* Gateway IP address */
+
+__be32 ic_addrservaddr = NONE; /* IP Address of the IP addresses'server */
+
+__be32 ic_servaddr = NONE; /* Boot server IP address */
+
+__be32 root_server_addr = NONE; /* Address of NFS server */
+u8 root_server_path[256] = { 0, }; /* Path to mount as root */
+
+/* vendor class identifier */
+static char vendor_class_identifier[253] __initdata;
+
+/* Persistent data: */
+
+static int ic_proto_used; /* Protocol used, if any */
+static __be32 ic_nameservers[CONF_NAMESERVERS_MAX]; /* DNS Server IP addresses */
+static u8 ic_domain[64]; /* DNS (not NIS) domain name */
+
+/*
+ * Private state.
+ */
+
+/* Name of user-selected boot device */
+static char user_dev_name[IFNAMSIZ] __initdata = { 0, };
+
+/* Protocols supported by available interfaces */
+static int ic_proto_have_if __initdata;
+
+/* MTU for boot device */
+static int ic_dev_mtu __initdata;
+
+#ifdef IPCONFIG_DYNAMIC
+static DEFINE_SPINLOCK(ic_recv_lock);
+static volatile int ic_got_reply __initdata; /* Proto(s) that replied */
+#endif
+#ifdef IPCONFIG_DHCP
+static int ic_dhcp_msgtype __initdata; /* DHCP msg type received */
+#endif
+
+
+/*
+ * Network devices
+ */
+
+struct ic_device {
+ struct ic_device *next;
+ struct net_device *dev;
+ unsigned short flags;
+ short able;
+ __be32 xid;
+};
+
+static struct ic_device *ic_first_dev __initdata; /* List of open device */
+static struct net_device *ic_dev __initdata; /* Selected device */
+
+static bool __init ic_is_init_dev(struct net_device *dev)
+{
+ if (dev->flags & IFF_LOOPBACK)
+ return false;
+ return user_dev_name[0] ? !strcmp(dev->name, user_dev_name) :
+ (!(dev->flags & IFF_LOOPBACK) &&
+ (dev->flags & (IFF_POINTOPOINT|IFF_BROADCAST)) &&
+ strncmp(dev->name, "dummy", 5));
+}
+
+static int __init ic_open_devs(void)
+{
+ struct ic_device *d, **last;
+ struct net_device *dev;
+ unsigned short oflags;
+ unsigned long start, next_msg;
+
+ last = &ic_first_dev;
+ rtnl_lock();
+
+ /* bring loopback and DSA master network devices up first */
+ for_each_netdev(&init_net, dev) {
+ if (!(dev->flags & IFF_LOOPBACK) && !netdev_uses_dsa(dev))
+ continue;
+ if (dev_change_flags(dev, dev->flags | IFF_UP) < 0)
+ pr_err("IP-Config: Failed to open %s\n", dev->name);
+ }
+
+ for_each_netdev(&init_net, dev) {
+ if (ic_is_init_dev(dev)) {
+ int able = 0;
+ if (dev->mtu >= 364)
+ able |= IC_BOOTP;
+ else
+ pr_warn("DHCP/BOOTP: Ignoring device %s, MTU %d too small",
+ dev->name, dev->mtu);
+ if (!(dev->flags & IFF_NOARP))
+ able |= IC_RARP;
+ able &= ic_proto_enabled;
+ if (ic_proto_enabled && !able)
+ continue;
+ oflags = dev->flags;
+ if (dev_change_flags(dev, oflags | IFF_UP) < 0) {
+ pr_err("IP-Config: Failed to open %s\n",
+ dev->name);
+ continue;
+ }
+ if (!(d = kmalloc(sizeof(struct ic_device), GFP_KERNEL))) {
+ rtnl_unlock();
+ return -ENOMEM;
+ }
+ d->dev = dev;
+ *last = d;
+ last = &d->next;
+ d->flags = oflags;
+ d->able = able;
+ if (able & IC_BOOTP)
+ get_random_bytes(&d->xid, sizeof(__be32));
+ else
+ d->xid = 0;
+ ic_proto_have_if |= able;
+ DBG(("IP-Config: %s UP (able=%d, xid=%08x)\n",
+ dev->name, able, d->xid));
+ }
+ }
+
+ /* no point in waiting if we could not bring up at least one device */
+ if (!ic_first_dev)
+ goto have_carrier;
+
+ /* wait for a carrier on at least one device */
+ start = jiffies;
+ next_msg = start + msecs_to_jiffies(CONF_CARRIER_TIMEOUT/12);
+ while (time_before(jiffies, start +
+ msecs_to_jiffies(CONF_CARRIER_TIMEOUT))) {
+ int wait, elapsed;
+
+ for_each_netdev(&init_net, dev)
+ if (ic_is_init_dev(dev) && netif_carrier_ok(dev))
+ goto have_carrier;
+
+ msleep(1);
+
+ if (time_before(jiffies, next_msg))
+ continue;
+
+ elapsed = jiffies_to_msecs(jiffies - start);
+ wait = (CONF_CARRIER_TIMEOUT - elapsed + 500)/1000;
+ pr_info("Waiting up to %d more seconds for network.\n", wait);
+ next_msg = jiffies + msecs_to_jiffies(CONF_CARRIER_TIMEOUT/12);
+ }
+have_carrier:
+ rtnl_unlock();
+
+ *last = NULL;
+
+ if (!ic_first_dev) {
+ if (user_dev_name[0])
+ pr_err("IP-Config: Device `%s' not found\n",
+ user_dev_name);
+ else
+ pr_err("IP-Config: No network devices available\n");
+ return -ENODEV;
+ }
+ return 0;
+}
+
+static void __init ic_close_devs(void)
+{
+ struct ic_device *d, *next;
+ struct net_device *dev;
+
+ rtnl_lock();
+ next = ic_first_dev;
+ while ((d = next)) {
+ next = d->next;
+ dev = d->dev;
+ if (dev != ic_dev && !netdev_uses_dsa(dev)) {
+ DBG(("IP-Config: Downing %s\n", dev->name));
+ dev_change_flags(dev, d->flags);
+ }
+ kfree(d);
+ }
+ rtnl_unlock();
+}
+
+/*
+ * Interface to various network functions.
+ */
+
+static inline void
+set_sockaddr(struct sockaddr_in *sin, __be32 addr, __be16 port)
+{
+ sin->sin_family = AF_INET;
+ sin->sin_addr.s_addr = addr;
+ sin->sin_port = port;
+}
+
+static int __init ic_devinet_ioctl(unsigned int cmd, struct ifreq *arg)
+{
+ int res;
+
+ mm_segment_t oldfs = get_fs();
+ set_fs(get_ds());
+ res = devinet_ioctl(&init_net, cmd, (struct ifreq __user *) arg);
+ set_fs(oldfs);
+ return res;
+}
+
+static int __init ic_dev_ioctl(unsigned int cmd, struct ifreq *arg)
+{
+ int res;
+
+ mm_segment_t oldfs = get_fs();
+ set_fs(get_ds());
+ res = dev_ioctl(&init_net, cmd, (struct ifreq __user *) arg);
+ set_fs(oldfs);
+ return res;
+}
+
+static int __init ic_route_ioctl(unsigned int cmd, struct rtentry *arg)
+{
+ int res;
+
+ mm_segment_t oldfs = get_fs();
+ set_fs(get_ds());
+ res = ip_rt_ioctl(&init_net, cmd, (void __user *) arg);
+ set_fs(oldfs);
+ return res;
+}
+
+/*
+ * Set up interface addresses and routes.
+ */
+
+static int __init ic_setup_if(void)
+{
+ struct ifreq ir;
+ struct sockaddr_in *sin = (void *) &ir.ifr_ifru.ifru_addr;
+ int err;
+
+ memset(&ir, 0, sizeof(ir));
+ strcpy(ir.ifr_ifrn.ifrn_name, ic_dev->name);
+ set_sockaddr(sin, ic_myaddr, 0);
+ if ((err = ic_devinet_ioctl(SIOCSIFADDR, &ir)) < 0) {
+ pr_err("IP-Config: Unable to set interface address (%d)\n",
+ err);
+ return -1;
+ }
+ set_sockaddr(sin, ic_netmask, 0);
+ if ((err = ic_devinet_ioctl(SIOCSIFNETMASK, &ir)) < 0) {
+ pr_err("IP-Config: Unable to set interface netmask (%d)\n",
+ err);
+ return -1;
+ }
+ set_sockaddr(sin, ic_myaddr | ~ic_netmask, 0);
+ if ((err = ic_devinet_ioctl(SIOCSIFBRDADDR, &ir)) < 0) {
+ pr_err("IP-Config: Unable to set interface broadcast address (%d)\n",
+ err);
+ return -1;
+ }
+ /* Handle the case where we need non-standard MTU on the boot link (a network
+ * using jumbo frames, for instance). If we can't set the mtu, don't error
+ * out, we'll try to muddle along.
+ */
+ if (ic_dev_mtu != 0) {
+ strcpy(ir.ifr_name, ic_dev->name);
+ ir.ifr_mtu = ic_dev_mtu;
+ if ((err = ic_dev_ioctl(SIOCSIFMTU, &ir)) < 0)
+ pr_err("IP-Config: Unable to set interface mtu to %d (%d)\n",
+ ic_dev_mtu, err);
+ }
+ return 0;
+}
+
+static int __init ic_setup_routes(void)
+{
+ /* No need to setup device routes, only the default route... */
+
+ if (ic_gateway != NONE) {
+ struct rtentry rm;
+ int err;
+
+ memset(&rm, 0, sizeof(rm));
+ if ((ic_gateway ^ ic_myaddr) & ic_netmask) {
+ pr_err("IP-Config: Gateway not on directly connected network\n");
+ return -1;
+ }
+ set_sockaddr((struct sockaddr_in *) &rm.rt_dst, 0, 0);
+ set_sockaddr((struct sockaddr_in *) &rm.rt_genmask, 0, 0);
+ set_sockaddr((struct sockaddr_in *) &rm.rt_gateway, ic_gateway, 0);
+ rm.rt_flags = RTF_UP | RTF_GATEWAY;
+ if ((err = ic_route_ioctl(SIOCADDRT, &rm)) < 0) {
+ pr_err("IP-Config: Cannot add default route (%d)\n",
+ err);
+ return -1;
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Fill in default values for all missing parameters.
+ */
+
+static int __init ic_defaults(void)
+{
+ /*
+ * At this point we have no userspace running so need not
+ * claim locks on system_utsname
+ */
+
+ if (!ic_host_name_set)
+ sprintf(init_utsname()->nodename, "%pI4", &ic_myaddr);
+
+ if (root_server_addr == NONE)
+ root_server_addr = ic_servaddr;
+
+ if (ic_netmask == NONE) {
+ if (IN_CLASSA(ntohl(ic_myaddr)))
+ ic_netmask = htonl(IN_CLASSA_NET);
+ else if (IN_CLASSB(ntohl(ic_myaddr)))
+ ic_netmask = htonl(IN_CLASSB_NET);
+ else if (IN_CLASSC(ntohl(ic_myaddr)))
+ ic_netmask = htonl(IN_CLASSC_NET);
+ else {
+ pr_err("IP-Config: Unable to guess netmask for address %pI4\n",
+ &ic_myaddr);
+ return -1;
+ }
+ printk("IP-Config: Guessing netmask %pI4\n", &ic_netmask);
+ }
+
+ return 0;
+}
+
+/*
+ * RARP support.
+ */
+
+#ifdef IPCONFIG_RARP
+
+static int ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev);
+
+static struct packet_type rarp_packet_type __initdata = {
+ .type = cpu_to_be16(ETH_P_RARP),
+ .func = ic_rarp_recv,
+};
+
+static inline void __init ic_rarp_init(void)
+{
+ dev_add_pack(&rarp_packet_type);
+}
+
+static inline void __init ic_rarp_cleanup(void)
+{
+ dev_remove_pack(&rarp_packet_type);
+}
+
+/*
+ * Process received RARP packet.
+ */
+static int __init
+ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
+{
+ struct arphdr *rarp;
+ unsigned char *rarp_ptr;
+ __be32 sip, tip;
+ unsigned char *tha; /* t for "target" */
+ struct ic_device *d;
+
+ if (!net_eq(dev_net(dev), &init_net))
+ goto drop;
+
+ skb = skb_share_check(skb, GFP_ATOMIC);
+ if (!skb)
+ return NET_RX_DROP;
+
+ if (!pskb_may_pull(skb, sizeof(struct arphdr)))
+ goto drop;
+
+ /* Basic sanity checks can be done without the lock. */
+ rarp = (struct arphdr *)skb_transport_header(skb);
+
+ /* If this test doesn't pass, it's not IP, or we should
+ * ignore it anyway.
+ */
+ if (rarp->ar_hln != dev->addr_len || dev->type != ntohs(rarp->ar_hrd))
+ goto drop;
+
+ /* If it's not a RARP reply, delete it. */
+ if (rarp->ar_op != htons(ARPOP_RREPLY))
+ goto drop;
+
+ /* If it's not Ethernet, delete it. */
+ if (rarp->ar_pro != htons(ETH_P_IP))
+ goto drop;
+
+ if (!pskb_may_pull(skb, arp_hdr_len(dev)))
+ goto drop;
+
+ /* OK, it is all there and looks valid, process... */
+ rarp = (struct arphdr *)skb_transport_header(skb);
+ rarp_ptr = (unsigned char *) (rarp + 1);
+
+ /* One reply at a time, please. */
+ spin_lock(&ic_recv_lock);
+
+ /* If we already have a reply, just drop the packet */
+ if (ic_got_reply)
+ goto drop_unlock;
+
+ /* Find the ic_device that the packet arrived on */
+ d = ic_first_dev;
+ while (d && d->dev != dev)
+ d = d->next;
+ if (!d)
+ goto drop_unlock; /* should never happen */
+
+ /* Extract variable-width fields */
+ rarp_ptr += dev->addr_len;
+ memcpy(&sip, rarp_ptr, 4);
+ rarp_ptr += 4;
+ tha = rarp_ptr;
+ rarp_ptr += dev->addr_len;
+ memcpy(&tip, rarp_ptr, 4);
+
+ /* Discard packets which are not meant for us. */
+ if (memcmp(tha, dev->dev_addr, dev->addr_len))
+ goto drop_unlock;
+
+ /* Discard packets which are not from specified server. */
+ if (ic_servaddr != NONE && ic_servaddr != sip)
+ goto drop_unlock;
+
+ /* We have a winner! */
+ ic_dev = dev;
+ if (ic_myaddr == NONE)
+ ic_myaddr = tip;
+ ic_servaddr = sip;
+ ic_addrservaddr = sip;
+ ic_got_reply = IC_RARP;
+
+drop_unlock:
+ /* Show's over. Nothing to see here. */
+ spin_unlock(&ic_recv_lock);
+
+drop:
+ /* Throw the packet out. */
+ kfree_skb(skb);
+ return 0;
+}
+
+
+/*
+ * Send RARP request packet over a single interface.
+ */
+static void __init ic_rarp_send_if(struct ic_device *d)
+{
+ struct net_device *dev = d->dev;
+ arp_send(ARPOP_RREQUEST, ETH_P_RARP, 0, dev, 0, NULL,
+ dev->dev_addr, dev->dev_addr);
+}
+#endif
+
+/*
+ * Predefine Nameservers
+ */
+static inline void __init ic_nameservers_predef(void)
+{
+ int i;
+
+ for (i = 0; i < CONF_NAMESERVERS_MAX; i++)
+ ic_nameservers[i] = NONE;
+}
+
+/*
+ * DHCP/BOOTP support.
+ */
+
+#ifdef IPCONFIG_BOOTP
+
+struct bootp_pkt { /* BOOTP packet format */
+ struct iphdr iph; /* IP header */
+ struct udphdr udph; /* UDP header */
+ u8 op; /* 1=request, 2=reply */
+ u8 htype; /* HW address type */
+ u8 hlen; /* HW address length */
+ u8 hops; /* Used only by gateways */
+ __be32 xid; /* Transaction ID */
+ __be16 secs; /* Seconds since we started */
+ __be16 flags; /* Just what it says */
+ __be32 client_ip; /* Client's IP address if known */
+ __be32 your_ip; /* Assigned IP address */
+ __be32 server_ip; /* (Next, e.g. NFS) Server's IP address */
+ __be32 relay_ip; /* IP address of BOOTP relay */
+ u8 hw_addr[16]; /* Client's HW address */
+ u8 serv_name[64]; /* Server host name */
+ u8 boot_file[128]; /* Name of boot file */
+ u8 exten[312]; /* DHCP options / BOOTP vendor extensions */
+};
+
+/* packet ops */
+#define BOOTP_REQUEST 1
+#define BOOTP_REPLY 2
+
+/* DHCP message types */
+#define DHCPDISCOVER 1
+#define DHCPOFFER 2
+#define DHCPREQUEST 3
+#define DHCPDECLINE 4
+#define DHCPACK 5
+#define DHCPNAK 6
+#define DHCPRELEASE 7
+#define DHCPINFORM 8
+
+static int ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev);
+
+static struct packet_type bootp_packet_type __initdata = {
+ .type = cpu_to_be16(ETH_P_IP),
+ .func = ic_bootp_recv,
+};
+
+static __be32 ic_dev_xid; /* Device under configuration */
+
+/*
+ * Initialize DHCP/BOOTP extension fields in the request.
+ */
+
+static const u8 ic_bootp_cookie[4] = { 99, 130, 83, 99 };
+
+#ifdef IPCONFIG_DHCP
+
+static void __init
+ic_dhcp_init_options(u8 *options)
+{
+ u8 mt = ((ic_servaddr == NONE)
+ ? DHCPDISCOVER : DHCPREQUEST);
+ u8 *e = options;
+ int len;
+
+#ifdef IPCONFIG_DEBUG
+ printk("DHCP: Sending message type %d\n", mt);
+#endif
+
+ memcpy(e, ic_bootp_cookie, 4); /* RFC1048 Magic Cookie */
+ e += 4;
+
+ *e++ = 53; /* DHCP message type */
+ *e++ = 1;
+ *e++ = mt;
+
+ if (mt == DHCPREQUEST) {
+ *e++ = 54; /* Server ID (IP address) */
+ *e++ = 4;
+ memcpy(e, &ic_servaddr, 4);
+ e += 4;
+
+ *e++ = 50; /* Requested IP address */
+ *e++ = 4;
+ memcpy(e, &ic_myaddr, 4);
+ e += 4;
+ }
+
+ /* always? */
+ {
+ static const u8 ic_req_params[] = {
+ 1, /* Subnet mask */
+ 3, /* Default gateway */
+ 6, /* DNS server */
+ 12, /* Host name */
+ 15, /* Domain name */
+ 17, /* Boot path */
+ 26, /* MTU */
+ 40, /* NIS domain name */
+ };
+
+ *e++ = 55; /* Parameter request list */
+ *e++ = sizeof(ic_req_params);
+ memcpy(e, ic_req_params, sizeof(ic_req_params));
+ e += sizeof(ic_req_params);
+
+ if (ic_host_name_set) {
+ *e++ = 12; /* host-name */
+ len = strlen(utsname()->nodename);
+ *e++ = len;
+ memcpy(e, utsname()->nodename, len);
+ e += len;
+ }
+ if (*vendor_class_identifier) {
+ pr_info("DHCP: sending class identifier \"%s\"\n",
+ vendor_class_identifier);
+ *e++ = 60; /* Class-identifier */
+ len = strlen(vendor_class_identifier);
+ *e++ = len;
+ memcpy(e, vendor_class_identifier, len);
+ e += len;
+ }
+ }
+
+ *e++ = 255; /* End of the list */
+}
+
+#endif /* IPCONFIG_DHCP */
+
+static void __init ic_bootp_init_ext(u8 *e)
+{
+ memcpy(e, ic_bootp_cookie, 4); /* RFC1048 Magic Cookie */
+ e += 4;
+ *e++ = 1; /* Subnet mask request */
+ *e++ = 4;
+ e += 4;
+ *e++ = 3; /* Default gateway request */
+ *e++ = 4;
+ e += 4;
+ *e++ = 5; /* Name server request */
+ *e++ = 8;
+ e += 8;
+ *e++ = 12; /* Host name request */
+ *e++ = 32;
+ e += 32;
+ *e++ = 40; /* NIS Domain name request */
+ *e++ = 32;
+ e += 32;
+ *e++ = 17; /* Boot path */
+ *e++ = 40;
+ e += 40;
+
+ *e++ = 57; /* set extension buffer size for reply */
+ *e++ = 2;
+ *e++ = 1; /* 128+236+8+20+14, see dhcpd sources */
+ *e++ = 150;
+
+ *e++ = 255; /* End of the list */
+}
+
+
+/*
+ * Initialize the DHCP/BOOTP mechanism.
+ */
+static inline void __init ic_bootp_init(void)
+{
+ ic_nameservers_predef();
+
+ dev_add_pack(&bootp_packet_type);
+}
+
+
+/*
+ * DHCP/BOOTP cleanup.
+ */
+static inline void __init ic_bootp_cleanup(void)
+{
+ dev_remove_pack(&bootp_packet_type);
+}
+
+
+/*
+ * Send DHCP/BOOTP request to single interface.
+ */
+static void __init ic_bootp_send_if(struct ic_device *d, unsigned long jiffies_diff)
+{
+ struct net_device *dev = d->dev;
+ struct sk_buff *skb;
+ struct bootp_pkt *b;
+ struct iphdr *h;
+ int hlen = LL_RESERVED_SPACE(dev);
+ int tlen = dev->needed_tailroom;
+
+ /* Allocate packet */
+ skb = alloc_skb(sizeof(struct bootp_pkt) + hlen + tlen + 15,
+ GFP_KERNEL);
+ if (!skb)
+ return;
+ skb_reserve(skb, hlen);
+ b = (struct bootp_pkt *) skb_put(skb, sizeof(struct bootp_pkt));
+ memset(b, 0, sizeof(struct bootp_pkt));
+
+ /* Construct IP header */
+ skb_reset_network_header(skb);
+ h = ip_hdr(skb);
+ h->version = 4;
+ h->ihl = 5;
+ h->tot_len = htons(sizeof(struct bootp_pkt));
+ h->frag_off = htons(IP_DF);
+ h->ttl = 64;
+ h->protocol = IPPROTO_UDP;
+ h->daddr = htonl(INADDR_BROADCAST);
+ h->check = ip_fast_csum((unsigned char *) h, h->ihl);
+
+ /* Construct UDP header */
+ b->udph.source = htons(68);
+ b->udph.dest = htons(67);
+ b->udph.len = htons(sizeof(struct bootp_pkt) - sizeof(struct iphdr));
+ /* UDP checksum not calculated -- explicitly allowed in BOOTP RFC */
+
+ /* Construct DHCP/BOOTP header */
+ b->op = BOOTP_REQUEST;
+ if (dev->type < 256) /* check for false types */
+ b->htype = dev->type;
+ else if (dev->type == ARPHRD_FDDI)
+ b->htype = ARPHRD_ETHER;
+ else {
+ printk("Unknown ARP type 0x%04x for device %s\n", dev->type, dev->name);
+ b->htype = dev->type; /* can cause undefined behavior */
+ }
+
+ /* server_ip and your_ip address are both already zero per RFC2131 */
+ b->hlen = dev->addr_len;
+ memcpy(b->hw_addr, dev->dev_addr, dev->addr_len);
+ b->secs = htons(jiffies_diff / HZ);
+ b->xid = d->xid;
+
+ /* add DHCP options or BOOTP extensions */
+#ifdef IPCONFIG_DHCP
+ if (ic_proto_enabled & IC_USE_DHCP)
+ ic_dhcp_init_options(b->exten);
+ else
+#endif
+ ic_bootp_init_ext(b->exten);
+
+ /* Chain packet down the line... */
+ skb->dev = dev;
+ skb->protocol = htons(ETH_P_IP);
+ if (dev_hard_header(skb, dev, ntohs(skb->protocol),
+ dev->broadcast, dev->dev_addr, skb->len) < 0) {
+ kfree_skb(skb);
+ printk("E");
+ return;
+ }
+
+ if (dev_queue_xmit(skb) < 0)
+ printk("E");
+}
+
+
+/*
+ * Copy BOOTP-supplied string if not already set.
+ */
+static int __init ic_bootp_string(char *dest, char *src, int len, int max)
+{
+ if (!len)
+ return 0;
+ if (len > max-1)
+ len = max-1;
+ memcpy(dest, src, len);
+ dest[len] = '\0';
+ return 1;
+}
+
+
+/*
+ * Process BOOTP extensions.
+ */
+static void __init ic_do_bootp_ext(u8 *ext)
+{
+ u8 servers;
+ int i;
+ __be16 mtu;
+
+#ifdef IPCONFIG_DEBUG
+ u8 *c;
+
+ printk("DHCP/BOOTP: Got extension %d:",*ext);
+ for (c=ext+2; c<ext+2+ext[1]; c++)
+ printk(" %02x", *c);
+ printk("\n");
+#endif
+
+ switch (*ext++) {
+ case 1: /* Subnet mask */
+ if (ic_netmask == NONE)
+ memcpy(&ic_netmask, ext+1, 4);
+ break;
+ case 3: /* Default gateway */
+ if (ic_gateway == NONE)
+ memcpy(&ic_gateway, ext+1, 4);
+ break;
+ case 6: /* DNS server */
+ servers= *ext/4;
+ if (servers > CONF_NAMESERVERS_MAX)
+ servers = CONF_NAMESERVERS_MAX;
+ for (i = 0; i < servers; i++) {
+ if (ic_nameservers[i] == NONE)
+ memcpy(&ic_nameservers[i], ext+1+4*i, 4);
+ }
+ break;
+ case 12: /* Host name */
+ ic_bootp_string(utsname()->nodename, ext+1, *ext,
+ __NEW_UTS_LEN);
+ ic_host_name_set = 1;
+ break;
+ case 15: /* Domain name (DNS) */
+ ic_bootp_string(ic_domain, ext+1, *ext, sizeof(ic_domain));
+ break;
+ case 17: /* Root path */
+ if (!root_server_path[0])
+ ic_bootp_string(root_server_path, ext+1, *ext,
+ sizeof(root_server_path));
+ break;
+ case 26: /* Interface MTU */
+ memcpy(&mtu, ext+1, sizeof(mtu));
+ ic_dev_mtu = ntohs(mtu);
+ break;
+ case 40: /* NIS Domain name (_not_ DNS) */
+ ic_bootp_string(utsname()->domainname, ext+1, *ext,
+ __NEW_UTS_LEN);
+ break;
+ }
+}
+
+
+/*
+ * Receive BOOTP reply.
+ */
+static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
+{
+ struct bootp_pkt *b;
+ struct iphdr *h;
+ struct ic_device *d;
+ int len, ext_len;
+
+ if (!net_eq(dev_net(dev), &init_net))
+ goto drop;
+
+ /* Perform verifications before taking the lock. */
+ if (skb->pkt_type == PACKET_OTHERHOST)
+ goto drop;
+
+ skb = skb_share_check(skb, GFP_ATOMIC);
+ if (!skb)
+ return NET_RX_DROP;
+
+ if (!pskb_may_pull(skb,
+ sizeof(struct iphdr) +
+ sizeof(struct udphdr)))
+ goto drop;
+
+ b = (struct bootp_pkt *)skb_network_header(skb);
+ h = &b->iph;
+
+ if (h->ihl != 5 || h->version != 4 || h->protocol != IPPROTO_UDP)
+ goto drop;
+
+ /* Fragments are not supported */
+ if (ip_is_fragment(h)) {
+ net_err_ratelimited("DHCP/BOOTP: Ignoring fragmented reply\n");
+ goto drop;
+ }
+
+ if (skb->len < ntohs(h->tot_len))
+ goto drop;
+
+ if (ip_fast_csum((char *) h, h->ihl))
+ goto drop;
+
+ if (b->udph.source != htons(67) || b->udph.dest != htons(68))
+ goto drop;
+
+ if (ntohs(h->tot_len) < ntohs(b->udph.len) + sizeof(struct iphdr))
+ goto drop;
+
+ len = ntohs(b->udph.len) - sizeof(struct udphdr);
+ ext_len = len - (sizeof(*b) -
+ sizeof(struct iphdr) -
+ sizeof(struct udphdr) -
+ sizeof(b->exten));
+ if (ext_len < 0)
+ goto drop;
+
+ /* Ok the front looks good, make sure we can get at the rest. */
+ if (!pskb_may_pull(skb, skb->len))
+ goto drop;
+
+ b = (struct bootp_pkt *)skb_network_header(skb);
+ h = &b->iph;
+
+ /* One reply at a time, please. */
+ spin_lock(&ic_recv_lock);
+
+ /* If we already have a reply, just drop the packet */
+ if (ic_got_reply)
+ goto drop_unlock;
+
+ /* Find the ic_device that the packet arrived on */
+ d = ic_first_dev;
+ while (d && d->dev != dev)
+ d = d->next;
+ if (!d)
+ goto drop_unlock; /* should never happen */
+
+ /* Is it a reply to our BOOTP request? */
+ if (b->op != BOOTP_REPLY ||
+ b->xid != d->xid) {
+ net_err_ratelimited("DHCP/BOOTP: Reply not for us, op[%x] xid[%x]\n",
+ b->op, b->xid);
+ goto drop_unlock;
+ }
+
+ /* Is it a reply for the device we are configuring? */
+ if (b->xid != ic_dev_xid) {
+ net_err_ratelimited("DHCP/BOOTP: Ignoring delayed packet\n");
+ goto drop_unlock;
+ }
+
+ /* Parse extensions */
+ if (ext_len >= 4 &&
+ !memcmp(b->exten, ic_bootp_cookie, 4)) { /* Check magic cookie */
+ u8 *end = (u8 *) b + ntohs(b->iph.tot_len);
+ u8 *ext;
+
+#ifdef IPCONFIG_DHCP
+ if (ic_proto_enabled & IC_USE_DHCP) {
+ __be32 server_id = NONE;
+ int mt = 0;
+
+ ext = &b->exten[4];
+ while (ext < end && *ext != 0xff) {
+ u8 *opt = ext++;
+ if (*opt == 0) /* Padding */
+ continue;
+ ext += *ext + 1;
+ if (ext >= end)
+ break;
+ switch (*opt) {
+ case 53: /* Message type */
+ if (opt[1])
+ mt = opt[2];
+ break;
+ case 54: /* Server ID (IP address) */
+ if (opt[1] >= 4)
+ memcpy(&server_id, opt + 2, 4);
+ break;
+ }
+ }
+
+#ifdef IPCONFIG_DEBUG
+ printk("DHCP: Got message type %d\n", mt);
+#endif
+
+ switch (mt) {
+ case DHCPOFFER:
+ /* While in the process of accepting one offer,
+ * ignore all others.
+ */
+ if (ic_myaddr != NONE)
+ goto drop_unlock;
+
+ /* Let's accept that offer. */
+ ic_myaddr = b->your_ip;
+ ic_servaddr = server_id;
+#ifdef IPCONFIG_DEBUG
+ printk("DHCP: Offered address %pI4 by server %pI4\n",
+ &ic_myaddr, &b->iph.saddr);
+#endif
+ /* The DHCP indicated server address takes
+ * precedence over the bootp header one if
+ * they are different.
+ */
+ if ((server_id != NONE) &&
+ (b->server_ip != server_id))
+ b->server_ip = ic_servaddr;
+ break;
+
+ case DHCPACK:
+ if (memcmp(dev->dev_addr, b->hw_addr, dev->addr_len) != 0)
+ goto drop_unlock;
+
+ /* Yeah! */
+ break;
+
+ default:
+ /* Urque. Forget it*/
+ ic_myaddr = NONE;
+ ic_servaddr = NONE;
+ goto drop_unlock;
+ }
+
+ ic_dhcp_msgtype = mt;
+
+ }
+#endif /* IPCONFIG_DHCP */
+
+ ext = &b->exten[4];
+ while (ext < end && *ext != 0xff) {
+ u8 *opt = ext++;
+ if (*opt == 0) /* Padding */
+ continue;
+ ext += *ext + 1;
+ if (ext < end)
+ ic_do_bootp_ext(opt);
+ }
+ }
+
+ /* We have a winner! */
+ ic_dev = dev;
+ ic_myaddr = b->your_ip;
+ ic_servaddr = b->server_ip;
+ ic_addrservaddr = b->iph.saddr;
+ if (ic_gateway == NONE && b->relay_ip)
+ ic_gateway = b->relay_ip;
+ if (ic_nameservers[0] == NONE)
+ ic_nameservers[0] = ic_servaddr;
+ ic_got_reply = IC_BOOTP;
+
+drop_unlock:
+ /* Show's over. Nothing to see here. */
+ spin_unlock(&ic_recv_lock);
+
+drop:
+ /* Throw the packet out. */
+ kfree_skb(skb);
+
+ return 0;
+}
+
+
+#endif
+
+
+/*
+ * Dynamic IP configuration -- DHCP, BOOTP, RARP.
+ */
+
+#ifdef IPCONFIG_DYNAMIC
+
+static int __init ic_dynamic(void)
+{
+ int retries;
+ struct ic_device *d;
+ unsigned long start_jiffies, timeout, jiff;
+ int do_bootp = ic_proto_have_if & IC_BOOTP;
+ int do_rarp = ic_proto_have_if & IC_RARP;
+
+ /*
+ * If none of DHCP/BOOTP/RARP was selected, return with an error.
+ * This routine gets only called when some pieces of information
+ * are missing, and without DHCP/BOOTP/RARP we are unable to get it.
+ */
+ if (!ic_proto_enabled) {
+ pr_err("IP-Config: Incomplete network configuration information\n");
+ return -1;
+ }
+
+#ifdef IPCONFIG_BOOTP
+ if ((ic_proto_enabled ^ ic_proto_have_if) & IC_BOOTP)
+ pr_err("DHCP/BOOTP: No suitable device found\n");
+#endif
+#ifdef IPCONFIG_RARP
+ if ((ic_proto_enabled ^ ic_proto_have_if) & IC_RARP)
+ pr_err("RARP: No suitable device found\n");
+#endif
+
+ if (!ic_proto_have_if)
+ /* Error message already printed */
+ return -1;
+
+ /*
+ * Setup protocols
+ */
+#ifdef IPCONFIG_BOOTP
+ if (do_bootp)
+ ic_bootp_init();
+#endif
+#ifdef IPCONFIG_RARP
+ if (do_rarp)
+ ic_rarp_init();
+#endif
+
+ /*
+ * Send requests and wait, until we get an answer. This loop
+ * seems to be a terrible waste of CPU time, but actually there is
+ * only one process running at all, so we don't need to use any
+ * scheduler functions.
+ * [Actually we could now, but the nothing else running note still
+ * applies.. - AC]
+ */
+ pr_notice("Sending %s%s%s requests .",
+ do_bootp
+ ? ((ic_proto_enabled & IC_USE_DHCP) ? "DHCP" : "BOOTP") : "",
+ (do_bootp && do_rarp) ? " and " : "",
+ do_rarp ? "RARP" : "");
+
+ start_jiffies = jiffies;
+ d = ic_first_dev;
+ retries = CONF_SEND_RETRIES;
+ get_random_bytes(&timeout, sizeof(timeout));
+ timeout = CONF_BASE_TIMEOUT + (timeout % (unsigned int) CONF_TIMEOUT_RANDOM);
+ for (;;) {
+#ifdef IPCONFIG_BOOTP
+ /* Track the device we are configuring */
+ ic_dev_xid = d->xid;
+
+ if (do_bootp && (d->able & IC_BOOTP))
+ ic_bootp_send_if(d, jiffies - start_jiffies);
+#endif
+#ifdef IPCONFIG_RARP
+ if (do_rarp && (d->able & IC_RARP))
+ ic_rarp_send_if(d);
+#endif
+
+ jiff = jiffies + (d->next ? CONF_INTER_TIMEOUT : timeout);
+ while (time_before(jiffies, jiff) && !ic_got_reply)
+ schedule_timeout_uninterruptible(1);
+#ifdef IPCONFIG_DHCP
+ /* DHCP isn't done until we get a DHCPACK. */
+ if ((ic_got_reply & IC_BOOTP) &&
+ (ic_proto_enabled & IC_USE_DHCP) &&
+ ic_dhcp_msgtype != DHCPACK) {
+ ic_got_reply = 0;
+ pr_cont(",");
+ continue;
+ }
+#endif /* IPCONFIG_DHCP */
+
+ if (ic_got_reply) {
+ pr_cont(" OK\n");
+ break;
+ }
+
+ if ((d = d->next))
+ continue;
+
+ if (! --retries) {
+ pr_cont(" timed out!\n");
+ break;
+ }
+
+ d = ic_first_dev;
+
+ timeout = timeout CONF_TIMEOUT_MULT;
+ if (timeout > CONF_TIMEOUT_MAX)
+ timeout = CONF_TIMEOUT_MAX;
+
+ pr_cont(".");
+ }
+
+#ifdef IPCONFIG_BOOTP
+ if (do_bootp)
+ ic_bootp_cleanup();
+#endif
+#ifdef IPCONFIG_RARP
+ if (do_rarp)
+ ic_rarp_cleanup();
+#endif
+
+ if (!ic_got_reply) {
+ ic_myaddr = NONE;
+ return -1;
+ }
+
+ printk("IP-Config: Got %s answer from %pI4, ",
+ ((ic_got_reply & IC_RARP) ? "RARP"
+ : (ic_proto_enabled & IC_USE_DHCP) ? "DHCP" : "BOOTP"),
+ &ic_addrservaddr);
+ pr_cont("my address is %pI4\n", &ic_myaddr);
+
+ return 0;
+}
+
+#endif /* IPCONFIG_DYNAMIC */
+
+#ifdef CONFIG_PROC_FS
+
+static int pnp_seq_show(struct seq_file *seq, void *v)
+{
+ int i;
+
+ if (ic_proto_used & IC_PROTO)
+ seq_printf(seq, "#PROTO: %s\n",
+ (ic_proto_used & IC_RARP) ? "RARP"
+ : (ic_proto_used & IC_USE_DHCP) ? "DHCP" : "BOOTP");
+ else
+ seq_puts(seq, "#MANUAL\n");
+
+ if (ic_domain[0])
+ seq_printf(seq,
+ "domain %s\n", ic_domain);
+ for (i = 0; i < CONF_NAMESERVERS_MAX; i++) {
+ if (ic_nameservers[i] != NONE)
+ seq_printf(seq, "nameserver %pI4\n",
+ &ic_nameservers[i]);
+ }
+ if (ic_servaddr != NONE)
+ seq_printf(seq, "bootserver %pI4\n",
+ &ic_servaddr);
+ return 0;
+}
+
+static int pnp_seq_open(struct inode *indoe, struct file *file)
+{
+ return single_open(file, pnp_seq_show, NULL);
+}
+
+static const struct file_operations pnp_seq_fops = {
+ .owner = THIS_MODULE,
+ .open = pnp_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+#endif /* CONFIG_PROC_FS */
+
+/*
+ * Extract IP address from the parameter string if needed. Note that we
+ * need to have root_server_addr set _before_ IPConfig gets called as it
+ * can override it.
+ */
+__be32 __init root_nfs_parse_addr(char *name)
+{
+ __be32 addr;
+ int octets = 0;
+ char *cp, *cq;
+
+ cp = cq = name;
+ while (octets < 4) {
+ while (*cp >= '0' && *cp <= '9')
+ cp++;
+ if (cp == cq || cp - cq > 3)
+ break;
+ if (*cp == '.' || octets == 3)
+ octets++;
+ if (octets < 4)
+ cp++;
+ cq = cp;
+ }
+ if (octets == 4 && (*cp == ':' || *cp == '\0')) {
+ if (*cp == ':')
+ *cp++ = '\0';
+ addr = in_aton(name);
+ memmove(name, cp, strlen(cp) + 1);
+ } else
+ addr = NONE;
+
+ return addr;
+}
+
+#define DEVICE_WAIT_MAX 12 /* 12 seconds */
+
+static int __init wait_for_devices(void)
+{
+ int i;
+
+ for (i = 0; i < DEVICE_WAIT_MAX; i++) {
+ struct net_device *dev;
+ int found = 0;
+
+ rtnl_lock();
+ for_each_netdev(&init_net, dev) {
+ if (ic_is_init_dev(dev)) {
+ found = 1;
+ break;
+ }
+ }
+ rtnl_unlock();
+ if (found)
+ return 0;
+ ssleep(1);
+ }
+ return -ENODEV;
+}
+
+/*
+ * IP Autoconfig dispatcher.
+ */
+
+static int __init ip_auto_config(void)
+{
+ __be32 addr;
+#ifdef IPCONFIG_DYNAMIC
+ int retries = CONF_OPEN_RETRIES;
+#endif
+ int err;
+ unsigned int i;
+
+#ifdef CONFIG_PROC_FS
+ proc_create("pnp", S_IRUGO, init_net.proc_net, &pnp_seq_fops);
+#endif /* CONFIG_PROC_FS */
+
+ if (!ic_enable)
+ return 0;
+
+ DBG(("IP-Config: Entered.\n"));
+#ifdef IPCONFIG_DYNAMIC
+ try_try_again:
+#endif
+ /* Wait for devices to appear */
+ err = wait_for_devices();
+ if (err)
+ return err;
+
+ /* Setup all network devices */
+ err = ic_open_devs();
+ if (err)
+ return err;
+
+ /* Give drivers a chance to settle */
+ msleep(CONF_POST_OPEN);
+
+ /*
+ * If the config information is insufficient (e.g., our IP address or
+ * IP address of the boot server is missing or we have multiple network
+ * interfaces and no default was set), use BOOTP or RARP to get the
+ * missing values.
+ */
+ if (ic_myaddr == NONE ||
+#ifdef CONFIG_ROOT_NFS
+ (root_server_addr == NONE &&
+ ic_servaddr == NONE &&
+ ROOT_DEV == Root_NFS) ||
+#endif
+ ic_first_dev->next) {
+#ifdef IPCONFIG_DYNAMIC
+ if (ic_dynamic() < 0) {
+ ic_close_devs();
+
+ /*
+ * I don't know why, but sometimes the
+ * eepro100 driver (at least) gets upset and
+ * doesn't work the first time it's opened.
+ * But then if you close it and reopen it, it
+ * works just fine. So we need to try that at
+ * least once before giving up.
+ *
+ * Also, if the root will be NFS-mounted, we
+ * have nowhere to go if DHCP fails. So we
+ * just have to keep trying forever.
+ *
+ * -- Chip
+ */
+#ifdef CONFIG_ROOT_NFS
+ if (ROOT_DEV == Root_NFS) {
+ pr_err("IP-Config: Retrying forever (NFS root)...\n");
+ goto try_try_again;
+ }
+#endif
+
+ if (--retries) {
+ pr_err("IP-Config: Reopening network devices...\n");
+ goto try_try_again;
+ }
+
+ /* Oh, well. At least we tried. */
+ pr_err("IP-Config: Auto-configuration of network failed\n");
+ return -1;
+ }
+#else /* !DYNAMIC */
+ pr_err("IP-Config: Incomplete network configuration information\n");
+ ic_close_devs();
+ return -1;
+#endif /* IPCONFIG_DYNAMIC */
+ } else {
+ /* Device selected manually or only one device -> use it */
+ ic_dev = ic_first_dev->dev;
+ }
+
+ addr = root_nfs_parse_addr(root_server_path);
+ if (root_server_addr == NONE)
+ root_server_addr = addr;
+
+ /*
+ * Use defaults wherever applicable.
+ */
+ if (ic_defaults() < 0)
+ return -1;
+
+ /*
+ * Close all network devices except the device we've
+ * autoconfigured and set up routes.
+ */
+ ic_close_devs();
+ if (ic_setup_if() < 0 || ic_setup_routes() < 0)
+ return -1;
+
+ /*
+ * Record which protocol was actually used.
+ */
+#ifdef IPCONFIG_DYNAMIC
+ ic_proto_used = ic_got_reply | (ic_proto_enabled & IC_USE_DHCP);
+#endif
+
+#ifndef IPCONFIG_SILENT
+ /*
+ * Clue in the operator.
+ */
+ pr_info("IP-Config: Complete:\n");
+
+ pr_info(" device=%s, hwaddr=%*phC, ipaddr=%pI4, mask=%pI4, gw=%pI4\n",
+ ic_dev->name, ic_dev->addr_len, ic_dev->dev_addr,
+ &ic_myaddr, &ic_netmask, &ic_gateway);
+ pr_info(" host=%s, domain=%s, nis-domain=%s\n",
+ utsname()->nodename, ic_domain, utsname()->domainname);
+ pr_info(" bootserver=%pI4, rootserver=%pI4, rootpath=%s",
+ &ic_servaddr, &root_server_addr, root_server_path);
+ if (ic_dev_mtu)
+ pr_cont(", mtu=%d", ic_dev_mtu);
+ for (i = 0; i < CONF_NAMESERVERS_MAX; i++)
+ if (ic_nameservers[i] != NONE) {
+ pr_info(" nameserver%u=%pI4",
+ i, &ic_nameservers[i]);
+ break;
+ }
+ for (i++; i < CONF_NAMESERVERS_MAX; i++)
+ if (ic_nameservers[i] != NONE)
+ pr_cont(", nameserver%u=%pI4", i, &ic_nameservers[i]);
+ pr_cont("\n");
+#endif /* !SILENT */
+
+ return 0;
+}
+
+late_initcall(ip_auto_config);
+
+
+/*
+ * Decode any IP configuration options in the "ip=" or "nfsaddrs=" kernel
+ * command line parameter. See Documentation/filesystems/nfs/nfsroot.txt.
+ */
+static int __init ic_proto_name(char *name)
+{
+ if (!strcmp(name, "on") || !strcmp(name, "any")) {
+ return 1;
+ }
+ if (!strcmp(name, "off") || !strcmp(name, "none")) {
+ return 0;
+ }
+#ifdef CONFIG_IP_PNP_DHCP
+ else if (!strcmp(name, "dhcp")) {
+ ic_proto_enabled &= ~IC_RARP;
+ return 1;
+ }
+#endif
+#ifdef CONFIG_IP_PNP_BOOTP
+ else if (!strcmp(name, "bootp")) {
+ ic_proto_enabled &= ~(IC_RARP | IC_USE_DHCP);
+ return 1;
+ }
+#endif
+#ifdef CONFIG_IP_PNP_RARP
+ else if (!strcmp(name, "rarp")) {
+ ic_proto_enabled &= ~(IC_BOOTP | IC_USE_DHCP);
+ return 1;
+ }
+#endif
+#ifdef IPCONFIG_DYNAMIC
+ else if (!strcmp(name, "both")) {
+ ic_proto_enabled &= ~IC_USE_DHCP; /* backward compat :-( */
+ return 1;
+ }
+#endif
+ return 0;
+}
+
+static int __init ip_auto_config_setup(char *addrs)
+{
+ char *cp, *ip, *dp;
+ int num = 0;
+
+ ic_set_manually = 1;
+ ic_enable = 1;
+
+ /*
+ * If any dhcp, bootp etc options are set, leave autoconfig on
+ * and skip the below static IP processing.
+ */
+ if (ic_proto_name(addrs))
+ return 1;
+
+ /* If no static IP is given, turn off autoconfig and bail. */
+ if (*addrs == 0 ||
+ strcmp(addrs, "off") == 0 ||
+ strcmp(addrs, "none") == 0) {
+ ic_enable = 0;
+ return 1;
+ }
+
+ ic_nameservers_predef();
+
+ /* Parse string for static IP assignment. */
+ ip = addrs;
+ while (ip && *ip) {
+ if ((cp = strchr(ip, ':')))
+ *cp++ = '\0';
+ if (strlen(ip) > 0) {
+ DBG(("IP-Config: Parameter #%d: `%s'\n", num, ip));
+ switch (num) {
+ case 0:
+ if ((ic_myaddr = in_aton(ip)) == ANY)
+ ic_myaddr = NONE;
+ break;
+ case 1:
+ if ((ic_servaddr = in_aton(ip)) == ANY)
+ ic_servaddr = NONE;
+ break;
+ case 2:
+ if ((ic_gateway = in_aton(ip)) == ANY)
+ ic_gateway = NONE;
+ break;
+ case 3:
+ if ((ic_netmask = in_aton(ip)) == ANY)
+ ic_netmask = NONE;
+ break;
+ case 4:
+ if ((dp = strchr(ip, '.'))) {
+ *dp++ = '\0';
+ strlcpy(utsname()->domainname, dp,
+ sizeof(utsname()->domainname));
+ }
+ strlcpy(utsname()->nodename, ip,
+ sizeof(utsname()->nodename));
+ ic_host_name_set = 1;
+ break;
+ case 5:
+ strlcpy(user_dev_name, ip, sizeof(user_dev_name));
+ break;
+ case 6:
+ if (ic_proto_name(ip) == 0 &&
+ ic_myaddr == NONE) {
+ ic_enable = 0;
+ }
+ break;
+ case 7:
+ if (CONF_NAMESERVERS_MAX >= 1) {
+ ic_nameservers[0] = in_aton(ip);
+ if (ic_nameservers[0] == ANY)
+ ic_nameservers[0] = NONE;
+ }
+ break;
+ case 8:
+ if (CONF_NAMESERVERS_MAX >= 2) {
+ ic_nameservers[1] = in_aton(ip);
+ if (ic_nameservers[1] == ANY)
+ ic_nameservers[1] = NONE;
+ }
+ break;
+ }
+ }
+ ip = cp;
+ num++;
+ }
+
+ return 1;
+}
+__setup("ip=", ip_auto_config_setup);
+
+static int __init nfsaddrs_config_setup(char *addrs)
+{
+ return ip_auto_config_setup(addrs);
+}
+__setup("nfsaddrs=", nfsaddrs_config_setup);
+
+static int __init vendor_class_identifier_setup(char *addrs)
+{
+ if (strlcpy(vendor_class_identifier, addrs,
+ sizeof(vendor_class_identifier))
+ >= sizeof(vendor_class_identifier))
+ pr_warn("DHCP: vendorclass too long, truncated to \"%s\"",
+ vendor_class_identifier);
+ return 1;
+}
+__setup("dhcpclass=", vendor_class_identifier_setup);
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
new file mode 100644
index 000000000..ff96396eb
--- /dev/null
+++ b/net/ipv4/ipip.c
@@ -0,0 +1,569 @@
+/*
+ * Linux NET3: IP/IP protocol decoder.
+ *
+ * Authors:
+ * Sam Lantinga (slouken@cs.ucdavis.edu) 02/01/95
+ *
+ * Fixes:
+ * Alan Cox : Merged and made usable non modular (its so tiny its silly as
+ * a module taking up 2 pages).
+ * Alan Cox : Fixed bug with 1.3.18 and IPIP not working (now needs to set skb->h.iph)
+ * to keep ip_forward happy.
+ * Alan Cox : More fixes for 1.3.21, and firewall fix. Maybe this will work soon 8).
+ * Kai Schulte : Fixed #defines for IP_FIREWALL->FIREWALL
+ * David Woodhouse : Perform some basic ICMP handling.
+ * IPIP Routing without decapsulation.
+ * Carlos Picoto : GRE over IP support
+ * Alexey Kuznetsov: Reworked. Really, now it is truncated version of ipv4/ip_gre.c.
+ * I do not want to merge them together.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+
+/* tunnel.c: an IP tunnel driver
+
+ The purpose of this driver is to provide an IP tunnel through
+ which you can tunnel network traffic transparently across subnets.
+
+ This was written by looking at Nick Holloway's dummy driver
+ Thanks for the great code!
+
+ -Sam Lantinga (slouken@cs.ucdavis.edu) 02/01/95
+
+ Minor tweaks:
+ Cleaned up the code a little and added some pre-1.3.0 tweaks.
+ dev->hard_header/hard_header_len changed to use no headers.
+ Comments/bracketing tweaked.
+ Made the tunnels use dev->name not tunnel: when error reporting.
+ Added tx_dropped stat
+
+ -Alan Cox (alan@lxorguk.ukuu.org.uk) 21 March 95
+
+ Reworked:
+ Changed to tunnel to destination gateway in addition to the
+ tunnel's pointopoint address
+ Almost completely rewritten
+ Note: There is currently no firewall or ICMP handling done.
+
+ -Sam Lantinga (slouken@cs.ucdavis.edu) 02/13/96
+
+*/
+
+/* Things I wish I had known when writing the tunnel driver:
+
+ When the tunnel_xmit() function is called, the skb contains the
+ packet to be sent (plus a great deal of extra info), and dev
+ contains the tunnel device that _we_ are.
+
+ When we are passed a packet, we are expected to fill in the
+ source address with our source IP address.
+
+ What is the proper way to allocate, copy and free a buffer?
+ After you allocate it, it is a "0 length" chunk of memory
+ starting at zero. If you want to add headers to the buffer
+ later, you'll have to call "skb_reserve(skb, amount)" with
+ the amount of memory you want reserved. Then, you call
+ "skb_put(skb, amount)" with the amount of space you want in
+ the buffer. skb_put() returns a pointer to the top (#0) of
+ that buffer. skb->len is set to the amount of space you have
+ "allocated" with skb_put(). You can then write up to skb->len
+ bytes to that buffer. If you need more, you can call skb_put()
+ again with the additional amount of space you need. You can
+ find out how much more space you can allocate by calling
+ "skb_tailroom(skb)".
+ Now, to add header space, call "skb_push(skb, header_len)".
+ This creates space at the beginning of the buffer and returns
+ a pointer to this new space. If later you need to strip a
+ header from a buffer, call "skb_pull(skb, header_len)".
+ skb_headroom() will return how much space is left at the top
+ of the buffer (before the main data). Remember, this headroom
+ space must be reserved before the skb_put() function is called.
+ */
+
+/*
+ This version of net/ipv4/ipip.c is cloned of net/ipv4/ip_gre.c
+
+ For comments look at net/ipv4/ip_gre.c --ANK
+ */
+
+
+#include <linux/capability.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <asm/uaccess.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/in.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/if_arp.h>
+#include <linux/mroute.h>
+#include <linux/init.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/if_ether.h>
+
+#include <net/sock.h>
+#include <net/ip.h>
+#include <net/icmp.h>
+#include <net/ip_tunnels.h>
+#include <net/inet_ecn.h>
+#include <net/xfrm.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+
+static bool log_ecn_error = true;
+module_param(log_ecn_error, bool, 0644);
+MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");
+
+static int ipip_net_id __read_mostly;
+
+static int ipip_tunnel_init(struct net_device *dev);
+static struct rtnl_link_ops ipip_link_ops __read_mostly;
+
+static int ipip_err(struct sk_buff *skb, u32 info)
+{
+
+/* All the routers (except for Linux) return only
+ 8 bytes of packet payload. It means, that precise relaying of
+ ICMP in the real Internet is absolutely infeasible.
+ */
+ struct net *net = dev_net(skb->dev);
+ struct ip_tunnel_net *itn = net_generic(net, ipip_net_id);
+ const struct iphdr *iph = (const struct iphdr *)skb->data;
+ struct ip_tunnel *t;
+ int err;
+ const int type = icmp_hdr(skb)->type;
+ const int code = icmp_hdr(skb)->code;
+
+ err = -ENOENT;
+ t = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY,
+ iph->daddr, iph->saddr, 0);
+ if (!t)
+ goto out;
+
+ if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
+ ipv4_update_pmtu(skb, dev_net(skb->dev), info,
+ t->parms.link, 0, IPPROTO_IPIP, 0);
+ err = 0;
+ goto out;
+ }
+
+ if (type == ICMP_REDIRECT) {
+ ipv4_redirect(skb, dev_net(skb->dev), t->parms.link, 0,
+ IPPROTO_IPIP, 0);
+ err = 0;
+ goto out;
+ }
+
+ if (t->parms.iph.daddr == 0)
+ goto out;
+
+ err = 0;
+ if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
+ goto out;
+
+ if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
+ t->err_count++;
+ else
+ t->err_count = 1;
+ t->err_time = jiffies;
+
+out:
+ return err;
+}
+
+static const struct tnl_ptk_info tpi = {
+ /* no tunnel info required for ipip. */
+ .proto = htons(ETH_P_IP),
+};
+
+static int ipip_rcv(struct sk_buff *skb)
+{
+ struct net *net = dev_net(skb->dev);
+ struct ip_tunnel_net *itn = net_generic(net, ipip_net_id);
+ struct ip_tunnel *tunnel;
+ const struct iphdr *iph;
+
+ iph = ip_hdr(skb);
+ tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY,
+ iph->saddr, iph->daddr, 0);
+ if (tunnel) {
+ if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
+ goto drop;
+ if (iptunnel_pull_header(skb, 0, tpi.proto))
+ goto drop;
+ return ip_tunnel_rcv(tunnel, skb, &tpi, log_ecn_error);
+ }
+
+ return -1;
+
+drop:
+ kfree_skb(skb);
+ return 0;
+}
+
+/*
+ * This function assumes it is being called from dev_queue_xmit()
+ * and that skb is filled properly by that function.
+ */
+static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ const struct iphdr *tiph = &tunnel->parms.iph;
+
+ if (unlikely(skb->protocol != htons(ETH_P_IP)))
+ goto tx_error;
+
+ skb = iptunnel_handle_offloads(skb, false, SKB_GSO_IPIP);
+ if (IS_ERR(skb))
+ goto out;
+
+ skb_set_inner_ipproto(skb, IPPROTO_IPIP);
+
+ ip_tunnel_xmit(skb, dev, tiph, tiph->protocol);
+ return NETDEV_TX_OK;
+
+tx_error:
+ kfree_skb(skb);
+out:
+ dev->stats.tx_errors++;
+ return NETDEV_TX_OK;
+}
+
+static int
+ipip_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
+{
+ int err = 0;
+ struct ip_tunnel_parm p;
+
+ if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
+ return -EFAULT;
+
+ if (cmd == SIOCADDTUNNEL || cmd == SIOCCHGTUNNEL) {
+ if (p.iph.version != 4 || p.iph.protocol != IPPROTO_IPIP ||
+ p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)))
+ return -EINVAL;
+ }
+
+ p.i_key = p.o_key = p.i_flags = p.o_flags = 0;
+ if (p.iph.ttl)
+ p.iph.frag_off |= htons(IP_DF);
+
+ err = ip_tunnel_ioctl(dev, &p, cmd);
+ if (err)
+ return err;
+
+ if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
+ return -EFAULT;
+
+ return 0;
+}
+
+static const struct net_device_ops ipip_netdev_ops = {
+ .ndo_init = ipip_tunnel_init,
+ .ndo_uninit = ip_tunnel_uninit,
+ .ndo_start_xmit = ipip_tunnel_xmit,
+ .ndo_do_ioctl = ipip_tunnel_ioctl,
+ .ndo_change_mtu = ip_tunnel_change_mtu,
+ .ndo_get_stats64 = ip_tunnel_get_stats64,
+ .ndo_get_iflink = ip_tunnel_get_iflink,
+};
+
+#define IPIP_FEATURES (NETIF_F_SG | \
+ NETIF_F_FRAGLIST | \
+ NETIF_F_HIGHDMA | \
+ NETIF_F_GSO_SOFTWARE | \
+ NETIF_F_HW_CSUM)
+
+static void ipip_tunnel_setup(struct net_device *dev)
+{
+ dev->netdev_ops = &ipip_netdev_ops;
+
+ dev->type = ARPHRD_TUNNEL;
+ dev->flags = IFF_NOARP;
+ dev->addr_len = 4;
+ dev->features |= NETIF_F_LLTX;
+ netif_keep_dst(dev);
+
+ dev->features |= IPIP_FEATURES;
+ dev->hw_features |= IPIP_FEATURES;
+ ip_tunnel_setup(dev, ipip_net_id);
+}
+
+static int ipip_tunnel_init(struct net_device *dev)
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+
+ memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
+ memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
+
+ tunnel->tun_hlen = 0;
+ tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen;
+ tunnel->parms.iph.protocol = IPPROTO_IPIP;
+ return ip_tunnel_init(dev);
+}
+
+static void ipip_netlink_parms(struct nlattr *data[],
+ struct ip_tunnel_parm *parms)
+{
+ memset(parms, 0, sizeof(*parms));
+
+ parms->iph.version = 4;
+ parms->iph.protocol = IPPROTO_IPIP;
+ parms->iph.ihl = 5;
+
+ if (!data)
+ return;
+
+ if (data[IFLA_IPTUN_LINK])
+ parms->link = nla_get_u32(data[IFLA_IPTUN_LINK]);
+
+ if (data[IFLA_IPTUN_LOCAL])
+ parms->iph.saddr = nla_get_in_addr(data[IFLA_IPTUN_LOCAL]);
+
+ if (data[IFLA_IPTUN_REMOTE])
+ parms->iph.daddr = nla_get_in_addr(data[IFLA_IPTUN_REMOTE]);
+
+ if (data[IFLA_IPTUN_TTL]) {
+ parms->iph.ttl = nla_get_u8(data[IFLA_IPTUN_TTL]);
+ if (parms->iph.ttl)
+ parms->iph.frag_off = htons(IP_DF);
+ }
+
+ if (data[IFLA_IPTUN_TOS])
+ parms->iph.tos = nla_get_u8(data[IFLA_IPTUN_TOS]);
+
+ if (!data[IFLA_IPTUN_PMTUDISC] || nla_get_u8(data[IFLA_IPTUN_PMTUDISC]))
+ parms->iph.frag_off = htons(IP_DF);
+}
+
+/* This function returns true when ENCAP attributes are present in the nl msg */
+static bool ipip_netlink_encap_parms(struct nlattr *data[],
+ struct ip_tunnel_encap *ipencap)
+{
+ bool ret = false;
+
+ memset(ipencap, 0, sizeof(*ipencap));
+
+ if (!data)
+ return ret;
+
+ if (data[IFLA_IPTUN_ENCAP_TYPE]) {
+ ret = true;
+ ipencap->type = nla_get_u16(data[IFLA_IPTUN_ENCAP_TYPE]);
+ }
+
+ if (data[IFLA_IPTUN_ENCAP_FLAGS]) {
+ ret = true;
+ ipencap->flags = nla_get_u16(data[IFLA_IPTUN_ENCAP_FLAGS]);
+ }
+
+ if (data[IFLA_IPTUN_ENCAP_SPORT]) {
+ ret = true;
+ ipencap->sport = nla_get_be16(data[IFLA_IPTUN_ENCAP_SPORT]);
+ }
+
+ if (data[IFLA_IPTUN_ENCAP_DPORT]) {
+ ret = true;
+ ipencap->dport = nla_get_be16(data[IFLA_IPTUN_ENCAP_DPORT]);
+ }
+
+ return ret;
+}
+
+static int ipip_newlink(struct net *src_net, struct net_device *dev,
+ struct nlattr *tb[], struct nlattr *data[])
+{
+ struct ip_tunnel_parm p;
+ struct ip_tunnel_encap ipencap;
+
+ if (ipip_netlink_encap_parms(data, &ipencap)) {
+ struct ip_tunnel *t = netdev_priv(dev);
+ int err = ip_tunnel_encap_setup(t, &ipencap);
+
+ if (err < 0)
+ return err;
+ }
+
+ ipip_netlink_parms(data, &p);
+ return ip_tunnel_newlink(dev, tb, &p);
+}
+
+static int ipip_changelink(struct net_device *dev, struct nlattr *tb[],
+ struct nlattr *data[])
+{
+ struct ip_tunnel_parm p;
+ struct ip_tunnel_encap ipencap;
+
+ if (ipip_netlink_encap_parms(data, &ipencap)) {
+ struct ip_tunnel *t = netdev_priv(dev);
+ int err = ip_tunnel_encap_setup(t, &ipencap);
+
+ if (err < 0)
+ return err;
+ }
+
+ ipip_netlink_parms(data, &p);
+
+ if (((dev->flags & IFF_POINTOPOINT) && !p.iph.daddr) ||
+ (!(dev->flags & IFF_POINTOPOINT) && p.iph.daddr))
+ return -EINVAL;
+
+ return ip_tunnel_changelink(dev, tb, &p);
+}
+
+static size_t ipip_get_size(const struct net_device *dev)
+{
+ return
+ /* IFLA_IPTUN_LINK */
+ nla_total_size(4) +
+ /* IFLA_IPTUN_LOCAL */
+ nla_total_size(4) +
+ /* IFLA_IPTUN_REMOTE */
+ nla_total_size(4) +
+ /* IFLA_IPTUN_TTL */
+ nla_total_size(1) +
+ /* IFLA_IPTUN_TOS */
+ nla_total_size(1) +
+ /* IFLA_IPTUN_PMTUDISC */
+ nla_total_size(1) +
+ /* IFLA_IPTUN_ENCAP_TYPE */
+ nla_total_size(2) +
+ /* IFLA_IPTUN_ENCAP_FLAGS */
+ nla_total_size(2) +
+ /* IFLA_IPTUN_ENCAP_SPORT */
+ nla_total_size(2) +
+ /* IFLA_IPTUN_ENCAP_DPORT */
+ nla_total_size(2) +
+ 0;
+}
+
+static int ipip_fill_info(struct sk_buff *skb, const struct net_device *dev)
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ struct ip_tunnel_parm *parm = &tunnel->parms;
+
+ if (nla_put_u32(skb, IFLA_IPTUN_LINK, parm->link) ||
+ nla_put_in_addr(skb, IFLA_IPTUN_LOCAL, parm->iph.saddr) ||
+ nla_put_in_addr(skb, IFLA_IPTUN_REMOTE, parm->iph.daddr) ||
+ nla_put_u8(skb, IFLA_IPTUN_TTL, parm->iph.ttl) ||
+ nla_put_u8(skb, IFLA_IPTUN_TOS, parm->iph.tos) ||
+ nla_put_u8(skb, IFLA_IPTUN_PMTUDISC,
+ !!(parm->iph.frag_off & htons(IP_DF))))
+ goto nla_put_failure;
+
+ if (nla_put_u16(skb, IFLA_IPTUN_ENCAP_TYPE,
+ tunnel->encap.type) ||
+ nla_put_be16(skb, IFLA_IPTUN_ENCAP_SPORT,
+ tunnel->encap.sport) ||
+ nla_put_be16(skb, IFLA_IPTUN_ENCAP_DPORT,
+ tunnel->encap.dport) ||
+ nla_put_u16(skb, IFLA_IPTUN_ENCAP_FLAGS,
+ tunnel->encap.flags))
+ goto nla_put_failure;
+
+ return 0;
+
+nla_put_failure:
+ return -EMSGSIZE;
+}
+
+static const struct nla_policy ipip_policy[IFLA_IPTUN_MAX + 1] = {
+ [IFLA_IPTUN_LINK] = { .type = NLA_U32 },
+ [IFLA_IPTUN_LOCAL] = { .type = NLA_U32 },
+ [IFLA_IPTUN_REMOTE] = { .type = NLA_U32 },
+ [IFLA_IPTUN_TTL] = { .type = NLA_U8 },
+ [IFLA_IPTUN_TOS] = { .type = NLA_U8 },
+ [IFLA_IPTUN_PMTUDISC] = { .type = NLA_U8 },
+ [IFLA_IPTUN_ENCAP_TYPE] = { .type = NLA_U16 },
+ [IFLA_IPTUN_ENCAP_FLAGS] = { .type = NLA_U16 },
+ [IFLA_IPTUN_ENCAP_SPORT] = { .type = NLA_U16 },
+ [IFLA_IPTUN_ENCAP_DPORT] = { .type = NLA_U16 },
+};
+
+static struct rtnl_link_ops ipip_link_ops __read_mostly = {
+ .kind = "ipip",
+ .maxtype = IFLA_IPTUN_MAX,
+ .policy = ipip_policy,
+ .priv_size = sizeof(struct ip_tunnel),
+ .setup = ipip_tunnel_setup,
+ .newlink = ipip_newlink,
+ .changelink = ipip_changelink,
+ .dellink = ip_tunnel_dellink,
+ .get_size = ipip_get_size,
+ .fill_info = ipip_fill_info,
+ .get_link_net = ip_tunnel_get_link_net,
+};
+
+static struct xfrm_tunnel ipip_handler __read_mostly = {
+ .handler = ipip_rcv,
+ .err_handler = ipip_err,
+ .priority = 1,
+};
+
+static int __net_init ipip_init_net(struct net *net)
+{
+ return ip_tunnel_init_net(net, ipip_net_id, &ipip_link_ops, "tunl0");
+}
+
+static void __net_exit ipip_exit_net(struct net *net)
+{
+ struct ip_tunnel_net *itn = net_generic(net, ipip_net_id);
+ ip_tunnel_delete_net(itn, &ipip_link_ops);
+}
+
+static struct pernet_operations ipip_net_ops = {
+ .init = ipip_init_net,
+ .exit = ipip_exit_net,
+ .id = &ipip_net_id,
+ .size = sizeof(struct ip_tunnel_net),
+};
+
+static int __init ipip_init(void)
+{
+ int err;
+
+ pr_info("ipip: IPv4 over IPv4 tunneling driver\n");
+
+ err = register_pernet_device(&ipip_net_ops);
+ if (err < 0)
+ return err;
+ err = xfrm4_tunnel_register(&ipip_handler, AF_INET);
+ if (err < 0) {
+ pr_info("%s: can't register tunnel\n", __func__);
+ goto xfrm_tunnel_failed;
+ }
+ err = rtnl_link_register(&ipip_link_ops);
+ if (err < 0)
+ goto rtnl_link_failed;
+
+out:
+ return err;
+
+rtnl_link_failed:
+ xfrm4_tunnel_deregister(&ipip_handler, AF_INET);
+xfrm_tunnel_failed:
+ unregister_pernet_device(&ipip_net_ops);
+ goto out;
+}
+
+static void __exit ipip_fini(void)
+{
+ rtnl_link_unregister(&ipip_link_ops);
+ if (xfrm4_tunnel_deregister(&ipip_handler, AF_INET))
+ pr_info("%s: can't deregister tunnel\n", __func__);
+
+ unregister_pernet_device(&ipip_net_ops);
+}
+
+module_init(ipip_init);
+module_exit(ipip_fini);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_RTNL_LINK("ipip");
+MODULE_ALIAS_NETDEV("tunl0");
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
new file mode 100644
index 000000000..3a2c0162c
--- /dev/null
+++ b/net/ipv4/ipmr.c
@@ -0,0 +1,2792 @@
+/*
+ * IP multicast routing support for mrouted 3.6/3.8
+ *
+ * (c) 1995 Alan Cox, <alan@lxorguk.ukuu.org.uk>
+ * Linux Consultancy and Custom Driver Development
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Fixes:
+ * Michael Chastain : Incorrect size of copying.
+ * Alan Cox : Added the cache manager code
+ * Alan Cox : Fixed the clone/copy bug and device race.
+ * Mike McLagan : Routing by source
+ * Malcolm Beattie : Buffer handling fixes.
+ * Alexey Kuznetsov : Double buffer free and other fixes.
+ * SVR Anand : Fixed several multicast bugs and problems.
+ * Alexey Kuznetsov : Status, optimisations and more.
+ * Brad Parker : Better behaviour on mrouted upcall
+ * overflow.
+ * Carlos Picoto : PIMv1 Support
+ * Pavlin Ivanov Radoslavov: PIMv2 Registers must checksum only PIM header
+ * Relax this requirement to work with older peers.
+ *
+ */
+
+#include <asm/uaccess.h>
+#include <linux/types.h>
+#include <linux/capability.h>
+#include <linux/errno.h>
+#include <linux/timer.h>
+#include <linux/mm.h>
+#include <linux/kernel.h>
+#include <linux/fcntl.h>
+#include <linux/stat.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/inetdevice.h>
+#include <linux/igmp.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/mroute.h>
+#include <linux/init.h>
+#include <linux/if_ether.h>
+#include <linux/slab.h>
+#include <net/net_namespace.h>
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <linux/skbuff.h>
+#include <net/route.h>
+#include <net/sock.h>
+#include <net/icmp.h>
+#include <net/udp.h>
+#include <net/raw.h>
+#include <linux/notifier.h>
+#include <linux/if_arp.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/compat.h>
+#include <linux/export.h>
+#include <net/ip_tunnels.h>
+#include <net/checksum.h>
+#include <net/netlink.h>
+#include <net/fib_rules.h>
+#include <linux/netconf.h>
+
+#if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2)
+#define CONFIG_IP_PIMSM 1
+#endif
+
+struct mr_table {
+ struct list_head list;
+ possible_net_t net;
+ u32 id;
+ struct sock __rcu *mroute_sk;
+ struct timer_list ipmr_expire_timer;
+ struct list_head mfc_unres_queue;
+ struct list_head mfc_cache_array[MFC_LINES];
+ struct vif_device vif_table[MAXVIFS];
+ int maxvif;
+ atomic_t cache_resolve_queue_len;
+ bool mroute_do_assert;
+ bool mroute_do_pim;
+#if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2)
+ int mroute_reg_vif_num;
+#endif
+};
+
+struct ipmr_rule {
+ struct fib_rule common;
+};
+
+struct ipmr_result {
+ struct mr_table *mrt;
+};
+
+/* Big lock, protecting vif table, mrt cache and mroute socket state.
+ * Note that the changes are semaphored via rtnl_lock.
+ */
+
+static DEFINE_RWLOCK(mrt_lock);
+
+/*
+ * Multicast router control variables
+ */
+
+#define VIF_EXISTS(_mrt, _idx) ((_mrt)->vif_table[_idx].dev != NULL)
+
+/* Special spinlock for queue of unresolved entries */
+static DEFINE_SPINLOCK(mfc_unres_lock);
+
+/* We return to original Alan's scheme. Hash table of resolved
+ * entries is changed only in process context and protected
+ * with weak lock mrt_lock. Queue of unresolved entries is protected
+ * with strong spinlock mfc_unres_lock.
+ *
+ * In this case data path is free of exclusive locks at all.
+ */
+
+static struct kmem_cache *mrt_cachep __read_mostly;
+
+static struct mr_table *ipmr_new_table(struct net *net, u32 id);
+static void ipmr_free_table(struct mr_table *mrt);
+
+static void ip_mr_forward(struct net *net, struct mr_table *mrt,
+ struct sk_buff *skb, struct mfc_cache *cache,
+ int local);
+static int ipmr_cache_report(struct mr_table *mrt,
+ struct sk_buff *pkt, vifi_t vifi, int assert);
+static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
+ struct mfc_cache *c, struct rtmsg *rtm);
+static void mroute_netlink_event(struct mr_table *mrt, struct mfc_cache *mfc,
+ int cmd);
+static void mroute_clean_tables(struct mr_table *mrt);
+static void ipmr_expire_process(unsigned long arg);
+
+#ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES
+#define ipmr_for_each_table(mrt, net) \
+ list_for_each_entry_rcu(mrt, &net->ipv4.mr_tables, list)
+
+static struct mr_table *ipmr_get_table(struct net *net, u32 id)
+{
+ struct mr_table *mrt;
+
+ ipmr_for_each_table(mrt, net) {
+ if (mrt->id == id)
+ return mrt;
+ }
+ return NULL;
+}
+
+static int ipmr_fib_lookup(struct net *net, struct flowi4 *flp4,
+ struct mr_table **mrt)
+{
+ int err;
+ struct ipmr_result res;
+ struct fib_lookup_arg arg = {
+ .result = &res,
+ .flags = FIB_LOOKUP_NOREF,
+ };
+
+ err = fib_rules_lookup(net->ipv4.mr_rules_ops,
+ flowi4_to_flowi(flp4), 0, &arg);
+ if (err < 0)
+ return err;
+ *mrt = res.mrt;
+ return 0;
+}
+
+static int ipmr_rule_action(struct fib_rule *rule, struct flowi *flp,
+ int flags, struct fib_lookup_arg *arg)
+{
+ struct ipmr_result *res = arg->result;
+ struct mr_table *mrt;
+
+ switch (rule->action) {
+ case FR_ACT_TO_TBL:
+ break;
+ case FR_ACT_UNREACHABLE:
+ return -ENETUNREACH;
+ case FR_ACT_PROHIBIT:
+ return -EACCES;
+ case FR_ACT_BLACKHOLE:
+ default:
+ return -EINVAL;
+ }
+
+ mrt = ipmr_get_table(rule->fr_net, rule->table);
+ if (!mrt)
+ return -EAGAIN;
+ res->mrt = mrt;
+ return 0;
+}
+
+static int ipmr_rule_match(struct fib_rule *rule, struct flowi *fl, int flags)
+{
+ return 1;
+}
+
+static const struct nla_policy ipmr_rule_policy[FRA_MAX + 1] = {
+ FRA_GENERIC_POLICY,
+};
+
+static int ipmr_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
+ struct fib_rule_hdr *frh, struct nlattr **tb)
+{
+ return 0;
+}
+
+static int ipmr_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh,
+ struct nlattr **tb)
+{
+ return 1;
+}
+
+static int ipmr_rule_fill(struct fib_rule *rule, struct sk_buff *skb,
+ struct fib_rule_hdr *frh)
+{
+ frh->dst_len = 0;
+ frh->src_len = 0;
+ frh->tos = 0;
+ return 0;
+}
+
+static const struct fib_rules_ops __net_initconst ipmr_rules_ops_template = {
+ .family = RTNL_FAMILY_IPMR,
+ .rule_size = sizeof(struct ipmr_rule),
+ .addr_size = sizeof(u32),
+ .action = ipmr_rule_action,
+ .match = ipmr_rule_match,
+ .configure = ipmr_rule_configure,
+ .compare = ipmr_rule_compare,
+ .default_pref = fib_default_rule_pref,
+ .fill = ipmr_rule_fill,
+ .nlgroup = RTNLGRP_IPV4_RULE,
+ .policy = ipmr_rule_policy,
+ .owner = THIS_MODULE,
+};
+
+static int __net_init ipmr_rules_init(struct net *net)
+{
+ struct fib_rules_ops *ops;
+ struct mr_table *mrt;
+ int err;
+
+ ops = fib_rules_register(&ipmr_rules_ops_template, net);
+ if (IS_ERR(ops))
+ return PTR_ERR(ops);
+
+ INIT_LIST_HEAD(&net->ipv4.mr_tables);
+
+ mrt = ipmr_new_table(net, RT_TABLE_DEFAULT);
+ if (!mrt) {
+ err = -ENOMEM;
+ goto err1;
+ }
+
+ err = fib_default_rule_add(ops, 0x7fff, RT_TABLE_DEFAULT, 0);
+ if (err < 0)
+ goto err2;
+
+ net->ipv4.mr_rules_ops = ops;
+ return 0;
+
+err2:
+ ipmr_free_table(mrt);
+err1:
+ fib_rules_unregister(ops);
+ return err;
+}
+
+static void __net_exit ipmr_rules_exit(struct net *net)
+{
+ struct mr_table *mrt, *next;
+
+ rtnl_lock();
+ list_for_each_entry_safe(mrt, next, &net->ipv4.mr_tables, list) {
+ list_del(&mrt->list);
+ ipmr_free_table(mrt);
+ }
+ fib_rules_unregister(net->ipv4.mr_rules_ops);
+ rtnl_unlock();
+}
+#else
+#define ipmr_for_each_table(mrt, net) \
+ for (mrt = net->ipv4.mrt; mrt; mrt = NULL)
+
+static struct mr_table *ipmr_get_table(struct net *net, u32 id)
+{
+ return net->ipv4.mrt;
+}
+
+static int ipmr_fib_lookup(struct net *net, struct flowi4 *flp4,
+ struct mr_table **mrt)
+{
+ *mrt = net->ipv4.mrt;
+ return 0;
+}
+
+static int __net_init ipmr_rules_init(struct net *net)
+{
+ net->ipv4.mrt = ipmr_new_table(net, RT_TABLE_DEFAULT);
+ return net->ipv4.mrt ? 0 : -ENOMEM;
+}
+
+static void __net_exit ipmr_rules_exit(struct net *net)
+{
+ rtnl_lock();
+ ipmr_free_table(net->ipv4.mrt);
+ net->ipv4.mrt = NULL;
+ rtnl_unlock();
+}
+#endif
+
+static struct mr_table *ipmr_new_table(struct net *net, u32 id)
+{
+ struct mr_table *mrt;
+ unsigned int i;
+
+ mrt = ipmr_get_table(net, id);
+ if (mrt)
+ return mrt;
+
+ mrt = kzalloc(sizeof(*mrt), GFP_KERNEL);
+ if (!mrt)
+ return NULL;
+ write_pnet(&mrt->net, net);
+ mrt->id = id;
+
+ /* Forwarding cache */
+ for (i = 0; i < MFC_LINES; i++)
+ INIT_LIST_HEAD(&mrt->mfc_cache_array[i]);
+
+ INIT_LIST_HEAD(&mrt->mfc_unres_queue);
+
+ setup_timer(&mrt->ipmr_expire_timer, ipmr_expire_process,
+ (unsigned long)mrt);
+
+#ifdef CONFIG_IP_PIMSM
+ mrt->mroute_reg_vif_num = -1;
+#endif
+#ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES
+ list_add_tail_rcu(&mrt->list, &net->ipv4.mr_tables);
+#endif
+ return mrt;
+}
+
+static void ipmr_free_table(struct mr_table *mrt)
+{
+ del_timer_sync(&mrt->ipmr_expire_timer);
+ mroute_clean_tables(mrt);
+ kfree(mrt);
+}
+
+/* Service routines creating virtual interfaces: DVMRP tunnels and PIMREG */
+
+static void ipmr_del_tunnel(struct net_device *dev, struct vifctl *v)
+{
+ struct net *net = dev_net(dev);
+
+ dev_close(dev);
+
+ dev = __dev_get_by_name(net, "tunl0");
+ if (dev) {
+ const struct net_device_ops *ops = dev->netdev_ops;
+ struct ifreq ifr;
+ struct ip_tunnel_parm p;
+
+ memset(&p, 0, sizeof(p));
+ p.iph.daddr = v->vifc_rmt_addr.s_addr;
+ p.iph.saddr = v->vifc_lcl_addr.s_addr;
+ p.iph.version = 4;
+ p.iph.ihl = 5;
+ p.iph.protocol = IPPROTO_IPIP;
+ sprintf(p.name, "dvmrp%d", v->vifc_vifi);
+ ifr.ifr_ifru.ifru_data = (__force void __user *)&p;
+
+ if (ops->ndo_do_ioctl) {
+ mm_segment_t oldfs = get_fs();
+
+ set_fs(KERNEL_DS);
+ ops->ndo_do_ioctl(dev, &ifr, SIOCDELTUNNEL);
+ set_fs(oldfs);
+ }
+ }
+}
+
+static
+struct net_device *ipmr_new_tunnel(struct net *net, struct vifctl *v)
+{
+ struct net_device *dev;
+
+ dev = __dev_get_by_name(net, "tunl0");
+
+ if (dev) {
+ const struct net_device_ops *ops = dev->netdev_ops;
+ int err;
+ struct ifreq ifr;
+ struct ip_tunnel_parm p;
+ struct in_device *in_dev;
+
+ memset(&p, 0, sizeof(p));
+ p.iph.daddr = v->vifc_rmt_addr.s_addr;
+ p.iph.saddr = v->vifc_lcl_addr.s_addr;
+ p.iph.version = 4;
+ p.iph.ihl = 5;
+ p.iph.protocol = IPPROTO_IPIP;
+ sprintf(p.name, "dvmrp%d", v->vifc_vifi);
+ ifr.ifr_ifru.ifru_data = (__force void __user *)&p;
+
+ if (ops->ndo_do_ioctl) {
+ mm_segment_t oldfs = get_fs();
+
+ set_fs(KERNEL_DS);
+ err = ops->ndo_do_ioctl(dev, &ifr, SIOCADDTUNNEL);
+ set_fs(oldfs);
+ } else {
+ err = -EOPNOTSUPP;
+ }
+ dev = NULL;
+
+ if (err == 0 &&
+ (dev = __dev_get_by_name(net, p.name)) != NULL) {
+ dev->flags |= IFF_MULTICAST;
+
+ in_dev = __in_dev_get_rtnl(dev);
+ if (!in_dev)
+ goto failure;
+
+ ipv4_devconf_setall(in_dev);
+ neigh_parms_data_state_setall(in_dev->arp_parms);
+ IPV4_DEVCONF(in_dev->cnf, RP_FILTER) = 0;
+
+ if (dev_open(dev))
+ goto failure;
+ dev_hold(dev);
+ }
+ }
+ return dev;
+
+failure:
+ /* allow the register to be completed before unregistering. */
+ rtnl_unlock();
+ rtnl_lock();
+
+ unregister_netdevice(dev);
+ return NULL;
+}
+
+#ifdef CONFIG_IP_PIMSM
+
+static netdev_tx_t reg_vif_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+ struct net *net = dev_net(dev);
+ struct mr_table *mrt;
+ struct flowi4 fl4 = {
+ .flowi4_oif = dev->ifindex,
+ .flowi4_iif = skb->skb_iif ? : LOOPBACK_IFINDEX,
+ .flowi4_mark = skb->mark,
+ };
+ int err;
+
+ err = ipmr_fib_lookup(net, &fl4, &mrt);
+ if (err < 0) {
+ kfree_skb(skb);
+ return err;
+ }
+
+ read_lock(&mrt_lock);
+ dev->stats.tx_bytes += skb->len;
+ dev->stats.tx_packets++;
+ ipmr_cache_report(mrt, skb, mrt->mroute_reg_vif_num, IGMPMSG_WHOLEPKT);
+ read_unlock(&mrt_lock);
+ kfree_skb(skb);
+ return NETDEV_TX_OK;
+}
+
+static int reg_vif_get_iflink(const struct net_device *dev)
+{
+ return 0;
+}
+
+static const struct net_device_ops reg_vif_netdev_ops = {
+ .ndo_start_xmit = reg_vif_xmit,
+ .ndo_get_iflink = reg_vif_get_iflink,
+};
+
+static void reg_vif_setup(struct net_device *dev)
+{
+ dev->type = ARPHRD_PIMREG;
+ dev->mtu = ETH_DATA_LEN - sizeof(struct iphdr) - 8;
+ dev->flags = IFF_NOARP;
+ dev->netdev_ops = &reg_vif_netdev_ops;
+ dev->destructor = free_netdev;
+ dev->features |= NETIF_F_NETNS_LOCAL;
+}
+
+static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt)
+{
+ struct net_device *dev;
+ struct in_device *in_dev;
+ char name[IFNAMSIZ];
+
+ if (mrt->id == RT_TABLE_DEFAULT)
+ sprintf(name, "pimreg");
+ else
+ sprintf(name, "pimreg%u", mrt->id);
+
+ dev = alloc_netdev(0, name, NET_NAME_UNKNOWN, reg_vif_setup);
+
+ if (!dev)
+ return NULL;
+
+ dev_net_set(dev, net);
+
+ if (register_netdevice(dev)) {
+ free_netdev(dev);
+ return NULL;
+ }
+
+ rcu_read_lock();
+ in_dev = __in_dev_get_rcu(dev);
+ if (!in_dev) {
+ rcu_read_unlock();
+ goto failure;
+ }
+
+ ipv4_devconf_setall(in_dev);
+ neigh_parms_data_state_setall(in_dev->arp_parms);
+ IPV4_DEVCONF(in_dev->cnf, RP_FILTER) = 0;
+ rcu_read_unlock();
+
+ if (dev_open(dev))
+ goto failure;
+
+ dev_hold(dev);
+
+ return dev;
+
+failure:
+ /* allow the register to be completed before unregistering. */
+ rtnl_unlock();
+ rtnl_lock();
+
+ unregister_netdevice(dev);
+ return NULL;
+}
+#endif
+
+/**
+ * vif_delete - Delete a VIF entry
+ * @notify: Set to 1, if the caller is a notifier_call
+ */
+
+static int vif_delete(struct mr_table *mrt, int vifi, int notify,
+ struct list_head *head)
+{
+ struct vif_device *v;
+ struct net_device *dev;
+ struct in_device *in_dev;
+
+ if (vifi < 0 || vifi >= mrt->maxvif)
+ return -EADDRNOTAVAIL;
+
+ v = &mrt->vif_table[vifi];
+
+ write_lock_bh(&mrt_lock);
+ dev = v->dev;
+ v->dev = NULL;
+
+ if (!dev) {
+ write_unlock_bh(&mrt_lock);
+ return -EADDRNOTAVAIL;
+ }
+
+#ifdef CONFIG_IP_PIMSM
+ if (vifi == mrt->mroute_reg_vif_num)
+ mrt->mroute_reg_vif_num = -1;
+#endif
+
+ if (vifi + 1 == mrt->maxvif) {
+ int tmp;
+
+ for (tmp = vifi - 1; tmp >= 0; tmp--) {
+ if (VIF_EXISTS(mrt, tmp))
+ break;
+ }
+ mrt->maxvif = tmp+1;
+ }
+
+ write_unlock_bh(&mrt_lock);
+
+ dev_set_allmulti(dev, -1);
+
+ in_dev = __in_dev_get_rtnl(dev);
+ if (in_dev) {
+ IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)--;
+ inet_netconf_notify_devconf(dev_net(dev),
+ NETCONFA_MC_FORWARDING,
+ dev->ifindex, &in_dev->cnf);
+ ip_rt_multicast_event(in_dev);
+ }
+
+ if (v->flags & (VIFF_TUNNEL | VIFF_REGISTER) && !notify)
+ unregister_netdevice_queue(dev, head);
+
+ dev_put(dev);
+ return 0;
+}
+
+static void ipmr_cache_free_rcu(struct rcu_head *head)
+{
+ struct mfc_cache *c = container_of(head, struct mfc_cache, rcu);
+
+ kmem_cache_free(mrt_cachep, c);
+}
+
+static inline void ipmr_cache_free(struct mfc_cache *c)
+{
+ call_rcu(&c->rcu, ipmr_cache_free_rcu);
+}
+
+/* Destroy an unresolved cache entry, killing queued skbs
+ * and reporting error to netlink readers.
+ */
+
+static void ipmr_destroy_unres(struct mr_table *mrt, struct mfc_cache *c)
+{
+ struct net *net = read_pnet(&mrt->net);
+ struct sk_buff *skb;
+ struct nlmsgerr *e;
+
+ atomic_dec(&mrt->cache_resolve_queue_len);
+
+ while ((skb = skb_dequeue(&c->mfc_un.unres.unresolved))) {
+ if (ip_hdr(skb)->version == 0) {
+ struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
+ nlh->nlmsg_type = NLMSG_ERROR;
+ nlh->nlmsg_len = nlmsg_msg_size(sizeof(struct nlmsgerr));
+ skb_trim(skb, nlh->nlmsg_len);
+ e = nlmsg_data(nlh);
+ e->error = -ETIMEDOUT;
+ memset(&e->msg, 0, sizeof(e->msg));
+
+ rtnl_unicast(skb, net, NETLINK_CB(skb).portid);
+ } else {
+ kfree_skb(skb);
+ }
+ }
+
+ ipmr_cache_free(c);
+}
+
+
+/* Timer process for the unresolved queue. */
+
+static void ipmr_expire_process(unsigned long arg)
+{
+ struct mr_table *mrt = (struct mr_table *)arg;
+ unsigned long now;
+ unsigned long expires;
+ struct mfc_cache *c, *next;
+
+ if (!spin_trylock(&mfc_unres_lock)) {
+ mod_timer(&mrt->ipmr_expire_timer, jiffies+HZ/10);
+ return;
+ }
+
+ if (list_empty(&mrt->mfc_unres_queue))
+ goto out;
+
+ now = jiffies;
+ expires = 10*HZ;
+
+ list_for_each_entry_safe(c, next, &mrt->mfc_unres_queue, list) {
+ if (time_after(c->mfc_un.unres.expires, now)) {
+ unsigned long interval = c->mfc_un.unres.expires - now;
+ if (interval < expires)
+ expires = interval;
+ continue;
+ }
+
+ list_del(&c->list);
+ mroute_netlink_event(mrt, c, RTM_DELROUTE);
+ ipmr_destroy_unres(mrt, c);
+ }
+
+ if (!list_empty(&mrt->mfc_unres_queue))
+ mod_timer(&mrt->ipmr_expire_timer, jiffies + expires);
+
+out:
+ spin_unlock(&mfc_unres_lock);
+}
+
+/* Fill oifs list. It is called under write locked mrt_lock. */
+
+static void ipmr_update_thresholds(struct mr_table *mrt, struct mfc_cache *cache,
+ unsigned char *ttls)
+{
+ int vifi;
+
+ cache->mfc_un.res.minvif = MAXVIFS;
+ cache->mfc_un.res.maxvif = 0;
+ memset(cache->mfc_un.res.ttls, 255, MAXVIFS);
+
+ for (vifi = 0; vifi < mrt->maxvif; vifi++) {
+ if (VIF_EXISTS(mrt, vifi) &&
+ ttls[vifi] && ttls[vifi] < 255) {
+ cache->mfc_un.res.ttls[vifi] = ttls[vifi];
+ if (cache->mfc_un.res.minvif > vifi)
+ cache->mfc_un.res.minvif = vifi;
+ if (cache->mfc_un.res.maxvif <= vifi)
+ cache->mfc_un.res.maxvif = vifi + 1;
+ }
+ }
+}
+
+static int vif_add(struct net *net, struct mr_table *mrt,
+ struct vifctl *vifc, int mrtsock)
+{
+ int vifi = vifc->vifc_vifi;
+ struct vif_device *v = &mrt->vif_table[vifi];
+ struct net_device *dev;
+ struct in_device *in_dev;
+ int err;
+
+ /* Is vif busy ? */
+ if (VIF_EXISTS(mrt, vifi))
+ return -EADDRINUSE;
+
+ switch (vifc->vifc_flags) {
+#ifdef CONFIG_IP_PIMSM
+ case VIFF_REGISTER:
+ /*
+ * Special Purpose VIF in PIM
+ * All the packets will be sent to the daemon
+ */
+ if (mrt->mroute_reg_vif_num >= 0)
+ return -EADDRINUSE;
+ dev = ipmr_reg_vif(net, mrt);
+ if (!dev)
+ return -ENOBUFS;
+ err = dev_set_allmulti(dev, 1);
+ if (err) {
+ unregister_netdevice(dev);
+ dev_put(dev);
+ return err;
+ }
+ break;
+#endif
+ case VIFF_TUNNEL:
+ dev = ipmr_new_tunnel(net, vifc);
+ if (!dev)
+ return -ENOBUFS;
+ err = dev_set_allmulti(dev, 1);
+ if (err) {
+ ipmr_del_tunnel(dev, vifc);
+ dev_put(dev);
+ return err;
+ }
+ break;
+
+ case VIFF_USE_IFINDEX:
+ case 0:
+ if (vifc->vifc_flags == VIFF_USE_IFINDEX) {
+ dev = dev_get_by_index(net, vifc->vifc_lcl_ifindex);
+ if (dev && !__in_dev_get_rtnl(dev)) {
+ dev_put(dev);
+ return -EADDRNOTAVAIL;
+ }
+ } else {
+ dev = ip_dev_find(net, vifc->vifc_lcl_addr.s_addr);
+ }
+ if (!dev)
+ return -EADDRNOTAVAIL;
+ err = dev_set_allmulti(dev, 1);
+ if (err) {
+ dev_put(dev);
+ return err;
+ }
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ in_dev = __in_dev_get_rtnl(dev);
+ if (!in_dev) {
+ dev_put(dev);
+ return -EADDRNOTAVAIL;
+ }
+ IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)++;
+ inet_netconf_notify_devconf(net, NETCONFA_MC_FORWARDING, dev->ifindex,
+ &in_dev->cnf);
+ ip_rt_multicast_event(in_dev);
+
+ /* Fill in the VIF structures */
+
+ v->rate_limit = vifc->vifc_rate_limit;
+ v->local = vifc->vifc_lcl_addr.s_addr;
+ v->remote = vifc->vifc_rmt_addr.s_addr;
+ v->flags = vifc->vifc_flags;
+ if (!mrtsock)
+ v->flags |= VIFF_STATIC;
+ v->threshold = vifc->vifc_threshold;
+ v->bytes_in = 0;
+ v->bytes_out = 0;
+ v->pkt_in = 0;
+ v->pkt_out = 0;
+ v->link = dev->ifindex;
+ if (v->flags & (VIFF_TUNNEL | VIFF_REGISTER))
+ v->link = dev_get_iflink(dev);
+
+ /* And finish update writing critical data */
+ write_lock_bh(&mrt_lock);
+ v->dev = dev;
+#ifdef CONFIG_IP_PIMSM
+ if (v->flags & VIFF_REGISTER)
+ mrt->mroute_reg_vif_num = vifi;
+#endif
+ if (vifi+1 > mrt->maxvif)
+ mrt->maxvif = vifi+1;
+ write_unlock_bh(&mrt_lock);
+ return 0;
+}
+
+/* called with rcu_read_lock() */
+static struct mfc_cache *ipmr_cache_find(struct mr_table *mrt,
+ __be32 origin,
+ __be32 mcastgrp)
+{
+ int line = MFC_HASH(mcastgrp, origin);
+ struct mfc_cache *c;
+
+ list_for_each_entry_rcu(c, &mrt->mfc_cache_array[line], list) {
+ if (c->mfc_origin == origin && c->mfc_mcastgrp == mcastgrp)
+ return c;
+ }
+ return NULL;
+}
+
+/* Look for a (*,*,oif) entry */
+static struct mfc_cache *ipmr_cache_find_any_parent(struct mr_table *mrt,
+ int vifi)
+{
+ int line = MFC_HASH(htonl(INADDR_ANY), htonl(INADDR_ANY));
+ struct mfc_cache *c;
+
+ list_for_each_entry_rcu(c, &mrt->mfc_cache_array[line], list)
+ if (c->mfc_origin == htonl(INADDR_ANY) &&
+ c->mfc_mcastgrp == htonl(INADDR_ANY) &&
+ c->mfc_un.res.ttls[vifi] < 255)
+ return c;
+
+ return NULL;
+}
+
+/* Look for a (*,G) entry */
+static struct mfc_cache *ipmr_cache_find_any(struct mr_table *mrt,
+ __be32 mcastgrp, int vifi)
+{
+ int line = MFC_HASH(mcastgrp, htonl(INADDR_ANY));
+ struct mfc_cache *c, *proxy;
+
+ if (mcastgrp == htonl(INADDR_ANY))
+ goto skip;
+
+ list_for_each_entry_rcu(c, &mrt->mfc_cache_array[line], list)
+ if (c->mfc_origin == htonl(INADDR_ANY) &&
+ c->mfc_mcastgrp == mcastgrp) {
+ if (c->mfc_un.res.ttls[vifi] < 255)
+ return c;
+
+ /* It's ok if the vifi is part of the static tree */
+ proxy = ipmr_cache_find_any_parent(mrt,
+ c->mfc_parent);
+ if (proxy && proxy->mfc_un.res.ttls[vifi] < 255)
+ return c;
+ }
+
+skip:
+ return ipmr_cache_find_any_parent(mrt, vifi);
+}
+
+/*
+ * Allocate a multicast cache entry
+ */
+static struct mfc_cache *ipmr_cache_alloc(void)
+{
+ struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_KERNEL);
+
+ if (c)
+ c->mfc_un.res.minvif = MAXVIFS;
+ return c;
+}
+
+static struct mfc_cache *ipmr_cache_alloc_unres(void)
+{
+ struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_ATOMIC);
+
+ if (c) {
+ skb_queue_head_init(&c->mfc_un.unres.unresolved);
+ c->mfc_un.unres.expires = jiffies + 10*HZ;
+ }
+ return c;
+}
+
+/*
+ * A cache entry has gone into a resolved state from queued
+ */
+
+static void ipmr_cache_resolve(struct net *net, struct mr_table *mrt,
+ struct mfc_cache *uc, struct mfc_cache *c)
+{
+ struct sk_buff *skb;
+ struct nlmsgerr *e;
+
+ /* Play the pending entries through our router */
+
+ while ((skb = __skb_dequeue(&uc->mfc_un.unres.unresolved))) {
+ if (ip_hdr(skb)->version == 0) {
+ struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
+
+ if (__ipmr_fill_mroute(mrt, skb, c, nlmsg_data(nlh)) > 0) {
+ nlh->nlmsg_len = skb_tail_pointer(skb) -
+ (u8 *)nlh;
+ } else {
+ nlh->nlmsg_type = NLMSG_ERROR;
+ nlh->nlmsg_len = nlmsg_msg_size(sizeof(struct nlmsgerr));
+ skb_trim(skb, nlh->nlmsg_len);
+ e = nlmsg_data(nlh);
+ e->error = -EMSGSIZE;
+ memset(&e->msg, 0, sizeof(e->msg));
+ }
+
+ rtnl_unicast(skb, net, NETLINK_CB(skb).portid);
+ } else {
+ ip_mr_forward(net, mrt, skb, c, 0);
+ }
+ }
+}
+
+/*
+ * Bounce a cache query up to mrouted. We could use netlink for this but mrouted
+ * expects the following bizarre scheme.
+ *
+ * Called under mrt_lock.
+ */
+
+static int ipmr_cache_report(struct mr_table *mrt,
+ struct sk_buff *pkt, vifi_t vifi, int assert)
+{
+ struct sk_buff *skb;
+ const int ihl = ip_hdrlen(pkt);
+ struct igmphdr *igmp;
+ struct igmpmsg *msg;
+ struct sock *mroute_sk;
+ int ret;
+
+#ifdef CONFIG_IP_PIMSM
+ if (assert == IGMPMSG_WHOLEPKT)
+ skb = skb_realloc_headroom(pkt, sizeof(struct iphdr));
+ else
+#endif
+ skb = alloc_skb(128, GFP_ATOMIC);
+
+ if (!skb)
+ return -ENOBUFS;
+
+#ifdef CONFIG_IP_PIMSM
+ if (assert == IGMPMSG_WHOLEPKT) {
+ /* Ugly, but we have no choice with this interface.
+ * Duplicate old header, fix ihl, length etc.
+ * And all this only to mangle msg->im_msgtype and
+ * to set msg->im_mbz to "mbz" :-)
+ */
+ skb_push(skb, sizeof(struct iphdr));
+ skb_reset_network_header(skb);
+ skb_reset_transport_header(skb);
+ msg = (struct igmpmsg *)skb_network_header(skb);
+ memcpy(msg, skb_network_header(pkt), sizeof(struct iphdr));
+ msg->im_msgtype = IGMPMSG_WHOLEPKT;
+ msg->im_mbz = 0;
+ msg->im_vif = mrt->mroute_reg_vif_num;
+ ip_hdr(skb)->ihl = sizeof(struct iphdr) >> 2;
+ ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(pkt)->tot_len) +
+ sizeof(struct iphdr));
+ } else
+#endif
+ {
+
+ /* Copy the IP header */
+
+ skb_set_network_header(skb, skb->len);
+ skb_put(skb, ihl);
+ skb_copy_to_linear_data(skb, pkt->data, ihl);
+ ip_hdr(skb)->protocol = 0; /* Flag to the kernel this is a route add */
+ msg = (struct igmpmsg *)skb_network_header(skb);
+ msg->im_vif = vifi;
+ skb_dst_set(skb, dst_clone(skb_dst(pkt)));
+
+ /* Add our header */
+
+ igmp = (struct igmphdr *)skb_put(skb, sizeof(struct igmphdr));
+ igmp->type =
+ msg->im_msgtype = assert;
+ igmp->code = 0;
+ ip_hdr(skb)->tot_len = htons(skb->len); /* Fix the length */
+ skb->transport_header = skb->network_header;
+ }
+
+ rcu_read_lock();
+ mroute_sk = rcu_dereference(mrt->mroute_sk);
+ if (!mroute_sk) {
+ rcu_read_unlock();
+ kfree_skb(skb);
+ return -EINVAL;
+ }
+
+ /* Deliver to mrouted */
+
+ ret = sock_queue_rcv_skb(mroute_sk, skb);
+ rcu_read_unlock();
+ if (ret < 0) {
+ net_warn_ratelimited("mroute: pending queue full, dropping entries\n");
+ kfree_skb(skb);
+ }
+
+ return ret;
+}
+
+/*
+ * Queue a packet for resolution. It gets locked cache entry!
+ */
+
+static int
+ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, struct sk_buff *skb)
+{
+ bool found = false;
+ int err;
+ struct mfc_cache *c;
+ const struct iphdr *iph = ip_hdr(skb);
+
+ spin_lock_bh(&mfc_unres_lock);
+ list_for_each_entry(c, &mrt->mfc_unres_queue, list) {
+ if (c->mfc_mcastgrp == iph->daddr &&
+ c->mfc_origin == iph->saddr) {
+ found = true;
+ break;
+ }
+ }
+
+ if (!found) {
+ /* Create a new entry if allowable */
+
+ if (atomic_read(&mrt->cache_resolve_queue_len) >= 10 ||
+ (c = ipmr_cache_alloc_unres()) == NULL) {
+ spin_unlock_bh(&mfc_unres_lock);
+
+ kfree_skb(skb);
+ return -ENOBUFS;
+ }
+
+ /* Fill in the new cache entry */
+
+ c->mfc_parent = -1;
+ c->mfc_origin = iph->saddr;
+ c->mfc_mcastgrp = iph->daddr;
+
+ /* Reflect first query at mrouted. */
+
+ err = ipmr_cache_report(mrt, skb, vifi, IGMPMSG_NOCACHE);
+ if (err < 0) {
+ /* If the report failed throw the cache entry
+ out - Brad Parker
+ */
+ spin_unlock_bh(&mfc_unres_lock);
+
+ ipmr_cache_free(c);
+ kfree_skb(skb);
+ return err;
+ }
+
+ atomic_inc(&mrt->cache_resolve_queue_len);
+ list_add(&c->list, &mrt->mfc_unres_queue);
+ mroute_netlink_event(mrt, c, RTM_NEWROUTE);
+
+ if (atomic_read(&mrt->cache_resolve_queue_len) == 1)
+ mod_timer(&mrt->ipmr_expire_timer, c->mfc_un.unres.expires);
+ }
+
+ /* See if we can append the packet */
+
+ if (c->mfc_un.unres.unresolved.qlen > 3) {
+ kfree_skb(skb);
+ err = -ENOBUFS;
+ } else {
+ skb_queue_tail(&c->mfc_un.unres.unresolved, skb);
+ err = 0;
+ }
+
+ spin_unlock_bh(&mfc_unres_lock);
+ return err;
+}
+
+/*
+ * MFC cache manipulation by user space mroute daemon
+ */
+
+static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc, int parent)
+{
+ int line;
+ struct mfc_cache *c, *next;
+
+ line = MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);
+
+ list_for_each_entry_safe(c, next, &mrt->mfc_cache_array[line], list) {
+ if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
+ c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr &&
+ (parent == -1 || parent == c->mfc_parent)) {
+ list_del_rcu(&c->list);
+ mroute_netlink_event(mrt, c, RTM_DELROUTE);
+ ipmr_cache_free(c);
+ return 0;
+ }
+ }
+ return -ENOENT;
+}
+
+static int ipmr_mfc_add(struct net *net, struct mr_table *mrt,
+ struct mfcctl *mfc, int mrtsock, int parent)
+{
+ bool found = false;
+ int line;
+ struct mfc_cache *uc, *c;
+
+ if (mfc->mfcc_parent >= MAXVIFS)
+ return -ENFILE;
+
+ line = MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);
+
+ list_for_each_entry(c, &mrt->mfc_cache_array[line], list) {
+ if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
+ c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr &&
+ (parent == -1 || parent == c->mfc_parent)) {
+ found = true;
+ break;
+ }
+ }
+
+ if (found) {
+ write_lock_bh(&mrt_lock);
+ c->mfc_parent = mfc->mfcc_parent;
+ ipmr_update_thresholds(mrt, c, mfc->mfcc_ttls);
+ if (!mrtsock)
+ c->mfc_flags |= MFC_STATIC;
+ write_unlock_bh(&mrt_lock);
+ mroute_netlink_event(mrt, c, RTM_NEWROUTE);
+ return 0;
+ }
+
+ if (mfc->mfcc_mcastgrp.s_addr != htonl(INADDR_ANY) &&
+ !ipv4_is_multicast(mfc->mfcc_mcastgrp.s_addr))
+ return -EINVAL;
+
+ c = ipmr_cache_alloc();
+ if (!c)
+ return -ENOMEM;
+
+ c->mfc_origin = mfc->mfcc_origin.s_addr;
+ c->mfc_mcastgrp = mfc->mfcc_mcastgrp.s_addr;
+ c->mfc_parent = mfc->mfcc_parent;
+ ipmr_update_thresholds(mrt, c, mfc->mfcc_ttls);
+ if (!mrtsock)
+ c->mfc_flags |= MFC_STATIC;
+
+ list_add_rcu(&c->list, &mrt->mfc_cache_array[line]);
+
+ /*
+ * Check to see if we resolved a queued list. If so we
+ * need to send on the frames and tidy up.
+ */
+ found = false;
+ spin_lock_bh(&mfc_unres_lock);
+ list_for_each_entry(uc, &mrt->mfc_unres_queue, list) {
+ if (uc->mfc_origin == c->mfc_origin &&
+ uc->mfc_mcastgrp == c->mfc_mcastgrp) {
+ list_del(&uc->list);
+ atomic_dec(&mrt->cache_resolve_queue_len);
+ found = true;
+ break;
+ }
+ }
+ if (list_empty(&mrt->mfc_unres_queue))
+ del_timer(&mrt->ipmr_expire_timer);
+ spin_unlock_bh(&mfc_unres_lock);
+
+ if (found) {
+ ipmr_cache_resolve(net, mrt, uc, c);
+ ipmr_cache_free(uc);
+ }
+ mroute_netlink_event(mrt, c, RTM_NEWROUTE);
+ return 0;
+}
+
+/*
+ * Close the multicast socket, and clear the vif tables etc
+ */
+
+static void mroute_clean_tables(struct mr_table *mrt)
+{
+ int i;
+ LIST_HEAD(list);
+ struct mfc_cache *c, *next;
+
+ /* Shut down all active vif entries */
+
+ for (i = 0; i < mrt->maxvif; i++) {
+ if (!(mrt->vif_table[i].flags & VIFF_STATIC))
+ vif_delete(mrt, i, 0, &list);
+ }
+ unregister_netdevice_many(&list);
+
+ /* Wipe the cache */
+
+ for (i = 0; i < MFC_LINES; i++) {
+ list_for_each_entry_safe(c, next, &mrt->mfc_cache_array[i], list) {
+ if (c->mfc_flags & MFC_STATIC)
+ continue;
+ list_del_rcu(&c->list);
+ mroute_netlink_event(mrt, c, RTM_DELROUTE);
+ ipmr_cache_free(c);
+ }
+ }
+
+ if (atomic_read(&mrt->cache_resolve_queue_len) != 0) {
+ spin_lock_bh(&mfc_unres_lock);
+ list_for_each_entry_safe(c, next, &mrt->mfc_unres_queue, list) {
+ list_del(&c->list);
+ mroute_netlink_event(mrt, c, RTM_DELROUTE);
+ ipmr_destroy_unres(mrt, c);
+ }
+ spin_unlock_bh(&mfc_unres_lock);
+ }
+}
+
+/* called from ip_ra_control(), before an RCU grace period,
+ * we dont need to call synchronize_rcu() here
+ */
+static void mrtsock_destruct(struct sock *sk)
+{
+ struct net *net = sock_net(sk);
+ struct mr_table *mrt;
+
+ rtnl_lock();
+ ipmr_for_each_table(mrt, net) {
+ if (sk == rtnl_dereference(mrt->mroute_sk)) {
+ IPV4_DEVCONF_ALL(net, MC_FORWARDING)--;
+ inet_netconf_notify_devconf(net, NETCONFA_MC_FORWARDING,
+ NETCONFA_IFINDEX_ALL,
+ net->ipv4.devconf_all);
+ RCU_INIT_POINTER(mrt->mroute_sk, NULL);
+ mroute_clean_tables(mrt);
+ }
+ }
+ rtnl_unlock();
+}
+
+/*
+ * Socket options and virtual interface manipulation. The whole
+ * virtual interface system is a complete heap, but unfortunately
+ * that's how BSD mrouted happens to think. Maybe one day with a proper
+ * MOSPF/PIM router set up we can clean this up.
+ */
+
+int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsigned int optlen)
+{
+ int ret, parent = 0;
+ struct vifctl vif;
+ struct mfcctl mfc;
+ struct net *net = sock_net(sk);
+ struct mr_table *mrt;
+
+ if (sk->sk_type != SOCK_RAW ||
+ inet_sk(sk)->inet_num != IPPROTO_IGMP)
+ return -EOPNOTSUPP;
+
+ mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT);
+ if (!mrt)
+ return -ENOENT;
+
+ if (optname != MRT_INIT) {
+ if (sk != rcu_access_pointer(mrt->mroute_sk) &&
+ !ns_capable(net->user_ns, CAP_NET_ADMIN))
+ return -EACCES;
+ }
+
+ switch (optname) {
+ case MRT_INIT:
+ if (optlen != sizeof(int))
+ return -EINVAL;
+
+ rtnl_lock();
+ if (rtnl_dereference(mrt->mroute_sk)) {
+ rtnl_unlock();
+ return -EADDRINUSE;
+ }
+
+ ret = ip_ra_control(sk, 1, mrtsock_destruct);
+ if (ret == 0) {
+ rcu_assign_pointer(mrt->mroute_sk, sk);
+ IPV4_DEVCONF_ALL(net, MC_FORWARDING)++;
+ inet_netconf_notify_devconf(net, NETCONFA_MC_FORWARDING,
+ NETCONFA_IFINDEX_ALL,
+ net->ipv4.devconf_all);
+ }
+ rtnl_unlock();
+ return ret;
+ case MRT_DONE:
+ if (sk != rcu_access_pointer(mrt->mroute_sk))
+ return -EACCES;
+ return ip_ra_control(sk, 0, NULL);
+ case MRT_ADD_VIF:
+ case MRT_DEL_VIF:
+ if (optlen != sizeof(vif))
+ return -EINVAL;
+ if (copy_from_user(&vif, optval, sizeof(vif)))
+ return -EFAULT;
+ if (vif.vifc_vifi >= MAXVIFS)
+ return -ENFILE;
+ rtnl_lock();
+ if (optname == MRT_ADD_VIF) {
+ ret = vif_add(net, mrt, &vif,
+ sk == rtnl_dereference(mrt->mroute_sk));
+ } else {
+ ret = vif_delete(mrt, vif.vifc_vifi, 0, NULL);
+ }
+ rtnl_unlock();
+ return ret;
+
+ /*
+ * Manipulate the forwarding caches. These live
+ * in a sort of kernel/user symbiosis.
+ */
+ case MRT_ADD_MFC:
+ case MRT_DEL_MFC:
+ parent = -1;
+ case MRT_ADD_MFC_PROXY:
+ case MRT_DEL_MFC_PROXY:
+ if (optlen != sizeof(mfc))
+ return -EINVAL;
+ if (copy_from_user(&mfc, optval, sizeof(mfc)))
+ return -EFAULT;
+ if (parent == 0)
+ parent = mfc.mfcc_parent;
+ rtnl_lock();
+ if (optname == MRT_DEL_MFC || optname == MRT_DEL_MFC_PROXY)
+ ret = ipmr_mfc_delete(mrt, &mfc, parent);
+ else
+ ret = ipmr_mfc_add(net, mrt, &mfc,
+ sk == rtnl_dereference(mrt->mroute_sk),
+ parent);
+ rtnl_unlock();
+ return ret;
+ /*
+ * Control PIM assert.
+ */
+ case MRT_ASSERT:
+ {
+ int v;
+ if (optlen != sizeof(v))
+ return -EINVAL;
+ if (get_user(v, (int __user *)optval))
+ return -EFAULT;
+ mrt->mroute_do_assert = v;
+ return 0;
+ }
+#ifdef CONFIG_IP_PIMSM
+ case MRT_PIM:
+ {
+ int v;
+
+ if (optlen != sizeof(v))
+ return -EINVAL;
+ if (get_user(v, (int __user *)optval))
+ return -EFAULT;
+ v = !!v;
+
+ rtnl_lock();
+ ret = 0;
+ if (v != mrt->mroute_do_pim) {
+ mrt->mroute_do_pim = v;
+ mrt->mroute_do_assert = v;
+ }
+ rtnl_unlock();
+ return ret;
+ }
+#endif
+#ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES
+ case MRT_TABLE:
+ {
+ u32 v;
+
+ if (optlen != sizeof(u32))
+ return -EINVAL;
+ if (get_user(v, (u32 __user *)optval))
+ return -EFAULT;
+
+ /* "pimreg%u" should not exceed 16 bytes (IFNAMSIZ) */
+ if (v != RT_TABLE_DEFAULT && v >= 1000000000)
+ return -EINVAL;
+
+ rtnl_lock();
+ ret = 0;
+ if (sk == rtnl_dereference(mrt->mroute_sk)) {
+ ret = -EBUSY;
+ } else {
+ if (!ipmr_new_table(net, v))
+ ret = -ENOMEM;
+ else
+ raw_sk(sk)->ipmr_table = v;
+ }
+ rtnl_unlock();
+ return ret;
+ }
+#endif
+ /*
+ * Spurious command, or MRT_VERSION which you cannot
+ * set.
+ */
+ default:
+ return -ENOPROTOOPT;
+ }
+}
+
+/*
+ * Getsock opt support for the multicast routing system.
+ */
+
+int ip_mroute_getsockopt(struct sock *sk, int optname, char __user *optval, int __user *optlen)
+{
+ int olr;
+ int val;
+ struct net *net = sock_net(sk);
+ struct mr_table *mrt;
+
+ if (sk->sk_type != SOCK_RAW ||
+ inet_sk(sk)->inet_num != IPPROTO_IGMP)
+ return -EOPNOTSUPP;
+
+ mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT);
+ if (!mrt)
+ return -ENOENT;
+
+ if (optname != MRT_VERSION &&
+#ifdef CONFIG_IP_PIMSM
+ optname != MRT_PIM &&
+#endif
+ optname != MRT_ASSERT)
+ return -ENOPROTOOPT;
+
+ if (get_user(olr, optlen))
+ return -EFAULT;
+
+ olr = min_t(unsigned int, olr, sizeof(int));
+ if (olr < 0)
+ return -EINVAL;
+
+ if (put_user(olr, optlen))
+ return -EFAULT;
+ if (optname == MRT_VERSION)
+ val = 0x0305;
+#ifdef CONFIG_IP_PIMSM
+ else if (optname == MRT_PIM)
+ val = mrt->mroute_do_pim;
+#endif
+ else
+ val = mrt->mroute_do_assert;
+ if (copy_to_user(optval, &val, olr))
+ return -EFAULT;
+ return 0;
+}
+
+/*
+ * The IP multicast ioctl support routines.
+ */
+
+int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg)
+{
+ struct sioc_sg_req sr;
+ struct sioc_vif_req vr;
+ struct vif_device *vif;
+ struct mfc_cache *c;
+ struct net *net = sock_net(sk);
+ struct mr_table *mrt;
+
+ mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT);
+ if (!mrt)
+ return -ENOENT;
+
+ switch (cmd) {
+ case SIOCGETVIFCNT:
+ if (copy_from_user(&vr, arg, sizeof(vr)))
+ return -EFAULT;
+ if (vr.vifi >= mrt->maxvif)
+ return -EINVAL;
+ read_lock(&mrt_lock);
+ vif = &mrt->vif_table[vr.vifi];
+ if (VIF_EXISTS(mrt, vr.vifi)) {
+ vr.icount = vif->pkt_in;
+ vr.ocount = vif->pkt_out;
+ vr.ibytes = vif->bytes_in;
+ vr.obytes = vif->bytes_out;
+ read_unlock(&mrt_lock);
+
+ if (copy_to_user(arg, &vr, sizeof(vr)))
+ return -EFAULT;
+ return 0;
+ }
+ read_unlock(&mrt_lock);
+ return -EADDRNOTAVAIL;
+ case SIOCGETSGCNT:
+ if (copy_from_user(&sr, arg, sizeof(sr)))
+ return -EFAULT;
+
+ rcu_read_lock();
+ c = ipmr_cache_find(mrt, sr.src.s_addr, sr.grp.s_addr);
+ if (c) {
+ sr.pktcnt = c->mfc_un.res.pkt;
+ sr.bytecnt = c->mfc_un.res.bytes;
+ sr.wrong_if = c->mfc_un.res.wrong_if;
+ rcu_read_unlock();
+
+ if (copy_to_user(arg, &sr, sizeof(sr)))
+ return -EFAULT;
+ return 0;
+ }
+ rcu_read_unlock();
+ return -EADDRNOTAVAIL;
+ default:
+ return -ENOIOCTLCMD;
+ }
+}
+
+#ifdef CONFIG_COMPAT
+struct compat_sioc_sg_req {
+ struct in_addr src;
+ struct in_addr grp;
+ compat_ulong_t pktcnt;
+ compat_ulong_t bytecnt;
+ compat_ulong_t wrong_if;
+};
+
+struct compat_sioc_vif_req {
+ vifi_t vifi; /* Which iface */
+ compat_ulong_t icount;
+ compat_ulong_t ocount;
+ compat_ulong_t ibytes;
+ compat_ulong_t obytes;
+};
+
+int ipmr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
+{
+ struct compat_sioc_sg_req sr;
+ struct compat_sioc_vif_req vr;
+ struct vif_device *vif;
+ struct mfc_cache *c;
+ struct net *net = sock_net(sk);
+ struct mr_table *mrt;
+
+ mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT);
+ if (!mrt)
+ return -ENOENT;
+
+ switch (cmd) {
+ case SIOCGETVIFCNT:
+ if (copy_from_user(&vr, arg, sizeof(vr)))
+ return -EFAULT;
+ if (vr.vifi >= mrt->maxvif)
+ return -EINVAL;
+ read_lock(&mrt_lock);
+ vif = &mrt->vif_table[vr.vifi];
+ if (VIF_EXISTS(mrt, vr.vifi)) {
+ vr.icount = vif->pkt_in;
+ vr.ocount = vif->pkt_out;
+ vr.ibytes = vif->bytes_in;
+ vr.obytes = vif->bytes_out;
+ read_unlock(&mrt_lock);
+
+ if (copy_to_user(arg, &vr, sizeof(vr)))
+ return -EFAULT;
+ return 0;
+ }
+ read_unlock(&mrt_lock);
+ return -EADDRNOTAVAIL;
+ case SIOCGETSGCNT:
+ if (copy_from_user(&sr, arg, sizeof(sr)))
+ return -EFAULT;
+
+ rcu_read_lock();
+ c = ipmr_cache_find(mrt, sr.src.s_addr, sr.grp.s_addr);
+ if (c) {
+ sr.pktcnt = c->mfc_un.res.pkt;
+ sr.bytecnt = c->mfc_un.res.bytes;
+ sr.wrong_if = c->mfc_un.res.wrong_if;
+ rcu_read_unlock();
+
+ if (copy_to_user(arg, &sr, sizeof(sr)))
+ return -EFAULT;
+ return 0;
+ }
+ rcu_read_unlock();
+ return -EADDRNOTAVAIL;
+ default:
+ return -ENOIOCTLCMD;
+ }
+}
+#endif
+
+
+static int ipmr_device_event(struct notifier_block *this, unsigned long event, void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct net *net = dev_net(dev);
+ struct mr_table *mrt;
+ struct vif_device *v;
+ int ct;
+
+ if (event != NETDEV_UNREGISTER)
+ return NOTIFY_DONE;
+
+ ipmr_for_each_table(mrt, net) {
+ v = &mrt->vif_table[0];
+ for (ct = 0; ct < mrt->maxvif; ct++, v++) {
+ if (v->dev == dev)
+ vif_delete(mrt, ct, 1, NULL);
+ }
+ }
+ return NOTIFY_DONE;
+}
+
+
+static struct notifier_block ip_mr_notifier = {
+ .notifier_call = ipmr_device_event,
+};
+
+/*
+ * Encapsulate a packet by attaching a valid IPIP header to it.
+ * This avoids tunnel drivers and other mess and gives us the speed so
+ * important for multicast video.
+ */
+
+static void ip_encap(struct net *net, struct sk_buff *skb,
+ __be32 saddr, __be32 daddr)
+{
+ struct iphdr *iph;
+ const struct iphdr *old_iph = ip_hdr(skb);
+
+ skb_push(skb, sizeof(struct iphdr));
+ skb->transport_header = skb->network_header;
+ skb_reset_network_header(skb);
+ iph = ip_hdr(skb);
+
+ iph->version = 4;
+ iph->tos = old_iph->tos;
+ iph->ttl = old_iph->ttl;
+ iph->frag_off = 0;
+ iph->daddr = daddr;
+ iph->saddr = saddr;
+ iph->protocol = IPPROTO_IPIP;
+ iph->ihl = 5;
+ iph->tot_len = htons(skb->len);
+ ip_select_ident(net, skb, NULL);
+ ip_send_check(iph);
+
+ memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
+ nf_reset(skb);
+}
+
+static inline int ipmr_forward_finish(struct sock *sk, struct sk_buff *skb)
+{
+ struct ip_options *opt = &(IPCB(skb)->opt);
+
+ IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTFORWDATAGRAMS);
+ IP_ADD_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTOCTETS, skb->len);
+
+ if (unlikely(opt->optlen))
+ ip_forward_options(skb);
+
+ return dst_output_sk(sk, skb);
+}
+
+/*
+ * Processing handlers for ipmr_forward
+ */
+
+static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
+ struct sk_buff *skb, struct mfc_cache *c, int vifi)
+{
+ const struct iphdr *iph = ip_hdr(skb);
+ struct vif_device *vif = &mrt->vif_table[vifi];
+ struct net_device *dev;
+ struct rtable *rt;
+ struct flowi4 fl4;
+ int encap = 0;
+
+ if (!vif->dev)
+ goto out_free;
+
+#ifdef CONFIG_IP_PIMSM
+ if (vif->flags & VIFF_REGISTER) {
+ vif->pkt_out++;
+ vif->bytes_out += skb->len;
+ vif->dev->stats.tx_bytes += skb->len;
+ vif->dev->stats.tx_packets++;
+ ipmr_cache_report(mrt, skb, vifi, IGMPMSG_WHOLEPKT);
+ goto out_free;
+ }
+#endif
+
+ if (vif->flags & VIFF_TUNNEL) {
+ rt = ip_route_output_ports(net, &fl4, NULL,
+ vif->remote, vif->local,
+ 0, 0,
+ IPPROTO_IPIP,
+ RT_TOS(iph->tos), vif->link);
+ if (IS_ERR(rt))
+ goto out_free;
+ encap = sizeof(struct iphdr);
+ } else {
+ rt = ip_route_output_ports(net, &fl4, NULL, iph->daddr, 0,
+ 0, 0,
+ IPPROTO_IPIP,
+ RT_TOS(iph->tos), vif->link);
+ if (IS_ERR(rt))
+ goto out_free;
+ }
+
+ dev = rt->dst.dev;
+
+ if (skb->len+encap > dst_mtu(&rt->dst) && (ntohs(iph->frag_off) & IP_DF)) {
+ /* Do not fragment multicasts. Alas, IPv4 does not
+ * allow to send ICMP, so that packets will disappear
+ * to blackhole.
+ */
+
+ IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
+ ip_rt_put(rt);
+ goto out_free;
+ }
+
+ encap += LL_RESERVED_SPACE(dev) + rt->dst.header_len;
+
+ if (skb_cow(skb, encap)) {
+ ip_rt_put(rt);
+ goto out_free;
+ }
+
+ vif->pkt_out++;
+ vif->bytes_out += skb->len;
+
+ skb_dst_drop(skb);
+ skb_dst_set(skb, &rt->dst);
+ ip_decrease_ttl(ip_hdr(skb));
+
+ /* FIXME: forward and output firewalls used to be called here.
+ * What do we do with netfilter? -- RR
+ */
+ if (vif->flags & VIFF_TUNNEL) {
+ ip_encap(net, skb, vif->local, vif->remote);
+ /* FIXME: extra output firewall step used to be here. --RR */
+ vif->dev->stats.tx_packets++;
+ vif->dev->stats.tx_bytes += skb->len;
+ }
+
+ IPCB(skb)->flags |= IPSKB_FORWARDED;
+
+ /*
+ * RFC1584 teaches, that DVMRP/PIM router must deliver packets locally
+ * not only before forwarding, but after forwarding on all output
+ * interfaces. It is clear, if mrouter runs a multicasting
+ * program, it should receive packets not depending to what interface
+ * program is joined.
+ * If we will not make it, the program will have to join on all
+ * interfaces. On the other hand, multihoming host (or router, but
+ * not mrouter) cannot join to more than one interface - it will
+ * result in receiving multiple packets.
+ */
+ NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD, NULL, skb,
+ skb->dev, dev,
+ ipmr_forward_finish);
+ return;
+
+out_free:
+ kfree_skb(skb);
+}
+
+static int ipmr_find_vif(struct mr_table *mrt, struct net_device *dev)
+{
+ int ct;
+
+ for (ct = mrt->maxvif-1; ct >= 0; ct--) {
+ if (mrt->vif_table[ct].dev == dev)
+ break;
+ }
+ return ct;
+}
+
+/* "local" means that we should preserve one skb (for local delivery) */
+
+static void ip_mr_forward(struct net *net, struct mr_table *mrt,
+ struct sk_buff *skb, struct mfc_cache *cache,
+ int local)
+{
+ int psend = -1;
+ int vif, ct;
+ int true_vifi = ipmr_find_vif(mrt, skb->dev);
+
+ vif = cache->mfc_parent;
+ cache->mfc_un.res.pkt++;
+ cache->mfc_un.res.bytes += skb->len;
+
+ if (cache->mfc_origin == htonl(INADDR_ANY) && true_vifi >= 0) {
+ struct mfc_cache *cache_proxy;
+
+ /* For an (*,G) entry, we only check that the incomming
+ * interface is part of the static tree.
+ */
+ cache_proxy = ipmr_cache_find_any_parent(mrt, vif);
+ if (cache_proxy &&
+ cache_proxy->mfc_un.res.ttls[true_vifi] < 255)
+ goto forward;
+ }
+
+ /*
+ * Wrong interface: drop packet and (maybe) send PIM assert.
+ */
+ if (mrt->vif_table[vif].dev != skb->dev) {
+ if (rt_is_output_route(skb_rtable(skb))) {
+ /* It is our own packet, looped back.
+ * Very complicated situation...
+ *
+ * The best workaround until routing daemons will be
+ * fixed is not to redistribute packet, if it was
+ * send through wrong interface. It means, that
+ * multicast applications WILL NOT work for
+ * (S,G), which have default multicast route pointing
+ * to wrong oif. In any case, it is not a good
+ * idea to use multicasting applications on router.
+ */
+ goto dont_forward;
+ }
+
+ cache->mfc_un.res.wrong_if++;
+
+ if (true_vifi >= 0 && mrt->mroute_do_assert &&
+ /* pimsm uses asserts, when switching from RPT to SPT,
+ * so that we cannot check that packet arrived on an oif.
+ * It is bad, but otherwise we would need to move pretty
+ * large chunk of pimd to kernel. Ough... --ANK
+ */
+ (mrt->mroute_do_pim ||
+ cache->mfc_un.res.ttls[true_vifi] < 255) &&
+ time_after(jiffies,
+ cache->mfc_un.res.last_assert + MFC_ASSERT_THRESH)) {
+ cache->mfc_un.res.last_assert = jiffies;
+ ipmr_cache_report(mrt, skb, true_vifi, IGMPMSG_WRONGVIF);
+ }
+ goto dont_forward;
+ }
+
+forward:
+ mrt->vif_table[vif].pkt_in++;
+ mrt->vif_table[vif].bytes_in += skb->len;
+
+ /*
+ * Forward the frame
+ */
+ if (cache->mfc_origin == htonl(INADDR_ANY) &&
+ cache->mfc_mcastgrp == htonl(INADDR_ANY)) {
+ if (true_vifi >= 0 &&
+ true_vifi != cache->mfc_parent &&
+ ip_hdr(skb)->ttl >
+ cache->mfc_un.res.ttls[cache->mfc_parent]) {
+ /* It's an (*,*) entry and the packet is not coming from
+ * the upstream: forward the packet to the upstream
+ * only.
+ */
+ psend = cache->mfc_parent;
+ goto last_forward;
+ }
+ goto dont_forward;
+ }
+ for (ct = cache->mfc_un.res.maxvif - 1;
+ ct >= cache->mfc_un.res.minvif; ct--) {
+ /* For (*,G) entry, don't forward to the incoming interface */
+ if ((cache->mfc_origin != htonl(INADDR_ANY) ||
+ ct != true_vifi) &&
+ ip_hdr(skb)->ttl > cache->mfc_un.res.ttls[ct]) {
+ if (psend != -1) {
+ struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
+
+ if (skb2)
+ ipmr_queue_xmit(net, mrt, skb2, cache,
+ psend);
+ }
+ psend = ct;
+ }
+ }
+last_forward:
+ if (psend != -1) {
+ if (local) {
+ struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
+
+ if (skb2)
+ ipmr_queue_xmit(net, mrt, skb2, cache, psend);
+ } else {
+ ipmr_queue_xmit(net, mrt, skb, cache, psend);
+ return;
+ }
+ }
+
+dont_forward:
+ if (!local)
+ kfree_skb(skb);
+}
+
+static struct mr_table *ipmr_rt_fib_lookup(struct net *net, struct sk_buff *skb)
+{
+ struct rtable *rt = skb_rtable(skb);
+ struct iphdr *iph = ip_hdr(skb);
+ struct flowi4 fl4 = {
+ .daddr = iph->daddr,
+ .saddr = iph->saddr,
+ .flowi4_tos = RT_TOS(iph->tos),
+ .flowi4_oif = (rt_is_output_route(rt) ?
+ skb->dev->ifindex : 0),
+ .flowi4_iif = (rt_is_output_route(rt) ?
+ LOOPBACK_IFINDEX :
+ skb->dev->ifindex),
+ .flowi4_mark = skb->mark,
+ };
+ struct mr_table *mrt;
+ int err;
+
+ err = ipmr_fib_lookup(net, &fl4, &mrt);
+ if (err)
+ return ERR_PTR(err);
+ return mrt;
+}
+
+/*
+ * Multicast packets for forwarding arrive here
+ * Called with rcu_read_lock();
+ */
+
+int ip_mr_input(struct sk_buff *skb)
+{
+ struct mfc_cache *cache;
+ struct net *net = dev_net(skb->dev);
+ int local = skb_rtable(skb)->rt_flags & RTCF_LOCAL;
+ struct mr_table *mrt;
+
+ /* Packet is looped back after forward, it should not be
+ * forwarded second time, but still can be delivered locally.
+ */
+ if (IPCB(skb)->flags & IPSKB_FORWARDED)
+ goto dont_forward;
+
+ mrt = ipmr_rt_fib_lookup(net, skb);
+ if (IS_ERR(mrt)) {
+ kfree_skb(skb);
+ return PTR_ERR(mrt);
+ }
+ if (!local) {
+ if (IPCB(skb)->opt.router_alert) {
+ if (ip_call_ra_chain(skb))
+ return 0;
+ } else if (ip_hdr(skb)->protocol == IPPROTO_IGMP) {
+ /* IGMPv1 (and broken IGMPv2 implementations sort of
+ * Cisco IOS <= 11.2(8)) do not put router alert
+ * option to IGMP packets destined to routable
+ * groups. It is very bad, because it means
+ * that we can forward NO IGMP messages.
+ */
+ struct sock *mroute_sk;
+
+ mroute_sk = rcu_dereference(mrt->mroute_sk);
+ if (mroute_sk) {
+ nf_reset(skb);
+ raw_rcv(mroute_sk, skb);
+ return 0;
+ }
+ }
+ }
+
+ /* already under rcu_read_lock() */
+ cache = ipmr_cache_find(mrt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr);
+ if (!cache) {
+ int vif = ipmr_find_vif(mrt, skb->dev);
+
+ if (vif >= 0)
+ cache = ipmr_cache_find_any(mrt, ip_hdr(skb)->daddr,
+ vif);
+ }
+
+ /*
+ * No usable cache entry
+ */
+ if (!cache) {
+ int vif;
+
+ if (local) {
+ struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
+ ip_local_deliver(skb);
+ if (!skb2)
+ return -ENOBUFS;
+ skb = skb2;
+ }
+
+ read_lock(&mrt_lock);
+ vif = ipmr_find_vif(mrt, skb->dev);
+ if (vif >= 0) {
+ int err2 = ipmr_cache_unresolved(mrt, vif, skb);
+ read_unlock(&mrt_lock);
+
+ return err2;
+ }
+ read_unlock(&mrt_lock);
+ kfree_skb(skb);
+ return -ENODEV;
+ }
+
+ read_lock(&mrt_lock);
+ ip_mr_forward(net, mrt, skb, cache, local);
+ read_unlock(&mrt_lock);
+
+ if (local)
+ return ip_local_deliver(skb);
+
+ return 0;
+
+dont_forward:
+ if (local)
+ return ip_local_deliver(skb);
+ kfree_skb(skb);
+ return 0;
+}
+
+#ifdef CONFIG_IP_PIMSM
+/* called with rcu_read_lock() */
+static int __pim_rcv(struct mr_table *mrt, struct sk_buff *skb,
+ unsigned int pimlen)
+{
+ struct net_device *reg_dev = NULL;
+ struct iphdr *encap;
+
+ encap = (struct iphdr *)(skb_transport_header(skb) + pimlen);
+ /*
+ * Check that:
+ * a. packet is really sent to a multicast group
+ * b. packet is not a NULL-REGISTER
+ * c. packet is not truncated
+ */
+ if (!ipv4_is_multicast(encap->daddr) ||
+ encap->tot_len == 0 ||
+ ntohs(encap->tot_len) + pimlen > skb->len)
+ return 1;
+
+ read_lock(&mrt_lock);
+ if (mrt->mroute_reg_vif_num >= 0)
+ reg_dev = mrt->vif_table[mrt->mroute_reg_vif_num].dev;
+ read_unlock(&mrt_lock);
+
+ if (!reg_dev)
+ return 1;
+
+ skb->mac_header = skb->network_header;
+ skb_pull(skb, (u8 *)encap - skb->data);
+ skb_reset_network_header(skb);
+ skb->protocol = htons(ETH_P_IP);
+ skb->ip_summed = CHECKSUM_NONE;
+
+ skb_tunnel_rx(skb, reg_dev, dev_net(reg_dev));
+
+ netif_rx(skb);
+
+ return NET_RX_SUCCESS;
+}
+#endif
+
+#ifdef CONFIG_IP_PIMSM_V1
+/*
+ * Handle IGMP messages of PIMv1
+ */
+
+int pim_rcv_v1(struct sk_buff *skb)
+{
+ struct igmphdr *pim;
+ struct net *net = dev_net(skb->dev);
+ struct mr_table *mrt;
+
+ if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(struct iphdr)))
+ goto drop;
+
+ pim = igmp_hdr(skb);
+
+ mrt = ipmr_rt_fib_lookup(net, skb);
+ if (IS_ERR(mrt))
+ goto drop;
+ if (!mrt->mroute_do_pim ||
+ pim->group != PIM_V1_VERSION || pim->code != PIM_V1_REGISTER)
+ goto drop;
+
+ if (__pim_rcv(mrt, skb, sizeof(*pim))) {
+drop:
+ kfree_skb(skb);
+ }
+ return 0;
+}
+#endif
+
+#ifdef CONFIG_IP_PIMSM_V2
+static int pim_rcv(struct sk_buff *skb)
+{
+ struct pimreghdr *pim;
+ struct net *net = dev_net(skb->dev);
+ struct mr_table *mrt;
+
+ if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(struct iphdr)))
+ goto drop;
+
+ pim = (struct pimreghdr *)skb_transport_header(skb);
+ if (pim->type != ((PIM_VERSION << 4) | (PIM_REGISTER)) ||
+ (pim->flags & PIM_NULL_REGISTER) ||
+ (ip_compute_csum((void *)pim, sizeof(*pim)) != 0 &&
+ csum_fold(skb_checksum(skb, 0, skb->len, 0))))
+ goto drop;
+
+ mrt = ipmr_rt_fib_lookup(net, skb);
+ if (IS_ERR(mrt))
+ goto drop;
+ if (__pim_rcv(mrt, skb, sizeof(*pim))) {
+drop:
+ kfree_skb(skb);
+ }
+ return 0;
+}
+#endif
+
+static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
+ struct mfc_cache *c, struct rtmsg *rtm)
+{
+ int ct;
+ struct rtnexthop *nhp;
+ struct nlattr *mp_attr;
+ struct rta_mfc_stats mfcs;
+
+ /* If cache is unresolved, don't try to parse IIF and OIF */
+ if (c->mfc_parent >= MAXVIFS)
+ return -ENOENT;
+
+ if (VIF_EXISTS(mrt, c->mfc_parent) &&
+ nla_put_u32(skb, RTA_IIF, mrt->vif_table[c->mfc_parent].dev->ifindex) < 0)
+ return -EMSGSIZE;
+
+ if (!(mp_attr = nla_nest_start(skb, RTA_MULTIPATH)))
+ return -EMSGSIZE;
+
+ for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) {
+ if (VIF_EXISTS(mrt, ct) && c->mfc_un.res.ttls[ct] < 255) {
+ if (!(nhp = nla_reserve_nohdr(skb, sizeof(*nhp)))) {
+ nla_nest_cancel(skb, mp_attr);
+ return -EMSGSIZE;
+ }
+
+ nhp->rtnh_flags = 0;
+ nhp->rtnh_hops = c->mfc_un.res.ttls[ct];
+ nhp->rtnh_ifindex = mrt->vif_table[ct].dev->ifindex;
+ nhp->rtnh_len = sizeof(*nhp);
+ }
+ }
+
+ nla_nest_end(skb, mp_attr);
+
+ mfcs.mfcs_packets = c->mfc_un.res.pkt;
+ mfcs.mfcs_bytes = c->mfc_un.res.bytes;
+ mfcs.mfcs_wrong_if = c->mfc_un.res.wrong_if;
+ if (nla_put(skb, RTA_MFC_STATS, sizeof(mfcs), &mfcs) < 0)
+ return -EMSGSIZE;
+
+ rtm->rtm_type = RTN_MULTICAST;
+ return 1;
+}
+
+int ipmr_get_route(struct net *net, struct sk_buff *skb,
+ __be32 saddr, __be32 daddr,
+ struct rtmsg *rtm, int nowait)
+{
+ struct mfc_cache *cache;
+ struct mr_table *mrt;
+ int err;
+
+ mrt = ipmr_get_table(net, RT_TABLE_DEFAULT);
+ if (!mrt)
+ return -ENOENT;
+
+ rcu_read_lock();
+ cache = ipmr_cache_find(mrt, saddr, daddr);
+ if (!cache && skb->dev) {
+ int vif = ipmr_find_vif(mrt, skb->dev);
+
+ if (vif >= 0)
+ cache = ipmr_cache_find_any(mrt, daddr, vif);
+ }
+ if (!cache) {
+ struct sk_buff *skb2;
+ struct iphdr *iph;
+ struct net_device *dev;
+ int vif = -1;
+
+ if (nowait) {
+ rcu_read_unlock();
+ return -EAGAIN;
+ }
+
+ dev = skb->dev;
+ read_lock(&mrt_lock);
+ if (dev)
+ vif = ipmr_find_vif(mrt, dev);
+ if (vif < 0) {
+ read_unlock(&mrt_lock);
+ rcu_read_unlock();
+ return -ENODEV;
+ }
+ skb2 = skb_clone(skb, GFP_ATOMIC);
+ if (!skb2) {
+ read_unlock(&mrt_lock);
+ rcu_read_unlock();
+ return -ENOMEM;
+ }
+
+ skb_push(skb2, sizeof(struct iphdr));
+ skb_reset_network_header(skb2);
+ iph = ip_hdr(skb2);
+ iph->ihl = sizeof(struct iphdr) >> 2;
+ iph->saddr = saddr;
+ iph->daddr = daddr;
+ iph->version = 0;
+ err = ipmr_cache_unresolved(mrt, vif, skb2);
+ read_unlock(&mrt_lock);
+ rcu_read_unlock();
+ return err;
+ }
+
+ read_lock(&mrt_lock);
+ if (!nowait && (rtm->rtm_flags & RTM_F_NOTIFY))
+ cache->mfc_flags |= MFC_NOTIFY;
+ err = __ipmr_fill_mroute(mrt, skb, cache, rtm);
+ read_unlock(&mrt_lock);
+ rcu_read_unlock();
+ return err;
+}
+
+static int ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
+ u32 portid, u32 seq, struct mfc_cache *c, int cmd,
+ int flags)
+{
+ struct nlmsghdr *nlh;
+ struct rtmsg *rtm;
+ int err;
+
+ nlh = nlmsg_put(skb, portid, seq, cmd, sizeof(*rtm), flags);
+ if (!nlh)
+ return -EMSGSIZE;
+
+ rtm = nlmsg_data(nlh);
+ rtm->rtm_family = RTNL_FAMILY_IPMR;
+ rtm->rtm_dst_len = 32;
+ rtm->rtm_src_len = 32;
+ rtm->rtm_tos = 0;
+ rtm->rtm_table = mrt->id;
+ if (nla_put_u32(skb, RTA_TABLE, mrt->id))
+ goto nla_put_failure;
+ rtm->rtm_type = RTN_MULTICAST;
+ rtm->rtm_scope = RT_SCOPE_UNIVERSE;
+ if (c->mfc_flags & MFC_STATIC)
+ rtm->rtm_protocol = RTPROT_STATIC;
+ else
+ rtm->rtm_protocol = RTPROT_MROUTED;
+ rtm->rtm_flags = 0;
+
+ if (nla_put_in_addr(skb, RTA_SRC, c->mfc_origin) ||
+ nla_put_in_addr(skb, RTA_DST, c->mfc_mcastgrp))
+ goto nla_put_failure;
+ err = __ipmr_fill_mroute(mrt, skb, c, rtm);
+ /* do not break the dump if cache is unresolved */
+ if (err < 0 && err != -ENOENT)
+ goto nla_put_failure;
+
+ nlmsg_end(skb, nlh);
+ return 0;
+
+nla_put_failure:
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+}
+
+static size_t mroute_msgsize(bool unresolved, int maxvif)
+{
+ size_t len =
+ NLMSG_ALIGN(sizeof(struct rtmsg))
+ + nla_total_size(4) /* RTA_TABLE */
+ + nla_total_size(4) /* RTA_SRC */
+ + nla_total_size(4) /* RTA_DST */
+ ;
+
+ if (!unresolved)
+ len = len
+ + nla_total_size(4) /* RTA_IIF */
+ + nla_total_size(0) /* RTA_MULTIPATH */
+ + maxvif * NLA_ALIGN(sizeof(struct rtnexthop))
+ /* RTA_MFC_STATS */
+ + nla_total_size(sizeof(struct rta_mfc_stats))
+ ;
+
+ return len;
+}
+
+static void mroute_netlink_event(struct mr_table *mrt, struct mfc_cache *mfc,
+ int cmd)
+{
+ struct net *net = read_pnet(&mrt->net);
+ struct sk_buff *skb;
+ int err = -ENOBUFS;
+
+ skb = nlmsg_new(mroute_msgsize(mfc->mfc_parent >= MAXVIFS, mrt->maxvif),
+ GFP_ATOMIC);
+ if (!skb)
+ goto errout;
+
+ err = ipmr_fill_mroute(mrt, skb, 0, 0, mfc, cmd, 0);
+ if (err < 0)
+ goto errout;
+
+ rtnl_notify(skb, net, 0, RTNLGRP_IPV4_MROUTE, NULL, GFP_ATOMIC);
+ return;
+
+errout:
+ kfree_skb(skb);
+ if (err < 0)
+ rtnl_set_sk_err(net, RTNLGRP_IPV4_MROUTE, err);
+}
+
+static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct net *net = sock_net(skb->sk);
+ struct mr_table *mrt;
+ struct mfc_cache *mfc;
+ unsigned int t = 0, s_t;
+ unsigned int h = 0, s_h;
+ unsigned int e = 0, s_e;
+
+ s_t = cb->args[0];
+ s_h = cb->args[1];
+ s_e = cb->args[2];
+
+ rcu_read_lock();
+ ipmr_for_each_table(mrt, net) {
+ if (t < s_t)
+ goto next_table;
+ if (t > s_t)
+ s_h = 0;
+ for (h = s_h; h < MFC_LINES; h++) {
+ list_for_each_entry_rcu(mfc, &mrt->mfc_cache_array[h], list) {
+ if (e < s_e)
+ goto next_entry;
+ if (ipmr_fill_mroute(mrt, skb,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ mfc, RTM_NEWROUTE,
+ NLM_F_MULTI) < 0)
+ goto done;
+next_entry:
+ e++;
+ }
+ e = s_e = 0;
+ }
+ spin_lock_bh(&mfc_unres_lock);
+ list_for_each_entry(mfc, &mrt->mfc_unres_queue, list) {
+ if (e < s_e)
+ goto next_entry2;
+ if (ipmr_fill_mroute(mrt, skb,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ mfc, RTM_NEWROUTE,
+ NLM_F_MULTI) < 0) {
+ spin_unlock_bh(&mfc_unres_lock);
+ goto done;
+ }
+next_entry2:
+ e++;
+ }
+ spin_unlock_bh(&mfc_unres_lock);
+ e = s_e = 0;
+ s_h = 0;
+next_table:
+ t++;
+ }
+done:
+ rcu_read_unlock();
+
+ cb->args[2] = e;
+ cb->args[1] = h;
+ cb->args[0] = t;
+
+ return skb->len;
+}
+
+#ifdef CONFIG_PROC_FS
+/*
+ * The /proc interfaces to multicast routing :
+ * /proc/net/ip_mr_cache & /proc/net/ip_mr_vif
+ */
+struct ipmr_vif_iter {
+ struct seq_net_private p;
+ struct mr_table *mrt;
+ int ct;
+};
+
+static struct vif_device *ipmr_vif_seq_idx(struct net *net,
+ struct ipmr_vif_iter *iter,
+ loff_t pos)
+{
+ struct mr_table *mrt = iter->mrt;
+
+ for (iter->ct = 0; iter->ct < mrt->maxvif; ++iter->ct) {
+ if (!VIF_EXISTS(mrt, iter->ct))
+ continue;
+ if (pos-- == 0)
+ return &mrt->vif_table[iter->ct];
+ }
+ return NULL;
+}
+
+static void *ipmr_vif_seq_start(struct seq_file *seq, loff_t *pos)
+ __acquires(mrt_lock)
+{
+ struct ipmr_vif_iter *iter = seq->private;
+ struct net *net = seq_file_net(seq);
+ struct mr_table *mrt;
+
+ mrt = ipmr_get_table(net, RT_TABLE_DEFAULT);
+ if (!mrt)
+ return ERR_PTR(-ENOENT);
+
+ iter->mrt = mrt;
+
+ read_lock(&mrt_lock);
+ return *pos ? ipmr_vif_seq_idx(net, seq->private, *pos - 1)
+ : SEQ_START_TOKEN;
+}
+
+static void *ipmr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct ipmr_vif_iter *iter = seq->private;
+ struct net *net = seq_file_net(seq);
+ struct mr_table *mrt = iter->mrt;
+
+ ++*pos;
+ if (v == SEQ_START_TOKEN)
+ return ipmr_vif_seq_idx(net, iter, 0);
+
+ while (++iter->ct < mrt->maxvif) {
+ if (!VIF_EXISTS(mrt, iter->ct))
+ continue;
+ return &mrt->vif_table[iter->ct];
+ }
+ return NULL;
+}
+
+static void ipmr_vif_seq_stop(struct seq_file *seq, void *v)
+ __releases(mrt_lock)
+{
+ read_unlock(&mrt_lock);
+}
+
+static int ipmr_vif_seq_show(struct seq_file *seq, void *v)
+{
+ struct ipmr_vif_iter *iter = seq->private;
+ struct mr_table *mrt = iter->mrt;
+
+ if (v == SEQ_START_TOKEN) {
+ seq_puts(seq,
+ "Interface BytesIn PktsIn BytesOut PktsOut Flags Local Remote\n");
+ } else {
+ const struct vif_device *vif = v;
+ const char *name = vif->dev ? vif->dev->name : "none";
+
+ seq_printf(seq,
+ "%2Zd %-10s %8ld %7ld %8ld %7ld %05X %08X %08X\n",
+ vif - mrt->vif_table,
+ name, vif->bytes_in, vif->pkt_in,
+ vif->bytes_out, vif->pkt_out,
+ vif->flags, vif->local, vif->remote);
+ }
+ return 0;
+}
+
+static const struct seq_operations ipmr_vif_seq_ops = {
+ .start = ipmr_vif_seq_start,
+ .next = ipmr_vif_seq_next,
+ .stop = ipmr_vif_seq_stop,
+ .show = ipmr_vif_seq_show,
+};
+
+static int ipmr_vif_open(struct inode *inode, struct file *file)
+{
+ return seq_open_net(inode, file, &ipmr_vif_seq_ops,
+ sizeof(struct ipmr_vif_iter));
+}
+
+static const struct file_operations ipmr_vif_fops = {
+ .owner = THIS_MODULE,
+ .open = ipmr_vif_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_net,
+};
+
+struct ipmr_mfc_iter {
+ struct seq_net_private p;
+ struct mr_table *mrt;
+ struct list_head *cache;
+ int ct;
+};
+
+
+static struct mfc_cache *ipmr_mfc_seq_idx(struct net *net,
+ struct ipmr_mfc_iter *it, loff_t pos)
+{
+ struct mr_table *mrt = it->mrt;
+ struct mfc_cache *mfc;
+
+ rcu_read_lock();
+ for (it->ct = 0; it->ct < MFC_LINES; it->ct++) {
+ it->cache = &mrt->mfc_cache_array[it->ct];
+ list_for_each_entry_rcu(mfc, it->cache, list)
+ if (pos-- == 0)
+ return mfc;
+ }
+ rcu_read_unlock();
+
+ spin_lock_bh(&mfc_unres_lock);
+ it->cache = &mrt->mfc_unres_queue;
+ list_for_each_entry(mfc, it->cache, list)
+ if (pos-- == 0)
+ return mfc;
+ spin_unlock_bh(&mfc_unres_lock);
+
+ it->cache = NULL;
+ return NULL;
+}
+
+
+static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ struct ipmr_mfc_iter *it = seq->private;
+ struct net *net = seq_file_net(seq);
+ struct mr_table *mrt;
+
+ mrt = ipmr_get_table(net, RT_TABLE_DEFAULT);
+ if (!mrt)
+ return ERR_PTR(-ENOENT);
+
+ it->mrt = mrt;
+ it->cache = NULL;
+ it->ct = 0;
+ return *pos ? ipmr_mfc_seq_idx(net, seq->private, *pos - 1)
+ : SEQ_START_TOKEN;
+}
+
+static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct mfc_cache *mfc = v;
+ struct ipmr_mfc_iter *it = seq->private;
+ struct net *net = seq_file_net(seq);
+ struct mr_table *mrt = it->mrt;
+
+ ++*pos;
+
+ if (v == SEQ_START_TOKEN)
+ return ipmr_mfc_seq_idx(net, seq->private, 0);
+
+ if (mfc->list.next != it->cache)
+ return list_entry(mfc->list.next, struct mfc_cache, list);
+
+ if (it->cache == &mrt->mfc_unres_queue)
+ goto end_of_list;
+
+ BUG_ON(it->cache != &mrt->mfc_cache_array[it->ct]);
+
+ while (++it->ct < MFC_LINES) {
+ it->cache = &mrt->mfc_cache_array[it->ct];
+ if (list_empty(it->cache))
+ continue;
+ return list_first_entry(it->cache, struct mfc_cache, list);
+ }
+
+ /* exhausted cache_array, show unresolved */
+ rcu_read_unlock();
+ it->cache = &mrt->mfc_unres_queue;
+ it->ct = 0;
+
+ spin_lock_bh(&mfc_unres_lock);
+ if (!list_empty(it->cache))
+ return list_first_entry(it->cache, struct mfc_cache, list);
+
+end_of_list:
+ spin_unlock_bh(&mfc_unres_lock);
+ it->cache = NULL;
+
+ return NULL;
+}
+
+static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v)
+{
+ struct ipmr_mfc_iter *it = seq->private;
+ struct mr_table *mrt = it->mrt;
+
+ if (it->cache == &mrt->mfc_unres_queue)
+ spin_unlock_bh(&mfc_unres_lock);
+ else if (it->cache == &mrt->mfc_cache_array[it->ct])
+ rcu_read_unlock();
+}
+
+static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
+{
+ int n;
+
+ if (v == SEQ_START_TOKEN) {
+ seq_puts(seq,
+ "Group Origin Iif Pkts Bytes Wrong Oifs\n");
+ } else {
+ const struct mfc_cache *mfc = v;
+ const struct ipmr_mfc_iter *it = seq->private;
+ const struct mr_table *mrt = it->mrt;
+
+ seq_printf(seq, "%08X %08X %-3hd",
+ (__force u32) mfc->mfc_mcastgrp,
+ (__force u32) mfc->mfc_origin,
+ mfc->mfc_parent);
+
+ if (it->cache != &mrt->mfc_unres_queue) {
+ seq_printf(seq, " %8lu %8lu %8lu",
+ mfc->mfc_un.res.pkt,
+ mfc->mfc_un.res.bytes,
+ mfc->mfc_un.res.wrong_if);
+ for (n = mfc->mfc_un.res.minvif;
+ n < mfc->mfc_un.res.maxvif; n++) {
+ if (VIF_EXISTS(mrt, n) &&
+ mfc->mfc_un.res.ttls[n] < 255)
+ seq_printf(seq,
+ " %2d:%-3d",
+ n, mfc->mfc_un.res.ttls[n]);
+ }
+ } else {
+ /* unresolved mfc_caches don't contain
+ * pkt, bytes and wrong_if values
+ */
+ seq_printf(seq, " %8lu %8lu %8lu", 0ul, 0ul, 0ul);
+ }
+ seq_putc(seq, '\n');
+ }
+ return 0;
+}
+
+static const struct seq_operations ipmr_mfc_seq_ops = {
+ .start = ipmr_mfc_seq_start,
+ .next = ipmr_mfc_seq_next,
+ .stop = ipmr_mfc_seq_stop,
+ .show = ipmr_mfc_seq_show,
+};
+
+static int ipmr_mfc_open(struct inode *inode, struct file *file)
+{
+ return seq_open_net(inode, file, &ipmr_mfc_seq_ops,
+ sizeof(struct ipmr_mfc_iter));
+}
+
+static const struct file_operations ipmr_mfc_fops = {
+ .owner = THIS_MODULE,
+ .open = ipmr_mfc_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_net,
+};
+#endif
+
+#ifdef CONFIG_IP_PIMSM_V2
+static const struct net_protocol pim_protocol = {
+ .handler = pim_rcv,
+ .netns_ok = 1,
+};
+#endif
+
+
+/*
+ * Setup for IP multicast routing
+ */
+static int __net_init ipmr_net_init(struct net *net)
+{
+ int err;
+
+ err = ipmr_rules_init(net);
+ if (err < 0)
+ goto fail;
+
+#ifdef CONFIG_PROC_FS
+ err = -ENOMEM;
+ if (!proc_create("ip_mr_vif", 0, net->proc_net, &ipmr_vif_fops))
+ goto proc_vif_fail;
+ if (!proc_create("ip_mr_cache", 0, net->proc_net, &ipmr_mfc_fops))
+ goto proc_cache_fail;
+#endif
+ return 0;
+
+#ifdef CONFIG_PROC_FS
+proc_cache_fail:
+ remove_proc_entry("ip_mr_vif", net->proc_net);
+proc_vif_fail:
+ ipmr_rules_exit(net);
+#endif
+fail:
+ return err;
+}
+
+static void __net_exit ipmr_net_exit(struct net *net)
+{
+#ifdef CONFIG_PROC_FS
+ remove_proc_entry("ip_mr_cache", net->proc_net);
+ remove_proc_entry("ip_mr_vif", net->proc_net);
+#endif
+ ipmr_rules_exit(net);
+}
+
+static struct pernet_operations ipmr_net_ops = {
+ .init = ipmr_net_init,
+ .exit = ipmr_net_exit,
+};
+
+int __init ip_mr_init(void)
+{
+ int err;
+
+ mrt_cachep = kmem_cache_create("ip_mrt_cache",
+ sizeof(struct mfc_cache),
+ 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC,
+ NULL);
+ if (!mrt_cachep)
+ return -ENOMEM;
+
+ err = register_pernet_subsys(&ipmr_net_ops);
+ if (err)
+ goto reg_pernet_fail;
+
+ err = register_netdevice_notifier(&ip_mr_notifier);
+ if (err)
+ goto reg_notif_fail;
+#ifdef CONFIG_IP_PIMSM_V2
+ if (inet_add_protocol(&pim_protocol, IPPROTO_PIM) < 0) {
+ pr_err("%s: can't add PIM protocol\n", __func__);
+ err = -EAGAIN;
+ goto add_proto_fail;
+ }
+#endif
+ rtnl_register(RTNL_FAMILY_IPMR, RTM_GETROUTE,
+ NULL, ipmr_rtm_dumproute, NULL);
+ return 0;
+
+#ifdef CONFIG_IP_PIMSM_V2
+add_proto_fail:
+ unregister_netdevice_notifier(&ip_mr_notifier);
+#endif
+reg_notif_fail:
+ unregister_pernet_subsys(&ipmr_net_ops);
+reg_pernet_fail:
+ kmem_cache_destroy(mrt_cachep);
+ return err;
+}
diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c
new file mode 100644
index 000000000..65de0684e
--- /dev/null
+++ b/net/ipv4/netfilter.c
@@ -0,0 +1,207 @@
+/*
+ * IPv4 specific functions of netfilter core
+ *
+ * Rusty Russell (C) 2000 -- This code is GPL.
+ * Patrick McHardy (C) 2006-2012
+ */
+#include <linux/kernel.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/ip.h>
+#include <linux/skbuff.h>
+#include <linux/gfp.h>
+#include <linux/export.h>
+#include <net/route.h>
+#include <net/xfrm.h>
+#include <net/ip.h>
+#include <net/netfilter/nf_queue.h>
+
+/* route_me_harder function, used by iptable_nat, iptable_mangle + ip_queue */
+int ip_route_me_harder(struct sk_buff *skb, unsigned int addr_type)
+{
+ struct net *net = dev_net(skb_dst(skb)->dev);
+ const struct iphdr *iph = ip_hdr(skb);
+ struct rtable *rt;
+ struct flowi4 fl4 = {};
+ __be32 saddr = iph->saddr;
+ __u8 flags = skb->sk ? inet_sk_flowi_flags(skb->sk) : 0;
+ unsigned int hh_len;
+
+ if (addr_type == RTN_UNSPEC)
+ addr_type = inet_addr_type(net, saddr);
+ if (addr_type == RTN_LOCAL || addr_type == RTN_UNICAST)
+ flags |= FLOWI_FLAG_ANYSRC;
+ else
+ saddr = 0;
+
+ /* some non-standard hacks like ipt_REJECT.c:send_reset() can cause
+ * packets with foreign saddr to appear on the NF_INET_LOCAL_OUT hook.
+ */
+ fl4.daddr = iph->daddr;
+ fl4.saddr = saddr;
+ fl4.flowi4_tos = RT_TOS(iph->tos);
+ fl4.flowi4_oif = skb->sk ? skb->sk->sk_bound_dev_if : 0;
+ fl4.flowi4_mark = skb->mark;
+ fl4.flowi4_flags = flags;
+ rt = ip_route_output_key(net, &fl4);
+ if (IS_ERR(rt))
+ return PTR_ERR(rt);
+
+ /* Drop old route. */
+ skb_dst_drop(skb);
+ skb_dst_set(skb, &rt->dst);
+
+ if (skb_dst(skb)->error)
+ return skb_dst(skb)->error;
+
+#ifdef CONFIG_XFRM
+ if (!(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) &&
+ xfrm_decode_session(skb, flowi4_to_flowi(&fl4), AF_INET) == 0) {
+ struct dst_entry *dst = skb_dst(skb);
+ skb_dst_set(skb, NULL);
+ dst = xfrm_lookup(net, dst, flowi4_to_flowi(&fl4), skb->sk, 0);
+ if (IS_ERR(dst))
+ return PTR_ERR(dst);
+ skb_dst_set(skb, dst);
+ }
+#endif
+
+ /* Change in oif may mean change in hh_len. */
+ hh_len = skb_dst(skb)->dev->hard_header_len;
+ if (skb_headroom(skb) < hh_len &&
+ pskb_expand_head(skb, HH_DATA_ALIGN(hh_len - skb_headroom(skb)),
+ 0, GFP_ATOMIC))
+ return -ENOMEM;
+
+ return 0;
+}
+EXPORT_SYMBOL(ip_route_me_harder);
+
+/*
+ * Extra routing may needed on local out, as the QUEUE target never
+ * returns control to the table.
+ */
+
+struct ip_rt_info {
+ __be32 daddr;
+ __be32 saddr;
+ u_int8_t tos;
+ u_int32_t mark;
+};
+
+static void nf_ip_saveroute(const struct sk_buff *skb,
+ struct nf_queue_entry *entry)
+{
+ struct ip_rt_info *rt_info = nf_queue_entry_reroute(entry);
+
+ if (entry->state.hook == NF_INET_LOCAL_OUT) {
+ const struct iphdr *iph = ip_hdr(skb);
+
+ rt_info->tos = iph->tos;
+ rt_info->daddr = iph->daddr;
+ rt_info->saddr = iph->saddr;
+ rt_info->mark = skb->mark;
+ }
+}
+
+static int nf_ip_reroute(struct sk_buff *skb,
+ const struct nf_queue_entry *entry)
+{
+ const struct ip_rt_info *rt_info = nf_queue_entry_reroute(entry);
+
+ if (entry->state.hook == NF_INET_LOCAL_OUT) {
+ const struct iphdr *iph = ip_hdr(skb);
+
+ if (!(iph->tos == rt_info->tos &&
+ skb->mark == rt_info->mark &&
+ iph->daddr == rt_info->daddr &&
+ iph->saddr == rt_info->saddr))
+ return ip_route_me_harder(skb, RTN_UNSPEC);
+ }
+ return 0;
+}
+
+__sum16 nf_ip_checksum(struct sk_buff *skb, unsigned int hook,
+ unsigned int dataoff, u_int8_t protocol)
+{
+ const struct iphdr *iph = ip_hdr(skb);
+ __sum16 csum = 0;
+
+ switch (skb->ip_summed) {
+ case CHECKSUM_COMPLETE:
+ if (hook != NF_INET_PRE_ROUTING && hook != NF_INET_LOCAL_IN)
+ break;
+ if ((protocol == 0 && !csum_fold(skb->csum)) ||
+ !csum_tcpudp_magic(iph->saddr, iph->daddr,
+ skb->len - dataoff, protocol,
+ skb->csum)) {
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ break;
+ }
+ /* fall through */
+ case CHECKSUM_NONE:
+ if (protocol == 0)
+ skb->csum = 0;
+ else
+ skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
+ skb->len - dataoff,
+ protocol, 0);
+ csum = __skb_checksum_complete(skb);
+ }
+ return csum;
+}
+EXPORT_SYMBOL(nf_ip_checksum);
+
+static __sum16 nf_ip_checksum_partial(struct sk_buff *skb, unsigned int hook,
+ unsigned int dataoff, unsigned int len,
+ u_int8_t protocol)
+{
+ const struct iphdr *iph = ip_hdr(skb);
+ __sum16 csum = 0;
+
+ switch (skb->ip_summed) {
+ case CHECKSUM_COMPLETE:
+ if (len == skb->len - dataoff)
+ return nf_ip_checksum(skb, hook, dataoff, protocol);
+ /* fall through */
+ case CHECKSUM_NONE:
+ skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr, protocol,
+ skb->len - dataoff, 0);
+ skb->ip_summed = CHECKSUM_NONE;
+ return __skb_checksum_complete_head(skb, dataoff + len);
+ }
+ return csum;
+}
+
+static int nf_ip_route(struct net *net, struct dst_entry **dst,
+ struct flowi *fl, bool strict __always_unused)
+{
+ struct rtable *rt = ip_route_output_key(net, &fl->u.ip4);
+ if (IS_ERR(rt))
+ return PTR_ERR(rt);
+ *dst = &rt->dst;
+ return 0;
+}
+
+static const struct nf_afinfo nf_ip_afinfo = {
+ .family = AF_INET,
+ .checksum = nf_ip_checksum,
+ .checksum_partial = nf_ip_checksum_partial,
+ .route = nf_ip_route,
+ .saveroute = nf_ip_saveroute,
+ .reroute = nf_ip_reroute,
+ .route_key_size = sizeof(struct ip_rt_info),
+};
+
+static int __init ipv4_netfilter_init(void)
+{
+ return nf_register_afinfo(&nf_ip_afinfo);
+}
+
+static void __exit ipv4_netfilter_fini(void)
+{
+ nf_unregister_afinfo(&nf_ip_afinfo);
+}
+
+module_init(ipv4_netfilter_init);
+module_exit(ipv4_netfilter_fini);
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
new file mode 100644
index 000000000..fb20f3631
--- /dev/null
+++ b/net/ipv4/netfilter/Kconfig
@@ -0,0 +1,406 @@
+#
+# IP netfilter configuration
+#
+
+menu "IP: Netfilter Configuration"
+ depends on INET && NETFILTER
+
+config NF_DEFRAG_IPV4
+ tristate
+ default n
+
+config NF_CONNTRACK_IPV4
+ tristate "IPv4 connection tracking support (required for NAT)"
+ depends on NF_CONNTRACK
+ default m if NETFILTER_ADVANCED=n
+ select NF_DEFRAG_IPV4
+ ---help---
+ Connection tracking keeps a record of what packets have passed
+ through your machine, in order to figure out how they are related
+ into connections.
+
+ This is IPv4 support on Layer 3 independent connection tracking.
+ Layer 3 independent connection tracking is experimental scheme
+ which generalize ip_conntrack to support other layer 3 protocols.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config NF_CONNTRACK_PROC_COMPAT
+ bool "proc/sysctl compatibility with old connection tracking"
+ depends on NF_CONNTRACK_PROCFS && NF_CONNTRACK_IPV4
+ default y
+ help
+ This option enables /proc and sysctl compatibility with the old
+ layer 3 dependent connection tracking. This is needed to keep
+ old programs that have not been adapted to the new names working.
+
+ If unsure, say Y.
+
+if NF_TABLES
+
+config NF_TABLES_IPV4
+ tristate "IPv4 nf_tables support"
+ help
+ This option enables the IPv4 support for nf_tables.
+
+if NF_TABLES_IPV4
+
+config NFT_CHAIN_ROUTE_IPV4
+ tristate "IPv4 nf_tables route chain support"
+ help
+ This option enables the "route" chain for IPv4 in nf_tables. This
+ chain type is used to force packet re-routing after mangling header
+ fields such as the source, destination, type of service and
+ the packet mark.
+
+config NFT_REJECT_IPV4
+ select NF_REJECT_IPV4
+ default NFT_REJECT
+ tristate
+
+endif # NF_TABLES_IPV4
+
+config NF_TABLES_ARP
+ tristate "ARP nf_tables support"
+ help
+ This option enables the ARP support for nf_tables.
+
+endif # NF_TABLES
+
+config NF_LOG_ARP
+ tristate "ARP packet logging"
+ default m if NETFILTER_ADVANCED=n
+ select NF_LOG_COMMON
+
+config NF_LOG_IPV4
+ tristate "IPv4 packet logging"
+ default m if NETFILTER_ADVANCED=n
+ select NF_LOG_COMMON
+
+config NF_REJECT_IPV4
+ tristate "IPv4 packet rejection"
+ default m if NETFILTER_ADVANCED=n
+
+config NF_NAT_IPV4
+ tristate "IPv4 NAT"
+ depends on NF_CONNTRACK_IPV4
+ default m if NETFILTER_ADVANCED=n
+ select NF_NAT
+ help
+ The IPv4 NAT option allows masquerading, port forwarding and other
+ forms of full Network Address Port Translation. This can be
+ controlled by iptables or nft.
+
+if NF_NAT_IPV4
+
+config NFT_CHAIN_NAT_IPV4
+ depends on NF_TABLES_IPV4
+ tristate "IPv4 nf_tables nat chain support"
+ help
+ This option enables the "nat" chain for IPv4 in nf_tables. This
+ chain type is used to perform Network Address Translation (NAT)
+ packet transformations such as the source, destination address and
+ source and destination ports.
+
+config NF_NAT_MASQUERADE_IPV4
+ tristate "IPv4 masquerade support"
+ help
+ This is the kernel functionality to provide NAT in the masquerade
+ flavour (automatic source address selection).
+
+config NFT_MASQ_IPV4
+ tristate "IPv4 masquerading support for nf_tables"
+ depends on NF_TABLES_IPV4
+ depends on NFT_MASQ
+ select NF_NAT_MASQUERADE_IPV4
+ help
+ This is the expression that provides IPv4 masquerading support for
+ nf_tables.
+
+config NFT_REDIR_IPV4
+ tristate "IPv4 redirect support for nf_tables"
+ depends on NF_TABLES_IPV4
+ depends on NFT_REDIR
+ select NF_NAT_REDIRECT
+ help
+ This is the expression that provides IPv4 redirect support for
+ nf_tables.
+
+config NF_NAT_SNMP_BASIC
+ tristate "Basic SNMP-ALG support"
+ depends on NF_CONNTRACK_SNMP
+ depends on NETFILTER_ADVANCED
+ default NF_NAT && NF_CONNTRACK_SNMP
+ ---help---
+
+ This module implements an Application Layer Gateway (ALG) for
+ SNMP payloads. In conjunction with NAT, it allows a network
+ management system to access multiple private networks with
+ conflicting addresses. It works by modifying IP addresses
+ inside SNMP payloads to match IP-layer NAT mapping.
+
+ This is the "basic" form of SNMP-ALG, as described in RFC 2962
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config NF_NAT_PROTO_GRE
+ tristate
+ depends on NF_CT_PROTO_GRE
+
+config NF_NAT_PPTP
+ tristate
+ depends on NF_CONNTRACK
+ default NF_CONNTRACK_PPTP
+ select NF_NAT_PROTO_GRE
+
+config NF_NAT_H323
+ tristate
+ depends on NF_CONNTRACK
+ default NF_CONNTRACK_H323
+
+endif # NF_NAT_IPV4
+
+config IP_NF_IPTABLES
+ tristate "IP tables support (required for filtering/masq/NAT)"
+ default m if NETFILTER_ADVANCED=n
+ select NETFILTER_XTABLES
+ help
+ iptables is a general, extensible packet identification framework.
+ The packet filtering and full NAT (masquerading, port forwarding,
+ etc) subsystems now use this: say `Y' or `M' here if you want to use
+ either of those.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+if IP_NF_IPTABLES
+
+# The matches.
+config IP_NF_MATCH_AH
+ tristate '"ah" match support'
+ depends on NETFILTER_ADVANCED
+ help
+ This match extension allows you to match a range of SPIs
+ inside AH header of IPSec packets.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP_NF_MATCH_ECN
+ tristate '"ecn" match support'
+ depends on NETFILTER_ADVANCED
+ select NETFILTER_XT_MATCH_ECN
+ ---help---
+ This is a backwards-compat option for the user's convenience
+ (e.g. when running oldconfig). It selects
+ CONFIG_NETFILTER_XT_MATCH_ECN.
+
+config IP_NF_MATCH_RPFILTER
+ tristate '"rpfilter" reverse path filter match support'
+ depends on NETFILTER_ADVANCED && (IP_NF_MANGLE || IP_NF_RAW)
+ ---help---
+ This option allows you to match packets whose replies would
+ go out via the interface the packet came in.
+
+ To compile it as a module, choose M here. If unsure, say N.
+ The module will be called ipt_rpfilter.
+
+config IP_NF_MATCH_TTL
+ tristate '"ttl" match support'
+ depends on NETFILTER_ADVANCED
+ select NETFILTER_XT_MATCH_HL
+ ---help---
+ This is a backwards-compat option for the user's convenience
+ (e.g. when running oldconfig). It selects
+ CONFIG_NETFILTER_XT_MATCH_HL.
+
+# `filter', generic and specific targets
+config IP_NF_FILTER
+ tristate "Packet filtering"
+ default m if NETFILTER_ADVANCED=n
+ help
+ Packet filtering defines a table `filter', which has a series of
+ rules for simple packet filtering at local input, forwarding and
+ local output. See the man page for iptables(8).
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP_NF_TARGET_REJECT
+ tristate "REJECT target support"
+ depends on IP_NF_FILTER
+ select NF_REJECT_IPV4
+ default m if NETFILTER_ADVANCED=n
+ help
+ The REJECT target allows a filtering rule to specify that an ICMP
+ error should be issued in response to an incoming packet, rather
+ than silently being dropped.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP_NF_TARGET_SYNPROXY
+ tristate "SYNPROXY target support"
+ depends on NF_CONNTRACK && NETFILTER_ADVANCED
+ select NETFILTER_SYNPROXY
+ select SYN_COOKIES
+ help
+ The SYNPROXY target allows you to intercept TCP connections and
+ establish them using syncookies before they are passed on to the
+ server. This allows to avoid conntrack and server resource usage
+ during SYN-flood attacks.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+# NAT + specific targets: nf_conntrack
+config IP_NF_NAT
+ tristate "iptables NAT support"
+ depends on NF_CONNTRACK_IPV4
+ default m if NETFILTER_ADVANCED=n
+ select NF_NAT
+ select NF_NAT_IPV4
+ select NETFILTER_XT_NAT
+ help
+ This enables the `nat' table in iptables. This allows masquerading,
+ port forwarding and other forms of full Network Address Port
+ Translation.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+if IP_NF_NAT
+
+config IP_NF_TARGET_MASQUERADE
+ tristate "MASQUERADE target support"
+ select NF_NAT_MASQUERADE_IPV4
+ default m if NETFILTER_ADVANCED=n
+ help
+ Masquerading is a special case of NAT: all outgoing connections are
+ changed to seem to come from a particular interface's address, and
+ if the interface goes down, those connections are lost. This is
+ only useful for dialup accounts with dynamic IP address (ie. your IP
+ address will be different on next dialup).
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP_NF_TARGET_NETMAP
+ tristate "NETMAP target support"
+ depends on NETFILTER_ADVANCED
+ select NETFILTER_XT_TARGET_NETMAP
+ ---help---
+ This is a backwards-compat option for the user's convenience
+ (e.g. when running oldconfig). It selects
+ CONFIG_NETFILTER_XT_TARGET_NETMAP.
+
+config IP_NF_TARGET_REDIRECT
+ tristate "REDIRECT target support"
+ depends on NETFILTER_ADVANCED
+ select NETFILTER_XT_TARGET_REDIRECT
+ ---help---
+ This is a backwards-compat option for the user's convenience
+ (e.g. when running oldconfig). It selects
+ CONFIG_NETFILTER_XT_TARGET_REDIRECT.
+
+endif # IP_NF_NAT
+
+# mangle + specific targets
+config IP_NF_MANGLE
+ tristate "Packet mangling"
+ default m if NETFILTER_ADVANCED=n
+ help
+ This option adds a `mangle' table to iptables: see the man page for
+ iptables(8). This table is used for various packet alterations
+ which can effect how the packet is routed.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP_NF_TARGET_CLUSTERIP
+ tristate "CLUSTERIP target support"
+ depends on IP_NF_MANGLE
+ depends on NF_CONNTRACK_IPV4
+ depends on NETFILTER_ADVANCED
+ select NF_CONNTRACK_MARK
+ help
+ The CLUSTERIP target allows you to build load-balancing clusters of
+ network servers without having a dedicated load-balancing
+ router/server/switch.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP_NF_TARGET_ECN
+ tristate "ECN target support"
+ depends on IP_NF_MANGLE
+ depends on NETFILTER_ADVANCED
+ ---help---
+ This option adds a `ECN' target, which can be used in the iptables mangle
+ table.
+
+ You can use this target to remove the ECN bits from the IPv4 header of
+ an IP packet. This is particularly useful, if you need to work around
+ existing ECN blackholes on the internet, but don't want to disable
+ ECN support in general.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP_NF_TARGET_TTL
+ tristate '"TTL" target support'
+ depends on NETFILTER_ADVANCED && IP_NF_MANGLE
+ select NETFILTER_XT_TARGET_HL
+ ---help---
+ This is a backwards-compatible option for the user's convenience
+ (e.g. when running oldconfig). It selects
+ CONFIG_NETFILTER_XT_TARGET_HL.
+
+# raw + specific targets
+config IP_NF_RAW
+ tristate 'raw table support (required for NOTRACK/TRACE)'
+ help
+ This option adds a `raw' table to iptables. This table is the very
+ first in the netfilter framework and hooks in at the PREROUTING
+ and OUTPUT chains.
+
+ If you want to compile it as a module, say M here and read
+ <file:Documentation/kbuild/modules.txt>. If unsure, say `N'.
+
+# security table for MAC policy
+config IP_NF_SECURITY
+ tristate "Security table"
+ depends on SECURITY
+ depends on NETFILTER_ADVANCED
+ help
+ This option adds a `security' table to iptables, for use
+ with Mandatory Access Control (MAC) policy.
+
+ If unsure, say N.
+
+endif # IP_NF_IPTABLES
+
+# ARP tables
+config IP_NF_ARPTABLES
+ tristate "ARP tables support"
+ select NETFILTER_XTABLES
+ depends on NETFILTER_ADVANCED
+ help
+ arptables is a general, extensible packet identification framework.
+ The ARP packet filtering and mangling (manipulation)subsystems
+ use this: say Y or M here if you want to use either of those.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+if IP_NF_ARPTABLES
+
+config IP_NF_ARPFILTER
+ tristate "ARP packet filtering"
+ help
+ ARP packet filtering defines a table `filter', which has a series of
+ rules for simple ARP packet filtering at local input and
+ local output. On a bridge, you can also specify filtering rules
+ for forwarded ARP packets. See the man page for arptables(8).
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP_NF_ARP_MANGLE
+ tristate "ARP payload mangling"
+ help
+ Allows altering the ARP packet payload: source and destination
+ hardware and network addresses.
+
+endif # IP_NF_ARPTABLES
+
+endmenu
+
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
new file mode 100644
index 000000000..7fe6c7035
--- /dev/null
+++ b/net/ipv4/netfilter/Makefile
@@ -0,0 +1,72 @@
+#
+# Makefile for the netfilter modules on top of IPv4.
+#
+
+# objects for l3 independent conntrack
+nf_conntrack_ipv4-y := nf_conntrack_l3proto_ipv4.o nf_conntrack_proto_icmp.o
+ifeq ($(CONFIG_NF_CONNTRACK_PROC_COMPAT),y)
+ifeq ($(CONFIG_PROC_FS),y)
+nf_conntrack_ipv4-objs += nf_conntrack_l3proto_ipv4_compat.o
+endif
+endif
+
+# connection tracking
+obj-$(CONFIG_NF_CONNTRACK_IPV4) += nf_conntrack_ipv4.o
+
+nf_nat_ipv4-y := nf_nat_l3proto_ipv4.o nf_nat_proto_icmp.o
+obj-$(CONFIG_NF_NAT_IPV4) += nf_nat_ipv4.o
+
+# defrag
+obj-$(CONFIG_NF_DEFRAG_IPV4) += nf_defrag_ipv4.o
+
+# logging
+obj-$(CONFIG_NF_LOG_ARP) += nf_log_arp.o
+obj-$(CONFIG_NF_LOG_IPV4) += nf_log_ipv4.o
+
+# reject
+obj-$(CONFIG_NF_REJECT_IPV4) += nf_reject_ipv4.o
+
+# NAT helpers (nf_conntrack)
+obj-$(CONFIG_NF_NAT_H323) += nf_nat_h323.o
+obj-$(CONFIG_NF_NAT_PPTP) += nf_nat_pptp.o
+obj-$(CONFIG_NF_NAT_SNMP_BASIC) += nf_nat_snmp_basic.o
+obj-$(CONFIG_NF_NAT_MASQUERADE_IPV4) += nf_nat_masquerade_ipv4.o
+
+# NAT protocols (nf_nat)
+obj-$(CONFIG_NF_NAT_PROTO_GRE) += nf_nat_proto_gre.o
+
+obj-$(CONFIG_NF_TABLES_IPV4) += nf_tables_ipv4.o
+obj-$(CONFIG_NFT_CHAIN_ROUTE_IPV4) += nft_chain_route_ipv4.o
+obj-$(CONFIG_NFT_CHAIN_NAT_IPV4) += nft_chain_nat_ipv4.o
+obj-$(CONFIG_NFT_REJECT_IPV4) += nft_reject_ipv4.o
+obj-$(CONFIG_NFT_MASQ_IPV4) += nft_masq_ipv4.o
+obj-$(CONFIG_NFT_REDIR_IPV4) += nft_redir_ipv4.o
+obj-$(CONFIG_NF_TABLES_ARP) += nf_tables_arp.o
+
+# generic IP tables
+obj-$(CONFIG_IP_NF_IPTABLES) += ip_tables.o
+
+# the three instances of ip_tables
+obj-$(CONFIG_IP_NF_FILTER) += iptable_filter.o
+obj-$(CONFIG_IP_NF_MANGLE) += iptable_mangle.o
+obj-$(CONFIG_IP_NF_NAT) += iptable_nat.o
+obj-$(CONFIG_IP_NF_RAW) += iptable_raw.o
+obj-$(CONFIG_IP_NF_SECURITY) += iptable_security.o
+
+# matches
+obj-$(CONFIG_IP_NF_MATCH_AH) += ipt_ah.o
+obj-$(CONFIG_IP_NF_MATCH_RPFILTER) += ipt_rpfilter.o
+
+# targets
+obj-$(CONFIG_IP_NF_TARGET_CLUSTERIP) += ipt_CLUSTERIP.o
+obj-$(CONFIG_IP_NF_TARGET_ECN) += ipt_ECN.o
+obj-$(CONFIG_IP_NF_TARGET_MASQUERADE) += ipt_MASQUERADE.o
+obj-$(CONFIG_IP_NF_TARGET_REJECT) += ipt_REJECT.o
+obj-$(CONFIG_IP_NF_TARGET_SYNPROXY) += ipt_SYNPROXY.o
+
+# generic ARP tables
+obj-$(CONFIG_IP_NF_ARPTABLES) += arp_tables.o
+obj-$(CONFIG_IP_NF_ARP_MANGLE) += arpt_mangle.o
+
+# just filtering instance of ARP tables for now
+obj-$(CONFIG_IP_NF_ARPFILTER) += arptable_filter.o
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
new file mode 100644
index 000000000..a61200754
--- /dev/null
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -0,0 +1,1926 @@
+/*
+ * Packet matching code for ARP packets.
+ *
+ * Based heavily, if not almost entirely, upon ip_tables.c framework.
+ *
+ * Some ARP specific bits are:
+ *
+ * Copyright (C) 2002 David S. Miller (davem@redhat.com)
+ * Copyright (C) 2006-2009 Patrick McHardy <kaber@trash.net>
+ *
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/capability.h>
+#include <linux/if_arp.h>
+#include <linux/kmod.h>
+#include <linux/vmalloc.h>
+#include <linux/proc_fs.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/mutex.h>
+#include <linux/err.h>
+#include <net/compat.h>
+#include <net/sock.h>
+#include <asm/uaccess.h>
+
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_arp/arp_tables.h>
+#include "../../netfilter/xt_repldata.h"
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("David S. Miller <davem@redhat.com>");
+MODULE_DESCRIPTION("arptables core");
+
+/*#define DEBUG_ARP_TABLES*/
+/*#define DEBUG_ARP_TABLES_USER*/
+
+#ifdef DEBUG_ARP_TABLES
+#define dprintf(format, args...) printk(format , ## args)
+#else
+#define dprintf(format, args...)
+#endif
+
+#ifdef DEBUG_ARP_TABLES_USER
+#define duprintf(format, args...) printk(format , ## args)
+#else
+#define duprintf(format, args...)
+#endif
+
+#ifdef CONFIG_NETFILTER_DEBUG
+#define ARP_NF_ASSERT(x) WARN_ON(!(x))
+#else
+#define ARP_NF_ASSERT(x)
+#endif
+
+void *arpt_alloc_initial_table(const struct xt_table *info)
+{
+ return xt_alloc_initial_table(arpt, ARPT);
+}
+EXPORT_SYMBOL_GPL(arpt_alloc_initial_table);
+
+static inline int arp_devaddr_compare(const struct arpt_devaddr_info *ap,
+ const char *hdr_addr, int len)
+{
+ int i, ret;
+
+ if (len > ARPT_DEV_ADDR_LEN_MAX)
+ len = ARPT_DEV_ADDR_LEN_MAX;
+
+ ret = 0;
+ for (i = 0; i < len; i++)
+ ret |= (hdr_addr[i] ^ ap->addr[i]) & ap->mask[i];
+
+ return ret != 0;
+}
+
+/*
+ * Unfortunately, _b and _mask are not aligned to an int (or long int)
+ * Some arches dont care, unrolling the loop is a win on them.
+ * For other arches, we only have a 16bit alignement.
+ */
+static unsigned long ifname_compare(const char *_a, const char *_b, const char *_mask)
+{
+#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+ unsigned long ret = ifname_compare_aligned(_a, _b, _mask);
+#else
+ unsigned long ret = 0;
+ const u16 *a = (const u16 *)_a;
+ const u16 *b = (const u16 *)_b;
+ const u16 *mask = (const u16 *)_mask;
+ int i;
+
+ for (i = 0; i < IFNAMSIZ/sizeof(u16); i++)
+ ret |= (a[i] ^ b[i]) & mask[i];
+#endif
+ return ret;
+}
+
+/* Returns whether packet matches rule or not. */
+static inline int arp_packet_match(const struct arphdr *arphdr,
+ struct net_device *dev,
+ const char *indev,
+ const char *outdev,
+ const struct arpt_arp *arpinfo)
+{
+ const char *arpptr = (char *)(arphdr + 1);
+ const char *src_devaddr, *tgt_devaddr;
+ __be32 src_ipaddr, tgt_ipaddr;
+ long ret;
+
+#define FWINV(bool, invflg) ((bool) ^ !!(arpinfo->invflags & (invflg)))
+
+ if (FWINV((arphdr->ar_op & arpinfo->arpop_mask) != arpinfo->arpop,
+ ARPT_INV_ARPOP)) {
+ dprintf("ARP operation field mismatch.\n");
+ dprintf("ar_op: %04x info->arpop: %04x info->arpop_mask: %04x\n",
+ arphdr->ar_op, arpinfo->arpop, arpinfo->arpop_mask);
+ return 0;
+ }
+
+ if (FWINV((arphdr->ar_hrd & arpinfo->arhrd_mask) != arpinfo->arhrd,
+ ARPT_INV_ARPHRD)) {
+ dprintf("ARP hardware address format mismatch.\n");
+ dprintf("ar_hrd: %04x info->arhrd: %04x info->arhrd_mask: %04x\n",
+ arphdr->ar_hrd, arpinfo->arhrd, arpinfo->arhrd_mask);
+ return 0;
+ }
+
+ if (FWINV((arphdr->ar_pro & arpinfo->arpro_mask) != arpinfo->arpro,
+ ARPT_INV_ARPPRO)) {
+ dprintf("ARP protocol address format mismatch.\n");
+ dprintf("ar_pro: %04x info->arpro: %04x info->arpro_mask: %04x\n",
+ arphdr->ar_pro, arpinfo->arpro, arpinfo->arpro_mask);
+ return 0;
+ }
+
+ if (FWINV((arphdr->ar_hln & arpinfo->arhln_mask) != arpinfo->arhln,
+ ARPT_INV_ARPHLN)) {
+ dprintf("ARP hardware address length mismatch.\n");
+ dprintf("ar_hln: %02x info->arhln: %02x info->arhln_mask: %02x\n",
+ arphdr->ar_hln, arpinfo->arhln, arpinfo->arhln_mask);
+ return 0;
+ }
+
+ src_devaddr = arpptr;
+ arpptr += dev->addr_len;
+ memcpy(&src_ipaddr, arpptr, sizeof(u32));
+ arpptr += sizeof(u32);
+ tgt_devaddr = arpptr;
+ arpptr += dev->addr_len;
+ memcpy(&tgt_ipaddr, arpptr, sizeof(u32));
+
+ if (FWINV(arp_devaddr_compare(&arpinfo->src_devaddr, src_devaddr, dev->addr_len),
+ ARPT_INV_SRCDEVADDR) ||
+ FWINV(arp_devaddr_compare(&arpinfo->tgt_devaddr, tgt_devaddr, dev->addr_len),
+ ARPT_INV_TGTDEVADDR)) {
+ dprintf("Source or target device address mismatch.\n");
+
+ return 0;
+ }
+
+ if (FWINV((src_ipaddr & arpinfo->smsk.s_addr) != arpinfo->src.s_addr,
+ ARPT_INV_SRCIP) ||
+ FWINV(((tgt_ipaddr & arpinfo->tmsk.s_addr) != arpinfo->tgt.s_addr),
+ ARPT_INV_TGTIP)) {
+ dprintf("Source or target IP address mismatch.\n");
+
+ dprintf("SRC: %pI4. Mask: %pI4. Target: %pI4.%s\n",
+ &src_ipaddr,
+ &arpinfo->smsk.s_addr,
+ &arpinfo->src.s_addr,
+ arpinfo->invflags & ARPT_INV_SRCIP ? " (INV)" : "");
+ dprintf("TGT: %pI4 Mask: %pI4 Target: %pI4.%s\n",
+ &tgt_ipaddr,
+ &arpinfo->tmsk.s_addr,
+ &arpinfo->tgt.s_addr,
+ arpinfo->invflags & ARPT_INV_TGTIP ? " (INV)" : "");
+ return 0;
+ }
+
+ /* Look for ifname matches. */
+ ret = ifname_compare(indev, arpinfo->iniface, arpinfo->iniface_mask);
+
+ if (FWINV(ret != 0, ARPT_INV_VIA_IN)) {
+ dprintf("VIA in mismatch (%s vs %s).%s\n",
+ indev, arpinfo->iniface,
+ arpinfo->invflags&ARPT_INV_VIA_IN ?" (INV)":"");
+ return 0;
+ }
+
+ ret = ifname_compare(outdev, arpinfo->outiface, arpinfo->outiface_mask);
+
+ if (FWINV(ret != 0, ARPT_INV_VIA_OUT)) {
+ dprintf("VIA out mismatch (%s vs %s).%s\n",
+ outdev, arpinfo->outiface,
+ arpinfo->invflags&ARPT_INV_VIA_OUT ?" (INV)":"");
+ return 0;
+ }
+
+ return 1;
+#undef FWINV
+}
+
+static inline int arp_checkentry(const struct arpt_arp *arp)
+{
+ if (arp->flags & ~ARPT_F_MASK) {
+ duprintf("Unknown flag bits set: %08X\n",
+ arp->flags & ~ARPT_F_MASK);
+ return 0;
+ }
+ if (arp->invflags & ~ARPT_INV_MASK) {
+ duprintf("Unknown invflag bits set: %08X\n",
+ arp->invflags & ~ARPT_INV_MASK);
+ return 0;
+ }
+
+ return 1;
+}
+
+static unsigned int
+arpt_error(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ net_err_ratelimited("arp_tables: error: '%s'\n",
+ (const char *)par->targinfo);
+
+ return NF_DROP;
+}
+
+static inline const struct xt_entry_target *
+arpt_get_target_c(const struct arpt_entry *e)
+{
+ return arpt_get_target((struct arpt_entry *)e);
+}
+
+static inline struct arpt_entry *
+get_entry(const void *base, unsigned int offset)
+{
+ return (struct arpt_entry *)(base + offset);
+}
+
+static inline __pure
+struct arpt_entry *arpt_next_entry(const struct arpt_entry *entry)
+{
+ return (void *)entry + entry->next_offset;
+}
+
+unsigned int arpt_do_table(struct sk_buff *skb,
+ unsigned int hook,
+ const struct nf_hook_state *state,
+ struct xt_table *table)
+{
+ static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long))));
+ unsigned int verdict = NF_DROP;
+ const struct arphdr *arp;
+ struct arpt_entry *e, *back;
+ const char *indev, *outdev;
+ void *table_base;
+ const struct xt_table_info *private;
+ struct xt_action_param acpar;
+ unsigned int addend;
+
+ if (!pskb_may_pull(skb, arp_hdr_len(skb->dev)))
+ return NF_DROP;
+
+ indev = state->in ? state->in->name : nulldevname;
+ outdev = state->out ? state->out->name : nulldevname;
+
+ local_bh_disable();
+ addend = xt_write_recseq_begin();
+ private = table->private;
+ /*
+ * Ensure we load private-> members after we've fetched the base
+ * pointer.
+ */
+ smp_read_barrier_depends();
+ table_base = private->entries[smp_processor_id()];
+
+ e = get_entry(table_base, private->hook_entry[hook]);
+ back = get_entry(table_base, private->underflow[hook]);
+
+ acpar.in = state->in;
+ acpar.out = state->out;
+ acpar.hooknum = hook;
+ acpar.family = NFPROTO_ARP;
+ acpar.hotdrop = false;
+
+ arp = arp_hdr(skb);
+ do {
+ const struct xt_entry_target *t;
+
+ if (!arp_packet_match(arp, skb->dev, indev, outdev, &e->arp)) {
+ e = arpt_next_entry(e);
+ continue;
+ }
+
+ ADD_COUNTER(e->counters, arp_hdr_len(skb->dev), 1);
+
+ t = arpt_get_target_c(e);
+
+ /* Standard target? */
+ if (!t->u.kernel.target->target) {
+ int v;
+
+ v = ((struct xt_standard_target *)t)->verdict;
+ if (v < 0) {
+ /* Pop from stack? */
+ if (v != XT_RETURN) {
+ verdict = (unsigned int)(-v) - 1;
+ break;
+ }
+ e = back;
+ back = get_entry(table_base, back->comefrom);
+ continue;
+ }
+ if (table_base + v
+ != arpt_next_entry(e)) {
+ /* Save old back ptr in next entry */
+ struct arpt_entry *next = arpt_next_entry(e);
+ next->comefrom = (void *)back - table_base;
+
+ /* set back pointer to next entry */
+ back = next;
+ }
+
+ e = get_entry(table_base, v);
+ continue;
+ }
+
+ /* Targets which reenter must return
+ * abs. verdicts
+ */
+ acpar.target = t->u.kernel.target;
+ acpar.targinfo = t->data;
+ verdict = t->u.kernel.target->target(skb, &acpar);
+
+ /* Target might have changed stuff. */
+ arp = arp_hdr(skb);
+
+ if (verdict == XT_CONTINUE)
+ e = arpt_next_entry(e);
+ else
+ /* Verdict */
+ break;
+ } while (!acpar.hotdrop);
+ xt_write_recseq_end(addend);
+ local_bh_enable();
+
+ if (acpar.hotdrop)
+ return NF_DROP;
+ else
+ return verdict;
+}
+
+/* All zeroes == unconditional rule. */
+static inline bool unconditional(const struct arpt_arp *arp)
+{
+ static const struct arpt_arp uncond;
+
+ return memcmp(arp, &uncond, sizeof(uncond)) == 0;
+}
+
+/* Figures out from what hook each rule can be called: returns 0 if
+ * there are loops. Puts hook bitmask in comefrom.
+ */
+static int mark_source_chains(const struct xt_table_info *newinfo,
+ unsigned int valid_hooks, void *entry0)
+{
+ unsigned int hook;
+
+ /* No recursion; use packet counter to save back ptrs (reset
+ * to 0 as we leave), and comefrom to save source hook bitmask.
+ */
+ for (hook = 0; hook < NF_ARP_NUMHOOKS; hook++) {
+ unsigned int pos = newinfo->hook_entry[hook];
+ struct arpt_entry *e
+ = (struct arpt_entry *)(entry0 + pos);
+
+ if (!(valid_hooks & (1 << hook)))
+ continue;
+
+ /* Set initial back pointer. */
+ e->counters.pcnt = pos;
+
+ for (;;) {
+ const struct xt_standard_target *t
+ = (void *)arpt_get_target_c(e);
+ int visited = e->comefrom & (1 << hook);
+
+ if (e->comefrom & (1 << NF_ARP_NUMHOOKS)) {
+ pr_notice("arptables: loop hook %u pos %u %08X.\n",
+ hook, pos, e->comefrom);
+ return 0;
+ }
+ e->comefrom
+ |= ((1 << hook) | (1 << NF_ARP_NUMHOOKS));
+
+ /* Unconditional return/END. */
+ if ((e->target_offset == sizeof(struct arpt_entry) &&
+ (strcmp(t->target.u.user.name,
+ XT_STANDARD_TARGET) == 0) &&
+ t->verdict < 0 && unconditional(&e->arp)) ||
+ visited) {
+ unsigned int oldpos, size;
+
+ if ((strcmp(t->target.u.user.name,
+ XT_STANDARD_TARGET) == 0) &&
+ t->verdict < -NF_MAX_VERDICT - 1) {
+ duprintf("mark_source_chains: bad "
+ "negative verdict (%i)\n",
+ t->verdict);
+ return 0;
+ }
+
+ /* Return: backtrack through the last
+ * big jump.
+ */
+ do {
+ e->comefrom ^= (1<<NF_ARP_NUMHOOKS);
+ oldpos = pos;
+ pos = e->counters.pcnt;
+ e->counters.pcnt = 0;
+
+ /* We're at the start. */
+ if (pos == oldpos)
+ goto next;
+
+ e = (struct arpt_entry *)
+ (entry0 + pos);
+ } while (oldpos == pos + e->next_offset);
+
+ /* Move along one */
+ size = e->next_offset;
+ e = (struct arpt_entry *)
+ (entry0 + pos + size);
+ e->counters.pcnt = pos;
+ pos += size;
+ } else {
+ int newpos = t->verdict;
+
+ if (strcmp(t->target.u.user.name,
+ XT_STANDARD_TARGET) == 0 &&
+ newpos >= 0) {
+ if (newpos > newinfo->size -
+ sizeof(struct arpt_entry)) {
+ duprintf("mark_source_chains: "
+ "bad verdict (%i)\n",
+ newpos);
+ return 0;
+ }
+
+ /* This a jump; chase it. */
+ duprintf("Jump rule %u -> %u\n",
+ pos, newpos);
+ } else {
+ /* ... this is a fallthru */
+ newpos = pos + e->next_offset;
+ }
+ e = (struct arpt_entry *)
+ (entry0 + newpos);
+ e->counters.pcnt = pos;
+ pos = newpos;
+ }
+ }
+ next:
+ duprintf("Finished chain %u\n", hook);
+ }
+ return 1;
+}
+
+static inline int check_entry(const struct arpt_entry *e, const char *name)
+{
+ const struct xt_entry_target *t;
+
+ if (!arp_checkentry(&e->arp)) {
+ duprintf("arp_tables: arp check failed %p %s.\n", e, name);
+ return -EINVAL;
+ }
+
+ if (e->target_offset + sizeof(struct xt_entry_target) > e->next_offset)
+ return -EINVAL;
+
+ t = arpt_get_target_c(e);
+ if (e->target_offset + t->u.target_size > e->next_offset)
+ return -EINVAL;
+
+ return 0;
+}
+
+static inline int check_target(struct arpt_entry *e, const char *name)
+{
+ struct xt_entry_target *t = arpt_get_target(e);
+ int ret;
+ struct xt_tgchk_param par = {
+ .table = name,
+ .entryinfo = e,
+ .target = t->u.kernel.target,
+ .targinfo = t->data,
+ .hook_mask = e->comefrom,
+ .family = NFPROTO_ARP,
+ };
+
+ ret = xt_check_target(&par, t->u.target_size - sizeof(*t), 0, false);
+ if (ret < 0) {
+ duprintf("arp_tables: check failed for `%s'.\n",
+ t->u.kernel.target->name);
+ return ret;
+ }
+ return 0;
+}
+
+static inline int
+find_check_entry(struct arpt_entry *e, const char *name, unsigned int size)
+{
+ struct xt_entry_target *t;
+ struct xt_target *target;
+ int ret;
+
+ ret = check_entry(e, name);
+ if (ret)
+ return ret;
+
+ t = arpt_get_target(e);
+ target = xt_request_find_target(NFPROTO_ARP, t->u.user.name,
+ t->u.user.revision);
+ if (IS_ERR(target)) {
+ duprintf("find_check_entry: `%s' not found\n", t->u.user.name);
+ ret = PTR_ERR(target);
+ goto out;
+ }
+ t->u.kernel.target = target;
+
+ ret = check_target(e, name);
+ if (ret)
+ goto err;
+ return 0;
+err:
+ module_put(t->u.kernel.target->me);
+out:
+ return ret;
+}
+
+static bool check_underflow(const struct arpt_entry *e)
+{
+ const struct xt_entry_target *t;
+ unsigned int verdict;
+
+ if (!unconditional(&e->arp))
+ return false;
+ t = arpt_get_target_c(e);
+ if (strcmp(t->u.user.name, XT_STANDARD_TARGET) != 0)
+ return false;
+ verdict = ((struct xt_standard_target *)t)->verdict;
+ verdict = -verdict - 1;
+ return verdict == NF_DROP || verdict == NF_ACCEPT;
+}
+
+static inline int check_entry_size_and_hooks(struct arpt_entry *e,
+ struct xt_table_info *newinfo,
+ const unsigned char *base,
+ const unsigned char *limit,
+ const unsigned int *hook_entries,
+ const unsigned int *underflows,
+ unsigned int valid_hooks)
+{
+ unsigned int h;
+
+ if ((unsigned long)e % __alignof__(struct arpt_entry) != 0 ||
+ (unsigned char *)e + sizeof(struct arpt_entry) >= limit) {
+ duprintf("Bad offset %p\n", e);
+ return -EINVAL;
+ }
+
+ if (e->next_offset
+ < sizeof(struct arpt_entry) + sizeof(struct xt_entry_target)) {
+ duprintf("checking: element %p size %u\n",
+ e, e->next_offset);
+ return -EINVAL;
+ }
+
+ /* Check hooks & underflows */
+ for (h = 0; h < NF_ARP_NUMHOOKS; h++) {
+ if (!(valid_hooks & (1 << h)))
+ continue;
+ if ((unsigned char *)e - base == hook_entries[h])
+ newinfo->hook_entry[h] = hook_entries[h];
+ if ((unsigned char *)e - base == underflows[h]) {
+ if (!check_underflow(e)) {
+ pr_err("Underflows must be unconditional and "
+ "use the STANDARD target with "
+ "ACCEPT/DROP\n");
+ return -EINVAL;
+ }
+ newinfo->underflow[h] = underflows[h];
+ }
+ }
+
+ /* Clear counters and comefrom */
+ e->counters = ((struct xt_counters) { 0, 0 });
+ e->comefrom = 0;
+ return 0;
+}
+
+static inline void cleanup_entry(struct arpt_entry *e)
+{
+ struct xt_tgdtor_param par;
+ struct xt_entry_target *t;
+
+ t = arpt_get_target(e);
+ par.target = t->u.kernel.target;
+ par.targinfo = t->data;
+ par.family = NFPROTO_ARP;
+ if (par.target->destroy != NULL)
+ par.target->destroy(&par);
+ module_put(par.target->me);
+}
+
+/* Checks and translates the user-supplied table segment (held in
+ * newinfo).
+ */
+static int translate_table(struct xt_table_info *newinfo, void *entry0,
+ const struct arpt_replace *repl)
+{
+ struct arpt_entry *iter;
+ unsigned int i;
+ int ret = 0;
+
+ newinfo->size = repl->size;
+ newinfo->number = repl->num_entries;
+
+ /* Init all hooks to impossible value. */
+ for (i = 0; i < NF_ARP_NUMHOOKS; i++) {
+ newinfo->hook_entry[i] = 0xFFFFFFFF;
+ newinfo->underflow[i] = 0xFFFFFFFF;
+ }
+
+ duprintf("translate_table: size %u\n", newinfo->size);
+ i = 0;
+
+ /* Walk through entries, checking offsets. */
+ xt_entry_foreach(iter, entry0, newinfo->size) {
+ ret = check_entry_size_and_hooks(iter, newinfo, entry0,
+ entry0 + repl->size,
+ repl->hook_entry,
+ repl->underflow,
+ repl->valid_hooks);
+ if (ret != 0)
+ break;
+ ++i;
+ if (strcmp(arpt_get_target(iter)->u.user.name,
+ XT_ERROR_TARGET) == 0)
+ ++newinfo->stacksize;
+ }
+ duprintf("translate_table: ARPT_ENTRY_ITERATE gives %d\n", ret);
+ if (ret != 0)
+ return ret;
+
+ if (i != repl->num_entries) {
+ duprintf("translate_table: %u not %u entries\n",
+ i, repl->num_entries);
+ return -EINVAL;
+ }
+
+ /* Check hooks all assigned */
+ for (i = 0; i < NF_ARP_NUMHOOKS; i++) {
+ /* Only hooks which are valid */
+ if (!(repl->valid_hooks & (1 << i)))
+ continue;
+ if (newinfo->hook_entry[i] == 0xFFFFFFFF) {
+ duprintf("Invalid hook entry %u %u\n",
+ i, repl->hook_entry[i]);
+ return -EINVAL;
+ }
+ if (newinfo->underflow[i] == 0xFFFFFFFF) {
+ duprintf("Invalid underflow %u %u\n",
+ i, repl->underflow[i]);
+ return -EINVAL;
+ }
+ }
+
+ if (!mark_source_chains(newinfo, repl->valid_hooks, entry0)) {
+ duprintf("Looping hook\n");
+ return -ELOOP;
+ }
+
+ /* Finally, each sanity check must pass */
+ i = 0;
+ xt_entry_foreach(iter, entry0, newinfo->size) {
+ ret = find_check_entry(iter, repl->name, repl->size);
+ if (ret != 0)
+ break;
+ ++i;
+ }
+
+ if (ret != 0) {
+ xt_entry_foreach(iter, entry0, newinfo->size) {
+ if (i-- == 0)
+ break;
+ cleanup_entry(iter);
+ }
+ return ret;
+ }
+
+ /* And one copy for every other CPU */
+ for_each_possible_cpu(i) {
+ if (newinfo->entries[i] && newinfo->entries[i] != entry0)
+ memcpy(newinfo->entries[i], entry0, newinfo->size);
+ }
+
+ return ret;
+}
+
+static void get_counters(const struct xt_table_info *t,
+ struct xt_counters counters[])
+{
+ struct arpt_entry *iter;
+ unsigned int cpu;
+ unsigned int i;
+
+ for_each_possible_cpu(cpu) {
+ seqcount_t *s = &per_cpu(xt_recseq, cpu);
+
+ i = 0;
+ xt_entry_foreach(iter, t->entries[cpu], t->size) {
+ u64 bcnt, pcnt;
+ unsigned int start;
+
+ do {
+ start = read_seqcount_begin(s);
+ bcnt = iter->counters.bcnt;
+ pcnt = iter->counters.pcnt;
+ } while (read_seqcount_retry(s, start));
+
+ ADD_COUNTER(counters[i], bcnt, pcnt);
+ ++i;
+ }
+ }
+}
+
+static struct xt_counters *alloc_counters(const struct xt_table *table)
+{
+ unsigned int countersize;
+ struct xt_counters *counters;
+ const struct xt_table_info *private = table->private;
+
+ /* We need atomic snapshot of counters: rest doesn't change
+ * (other than comefrom, which userspace doesn't care
+ * about).
+ */
+ countersize = sizeof(struct xt_counters) * private->number;
+ counters = vzalloc(countersize);
+
+ if (counters == NULL)
+ return ERR_PTR(-ENOMEM);
+
+ get_counters(private, counters);
+
+ return counters;
+}
+
+static int copy_entries_to_user(unsigned int total_size,
+ const struct xt_table *table,
+ void __user *userptr)
+{
+ unsigned int off, num;
+ const struct arpt_entry *e;
+ struct xt_counters *counters;
+ struct xt_table_info *private = table->private;
+ int ret = 0;
+ void *loc_cpu_entry;
+
+ counters = alloc_counters(table);
+ if (IS_ERR(counters))
+ return PTR_ERR(counters);
+
+ loc_cpu_entry = private->entries[raw_smp_processor_id()];
+ /* ... then copy entire thing ... */
+ if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) {
+ ret = -EFAULT;
+ goto free_counters;
+ }
+
+ /* FIXME: use iterator macros --RR */
+ /* ... then go back and fix counters and names */
+ for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){
+ const struct xt_entry_target *t;
+
+ e = (struct arpt_entry *)(loc_cpu_entry + off);
+ if (copy_to_user(userptr + off
+ + offsetof(struct arpt_entry, counters),
+ &counters[num],
+ sizeof(counters[num])) != 0) {
+ ret = -EFAULT;
+ goto free_counters;
+ }
+
+ t = arpt_get_target_c(e);
+ if (copy_to_user(userptr + off + e->target_offset
+ + offsetof(struct xt_entry_target,
+ u.user.name),
+ t->u.kernel.target->name,
+ strlen(t->u.kernel.target->name)+1) != 0) {
+ ret = -EFAULT;
+ goto free_counters;
+ }
+ }
+
+ free_counters:
+ vfree(counters);
+ return ret;
+}
+
+#ifdef CONFIG_COMPAT
+static void compat_standard_from_user(void *dst, const void *src)
+{
+ int v = *(compat_int_t *)src;
+
+ if (v > 0)
+ v += xt_compat_calc_jump(NFPROTO_ARP, v);
+ memcpy(dst, &v, sizeof(v));
+}
+
+static int compat_standard_to_user(void __user *dst, const void *src)
+{
+ compat_int_t cv = *(int *)src;
+
+ if (cv > 0)
+ cv -= xt_compat_calc_jump(NFPROTO_ARP, cv);
+ return copy_to_user(dst, &cv, sizeof(cv)) ? -EFAULT : 0;
+}
+
+static int compat_calc_entry(const struct arpt_entry *e,
+ const struct xt_table_info *info,
+ const void *base, struct xt_table_info *newinfo)
+{
+ const struct xt_entry_target *t;
+ unsigned int entry_offset;
+ int off, i, ret;
+
+ off = sizeof(struct arpt_entry) - sizeof(struct compat_arpt_entry);
+ entry_offset = (void *)e - base;
+
+ t = arpt_get_target_c(e);
+ off += xt_compat_target_offset(t->u.kernel.target);
+ newinfo->size -= off;
+ ret = xt_compat_add_offset(NFPROTO_ARP, entry_offset, off);
+ if (ret)
+ return ret;
+
+ for (i = 0; i < NF_ARP_NUMHOOKS; i++) {
+ if (info->hook_entry[i] &&
+ (e < (struct arpt_entry *)(base + info->hook_entry[i])))
+ newinfo->hook_entry[i] -= off;
+ if (info->underflow[i] &&
+ (e < (struct arpt_entry *)(base + info->underflow[i])))
+ newinfo->underflow[i] -= off;
+ }
+ return 0;
+}
+
+static int compat_table_info(const struct xt_table_info *info,
+ struct xt_table_info *newinfo)
+{
+ struct arpt_entry *iter;
+ void *loc_cpu_entry;
+ int ret;
+
+ if (!newinfo || !info)
+ return -EINVAL;
+
+ /* we dont care about newinfo->entries[] */
+ memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
+ newinfo->initial_entries = 0;
+ loc_cpu_entry = info->entries[raw_smp_processor_id()];
+ xt_compat_init_offsets(NFPROTO_ARP, info->number);
+ xt_entry_foreach(iter, loc_cpu_entry, info->size) {
+ ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo);
+ if (ret != 0)
+ return ret;
+ }
+ return 0;
+}
+#endif
+
+static int get_info(struct net *net, void __user *user,
+ const int *len, int compat)
+{
+ char name[XT_TABLE_MAXNAMELEN];
+ struct xt_table *t;
+ int ret;
+
+ if (*len != sizeof(struct arpt_getinfo)) {
+ duprintf("length %u != %Zu\n", *len,
+ sizeof(struct arpt_getinfo));
+ return -EINVAL;
+ }
+
+ if (copy_from_user(name, user, sizeof(name)) != 0)
+ return -EFAULT;
+
+ name[XT_TABLE_MAXNAMELEN-1] = '\0';
+#ifdef CONFIG_COMPAT
+ if (compat)
+ xt_compat_lock(NFPROTO_ARP);
+#endif
+ t = try_then_request_module(xt_find_table_lock(net, NFPROTO_ARP, name),
+ "arptable_%s", name);
+ if (!IS_ERR_OR_NULL(t)) {
+ struct arpt_getinfo info;
+ const struct xt_table_info *private = t->private;
+#ifdef CONFIG_COMPAT
+ struct xt_table_info tmp;
+
+ if (compat) {
+ ret = compat_table_info(private, &tmp);
+ xt_compat_flush_offsets(NFPROTO_ARP);
+ private = &tmp;
+ }
+#endif
+ memset(&info, 0, sizeof(info));
+ info.valid_hooks = t->valid_hooks;
+ memcpy(info.hook_entry, private->hook_entry,
+ sizeof(info.hook_entry));
+ memcpy(info.underflow, private->underflow,
+ sizeof(info.underflow));
+ info.num_entries = private->number;
+ info.size = private->size;
+ strcpy(info.name, name);
+
+ if (copy_to_user(user, &info, *len) != 0)
+ ret = -EFAULT;
+ else
+ ret = 0;
+ xt_table_unlock(t);
+ module_put(t->me);
+ } else
+ ret = t ? PTR_ERR(t) : -ENOENT;
+#ifdef CONFIG_COMPAT
+ if (compat)
+ xt_compat_unlock(NFPROTO_ARP);
+#endif
+ return ret;
+}
+
+static int get_entries(struct net *net, struct arpt_get_entries __user *uptr,
+ const int *len)
+{
+ int ret;
+ struct arpt_get_entries get;
+ struct xt_table *t;
+
+ if (*len < sizeof(get)) {
+ duprintf("get_entries: %u < %Zu\n", *len, sizeof(get));
+ return -EINVAL;
+ }
+ if (copy_from_user(&get, uptr, sizeof(get)) != 0)
+ return -EFAULT;
+ if (*len != sizeof(struct arpt_get_entries) + get.size) {
+ duprintf("get_entries: %u != %Zu\n", *len,
+ sizeof(struct arpt_get_entries) + get.size);
+ return -EINVAL;
+ }
+
+ t = xt_find_table_lock(net, NFPROTO_ARP, get.name);
+ if (!IS_ERR_OR_NULL(t)) {
+ const struct xt_table_info *private = t->private;
+
+ duprintf("t->private->number = %u\n",
+ private->number);
+ if (get.size == private->size)
+ ret = copy_entries_to_user(private->size,
+ t, uptr->entrytable);
+ else {
+ duprintf("get_entries: I've got %u not %u!\n",
+ private->size, get.size);
+ ret = -EAGAIN;
+ }
+ module_put(t->me);
+ xt_table_unlock(t);
+ } else
+ ret = t ? PTR_ERR(t) : -ENOENT;
+
+ return ret;
+}
+
+static int __do_replace(struct net *net, const char *name,
+ unsigned int valid_hooks,
+ struct xt_table_info *newinfo,
+ unsigned int num_counters,
+ void __user *counters_ptr)
+{
+ int ret;
+ struct xt_table *t;
+ struct xt_table_info *oldinfo;
+ struct xt_counters *counters;
+ void *loc_cpu_old_entry;
+ struct arpt_entry *iter;
+
+ ret = 0;
+ counters = vzalloc(num_counters * sizeof(struct xt_counters));
+ if (!counters) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ t = try_then_request_module(xt_find_table_lock(net, NFPROTO_ARP, name),
+ "arptable_%s", name);
+ if (IS_ERR_OR_NULL(t)) {
+ ret = t ? PTR_ERR(t) : -ENOENT;
+ goto free_newinfo_counters_untrans;
+ }
+
+ /* You lied! */
+ if (valid_hooks != t->valid_hooks) {
+ duprintf("Valid hook crap: %08X vs %08X\n",
+ valid_hooks, t->valid_hooks);
+ ret = -EINVAL;
+ goto put_module;
+ }
+
+ oldinfo = xt_replace_table(t, num_counters, newinfo, &ret);
+ if (!oldinfo)
+ goto put_module;
+
+ /* Update module usage count based on number of rules */
+ duprintf("do_replace: oldnum=%u, initnum=%u, newnum=%u\n",
+ oldinfo->number, oldinfo->initial_entries, newinfo->number);
+ if ((oldinfo->number > oldinfo->initial_entries) ||
+ (newinfo->number <= oldinfo->initial_entries))
+ module_put(t->me);
+ if ((oldinfo->number > oldinfo->initial_entries) &&
+ (newinfo->number <= oldinfo->initial_entries))
+ module_put(t->me);
+
+ /* Get the old counters, and synchronize with replace */
+ get_counters(oldinfo, counters);
+
+ /* Decrease module usage counts and free resource */
+ loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()];
+ xt_entry_foreach(iter, loc_cpu_old_entry, oldinfo->size)
+ cleanup_entry(iter);
+
+ xt_free_table_info(oldinfo);
+ if (copy_to_user(counters_ptr, counters,
+ sizeof(struct xt_counters) * num_counters) != 0) {
+ /* Silent error, can't fail, new table is already in place */
+ net_warn_ratelimited("arptables: counters copy to user failed while replacing table\n");
+ }
+ vfree(counters);
+ xt_table_unlock(t);
+ return ret;
+
+ put_module:
+ module_put(t->me);
+ xt_table_unlock(t);
+ free_newinfo_counters_untrans:
+ vfree(counters);
+ out:
+ return ret;
+}
+
+static int do_replace(struct net *net, const void __user *user,
+ unsigned int len)
+{
+ int ret;
+ struct arpt_replace tmp;
+ struct xt_table_info *newinfo;
+ void *loc_cpu_entry;
+ struct arpt_entry *iter;
+
+ if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
+ return -EFAULT;
+
+ /* overflow check */
+ if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters))
+ return -ENOMEM;
+ if (tmp.num_counters == 0)
+ return -EINVAL;
+
+ tmp.name[sizeof(tmp.name)-1] = 0;
+
+ newinfo = xt_alloc_table_info(tmp.size);
+ if (!newinfo)
+ return -ENOMEM;
+
+ /* choose the copy that is on our node/cpu */
+ loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
+ if (copy_from_user(loc_cpu_entry, user + sizeof(tmp),
+ tmp.size) != 0) {
+ ret = -EFAULT;
+ goto free_newinfo;
+ }
+
+ ret = translate_table(newinfo, loc_cpu_entry, &tmp);
+ if (ret != 0)
+ goto free_newinfo;
+
+ duprintf("arp_tables: Translated table\n");
+
+ ret = __do_replace(net, tmp.name, tmp.valid_hooks, newinfo,
+ tmp.num_counters, tmp.counters);
+ if (ret)
+ goto free_newinfo_untrans;
+ return 0;
+
+ free_newinfo_untrans:
+ xt_entry_foreach(iter, loc_cpu_entry, newinfo->size)
+ cleanup_entry(iter);
+ free_newinfo:
+ xt_free_table_info(newinfo);
+ return ret;
+}
+
+static int do_add_counters(struct net *net, const void __user *user,
+ unsigned int len, int compat)
+{
+ unsigned int i, curcpu;
+ struct xt_counters_info tmp;
+ struct xt_counters *paddc;
+ unsigned int num_counters;
+ const char *name;
+ int size;
+ void *ptmp;
+ struct xt_table *t;
+ const struct xt_table_info *private;
+ int ret = 0;
+ void *loc_cpu_entry;
+ struct arpt_entry *iter;
+ unsigned int addend;
+#ifdef CONFIG_COMPAT
+ struct compat_xt_counters_info compat_tmp;
+
+ if (compat) {
+ ptmp = &compat_tmp;
+ size = sizeof(struct compat_xt_counters_info);
+ } else
+#endif
+ {
+ ptmp = &tmp;
+ size = sizeof(struct xt_counters_info);
+ }
+
+ if (copy_from_user(ptmp, user, size) != 0)
+ return -EFAULT;
+
+#ifdef CONFIG_COMPAT
+ if (compat) {
+ num_counters = compat_tmp.num_counters;
+ name = compat_tmp.name;
+ } else
+#endif
+ {
+ num_counters = tmp.num_counters;
+ name = tmp.name;
+ }
+
+ if (len != size + num_counters * sizeof(struct xt_counters))
+ return -EINVAL;
+
+ paddc = vmalloc(len - size);
+ if (!paddc)
+ return -ENOMEM;
+
+ if (copy_from_user(paddc, user + size, len - size) != 0) {
+ ret = -EFAULT;
+ goto free;
+ }
+
+ t = xt_find_table_lock(net, NFPROTO_ARP, name);
+ if (IS_ERR_OR_NULL(t)) {
+ ret = t ? PTR_ERR(t) : -ENOENT;
+ goto free;
+ }
+
+ local_bh_disable();
+ private = t->private;
+ if (private->number != num_counters) {
+ ret = -EINVAL;
+ goto unlock_up_free;
+ }
+
+ i = 0;
+ /* Choose the copy that is on our node */
+ curcpu = smp_processor_id();
+ loc_cpu_entry = private->entries[curcpu];
+ addend = xt_write_recseq_begin();
+ xt_entry_foreach(iter, loc_cpu_entry, private->size) {
+ ADD_COUNTER(iter->counters, paddc[i].bcnt, paddc[i].pcnt);
+ ++i;
+ }
+ xt_write_recseq_end(addend);
+ unlock_up_free:
+ local_bh_enable();
+ xt_table_unlock(t);
+ module_put(t->me);
+ free:
+ vfree(paddc);
+
+ return ret;
+}
+
+#ifdef CONFIG_COMPAT
+static inline void compat_release_entry(struct compat_arpt_entry *e)
+{
+ struct xt_entry_target *t;
+
+ t = compat_arpt_get_target(e);
+ module_put(t->u.kernel.target->me);
+}
+
+static inline int
+check_compat_entry_size_and_hooks(struct compat_arpt_entry *e,
+ struct xt_table_info *newinfo,
+ unsigned int *size,
+ const unsigned char *base,
+ const unsigned char *limit,
+ const unsigned int *hook_entries,
+ const unsigned int *underflows,
+ const char *name)
+{
+ struct xt_entry_target *t;
+ struct xt_target *target;
+ unsigned int entry_offset;
+ int ret, off, h;
+
+ duprintf("check_compat_entry_size_and_hooks %p\n", e);
+ if ((unsigned long)e % __alignof__(struct compat_arpt_entry) != 0 ||
+ (unsigned char *)e + sizeof(struct compat_arpt_entry) >= limit) {
+ duprintf("Bad offset %p, limit = %p\n", e, limit);
+ return -EINVAL;
+ }
+
+ if (e->next_offset < sizeof(struct compat_arpt_entry) +
+ sizeof(struct compat_xt_entry_target)) {
+ duprintf("checking: element %p size %u\n",
+ e, e->next_offset);
+ return -EINVAL;
+ }
+
+ /* For purposes of check_entry casting the compat entry is fine */
+ ret = check_entry((struct arpt_entry *)e, name);
+ if (ret)
+ return ret;
+
+ off = sizeof(struct arpt_entry) - sizeof(struct compat_arpt_entry);
+ entry_offset = (void *)e - (void *)base;
+
+ t = compat_arpt_get_target(e);
+ target = xt_request_find_target(NFPROTO_ARP, t->u.user.name,
+ t->u.user.revision);
+ if (IS_ERR(target)) {
+ duprintf("check_compat_entry_size_and_hooks: `%s' not found\n",
+ t->u.user.name);
+ ret = PTR_ERR(target);
+ goto out;
+ }
+ t->u.kernel.target = target;
+
+ off += xt_compat_target_offset(target);
+ *size += off;
+ ret = xt_compat_add_offset(NFPROTO_ARP, entry_offset, off);
+ if (ret)
+ goto release_target;
+
+ /* Check hooks & underflows */
+ for (h = 0; h < NF_ARP_NUMHOOKS; h++) {
+ if ((unsigned char *)e - base == hook_entries[h])
+ newinfo->hook_entry[h] = hook_entries[h];
+ if ((unsigned char *)e - base == underflows[h])
+ newinfo->underflow[h] = underflows[h];
+ }
+
+ /* Clear counters and comefrom */
+ memset(&e->counters, 0, sizeof(e->counters));
+ e->comefrom = 0;
+ return 0;
+
+release_target:
+ module_put(t->u.kernel.target->me);
+out:
+ return ret;
+}
+
+static int
+compat_copy_entry_from_user(struct compat_arpt_entry *e, void **dstptr,
+ unsigned int *size, const char *name,
+ struct xt_table_info *newinfo, unsigned char *base)
+{
+ struct xt_entry_target *t;
+ struct xt_target *target;
+ struct arpt_entry *de;
+ unsigned int origsize;
+ int ret, h;
+
+ ret = 0;
+ origsize = *size;
+ de = (struct arpt_entry *)*dstptr;
+ memcpy(de, e, sizeof(struct arpt_entry));
+ memcpy(&de->counters, &e->counters, sizeof(e->counters));
+
+ *dstptr += sizeof(struct arpt_entry);
+ *size += sizeof(struct arpt_entry) - sizeof(struct compat_arpt_entry);
+
+ de->target_offset = e->target_offset - (origsize - *size);
+ t = compat_arpt_get_target(e);
+ target = t->u.kernel.target;
+ xt_compat_target_from_user(t, dstptr, size);
+
+ de->next_offset = e->next_offset - (origsize - *size);
+ for (h = 0; h < NF_ARP_NUMHOOKS; h++) {
+ if ((unsigned char *)de - base < newinfo->hook_entry[h])
+ newinfo->hook_entry[h] -= origsize - *size;
+ if ((unsigned char *)de - base < newinfo->underflow[h])
+ newinfo->underflow[h] -= origsize - *size;
+ }
+ return ret;
+}
+
+static int translate_compat_table(const char *name,
+ unsigned int valid_hooks,
+ struct xt_table_info **pinfo,
+ void **pentry0,
+ unsigned int total_size,
+ unsigned int number,
+ unsigned int *hook_entries,
+ unsigned int *underflows)
+{
+ unsigned int i, j;
+ struct xt_table_info *newinfo, *info;
+ void *pos, *entry0, *entry1;
+ struct compat_arpt_entry *iter0;
+ struct arpt_entry *iter1;
+ unsigned int size;
+ int ret = 0;
+
+ info = *pinfo;
+ entry0 = *pentry0;
+ size = total_size;
+ info->number = number;
+
+ /* Init all hooks to impossible value. */
+ for (i = 0; i < NF_ARP_NUMHOOKS; i++) {
+ info->hook_entry[i] = 0xFFFFFFFF;
+ info->underflow[i] = 0xFFFFFFFF;
+ }
+
+ duprintf("translate_compat_table: size %u\n", info->size);
+ j = 0;
+ xt_compat_lock(NFPROTO_ARP);
+ xt_compat_init_offsets(NFPROTO_ARP, number);
+ /* Walk through entries, checking offsets. */
+ xt_entry_foreach(iter0, entry0, total_size) {
+ ret = check_compat_entry_size_and_hooks(iter0, info, &size,
+ entry0,
+ entry0 + total_size,
+ hook_entries,
+ underflows,
+ name);
+ if (ret != 0)
+ goto out_unlock;
+ ++j;
+ }
+
+ ret = -EINVAL;
+ if (j != number) {
+ duprintf("translate_compat_table: %u not %u entries\n",
+ j, number);
+ goto out_unlock;
+ }
+
+ /* Check hooks all assigned */
+ for (i = 0; i < NF_ARP_NUMHOOKS; i++) {
+ /* Only hooks which are valid */
+ if (!(valid_hooks & (1 << i)))
+ continue;
+ if (info->hook_entry[i] == 0xFFFFFFFF) {
+ duprintf("Invalid hook entry %u %u\n",
+ i, hook_entries[i]);
+ goto out_unlock;
+ }
+ if (info->underflow[i] == 0xFFFFFFFF) {
+ duprintf("Invalid underflow %u %u\n",
+ i, underflows[i]);
+ goto out_unlock;
+ }
+ }
+
+ ret = -ENOMEM;
+ newinfo = xt_alloc_table_info(size);
+ if (!newinfo)
+ goto out_unlock;
+
+ newinfo->number = number;
+ for (i = 0; i < NF_ARP_NUMHOOKS; i++) {
+ newinfo->hook_entry[i] = info->hook_entry[i];
+ newinfo->underflow[i] = info->underflow[i];
+ }
+ entry1 = newinfo->entries[raw_smp_processor_id()];
+ pos = entry1;
+ size = total_size;
+ xt_entry_foreach(iter0, entry0, total_size) {
+ ret = compat_copy_entry_from_user(iter0, &pos, &size,
+ name, newinfo, entry1);
+ if (ret != 0)
+ break;
+ }
+ xt_compat_flush_offsets(NFPROTO_ARP);
+ xt_compat_unlock(NFPROTO_ARP);
+ if (ret)
+ goto free_newinfo;
+
+ ret = -ELOOP;
+ if (!mark_source_chains(newinfo, valid_hooks, entry1))
+ goto free_newinfo;
+
+ i = 0;
+ xt_entry_foreach(iter1, entry1, newinfo->size) {
+ ret = check_target(iter1, name);
+ if (ret != 0)
+ break;
+ ++i;
+ if (strcmp(arpt_get_target(iter1)->u.user.name,
+ XT_ERROR_TARGET) == 0)
+ ++newinfo->stacksize;
+ }
+ if (ret) {
+ /*
+ * The first i matches need cleanup_entry (calls ->destroy)
+ * because they had called ->check already. The other j-i
+ * entries need only release.
+ */
+ int skip = i;
+ j -= i;
+ xt_entry_foreach(iter0, entry0, newinfo->size) {
+ if (skip-- > 0)
+ continue;
+ if (j-- == 0)
+ break;
+ compat_release_entry(iter0);
+ }
+ xt_entry_foreach(iter1, entry1, newinfo->size) {
+ if (i-- == 0)
+ break;
+ cleanup_entry(iter1);
+ }
+ xt_free_table_info(newinfo);
+ return ret;
+ }
+
+ /* And one copy for every other CPU */
+ for_each_possible_cpu(i)
+ if (newinfo->entries[i] && newinfo->entries[i] != entry1)
+ memcpy(newinfo->entries[i], entry1, newinfo->size);
+
+ *pinfo = newinfo;
+ *pentry0 = entry1;
+ xt_free_table_info(info);
+ return 0;
+
+free_newinfo:
+ xt_free_table_info(newinfo);
+out:
+ xt_entry_foreach(iter0, entry0, total_size) {
+ if (j-- == 0)
+ break;
+ compat_release_entry(iter0);
+ }
+ return ret;
+out_unlock:
+ xt_compat_flush_offsets(NFPROTO_ARP);
+ xt_compat_unlock(NFPROTO_ARP);
+ goto out;
+}
+
+struct compat_arpt_replace {
+ char name[XT_TABLE_MAXNAMELEN];
+ u32 valid_hooks;
+ u32 num_entries;
+ u32 size;
+ u32 hook_entry[NF_ARP_NUMHOOKS];
+ u32 underflow[NF_ARP_NUMHOOKS];
+ u32 num_counters;
+ compat_uptr_t counters;
+ struct compat_arpt_entry entries[0];
+};
+
+static int compat_do_replace(struct net *net, void __user *user,
+ unsigned int len)
+{
+ int ret;
+ struct compat_arpt_replace tmp;
+ struct xt_table_info *newinfo;
+ void *loc_cpu_entry;
+ struct arpt_entry *iter;
+
+ if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
+ return -EFAULT;
+
+ /* overflow check */
+ if (tmp.size >= INT_MAX / num_possible_cpus())
+ return -ENOMEM;
+ if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters))
+ return -ENOMEM;
+ if (tmp.num_counters == 0)
+ return -EINVAL;
+
+ tmp.name[sizeof(tmp.name)-1] = 0;
+
+ newinfo = xt_alloc_table_info(tmp.size);
+ if (!newinfo)
+ return -ENOMEM;
+
+ /* choose the copy that is on our node/cpu */
+ loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
+ if (copy_from_user(loc_cpu_entry, user + sizeof(tmp), tmp.size) != 0) {
+ ret = -EFAULT;
+ goto free_newinfo;
+ }
+
+ ret = translate_compat_table(tmp.name, tmp.valid_hooks,
+ &newinfo, &loc_cpu_entry, tmp.size,
+ tmp.num_entries, tmp.hook_entry,
+ tmp.underflow);
+ if (ret != 0)
+ goto free_newinfo;
+
+ duprintf("compat_do_replace: Translated table\n");
+
+ ret = __do_replace(net, tmp.name, tmp.valid_hooks, newinfo,
+ tmp.num_counters, compat_ptr(tmp.counters));
+ if (ret)
+ goto free_newinfo_untrans;
+ return 0;
+
+ free_newinfo_untrans:
+ xt_entry_foreach(iter, loc_cpu_entry, newinfo->size)
+ cleanup_entry(iter);
+ free_newinfo:
+ xt_free_table_info(newinfo);
+ return ret;
+}
+
+static int compat_do_arpt_set_ctl(struct sock *sk, int cmd, void __user *user,
+ unsigned int len)
+{
+ int ret;
+
+ if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+
+ switch (cmd) {
+ case ARPT_SO_SET_REPLACE:
+ ret = compat_do_replace(sock_net(sk), user, len);
+ break;
+
+ case ARPT_SO_SET_ADD_COUNTERS:
+ ret = do_add_counters(sock_net(sk), user, len, 1);
+ break;
+
+ default:
+ duprintf("do_arpt_set_ctl: unknown request %i\n", cmd);
+ ret = -EINVAL;
+ }
+
+ return ret;
+}
+
+static int compat_copy_entry_to_user(struct arpt_entry *e, void __user **dstptr,
+ compat_uint_t *size,
+ struct xt_counters *counters,
+ unsigned int i)
+{
+ struct xt_entry_target *t;
+ struct compat_arpt_entry __user *ce;
+ u_int16_t target_offset, next_offset;
+ compat_uint_t origsize;
+ int ret;
+
+ origsize = *size;
+ ce = (struct compat_arpt_entry __user *)*dstptr;
+ if (copy_to_user(ce, e, sizeof(struct arpt_entry)) != 0 ||
+ copy_to_user(&ce->counters, &counters[i],
+ sizeof(counters[i])) != 0)
+ return -EFAULT;
+
+ *dstptr += sizeof(struct compat_arpt_entry);
+ *size -= sizeof(struct arpt_entry) - sizeof(struct compat_arpt_entry);
+
+ target_offset = e->target_offset - (origsize - *size);
+
+ t = arpt_get_target(e);
+ ret = xt_compat_target_to_user(t, dstptr, size);
+ if (ret)
+ return ret;
+ next_offset = e->next_offset - (origsize - *size);
+ if (put_user(target_offset, &ce->target_offset) != 0 ||
+ put_user(next_offset, &ce->next_offset) != 0)
+ return -EFAULT;
+ return 0;
+}
+
+static int compat_copy_entries_to_user(unsigned int total_size,
+ struct xt_table *table,
+ void __user *userptr)
+{
+ struct xt_counters *counters;
+ const struct xt_table_info *private = table->private;
+ void __user *pos;
+ unsigned int size;
+ int ret = 0;
+ void *loc_cpu_entry;
+ unsigned int i = 0;
+ struct arpt_entry *iter;
+
+ counters = alloc_counters(table);
+ if (IS_ERR(counters))
+ return PTR_ERR(counters);
+
+ /* choose the copy on our node/cpu */
+ loc_cpu_entry = private->entries[raw_smp_processor_id()];
+ pos = userptr;
+ size = total_size;
+ xt_entry_foreach(iter, loc_cpu_entry, total_size) {
+ ret = compat_copy_entry_to_user(iter, &pos,
+ &size, counters, i++);
+ if (ret != 0)
+ break;
+ }
+ vfree(counters);
+ return ret;
+}
+
+struct compat_arpt_get_entries {
+ char name[XT_TABLE_MAXNAMELEN];
+ compat_uint_t size;
+ struct compat_arpt_entry entrytable[0];
+};
+
+static int compat_get_entries(struct net *net,
+ struct compat_arpt_get_entries __user *uptr,
+ int *len)
+{
+ int ret;
+ struct compat_arpt_get_entries get;
+ struct xt_table *t;
+
+ if (*len < sizeof(get)) {
+ duprintf("compat_get_entries: %u < %zu\n", *len, sizeof(get));
+ return -EINVAL;
+ }
+ if (copy_from_user(&get, uptr, sizeof(get)) != 0)
+ return -EFAULT;
+ if (*len != sizeof(struct compat_arpt_get_entries) + get.size) {
+ duprintf("compat_get_entries: %u != %zu\n",
+ *len, sizeof(get) + get.size);
+ return -EINVAL;
+ }
+
+ xt_compat_lock(NFPROTO_ARP);
+ t = xt_find_table_lock(net, NFPROTO_ARP, get.name);
+ if (!IS_ERR_OR_NULL(t)) {
+ const struct xt_table_info *private = t->private;
+ struct xt_table_info info;
+
+ duprintf("t->private->number = %u\n", private->number);
+ ret = compat_table_info(private, &info);
+ if (!ret && get.size == info.size) {
+ ret = compat_copy_entries_to_user(private->size,
+ t, uptr->entrytable);
+ } else if (!ret) {
+ duprintf("compat_get_entries: I've got %u not %u!\n",
+ private->size, get.size);
+ ret = -EAGAIN;
+ }
+ xt_compat_flush_offsets(NFPROTO_ARP);
+ module_put(t->me);
+ xt_table_unlock(t);
+ } else
+ ret = t ? PTR_ERR(t) : -ENOENT;
+
+ xt_compat_unlock(NFPROTO_ARP);
+ return ret;
+}
+
+static int do_arpt_get_ctl(struct sock *, int, void __user *, int *);
+
+static int compat_do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user,
+ int *len)
+{
+ int ret;
+
+ if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+
+ switch (cmd) {
+ case ARPT_SO_GET_INFO:
+ ret = get_info(sock_net(sk), user, len, 1);
+ break;
+ case ARPT_SO_GET_ENTRIES:
+ ret = compat_get_entries(sock_net(sk), user, len);
+ break;
+ default:
+ ret = do_arpt_get_ctl(sk, cmd, user, len);
+ }
+ return ret;
+}
+#endif
+
+static int do_arpt_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
+{
+ int ret;
+
+ if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+
+ switch (cmd) {
+ case ARPT_SO_SET_REPLACE:
+ ret = do_replace(sock_net(sk), user, len);
+ break;
+
+ case ARPT_SO_SET_ADD_COUNTERS:
+ ret = do_add_counters(sock_net(sk), user, len, 0);
+ break;
+
+ default:
+ duprintf("do_arpt_set_ctl: unknown request %i\n", cmd);
+ ret = -EINVAL;
+ }
+
+ return ret;
+}
+
+static int do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
+{
+ int ret;
+
+ if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+
+ switch (cmd) {
+ case ARPT_SO_GET_INFO:
+ ret = get_info(sock_net(sk), user, len, 0);
+ break;
+
+ case ARPT_SO_GET_ENTRIES:
+ ret = get_entries(sock_net(sk), user, len);
+ break;
+
+ case ARPT_SO_GET_REVISION_TARGET: {
+ struct xt_get_revision rev;
+
+ if (*len != sizeof(rev)) {
+ ret = -EINVAL;
+ break;
+ }
+ if (copy_from_user(&rev, user, sizeof(rev)) != 0) {
+ ret = -EFAULT;
+ break;
+ }
+ rev.name[sizeof(rev.name)-1] = 0;
+
+ try_then_request_module(xt_find_revision(NFPROTO_ARP, rev.name,
+ rev.revision, 1, &ret),
+ "arpt_%s", rev.name);
+ break;
+ }
+
+ default:
+ duprintf("do_arpt_get_ctl: unknown request %i\n", cmd);
+ ret = -EINVAL;
+ }
+
+ return ret;
+}
+
+struct xt_table *arpt_register_table(struct net *net,
+ const struct xt_table *table,
+ const struct arpt_replace *repl)
+{
+ int ret;
+ struct xt_table_info *newinfo;
+ struct xt_table_info bootstrap = {0};
+ void *loc_cpu_entry;
+ struct xt_table *new_table;
+
+ newinfo = xt_alloc_table_info(repl->size);
+ if (!newinfo) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ /* choose the copy on our node/cpu */
+ loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
+ memcpy(loc_cpu_entry, repl->entries, repl->size);
+
+ ret = translate_table(newinfo, loc_cpu_entry, repl);
+ duprintf("arpt_register_table: translate table gives %d\n", ret);
+ if (ret != 0)
+ goto out_free;
+
+ new_table = xt_register_table(net, table, &bootstrap, newinfo);
+ if (IS_ERR(new_table)) {
+ ret = PTR_ERR(new_table);
+ goto out_free;
+ }
+ return new_table;
+
+out_free:
+ xt_free_table_info(newinfo);
+out:
+ return ERR_PTR(ret);
+}
+
+void arpt_unregister_table(struct xt_table *table)
+{
+ struct xt_table_info *private;
+ void *loc_cpu_entry;
+ struct module *table_owner = table->me;
+ struct arpt_entry *iter;
+
+ private = xt_unregister_table(table);
+
+ /* Decrease module usage counts and free resources */
+ loc_cpu_entry = private->entries[raw_smp_processor_id()];
+ xt_entry_foreach(iter, loc_cpu_entry, private->size)
+ cleanup_entry(iter);
+ if (private->number > private->initial_entries)
+ module_put(table_owner);
+ xt_free_table_info(private);
+}
+
+/* The built-in targets: standard (NULL) and error. */
+static struct xt_target arpt_builtin_tg[] __read_mostly = {
+ {
+ .name = XT_STANDARD_TARGET,
+ .targetsize = sizeof(int),
+ .family = NFPROTO_ARP,
+#ifdef CONFIG_COMPAT
+ .compatsize = sizeof(compat_int_t),
+ .compat_from_user = compat_standard_from_user,
+ .compat_to_user = compat_standard_to_user,
+#endif
+ },
+ {
+ .name = XT_ERROR_TARGET,
+ .target = arpt_error,
+ .targetsize = XT_FUNCTION_MAXNAMELEN,
+ .family = NFPROTO_ARP,
+ },
+};
+
+static struct nf_sockopt_ops arpt_sockopts = {
+ .pf = PF_INET,
+ .set_optmin = ARPT_BASE_CTL,
+ .set_optmax = ARPT_SO_SET_MAX+1,
+ .set = do_arpt_set_ctl,
+#ifdef CONFIG_COMPAT
+ .compat_set = compat_do_arpt_set_ctl,
+#endif
+ .get_optmin = ARPT_BASE_CTL,
+ .get_optmax = ARPT_SO_GET_MAX+1,
+ .get = do_arpt_get_ctl,
+#ifdef CONFIG_COMPAT
+ .compat_get = compat_do_arpt_get_ctl,
+#endif
+ .owner = THIS_MODULE,
+};
+
+static int __net_init arp_tables_net_init(struct net *net)
+{
+ return xt_proto_init(net, NFPROTO_ARP);
+}
+
+static void __net_exit arp_tables_net_exit(struct net *net)
+{
+ xt_proto_fini(net, NFPROTO_ARP);
+}
+
+static struct pernet_operations arp_tables_net_ops = {
+ .init = arp_tables_net_init,
+ .exit = arp_tables_net_exit,
+};
+
+static int __init arp_tables_init(void)
+{
+ int ret;
+
+ ret = register_pernet_subsys(&arp_tables_net_ops);
+ if (ret < 0)
+ goto err1;
+
+ /* No one else will be downing sem now, so we won't sleep */
+ ret = xt_register_targets(arpt_builtin_tg, ARRAY_SIZE(arpt_builtin_tg));
+ if (ret < 0)
+ goto err2;
+
+ /* Register setsockopt */
+ ret = nf_register_sockopt(&arpt_sockopts);
+ if (ret < 0)
+ goto err4;
+
+ printk(KERN_INFO "arp_tables: (C) 2002 David S. Miller\n");
+ return 0;
+
+err4:
+ xt_unregister_targets(arpt_builtin_tg, ARRAY_SIZE(arpt_builtin_tg));
+err2:
+ unregister_pernet_subsys(&arp_tables_net_ops);
+err1:
+ return ret;
+}
+
+static void __exit arp_tables_fini(void)
+{
+ nf_unregister_sockopt(&arpt_sockopts);
+ xt_unregister_targets(arpt_builtin_tg, ARRAY_SIZE(arpt_builtin_tg));
+ unregister_pernet_subsys(&arp_tables_net_ops);
+}
+
+EXPORT_SYMBOL(arpt_register_table);
+EXPORT_SYMBOL(arpt_unregister_table);
+EXPORT_SYMBOL(arpt_do_table);
+
+module_init(arp_tables_init);
+module_exit(arp_tables_fini);
diff --git a/net/ipv4/netfilter/arpt_mangle.c b/net/ipv4/netfilter/arpt_mangle.c
new file mode 100644
index 000000000..a5e52a9f0
--- /dev/null
+++ b/net/ipv4/netfilter/arpt_mangle.c
@@ -0,0 +1,91 @@
+/* module that allows mangling of the arp payload */
+#include <linux/module.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_arp/arpt_mangle.h>
+#include <net/sock.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Bart De Schuymer <bdschuym@pandora.be>");
+MODULE_DESCRIPTION("arptables arp payload mangle target");
+
+static unsigned int
+target(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ const struct arpt_mangle *mangle = par->targinfo;
+ const struct arphdr *arp;
+ unsigned char *arpptr;
+ int pln, hln;
+
+ if (!skb_make_writable(skb, skb->len))
+ return NF_DROP;
+
+ arp = arp_hdr(skb);
+ arpptr = skb_network_header(skb) + sizeof(*arp);
+ pln = arp->ar_pln;
+ hln = arp->ar_hln;
+ /* We assume that pln and hln were checked in the match */
+ if (mangle->flags & ARPT_MANGLE_SDEV) {
+ if (ARPT_DEV_ADDR_LEN_MAX < hln ||
+ (arpptr + hln > skb_tail_pointer(skb)))
+ return NF_DROP;
+ memcpy(arpptr, mangle->src_devaddr, hln);
+ }
+ arpptr += hln;
+ if (mangle->flags & ARPT_MANGLE_SIP) {
+ if (ARPT_MANGLE_ADDR_LEN_MAX < pln ||
+ (arpptr + pln > skb_tail_pointer(skb)))
+ return NF_DROP;
+ memcpy(arpptr, &mangle->u_s.src_ip, pln);
+ }
+ arpptr += pln;
+ if (mangle->flags & ARPT_MANGLE_TDEV) {
+ if (ARPT_DEV_ADDR_LEN_MAX < hln ||
+ (arpptr + hln > skb_tail_pointer(skb)))
+ return NF_DROP;
+ memcpy(arpptr, mangle->tgt_devaddr, hln);
+ }
+ arpptr += hln;
+ if (mangle->flags & ARPT_MANGLE_TIP) {
+ if (ARPT_MANGLE_ADDR_LEN_MAX < pln ||
+ (arpptr + pln > skb_tail_pointer(skb)))
+ return NF_DROP;
+ memcpy(arpptr, &mangle->u_t.tgt_ip, pln);
+ }
+ return mangle->target;
+}
+
+static int checkentry(const struct xt_tgchk_param *par)
+{
+ const struct arpt_mangle *mangle = par->targinfo;
+
+ if (mangle->flags & ~ARPT_MANGLE_MASK ||
+ !(mangle->flags & ARPT_MANGLE_MASK))
+ return -EINVAL;
+
+ if (mangle->target != NF_DROP && mangle->target != NF_ACCEPT &&
+ mangle->target != XT_CONTINUE)
+ return -EINVAL;
+ return 0;
+}
+
+static struct xt_target arpt_mangle_reg __read_mostly = {
+ .name = "mangle",
+ .family = NFPROTO_ARP,
+ .target = target,
+ .targetsize = sizeof(struct arpt_mangle),
+ .checkentry = checkentry,
+ .me = THIS_MODULE,
+};
+
+static int __init arpt_mangle_init(void)
+{
+ return xt_register_target(&arpt_mangle_reg);
+}
+
+static void __exit arpt_mangle_fini(void)
+{
+ xt_unregister_target(&arpt_mangle_reg);
+}
+
+module_init(arpt_mangle_init);
+module_exit(arpt_mangle_fini);
diff --git a/net/ipv4/netfilter/arptable_filter.c b/net/ipv4/netfilter/arptable_filter.c
new file mode 100644
index 000000000..93876d031
--- /dev/null
+++ b/net/ipv4/netfilter/arptable_filter.c
@@ -0,0 +1,91 @@
+/*
+ * Filtering ARP tables module.
+ *
+ * Copyright (C) 2002 David S. Miller (davem@redhat.com)
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_arp/arp_tables.h>
+#include <linux/slab.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("David S. Miller <davem@redhat.com>");
+MODULE_DESCRIPTION("arptables filter table");
+
+#define FILTER_VALID_HOOKS ((1 << NF_ARP_IN) | (1 << NF_ARP_OUT) | \
+ (1 << NF_ARP_FORWARD))
+
+static const struct xt_table packet_filter = {
+ .name = "filter",
+ .valid_hooks = FILTER_VALID_HOOKS,
+ .me = THIS_MODULE,
+ .af = NFPROTO_ARP,
+ .priority = NF_IP_PRI_FILTER,
+};
+
+/* The work comes in here from netfilter.c */
+static unsigned int
+arptable_filter_hook(const struct nf_hook_ops *ops, struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ const struct net *net = dev_net(state->in ? state->in : state->out);
+
+ return arpt_do_table(skb, ops->hooknum, state,
+ net->ipv4.arptable_filter);
+}
+
+static struct nf_hook_ops *arpfilter_ops __read_mostly;
+
+static int __net_init arptable_filter_net_init(struct net *net)
+{
+ struct arpt_replace *repl;
+
+ repl = arpt_alloc_initial_table(&packet_filter);
+ if (repl == NULL)
+ return -ENOMEM;
+ net->ipv4.arptable_filter =
+ arpt_register_table(net, &packet_filter, repl);
+ kfree(repl);
+ return PTR_ERR_OR_ZERO(net->ipv4.arptable_filter);
+}
+
+static void __net_exit arptable_filter_net_exit(struct net *net)
+{
+ arpt_unregister_table(net->ipv4.arptable_filter);
+}
+
+static struct pernet_operations arptable_filter_net_ops = {
+ .init = arptable_filter_net_init,
+ .exit = arptable_filter_net_exit,
+};
+
+static int __init arptable_filter_init(void)
+{
+ int ret;
+
+ ret = register_pernet_subsys(&arptable_filter_net_ops);
+ if (ret < 0)
+ return ret;
+
+ arpfilter_ops = xt_hook_link(&packet_filter, arptable_filter_hook);
+ if (IS_ERR(arpfilter_ops)) {
+ ret = PTR_ERR(arpfilter_ops);
+ goto cleanup_table;
+ }
+ return ret;
+
+cleanup_table:
+ unregister_pernet_subsys(&arptable_filter_net_ops);
+ return ret;
+}
+
+static void __exit arptable_filter_fini(void)
+{
+ xt_hook_unlink(&packet_filter, arpfilter_ops);
+ unregister_pernet_subsys(&arptable_filter_net_ops);
+}
+
+module_init(arptable_filter_init);
+module_exit(arptable_filter_fini);
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
new file mode 100644
index 000000000..2d0e265fe
--- /dev/null
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -0,0 +1,2282 @@
+/*
+ * Packet matching code.
+ *
+ * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling
+ * Copyright (C) 2000-2005 Netfilter Core Team <coreteam@netfilter.org>
+ * Copyright (C) 2006-2010 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/cache.h>
+#include <linux/capability.h>
+#include <linux/skbuff.h>
+#include <linux/kmod.h>
+#include <linux/vmalloc.h>
+#include <linux/netdevice.h>
+#include <linux/module.h>
+#include <linux/icmp.h>
+#include <net/ip.h>
+#include <net/compat.h>
+#include <asm/uaccess.h>
+#include <linux/mutex.h>
+#include <linux/proc_fs.h>
+#include <linux/err.h>
+#include <linux/cpumask.h>
+
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <net/netfilter/nf_log.h>
+#include "../../netfilter/xt_repldata.h"
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
+MODULE_DESCRIPTION("IPv4 packet filter");
+
+/*#define DEBUG_IP_FIREWALL*/
+/*#define DEBUG_ALLOW_ALL*/ /* Useful for remote debugging */
+/*#define DEBUG_IP_FIREWALL_USER*/
+
+#ifdef DEBUG_IP_FIREWALL
+#define dprintf(format, args...) pr_info(format , ## args)
+#else
+#define dprintf(format, args...)
+#endif
+
+#ifdef DEBUG_IP_FIREWALL_USER
+#define duprintf(format, args...) pr_info(format , ## args)
+#else
+#define duprintf(format, args...)
+#endif
+
+#ifdef CONFIG_NETFILTER_DEBUG
+#define IP_NF_ASSERT(x) WARN_ON(!(x))
+#else
+#define IP_NF_ASSERT(x)
+#endif
+
+#if 0
+/* All the better to debug you with... */
+#define static
+#define inline
+#endif
+
+void *ipt_alloc_initial_table(const struct xt_table *info)
+{
+ return xt_alloc_initial_table(ipt, IPT);
+}
+EXPORT_SYMBOL_GPL(ipt_alloc_initial_table);
+
+/* Returns whether matches rule or not. */
+/* Performance critical - called for every packet */
+static inline bool
+ip_packet_match(const struct iphdr *ip,
+ const char *indev,
+ const char *outdev,
+ const struct ipt_ip *ipinfo,
+ int isfrag)
+{
+ unsigned long ret;
+
+#define FWINV(bool, invflg) ((bool) ^ !!(ipinfo->invflags & (invflg)))
+
+ if (FWINV((ip->saddr&ipinfo->smsk.s_addr) != ipinfo->src.s_addr,
+ IPT_INV_SRCIP) ||
+ FWINV((ip->daddr&ipinfo->dmsk.s_addr) != ipinfo->dst.s_addr,
+ IPT_INV_DSTIP)) {
+ dprintf("Source or dest mismatch.\n");
+
+ dprintf("SRC: %pI4. Mask: %pI4. Target: %pI4.%s\n",
+ &ip->saddr, &ipinfo->smsk.s_addr, &ipinfo->src.s_addr,
+ ipinfo->invflags & IPT_INV_SRCIP ? " (INV)" : "");
+ dprintf("DST: %pI4 Mask: %pI4 Target: %pI4.%s\n",
+ &ip->daddr, &ipinfo->dmsk.s_addr, &ipinfo->dst.s_addr,
+ ipinfo->invflags & IPT_INV_DSTIP ? " (INV)" : "");
+ return false;
+ }
+
+ ret = ifname_compare_aligned(indev, ipinfo->iniface, ipinfo->iniface_mask);
+
+ if (FWINV(ret != 0, IPT_INV_VIA_IN)) {
+ dprintf("VIA in mismatch (%s vs %s).%s\n",
+ indev, ipinfo->iniface,
+ ipinfo->invflags&IPT_INV_VIA_IN ?" (INV)":"");
+ return false;
+ }
+
+ ret = ifname_compare_aligned(outdev, ipinfo->outiface, ipinfo->outiface_mask);
+
+ if (FWINV(ret != 0, IPT_INV_VIA_OUT)) {
+ dprintf("VIA out mismatch (%s vs %s).%s\n",
+ outdev, ipinfo->outiface,
+ ipinfo->invflags&IPT_INV_VIA_OUT ?" (INV)":"");
+ return false;
+ }
+
+ /* Check specific protocol */
+ if (ipinfo->proto &&
+ FWINV(ip->protocol != ipinfo->proto, IPT_INV_PROTO)) {
+ dprintf("Packet protocol %hi does not match %hi.%s\n",
+ ip->protocol, ipinfo->proto,
+ ipinfo->invflags&IPT_INV_PROTO ? " (INV)":"");
+ return false;
+ }
+
+ /* If we have a fragment rule but the packet is not a fragment
+ * then we return zero */
+ if (FWINV((ipinfo->flags&IPT_F_FRAG) && !isfrag, IPT_INV_FRAG)) {
+ dprintf("Fragment rule but not fragment.%s\n",
+ ipinfo->invflags & IPT_INV_FRAG ? " (INV)" : "");
+ return false;
+ }
+
+ return true;
+}
+
+static bool
+ip_checkentry(const struct ipt_ip *ip)
+{
+ if (ip->flags & ~IPT_F_MASK) {
+ duprintf("Unknown flag bits set: %08X\n",
+ ip->flags & ~IPT_F_MASK);
+ return false;
+ }
+ if (ip->invflags & ~IPT_INV_MASK) {
+ duprintf("Unknown invflag bits set: %08X\n",
+ ip->invflags & ~IPT_INV_MASK);
+ return false;
+ }
+ return true;
+}
+
+static unsigned int
+ipt_error(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ net_info_ratelimited("error: `%s'\n", (const char *)par->targinfo);
+
+ return NF_DROP;
+}
+
+/* Performance critical */
+static inline struct ipt_entry *
+get_entry(const void *base, unsigned int offset)
+{
+ return (struct ipt_entry *)(base + offset);
+}
+
+/* All zeroes == unconditional rule. */
+/* Mildly perf critical (only if packet tracing is on) */
+static inline bool unconditional(const struct ipt_ip *ip)
+{
+ static const struct ipt_ip uncond;
+
+ return memcmp(ip, &uncond, sizeof(uncond)) == 0;
+#undef FWINV
+}
+
+/* for const-correctness */
+static inline const struct xt_entry_target *
+ipt_get_target_c(const struct ipt_entry *e)
+{
+ return ipt_get_target((struct ipt_entry *)e);
+}
+
+#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE)
+static const char *const hooknames[] = {
+ [NF_INET_PRE_ROUTING] = "PREROUTING",
+ [NF_INET_LOCAL_IN] = "INPUT",
+ [NF_INET_FORWARD] = "FORWARD",
+ [NF_INET_LOCAL_OUT] = "OUTPUT",
+ [NF_INET_POST_ROUTING] = "POSTROUTING",
+};
+
+enum nf_ip_trace_comments {
+ NF_IP_TRACE_COMMENT_RULE,
+ NF_IP_TRACE_COMMENT_RETURN,
+ NF_IP_TRACE_COMMENT_POLICY,
+};
+
+static const char *const comments[] = {
+ [NF_IP_TRACE_COMMENT_RULE] = "rule",
+ [NF_IP_TRACE_COMMENT_RETURN] = "return",
+ [NF_IP_TRACE_COMMENT_POLICY] = "policy",
+};
+
+static struct nf_loginfo trace_loginfo = {
+ .type = NF_LOG_TYPE_LOG,
+ .u = {
+ .log = {
+ .level = 4,
+ .logflags = NF_LOG_MASK,
+ },
+ },
+};
+
+/* Mildly perf critical (only if packet tracing is on) */
+static inline int
+get_chainname_rulenum(const struct ipt_entry *s, const struct ipt_entry *e,
+ const char *hookname, const char **chainname,
+ const char **comment, unsigned int *rulenum)
+{
+ const struct xt_standard_target *t = (void *)ipt_get_target_c(s);
+
+ if (strcmp(t->target.u.kernel.target->name, XT_ERROR_TARGET) == 0) {
+ /* Head of user chain: ERROR target with chainname */
+ *chainname = t->target.data;
+ (*rulenum) = 0;
+ } else if (s == e) {
+ (*rulenum)++;
+
+ if (s->target_offset == sizeof(struct ipt_entry) &&
+ strcmp(t->target.u.kernel.target->name,
+ XT_STANDARD_TARGET) == 0 &&
+ t->verdict < 0 &&
+ unconditional(&s->ip)) {
+ /* Tail of chains: STANDARD target (return/policy) */
+ *comment = *chainname == hookname
+ ? comments[NF_IP_TRACE_COMMENT_POLICY]
+ : comments[NF_IP_TRACE_COMMENT_RETURN];
+ }
+ return 1;
+ } else
+ (*rulenum)++;
+
+ return 0;
+}
+
+static void trace_packet(const struct sk_buff *skb,
+ unsigned int hook,
+ const struct net_device *in,
+ const struct net_device *out,
+ const char *tablename,
+ const struct xt_table_info *private,
+ const struct ipt_entry *e)
+{
+ const void *table_base;
+ const struct ipt_entry *root;
+ const char *hookname, *chainname, *comment;
+ const struct ipt_entry *iter;
+ unsigned int rulenum = 0;
+ struct net *net = dev_net(in ? in : out);
+
+ table_base = private->entries[smp_processor_id()];
+ root = get_entry(table_base, private->hook_entry[hook]);
+
+ hookname = chainname = hooknames[hook];
+ comment = comments[NF_IP_TRACE_COMMENT_RULE];
+
+ xt_entry_foreach(iter, root, private->size - private->hook_entry[hook])
+ if (get_chainname_rulenum(iter, e, hookname,
+ &chainname, &comment, &rulenum) != 0)
+ break;
+
+ nf_log_trace(net, AF_INET, hook, skb, in, out, &trace_loginfo,
+ "TRACE: %s:%s:%s:%u ",
+ tablename, chainname, comment, rulenum);
+}
+#endif
+
+static inline __pure
+struct ipt_entry *ipt_next_entry(const struct ipt_entry *entry)
+{
+ return (void *)entry + entry->next_offset;
+}
+
+/* Returns one of the generic firewall policies, like NF_ACCEPT. */
+unsigned int
+ipt_do_table(struct sk_buff *skb,
+ unsigned int hook,
+ const struct nf_hook_state *state,
+ struct xt_table *table)
+{
+ static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long))));
+ const struct iphdr *ip;
+ /* Initializing verdict to NF_DROP keeps gcc happy. */
+ unsigned int verdict = NF_DROP;
+ const char *indev, *outdev;
+ const void *table_base;
+ struct ipt_entry *e, **jumpstack;
+ unsigned int *stackptr, origptr, cpu;
+ const struct xt_table_info *private;
+ struct xt_action_param acpar;
+ unsigned int addend;
+
+ /* Initialization */
+ ip = ip_hdr(skb);
+ indev = state->in ? state->in->name : nulldevname;
+ outdev = state->out ? state->out->name : nulldevname;
+ /* We handle fragments by dealing with the first fragment as
+ * if it was a normal packet. All other fragments are treated
+ * normally, except that they will NEVER match rules that ask
+ * things we don't know, ie. tcp syn flag or ports). If the
+ * rule is also a fragment-specific rule, non-fragments won't
+ * match it. */
+ acpar.fragoff = ntohs(ip->frag_off) & IP_OFFSET;
+ acpar.thoff = ip_hdrlen(skb);
+ acpar.hotdrop = false;
+ acpar.in = state->in;
+ acpar.out = state->out;
+ acpar.family = NFPROTO_IPV4;
+ acpar.hooknum = hook;
+
+ IP_NF_ASSERT(table->valid_hooks & (1 << hook));
+ local_bh_disable();
+ addend = xt_write_recseq_begin();
+ private = table->private;
+ cpu = smp_processor_id();
+ /*
+ * Ensure we load private-> members after we've fetched the base
+ * pointer.
+ */
+ smp_read_barrier_depends();
+ table_base = private->entries[cpu];
+ jumpstack = (struct ipt_entry **)private->jumpstack[cpu];
+ stackptr = per_cpu_ptr(private->stackptr, cpu);
+ origptr = *stackptr;
+
+ e = get_entry(table_base, private->hook_entry[hook]);
+
+ pr_debug("Entering %s(hook %u); sp at %u (UF %p)\n",
+ table->name, hook, origptr,
+ get_entry(table_base, private->underflow[hook]));
+
+ do {
+ const struct xt_entry_target *t;
+ const struct xt_entry_match *ematch;
+
+ IP_NF_ASSERT(e);
+ if (!ip_packet_match(ip, indev, outdev,
+ &e->ip, acpar.fragoff)) {
+ no_match:
+ e = ipt_next_entry(e);
+ continue;
+ }
+
+ xt_ematch_foreach(ematch, e) {
+ acpar.match = ematch->u.kernel.match;
+ acpar.matchinfo = ematch->data;
+ if (!acpar.match->match(skb, &acpar))
+ goto no_match;
+ }
+
+ ADD_COUNTER(e->counters, skb->len, 1);
+
+ t = ipt_get_target(e);
+ IP_NF_ASSERT(t->u.kernel.target);
+
+#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE)
+ /* The packet is traced: log it */
+ if (unlikely(skb->nf_trace))
+ trace_packet(skb, hook, state->in, state->out,
+ table->name, private, e);
+#endif
+ /* Standard target? */
+ if (!t->u.kernel.target->target) {
+ int v;
+
+ v = ((struct xt_standard_target *)t)->verdict;
+ if (v < 0) {
+ /* Pop from stack? */
+ if (v != XT_RETURN) {
+ verdict = (unsigned int)(-v) - 1;
+ break;
+ }
+ if (*stackptr <= origptr) {
+ e = get_entry(table_base,
+ private->underflow[hook]);
+ pr_debug("Underflow (this is normal) "
+ "to %p\n", e);
+ } else {
+ e = jumpstack[--*stackptr];
+ pr_debug("Pulled %p out from pos %u\n",
+ e, *stackptr);
+ e = ipt_next_entry(e);
+ }
+ continue;
+ }
+ if (table_base + v != ipt_next_entry(e) &&
+ !(e->ip.flags & IPT_F_GOTO)) {
+ if (*stackptr >= private->stacksize) {
+ verdict = NF_DROP;
+ break;
+ }
+ jumpstack[(*stackptr)++] = e;
+ pr_debug("Pushed %p into pos %u\n",
+ e, *stackptr - 1);
+ }
+
+ e = get_entry(table_base, v);
+ continue;
+ }
+
+ acpar.target = t->u.kernel.target;
+ acpar.targinfo = t->data;
+
+ verdict = t->u.kernel.target->target(skb, &acpar);
+ /* Target might have changed stuff. */
+ ip = ip_hdr(skb);
+ if (verdict == XT_CONTINUE)
+ e = ipt_next_entry(e);
+ else
+ /* Verdict */
+ break;
+ } while (!acpar.hotdrop);
+ pr_debug("Exiting %s; resetting sp from %u to %u\n",
+ __func__, *stackptr, origptr);
+ *stackptr = origptr;
+ xt_write_recseq_end(addend);
+ local_bh_enable();
+
+#ifdef DEBUG_ALLOW_ALL
+ return NF_ACCEPT;
+#else
+ if (acpar.hotdrop)
+ return NF_DROP;
+ else return verdict;
+#endif
+}
+
+/* Figures out from what hook each rule can be called: returns 0 if
+ there are loops. Puts hook bitmask in comefrom. */
+static int
+mark_source_chains(const struct xt_table_info *newinfo,
+ unsigned int valid_hooks, void *entry0)
+{
+ unsigned int hook;
+
+ /* No recursion; use packet counter to save back ptrs (reset
+ to 0 as we leave), and comefrom to save source hook bitmask */
+ for (hook = 0; hook < NF_INET_NUMHOOKS; hook++) {
+ unsigned int pos = newinfo->hook_entry[hook];
+ struct ipt_entry *e = (struct ipt_entry *)(entry0 + pos);
+
+ if (!(valid_hooks & (1 << hook)))
+ continue;
+
+ /* Set initial back pointer. */
+ e->counters.pcnt = pos;
+
+ for (;;) {
+ const struct xt_standard_target *t
+ = (void *)ipt_get_target_c(e);
+ int visited = e->comefrom & (1 << hook);
+
+ if (e->comefrom & (1 << NF_INET_NUMHOOKS)) {
+ pr_err("iptables: loop hook %u pos %u %08X.\n",
+ hook, pos, e->comefrom);
+ return 0;
+ }
+ e->comefrom |= ((1 << hook) | (1 << NF_INET_NUMHOOKS));
+
+ /* Unconditional return/END. */
+ if ((e->target_offset == sizeof(struct ipt_entry) &&
+ (strcmp(t->target.u.user.name,
+ XT_STANDARD_TARGET) == 0) &&
+ t->verdict < 0 && unconditional(&e->ip)) ||
+ visited) {
+ unsigned int oldpos, size;
+
+ if ((strcmp(t->target.u.user.name,
+ XT_STANDARD_TARGET) == 0) &&
+ t->verdict < -NF_MAX_VERDICT - 1) {
+ duprintf("mark_source_chains: bad "
+ "negative verdict (%i)\n",
+ t->verdict);
+ return 0;
+ }
+
+ /* Return: backtrack through the last
+ big jump. */
+ do {
+ e->comefrom ^= (1<<NF_INET_NUMHOOKS);
+#ifdef DEBUG_IP_FIREWALL_USER
+ if (e->comefrom
+ & (1 << NF_INET_NUMHOOKS)) {
+ duprintf("Back unset "
+ "on hook %u "
+ "rule %u\n",
+ hook, pos);
+ }
+#endif
+ oldpos = pos;
+ pos = e->counters.pcnt;
+ e->counters.pcnt = 0;
+
+ /* We're at the start. */
+ if (pos == oldpos)
+ goto next;
+
+ e = (struct ipt_entry *)
+ (entry0 + pos);
+ } while (oldpos == pos + e->next_offset);
+
+ /* Move along one */
+ size = e->next_offset;
+ e = (struct ipt_entry *)
+ (entry0 + pos + size);
+ e->counters.pcnt = pos;
+ pos += size;
+ } else {
+ int newpos = t->verdict;
+
+ if (strcmp(t->target.u.user.name,
+ XT_STANDARD_TARGET) == 0 &&
+ newpos >= 0) {
+ if (newpos > newinfo->size -
+ sizeof(struct ipt_entry)) {
+ duprintf("mark_source_chains: "
+ "bad verdict (%i)\n",
+ newpos);
+ return 0;
+ }
+ /* This a jump; chase it. */
+ duprintf("Jump rule %u -> %u\n",
+ pos, newpos);
+ } else {
+ /* ... this is a fallthru */
+ newpos = pos + e->next_offset;
+ }
+ e = (struct ipt_entry *)
+ (entry0 + newpos);
+ e->counters.pcnt = pos;
+ pos = newpos;
+ }
+ }
+ next:
+ duprintf("Finished chain %u\n", hook);
+ }
+ return 1;
+}
+
+static void cleanup_match(struct xt_entry_match *m, struct net *net)
+{
+ struct xt_mtdtor_param par;
+
+ par.net = net;
+ par.match = m->u.kernel.match;
+ par.matchinfo = m->data;
+ par.family = NFPROTO_IPV4;
+ if (par.match->destroy != NULL)
+ par.match->destroy(&par);
+ module_put(par.match->me);
+}
+
+static int
+check_entry(const struct ipt_entry *e, const char *name)
+{
+ const struct xt_entry_target *t;
+
+ if (!ip_checkentry(&e->ip)) {
+ duprintf("ip check failed %p %s.\n", e, name);
+ return -EINVAL;
+ }
+
+ if (e->target_offset + sizeof(struct xt_entry_target) >
+ e->next_offset)
+ return -EINVAL;
+
+ t = ipt_get_target_c(e);
+ if (e->target_offset + t->u.target_size > e->next_offset)
+ return -EINVAL;
+
+ return 0;
+}
+
+static int
+check_match(struct xt_entry_match *m, struct xt_mtchk_param *par)
+{
+ const struct ipt_ip *ip = par->entryinfo;
+ int ret;
+
+ par->match = m->u.kernel.match;
+ par->matchinfo = m->data;
+
+ ret = xt_check_match(par, m->u.match_size - sizeof(*m),
+ ip->proto, ip->invflags & IPT_INV_PROTO);
+ if (ret < 0) {
+ duprintf("check failed for `%s'.\n", par->match->name);
+ return ret;
+ }
+ return 0;
+}
+
+static int
+find_check_match(struct xt_entry_match *m, struct xt_mtchk_param *par)
+{
+ struct xt_match *match;
+ int ret;
+
+ match = xt_request_find_match(NFPROTO_IPV4, m->u.user.name,
+ m->u.user.revision);
+ if (IS_ERR(match)) {
+ duprintf("find_check_match: `%s' not found\n", m->u.user.name);
+ return PTR_ERR(match);
+ }
+ m->u.kernel.match = match;
+
+ ret = check_match(m, par);
+ if (ret)
+ goto err;
+
+ return 0;
+err:
+ module_put(m->u.kernel.match->me);
+ return ret;
+}
+
+static int check_target(struct ipt_entry *e, struct net *net, const char *name)
+{
+ struct xt_entry_target *t = ipt_get_target(e);
+ struct xt_tgchk_param par = {
+ .net = net,
+ .table = name,
+ .entryinfo = e,
+ .target = t->u.kernel.target,
+ .targinfo = t->data,
+ .hook_mask = e->comefrom,
+ .family = NFPROTO_IPV4,
+ };
+ int ret;
+
+ ret = xt_check_target(&par, t->u.target_size - sizeof(*t),
+ e->ip.proto, e->ip.invflags & IPT_INV_PROTO);
+ if (ret < 0) {
+ duprintf("check failed for `%s'.\n",
+ t->u.kernel.target->name);
+ return ret;
+ }
+ return 0;
+}
+
+static int
+find_check_entry(struct ipt_entry *e, struct net *net, const char *name,
+ unsigned int size)
+{
+ struct xt_entry_target *t;
+ struct xt_target *target;
+ int ret;
+ unsigned int j;
+ struct xt_mtchk_param mtpar;
+ struct xt_entry_match *ematch;
+
+ ret = check_entry(e, name);
+ if (ret)
+ return ret;
+
+ j = 0;
+ mtpar.net = net;
+ mtpar.table = name;
+ mtpar.entryinfo = &e->ip;
+ mtpar.hook_mask = e->comefrom;
+ mtpar.family = NFPROTO_IPV4;
+ xt_ematch_foreach(ematch, e) {
+ ret = find_check_match(ematch, &mtpar);
+ if (ret != 0)
+ goto cleanup_matches;
+ ++j;
+ }
+
+ t = ipt_get_target(e);
+ target = xt_request_find_target(NFPROTO_IPV4, t->u.user.name,
+ t->u.user.revision);
+ if (IS_ERR(target)) {
+ duprintf("find_check_entry: `%s' not found\n", t->u.user.name);
+ ret = PTR_ERR(target);
+ goto cleanup_matches;
+ }
+ t->u.kernel.target = target;
+
+ ret = check_target(e, net, name);
+ if (ret)
+ goto err;
+ return 0;
+ err:
+ module_put(t->u.kernel.target->me);
+ cleanup_matches:
+ xt_ematch_foreach(ematch, e) {
+ if (j-- == 0)
+ break;
+ cleanup_match(ematch, net);
+ }
+ return ret;
+}
+
+static bool check_underflow(const struct ipt_entry *e)
+{
+ const struct xt_entry_target *t;
+ unsigned int verdict;
+
+ if (!unconditional(&e->ip))
+ return false;
+ t = ipt_get_target_c(e);
+ if (strcmp(t->u.user.name, XT_STANDARD_TARGET) != 0)
+ return false;
+ verdict = ((struct xt_standard_target *)t)->verdict;
+ verdict = -verdict - 1;
+ return verdict == NF_DROP || verdict == NF_ACCEPT;
+}
+
+static int
+check_entry_size_and_hooks(struct ipt_entry *e,
+ struct xt_table_info *newinfo,
+ const unsigned char *base,
+ const unsigned char *limit,
+ const unsigned int *hook_entries,
+ const unsigned int *underflows,
+ unsigned int valid_hooks)
+{
+ unsigned int h;
+
+ if ((unsigned long)e % __alignof__(struct ipt_entry) != 0 ||
+ (unsigned char *)e + sizeof(struct ipt_entry) >= limit) {
+ duprintf("Bad offset %p\n", e);
+ return -EINVAL;
+ }
+
+ if (e->next_offset
+ < sizeof(struct ipt_entry) + sizeof(struct xt_entry_target)) {
+ duprintf("checking: element %p size %u\n",
+ e, e->next_offset);
+ return -EINVAL;
+ }
+
+ /* Check hooks & underflows */
+ for (h = 0; h < NF_INET_NUMHOOKS; h++) {
+ if (!(valid_hooks & (1 << h)))
+ continue;
+ if ((unsigned char *)e - base == hook_entries[h])
+ newinfo->hook_entry[h] = hook_entries[h];
+ if ((unsigned char *)e - base == underflows[h]) {
+ if (!check_underflow(e)) {
+ pr_err("Underflows must be unconditional and "
+ "use the STANDARD target with "
+ "ACCEPT/DROP\n");
+ return -EINVAL;
+ }
+ newinfo->underflow[h] = underflows[h];
+ }
+ }
+
+ /* Clear counters and comefrom */
+ e->counters = ((struct xt_counters) { 0, 0 });
+ e->comefrom = 0;
+ return 0;
+}
+
+static void
+cleanup_entry(struct ipt_entry *e, struct net *net)
+{
+ struct xt_tgdtor_param par;
+ struct xt_entry_target *t;
+ struct xt_entry_match *ematch;
+
+ /* Cleanup all matches */
+ xt_ematch_foreach(ematch, e)
+ cleanup_match(ematch, net);
+ t = ipt_get_target(e);
+
+ par.net = net;
+ par.target = t->u.kernel.target;
+ par.targinfo = t->data;
+ par.family = NFPROTO_IPV4;
+ if (par.target->destroy != NULL)
+ par.target->destroy(&par);
+ module_put(par.target->me);
+}
+
+/* Checks and translates the user-supplied table segment (held in
+ newinfo) */
+static int
+translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0,
+ const struct ipt_replace *repl)
+{
+ struct ipt_entry *iter;
+ unsigned int i;
+ int ret = 0;
+
+ newinfo->size = repl->size;
+ newinfo->number = repl->num_entries;
+
+ /* Init all hooks to impossible value. */
+ for (i = 0; i < NF_INET_NUMHOOKS; i++) {
+ newinfo->hook_entry[i] = 0xFFFFFFFF;
+ newinfo->underflow[i] = 0xFFFFFFFF;
+ }
+
+ duprintf("translate_table: size %u\n", newinfo->size);
+ i = 0;
+ /* Walk through entries, checking offsets. */
+ xt_entry_foreach(iter, entry0, newinfo->size) {
+ ret = check_entry_size_and_hooks(iter, newinfo, entry0,
+ entry0 + repl->size,
+ repl->hook_entry,
+ repl->underflow,
+ repl->valid_hooks);
+ if (ret != 0)
+ return ret;
+ ++i;
+ if (strcmp(ipt_get_target(iter)->u.user.name,
+ XT_ERROR_TARGET) == 0)
+ ++newinfo->stacksize;
+ }
+
+ if (i != repl->num_entries) {
+ duprintf("translate_table: %u not %u entries\n",
+ i, repl->num_entries);
+ return -EINVAL;
+ }
+
+ /* Check hooks all assigned */
+ for (i = 0; i < NF_INET_NUMHOOKS; i++) {
+ /* Only hooks which are valid */
+ if (!(repl->valid_hooks & (1 << i)))
+ continue;
+ if (newinfo->hook_entry[i] == 0xFFFFFFFF) {
+ duprintf("Invalid hook entry %u %u\n",
+ i, repl->hook_entry[i]);
+ return -EINVAL;
+ }
+ if (newinfo->underflow[i] == 0xFFFFFFFF) {
+ duprintf("Invalid underflow %u %u\n",
+ i, repl->underflow[i]);
+ return -EINVAL;
+ }
+ }
+
+ if (!mark_source_chains(newinfo, repl->valid_hooks, entry0))
+ return -ELOOP;
+
+ /* Finally, each sanity check must pass */
+ i = 0;
+ xt_entry_foreach(iter, entry0, newinfo->size) {
+ ret = find_check_entry(iter, net, repl->name, repl->size);
+ if (ret != 0)
+ break;
+ ++i;
+ }
+
+ if (ret != 0) {
+ xt_entry_foreach(iter, entry0, newinfo->size) {
+ if (i-- == 0)
+ break;
+ cleanup_entry(iter, net);
+ }
+ return ret;
+ }
+
+ /* And one copy for every other CPU */
+ for_each_possible_cpu(i) {
+ if (newinfo->entries[i] && newinfo->entries[i] != entry0)
+ memcpy(newinfo->entries[i], entry0, newinfo->size);
+ }
+
+ return ret;
+}
+
+static void
+get_counters(const struct xt_table_info *t,
+ struct xt_counters counters[])
+{
+ struct ipt_entry *iter;
+ unsigned int cpu;
+ unsigned int i;
+
+ for_each_possible_cpu(cpu) {
+ seqcount_t *s = &per_cpu(xt_recseq, cpu);
+
+ i = 0;
+ xt_entry_foreach(iter, t->entries[cpu], t->size) {
+ u64 bcnt, pcnt;
+ unsigned int start;
+
+ do {
+ start = read_seqcount_begin(s);
+ bcnt = iter->counters.bcnt;
+ pcnt = iter->counters.pcnt;
+ } while (read_seqcount_retry(s, start));
+
+ ADD_COUNTER(counters[i], bcnt, pcnt);
+ ++i; /* macro does multi eval of i */
+ }
+ }
+}
+
+static struct xt_counters *alloc_counters(const struct xt_table *table)
+{
+ unsigned int countersize;
+ struct xt_counters *counters;
+ const struct xt_table_info *private = table->private;
+
+ /* We need atomic snapshot of counters: rest doesn't change
+ (other than comefrom, which userspace doesn't care
+ about). */
+ countersize = sizeof(struct xt_counters) * private->number;
+ counters = vzalloc(countersize);
+
+ if (counters == NULL)
+ return ERR_PTR(-ENOMEM);
+
+ get_counters(private, counters);
+
+ return counters;
+}
+
+static int
+copy_entries_to_user(unsigned int total_size,
+ const struct xt_table *table,
+ void __user *userptr)
+{
+ unsigned int off, num;
+ const struct ipt_entry *e;
+ struct xt_counters *counters;
+ const struct xt_table_info *private = table->private;
+ int ret = 0;
+ const void *loc_cpu_entry;
+
+ counters = alloc_counters(table);
+ if (IS_ERR(counters))
+ return PTR_ERR(counters);
+
+ /* choose the copy that is on our node/cpu, ...
+ * This choice is lazy (because current thread is
+ * allowed to migrate to another cpu)
+ */
+ loc_cpu_entry = private->entries[raw_smp_processor_id()];
+ if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) {
+ ret = -EFAULT;
+ goto free_counters;
+ }
+
+ /* FIXME: use iterator macros --RR */
+ /* ... then go back and fix counters and names */
+ for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){
+ unsigned int i;
+ const struct xt_entry_match *m;
+ const struct xt_entry_target *t;
+
+ e = (struct ipt_entry *)(loc_cpu_entry + off);
+ if (copy_to_user(userptr + off
+ + offsetof(struct ipt_entry, counters),
+ &counters[num],
+ sizeof(counters[num])) != 0) {
+ ret = -EFAULT;
+ goto free_counters;
+ }
+
+ for (i = sizeof(struct ipt_entry);
+ i < e->target_offset;
+ i += m->u.match_size) {
+ m = (void *)e + i;
+
+ if (copy_to_user(userptr + off + i
+ + offsetof(struct xt_entry_match,
+ u.user.name),
+ m->u.kernel.match->name,
+ strlen(m->u.kernel.match->name)+1)
+ != 0) {
+ ret = -EFAULT;
+ goto free_counters;
+ }
+ }
+
+ t = ipt_get_target_c(e);
+ if (copy_to_user(userptr + off + e->target_offset
+ + offsetof(struct xt_entry_target,
+ u.user.name),
+ t->u.kernel.target->name,
+ strlen(t->u.kernel.target->name)+1) != 0) {
+ ret = -EFAULT;
+ goto free_counters;
+ }
+ }
+
+ free_counters:
+ vfree(counters);
+ return ret;
+}
+
+#ifdef CONFIG_COMPAT
+static void compat_standard_from_user(void *dst, const void *src)
+{
+ int v = *(compat_int_t *)src;
+
+ if (v > 0)
+ v += xt_compat_calc_jump(AF_INET, v);
+ memcpy(dst, &v, sizeof(v));
+}
+
+static int compat_standard_to_user(void __user *dst, const void *src)
+{
+ compat_int_t cv = *(int *)src;
+
+ if (cv > 0)
+ cv -= xt_compat_calc_jump(AF_INET, cv);
+ return copy_to_user(dst, &cv, sizeof(cv)) ? -EFAULT : 0;
+}
+
+static int compat_calc_entry(const struct ipt_entry *e,
+ const struct xt_table_info *info,
+ const void *base, struct xt_table_info *newinfo)
+{
+ const struct xt_entry_match *ematch;
+ const struct xt_entry_target *t;
+ unsigned int entry_offset;
+ int off, i, ret;
+
+ off = sizeof(struct ipt_entry) - sizeof(struct compat_ipt_entry);
+ entry_offset = (void *)e - base;
+ xt_ematch_foreach(ematch, e)
+ off += xt_compat_match_offset(ematch->u.kernel.match);
+ t = ipt_get_target_c(e);
+ off += xt_compat_target_offset(t->u.kernel.target);
+ newinfo->size -= off;
+ ret = xt_compat_add_offset(AF_INET, entry_offset, off);
+ if (ret)
+ return ret;
+
+ for (i = 0; i < NF_INET_NUMHOOKS; i++) {
+ if (info->hook_entry[i] &&
+ (e < (struct ipt_entry *)(base + info->hook_entry[i])))
+ newinfo->hook_entry[i] -= off;
+ if (info->underflow[i] &&
+ (e < (struct ipt_entry *)(base + info->underflow[i])))
+ newinfo->underflow[i] -= off;
+ }
+ return 0;
+}
+
+static int compat_table_info(const struct xt_table_info *info,
+ struct xt_table_info *newinfo)
+{
+ struct ipt_entry *iter;
+ void *loc_cpu_entry;
+ int ret;
+
+ if (!newinfo || !info)
+ return -EINVAL;
+
+ /* we dont care about newinfo->entries[] */
+ memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
+ newinfo->initial_entries = 0;
+ loc_cpu_entry = info->entries[raw_smp_processor_id()];
+ xt_compat_init_offsets(AF_INET, info->number);
+ xt_entry_foreach(iter, loc_cpu_entry, info->size) {
+ ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo);
+ if (ret != 0)
+ return ret;
+ }
+ return 0;
+}
+#endif
+
+static int get_info(struct net *net, void __user *user,
+ const int *len, int compat)
+{
+ char name[XT_TABLE_MAXNAMELEN];
+ struct xt_table *t;
+ int ret;
+
+ if (*len != sizeof(struct ipt_getinfo)) {
+ duprintf("length %u != %zu\n", *len,
+ sizeof(struct ipt_getinfo));
+ return -EINVAL;
+ }
+
+ if (copy_from_user(name, user, sizeof(name)) != 0)
+ return -EFAULT;
+
+ name[XT_TABLE_MAXNAMELEN-1] = '\0';
+#ifdef CONFIG_COMPAT
+ if (compat)
+ xt_compat_lock(AF_INET);
+#endif
+ t = try_then_request_module(xt_find_table_lock(net, AF_INET, name),
+ "iptable_%s", name);
+ if (!IS_ERR_OR_NULL(t)) {
+ struct ipt_getinfo info;
+ const struct xt_table_info *private = t->private;
+#ifdef CONFIG_COMPAT
+ struct xt_table_info tmp;
+
+ if (compat) {
+ ret = compat_table_info(private, &tmp);
+ xt_compat_flush_offsets(AF_INET);
+ private = &tmp;
+ }
+#endif
+ memset(&info, 0, sizeof(info));
+ info.valid_hooks = t->valid_hooks;
+ memcpy(info.hook_entry, private->hook_entry,
+ sizeof(info.hook_entry));
+ memcpy(info.underflow, private->underflow,
+ sizeof(info.underflow));
+ info.num_entries = private->number;
+ info.size = private->size;
+ strcpy(info.name, name);
+
+ if (copy_to_user(user, &info, *len) != 0)
+ ret = -EFAULT;
+ else
+ ret = 0;
+
+ xt_table_unlock(t);
+ module_put(t->me);
+ } else
+ ret = t ? PTR_ERR(t) : -ENOENT;
+#ifdef CONFIG_COMPAT
+ if (compat)
+ xt_compat_unlock(AF_INET);
+#endif
+ return ret;
+}
+
+static int
+get_entries(struct net *net, struct ipt_get_entries __user *uptr,
+ const int *len)
+{
+ int ret;
+ struct ipt_get_entries get;
+ struct xt_table *t;
+
+ if (*len < sizeof(get)) {
+ duprintf("get_entries: %u < %zu\n", *len, sizeof(get));
+ return -EINVAL;
+ }
+ if (copy_from_user(&get, uptr, sizeof(get)) != 0)
+ return -EFAULT;
+ if (*len != sizeof(struct ipt_get_entries) + get.size) {
+ duprintf("get_entries: %u != %zu\n",
+ *len, sizeof(get) + get.size);
+ return -EINVAL;
+ }
+
+ t = xt_find_table_lock(net, AF_INET, get.name);
+ if (!IS_ERR_OR_NULL(t)) {
+ const struct xt_table_info *private = t->private;
+ duprintf("t->private->number = %u\n", private->number);
+ if (get.size == private->size)
+ ret = copy_entries_to_user(private->size,
+ t, uptr->entrytable);
+ else {
+ duprintf("get_entries: I've got %u not %u!\n",
+ private->size, get.size);
+ ret = -EAGAIN;
+ }
+ module_put(t->me);
+ xt_table_unlock(t);
+ } else
+ ret = t ? PTR_ERR(t) : -ENOENT;
+
+ return ret;
+}
+
+static int
+__do_replace(struct net *net, const char *name, unsigned int valid_hooks,
+ struct xt_table_info *newinfo, unsigned int num_counters,
+ void __user *counters_ptr)
+{
+ int ret;
+ struct xt_table *t;
+ struct xt_table_info *oldinfo;
+ struct xt_counters *counters;
+ void *loc_cpu_old_entry;
+ struct ipt_entry *iter;
+
+ ret = 0;
+ counters = vzalloc(num_counters * sizeof(struct xt_counters));
+ if (!counters) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ t = try_then_request_module(xt_find_table_lock(net, AF_INET, name),
+ "iptable_%s", name);
+ if (IS_ERR_OR_NULL(t)) {
+ ret = t ? PTR_ERR(t) : -ENOENT;
+ goto free_newinfo_counters_untrans;
+ }
+
+ /* You lied! */
+ if (valid_hooks != t->valid_hooks) {
+ duprintf("Valid hook crap: %08X vs %08X\n",
+ valid_hooks, t->valid_hooks);
+ ret = -EINVAL;
+ goto put_module;
+ }
+
+ oldinfo = xt_replace_table(t, num_counters, newinfo, &ret);
+ if (!oldinfo)
+ goto put_module;
+
+ /* Update module usage count based on number of rules */
+ duprintf("do_replace: oldnum=%u, initnum=%u, newnum=%u\n",
+ oldinfo->number, oldinfo->initial_entries, newinfo->number);
+ if ((oldinfo->number > oldinfo->initial_entries) ||
+ (newinfo->number <= oldinfo->initial_entries))
+ module_put(t->me);
+ if ((oldinfo->number > oldinfo->initial_entries) &&
+ (newinfo->number <= oldinfo->initial_entries))
+ module_put(t->me);
+
+ /* Get the old counters, and synchronize with replace */
+ get_counters(oldinfo, counters);
+
+ /* Decrease module usage counts and free resource */
+ loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()];
+ xt_entry_foreach(iter, loc_cpu_old_entry, oldinfo->size)
+ cleanup_entry(iter, net);
+
+ xt_free_table_info(oldinfo);
+ if (copy_to_user(counters_ptr, counters,
+ sizeof(struct xt_counters) * num_counters) != 0) {
+ /* Silent error, can't fail, new table is already in place */
+ net_warn_ratelimited("iptables: counters copy to user failed while replacing table\n");
+ }
+ vfree(counters);
+ xt_table_unlock(t);
+ return ret;
+
+ put_module:
+ module_put(t->me);
+ xt_table_unlock(t);
+ free_newinfo_counters_untrans:
+ vfree(counters);
+ out:
+ return ret;
+}
+
+static int
+do_replace(struct net *net, const void __user *user, unsigned int len)
+{
+ int ret;
+ struct ipt_replace tmp;
+ struct xt_table_info *newinfo;
+ void *loc_cpu_entry;
+ struct ipt_entry *iter;
+
+ if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
+ return -EFAULT;
+
+ /* overflow check */
+ if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters))
+ return -ENOMEM;
+ if (tmp.num_counters == 0)
+ return -EINVAL;
+
+ tmp.name[sizeof(tmp.name)-1] = 0;
+
+ newinfo = xt_alloc_table_info(tmp.size);
+ if (!newinfo)
+ return -ENOMEM;
+
+ /* choose the copy that is on our node/cpu */
+ loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
+ if (copy_from_user(loc_cpu_entry, user + sizeof(tmp),
+ tmp.size) != 0) {
+ ret = -EFAULT;
+ goto free_newinfo;
+ }
+
+ ret = translate_table(net, newinfo, loc_cpu_entry, &tmp);
+ if (ret != 0)
+ goto free_newinfo;
+
+ duprintf("Translated table\n");
+
+ ret = __do_replace(net, tmp.name, tmp.valid_hooks, newinfo,
+ tmp.num_counters, tmp.counters);
+ if (ret)
+ goto free_newinfo_untrans;
+ return 0;
+
+ free_newinfo_untrans:
+ xt_entry_foreach(iter, loc_cpu_entry, newinfo->size)
+ cleanup_entry(iter, net);
+ free_newinfo:
+ xt_free_table_info(newinfo);
+ return ret;
+}
+
+static int
+do_add_counters(struct net *net, const void __user *user,
+ unsigned int len, int compat)
+{
+ unsigned int i, curcpu;
+ struct xt_counters_info tmp;
+ struct xt_counters *paddc;
+ unsigned int num_counters;
+ const char *name;
+ int size;
+ void *ptmp;
+ struct xt_table *t;
+ const struct xt_table_info *private;
+ int ret = 0;
+ void *loc_cpu_entry;
+ struct ipt_entry *iter;
+ unsigned int addend;
+#ifdef CONFIG_COMPAT
+ struct compat_xt_counters_info compat_tmp;
+
+ if (compat) {
+ ptmp = &compat_tmp;
+ size = sizeof(struct compat_xt_counters_info);
+ } else
+#endif
+ {
+ ptmp = &tmp;
+ size = sizeof(struct xt_counters_info);
+ }
+
+ if (copy_from_user(ptmp, user, size) != 0)
+ return -EFAULT;
+
+#ifdef CONFIG_COMPAT
+ if (compat) {
+ num_counters = compat_tmp.num_counters;
+ name = compat_tmp.name;
+ } else
+#endif
+ {
+ num_counters = tmp.num_counters;
+ name = tmp.name;
+ }
+
+ if (len != size + num_counters * sizeof(struct xt_counters))
+ return -EINVAL;
+
+ paddc = vmalloc(len - size);
+ if (!paddc)
+ return -ENOMEM;
+
+ if (copy_from_user(paddc, user + size, len - size) != 0) {
+ ret = -EFAULT;
+ goto free;
+ }
+
+ t = xt_find_table_lock(net, AF_INET, name);
+ if (IS_ERR_OR_NULL(t)) {
+ ret = t ? PTR_ERR(t) : -ENOENT;
+ goto free;
+ }
+
+ local_bh_disable();
+ private = t->private;
+ if (private->number != num_counters) {
+ ret = -EINVAL;
+ goto unlock_up_free;
+ }
+
+ i = 0;
+ /* Choose the copy that is on our node */
+ curcpu = smp_processor_id();
+ loc_cpu_entry = private->entries[curcpu];
+ addend = xt_write_recseq_begin();
+ xt_entry_foreach(iter, loc_cpu_entry, private->size) {
+ ADD_COUNTER(iter->counters, paddc[i].bcnt, paddc[i].pcnt);
+ ++i;
+ }
+ xt_write_recseq_end(addend);
+ unlock_up_free:
+ local_bh_enable();
+ xt_table_unlock(t);
+ module_put(t->me);
+ free:
+ vfree(paddc);
+
+ return ret;
+}
+
+#ifdef CONFIG_COMPAT
+struct compat_ipt_replace {
+ char name[XT_TABLE_MAXNAMELEN];
+ u32 valid_hooks;
+ u32 num_entries;
+ u32 size;
+ u32 hook_entry[NF_INET_NUMHOOKS];
+ u32 underflow[NF_INET_NUMHOOKS];
+ u32 num_counters;
+ compat_uptr_t counters; /* struct xt_counters * */
+ struct compat_ipt_entry entries[0];
+};
+
+static int
+compat_copy_entry_to_user(struct ipt_entry *e, void __user **dstptr,
+ unsigned int *size, struct xt_counters *counters,
+ unsigned int i)
+{
+ struct xt_entry_target *t;
+ struct compat_ipt_entry __user *ce;
+ u_int16_t target_offset, next_offset;
+ compat_uint_t origsize;
+ const struct xt_entry_match *ematch;
+ int ret = 0;
+
+ origsize = *size;
+ ce = (struct compat_ipt_entry __user *)*dstptr;
+ if (copy_to_user(ce, e, sizeof(struct ipt_entry)) != 0 ||
+ copy_to_user(&ce->counters, &counters[i],
+ sizeof(counters[i])) != 0)
+ return -EFAULT;
+
+ *dstptr += sizeof(struct compat_ipt_entry);
+ *size -= sizeof(struct ipt_entry) - sizeof(struct compat_ipt_entry);
+
+ xt_ematch_foreach(ematch, e) {
+ ret = xt_compat_match_to_user(ematch, dstptr, size);
+ if (ret != 0)
+ return ret;
+ }
+ target_offset = e->target_offset - (origsize - *size);
+ t = ipt_get_target(e);
+ ret = xt_compat_target_to_user(t, dstptr, size);
+ if (ret)
+ return ret;
+ next_offset = e->next_offset - (origsize - *size);
+ if (put_user(target_offset, &ce->target_offset) != 0 ||
+ put_user(next_offset, &ce->next_offset) != 0)
+ return -EFAULT;
+ return 0;
+}
+
+static int
+compat_find_calc_match(struct xt_entry_match *m,
+ const char *name,
+ const struct ipt_ip *ip,
+ unsigned int hookmask,
+ int *size)
+{
+ struct xt_match *match;
+
+ match = xt_request_find_match(NFPROTO_IPV4, m->u.user.name,
+ m->u.user.revision);
+ if (IS_ERR(match)) {
+ duprintf("compat_check_calc_match: `%s' not found\n",
+ m->u.user.name);
+ return PTR_ERR(match);
+ }
+ m->u.kernel.match = match;
+ *size += xt_compat_match_offset(match);
+ return 0;
+}
+
+static void compat_release_entry(struct compat_ipt_entry *e)
+{
+ struct xt_entry_target *t;
+ struct xt_entry_match *ematch;
+
+ /* Cleanup all matches */
+ xt_ematch_foreach(ematch, e)
+ module_put(ematch->u.kernel.match->me);
+ t = compat_ipt_get_target(e);
+ module_put(t->u.kernel.target->me);
+}
+
+static int
+check_compat_entry_size_and_hooks(struct compat_ipt_entry *e,
+ struct xt_table_info *newinfo,
+ unsigned int *size,
+ const unsigned char *base,
+ const unsigned char *limit,
+ const unsigned int *hook_entries,
+ const unsigned int *underflows,
+ const char *name)
+{
+ struct xt_entry_match *ematch;
+ struct xt_entry_target *t;
+ struct xt_target *target;
+ unsigned int entry_offset;
+ unsigned int j;
+ int ret, off, h;
+
+ duprintf("check_compat_entry_size_and_hooks %p\n", e);
+ if ((unsigned long)e % __alignof__(struct compat_ipt_entry) != 0 ||
+ (unsigned char *)e + sizeof(struct compat_ipt_entry) >= limit) {
+ duprintf("Bad offset %p, limit = %p\n", e, limit);
+ return -EINVAL;
+ }
+
+ if (e->next_offset < sizeof(struct compat_ipt_entry) +
+ sizeof(struct compat_xt_entry_target)) {
+ duprintf("checking: element %p size %u\n",
+ e, e->next_offset);
+ return -EINVAL;
+ }
+
+ /* For purposes of check_entry casting the compat entry is fine */
+ ret = check_entry((struct ipt_entry *)e, name);
+ if (ret)
+ return ret;
+
+ off = sizeof(struct ipt_entry) - sizeof(struct compat_ipt_entry);
+ entry_offset = (void *)e - (void *)base;
+ j = 0;
+ xt_ematch_foreach(ematch, e) {
+ ret = compat_find_calc_match(ematch, name,
+ &e->ip, e->comefrom, &off);
+ if (ret != 0)
+ goto release_matches;
+ ++j;
+ }
+
+ t = compat_ipt_get_target(e);
+ target = xt_request_find_target(NFPROTO_IPV4, t->u.user.name,
+ t->u.user.revision);
+ if (IS_ERR(target)) {
+ duprintf("check_compat_entry_size_and_hooks: `%s' not found\n",
+ t->u.user.name);
+ ret = PTR_ERR(target);
+ goto release_matches;
+ }
+ t->u.kernel.target = target;
+
+ off += xt_compat_target_offset(target);
+ *size += off;
+ ret = xt_compat_add_offset(AF_INET, entry_offset, off);
+ if (ret)
+ goto out;
+
+ /* Check hooks & underflows */
+ for (h = 0; h < NF_INET_NUMHOOKS; h++) {
+ if ((unsigned char *)e - base == hook_entries[h])
+ newinfo->hook_entry[h] = hook_entries[h];
+ if ((unsigned char *)e - base == underflows[h])
+ newinfo->underflow[h] = underflows[h];
+ }
+
+ /* Clear counters and comefrom */
+ memset(&e->counters, 0, sizeof(e->counters));
+ e->comefrom = 0;
+ return 0;
+
+out:
+ module_put(t->u.kernel.target->me);
+release_matches:
+ xt_ematch_foreach(ematch, e) {
+ if (j-- == 0)
+ break;
+ module_put(ematch->u.kernel.match->me);
+ }
+ return ret;
+}
+
+static int
+compat_copy_entry_from_user(struct compat_ipt_entry *e, void **dstptr,
+ unsigned int *size, const char *name,
+ struct xt_table_info *newinfo, unsigned char *base)
+{
+ struct xt_entry_target *t;
+ struct xt_target *target;
+ struct ipt_entry *de;
+ unsigned int origsize;
+ int ret, h;
+ struct xt_entry_match *ematch;
+
+ ret = 0;
+ origsize = *size;
+ de = (struct ipt_entry *)*dstptr;
+ memcpy(de, e, sizeof(struct ipt_entry));
+ memcpy(&de->counters, &e->counters, sizeof(e->counters));
+
+ *dstptr += sizeof(struct ipt_entry);
+ *size += sizeof(struct ipt_entry) - sizeof(struct compat_ipt_entry);
+
+ xt_ematch_foreach(ematch, e) {
+ ret = xt_compat_match_from_user(ematch, dstptr, size);
+ if (ret != 0)
+ return ret;
+ }
+ de->target_offset = e->target_offset - (origsize - *size);
+ t = compat_ipt_get_target(e);
+ target = t->u.kernel.target;
+ xt_compat_target_from_user(t, dstptr, size);
+
+ de->next_offset = e->next_offset - (origsize - *size);
+ for (h = 0; h < NF_INET_NUMHOOKS; h++) {
+ if ((unsigned char *)de - base < newinfo->hook_entry[h])
+ newinfo->hook_entry[h] -= origsize - *size;
+ if ((unsigned char *)de - base < newinfo->underflow[h])
+ newinfo->underflow[h] -= origsize - *size;
+ }
+ return ret;
+}
+
+static int
+compat_check_entry(struct ipt_entry *e, struct net *net, const char *name)
+{
+ struct xt_entry_match *ematch;
+ struct xt_mtchk_param mtpar;
+ unsigned int j;
+ int ret = 0;
+
+ j = 0;
+ mtpar.net = net;
+ mtpar.table = name;
+ mtpar.entryinfo = &e->ip;
+ mtpar.hook_mask = e->comefrom;
+ mtpar.family = NFPROTO_IPV4;
+ xt_ematch_foreach(ematch, e) {
+ ret = check_match(ematch, &mtpar);
+ if (ret != 0)
+ goto cleanup_matches;
+ ++j;
+ }
+
+ ret = check_target(e, net, name);
+ if (ret)
+ goto cleanup_matches;
+ return 0;
+
+ cleanup_matches:
+ xt_ematch_foreach(ematch, e) {
+ if (j-- == 0)
+ break;
+ cleanup_match(ematch, net);
+ }
+ return ret;
+}
+
+static int
+translate_compat_table(struct net *net,
+ const char *name,
+ unsigned int valid_hooks,
+ struct xt_table_info **pinfo,
+ void **pentry0,
+ unsigned int total_size,
+ unsigned int number,
+ unsigned int *hook_entries,
+ unsigned int *underflows)
+{
+ unsigned int i, j;
+ struct xt_table_info *newinfo, *info;
+ void *pos, *entry0, *entry1;
+ struct compat_ipt_entry *iter0;
+ struct ipt_entry *iter1;
+ unsigned int size;
+ int ret;
+
+ info = *pinfo;
+ entry0 = *pentry0;
+ size = total_size;
+ info->number = number;
+
+ /* Init all hooks to impossible value. */
+ for (i = 0; i < NF_INET_NUMHOOKS; i++) {
+ info->hook_entry[i] = 0xFFFFFFFF;
+ info->underflow[i] = 0xFFFFFFFF;
+ }
+
+ duprintf("translate_compat_table: size %u\n", info->size);
+ j = 0;
+ xt_compat_lock(AF_INET);
+ xt_compat_init_offsets(AF_INET, number);
+ /* Walk through entries, checking offsets. */
+ xt_entry_foreach(iter0, entry0, total_size) {
+ ret = check_compat_entry_size_and_hooks(iter0, info, &size,
+ entry0,
+ entry0 + total_size,
+ hook_entries,
+ underflows,
+ name);
+ if (ret != 0)
+ goto out_unlock;
+ ++j;
+ }
+
+ ret = -EINVAL;
+ if (j != number) {
+ duprintf("translate_compat_table: %u not %u entries\n",
+ j, number);
+ goto out_unlock;
+ }
+
+ /* Check hooks all assigned */
+ for (i = 0; i < NF_INET_NUMHOOKS; i++) {
+ /* Only hooks which are valid */
+ if (!(valid_hooks & (1 << i)))
+ continue;
+ if (info->hook_entry[i] == 0xFFFFFFFF) {
+ duprintf("Invalid hook entry %u %u\n",
+ i, hook_entries[i]);
+ goto out_unlock;
+ }
+ if (info->underflow[i] == 0xFFFFFFFF) {
+ duprintf("Invalid underflow %u %u\n",
+ i, underflows[i]);
+ goto out_unlock;
+ }
+ }
+
+ ret = -ENOMEM;
+ newinfo = xt_alloc_table_info(size);
+ if (!newinfo)
+ goto out_unlock;
+
+ newinfo->number = number;
+ for (i = 0; i < NF_INET_NUMHOOKS; i++) {
+ newinfo->hook_entry[i] = info->hook_entry[i];
+ newinfo->underflow[i] = info->underflow[i];
+ }
+ entry1 = newinfo->entries[raw_smp_processor_id()];
+ pos = entry1;
+ size = total_size;
+ xt_entry_foreach(iter0, entry0, total_size) {
+ ret = compat_copy_entry_from_user(iter0, &pos, &size,
+ name, newinfo, entry1);
+ if (ret != 0)
+ break;
+ }
+ xt_compat_flush_offsets(AF_INET);
+ xt_compat_unlock(AF_INET);
+ if (ret)
+ goto free_newinfo;
+
+ ret = -ELOOP;
+ if (!mark_source_chains(newinfo, valid_hooks, entry1))
+ goto free_newinfo;
+
+ i = 0;
+ xt_entry_foreach(iter1, entry1, newinfo->size) {
+ ret = compat_check_entry(iter1, net, name);
+ if (ret != 0)
+ break;
+ ++i;
+ if (strcmp(ipt_get_target(iter1)->u.user.name,
+ XT_ERROR_TARGET) == 0)
+ ++newinfo->stacksize;
+ }
+ if (ret) {
+ /*
+ * The first i matches need cleanup_entry (calls ->destroy)
+ * because they had called ->check already. The other j-i
+ * entries need only release.
+ */
+ int skip = i;
+ j -= i;
+ xt_entry_foreach(iter0, entry0, newinfo->size) {
+ if (skip-- > 0)
+ continue;
+ if (j-- == 0)
+ break;
+ compat_release_entry(iter0);
+ }
+ xt_entry_foreach(iter1, entry1, newinfo->size) {
+ if (i-- == 0)
+ break;
+ cleanup_entry(iter1, net);
+ }
+ xt_free_table_info(newinfo);
+ return ret;
+ }
+
+ /* And one copy for every other CPU */
+ for_each_possible_cpu(i)
+ if (newinfo->entries[i] && newinfo->entries[i] != entry1)
+ memcpy(newinfo->entries[i], entry1, newinfo->size);
+
+ *pinfo = newinfo;
+ *pentry0 = entry1;
+ xt_free_table_info(info);
+ return 0;
+
+free_newinfo:
+ xt_free_table_info(newinfo);
+out:
+ xt_entry_foreach(iter0, entry0, total_size) {
+ if (j-- == 0)
+ break;
+ compat_release_entry(iter0);
+ }
+ return ret;
+out_unlock:
+ xt_compat_flush_offsets(AF_INET);
+ xt_compat_unlock(AF_INET);
+ goto out;
+}
+
+static int
+compat_do_replace(struct net *net, void __user *user, unsigned int len)
+{
+ int ret;
+ struct compat_ipt_replace tmp;
+ struct xt_table_info *newinfo;
+ void *loc_cpu_entry;
+ struct ipt_entry *iter;
+
+ if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
+ return -EFAULT;
+
+ /* overflow check */
+ if (tmp.size >= INT_MAX / num_possible_cpus())
+ return -ENOMEM;
+ if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters))
+ return -ENOMEM;
+ if (tmp.num_counters == 0)
+ return -EINVAL;
+
+ tmp.name[sizeof(tmp.name)-1] = 0;
+
+ newinfo = xt_alloc_table_info(tmp.size);
+ if (!newinfo)
+ return -ENOMEM;
+
+ /* choose the copy that is on our node/cpu */
+ loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
+ if (copy_from_user(loc_cpu_entry, user + sizeof(tmp),
+ tmp.size) != 0) {
+ ret = -EFAULT;
+ goto free_newinfo;
+ }
+
+ ret = translate_compat_table(net, tmp.name, tmp.valid_hooks,
+ &newinfo, &loc_cpu_entry, tmp.size,
+ tmp.num_entries, tmp.hook_entry,
+ tmp.underflow);
+ if (ret != 0)
+ goto free_newinfo;
+
+ duprintf("compat_do_replace: Translated table\n");
+
+ ret = __do_replace(net, tmp.name, tmp.valid_hooks, newinfo,
+ tmp.num_counters, compat_ptr(tmp.counters));
+ if (ret)
+ goto free_newinfo_untrans;
+ return 0;
+
+ free_newinfo_untrans:
+ xt_entry_foreach(iter, loc_cpu_entry, newinfo->size)
+ cleanup_entry(iter, net);
+ free_newinfo:
+ xt_free_table_info(newinfo);
+ return ret;
+}
+
+static int
+compat_do_ipt_set_ctl(struct sock *sk, int cmd, void __user *user,
+ unsigned int len)
+{
+ int ret;
+
+ if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+
+ switch (cmd) {
+ case IPT_SO_SET_REPLACE:
+ ret = compat_do_replace(sock_net(sk), user, len);
+ break;
+
+ case IPT_SO_SET_ADD_COUNTERS:
+ ret = do_add_counters(sock_net(sk), user, len, 1);
+ break;
+
+ default:
+ duprintf("do_ipt_set_ctl: unknown request %i\n", cmd);
+ ret = -EINVAL;
+ }
+
+ return ret;
+}
+
+struct compat_ipt_get_entries {
+ char name[XT_TABLE_MAXNAMELEN];
+ compat_uint_t size;
+ struct compat_ipt_entry entrytable[0];
+};
+
+static int
+compat_copy_entries_to_user(unsigned int total_size, struct xt_table *table,
+ void __user *userptr)
+{
+ struct xt_counters *counters;
+ const struct xt_table_info *private = table->private;
+ void __user *pos;
+ unsigned int size;
+ int ret = 0;
+ const void *loc_cpu_entry;
+ unsigned int i = 0;
+ struct ipt_entry *iter;
+
+ counters = alloc_counters(table);
+ if (IS_ERR(counters))
+ return PTR_ERR(counters);
+
+ /* choose the copy that is on our node/cpu, ...
+ * This choice is lazy (because current thread is
+ * allowed to migrate to another cpu)
+ */
+ loc_cpu_entry = private->entries[raw_smp_processor_id()];
+ pos = userptr;
+ size = total_size;
+ xt_entry_foreach(iter, loc_cpu_entry, total_size) {
+ ret = compat_copy_entry_to_user(iter, &pos,
+ &size, counters, i++);
+ if (ret != 0)
+ break;
+ }
+
+ vfree(counters);
+ return ret;
+}
+
+static int
+compat_get_entries(struct net *net, struct compat_ipt_get_entries __user *uptr,
+ int *len)
+{
+ int ret;
+ struct compat_ipt_get_entries get;
+ struct xt_table *t;
+
+ if (*len < sizeof(get)) {
+ duprintf("compat_get_entries: %u < %zu\n", *len, sizeof(get));
+ return -EINVAL;
+ }
+
+ if (copy_from_user(&get, uptr, sizeof(get)) != 0)
+ return -EFAULT;
+
+ if (*len != sizeof(struct compat_ipt_get_entries) + get.size) {
+ duprintf("compat_get_entries: %u != %zu\n",
+ *len, sizeof(get) + get.size);
+ return -EINVAL;
+ }
+
+ xt_compat_lock(AF_INET);
+ t = xt_find_table_lock(net, AF_INET, get.name);
+ if (!IS_ERR_OR_NULL(t)) {
+ const struct xt_table_info *private = t->private;
+ struct xt_table_info info;
+ duprintf("t->private->number = %u\n", private->number);
+ ret = compat_table_info(private, &info);
+ if (!ret && get.size == info.size) {
+ ret = compat_copy_entries_to_user(private->size,
+ t, uptr->entrytable);
+ } else if (!ret) {
+ duprintf("compat_get_entries: I've got %u not %u!\n",
+ private->size, get.size);
+ ret = -EAGAIN;
+ }
+ xt_compat_flush_offsets(AF_INET);
+ module_put(t->me);
+ xt_table_unlock(t);
+ } else
+ ret = t ? PTR_ERR(t) : -ENOENT;
+
+ xt_compat_unlock(AF_INET);
+ return ret;
+}
+
+static int do_ipt_get_ctl(struct sock *, int, void __user *, int *);
+
+static int
+compat_do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
+{
+ int ret;
+
+ if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+
+ switch (cmd) {
+ case IPT_SO_GET_INFO:
+ ret = get_info(sock_net(sk), user, len, 1);
+ break;
+ case IPT_SO_GET_ENTRIES:
+ ret = compat_get_entries(sock_net(sk), user, len);
+ break;
+ default:
+ ret = do_ipt_get_ctl(sk, cmd, user, len);
+ }
+ return ret;
+}
+#endif
+
+static int
+do_ipt_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
+{
+ int ret;
+
+ if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+
+ switch (cmd) {
+ case IPT_SO_SET_REPLACE:
+ ret = do_replace(sock_net(sk), user, len);
+ break;
+
+ case IPT_SO_SET_ADD_COUNTERS:
+ ret = do_add_counters(sock_net(sk), user, len, 0);
+ break;
+
+ default:
+ duprintf("do_ipt_set_ctl: unknown request %i\n", cmd);
+ ret = -EINVAL;
+ }
+
+ return ret;
+}
+
+static int
+do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
+{
+ int ret;
+
+ if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+
+ switch (cmd) {
+ case IPT_SO_GET_INFO:
+ ret = get_info(sock_net(sk), user, len, 0);
+ break;
+
+ case IPT_SO_GET_ENTRIES:
+ ret = get_entries(sock_net(sk), user, len);
+ break;
+
+ case IPT_SO_GET_REVISION_MATCH:
+ case IPT_SO_GET_REVISION_TARGET: {
+ struct xt_get_revision rev;
+ int target;
+
+ if (*len != sizeof(rev)) {
+ ret = -EINVAL;
+ break;
+ }
+ if (copy_from_user(&rev, user, sizeof(rev)) != 0) {
+ ret = -EFAULT;
+ break;
+ }
+ rev.name[sizeof(rev.name)-1] = 0;
+
+ if (cmd == IPT_SO_GET_REVISION_TARGET)
+ target = 1;
+ else
+ target = 0;
+
+ try_then_request_module(xt_find_revision(AF_INET, rev.name,
+ rev.revision,
+ target, &ret),
+ "ipt_%s", rev.name);
+ break;
+ }
+
+ default:
+ duprintf("do_ipt_get_ctl: unknown request %i\n", cmd);
+ ret = -EINVAL;
+ }
+
+ return ret;
+}
+
+struct xt_table *ipt_register_table(struct net *net,
+ const struct xt_table *table,
+ const struct ipt_replace *repl)
+{
+ int ret;
+ struct xt_table_info *newinfo;
+ struct xt_table_info bootstrap = {0};
+ void *loc_cpu_entry;
+ struct xt_table *new_table;
+
+ newinfo = xt_alloc_table_info(repl->size);
+ if (!newinfo) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ /* choose the copy on our node/cpu, but dont care about preemption */
+ loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
+ memcpy(loc_cpu_entry, repl->entries, repl->size);
+
+ ret = translate_table(net, newinfo, loc_cpu_entry, repl);
+ if (ret != 0)
+ goto out_free;
+
+ new_table = xt_register_table(net, table, &bootstrap, newinfo);
+ if (IS_ERR(new_table)) {
+ ret = PTR_ERR(new_table);
+ goto out_free;
+ }
+
+ return new_table;
+
+out_free:
+ xt_free_table_info(newinfo);
+out:
+ return ERR_PTR(ret);
+}
+
+void ipt_unregister_table(struct net *net, struct xt_table *table)
+{
+ struct xt_table_info *private;
+ void *loc_cpu_entry;
+ struct module *table_owner = table->me;
+ struct ipt_entry *iter;
+
+ private = xt_unregister_table(table);
+
+ /* Decrease module usage counts and free resources */
+ loc_cpu_entry = private->entries[raw_smp_processor_id()];
+ xt_entry_foreach(iter, loc_cpu_entry, private->size)
+ cleanup_entry(iter, net);
+ if (private->number > private->initial_entries)
+ module_put(table_owner);
+ xt_free_table_info(private);
+}
+
+/* Returns 1 if the type and code is matched by the range, 0 otherwise */
+static inline bool
+icmp_type_code_match(u_int8_t test_type, u_int8_t min_code, u_int8_t max_code,
+ u_int8_t type, u_int8_t code,
+ bool invert)
+{
+ return ((test_type == 0xFF) ||
+ (type == test_type && code >= min_code && code <= max_code))
+ ^ invert;
+}
+
+static bool
+icmp_match(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ const struct icmphdr *ic;
+ struct icmphdr _icmph;
+ const struct ipt_icmp *icmpinfo = par->matchinfo;
+
+ /* Must not be a fragment. */
+ if (par->fragoff != 0)
+ return false;
+
+ ic = skb_header_pointer(skb, par->thoff, sizeof(_icmph), &_icmph);
+ if (ic == NULL) {
+ /* We've been asked to examine this packet, and we
+ * can't. Hence, no choice but to drop.
+ */
+ duprintf("Dropping evil ICMP tinygram.\n");
+ par->hotdrop = true;
+ return false;
+ }
+
+ return icmp_type_code_match(icmpinfo->type,
+ icmpinfo->code[0],
+ icmpinfo->code[1],
+ ic->type, ic->code,
+ !!(icmpinfo->invflags&IPT_ICMP_INV));
+}
+
+static int icmp_checkentry(const struct xt_mtchk_param *par)
+{
+ const struct ipt_icmp *icmpinfo = par->matchinfo;
+
+ /* Must specify no unknown invflags */
+ return (icmpinfo->invflags & ~IPT_ICMP_INV) ? -EINVAL : 0;
+}
+
+static struct xt_target ipt_builtin_tg[] __read_mostly = {
+ {
+ .name = XT_STANDARD_TARGET,
+ .targetsize = sizeof(int),
+ .family = NFPROTO_IPV4,
+#ifdef CONFIG_COMPAT
+ .compatsize = sizeof(compat_int_t),
+ .compat_from_user = compat_standard_from_user,
+ .compat_to_user = compat_standard_to_user,
+#endif
+ },
+ {
+ .name = XT_ERROR_TARGET,
+ .target = ipt_error,
+ .targetsize = XT_FUNCTION_MAXNAMELEN,
+ .family = NFPROTO_IPV4,
+ },
+};
+
+static struct nf_sockopt_ops ipt_sockopts = {
+ .pf = PF_INET,
+ .set_optmin = IPT_BASE_CTL,
+ .set_optmax = IPT_SO_SET_MAX+1,
+ .set = do_ipt_set_ctl,
+#ifdef CONFIG_COMPAT
+ .compat_set = compat_do_ipt_set_ctl,
+#endif
+ .get_optmin = IPT_BASE_CTL,
+ .get_optmax = IPT_SO_GET_MAX+1,
+ .get = do_ipt_get_ctl,
+#ifdef CONFIG_COMPAT
+ .compat_get = compat_do_ipt_get_ctl,
+#endif
+ .owner = THIS_MODULE,
+};
+
+static struct xt_match ipt_builtin_mt[] __read_mostly = {
+ {
+ .name = "icmp",
+ .match = icmp_match,
+ .matchsize = sizeof(struct ipt_icmp),
+ .checkentry = icmp_checkentry,
+ .proto = IPPROTO_ICMP,
+ .family = NFPROTO_IPV4,
+ },
+};
+
+static int __net_init ip_tables_net_init(struct net *net)
+{
+ return xt_proto_init(net, NFPROTO_IPV4);
+}
+
+static void __net_exit ip_tables_net_exit(struct net *net)
+{
+ xt_proto_fini(net, NFPROTO_IPV4);
+}
+
+static struct pernet_operations ip_tables_net_ops = {
+ .init = ip_tables_net_init,
+ .exit = ip_tables_net_exit,
+};
+
+static int __init ip_tables_init(void)
+{
+ int ret;
+
+ ret = register_pernet_subsys(&ip_tables_net_ops);
+ if (ret < 0)
+ goto err1;
+
+ /* No one else will be downing sem now, so we won't sleep */
+ ret = xt_register_targets(ipt_builtin_tg, ARRAY_SIZE(ipt_builtin_tg));
+ if (ret < 0)
+ goto err2;
+ ret = xt_register_matches(ipt_builtin_mt, ARRAY_SIZE(ipt_builtin_mt));
+ if (ret < 0)
+ goto err4;
+
+ /* Register setsockopt */
+ ret = nf_register_sockopt(&ipt_sockopts);
+ if (ret < 0)
+ goto err5;
+
+ pr_info("(C) 2000-2006 Netfilter Core Team\n");
+ return 0;
+
+err5:
+ xt_unregister_matches(ipt_builtin_mt, ARRAY_SIZE(ipt_builtin_mt));
+err4:
+ xt_unregister_targets(ipt_builtin_tg, ARRAY_SIZE(ipt_builtin_tg));
+err2:
+ unregister_pernet_subsys(&ip_tables_net_ops);
+err1:
+ return ret;
+}
+
+static void __exit ip_tables_fini(void)
+{
+ nf_unregister_sockopt(&ipt_sockopts);
+
+ xt_unregister_matches(ipt_builtin_mt, ARRAY_SIZE(ipt_builtin_mt));
+ xt_unregister_targets(ipt_builtin_tg, ARRAY_SIZE(ipt_builtin_tg));
+ unregister_pernet_subsys(&ip_tables_net_ops);
+}
+
+EXPORT_SYMBOL(ipt_register_table);
+EXPORT_SYMBOL(ipt_unregister_table);
+EXPORT_SYMBOL(ipt_do_table);
+module_init(ip_tables_init);
+module_exit(ip_tables_fini);
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
new file mode 100644
index 000000000..771ab3d01
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -0,0 +1,794 @@
+/* Cluster IP hashmark target
+ * (C) 2003-2004 by Harald Welte <laforge@netfilter.org>
+ * based on ideas of Fabio Olive Leite <olive@unixforge.org>
+ *
+ * Development of this code funded by SuSE Linux AG, http://www.suse.com/
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <linux/jhash.h>
+#include <linux/bitops.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/icmp.h>
+#include <linux/if_arp.h>
+#include <linux/seq_file.h>
+#include <linux/netfilter_arp.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ipt_CLUSTERIP.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+#include <net/checksum.h>
+#include <net/ip.h>
+
+#define CLUSTERIP_VERSION "0.8"
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
+MODULE_DESCRIPTION("Xtables: CLUSTERIP target");
+
+struct clusterip_config {
+ struct list_head list; /* list of all configs */
+ atomic_t refcount; /* reference count */
+ atomic_t entries; /* number of entries/rules
+ * referencing us */
+
+ __be32 clusterip; /* the IP address */
+ u_int8_t clustermac[ETH_ALEN]; /* the MAC address */
+ struct net_device *dev; /* device */
+ u_int16_t num_total_nodes; /* total number of nodes */
+ unsigned long local_nodes; /* node number array */
+
+#ifdef CONFIG_PROC_FS
+ struct proc_dir_entry *pde; /* proc dir entry */
+#endif
+ enum clusterip_hashmode hash_mode; /* which hashing mode */
+ u_int32_t hash_initval; /* hash initialization */
+ struct rcu_head rcu;
+};
+
+#ifdef CONFIG_PROC_FS
+static const struct file_operations clusterip_proc_fops;
+#endif
+
+static int clusterip_net_id __read_mostly;
+
+struct clusterip_net {
+ struct list_head configs;
+ /* lock protects the configs list */
+ spinlock_t lock;
+
+#ifdef CONFIG_PROC_FS
+ struct proc_dir_entry *procdir;
+#endif
+};
+
+static inline void
+clusterip_config_get(struct clusterip_config *c)
+{
+ atomic_inc(&c->refcount);
+}
+
+
+static void clusterip_config_rcu_free(struct rcu_head *head)
+{
+ kfree(container_of(head, struct clusterip_config, rcu));
+}
+
+static inline void
+clusterip_config_put(struct clusterip_config *c)
+{
+ if (atomic_dec_and_test(&c->refcount))
+ call_rcu_bh(&c->rcu, clusterip_config_rcu_free);
+}
+
+/* decrease the count of entries using/referencing this config. If last
+ * entry(rule) is removed, remove the config from lists, but don't free it
+ * yet, since proc-files could still be holding references */
+static inline void
+clusterip_config_entry_put(struct clusterip_config *c)
+{
+ struct net *net = dev_net(c->dev);
+ struct clusterip_net *cn = net_generic(net, clusterip_net_id);
+
+ local_bh_disable();
+ if (atomic_dec_and_lock(&c->entries, &cn->lock)) {
+ list_del_rcu(&c->list);
+ spin_unlock(&cn->lock);
+ local_bh_enable();
+
+ dev_mc_del(c->dev, c->clustermac);
+ dev_put(c->dev);
+
+ /* In case anyone still accesses the file, the open/close
+ * functions are also incrementing the refcount on their own,
+ * so it's safe to remove the entry even if it's in use. */
+#ifdef CONFIG_PROC_FS
+ proc_remove(c->pde);
+#endif
+ return;
+ }
+ local_bh_enable();
+}
+
+static struct clusterip_config *
+__clusterip_config_find(struct net *net, __be32 clusterip)
+{
+ struct clusterip_config *c;
+ struct clusterip_net *cn = net_generic(net, clusterip_net_id);
+
+ list_for_each_entry_rcu(c, &cn->configs, list) {
+ if (c->clusterip == clusterip)
+ return c;
+ }
+
+ return NULL;
+}
+
+static inline struct clusterip_config *
+clusterip_config_find_get(struct net *net, __be32 clusterip, int entry)
+{
+ struct clusterip_config *c;
+
+ rcu_read_lock_bh();
+ c = __clusterip_config_find(net, clusterip);
+ if (c) {
+ if (unlikely(!atomic_inc_not_zero(&c->refcount)))
+ c = NULL;
+ else if (entry)
+ atomic_inc(&c->entries);
+ }
+ rcu_read_unlock_bh();
+
+ return c;
+}
+
+static void
+clusterip_config_init_nodelist(struct clusterip_config *c,
+ const struct ipt_clusterip_tgt_info *i)
+{
+ int n;
+
+ for (n = 0; n < i->num_local_nodes; n++)
+ set_bit(i->local_nodes[n] - 1, &c->local_nodes);
+}
+
+static struct clusterip_config *
+clusterip_config_init(const struct ipt_clusterip_tgt_info *i, __be32 ip,
+ struct net_device *dev)
+{
+ struct clusterip_config *c;
+ struct clusterip_net *cn = net_generic(dev_net(dev), clusterip_net_id);
+
+ c = kzalloc(sizeof(*c), GFP_ATOMIC);
+ if (!c)
+ return NULL;
+
+ c->dev = dev;
+ c->clusterip = ip;
+ memcpy(&c->clustermac, &i->clustermac, ETH_ALEN);
+ c->num_total_nodes = i->num_total_nodes;
+ clusterip_config_init_nodelist(c, i);
+ c->hash_mode = i->hash_mode;
+ c->hash_initval = i->hash_initval;
+ atomic_set(&c->refcount, 1);
+ atomic_set(&c->entries, 1);
+
+#ifdef CONFIG_PROC_FS
+ {
+ char buffer[16];
+
+ /* create proc dir entry */
+ sprintf(buffer, "%pI4", &ip);
+ c->pde = proc_create_data(buffer, S_IWUSR|S_IRUSR,
+ cn->procdir,
+ &clusterip_proc_fops, c);
+ if (!c->pde) {
+ kfree(c);
+ return NULL;
+ }
+ }
+#endif
+
+ spin_lock_bh(&cn->lock);
+ list_add_rcu(&c->list, &cn->configs);
+ spin_unlock_bh(&cn->lock);
+
+ return c;
+}
+
+#ifdef CONFIG_PROC_FS
+static int
+clusterip_add_node(struct clusterip_config *c, u_int16_t nodenum)
+{
+
+ if (nodenum == 0 ||
+ nodenum > c->num_total_nodes)
+ return 1;
+
+ /* check if we already have this number in our bitfield */
+ if (test_and_set_bit(nodenum - 1, &c->local_nodes))
+ return 1;
+
+ return 0;
+}
+
+static bool
+clusterip_del_node(struct clusterip_config *c, u_int16_t nodenum)
+{
+ if (nodenum == 0 ||
+ nodenum > c->num_total_nodes)
+ return true;
+
+ if (test_and_clear_bit(nodenum - 1, &c->local_nodes))
+ return false;
+
+ return true;
+}
+#endif
+
+static inline u_int32_t
+clusterip_hashfn(const struct sk_buff *skb,
+ const struct clusterip_config *config)
+{
+ const struct iphdr *iph = ip_hdr(skb);
+ unsigned long hashval;
+ u_int16_t sport = 0, dport = 0;
+ int poff;
+
+ poff = proto_ports_offset(iph->protocol);
+ if (poff >= 0) {
+ const u_int16_t *ports;
+ u16 _ports[2];
+
+ ports = skb_header_pointer(skb, iph->ihl * 4 + poff, 4, _ports);
+ if (ports) {
+ sport = ports[0];
+ dport = ports[1];
+ }
+ } else {
+ net_info_ratelimited("unknown protocol %u\n", iph->protocol);
+ }
+
+ switch (config->hash_mode) {
+ case CLUSTERIP_HASHMODE_SIP:
+ hashval = jhash_1word(ntohl(iph->saddr),
+ config->hash_initval);
+ break;
+ case CLUSTERIP_HASHMODE_SIP_SPT:
+ hashval = jhash_2words(ntohl(iph->saddr), sport,
+ config->hash_initval);
+ break;
+ case CLUSTERIP_HASHMODE_SIP_SPT_DPT:
+ hashval = jhash_3words(ntohl(iph->saddr), sport, dport,
+ config->hash_initval);
+ break;
+ default:
+ /* to make gcc happy */
+ hashval = 0;
+ /* This cannot happen, unless the check function wasn't called
+ * at rule load time */
+ pr_info("unknown mode %u\n", config->hash_mode);
+ BUG();
+ break;
+ }
+
+ /* node numbers are 1..n, not 0..n */
+ return reciprocal_scale(hashval, config->num_total_nodes) + 1;
+}
+
+static inline int
+clusterip_responsible(const struct clusterip_config *config, u_int32_t hash)
+{
+ return test_bit(hash - 1, &config->local_nodes);
+}
+
+/***********************************************************************
+ * IPTABLES TARGET
+ ***********************************************************************/
+
+static unsigned int
+clusterip_tg(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ const struct ipt_clusterip_tgt_info *cipinfo = par->targinfo;
+ struct nf_conn *ct;
+ enum ip_conntrack_info ctinfo;
+ u_int32_t hash;
+
+ /* don't need to clusterip_config_get() here, since refcount
+ * is only decremented by destroy() - and ip_tables guarantees
+ * that the ->target() function isn't called after ->destroy() */
+
+ ct = nf_ct_get(skb, &ctinfo);
+ if (ct == NULL)
+ return NF_DROP;
+
+ /* special case: ICMP error handling. conntrack distinguishes between
+ * error messages (RELATED) and information requests (see below) */
+ if (ip_hdr(skb)->protocol == IPPROTO_ICMP &&
+ (ctinfo == IP_CT_RELATED ||
+ ctinfo == IP_CT_RELATED_REPLY))
+ return XT_CONTINUE;
+
+ /* ip_conntrack_icmp guarantees us that we only have ICMP_ECHO,
+ * TIMESTAMP, INFO_REQUEST or ADDRESS type icmp packets from here
+ * on, which all have an ID field [relevant for hashing]. */
+
+ hash = clusterip_hashfn(skb, cipinfo->config);
+
+ switch (ctinfo) {
+ case IP_CT_NEW:
+ ct->mark = hash;
+ break;
+ case IP_CT_RELATED:
+ case IP_CT_RELATED_REPLY:
+ /* FIXME: we don't handle expectations at the moment.
+ * They can arrive on a different node than
+ * the master connection (e.g. FTP passive mode) */
+ case IP_CT_ESTABLISHED:
+ case IP_CT_ESTABLISHED_REPLY:
+ break;
+ default: /* Prevent gcc warnings */
+ break;
+ }
+
+#ifdef DEBUG
+ nf_ct_dump_tuple_ip(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
+#endif
+ pr_debug("hash=%u ct_hash=%u ", hash, ct->mark);
+ if (!clusterip_responsible(cipinfo->config, hash)) {
+ pr_debug("not responsible\n");
+ return NF_DROP;
+ }
+ pr_debug("responsible\n");
+
+ /* despite being received via linklayer multicast, this is
+ * actually a unicast IP packet. TCP doesn't like PACKET_MULTICAST */
+ skb->pkt_type = PACKET_HOST;
+
+ return XT_CONTINUE;
+}
+
+static int clusterip_tg_check(const struct xt_tgchk_param *par)
+{
+ struct ipt_clusterip_tgt_info *cipinfo = par->targinfo;
+ const struct ipt_entry *e = par->entryinfo;
+ struct clusterip_config *config;
+ int ret;
+
+ if (cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP &&
+ cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP_SPT &&
+ cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP_SPT_DPT) {
+ pr_info("unknown mode %u\n", cipinfo->hash_mode);
+ return -EINVAL;
+
+ }
+ if (e->ip.dmsk.s_addr != htonl(0xffffffff) ||
+ e->ip.dst.s_addr == 0) {
+ pr_info("Please specify destination IP\n");
+ return -EINVAL;
+ }
+
+ /* FIXME: further sanity checks */
+
+ config = clusterip_config_find_get(par->net, e->ip.dst.s_addr, 1);
+ if (!config) {
+ if (!(cipinfo->flags & CLUSTERIP_FLAG_NEW)) {
+ pr_info("no config found for %pI4, need 'new'\n",
+ &e->ip.dst.s_addr);
+ return -EINVAL;
+ } else {
+ struct net_device *dev;
+
+ if (e->ip.iniface[0] == '\0') {
+ pr_info("Please specify an interface name\n");
+ return -EINVAL;
+ }
+
+ dev = dev_get_by_name(par->net, e->ip.iniface);
+ if (!dev) {
+ pr_info("no such interface %s\n",
+ e->ip.iniface);
+ return -ENOENT;
+ }
+
+ config = clusterip_config_init(cipinfo,
+ e->ip.dst.s_addr, dev);
+ if (!config) {
+ dev_put(dev);
+ return -ENOMEM;
+ }
+ dev_mc_add(config->dev, config->clustermac);
+ }
+ }
+ cipinfo->config = config;
+
+ ret = nf_ct_l3proto_try_module_get(par->family);
+ if (ret < 0)
+ pr_info("cannot load conntrack support for proto=%u\n",
+ par->family);
+
+ if (!par->net->xt.clusterip_deprecated_warning) {
+ pr_info("ipt_CLUSTERIP is deprecated and it will removed soon, "
+ "use xt_cluster instead\n");
+ par->net->xt.clusterip_deprecated_warning = true;
+ }
+
+ return ret;
+}
+
+/* drop reference count of cluster config when rule is deleted */
+static void clusterip_tg_destroy(const struct xt_tgdtor_param *par)
+{
+ const struct ipt_clusterip_tgt_info *cipinfo = par->targinfo;
+
+ /* if no more entries are referencing the config, remove it
+ * from the list and destroy the proc entry */
+ clusterip_config_entry_put(cipinfo->config);
+
+ clusterip_config_put(cipinfo->config);
+
+ nf_ct_l3proto_module_put(par->family);
+}
+
+#ifdef CONFIG_COMPAT
+struct compat_ipt_clusterip_tgt_info
+{
+ u_int32_t flags;
+ u_int8_t clustermac[6];
+ u_int16_t num_total_nodes;
+ u_int16_t num_local_nodes;
+ u_int16_t local_nodes[CLUSTERIP_MAX_NODES];
+ u_int32_t hash_mode;
+ u_int32_t hash_initval;
+ compat_uptr_t config;
+};
+#endif /* CONFIG_COMPAT */
+
+static struct xt_target clusterip_tg_reg __read_mostly = {
+ .name = "CLUSTERIP",
+ .family = NFPROTO_IPV4,
+ .target = clusterip_tg,
+ .checkentry = clusterip_tg_check,
+ .destroy = clusterip_tg_destroy,
+ .targetsize = sizeof(struct ipt_clusterip_tgt_info),
+#ifdef CONFIG_COMPAT
+ .compatsize = sizeof(struct compat_ipt_clusterip_tgt_info),
+#endif /* CONFIG_COMPAT */
+ .me = THIS_MODULE
+};
+
+
+/***********************************************************************
+ * ARP MANGLING CODE
+ ***********************************************************************/
+
+/* hardcoded for 48bit ethernet and 32bit ipv4 addresses */
+struct arp_payload {
+ u_int8_t src_hw[ETH_ALEN];
+ __be32 src_ip;
+ u_int8_t dst_hw[ETH_ALEN];
+ __be32 dst_ip;
+} __packed;
+
+#ifdef DEBUG
+static void arp_print(struct arp_payload *payload)
+{
+#define HBUFFERLEN 30
+ char hbuffer[HBUFFERLEN];
+ int j,k;
+
+ for (k=0, j=0; k < HBUFFERLEN-3 && j < ETH_ALEN; j++) {
+ hbuffer[k++] = hex_asc_hi(payload->src_hw[j]);
+ hbuffer[k++] = hex_asc_lo(payload->src_hw[j]);
+ hbuffer[k++]=':';
+ }
+ hbuffer[--k]='\0';
+
+ pr_debug("src %pI4@%s, dst %pI4\n",
+ &payload->src_ip, hbuffer, &payload->dst_ip);
+}
+#endif
+
+static unsigned int
+arp_mangle(const struct nf_hook_ops *ops,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ struct arphdr *arp = arp_hdr(skb);
+ struct arp_payload *payload;
+ struct clusterip_config *c;
+ struct net *net = dev_net(state->in ? state->in : state->out);
+
+ /* we don't care about non-ethernet and non-ipv4 ARP */
+ if (arp->ar_hrd != htons(ARPHRD_ETHER) ||
+ arp->ar_pro != htons(ETH_P_IP) ||
+ arp->ar_pln != 4 || arp->ar_hln != ETH_ALEN)
+ return NF_ACCEPT;
+
+ /* we only want to mangle arp requests and replies */
+ if (arp->ar_op != htons(ARPOP_REPLY) &&
+ arp->ar_op != htons(ARPOP_REQUEST))
+ return NF_ACCEPT;
+
+ payload = (void *)(arp+1);
+
+ /* if there is no clusterip configuration for the arp reply's
+ * source ip, we don't want to mangle it */
+ c = clusterip_config_find_get(net, payload->src_ip, 0);
+ if (!c)
+ return NF_ACCEPT;
+
+ /* normally the linux kernel always replies to arp queries of
+ * addresses on different interfacs. However, in the CLUSTERIP case
+ * this wouldn't work, since we didn't subscribe the mcast group on
+ * other interfaces */
+ if (c->dev != state->out) {
+ pr_debug("not mangling arp reply on different "
+ "interface: cip'%s'-skb'%s'\n",
+ c->dev->name, state->out->name);
+ clusterip_config_put(c);
+ return NF_ACCEPT;
+ }
+
+ /* mangle reply hardware address */
+ memcpy(payload->src_hw, c->clustermac, arp->ar_hln);
+
+#ifdef DEBUG
+ pr_debug("mangled arp reply: ");
+ arp_print(payload);
+#endif
+
+ clusterip_config_put(c);
+
+ return NF_ACCEPT;
+}
+
+static struct nf_hook_ops cip_arp_ops __read_mostly = {
+ .hook = arp_mangle,
+ .pf = NFPROTO_ARP,
+ .hooknum = NF_ARP_OUT,
+ .priority = -1
+};
+
+/***********************************************************************
+ * PROC DIR HANDLING
+ ***********************************************************************/
+
+#ifdef CONFIG_PROC_FS
+
+struct clusterip_seq_position {
+ unsigned int pos; /* position */
+ unsigned int weight; /* number of bits set == size */
+ unsigned int bit; /* current bit */
+ unsigned long val; /* current value */
+};
+
+static void *clusterip_seq_start(struct seq_file *s, loff_t *pos)
+{
+ struct clusterip_config *c = s->private;
+ unsigned int weight;
+ u_int32_t local_nodes;
+ struct clusterip_seq_position *idx;
+
+ /* FIXME: possible race */
+ local_nodes = c->local_nodes;
+ weight = hweight32(local_nodes);
+ if (*pos >= weight)
+ return NULL;
+
+ idx = kmalloc(sizeof(struct clusterip_seq_position), GFP_KERNEL);
+ if (!idx)
+ return ERR_PTR(-ENOMEM);
+
+ idx->pos = *pos;
+ idx->weight = weight;
+ idx->bit = ffs(local_nodes);
+ idx->val = local_nodes;
+ clear_bit(idx->bit - 1, &idx->val);
+
+ return idx;
+}
+
+static void *clusterip_seq_next(struct seq_file *s, void *v, loff_t *pos)
+{
+ struct clusterip_seq_position *idx = v;
+
+ *pos = ++idx->pos;
+ if (*pos >= idx->weight) {
+ kfree(v);
+ return NULL;
+ }
+ idx->bit = ffs(idx->val);
+ clear_bit(idx->bit - 1, &idx->val);
+ return idx;
+}
+
+static void clusterip_seq_stop(struct seq_file *s, void *v)
+{
+ if (!IS_ERR(v))
+ kfree(v);
+}
+
+static int clusterip_seq_show(struct seq_file *s, void *v)
+{
+ struct clusterip_seq_position *idx = v;
+
+ if (idx->pos != 0)
+ seq_putc(s, ',');
+
+ seq_printf(s, "%u", idx->bit);
+
+ if (idx->pos == idx->weight - 1)
+ seq_putc(s, '\n');
+
+ return 0;
+}
+
+static const struct seq_operations clusterip_seq_ops = {
+ .start = clusterip_seq_start,
+ .next = clusterip_seq_next,
+ .stop = clusterip_seq_stop,
+ .show = clusterip_seq_show,
+};
+
+static int clusterip_proc_open(struct inode *inode, struct file *file)
+{
+ int ret = seq_open(file, &clusterip_seq_ops);
+
+ if (!ret) {
+ struct seq_file *sf = file->private_data;
+ struct clusterip_config *c = PDE_DATA(inode);
+
+ sf->private = c;
+
+ clusterip_config_get(c);
+ }
+
+ return ret;
+}
+
+static int clusterip_proc_release(struct inode *inode, struct file *file)
+{
+ struct clusterip_config *c = PDE_DATA(inode);
+ int ret;
+
+ ret = seq_release(inode, file);
+
+ if (!ret)
+ clusterip_config_put(c);
+
+ return ret;
+}
+
+static ssize_t clusterip_proc_write(struct file *file, const char __user *input,
+ size_t size, loff_t *ofs)
+{
+ struct clusterip_config *c = PDE_DATA(file_inode(file));
+#define PROC_WRITELEN 10
+ char buffer[PROC_WRITELEN+1];
+ unsigned long nodenum;
+ int rc;
+
+ if (size > PROC_WRITELEN)
+ return -EIO;
+ if (copy_from_user(buffer, input, size))
+ return -EFAULT;
+ buffer[size] = 0;
+
+ if (*buffer == '+') {
+ rc = kstrtoul(buffer+1, 10, &nodenum);
+ if (rc)
+ return rc;
+ if (clusterip_add_node(c, nodenum))
+ return -ENOMEM;
+ } else if (*buffer == '-') {
+ rc = kstrtoul(buffer+1, 10, &nodenum);
+ if (rc)
+ return rc;
+ if (clusterip_del_node(c, nodenum))
+ return -ENOENT;
+ } else
+ return -EIO;
+
+ return size;
+}
+
+static const struct file_operations clusterip_proc_fops = {
+ .owner = THIS_MODULE,
+ .open = clusterip_proc_open,
+ .read = seq_read,
+ .write = clusterip_proc_write,
+ .llseek = seq_lseek,
+ .release = clusterip_proc_release,
+};
+
+#endif /* CONFIG_PROC_FS */
+
+static int clusterip_net_init(struct net *net)
+{
+ struct clusterip_net *cn = net_generic(net, clusterip_net_id);
+
+ INIT_LIST_HEAD(&cn->configs);
+
+ spin_lock_init(&cn->lock);
+
+#ifdef CONFIG_PROC_FS
+ cn->procdir = proc_mkdir("ipt_CLUSTERIP", net->proc_net);
+ if (!cn->procdir) {
+ pr_err("Unable to proc dir entry\n");
+ return -ENOMEM;
+ }
+#endif /* CONFIG_PROC_FS */
+
+ return 0;
+}
+
+static void clusterip_net_exit(struct net *net)
+{
+#ifdef CONFIG_PROC_FS
+ struct clusterip_net *cn = net_generic(net, clusterip_net_id);
+ proc_remove(cn->procdir);
+#endif
+}
+
+static struct pernet_operations clusterip_net_ops = {
+ .init = clusterip_net_init,
+ .exit = clusterip_net_exit,
+ .id = &clusterip_net_id,
+ .size = sizeof(struct clusterip_net),
+};
+
+static int __init clusterip_tg_init(void)
+{
+ int ret;
+
+ ret = register_pernet_subsys(&clusterip_net_ops);
+ if (ret < 0)
+ return ret;
+
+ ret = xt_register_target(&clusterip_tg_reg);
+ if (ret < 0)
+ goto cleanup_subsys;
+
+ ret = nf_register_hook(&cip_arp_ops);
+ if (ret < 0)
+ goto cleanup_target;
+
+ pr_info("ClusterIP Version %s loaded successfully\n",
+ CLUSTERIP_VERSION);
+
+ return 0;
+
+cleanup_target:
+ xt_unregister_target(&clusterip_tg_reg);
+cleanup_subsys:
+ unregister_pernet_subsys(&clusterip_net_ops);
+ return ret;
+}
+
+static void __exit clusterip_tg_exit(void)
+{
+ pr_info("ClusterIP Version %s unloading\n", CLUSTERIP_VERSION);
+
+ nf_unregister_hook(&cip_arp_ops);
+ xt_unregister_target(&clusterip_tg_reg);
+ unregister_pernet_subsys(&clusterip_net_ops);
+
+ /* Wait for completion of call_rcu_bh()'s (clusterip_config_rcu_free) */
+ rcu_barrier_bh();
+}
+
+module_init(clusterip_tg_init);
+module_exit(clusterip_tg_exit);
diff --git a/net/ipv4/netfilter/ipt_ECN.c b/net/ipv4/netfilter/ipt_ECN.c
new file mode 100644
index 000000000..4bf3dc49a
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_ECN.c
@@ -0,0 +1,138 @@
+/* iptables module for the IPv4 and TCP ECN bits, Version 1.5
+ *
+ * (C) 2002 by Harald Welte <laforge@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/in.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <net/ip.h>
+#include <linux/tcp.h>
+#include <net/checksum.h>
+
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ipt_ECN.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
+MODULE_DESCRIPTION("Xtables: Explicit Congestion Notification (ECN) flag modification");
+
+/* set ECT codepoint from IP header.
+ * return false if there was an error. */
+static inline bool
+set_ect_ip(struct sk_buff *skb, const struct ipt_ECN_info *einfo)
+{
+ struct iphdr *iph = ip_hdr(skb);
+
+ if ((iph->tos & IPT_ECN_IP_MASK) != (einfo->ip_ect & IPT_ECN_IP_MASK)) {
+ __u8 oldtos;
+ if (!skb_make_writable(skb, sizeof(struct iphdr)))
+ return false;
+ iph = ip_hdr(skb);
+ oldtos = iph->tos;
+ iph->tos &= ~IPT_ECN_IP_MASK;
+ iph->tos |= (einfo->ip_ect & IPT_ECN_IP_MASK);
+ csum_replace2(&iph->check, htons(oldtos), htons(iph->tos));
+ }
+ return true;
+}
+
+/* Return false if there was an error. */
+static inline bool
+set_ect_tcp(struct sk_buff *skb, const struct ipt_ECN_info *einfo)
+{
+ struct tcphdr _tcph, *tcph;
+ __be16 oldval;
+
+ /* Not enough header? */
+ tcph = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_tcph), &_tcph);
+ if (!tcph)
+ return false;
+
+ if ((!(einfo->operation & IPT_ECN_OP_SET_ECE) ||
+ tcph->ece == einfo->proto.tcp.ece) &&
+ (!(einfo->operation & IPT_ECN_OP_SET_CWR) ||
+ tcph->cwr == einfo->proto.tcp.cwr))
+ return true;
+
+ if (!skb_make_writable(skb, ip_hdrlen(skb) + sizeof(*tcph)))
+ return false;
+ tcph = (void *)ip_hdr(skb) + ip_hdrlen(skb);
+
+ oldval = ((__be16 *)tcph)[6];
+ if (einfo->operation & IPT_ECN_OP_SET_ECE)
+ tcph->ece = einfo->proto.tcp.ece;
+ if (einfo->operation & IPT_ECN_OP_SET_CWR)
+ tcph->cwr = einfo->proto.tcp.cwr;
+
+ inet_proto_csum_replace2(&tcph->check, skb,
+ oldval, ((__be16 *)tcph)[6], 0);
+ return true;
+}
+
+static unsigned int
+ecn_tg(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ const struct ipt_ECN_info *einfo = par->targinfo;
+
+ if (einfo->operation & IPT_ECN_OP_SET_IP)
+ if (!set_ect_ip(skb, einfo))
+ return NF_DROP;
+
+ if (einfo->operation & (IPT_ECN_OP_SET_ECE | IPT_ECN_OP_SET_CWR) &&
+ ip_hdr(skb)->protocol == IPPROTO_TCP)
+ if (!set_ect_tcp(skb, einfo))
+ return NF_DROP;
+
+ return XT_CONTINUE;
+}
+
+static int ecn_tg_check(const struct xt_tgchk_param *par)
+{
+ const struct ipt_ECN_info *einfo = par->targinfo;
+ const struct ipt_entry *e = par->entryinfo;
+
+ if (einfo->operation & IPT_ECN_OP_MASK) {
+ pr_info("unsupported ECN operation %x\n", einfo->operation);
+ return -EINVAL;
+ }
+ if (einfo->ip_ect & ~IPT_ECN_IP_MASK) {
+ pr_info("new ECT codepoint %x out of mask\n", einfo->ip_ect);
+ return -EINVAL;
+ }
+ if ((einfo->operation & (IPT_ECN_OP_SET_ECE|IPT_ECN_OP_SET_CWR)) &&
+ (e->ip.proto != IPPROTO_TCP || (e->ip.invflags & XT_INV_PROTO))) {
+ pr_info("cannot use TCP operations on a non-tcp rule\n");
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static struct xt_target ecn_tg_reg __read_mostly = {
+ .name = "ECN",
+ .family = NFPROTO_IPV4,
+ .target = ecn_tg,
+ .targetsize = sizeof(struct ipt_ECN_info),
+ .table = "mangle",
+ .checkentry = ecn_tg_check,
+ .me = THIS_MODULE,
+};
+
+static int __init ecn_tg_init(void)
+{
+ return xt_register_target(&ecn_tg_reg);
+}
+
+static void __exit ecn_tg_exit(void)
+{
+ xt_unregister_target(&ecn_tg_reg);
+}
+
+module_init(ecn_tg_init);
+module_exit(ecn_tg_exit);
diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c
new file mode 100644
index 000000000..da7f02a0b
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_MASQUERADE.c
@@ -0,0 +1,91 @@
+/* Masquerade. Simple mapping which alters range to a local IP address
+ (depending on route). */
+
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/types.h>
+#include <linux/inetdevice.h>
+#include <linux/ip.h>
+#include <linux/timer.h>
+#include <linux/module.h>
+#include <linux/netfilter.h>
+#include <net/protocol.h>
+#include <net/ip.h>
+#include <net/checksum.h>
+#include <net/route.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter/x_tables.h>
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/ipv4/nf_nat_masquerade.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
+MODULE_DESCRIPTION("Xtables: automatic-address SNAT");
+
+/* FIXME: Multiple targets. --RR */
+static int masquerade_tg_check(const struct xt_tgchk_param *par)
+{
+ const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo;
+
+ if (mr->range[0].flags & NF_NAT_RANGE_MAP_IPS) {
+ pr_debug("bad MAP_IPS.\n");
+ return -EINVAL;
+ }
+ if (mr->rangesize != 1) {
+ pr_debug("bad rangesize %u\n", mr->rangesize);
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static unsigned int
+masquerade_tg(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ struct nf_nat_range range;
+ const struct nf_nat_ipv4_multi_range_compat *mr;
+
+ mr = par->targinfo;
+ range.flags = mr->range[0].flags;
+ range.min_proto = mr->range[0].min;
+ range.max_proto = mr->range[0].max;
+
+ return nf_nat_masquerade_ipv4(skb, par->hooknum, &range, par->out);
+}
+
+static struct xt_target masquerade_tg_reg __read_mostly = {
+ .name = "MASQUERADE",
+ .family = NFPROTO_IPV4,
+ .target = masquerade_tg,
+ .targetsize = sizeof(struct nf_nat_ipv4_multi_range_compat),
+ .table = "nat",
+ .hooks = 1 << NF_INET_POST_ROUTING,
+ .checkentry = masquerade_tg_check,
+ .me = THIS_MODULE,
+};
+
+static int __init masquerade_tg_init(void)
+{
+ int ret;
+
+ ret = xt_register_target(&masquerade_tg_reg);
+
+ if (ret == 0)
+ nf_nat_masquerade_ipv4_register_notifier();
+
+ return ret;
+}
+
+static void __exit masquerade_tg_exit(void)
+{
+ xt_unregister_target(&masquerade_tg_reg);
+ nf_nat_masquerade_ipv4_unregister_notifier();
+}
+
+module_init(masquerade_tg_init);
+module_exit(masquerade_tg_exit);
diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c
new file mode 100644
index 000000000..87907d4bd
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_REJECT.c
@@ -0,0 +1,113 @@
+/*
+ * This is a module which is used for rejecting packets.
+ */
+
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <linux/ip.h>
+#include <linux/udp.h>
+#include <linux/icmp.h>
+#include <net/icmp.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ipt_REJECT.h>
+#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
+#include <linux/netfilter_bridge.h>
+#endif
+
+#include <net/netfilter/ipv4/nf_reject.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
+MODULE_DESCRIPTION("Xtables: packet \"rejection\" target for IPv4");
+
+static unsigned int
+reject_tg(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ const struct ipt_reject_info *reject = par->targinfo;
+ int hook = par->hooknum;
+
+ switch (reject->with) {
+ case IPT_ICMP_NET_UNREACHABLE:
+ nf_send_unreach(skb, ICMP_NET_UNREACH, hook);
+ break;
+ case IPT_ICMP_HOST_UNREACHABLE:
+ nf_send_unreach(skb, ICMP_HOST_UNREACH, hook);
+ break;
+ case IPT_ICMP_PROT_UNREACHABLE:
+ nf_send_unreach(skb, ICMP_PROT_UNREACH, hook);
+ break;
+ case IPT_ICMP_PORT_UNREACHABLE:
+ nf_send_unreach(skb, ICMP_PORT_UNREACH, hook);
+ break;
+ case IPT_ICMP_NET_PROHIBITED:
+ nf_send_unreach(skb, ICMP_NET_ANO, hook);
+ break;
+ case IPT_ICMP_HOST_PROHIBITED:
+ nf_send_unreach(skb, ICMP_HOST_ANO, hook);
+ break;
+ case IPT_ICMP_ADMIN_PROHIBITED:
+ nf_send_unreach(skb, ICMP_PKT_FILTERED, hook);
+ break;
+ case IPT_TCP_RESET:
+ nf_send_reset(skb, hook);
+ case IPT_ICMP_ECHOREPLY:
+ /* Doesn't happen. */
+ break;
+ }
+
+ return NF_DROP;
+}
+
+static int reject_tg_check(const struct xt_tgchk_param *par)
+{
+ const struct ipt_reject_info *rejinfo = par->targinfo;
+ const struct ipt_entry *e = par->entryinfo;
+
+ if (rejinfo->with == IPT_ICMP_ECHOREPLY) {
+ pr_info("ECHOREPLY no longer supported.\n");
+ return -EINVAL;
+ } else if (rejinfo->with == IPT_TCP_RESET) {
+ /* Must specify that it's a TCP packet */
+ if (e->ip.proto != IPPROTO_TCP ||
+ (e->ip.invflags & XT_INV_PROTO)) {
+ pr_info("TCP_RESET invalid for non-tcp\n");
+ return -EINVAL;
+ }
+ }
+ return 0;
+}
+
+static struct xt_target reject_tg_reg __read_mostly = {
+ .name = "REJECT",
+ .family = NFPROTO_IPV4,
+ .target = reject_tg,
+ .targetsize = sizeof(struct ipt_reject_info),
+ .table = "filter",
+ .hooks = (1 << NF_INET_LOCAL_IN) | (1 << NF_INET_FORWARD) |
+ (1 << NF_INET_LOCAL_OUT),
+ .checkentry = reject_tg_check,
+ .me = THIS_MODULE,
+};
+
+static int __init reject_tg_init(void)
+{
+ return xt_register_target(&reject_tg_reg);
+}
+
+static void __exit reject_tg_exit(void)
+{
+ xt_unregister_target(&reject_tg_reg);
+}
+
+module_init(reject_tg_init);
+module_exit(reject_tg_exit);
diff --git a/net/ipv4/netfilter/ipt_SYNPROXY.c b/net/ipv4/netfilter/ipt_SYNPROXY.c
new file mode 100644
index 000000000..e9e677930
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_SYNPROXY.c
@@ -0,0 +1,480 @@
+/*
+ * Copyright (c) 2013 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <net/tcp.h>
+
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_SYNPROXY.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_seqadj.h>
+#include <net/netfilter/nf_conntrack_synproxy.h>
+
+static struct iphdr *
+synproxy_build_ip(struct sk_buff *skb, u32 saddr, u32 daddr)
+{
+ struct iphdr *iph;
+
+ skb_reset_network_header(skb);
+ iph = (struct iphdr *)skb_put(skb, sizeof(*iph));
+ iph->version = 4;
+ iph->ihl = sizeof(*iph) / 4;
+ iph->tos = 0;
+ iph->id = 0;
+ iph->frag_off = htons(IP_DF);
+ iph->ttl = sysctl_ip_default_ttl;
+ iph->protocol = IPPROTO_TCP;
+ iph->check = 0;
+ iph->saddr = saddr;
+ iph->daddr = daddr;
+
+ return iph;
+}
+
+static void
+synproxy_send_tcp(const struct sk_buff *skb, struct sk_buff *nskb,
+ struct nf_conntrack *nfct, enum ip_conntrack_info ctinfo,
+ struct iphdr *niph, struct tcphdr *nth,
+ unsigned int tcp_hdr_size)
+{
+ nth->check = ~tcp_v4_check(tcp_hdr_size, niph->saddr, niph->daddr, 0);
+ nskb->ip_summed = CHECKSUM_PARTIAL;
+ nskb->csum_start = (unsigned char *)nth - nskb->head;
+ nskb->csum_offset = offsetof(struct tcphdr, check);
+
+ skb_dst_set_noref(nskb, skb_dst(skb));
+ nskb->protocol = htons(ETH_P_IP);
+ if (ip_route_me_harder(nskb, RTN_UNSPEC))
+ goto free_nskb;
+
+ if (nfct) {
+ nskb->nfct = nfct;
+ nskb->nfctinfo = ctinfo;
+ nf_conntrack_get(nfct);
+ }
+
+ ip_local_out(nskb);
+ return;
+
+free_nskb:
+ kfree_skb(nskb);
+}
+
+static void
+synproxy_send_client_synack(const struct sk_buff *skb, const struct tcphdr *th,
+ const struct synproxy_options *opts)
+{
+ struct sk_buff *nskb;
+ struct iphdr *iph, *niph;
+ struct tcphdr *nth;
+ unsigned int tcp_hdr_size;
+ u16 mss = opts->mss;
+
+ iph = ip_hdr(skb);
+
+ tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts);
+ nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER,
+ GFP_ATOMIC);
+ if (nskb == NULL)
+ return;
+ skb_reserve(nskb, MAX_TCP_HEADER);
+
+ niph = synproxy_build_ip(nskb, iph->daddr, iph->saddr);
+
+ skb_reset_transport_header(nskb);
+ nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size);
+ nth->source = th->dest;
+ nth->dest = th->source;
+ nth->seq = htonl(__cookie_v4_init_sequence(iph, th, &mss));
+ nth->ack_seq = htonl(ntohl(th->seq) + 1);
+ tcp_flag_word(nth) = TCP_FLAG_SYN | TCP_FLAG_ACK;
+ if (opts->options & XT_SYNPROXY_OPT_ECN)
+ tcp_flag_word(nth) |= TCP_FLAG_ECE;
+ nth->doff = tcp_hdr_size / 4;
+ nth->window = 0;
+ nth->check = 0;
+ nth->urg_ptr = 0;
+
+ synproxy_build_options(nth, opts);
+
+ synproxy_send_tcp(skb, nskb, skb->nfct, IP_CT_ESTABLISHED_REPLY,
+ niph, nth, tcp_hdr_size);
+}
+
+static void
+synproxy_send_server_syn(const struct synproxy_net *snet,
+ const struct sk_buff *skb, const struct tcphdr *th,
+ const struct synproxy_options *opts, u32 recv_seq)
+{
+ struct sk_buff *nskb;
+ struct iphdr *iph, *niph;
+ struct tcphdr *nth;
+ unsigned int tcp_hdr_size;
+
+ iph = ip_hdr(skb);
+
+ tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts);
+ nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER,
+ GFP_ATOMIC);
+ if (nskb == NULL)
+ return;
+ skb_reserve(nskb, MAX_TCP_HEADER);
+
+ niph = synproxy_build_ip(nskb, iph->saddr, iph->daddr);
+
+ skb_reset_transport_header(nskb);
+ nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size);
+ nth->source = th->source;
+ nth->dest = th->dest;
+ nth->seq = htonl(recv_seq - 1);
+ /* ack_seq is used to relay our ISN to the synproxy hook to initialize
+ * sequence number translation once a connection tracking entry exists.
+ */
+ nth->ack_seq = htonl(ntohl(th->ack_seq) - 1);
+ tcp_flag_word(nth) = TCP_FLAG_SYN;
+ if (opts->options & XT_SYNPROXY_OPT_ECN)
+ tcp_flag_word(nth) |= TCP_FLAG_ECE | TCP_FLAG_CWR;
+ nth->doff = tcp_hdr_size / 4;
+ nth->window = th->window;
+ nth->check = 0;
+ nth->urg_ptr = 0;
+
+ synproxy_build_options(nth, opts);
+
+ synproxy_send_tcp(skb, nskb, &snet->tmpl->ct_general, IP_CT_NEW,
+ niph, nth, tcp_hdr_size);
+}
+
+static void
+synproxy_send_server_ack(const struct synproxy_net *snet,
+ const struct ip_ct_tcp *state,
+ const struct sk_buff *skb, const struct tcphdr *th,
+ const struct synproxy_options *opts)
+{
+ struct sk_buff *nskb;
+ struct iphdr *iph, *niph;
+ struct tcphdr *nth;
+ unsigned int tcp_hdr_size;
+
+ iph = ip_hdr(skb);
+
+ tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts);
+ nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER,
+ GFP_ATOMIC);
+ if (nskb == NULL)
+ return;
+ skb_reserve(nskb, MAX_TCP_HEADER);
+
+ niph = synproxy_build_ip(nskb, iph->daddr, iph->saddr);
+
+ skb_reset_transport_header(nskb);
+ nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size);
+ nth->source = th->dest;
+ nth->dest = th->source;
+ nth->seq = htonl(ntohl(th->ack_seq));
+ nth->ack_seq = htonl(ntohl(th->seq) + 1);
+ tcp_flag_word(nth) = TCP_FLAG_ACK;
+ nth->doff = tcp_hdr_size / 4;
+ nth->window = htons(state->seen[IP_CT_DIR_ORIGINAL].td_maxwin);
+ nth->check = 0;
+ nth->urg_ptr = 0;
+
+ synproxy_build_options(nth, opts);
+
+ synproxy_send_tcp(skb, nskb, NULL, 0, niph, nth, tcp_hdr_size);
+}
+
+static void
+synproxy_send_client_ack(const struct synproxy_net *snet,
+ const struct sk_buff *skb, const struct tcphdr *th,
+ const struct synproxy_options *opts)
+{
+ struct sk_buff *nskb;
+ struct iphdr *iph, *niph;
+ struct tcphdr *nth;
+ unsigned int tcp_hdr_size;
+
+ iph = ip_hdr(skb);
+
+ tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts);
+ nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER,
+ GFP_ATOMIC);
+ if (nskb == NULL)
+ return;
+ skb_reserve(nskb, MAX_TCP_HEADER);
+
+ niph = synproxy_build_ip(nskb, iph->saddr, iph->daddr);
+
+ skb_reset_transport_header(nskb);
+ nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size);
+ nth->source = th->source;
+ nth->dest = th->dest;
+ nth->seq = htonl(ntohl(th->seq) + 1);
+ nth->ack_seq = th->ack_seq;
+ tcp_flag_word(nth) = TCP_FLAG_ACK;
+ nth->doff = tcp_hdr_size / 4;
+ nth->window = ntohs(htons(th->window) >> opts->wscale);
+ nth->check = 0;
+ nth->urg_ptr = 0;
+
+ synproxy_build_options(nth, opts);
+
+ synproxy_send_tcp(skb, nskb, NULL, 0, niph, nth, tcp_hdr_size);
+}
+
+static bool
+synproxy_recv_client_ack(const struct synproxy_net *snet,
+ const struct sk_buff *skb, const struct tcphdr *th,
+ struct synproxy_options *opts, u32 recv_seq)
+{
+ int mss;
+
+ mss = __cookie_v4_check(ip_hdr(skb), th, ntohl(th->ack_seq) - 1);
+ if (mss == 0) {
+ this_cpu_inc(snet->stats->cookie_invalid);
+ return false;
+ }
+
+ this_cpu_inc(snet->stats->cookie_valid);
+ opts->mss = mss;
+ opts->options |= XT_SYNPROXY_OPT_MSS;
+
+ if (opts->options & XT_SYNPROXY_OPT_TIMESTAMP)
+ synproxy_check_timestamp_cookie(opts);
+
+ synproxy_send_server_syn(snet, skb, th, opts, recv_seq);
+ return true;
+}
+
+static unsigned int
+synproxy_tg4(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ const struct xt_synproxy_info *info = par->targinfo;
+ struct synproxy_net *snet = synproxy_pernet(dev_net(par->in));
+ struct synproxy_options opts = {};
+ struct tcphdr *th, _th;
+
+ if (nf_ip_checksum(skb, par->hooknum, par->thoff, IPPROTO_TCP))
+ return NF_DROP;
+
+ th = skb_header_pointer(skb, par->thoff, sizeof(_th), &_th);
+ if (th == NULL)
+ return NF_DROP;
+
+ if (!synproxy_parse_options(skb, par->thoff, th, &opts))
+ return NF_DROP;
+
+ if (th->syn && !(th->ack || th->fin || th->rst)) {
+ /* Initial SYN from client */
+ this_cpu_inc(snet->stats->syn_received);
+
+ if (th->ece && th->cwr)
+ opts.options |= XT_SYNPROXY_OPT_ECN;
+
+ opts.options &= info->options;
+ if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP)
+ synproxy_init_timestamp_cookie(info, &opts);
+ else
+ opts.options &= ~(XT_SYNPROXY_OPT_WSCALE |
+ XT_SYNPROXY_OPT_SACK_PERM |
+ XT_SYNPROXY_OPT_ECN);
+
+ synproxy_send_client_synack(skb, th, &opts);
+ return NF_DROP;
+
+ } else if (th->ack && !(th->fin || th->rst || th->syn)) {
+ /* ACK from client */
+ synproxy_recv_client_ack(snet, skb, th, &opts, ntohl(th->seq));
+ return NF_DROP;
+ }
+
+ return XT_CONTINUE;
+}
+
+static unsigned int ipv4_synproxy_hook(const struct nf_hook_ops *ops,
+ struct sk_buff *skb,
+ const struct nf_hook_state *nhs)
+{
+ struct synproxy_net *snet = synproxy_pernet(dev_net(nhs->in ? : nhs->out));
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct;
+ struct nf_conn_synproxy *synproxy;
+ struct synproxy_options opts = {};
+ const struct ip_ct_tcp *state;
+ struct tcphdr *th, _th;
+ unsigned int thoff;
+
+ ct = nf_ct_get(skb, &ctinfo);
+ if (ct == NULL)
+ return NF_ACCEPT;
+
+ synproxy = nfct_synproxy(ct);
+ if (synproxy == NULL)
+ return NF_ACCEPT;
+
+ if (nf_is_loopback_packet(skb))
+ return NF_ACCEPT;
+
+ thoff = ip_hdrlen(skb);
+ th = skb_header_pointer(skb, thoff, sizeof(_th), &_th);
+ if (th == NULL)
+ return NF_DROP;
+
+ state = &ct->proto.tcp;
+ switch (state->state) {
+ case TCP_CONNTRACK_CLOSE:
+ if (th->rst && !test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
+ nf_ct_seqadj_init(ct, ctinfo, synproxy->isn -
+ ntohl(th->seq) + 1);
+ break;
+ }
+
+ if (!th->syn || th->ack ||
+ CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
+ break;
+
+ /* Reopened connection - reset the sequence number and timestamp
+ * adjustments, they will get initialized once the connection is
+ * reestablished.
+ */
+ nf_ct_seqadj_init(ct, ctinfo, 0);
+ synproxy->tsoff = 0;
+ this_cpu_inc(snet->stats->conn_reopened);
+
+ /* fall through */
+ case TCP_CONNTRACK_SYN_SENT:
+ if (!synproxy_parse_options(skb, thoff, th, &opts))
+ return NF_DROP;
+
+ if (!th->syn && th->ack &&
+ CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) {
+ /* Keep-Alives are sent with SEG.SEQ = SND.NXT-1,
+ * therefore we need to add 1 to make the SYN sequence
+ * number match the one of first SYN.
+ */
+ if (synproxy_recv_client_ack(snet, skb, th, &opts,
+ ntohl(th->seq) + 1))
+ this_cpu_inc(snet->stats->cookie_retrans);
+
+ return NF_DROP;
+ }
+
+ synproxy->isn = ntohl(th->ack_seq);
+ if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP)
+ synproxy->its = opts.tsecr;
+ break;
+ case TCP_CONNTRACK_SYN_RECV:
+ if (!th->syn || !th->ack)
+ break;
+
+ if (!synproxy_parse_options(skb, thoff, th, &opts))
+ return NF_DROP;
+
+ if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP)
+ synproxy->tsoff = opts.tsval - synproxy->its;
+
+ opts.options &= ~(XT_SYNPROXY_OPT_MSS |
+ XT_SYNPROXY_OPT_WSCALE |
+ XT_SYNPROXY_OPT_SACK_PERM);
+
+ swap(opts.tsval, opts.tsecr);
+ synproxy_send_server_ack(snet, state, skb, th, &opts);
+
+ nf_ct_seqadj_init(ct, ctinfo, synproxy->isn - ntohl(th->seq));
+
+ swap(opts.tsval, opts.tsecr);
+ synproxy_send_client_ack(snet, skb, th, &opts);
+
+ consume_skb(skb);
+ return NF_STOLEN;
+ default:
+ break;
+ }
+
+ synproxy_tstamp_adjust(skb, thoff, th, ct, ctinfo, synproxy);
+ return NF_ACCEPT;
+}
+
+static int synproxy_tg4_check(const struct xt_tgchk_param *par)
+{
+ const struct ipt_entry *e = par->entryinfo;
+
+ if (e->ip.proto != IPPROTO_TCP ||
+ e->ip.invflags & XT_INV_PROTO)
+ return -EINVAL;
+
+ return nf_ct_l3proto_try_module_get(par->family);
+}
+
+static void synproxy_tg4_destroy(const struct xt_tgdtor_param *par)
+{
+ nf_ct_l3proto_module_put(par->family);
+}
+
+static struct xt_target synproxy_tg4_reg __read_mostly = {
+ .name = "SYNPROXY",
+ .family = NFPROTO_IPV4,
+ .hooks = (1 << NF_INET_LOCAL_IN) | (1 << NF_INET_FORWARD),
+ .target = synproxy_tg4,
+ .targetsize = sizeof(struct xt_synproxy_info),
+ .checkentry = synproxy_tg4_check,
+ .destroy = synproxy_tg4_destroy,
+ .me = THIS_MODULE,
+};
+
+static struct nf_hook_ops ipv4_synproxy_ops[] __read_mostly = {
+ {
+ .hook = ipv4_synproxy_hook,
+ .owner = THIS_MODULE,
+ .pf = NFPROTO_IPV4,
+ .hooknum = NF_INET_LOCAL_IN,
+ .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1,
+ },
+ {
+ .hook = ipv4_synproxy_hook,
+ .owner = THIS_MODULE,
+ .pf = NFPROTO_IPV4,
+ .hooknum = NF_INET_POST_ROUTING,
+ .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1,
+ },
+};
+
+static int __init synproxy_tg4_init(void)
+{
+ int err;
+
+ err = nf_register_hooks(ipv4_synproxy_ops,
+ ARRAY_SIZE(ipv4_synproxy_ops));
+ if (err < 0)
+ goto err1;
+
+ err = xt_register_target(&synproxy_tg4_reg);
+ if (err < 0)
+ goto err2;
+
+ return 0;
+
+err2:
+ nf_unregister_hooks(ipv4_synproxy_ops, ARRAY_SIZE(ipv4_synproxy_ops));
+err1:
+ return err;
+}
+
+static void __exit synproxy_tg4_exit(void)
+{
+ xt_unregister_target(&synproxy_tg4_reg);
+ nf_unregister_hooks(ipv4_synproxy_ops, ARRAY_SIZE(ipv4_synproxy_ops));
+}
+
+module_init(synproxy_tg4_init);
+module_exit(synproxy_tg4_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
diff --git a/net/ipv4/netfilter/ipt_ah.c b/net/ipv4/netfilter/ipt_ah.c
new file mode 100644
index 000000000..14a2aa8b8
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_ah.c
@@ -0,0 +1,91 @@
+/* Kernel module to match AH parameters. */
+/* (C) 1999-2000 Yon Uriarte <yon@astaro.de>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/in.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+
+#include <linux/netfilter_ipv4/ipt_ah.h>
+#include <linux/netfilter/x_tables.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Yon Uriarte <yon@astaro.de>");
+MODULE_DESCRIPTION("Xtables: IPv4 IPsec-AH SPI match");
+
+/* Returns 1 if the spi is matched by the range, 0 otherwise */
+static inline bool
+spi_match(u_int32_t min, u_int32_t max, u_int32_t spi, bool invert)
+{
+ bool r;
+ pr_debug("spi_match:%c 0x%x <= 0x%x <= 0x%x\n",
+ invert ? '!' : ' ', min, spi, max);
+ r=(spi >= min && spi <= max) ^ invert;
+ pr_debug(" result %s\n", r ? "PASS" : "FAILED");
+ return r;
+}
+
+static bool ah_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ struct ip_auth_hdr _ahdr;
+ const struct ip_auth_hdr *ah;
+ const struct ipt_ah *ahinfo = par->matchinfo;
+
+ /* Must not be a fragment. */
+ if (par->fragoff != 0)
+ return false;
+
+ ah = skb_header_pointer(skb, par->thoff, sizeof(_ahdr), &_ahdr);
+ if (ah == NULL) {
+ /* We've been asked to examine this packet, and we
+ * can't. Hence, no choice but to drop.
+ */
+ pr_debug("Dropping evil AH tinygram.\n");
+ par->hotdrop = true;
+ return 0;
+ }
+
+ return spi_match(ahinfo->spis[0], ahinfo->spis[1],
+ ntohl(ah->spi),
+ !!(ahinfo->invflags & IPT_AH_INV_SPI));
+}
+
+static int ah_mt_check(const struct xt_mtchk_param *par)
+{
+ const struct ipt_ah *ahinfo = par->matchinfo;
+
+ /* Must specify no unknown invflags */
+ if (ahinfo->invflags & ~IPT_AH_INV_MASK) {
+ pr_debug("unknown flags %X\n", ahinfo->invflags);
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static struct xt_match ah_mt_reg __read_mostly = {
+ .name = "ah",
+ .family = NFPROTO_IPV4,
+ .match = ah_mt,
+ .matchsize = sizeof(struct ipt_ah),
+ .proto = IPPROTO_AH,
+ .checkentry = ah_mt_check,
+ .me = THIS_MODULE,
+};
+
+static int __init ah_mt_init(void)
+{
+ return xt_register_match(&ah_mt_reg);
+}
+
+static void __exit ah_mt_exit(void)
+{
+ xt_unregister_match(&ah_mt_reg);
+}
+
+module_init(ah_mt_init);
+module_exit(ah_mt_exit);
diff --git a/net/ipv4/netfilter/ipt_rpfilter.c b/net/ipv4/netfilter/ipt_rpfilter.c
new file mode 100644
index 000000000..4bfaedf9b
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_rpfilter.c
@@ -0,0 +1,144 @@
+/*
+ * Copyright (c) 2011 Florian Westphal <fw@strlen.de>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * based on fib_frontend.c; Author: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/ip.h>
+#include <net/ip.h>
+#include <net/ip_fib.h>
+#include <net/route.h>
+
+#include <linux/netfilter/xt_rpfilter.h>
+#include <linux/netfilter/x_tables.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Florian Westphal <fw@strlen.de>");
+MODULE_DESCRIPTION("iptables: ipv4 reverse path filter match");
+
+/* don't try to find route from mcast/bcast/zeronet */
+static __be32 rpfilter_get_saddr(__be32 addr)
+{
+ if (ipv4_is_multicast(addr) || ipv4_is_lbcast(addr) ||
+ ipv4_is_zeronet(addr))
+ return 0;
+ return addr;
+}
+
+static bool rpfilter_lookup_reverse(struct flowi4 *fl4,
+ const struct net_device *dev, u8 flags)
+{
+ struct fib_result res;
+ bool dev_match;
+ struct net *net = dev_net(dev);
+ int ret __maybe_unused;
+
+ if (fib_lookup(net, fl4, &res))
+ return false;
+
+ if (res.type != RTN_UNICAST) {
+ if (res.type != RTN_LOCAL || !(flags & XT_RPFILTER_ACCEPT_LOCAL))
+ return false;
+ }
+ dev_match = false;
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+ for (ret = 0; ret < res.fi->fib_nhs; ret++) {
+ struct fib_nh *nh = &res.fi->fib_nh[ret];
+
+ if (nh->nh_dev == dev) {
+ dev_match = true;
+ break;
+ }
+ }
+#else
+ if (FIB_RES_DEV(res) == dev)
+ dev_match = true;
+#endif
+ if (dev_match || flags & XT_RPFILTER_LOOSE)
+ return FIB_RES_NH(res).nh_scope <= RT_SCOPE_HOST;
+ return dev_match;
+}
+
+static bool rpfilter_is_local(const struct sk_buff *skb)
+{
+ const struct rtable *rt = skb_rtable(skb);
+ return rt && (rt->rt_flags & RTCF_LOCAL);
+}
+
+static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ const struct xt_rpfilter_info *info;
+ const struct iphdr *iph;
+ struct flowi4 flow;
+ bool invert;
+
+ info = par->matchinfo;
+ invert = info->flags & XT_RPFILTER_INVERT;
+
+ if (rpfilter_is_local(skb))
+ return true ^ invert;
+
+ iph = ip_hdr(skb);
+ if (ipv4_is_multicast(iph->daddr)) {
+ if (ipv4_is_zeronet(iph->saddr))
+ return ipv4_is_local_multicast(iph->daddr) ^ invert;
+ }
+ flow.flowi4_iif = LOOPBACK_IFINDEX;
+ flow.daddr = iph->saddr;
+ flow.saddr = rpfilter_get_saddr(iph->daddr);
+ flow.flowi4_oif = 0;
+ flow.flowi4_mark = info->flags & XT_RPFILTER_VALID_MARK ? skb->mark : 0;
+ flow.flowi4_tos = RT_TOS(iph->tos);
+ flow.flowi4_scope = RT_SCOPE_UNIVERSE;
+
+ return rpfilter_lookup_reverse(&flow, par->in, info->flags) ^ invert;
+}
+
+static int rpfilter_check(const struct xt_mtchk_param *par)
+{
+ const struct xt_rpfilter_info *info = par->matchinfo;
+ unsigned int options = ~XT_RPFILTER_OPTION_MASK;
+ if (info->flags & options) {
+ pr_info("unknown options encountered");
+ return -EINVAL;
+ }
+
+ if (strcmp(par->table, "mangle") != 0 &&
+ strcmp(par->table, "raw") != 0) {
+ pr_info("match only valid in the \'raw\' "
+ "or \'mangle\' tables, not \'%s\'.\n", par->table);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static struct xt_match rpfilter_mt_reg __read_mostly = {
+ .name = "rpfilter",
+ .family = NFPROTO_IPV4,
+ .checkentry = rpfilter_check,
+ .match = rpfilter_mt,
+ .matchsize = sizeof(struct xt_rpfilter_info),
+ .hooks = (1 << NF_INET_PRE_ROUTING),
+ .me = THIS_MODULE
+};
+
+static int __init rpfilter_mt_init(void)
+{
+ return xt_register_match(&rpfilter_mt_reg);
+}
+
+static void __exit rpfilter_mt_exit(void)
+{
+ xt_unregister_match(&rpfilter_mt_reg);
+}
+
+module_init(rpfilter_mt_init);
+module_exit(rpfilter_mt_exit);
diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c
new file mode 100644
index 000000000..a0f3beca5
--- /dev/null
+++ b/net/ipv4/netfilter/iptable_filter.c
@@ -0,0 +1,109 @@
+/*
+ * This is the 1999 rewrite of IP Firewalling, aiming for kernel 2.3.x.
+ *
+ * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling
+ * Copyright (C) 2000-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/slab.h>
+#include <net/ip.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
+MODULE_DESCRIPTION("iptables filter table");
+
+#define FILTER_VALID_HOOKS ((1 << NF_INET_LOCAL_IN) | \
+ (1 << NF_INET_FORWARD) | \
+ (1 << NF_INET_LOCAL_OUT))
+
+static const struct xt_table packet_filter = {
+ .name = "filter",
+ .valid_hooks = FILTER_VALID_HOOKS,
+ .me = THIS_MODULE,
+ .af = NFPROTO_IPV4,
+ .priority = NF_IP_PRI_FILTER,
+};
+
+static unsigned int
+iptable_filter_hook(const struct nf_hook_ops *ops, struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ const struct net *net;
+
+ if (ops->hooknum == NF_INET_LOCAL_OUT &&
+ (skb->len < sizeof(struct iphdr) ||
+ ip_hdrlen(skb) < sizeof(struct iphdr)))
+ /* root is playing with raw sockets. */
+ return NF_ACCEPT;
+
+ net = dev_net(state->in ? state->in : state->out);
+ return ipt_do_table(skb, ops->hooknum, state, net->ipv4.iptable_filter);
+}
+
+static struct nf_hook_ops *filter_ops __read_mostly;
+
+/* Default to forward because I got too much mail already. */
+static bool forward = true;
+module_param(forward, bool, 0000);
+
+static int __net_init iptable_filter_net_init(struct net *net)
+{
+ struct ipt_replace *repl;
+
+ repl = ipt_alloc_initial_table(&packet_filter);
+ if (repl == NULL)
+ return -ENOMEM;
+ /* Entry 1 is the FORWARD hook */
+ ((struct ipt_standard *)repl->entries)[1].target.verdict =
+ forward ? -NF_ACCEPT - 1 : -NF_DROP - 1;
+
+ net->ipv4.iptable_filter =
+ ipt_register_table(net, &packet_filter, repl);
+ kfree(repl);
+ return PTR_ERR_OR_ZERO(net->ipv4.iptable_filter);
+}
+
+static void __net_exit iptable_filter_net_exit(struct net *net)
+{
+ ipt_unregister_table(net, net->ipv4.iptable_filter);
+}
+
+static struct pernet_operations iptable_filter_net_ops = {
+ .init = iptable_filter_net_init,
+ .exit = iptable_filter_net_exit,
+};
+
+static int __init iptable_filter_init(void)
+{
+ int ret;
+
+ ret = register_pernet_subsys(&iptable_filter_net_ops);
+ if (ret < 0)
+ return ret;
+
+ /* Register hooks */
+ filter_ops = xt_hook_link(&packet_filter, iptable_filter_hook);
+ if (IS_ERR(filter_ops)) {
+ ret = PTR_ERR(filter_ops);
+ unregister_pernet_subsys(&iptable_filter_net_ops);
+ }
+
+ return ret;
+}
+
+static void __exit iptable_filter_fini(void)
+{
+ xt_hook_unlink(&packet_filter, filter_ops);
+ unregister_pernet_subsys(&iptable_filter_net_ops);
+}
+
+module_init(iptable_filter_init);
+module_exit(iptable_filter_fini);
diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c
new file mode 100644
index 000000000..62cbb8c5f
--- /dev/null
+++ b/net/ipv4/netfilter/iptable_mangle.c
@@ -0,0 +1,147 @@
+/*
+ * This is the 1999 rewrite of IP Firewalling, aiming for kernel 2.3.x.
+ *
+ * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling
+ * Copyright (C) 2000-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/module.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <net/sock.h>
+#include <net/route.h>
+#include <linux/ip.h>
+#include <net/ip.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
+MODULE_DESCRIPTION("iptables mangle table");
+
+#define MANGLE_VALID_HOOKS ((1 << NF_INET_PRE_ROUTING) | \
+ (1 << NF_INET_LOCAL_IN) | \
+ (1 << NF_INET_FORWARD) | \
+ (1 << NF_INET_LOCAL_OUT) | \
+ (1 << NF_INET_POST_ROUTING))
+
+static const struct xt_table packet_mangler = {
+ .name = "mangle",
+ .valid_hooks = MANGLE_VALID_HOOKS,
+ .me = THIS_MODULE,
+ .af = NFPROTO_IPV4,
+ .priority = NF_IP_PRI_MANGLE,
+};
+
+static unsigned int
+ipt_mangle_out(struct sk_buff *skb, const struct nf_hook_state *state)
+{
+ struct net_device *out = state->out;
+ unsigned int ret;
+ const struct iphdr *iph;
+ u_int8_t tos;
+ __be32 saddr, daddr;
+ u_int32_t mark;
+ int err;
+
+ /* root is playing with raw sockets. */
+ if (skb->len < sizeof(struct iphdr) ||
+ ip_hdrlen(skb) < sizeof(struct iphdr))
+ return NF_ACCEPT;
+
+ /* Save things which could affect route */
+ mark = skb->mark;
+ iph = ip_hdr(skb);
+ saddr = iph->saddr;
+ daddr = iph->daddr;
+ tos = iph->tos;
+
+ ret = ipt_do_table(skb, NF_INET_LOCAL_OUT, state,
+ dev_net(out)->ipv4.iptable_mangle);
+ /* Reroute for ANY change. */
+ if (ret != NF_DROP && ret != NF_STOLEN) {
+ iph = ip_hdr(skb);
+
+ if (iph->saddr != saddr ||
+ iph->daddr != daddr ||
+ skb->mark != mark ||
+ iph->tos != tos) {
+ err = ip_route_me_harder(skb, RTN_UNSPEC);
+ if (err < 0)
+ ret = NF_DROP_ERR(err);
+ }
+ }
+
+ return ret;
+}
+
+/* The work comes in here from netfilter.c. */
+static unsigned int
+iptable_mangle_hook(const struct nf_hook_ops *ops,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ if (ops->hooknum == NF_INET_LOCAL_OUT)
+ return ipt_mangle_out(skb, state);
+ if (ops->hooknum == NF_INET_POST_ROUTING)
+ return ipt_do_table(skb, ops->hooknum, state,
+ dev_net(state->out)->ipv4.iptable_mangle);
+ /* PREROUTING/INPUT/FORWARD: */
+ return ipt_do_table(skb, ops->hooknum, state,
+ dev_net(state->in)->ipv4.iptable_mangle);
+}
+
+static struct nf_hook_ops *mangle_ops __read_mostly;
+
+static int __net_init iptable_mangle_net_init(struct net *net)
+{
+ struct ipt_replace *repl;
+
+ repl = ipt_alloc_initial_table(&packet_mangler);
+ if (repl == NULL)
+ return -ENOMEM;
+ net->ipv4.iptable_mangle =
+ ipt_register_table(net, &packet_mangler, repl);
+ kfree(repl);
+ return PTR_ERR_OR_ZERO(net->ipv4.iptable_mangle);
+}
+
+static void __net_exit iptable_mangle_net_exit(struct net *net)
+{
+ ipt_unregister_table(net, net->ipv4.iptable_mangle);
+}
+
+static struct pernet_operations iptable_mangle_net_ops = {
+ .init = iptable_mangle_net_init,
+ .exit = iptable_mangle_net_exit,
+};
+
+static int __init iptable_mangle_init(void)
+{
+ int ret;
+
+ ret = register_pernet_subsys(&iptable_mangle_net_ops);
+ if (ret < 0)
+ return ret;
+
+ /* Register hooks */
+ mangle_ops = xt_hook_link(&packet_mangler, iptable_mangle_hook);
+ if (IS_ERR(mangle_ops)) {
+ ret = PTR_ERR(mangle_ops);
+ unregister_pernet_subsys(&iptable_mangle_net_ops);
+ }
+
+ return ret;
+}
+
+static void __exit iptable_mangle_fini(void)
+{
+ xt_hook_unlink(&packet_mangler, mangle_ops);
+ unregister_pernet_subsys(&iptable_mangle_net_ops);
+}
+
+module_init(iptable_mangle_init);
+module_exit(iptable_mangle_fini);
diff --git a/net/ipv4/netfilter/iptable_nat.c b/net/ipv4/netfilter/iptable_nat.c
new file mode 100644
index 000000000..0d4d9cdf9
--- /dev/null
+++ b/net/ipv4/netfilter/iptable_nat.c
@@ -0,0 +1,154 @@
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
+ * (C) 2011 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/ip.h>
+#include <net/ip.h>
+
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat_core.h>
+#include <net/netfilter/nf_nat_l3proto.h>
+
+static const struct xt_table nf_nat_ipv4_table = {
+ .name = "nat",
+ .valid_hooks = (1 << NF_INET_PRE_ROUTING) |
+ (1 << NF_INET_POST_ROUTING) |
+ (1 << NF_INET_LOCAL_OUT) |
+ (1 << NF_INET_LOCAL_IN),
+ .me = THIS_MODULE,
+ .af = NFPROTO_IPV4,
+};
+
+static unsigned int iptable_nat_do_chain(const struct nf_hook_ops *ops,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state,
+ struct nf_conn *ct)
+{
+ struct net *net = nf_ct_net(ct);
+
+ return ipt_do_table(skb, ops->hooknum, state, net->ipv4.nat_table);
+}
+
+static unsigned int iptable_nat_ipv4_fn(const struct nf_hook_ops *ops,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ return nf_nat_ipv4_fn(ops, skb, state, iptable_nat_do_chain);
+}
+
+static unsigned int iptable_nat_ipv4_in(const struct nf_hook_ops *ops,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ return nf_nat_ipv4_in(ops, skb, state, iptable_nat_do_chain);
+}
+
+static unsigned int iptable_nat_ipv4_out(const struct nf_hook_ops *ops,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ return nf_nat_ipv4_out(ops, skb, state, iptable_nat_do_chain);
+}
+
+static unsigned int iptable_nat_ipv4_local_fn(const struct nf_hook_ops *ops,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ return nf_nat_ipv4_local_fn(ops, skb, state, iptable_nat_do_chain);
+}
+
+static struct nf_hook_ops nf_nat_ipv4_ops[] __read_mostly = {
+ /* Before packet filtering, change destination */
+ {
+ .hook = iptable_nat_ipv4_in,
+ .owner = THIS_MODULE,
+ .pf = NFPROTO_IPV4,
+ .hooknum = NF_INET_PRE_ROUTING,
+ .priority = NF_IP_PRI_NAT_DST,
+ },
+ /* After packet filtering, change source */
+ {
+ .hook = iptable_nat_ipv4_out,
+ .owner = THIS_MODULE,
+ .pf = NFPROTO_IPV4,
+ .hooknum = NF_INET_POST_ROUTING,
+ .priority = NF_IP_PRI_NAT_SRC,
+ },
+ /* Before packet filtering, change destination */
+ {
+ .hook = iptable_nat_ipv4_local_fn,
+ .owner = THIS_MODULE,
+ .pf = NFPROTO_IPV4,
+ .hooknum = NF_INET_LOCAL_OUT,
+ .priority = NF_IP_PRI_NAT_DST,
+ },
+ /* After packet filtering, change source */
+ {
+ .hook = iptable_nat_ipv4_fn,
+ .owner = THIS_MODULE,
+ .pf = NFPROTO_IPV4,
+ .hooknum = NF_INET_LOCAL_IN,
+ .priority = NF_IP_PRI_NAT_SRC,
+ },
+};
+
+static int __net_init iptable_nat_net_init(struct net *net)
+{
+ struct ipt_replace *repl;
+
+ repl = ipt_alloc_initial_table(&nf_nat_ipv4_table);
+ if (repl == NULL)
+ return -ENOMEM;
+ net->ipv4.nat_table = ipt_register_table(net, &nf_nat_ipv4_table, repl);
+ kfree(repl);
+ return PTR_ERR_OR_ZERO(net->ipv4.nat_table);
+}
+
+static void __net_exit iptable_nat_net_exit(struct net *net)
+{
+ ipt_unregister_table(net, net->ipv4.nat_table);
+}
+
+static struct pernet_operations iptable_nat_net_ops = {
+ .init = iptable_nat_net_init,
+ .exit = iptable_nat_net_exit,
+};
+
+static int __init iptable_nat_init(void)
+{
+ int err;
+
+ err = register_pernet_subsys(&iptable_nat_net_ops);
+ if (err < 0)
+ goto err1;
+
+ err = nf_register_hooks(nf_nat_ipv4_ops, ARRAY_SIZE(nf_nat_ipv4_ops));
+ if (err < 0)
+ goto err2;
+ return 0;
+
+err2:
+ unregister_pernet_subsys(&iptable_nat_net_ops);
+err1:
+ return err;
+}
+
+static void __exit iptable_nat_exit(void)
+{
+ nf_unregister_hooks(nf_nat_ipv4_ops, ARRAY_SIZE(nf_nat_ipv4_ops));
+ unregister_pernet_subsys(&iptable_nat_net_ops);
+}
+
+module_init(iptable_nat_init);
+module_exit(iptable_nat_exit);
+
+MODULE_LICENSE("GPL");
diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c
new file mode 100644
index 000000000..0356e6da4
--- /dev/null
+++ b/net/ipv4/netfilter/iptable_raw.c
@@ -0,0 +1,89 @@
+/*
+ * 'raw' table, which is the very first hooked in at PRE_ROUTING and LOCAL_OUT .
+ *
+ * Copyright (C) 2003 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ */
+#include <linux/module.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/slab.h>
+#include <net/ip.h>
+
+#define RAW_VALID_HOOKS ((1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_OUT))
+
+static const struct xt_table packet_raw = {
+ .name = "raw",
+ .valid_hooks = RAW_VALID_HOOKS,
+ .me = THIS_MODULE,
+ .af = NFPROTO_IPV4,
+ .priority = NF_IP_PRI_RAW,
+};
+
+/* The work comes in here from netfilter.c. */
+static unsigned int
+iptable_raw_hook(const struct nf_hook_ops *ops, struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ const struct net *net;
+
+ if (ops->hooknum == NF_INET_LOCAL_OUT &&
+ (skb->len < sizeof(struct iphdr) ||
+ ip_hdrlen(skb) < sizeof(struct iphdr)))
+ /* root is playing with raw sockets. */
+ return NF_ACCEPT;
+
+ net = dev_net(state->in ? state->in : state->out);
+ return ipt_do_table(skb, ops->hooknum, state, net->ipv4.iptable_raw);
+}
+
+static struct nf_hook_ops *rawtable_ops __read_mostly;
+
+static int __net_init iptable_raw_net_init(struct net *net)
+{
+ struct ipt_replace *repl;
+
+ repl = ipt_alloc_initial_table(&packet_raw);
+ if (repl == NULL)
+ return -ENOMEM;
+ net->ipv4.iptable_raw =
+ ipt_register_table(net, &packet_raw, repl);
+ kfree(repl);
+ return PTR_ERR_OR_ZERO(net->ipv4.iptable_raw);
+}
+
+static void __net_exit iptable_raw_net_exit(struct net *net)
+{
+ ipt_unregister_table(net, net->ipv4.iptable_raw);
+}
+
+static struct pernet_operations iptable_raw_net_ops = {
+ .init = iptable_raw_net_init,
+ .exit = iptable_raw_net_exit,
+};
+
+static int __init iptable_raw_init(void)
+{
+ int ret;
+
+ ret = register_pernet_subsys(&iptable_raw_net_ops);
+ if (ret < 0)
+ return ret;
+
+ /* Register hooks */
+ rawtable_ops = xt_hook_link(&packet_raw, iptable_raw_hook);
+ if (IS_ERR(rawtable_ops)) {
+ ret = PTR_ERR(rawtable_ops);
+ unregister_pernet_subsys(&iptable_raw_net_ops);
+ }
+
+ return ret;
+}
+
+static void __exit iptable_raw_fini(void)
+{
+ xt_hook_unlink(&packet_raw, rawtable_ops);
+ unregister_pernet_subsys(&iptable_raw_net_ops);
+}
+
+module_init(iptable_raw_init);
+module_exit(iptable_raw_fini);
+MODULE_LICENSE("GPL");
diff --git a/net/ipv4/netfilter/iptable_security.c b/net/ipv4/netfilter/iptable_security.c
new file mode 100644
index 000000000..4bce3980c
--- /dev/null
+++ b/net/ipv4/netfilter/iptable_security.c
@@ -0,0 +1,109 @@
+/*
+ * "security" table
+ *
+ * This is for use by Mandatory Access Control (MAC) security models,
+ * which need to be able to manage security policy in separate context
+ * to DAC.
+ *
+ * Based on iptable_mangle.c
+ *
+ * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling
+ * Copyright (C) 2000-2004 Netfilter Core Team <coreteam <at> netfilter.org>
+ * Copyright (C) 2008 Red Hat, Inc., James Morris <jmorris <at> redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/module.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/slab.h>
+#include <net/ip.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("James Morris <jmorris <at> redhat.com>");
+MODULE_DESCRIPTION("iptables security table, for MAC rules");
+
+#define SECURITY_VALID_HOOKS (1 << NF_INET_LOCAL_IN) | \
+ (1 << NF_INET_FORWARD) | \
+ (1 << NF_INET_LOCAL_OUT)
+
+static const struct xt_table security_table = {
+ .name = "security",
+ .valid_hooks = SECURITY_VALID_HOOKS,
+ .me = THIS_MODULE,
+ .af = NFPROTO_IPV4,
+ .priority = NF_IP_PRI_SECURITY,
+};
+
+static unsigned int
+iptable_security_hook(const struct nf_hook_ops *ops, struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ const struct net *net;
+
+ if (ops->hooknum == NF_INET_LOCAL_OUT &&
+ (skb->len < sizeof(struct iphdr) ||
+ ip_hdrlen(skb) < sizeof(struct iphdr)))
+ /* Somebody is playing with raw sockets. */
+ return NF_ACCEPT;
+
+ net = dev_net(state->in ? state->in : state->out);
+ return ipt_do_table(skb, ops->hooknum, state,
+ net->ipv4.iptable_security);
+}
+
+static struct nf_hook_ops *sectbl_ops __read_mostly;
+
+static int __net_init iptable_security_net_init(struct net *net)
+{
+ struct ipt_replace *repl;
+
+ repl = ipt_alloc_initial_table(&security_table);
+ if (repl == NULL)
+ return -ENOMEM;
+ net->ipv4.iptable_security =
+ ipt_register_table(net, &security_table, repl);
+ kfree(repl);
+ return PTR_ERR_OR_ZERO(net->ipv4.iptable_security);
+}
+
+static void __net_exit iptable_security_net_exit(struct net *net)
+{
+ ipt_unregister_table(net, net->ipv4.iptable_security);
+}
+
+static struct pernet_operations iptable_security_net_ops = {
+ .init = iptable_security_net_init,
+ .exit = iptable_security_net_exit,
+};
+
+static int __init iptable_security_init(void)
+{
+ int ret;
+
+ ret = register_pernet_subsys(&iptable_security_net_ops);
+ if (ret < 0)
+ return ret;
+
+ sectbl_ops = xt_hook_link(&security_table, iptable_security_hook);
+ if (IS_ERR(sectbl_ops)) {
+ ret = PTR_ERR(sectbl_ops);
+ goto cleanup_table;
+ }
+
+ return ret;
+
+cleanup_table:
+ unregister_pernet_subsys(&iptable_security_net_ops);
+ return ret;
+}
+
+static void __exit iptable_security_fini(void)
+{
+ xt_hook_unlink(&security_table, sectbl_ops);
+ unregister_pernet_subsys(&iptable_security_net_ops);
+}
+
+module_init(iptable_security_init);
+module_exit(iptable_security_fini);
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
new file mode 100644
index 000000000..30ad9554b
--- /dev/null
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
@@ -0,0 +1,542 @@
+
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ * (C) 2006-2012 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/ip.h>
+#include <linux/netfilter.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/icmp.h>
+#include <linux/sysctl.h>
+#include <net/route.h>
+#include <net/ip.h>
+
+#include <linux/netfilter_ipv4.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_l4proto.h>
+#include <net/netfilter/nf_conntrack_l3proto.h>
+#include <net/netfilter/nf_conntrack_zones.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_seqadj.h>
+#include <net/netfilter/ipv4/nf_conntrack_ipv4.h>
+#include <net/netfilter/nf_nat_helper.h>
+#include <net/netfilter/ipv4/nf_defrag_ipv4.h>
+#include <net/netfilter/nf_log.h>
+
+static bool ipv4_pkt_to_tuple(const struct sk_buff *skb, unsigned int nhoff,
+ struct nf_conntrack_tuple *tuple)
+{
+ const __be32 *ap;
+ __be32 _addrs[2];
+ ap = skb_header_pointer(skb, nhoff + offsetof(struct iphdr, saddr),
+ sizeof(u_int32_t) * 2, _addrs);
+ if (ap == NULL)
+ return false;
+
+ tuple->src.u3.ip = ap[0];
+ tuple->dst.u3.ip = ap[1];
+
+ return true;
+}
+
+static bool ipv4_invert_tuple(struct nf_conntrack_tuple *tuple,
+ const struct nf_conntrack_tuple *orig)
+{
+ tuple->src.u3.ip = orig->dst.u3.ip;
+ tuple->dst.u3.ip = orig->src.u3.ip;
+
+ return true;
+}
+
+static void ipv4_print_tuple(struct seq_file *s,
+ const struct nf_conntrack_tuple *tuple)
+{
+ seq_printf(s, "src=%pI4 dst=%pI4 ",
+ &tuple->src.u3.ip, &tuple->dst.u3.ip);
+}
+
+static int ipv4_get_l4proto(const struct sk_buff *skb, unsigned int nhoff,
+ unsigned int *dataoff, u_int8_t *protonum)
+{
+ const struct iphdr *iph;
+ struct iphdr _iph;
+
+ iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph);
+ if (iph == NULL)
+ return -NF_ACCEPT;
+
+ /* Conntrack defragments packets, we might still see fragments
+ * inside ICMP packets though. */
+ if (iph->frag_off & htons(IP_OFFSET))
+ return -NF_ACCEPT;
+
+ *dataoff = nhoff + (iph->ihl << 2);
+ *protonum = iph->protocol;
+
+ /* Check bogus IP headers */
+ if (*dataoff > skb->len) {
+ pr_debug("nf_conntrack_ipv4: bogus IPv4 packet: "
+ "nhoff %u, ihl %u, skblen %u\n",
+ nhoff, iph->ihl << 2, skb->len);
+ return -NF_ACCEPT;
+ }
+
+ return NF_ACCEPT;
+}
+
+static unsigned int ipv4_helper(const struct nf_hook_ops *ops,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ struct nf_conn *ct;
+ enum ip_conntrack_info ctinfo;
+ const struct nf_conn_help *help;
+ const struct nf_conntrack_helper *helper;
+
+ /* This is where we call the helper: as the packet goes out. */
+ ct = nf_ct_get(skb, &ctinfo);
+ if (!ct || ctinfo == IP_CT_RELATED_REPLY)
+ return NF_ACCEPT;
+
+ help = nfct_help(ct);
+ if (!help)
+ return NF_ACCEPT;
+
+ /* rcu_read_lock()ed by nf_hook_slow */
+ helper = rcu_dereference(help->helper);
+ if (!helper)
+ return NF_ACCEPT;
+
+ return helper->help(skb, skb_network_offset(skb) + ip_hdrlen(skb),
+ ct, ctinfo);
+}
+
+static unsigned int ipv4_confirm(const struct nf_hook_ops *ops,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ struct nf_conn *ct;
+ enum ip_conntrack_info ctinfo;
+
+ ct = nf_ct_get(skb, &ctinfo);
+ if (!ct || ctinfo == IP_CT_RELATED_REPLY)
+ goto out;
+
+ /* adjust seqs for loopback traffic only in outgoing direction */
+ if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) &&
+ !nf_is_loopback_packet(skb)) {
+ if (!nf_ct_seq_adjust(skb, ct, ctinfo, ip_hdrlen(skb))) {
+ NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop);
+ return NF_DROP;
+ }
+ }
+out:
+ /* We've seen it coming out the other side: confirm it */
+ return nf_conntrack_confirm(skb);
+}
+
+static unsigned int ipv4_conntrack_in(const struct nf_hook_ops *ops,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ return nf_conntrack_in(dev_net(state->in), PF_INET, ops->hooknum, skb);
+}
+
+static unsigned int ipv4_conntrack_local(const struct nf_hook_ops *ops,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ /* root is playing with raw sockets. */
+ if (skb->len < sizeof(struct iphdr) ||
+ ip_hdrlen(skb) < sizeof(struct iphdr))
+ return NF_ACCEPT;
+ return nf_conntrack_in(dev_net(state->out), PF_INET, ops->hooknum, skb);
+}
+
+/* Connection tracking may drop packets, but never alters them, so
+ make it the first hook. */
+static struct nf_hook_ops ipv4_conntrack_ops[] __read_mostly = {
+ {
+ .hook = ipv4_conntrack_in,
+ .owner = THIS_MODULE,
+ .pf = NFPROTO_IPV4,
+ .hooknum = NF_INET_PRE_ROUTING,
+ .priority = NF_IP_PRI_CONNTRACK,
+ },
+ {
+ .hook = ipv4_conntrack_local,
+ .owner = THIS_MODULE,
+ .pf = NFPROTO_IPV4,
+ .hooknum = NF_INET_LOCAL_OUT,
+ .priority = NF_IP_PRI_CONNTRACK,
+ },
+ {
+ .hook = ipv4_helper,
+ .owner = THIS_MODULE,
+ .pf = NFPROTO_IPV4,
+ .hooknum = NF_INET_POST_ROUTING,
+ .priority = NF_IP_PRI_CONNTRACK_HELPER,
+ },
+ {
+ .hook = ipv4_confirm,
+ .owner = THIS_MODULE,
+ .pf = NFPROTO_IPV4,
+ .hooknum = NF_INET_POST_ROUTING,
+ .priority = NF_IP_PRI_CONNTRACK_CONFIRM,
+ },
+ {
+ .hook = ipv4_helper,
+ .owner = THIS_MODULE,
+ .pf = NFPROTO_IPV4,
+ .hooknum = NF_INET_LOCAL_IN,
+ .priority = NF_IP_PRI_CONNTRACK_HELPER,
+ },
+ {
+ .hook = ipv4_confirm,
+ .owner = THIS_MODULE,
+ .pf = NFPROTO_IPV4,
+ .hooknum = NF_INET_LOCAL_IN,
+ .priority = NF_IP_PRI_CONNTRACK_CONFIRM,
+ },
+};
+
+#if defined(CONFIG_SYSCTL) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
+static int log_invalid_proto_min = 0;
+static int log_invalid_proto_max = 255;
+
+static struct ctl_table ip_ct_sysctl_table[] = {
+ {
+ .procname = "ip_conntrack_max",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "ip_conntrack_count",
+ .maxlen = sizeof(int),
+ .mode = 0444,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "ip_conntrack_buckets",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0444,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "ip_conntrack_checksum",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "ip_conntrack_log_invalid",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &log_invalid_proto_min,
+ .extra2 = &log_invalid_proto_max,
+ },
+ { }
+};
+#endif /* CONFIG_SYSCTL && CONFIG_NF_CONNTRACK_PROC_COMPAT */
+
+/* Fast function for those who don't want to parse /proc (and I don't
+ blame them). */
+/* Reversing the socket's dst/src point of view gives us the reply
+ mapping. */
+static int
+getorigdst(struct sock *sk, int optval, void __user *user, int *len)
+{
+ const struct inet_sock *inet = inet_sk(sk);
+ const struct nf_conntrack_tuple_hash *h;
+ struct nf_conntrack_tuple tuple;
+
+ memset(&tuple, 0, sizeof(tuple));
+ tuple.src.u3.ip = inet->inet_rcv_saddr;
+ tuple.src.u.tcp.port = inet->inet_sport;
+ tuple.dst.u3.ip = inet->inet_daddr;
+ tuple.dst.u.tcp.port = inet->inet_dport;
+ tuple.src.l3num = PF_INET;
+ tuple.dst.protonum = sk->sk_protocol;
+
+ /* We only do TCP and SCTP at the moment: is there a better way? */
+ if (sk->sk_protocol != IPPROTO_TCP && sk->sk_protocol != IPPROTO_SCTP) {
+ pr_debug("SO_ORIGINAL_DST: Not a TCP/SCTP socket\n");
+ return -ENOPROTOOPT;
+ }
+
+ if ((unsigned int) *len < sizeof(struct sockaddr_in)) {
+ pr_debug("SO_ORIGINAL_DST: len %d not %Zu\n",
+ *len, sizeof(struct sockaddr_in));
+ return -EINVAL;
+ }
+
+ h = nf_conntrack_find_get(sock_net(sk), NF_CT_DEFAULT_ZONE, &tuple);
+ if (h) {
+ struct sockaddr_in sin;
+ struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
+
+ sin.sin_family = AF_INET;
+ sin.sin_port = ct->tuplehash[IP_CT_DIR_ORIGINAL]
+ .tuple.dst.u.tcp.port;
+ sin.sin_addr.s_addr = ct->tuplehash[IP_CT_DIR_ORIGINAL]
+ .tuple.dst.u3.ip;
+ memset(sin.sin_zero, 0, sizeof(sin.sin_zero));
+
+ pr_debug("SO_ORIGINAL_DST: %pI4 %u\n",
+ &sin.sin_addr.s_addr, ntohs(sin.sin_port));
+ nf_ct_put(ct);
+ if (copy_to_user(user, &sin, sizeof(sin)) != 0)
+ return -EFAULT;
+ else
+ return 0;
+ }
+ pr_debug("SO_ORIGINAL_DST: Can't find %pI4/%u-%pI4/%u.\n",
+ &tuple.src.u3.ip, ntohs(tuple.src.u.tcp.port),
+ &tuple.dst.u3.ip, ntohs(tuple.dst.u.tcp.port));
+ return -ENOENT;
+}
+
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
+
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nfnetlink_conntrack.h>
+
+static int ipv4_tuple_to_nlattr(struct sk_buff *skb,
+ const struct nf_conntrack_tuple *tuple)
+{
+ if (nla_put_in_addr(skb, CTA_IP_V4_SRC, tuple->src.u3.ip) ||
+ nla_put_in_addr(skb, CTA_IP_V4_DST, tuple->dst.u3.ip))
+ goto nla_put_failure;
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+static const struct nla_policy ipv4_nla_policy[CTA_IP_MAX+1] = {
+ [CTA_IP_V4_SRC] = { .type = NLA_U32 },
+ [CTA_IP_V4_DST] = { .type = NLA_U32 },
+};
+
+static int ipv4_nlattr_to_tuple(struct nlattr *tb[],
+ struct nf_conntrack_tuple *t)
+{
+ if (!tb[CTA_IP_V4_SRC] || !tb[CTA_IP_V4_DST])
+ return -EINVAL;
+
+ t->src.u3.ip = nla_get_in_addr(tb[CTA_IP_V4_SRC]);
+ t->dst.u3.ip = nla_get_in_addr(tb[CTA_IP_V4_DST]);
+
+ return 0;
+}
+
+static int ipv4_nlattr_tuple_size(void)
+{
+ return nla_policy_len(ipv4_nla_policy, CTA_IP_MAX + 1);
+}
+#endif
+
+static struct nf_sockopt_ops so_getorigdst = {
+ .pf = PF_INET,
+ .get_optmin = SO_ORIGINAL_DST,
+ .get_optmax = SO_ORIGINAL_DST+1,
+ .get = getorigdst,
+ .owner = THIS_MODULE,
+};
+
+static int ipv4_init_net(struct net *net)
+{
+#if defined(CONFIG_SYSCTL) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
+ struct nf_ip_net *in = &net->ct.nf_ct_proto;
+ in->ctl_table = kmemdup(ip_ct_sysctl_table,
+ sizeof(ip_ct_sysctl_table),
+ GFP_KERNEL);
+ if (!in->ctl_table)
+ return -ENOMEM;
+
+ in->ctl_table[0].data = &nf_conntrack_max;
+ in->ctl_table[1].data = &net->ct.count;
+ in->ctl_table[2].data = &net->ct.htable_size;
+ in->ctl_table[3].data = &net->ct.sysctl_checksum;
+ in->ctl_table[4].data = &net->ct.sysctl_log_invalid;
+#endif
+ return 0;
+}
+
+struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 __read_mostly = {
+ .l3proto = PF_INET,
+ .name = "ipv4",
+ .pkt_to_tuple = ipv4_pkt_to_tuple,
+ .invert_tuple = ipv4_invert_tuple,
+ .print_tuple = ipv4_print_tuple,
+ .get_l4proto = ipv4_get_l4proto,
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
+ .tuple_to_nlattr = ipv4_tuple_to_nlattr,
+ .nlattr_tuple_size = ipv4_nlattr_tuple_size,
+ .nlattr_to_tuple = ipv4_nlattr_to_tuple,
+ .nla_policy = ipv4_nla_policy,
+#endif
+#if defined(CONFIG_SYSCTL) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
+ .ctl_table_path = "net/ipv4/netfilter",
+#endif
+ .init_net = ipv4_init_net,
+ .me = THIS_MODULE,
+};
+
+module_param_call(hashsize, nf_conntrack_set_hashsize, param_get_uint,
+ &nf_conntrack_htable_size, 0600);
+
+MODULE_ALIAS("nf_conntrack-" __stringify(AF_INET));
+MODULE_ALIAS("ip_conntrack");
+MODULE_LICENSE("GPL");
+
+static int ipv4_net_init(struct net *net)
+{
+ int ret = 0;
+
+ ret = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_tcp4);
+ if (ret < 0) {
+ pr_err("nf_conntrack_tcp4: pernet registration failed\n");
+ goto out_tcp;
+ }
+ ret = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_udp4);
+ if (ret < 0) {
+ pr_err("nf_conntrack_udp4: pernet registration failed\n");
+ goto out_udp;
+ }
+ ret = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_icmp);
+ if (ret < 0) {
+ pr_err("nf_conntrack_icmp4: pernet registration failed\n");
+ goto out_icmp;
+ }
+ ret = nf_ct_l3proto_pernet_register(net, &nf_conntrack_l3proto_ipv4);
+ if (ret < 0) {
+ pr_err("nf_conntrack_ipv4: pernet registration failed\n");
+ goto out_ipv4;
+ }
+ return 0;
+out_ipv4:
+ nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_icmp);
+out_icmp:
+ nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_udp4);
+out_udp:
+ nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_tcp4);
+out_tcp:
+ return ret;
+}
+
+static void ipv4_net_exit(struct net *net)
+{
+ nf_ct_l3proto_pernet_unregister(net, &nf_conntrack_l3proto_ipv4);
+ nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_icmp);
+ nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_udp4);
+ nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_tcp4);
+}
+
+static struct pernet_operations ipv4_net_ops = {
+ .init = ipv4_net_init,
+ .exit = ipv4_net_exit,
+};
+
+static int __init nf_conntrack_l3proto_ipv4_init(void)
+{
+ int ret = 0;
+
+ need_conntrack();
+ nf_defrag_ipv4_enable();
+
+ ret = nf_register_sockopt(&so_getorigdst);
+ if (ret < 0) {
+ printk(KERN_ERR "Unable to register netfilter socket option\n");
+ return ret;
+ }
+
+ ret = register_pernet_subsys(&ipv4_net_ops);
+ if (ret < 0) {
+ pr_err("nf_conntrack_ipv4: can't register pernet ops\n");
+ goto cleanup_sockopt;
+ }
+
+ ret = nf_register_hooks(ipv4_conntrack_ops,
+ ARRAY_SIZE(ipv4_conntrack_ops));
+ if (ret < 0) {
+ pr_err("nf_conntrack_ipv4: can't register hooks.\n");
+ goto cleanup_pernet;
+ }
+
+ ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_tcp4);
+ if (ret < 0) {
+ pr_err("nf_conntrack_ipv4: can't register tcp4 proto.\n");
+ goto cleanup_hooks;
+ }
+
+ ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_udp4);
+ if (ret < 0) {
+ pr_err("nf_conntrack_ipv4: can't register udp4 proto.\n");
+ goto cleanup_tcp4;
+ }
+
+ ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_icmp);
+ if (ret < 0) {
+ pr_err("nf_conntrack_ipv4: can't register icmpv4 proto.\n");
+ goto cleanup_udp4;
+ }
+
+ ret = nf_ct_l3proto_register(&nf_conntrack_l3proto_ipv4);
+ if (ret < 0) {
+ pr_err("nf_conntrack_ipv4: can't register ipv4 proto.\n");
+ goto cleanup_icmpv4;
+ }
+
+#if defined(CONFIG_PROC_FS) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
+ ret = nf_conntrack_ipv4_compat_init();
+ if (ret < 0)
+ goto cleanup_proto;
+#endif
+ return ret;
+#if defined(CONFIG_PROC_FS) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
+ cleanup_proto:
+ nf_ct_l3proto_unregister(&nf_conntrack_l3proto_ipv4);
+#endif
+ cleanup_icmpv4:
+ nf_ct_l4proto_unregister(&nf_conntrack_l4proto_icmp);
+ cleanup_udp4:
+ nf_ct_l4proto_unregister(&nf_conntrack_l4proto_udp4);
+ cleanup_tcp4:
+ nf_ct_l4proto_unregister(&nf_conntrack_l4proto_tcp4);
+ cleanup_hooks:
+ nf_unregister_hooks(ipv4_conntrack_ops, ARRAY_SIZE(ipv4_conntrack_ops));
+ cleanup_pernet:
+ unregister_pernet_subsys(&ipv4_net_ops);
+ cleanup_sockopt:
+ nf_unregister_sockopt(&so_getorigdst);
+ return ret;
+}
+
+static void __exit nf_conntrack_l3proto_ipv4_fini(void)
+{
+ synchronize_net();
+#if defined(CONFIG_PROC_FS) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
+ nf_conntrack_ipv4_compat_fini();
+#endif
+ nf_ct_l3proto_unregister(&nf_conntrack_l3proto_ipv4);
+ nf_ct_l4proto_unregister(&nf_conntrack_l4proto_icmp);
+ nf_ct_l4proto_unregister(&nf_conntrack_l4proto_udp4);
+ nf_ct_l4proto_unregister(&nf_conntrack_l4proto_tcp4);
+ nf_unregister_hooks(ipv4_conntrack_ops, ARRAY_SIZE(ipv4_conntrack_ops));
+ unregister_pernet_subsys(&ipv4_net_ops);
+ nf_unregister_sockopt(&so_getorigdst);
+}
+
+module_init(nf_conntrack_l3proto_ipv4_init);
+module_exit(nf_conntrack_l3proto_ipv4_fini);
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
new file mode 100644
index 000000000..f0dfe92a0
--- /dev/null
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
@@ -0,0 +1,469 @@
+/* ip_conntrack proc compat - based on ip_conntrack_standalone.c
+ *
+ * (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
+ * (C) 2006-2010 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/types.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/percpu.h>
+#include <linux/security.h>
+#include <net/net_namespace.h>
+
+#include <linux/netfilter.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_l3proto.h>
+#include <net/netfilter/nf_conntrack_l4proto.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+#include <net/netfilter/nf_conntrack_acct.h>
+#include <linux/rculist_nulls.h>
+#include <linux/export.h>
+
+struct ct_iter_state {
+ struct seq_net_private p;
+ unsigned int bucket;
+};
+
+static struct hlist_nulls_node *ct_get_first(struct seq_file *seq)
+{
+ struct net *net = seq_file_net(seq);
+ struct ct_iter_state *st = seq->private;
+ struct hlist_nulls_node *n;
+
+ for (st->bucket = 0;
+ st->bucket < net->ct.htable_size;
+ st->bucket++) {
+ n = rcu_dereference(
+ hlist_nulls_first_rcu(&net->ct.hash[st->bucket]));
+ if (!is_a_nulls(n))
+ return n;
+ }
+ return NULL;
+}
+
+static struct hlist_nulls_node *ct_get_next(struct seq_file *seq,
+ struct hlist_nulls_node *head)
+{
+ struct net *net = seq_file_net(seq);
+ struct ct_iter_state *st = seq->private;
+
+ head = rcu_dereference(hlist_nulls_next_rcu(head));
+ while (is_a_nulls(head)) {
+ if (likely(get_nulls_value(head) == st->bucket)) {
+ if (++st->bucket >= net->ct.htable_size)
+ return NULL;
+ }
+ head = rcu_dereference(
+ hlist_nulls_first_rcu(&net->ct.hash[st->bucket]));
+ }
+ return head;
+}
+
+static struct hlist_nulls_node *ct_get_idx(struct seq_file *seq, loff_t pos)
+{
+ struct hlist_nulls_node *head = ct_get_first(seq);
+
+ if (head)
+ while (pos && (head = ct_get_next(seq, head)))
+ pos--;
+ return pos ? NULL : head;
+}
+
+static void *ct_seq_start(struct seq_file *seq, loff_t *pos)
+ __acquires(RCU)
+{
+ rcu_read_lock();
+ return ct_get_idx(seq, *pos);
+}
+
+static void *ct_seq_next(struct seq_file *s, void *v, loff_t *pos)
+{
+ (*pos)++;
+ return ct_get_next(s, v);
+}
+
+static void ct_seq_stop(struct seq_file *s, void *v)
+ __releases(RCU)
+{
+ rcu_read_unlock();
+}
+
+#ifdef CONFIG_NF_CONNTRACK_SECMARK
+static void ct_show_secctx(struct seq_file *s, const struct nf_conn *ct)
+{
+ int ret;
+ u32 len;
+ char *secctx;
+
+ ret = security_secid_to_secctx(ct->secmark, &secctx, &len);
+ if (ret)
+ return;
+
+ seq_printf(s, "secctx=%s ", secctx);
+
+ security_release_secctx(secctx, len);
+}
+#else
+static inline void ct_show_secctx(struct seq_file *s, const struct nf_conn *ct)
+{
+}
+#endif
+
+static int ct_seq_show(struct seq_file *s, void *v)
+{
+ struct nf_conntrack_tuple_hash *hash = v;
+ struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(hash);
+ const struct nf_conntrack_l3proto *l3proto;
+ const struct nf_conntrack_l4proto *l4proto;
+ int ret = 0;
+
+ NF_CT_ASSERT(ct);
+ if (unlikely(!atomic_inc_not_zero(&ct->ct_general.use)))
+ return 0;
+
+
+ /* we only want to print DIR_ORIGINAL */
+ if (NF_CT_DIRECTION(hash))
+ goto release;
+ if (nf_ct_l3num(ct) != AF_INET)
+ goto release;
+
+ l3proto = __nf_ct_l3proto_find(nf_ct_l3num(ct));
+ NF_CT_ASSERT(l3proto);
+ l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
+ NF_CT_ASSERT(l4proto);
+
+ ret = -ENOSPC;
+ seq_printf(s, "%-8s %u %ld ",
+ l4proto->name, nf_ct_protonum(ct),
+ timer_pending(&ct->timeout)
+ ? (long)(ct->timeout.expires - jiffies)/HZ : 0);
+
+ if (l4proto->print_conntrack)
+ l4proto->print_conntrack(s, ct);
+
+ if (seq_has_overflowed(s))
+ goto release;
+
+ print_tuple(s, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
+ l3proto, l4proto);
+
+ if (seq_has_overflowed(s))
+ goto release;
+
+ if (seq_print_acct(s, ct, IP_CT_DIR_ORIGINAL))
+ goto release;
+
+ if (!(test_bit(IPS_SEEN_REPLY_BIT, &ct->status)))
+ seq_printf(s, "[UNREPLIED] ");
+
+ print_tuple(s, &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
+ l3proto, l4proto);
+
+ if (seq_has_overflowed(s))
+ goto release;
+
+ if (seq_print_acct(s, ct, IP_CT_DIR_REPLY))
+ goto release;
+
+ if (test_bit(IPS_ASSURED_BIT, &ct->status))
+ seq_printf(s, "[ASSURED] ");
+
+#ifdef CONFIG_NF_CONNTRACK_MARK
+ seq_printf(s, "mark=%u ", ct->mark);
+#endif
+
+ ct_show_secctx(s, ct);
+
+ seq_printf(s, "use=%u\n", atomic_read(&ct->ct_general.use));
+
+ if (seq_has_overflowed(s))
+ goto release;
+
+ ret = 0;
+release:
+ nf_ct_put(ct);
+ return ret;
+}
+
+static const struct seq_operations ct_seq_ops = {
+ .start = ct_seq_start,
+ .next = ct_seq_next,
+ .stop = ct_seq_stop,
+ .show = ct_seq_show
+};
+
+static int ct_open(struct inode *inode, struct file *file)
+{
+ return seq_open_net(inode, file, &ct_seq_ops,
+ sizeof(struct ct_iter_state));
+}
+
+static const struct file_operations ct_file_ops = {
+ .owner = THIS_MODULE,
+ .open = ct_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_net,
+};
+
+/* expects */
+struct ct_expect_iter_state {
+ struct seq_net_private p;
+ unsigned int bucket;
+};
+
+static struct hlist_node *ct_expect_get_first(struct seq_file *seq)
+{
+ struct net *net = seq_file_net(seq);
+ struct ct_expect_iter_state *st = seq->private;
+ struct hlist_node *n;
+
+ for (st->bucket = 0; st->bucket < nf_ct_expect_hsize; st->bucket++) {
+ n = rcu_dereference(
+ hlist_first_rcu(&net->ct.expect_hash[st->bucket]));
+ if (n)
+ return n;
+ }
+ return NULL;
+}
+
+static struct hlist_node *ct_expect_get_next(struct seq_file *seq,
+ struct hlist_node *head)
+{
+ struct net *net = seq_file_net(seq);
+ struct ct_expect_iter_state *st = seq->private;
+
+ head = rcu_dereference(hlist_next_rcu(head));
+ while (head == NULL) {
+ if (++st->bucket >= nf_ct_expect_hsize)
+ return NULL;
+ head = rcu_dereference(
+ hlist_first_rcu(&net->ct.expect_hash[st->bucket]));
+ }
+ return head;
+}
+
+static struct hlist_node *ct_expect_get_idx(struct seq_file *seq, loff_t pos)
+{
+ struct hlist_node *head = ct_expect_get_first(seq);
+
+ if (head)
+ while (pos && (head = ct_expect_get_next(seq, head)))
+ pos--;
+ return pos ? NULL : head;
+}
+
+static void *exp_seq_start(struct seq_file *seq, loff_t *pos)
+ __acquires(RCU)
+{
+ rcu_read_lock();
+ return ct_expect_get_idx(seq, *pos);
+}
+
+static void *exp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ (*pos)++;
+ return ct_expect_get_next(seq, v);
+}
+
+static void exp_seq_stop(struct seq_file *seq, void *v)
+ __releases(RCU)
+{
+ rcu_read_unlock();
+}
+
+static int exp_seq_show(struct seq_file *s, void *v)
+{
+ struct nf_conntrack_expect *exp;
+ const struct hlist_node *n = v;
+
+ exp = hlist_entry(n, struct nf_conntrack_expect, hnode);
+
+ if (exp->tuple.src.l3num != AF_INET)
+ return 0;
+
+ if (exp->timeout.function)
+ seq_printf(s, "%ld ", timer_pending(&exp->timeout)
+ ? (long)(exp->timeout.expires - jiffies)/HZ : 0);
+ else
+ seq_printf(s, "- ");
+
+ seq_printf(s, "proto=%u ", exp->tuple.dst.protonum);
+
+ print_tuple(s, &exp->tuple,
+ __nf_ct_l3proto_find(exp->tuple.src.l3num),
+ __nf_ct_l4proto_find(exp->tuple.src.l3num,
+ exp->tuple.dst.protonum));
+ seq_putc(s, '\n');
+
+ return 0;
+}
+
+static const struct seq_operations exp_seq_ops = {
+ .start = exp_seq_start,
+ .next = exp_seq_next,
+ .stop = exp_seq_stop,
+ .show = exp_seq_show
+};
+
+static int exp_open(struct inode *inode, struct file *file)
+{
+ return seq_open_net(inode, file, &exp_seq_ops,
+ sizeof(struct ct_expect_iter_state));
+}
+
+static const struct file_operations ip_exp_file_ops = {
+ .owner = THIS_MODULE,
+ .open = exp_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_net,
+};
+
+static void *ct_cpu_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ struct net *net = seq_file_net(seq);
+ int cpu;
+
+ if (*pos == 0)
+ return SEQ_START_TOKEN;
+
+ for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
+ if (!cpu_possible(cpu))
+ continue;
+ *pos = cpu+1;
+ return per_cpu_ptr(net->ct.stat, cpu);
+ }
+
+ return NULL;
+}
+
+static void *ct_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct net *net = seq_file_net(seq);
+ int cpu;
+
+ for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
+ if (!cpu_possible(cpu))
+ continue;
+ *pos = cpu+1;
+ return per_cpu_ptr(net->ct.stat, cpu);
+ }
+
+ return NULL;
+}
+
+static void ct_cpu_seq_stop(struct seq_file *seq, void *v)
+{
+}
+
+static int ct_cpu_seq_show(struct seq_file *seq, void *v)
+{
+ struct net *net = seq_file_net(seq);
+ unsigned int nr_conntracks = atomic_read(&net->ct.count);
+ const struct ip_conntrack_stat *st = v;
+
+ if (v == SEQ_START_TOKEN) {
+ seq_printf(seq, "entries searched found new invalid ignore delete delete_list insert insert_failed drop early_drop icmp_error expect_new expect_create expect_delete search_restart\n");
+ return 0;
+ }
+
+ seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x "
+ "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
+ nr_conntracks,
+ st->searched,
+ st->found,
+ st->new,
+ st->invalid,
+ st->ignore,
+ st->delete,
+ st->delete_list,
+ st->insert,
+ st->insert_failed,
+ st->drop,
+ st->early_drop,
+ st->error,
+
+ st->expect_new,
+ st->expect_create,
+ st->expect_delete,
+ st->search_restart
+ );
+ return 0;
+}
+
+static const struct seq_operations ct_cpu_seq_ops = {
+ .start = ct_cpu_seq_start,
+ .next = ct_cpu_seq_next,
+ .stop = ct_cpu_seq_stop,
+ .show = ct_cpu_seq_show,
+};
+
+static int ct_cpu_seq_open(struct inode *inode, struct file *file)
+{
+ return seq_open_net(inode, file, &ct_cpu_seq_ops,
+ sizeof(struct seq_net_private));
+}
+
+static const struct file_operations ct_cpu_seq_fops = {
+ .owner = THIS_MODULE,
+ .open = ct_cpu_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_net,
+};
+
+static int __net_init ip_conntrack_net_init(struct net *net)
+{
+ struct proc_dir_entry *proc, *proc_exp, *proc_stat;
+
+ proc = proc_create("ip_conntrack", 0440, net->proc_net, &ct_file_ops);
+ if (!proc)
+ goto err1;
+
+ proc_exp = proc_create("ip_conntrack_expect", 0440, net->proc_net,
+ &ip_exp_file_ops);
+ if (!proc_exp)
+ goto err2;
+
+ proc_stat = proc_create("ip_conntrack", S_IRUGO,
+ net->proc_net_stat, &ct_cpu_seq_fops);
+ if (!proc_stat)
+ goto err3;
+ return 0;
+
+err3:
+ remove_proc_entry("ip_conntrack_expect", net->proc_net);
+err2:
+ remove_proc_entry("ip_conntrack", net->proc_net);
+err1:
+ return -ENOMEM;
+}
+
+static void __net_exit ip_conntrack_net_exit(struct net *net)
+{
+ remove_proc_entry("ip_conntrack", net->proc_net_stat);
+ remove_proc_entry("ip_conntrack_expect", net->proc_net);
+ remove_proc_entry("ip_conntrack", net->proc_net);
+}
+
+static struct pernet_operations ip_conntrack_net_ops = {
+ .init = ip_conntrack_net_init,
+ .exit = ip_conntrack_net_exit,
+};
+
+int __init nf_conntrack_ipv4_compat_init(void)
+{
+ return register_pernet_subsys(&ip_conntrack_net_ops);
+}
+
+void __exit nf_conntrack_ipv4_compat_fini(void)
+{
+ unregister_pernet_subsys(&ip_conntrack_net_ops);
+}
diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
new file mode 100644
index 000000000..80d5554b9
--- /dev/null
+++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
@@ -0,0 +1,428 @@
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ * (C) 2006-2010 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/timer.h>
+#include <linux/netfilter.h>
+#include <linux/in.h>
+#include <linux/icmp.h>
+#include <linux/seq_file.h>
+#include <net/ip.h>
+#include <net/checksum.h>
+#include <linux/netfilter_ipv4.h>
+#include <net/netfilter/nf_conntrack_tuple.h>
+#include <net/netfilter/nf_conntrack_l4proto.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_zones.h>
+#include <net/netfilter/nf_log.h>
+
+static unsigned int nf_ct_icmp_timeout __read_mostly = 30*HZ;
+
+static inline struct nf_icmp_net *icmp_pernet(struct net *net)
+{
+ return &net->ct.nf_ct_proto.icmp;
+}
+
+static bool icmp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff,
+ struct nf_conntrack_tuple *tuple)
+{
+ const struct icmphdr *hp;
+ struct icmphdr _hdr;
+
+ hp = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr);
+ if (hp == NULL)
+ return false;
+
+ tuple->dst.u.icmp.type = hp->type;
+ tuple->src.u.icmp.id = hp->un.echo.id;
+ tuple->dst.u.icmp.code = hp->code;
+
+ return true;
+}
+
+/* Add 1; spaces filled with 0. */
+static const u_int8_t invmap[] = {
+ [ICMP_ECHO] = ICMP_ECHOREPLY + 1,
+ [ICMP_ECHOREPLY] = ICMP_ECHO + 1,
+ [ICMP_TIMESTAMP] = ICMP_TIMESTAMPREPLY + 1,
+ [ICMP_TIMESTAMPREPLY] = ICMP_TIMESTAMP + 1,
+ [ICMP_INFO_REQUEST] = ICMP_INFO_REPLY + 1,
+ [ICMP_INFO_REPLY] = ICMP_INFO_REQUEST + 1,
+ [ICMP_ADDRESS] = ICMP_ADDRESSREPLY + 1,
+ [ICMP_ADDRESSREPLY] = ICMP_ADDRESS + 1
+};
+
+static bool icmp_invert_tuple(struct nf_conntrack_tuple *tuple,
+ const struct nf_conntrack_tuple *orig)
+{
+ if (orig->dst.u.icmp.type >= sizeof(invmap) ||
+ !invmap[orig->dst.u.icmp.type])
+ return false;
+
+ tuple->src.u.icmp.id = orig->src.u.icmp.id;
+ tuple->dst.u.icmp.type = invmap[orig->dst.u.icmp.type] - 1;
+ tuple->dst.u.icmp.code = orig->dst.u.icmp.code;
+ return true;
+}
+
+/* Print out the per-protocol part of the tuple. */
+static void icmp_print_tuple(struct seq_file *s,
+ const struct nf_conntrack_tuple *tuple)
+{
+ seq_printf(s, "type=%u code=%u id=%u ",
+ tuple->dst.u.icmp.type,
+ tuple->dst.u.icmp.code,
+ ntohs(tuple->src.u.icmp.id));
+}
+
+static unsigned int *icmp_get_timeouts(struct net *net)
+{
+ return &icmp_pernet(net)->timeout;
+}
+
+/* Returns verdict for packet, or -1 for invalid. */
+static int icmp_packet(struct nf_conn *ct,
+ const struct sk_buff *skb,
+ unsigned int dataoff,
+ enum ip_conntrack_info ctinfo,
+ u_int8_t pf,
+ unsigned int hooknum,
+ unsigned int *timeout)
+{
+ /* Do not immediately delete the connection after the first
+ successful reply to avoid excessive conntrackd traffic
+ and also to handle correctly ICMP echo reply duplicates. */
+ nf_ct_refresh_acct(ct, ctinfo, skb, *timeout);
+
+ return NF_ACCEPT;
+}
+
+/* Called when a new connection for this protocol found. */
+static bool icmp_new(struct nf_conn *ct, const struct sk_buff *skb,
+ unsigned int dataoff, unsigned int *timeouts)
+{
+ static const u_int8_t valid_new[] = {
+ [ICMP_ECHO] = 1,
+ [ICMP_TIMESTAMP] = 1,
+ [ICMP_INFO_REQUEST] = 1,
+ [ICMP_ADDRESS] = 1
+ };
+
+ if (ct->tuplehash[0].tuple.dst.u.icmp.type >= sizeof(valid_new) ||
+ !valid_new[ct->tuplehash[0].tuple.dst.u.icmp.type]) {
+ /* Can't create a new ICMP `conn' with this. */
+ pr_debug("icmp: can't create new conn with type %u\n",
+ ct->tuplehash[0].tuple.dst.u.icmp.type);
+ nf_ct_dump_tuple_ip(&ct->tuplehash[0].tuple);
+ return false;
+ }
+ return true;
+}
+
+/* Returns conntrack if it dealt with ICMP, and filled in skb fields */
+static int
+icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb,
+ enum ip_conntrack_info *ctinfo,
+ unsigned int hooknum)
+{
+ struct nf_conntrack_tuple innertuple, origtuple;
+ const struct nf_conntrack_l4proto *innerproto;
+ const struct nf_conntrack_tuple_hash *h;
+ u16 zone = tmpl ? nf_ct_zone(tmpl) : NF_CT_DEFAULT_ZONE;
+
+ NF_CT_ASSERT(skb->nfct == NULL);
+
+ /* Are they talking about one of our connections? */
+ if (!nf_ct_get_tuplepr(skb,
+ skb_network_offset(skb) + ip_hdrlen(skb)
+ + sizeof(struct icmphdr),
+ PF_INET, &origtuple)) {
+ pr_debug("icmp_error_message: failed to get tuple\n");
+ return -NF_ACCEPT;
+ }
+
+ /* rcu_read_lock()ed by nf_hook_slow */
+ innerproto = __nf_ct_l4proto_find(PF_INET, origtuple.dst.protonum);
+
+ /* Ordinarily, we'd expect the inverted tupleproto, but it's
+ been preserved inside the ICMP. */
+ if (!nf_ct_invert_tuple(&innertuple, &origtuple,
+ &nf_conntrack_l3proto_ipv4, innerproto)) {
+ pr_debug("icmp_error_message: no match\n");
+ return -NF_ACCEPT;
+ }
+
+ *ctinfo = IP_CT_RELATED;
+
+ h = nf_conntrack_find_get(net, zone, &innertuple);
+ if (!h) {
+ pr_debug("icmp_error_message: no match\n");
+ return -NF_ACCEPT;
+ }
+
+ if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY)
+ *ctinfo += IP_CT_IS_REPLY;
+
+ /* Update skb to refer to this connection */
+ skb->nfct = &nf_ct_tuplehash_to_ctrack(h)->ct_general;
+ skb->nfctinfo = *ctinfo;
+ return NF_ACCEPT;
+}
+
+/* Small and modified version of icmp_rcv */
+static int
+icmp_error(struct net *net, struct nf_conn *tmpl,
+ struct sk_buff *skb, unsigned int dataoff,
+ enum ip_conntrack_info *ctinfo, u_int8_t pf, unsigned int hooknum)
+{
+ const struct icmphdr *icmph;
+ struct icmphdr _ih;
+
+ /* Not enough header? */
+ icmph = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_ih), &_ih);
+ if (icmph == NULL) {
+ if (LOG_INVALID(net, IPPROTO_ICMP))
+ nf_log_packet(net, PF_INET, 0, skb, NULL, NULL,
+ NULL, "nf_ct_icmp: short packet ");
+ return -NF_ACCEPT;
+ }
+
+ /* See ip_conntrack_proto_tcp.c */
+ if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING &&
+ nf_ip_checksum(skb, hooknum, dataoff, 0)) {
+ if (LOG_INVALID(net, IPPROTO_ICMP))
+ nf_log_packet(net, PF_INET, 0, skb, NULL, NULL, NULL,
+ "nf_ct_icmp: bad HW ICMP checksum ");
+ return -NF_ACCEPT;
+ }
+
+ /*
+ * 18 is the highest 'known' ICMP type. Anything else is a mystery
+ *
+ * RFC 1122: 3.2.2 Unknown ICMP messages types MUST be silently
+ * discarded.
+ */
+ if (icmph->type > NR_ICMP_TYPES) {
+ if (LOG_INVALID(net, IPPROTO_ICMP))
+ nf_log_packet(net, PF_INET, 0, skb, NULL, NULL, NULL,
+ "nf_ct_icmp: invalid ICMP type ");
+ return -NF_ACCEPT;
+ }
+
+ /* Need to track icmp error message? */
+ if (icmph->type != ICMP_DEST_UNREACH &&
+ icmph->type != ICMP_SOURCE_QUENCH &&
+ icmph->type != ICMP_TIME_EXCEEDED &&
+ icmph->type != ICMP_PARAMETERPROB &&
+ icmph->type != ICMP_REDIRECT)
+ return NF_ACCEPT;
+
+ return icmp_error_message(net, tmpl, skb, ctinfo, hooknum);
+}
+
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
+
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nfnetlink_conntrack.h>
+
+static int icmp_tuple_to_nlattr(struct sk_buff *skb,
+ const struct nf_conntrack_tuple *t)
+{
+ if (nla_put_be16(skb, CTA_PROTO_ICMP_ID, t->src.u.icmp.id) ||
+ nla_put_u8(skb, CTA_PROTO_ICMP_TYPE, t->dst.u.icmp.type) ||
+ nla_put_u8(skb, CTA_PROTO_ICMP_CODE, t->dst.u.icmp.code))
+ goto nla_put_failure;
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+static const struct nla_policy icmp_nla_policy[CTA_PROTO_MAX+1] = {
+ [CTA_PROTO_ICMP_TYPE] = { .type = NLA_U8 },
+ [CTA_PROTO_ICMP_CODE] = { .type = NLA_U8 },
+ [CTA_PROTO_ICMP_ID] = { .type = NLA_U16 },
+};
+
+static int icmp_nlattr_to_tuple(struct nlattr *tb[],
+ struct nf_conntrack_tuple *tuple)
+{
+ if (!tb[CTA_PROTO_ICMP_TYPE] ||
+ !tb[CTA_PROTO_ICMP_CODE] ||
+ !tb[CTA_PROTO_ICMP_ID])
+ return -EINVAL;
+
+ tuple->dst.u.icmp.type = nla_get_u8(tb[CTA_PROTO_ICMP_TYPE]);
+ tuple->dst.u.icmp.code = nla_get_u8(tb[CTA_PROTO_ICMP_CODE]);
+ tuple->src.u.icmp.id = nla_get_be16(tb[CTA_PROTO_ICMP_ID]);
+
+ if (tuple->dst.u.icmp.type >= sizeof(invmap) ||
+ !invmap[tuple->dst.u.icmp.type])
+ return -EINVAL;
+
+ return 0;
+}
+
+static int icmp_nlattr_tuple_size(void)
+{
+ return nla_policy_len(icmp_nla_policy, CTA_PROTO_MAX + 1);
+}
+#endif
+
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
+
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nfnetlink_cttimeout.h>
+
+static int icmp_timeout_nlattr_to_obj(struct nlattr *tb[],
+ struct net *net, void *data)
+{
+ unsigned int *timeout = data;
+ struct nf_icmp_net *in = icmp_pernet(net);
+
+ if (tb[CTA_TIMEOUT_ICMP_TIMEOUT]) {
+ *timeout =
+ ntohl(nla_get_be32(tb[CTA_TIMEOUT_ICMP_TIMEOUT])) * HZ;
+ } else {
+ /* Set default ICMP timeout. */
+ *timeout = in->timeout;
+ }
+ return 0;
+}
+
+static int
+icmp_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data)
+{
+ const unsigned int *timeout = data;
+
+ if (nla_put_be32(skb, CTA_TIMEOUT_ICMP_TIMEOUT, htonl(*timeout / HZ)))
+ goto nla_put_failure;
+ return 0;
+
+nla_put_failure:
+ return -ENOSPC;
+}
+
+static const struct nla_policy
+icmp_timeout_nla_policy[CTA_TIMEOUT_ICMP_MAX+1] = {
+ [CTA_TIMEOUT_ICMP_TIMEOUT] = { .type = NLA_U32 },
+};
+#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
+
+#ifdef CONFIG_SYSCTL
+static struct ctl_table icmp_sysctl_table[] = {
+ {
+ .procname = "nf_conntrack_icmp_timeout",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ { }
+};
+#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
+static struct ctl_table icmp_compat_sysctl_table[] = {
+ {
+ .procname = "ip_conntrack_icmp_timeout",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ { }
+};
+#endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */
+#endif /* CONFIG_SYSCTL */
+
+static int icmp_kmemdup_sysctl_table(struct nf_proto_net *pn,
+ struct nf_icmp_net *in)
+{
+#ifdef CONFIG_SYSCTL
+ pn->ctl_table = kmemdup(icmp_sysctl_table,
+ sizeof(icmp_sysctl_table),
+ GFP_KERNEL);
+ if (!pn->ctl_table)
+ return -ENOMEM;
+
+ pn->ctl_table[0].data = &in->timeout;
+#endif
+ return 0;
+}
+
+static int icmp_kmemdup_compat_sysctl_table(struct nf_proto_net *pn,
+ struct nf_icmp_net *in)
+{
+#ifdef CONFIG_SYSCTL
+#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
+ pn->ctl_compat_table = kmemdup(icmp_compat_sysctl_table,
+ sizeof(icmp_compat_sysctl_table),
+ GFP_KERNEL);
+ if (!pn->ctl_compat_table)
+ return -ENOMEM;
+
+ pn->ctl_compat_table[0].data = &in->timeout;
+#endif
+#endif
+ return 0;
+}
+
+static int icmp_init_net(struct net *net, u_int16_t proto)
+{
+ int ret;
+ struct nf_icmp_net *in = icmp_pernet(net);
+ struct nf_proto_net *pn = &in->pn;
+
+ in->timeout = nf_ct_icmp_timeout;
+
+ ret = icmp_kmemdup_compat_sysctl_table(pn, in);
+ if (ret < 0)
+ return ret;
+
+ ret = icmp_kmemdup_sysctl_table(pn, in);
+ if (ret < 0)
+ nf_ct_kfree_compat_sysctl_table(pn);
+
+ return ret;
+}
+
+static struct nf_proto_net *icmp_get_net_proto(struct net *net)
+{
+ return &net->ct.nf_ct_proto.icmp.pn;
+}
+
+struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp __read_mostly =
+{
+ .l3proto = PF_INET,
+ .l4proto = IPPROTO_ICMP,
+ .name = "icmp",
+ .pkt_to_tuple = icmp_pkt_to_tuple,
+ .invert_tuple = icmp_invert_tuple,
+ .print_tuple = icmp_print_tuple,
+ .packet = icmp_packet,
+ .get_timeouts = icmp_get_timeouts,
+ .new = icmp_new,
+ .error = icmp_error,
+ .destroy = NULL,
+ .me = NULL,
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
+ .tuple_to_nlattr = icmp_tuple_to_nlattr,
+ .nlattr_tuple_size = icmp_nlattr_tuple_size,
+ .nlattr_to_tuple = icmp_nlattr_to_tuple,
+ .nla_policy = icmp_nla_policy,
+#endif
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
+ .ctnl_timeout = {
+ .nlattr_to_obj = icmp_timeout_nlattr_to_obj,
+ .obj_to_nlattr = icmp_timeout_obj_to_nlattr,
+ .nlattr_max = CTA_TIMEOUT_ICMP_MAX,
+ .obj_size = sizeof(unsigned int),
+ .nla_policy = icmp_timeout_nla_policy,
+ },
+#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
+ .init_net = icmp_init_net,
+ .get_net_proto = icmp_get_net_proto,
+};
diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c
new file mode 100644
index 000000000..c88b7d434
--- /dev/null
+++ b/net/ipv4/netfilter/nf_defrag_ipv4.c
@@ -0,0 +1,129 @@
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/ip.h>
+#include <linux/netfilter.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <net/route.h>
+#include <net/ip.h>
+
+#include <linux/netfilter_bridge.h>
+#include <linux/netfilter_ipv4.h>
+#include <net/netfilter/ipv4/nf_defrag_ipv4.h>
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+#include <net/netfilter/nf_conntrack.h>
+#endif
+#include <net/netfilter/nf_conntrack_zones.h>
+
+static int nf_ct_ipv4_gather_frags(struct sk_buff *skb, u_int32_t user)
+{
+ int err;
+
+ skb_orphan(skb);
+
+ local_bh_disable();
+ err = ip_defrag(skb, user);
+ local_bh_enable();
+
+ if (!err) {
+ ip_send_check(ip_hdr(skb));
+ skb->ignore_df = 1;
+ }
+
+ return err;
+}
+
+static enum ip_defrag_users nf_ct_defrag_user(unsigned int hooknum,
+ struct sk_buff *skb)
+{
+ u16 zone = NF_CT_DEFAULT_ZONE;
+
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+ if (skb->nfct)
+ zone = nf_ct_zone((struct nf_conn *)skb->nfct);
+#endif
+
+#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
+ if (skb->nf_bridge &&
+ skb->nf_bridge->mask & BRNF_NF_BRIDGE_PREROUTING)
+ return IP_DEFRAG_CONNTRACK_BRIDGE_IN + zone;
+#endif
+ if (hooknum == NF_INET_PRE_ROUTING)
+ return IP_DEFRAG_CONNTRACK_IN + zone;
+ else
+ return IP_DEFRAG_CONNTRACK_OUT + zone;
+}
+
+static unsigned int ipv4_conntrack_defrag(const struct nf_hook_ops *ops,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ struct sock *sk = skb->sk;
+ struct inet_sock *inet = inet_sk(skb->sk);
+
+ if (sk && (sk->sk_family == PF_INET) &&
+ inet->nodefrag)
+ return NF_ACCEPT;
+
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+#if !IS_ENABLED(CONFIG_NF_NAT)
+ /* Previously seen (loopback)? Ignore. Do this before
+ fragment check. */
+ if (skb->nfct && !nf_ct_is_template((struct nf_conn *)skb->nfct))
+ return NF_ACCEPT;
+#endif
+#endif
+ /* Gather fragments. */
+ if (ip_is_fragment(ip_hdr(skb))) {
+ enum ip_defrag_users user =
+ nf_ct_defrag_user(ops->hooknum, skb);
+
+ if (nf_ct_ipv4_gather_frags(skb, user))
+ return NF_STOLEN;
+ }
+ return NF_ACCEPT;
+}
+
+static struct nf_hook_ops ipv4_defrag_ops[] = {
+ {
+ .hook = ipv4_conntrack_defrag,
+ .owner = THIS_MODULE,
+ .pf = NFPROTO_IPV4,
+ .hooknum = NF_INET_PRE_ROUTING,
+ .priority = NF_IP_PRI_CONNTRACK_DEFRAG,
+ },
+ {
+ .hook = ipv4_conntrack_defrag,
+ .owner = THIS_MODULE,
+ .pf = NFPROTO_IPV4,
+ .hooknum = NF_INET_LOCAL_OUT,
+ .priority = NF_IP_PRI_CONNTRACK_DEFRAG,
+ },
+};
+
+static int __init nf_defrag_init(void)
+{
+ return nf_register_hooks(ipv4_defrag_ops, ARRAY_SIZE(ipv4_defrag_ops));
+}
+
+static void __exit nf_defrag_fini(void)
+{
+ nf_unregister_hooks(ipv4_defrag_ops, ARRAY_SIZE(ipv4_defrag_ops));
+}
+
+void nf_defrag_ipv4_enable(void)
+{
+}
+EXPORT_SYMBOL_GPL(nf_defrag_ipv4_enable);
+
+module_init(nf_defrag_init);
+module_exit(nf_defrag_fini);
+
+MODULE_LICENSE("GPL");
diff --git a/net/ipv4/netfilter/nf_log_arp.c b/net/ipv4/netfilter/nf_log_arp.c
new file mode 100644
index 000000000..e7ad950cf
--- /dev/null
+++ b/net/ipv4/netfilter/nf_log_arp.c
@@ -0,0 +1,161 @@
+/*
+ * (C) 2014 by Pablo Neira Ayuso <pablo@netfilter.org>
+ *
+ * Based on code from ebt_log from:
+ *
+ * Bart De Schuymer <bdschuym@pandora.be>
+ * Harald Welte <laforge@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/skbuff.h>
+#include <linux/if_arp.h>
+#include <linux/ip.h>
+#include <net/route.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter/xt_LOG.h>
+#include <net/netfilter/nf_log.h>
+
+static struct nf_loginfo default_loginfo = {
+ .type = NF_LOG_TYPE_LOG,
+ .u = {
+ .log = {
+ .level = LOGLEVEL_NOTICE,
+ .logflags = NF_LOG_MASK,
+ },
+ },
+};
+
+struct arppayload {
+ unsigned char mac_src[ETH_ALEN];
+ unsigned char ip_src[4];
+ unsigned char mac_dst[ETH_ALEN];
+ unsigned char ip_dst[4];
+};
+
+static void dump_arp_packet(struct nf_log_buf *m,
+ const struct nf_loginfo *info,
+ const struct sk_buff *skb, unsigned int nhoff)
+{
+ const struct arphdr *ah;
+ struct arphdr _arph;
+ const struct arppayload *ap;
+ struct arppayload _arpp;
+
+ ah = skb_header_pointer(skb, 0, sizeof(_arph), &_arph);
+ if (ah == NULL) {
+ nf_log_buf_add(m, "TRUNCATED");
+ return;
+ }
+ nf_log_buf_add(m, "ARP HTYPE=%d PTYPE=0x%04x OPCODE=%d",
+ ntohs(ah->ar_hrd), ntohs(ah->ar_pro), ntohs(ah->ar_op));
+
+ /* If it's for Ethernet and the lengths are OK, then log the ARP
+ * payload.
+ */
+ if (ah->ar_hrd != htons(1) ||
+ ah->ar_hln != ETH_ALEN ||
+ ah->ar_pln != sizeof(__be32))
+ return;
+
+ ap = skb_header_pointer(skb, sizeof(_arph), sizeof(_arpp), &_arpp);
+ if (ap == NULL) {
+ nf_log_buf_add(m, " INCOMPLETE [%Zu bytes]",
+ skb->len - sizeof(_arph));
+ return;
+ }
+ nf_log_buf_add(m, " MACSRC=%pM IPSRC=%pI4 MACDST=%pM IPDST=%pI4",
+ ap->mac_src, ap->ip_src, ap->mac_dst, ap->ip_dst);
+}
+
+static void nf_log_arp_packet(struct net *net, u_int8_t pf,
+ unsigned int hooknum, const struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ const struct nf_loginfo *loginfo,
+ const char *prefix)
+{
+ struct nf_log_buf *m;
+
+ /* FIXME: Disabled from containers until syslog ns is supported */
+ if (!net_eq(net, &init_net))
+ return;
+
+ m = nf_log_buf_open();
+
+ if (!loginfo)
+ loginfo = &default_loginfo;
+
+ nf_log_dump_packet_common(m, pf, hooknum, skb, in, out, loginfo,
+ prefix);
+ dump_arp_packet(m, loginfo, skb, 0);
+
+ nf_log_buf_close(m);
+}
+
+static struct nf_logger nf_arp_logger __read_mostly = {
+ .name = "nf_log_arp",
+ .type = NF_LOG_TYPE_LOG,
+ .logfn = nf_log_arp_packet,
+ .me = THIS_MODULE,
+};
+
+static int __net_init nf_log_arp_net_init(struct net *net)
+{
+ nf_log_set(net, NFPROTO_ARP, &nf_arp_logger);
+ return 0;
+}
+
+static void __net_exit nf_log_arp_net_exit(struct net *net)
+{
+ nf_log_unset(net, &nf_arp_logger);
+}
+
+static struct pernet_operations nf_log_arp_net_ops = {
+ .init = nf_log_arp_net_init,
+ .exit = nf_log_arp_net_exit,
+};
+
+static int __init nf_log_arp_init(void)
+{
+ int ret;
+
+ ret = register_pernet_subsys(&nf_log_arp_net_ops);
+ if (ret < 0)
+ return ret;
+
+ ret = nf_log_register(NFPROTO_ARP, &nf_arp_logger);
+ if (ret < 0) {
+ pr_err("failed to register logger\n");
+ goto err1;
+ }
+
+ return 0;
+
+err1:
+ unregister_pernet_subsys(&nf_log_arp_net_ops);
+ return ret;
+}
+
+static void __exit nf_log_arp_exit(void)
+{
+ unregister_pernet_subsys(&nf_log_arp_net_ops);
+ nf_log_unregister(&nf_arp_logger);
+}
+
+module_init(nf_log_arp_init);
+module_exit(nf_log_arp_exit);
+
+MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
+MODULE_DESCRIPTION("Netfilter ARP packet logging");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NF_LOGGER(3, 0);
diff --git a/net/ipv4/netfilter/nf_log_ipv4.c b/net/ipv4/netfilter/nf_log_ipv4.c
new file mode 100644
index 000000000..076aadda0
--- /dev/null
+++ b/net/ipv4/netfilter/nf_log_ipv4.c
@@ -0,0 +1,397 @@
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/skbuff.h>
+#include <linux/if_arp.h>
+#include <linux/ip.h>
+#include <net/ipv6.h>
+#include <net/icmp.h>
+#include <net/udp.h>
+#include <net/tcp.h>
+#include <net/route.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter/xt_LOG.h>
+#include <net/netfilter/nf_log.h>
+
+static struct nf_loginfo default_loginfo = {
+ .type = NF_LOG_TYPE_LOG,
+ .u = {
+ .log = {
+ .level = LOGLEVEL_NOTICE,
+ .logflags = NF_LOG_MASK,
+ },
+ },
+};
+
+/* One level of recursion won't kill us */
+static void dump_ipv4_packet(struct nf_log_buf *m,
+ const struct nf_loginfo *info,
+ const struct sk_buff *skb, unsigned int iphoff)
+{
+ struct iphdr _iph;
+ const struct iphdr *ih;
+ unsigned int logflags;
+
+ if (info->type == NF_LOG_TYPE_LOG)
+ logflags = info->u.log.logflags;
+ else
+ logflags = NF_LOG_MASK;
+
+ ih = skb_header_pointer(skb, iphoff, sizeof(_iph), &_iph);
+ if (ih == NULL) {
+ nf_log_buf_add(m, "TRUNCATED");
+ return;
+ }
+
+ /* Important fields:
+ * TOS, len, DF/MF, fragment offset, TTL, src, dst, options. */
+ /* Max length: 40 "SRC=255.255.255.255 DST=255.255.255.255 " */
+ nf_log_buf_add(m, "SRC=%pI4 DST=%pI4 ", &ih->saddr, &ih->daddr);
+
+ /* Max length: 46 "LEN=65535 TOS=0xFF PREC=0xFF TTL=255 ID=65535 " */
+ nf_log_buf_add(m, "LEN=%u TOS=0x%02X PREC=0x%02X TTL=%u ID=%u ",
+ ntohs(ih->tot_len), ih->tos & IPTOS_TOS_MASK,
+ ih->tos & IPTOS_PREC_MASK, ih->ttl, ntohs(ih->id));
+
+ /* Max length: 6 "CE DF MF " */
+ if (ntohs(ih->frag_off) & IP_CE)
+ nf_log_buf_add(m, "CE ");
+ if (ntohs(ih->frag_off) & IP_DF)
+ nf_log_buf_add(m, "DF ");
+ if (ntohs(ih->frag_off) & IP_MF)
+ nf_log_buf_add(m, "MF ");
+
+ /* Max length: 11 "FRAG:65535 " */
+ if (ntohs(ih->frag_off) & IP_OFFSET)
+ nf_log_buf_add(m, "FRAG:%u ", ntohs(ih->frag_off) & IP_OFFSET);
+
+ if ((logflags & XT_LOG_IPOPT) &&
+ ih->ihl * 4 > sizeof(struct iphdr)) {
+ const unsigned char *op;
+ unsigned char _opt[4 * 15 - sizeof(struct iphdr)];
+ unsigned int i, optsize;
+
+ optsize = ih->ihl * 4 - sizeof(struct iphdr);
+ op = skb_header_pointer(skb, iphoff+sizeof(_iph),
+ optsize, _opt);
+ if (op == NULL) {
+ nf_log_buf_add(m, "TRUNCATED");
+ return;
+ }
+
+ /* Max length: 127 "OPT (" 15*4*2chars ") " */
+ nf_log_buf_add(m, "OPT (");
+ for (i = 0; i < optsize; i++)
+ nf_log_buf_add(m, "%02X", op[i]);
+ nf_log_buf_add(m, ") ");
+ }
+
+ switch (ih->protocol) {
+ case IPPROTO_TCP:
+ if (nf_log_dump_tcp_header(m, skb, ih->protocol,
+ ntohs(ih->frag_off) & IP_OFFSET,
+ iphoff+ih->ihl*4, logflags))
+ return;
+ break;
+ case IPPROTO_UDP:
+ case IPPROTO_UDPLITE:
+ if (nf_log_dump_udp_header(m, skb, ih->protocol,
+ ntohs(ih->frag_off) & IP_OFFSET,
+ iphoff+ih->ihl*4))
+ return;
+ break;
+ case IPPROTO_ICMP: {
+ struct icmphdr _icmph;
+ const struct icmphdr *ich;
+ static const size_t required_len[NR_ICMP_TYPES+1]
+ = { [ICMP_ECHOREPLY] = 4,
+ [ICMP_DEST_UNREACH]
+ = 8 + sizeof(struct iphdr),
+ [ICMP_SOURCE_QUENCH]
+ = 8 + sizeof(struct iphdr),
+ [ICMP_REDIRECT]
+ = 8 + sizeof(struct iphdr),
+ [ICMP_ECHO] = 4,
+ [ICMP_TIME_EXCEEDED]
+ = 8 + sizeof(struct iphdr),
+ [ICMP_PARAMETERPROB]
+ = 8 + sizeof(struct iphdr),
+ [ICMP_TIMESTAMP] = 20,
+ [ICMP_TIMESTAMPREPLY] = 20,
+ [ICMP_ADDRESS] = 12,
+ [ICMP_ADDRESSREPLY] = 12 };
+
+ /* Max length: 11 "PROTO=ICMP " */
+ nf_log_buf_add(m, "PROTO=ICMP ");
+
+ if (ntohs(ih->frag_off) & IP_OFFSET)
+ break;
+
+ /* Max length: 25 "INCOMPLETE [65535 bytes] " */
+ ich = skb_header_pointer(skb, iphoff + ih->ihl * 4,
+ sizeof(_icmph), &_icmph);
+ if (ich == NULL) {
+ nf_log_buf_add(m, "INCOMPLETE [%u bytes] ",
+ skb->len - iphoff - ih->ihl*4);
+ break;
+ }
+
+ /* Max length: 18 "TYPE=255 CODE=255 " */
+ nf_log_buf_add(m, "TYPE=%u CODE=%u ", ich->type, ich->code);
+
+ /* Max length: 25 "INCOMPLETE [65535 bytes] " */
+ if (ich->type <= NR_ICMP_TYPES &&
+ required_len[ich->type] &&
+ skb->len-iphoff-ih->ihl*4 < required_len[ich->type]) {
+ nf_log_buf_add(m, "INCOMPLETE [%u bytes] ",
+ skb->len - iphoff - ih->ihl*4);
+ break;
+ }
+
+ switch (ich->type) {
+ case ICMP_ECHOREPLY:
+ case ICMP_ECHO:
+ /* Max length: 19 "ID=65535 SEQ=65535 " */
+ nf_log_buf_add(m, "ID=%u SEQ=%u ",
+ ntohs(ich->un.echo.id),
+ ntohs(ich->un.echo.sequence));
+ break;
+
+ case ICMP_PARAMETERPROB:
+ /* Max length: 14 "PARAMETER=255 " */
+ nf_log_buf_add(m, "PARAMETER=%u ",
+ ntohl(ich->un.gateway) >> 24);
+ break;
+ case ICMP_REDIRECT:
+ /* Max length: 24 "GATEWAY=255.255.255.255 " */
+ nf_log_buf_add(m, "GATEWAY=%pI4 ", &ich->un.gateway);
+ /* Fall through */
+ case ICMP_DEST_UNREACH:
+ case ICMP_SOURCE_QUENCH:
+ case ICMP_TIME_EXCEEDED:
+ /* Max length: 3+maxlen */
+ if (!iphoff) { /* Only recurse once. */
+ nf_log_buf_add(m, "[");
+ dump_ipv4_packet(m, info, skb,
+ iphoff + ih->ihl*4+sizeof(_icmph));
+ nf_log_buf_add(m, "] ");
+ }
+
+ /* Max length: 10 "MTU=65535 " */
+ if (ich->type == ICMP_DEST_UNREACH &&
+ ich->code == ICMP_FRAG_NEEDED) {
+ nf_log_buf_add(m, "MTU=%u ",
+ ntohs(ich->un.frag.mtu));
+ }
+ }
+ break;
+ }
+ /* Max Length */
+ case IPPROTO_AH: {
+ struct ip_auth_hdr _ahdr;
+ const struct ip_auth_hdr *ah;
+
+ if (ntohs(ih->frag_off) & IP_OFFSET)
+ break;
+
+ /* Max length: 9 "PROTO=AH " */
+ nf_log_buf_add(m, "PROTO=AH ");
+
+ /* Max length: 25 "INCOMPLETE [65535 bytes] " */
+ ah = skb_header_pointer(skb, iphoff+ih->ihl*4,
+ sizeof(_ahdr), &_ahdr);
+ if (ah == NULL) {
+ nf_log_buf_add(m, "INCOMPLETE [%u bytes] ",
+ skb->len - iphoff - ih->ihl*4);
+ break;
+ }
+
+ /* Length: 15 "SPI=0xF1234567 " */
+ nf_log_buf_add(m, "SPI=0x%x ", ntohl(ah->spi));
+ break;
+ }
+ case IPPROTO_ESP: {
+ struct ip_esp_hdr _esph;
+ const struct ip_esp_hdr *eh;
+
+ /* Max length: 10 "PROTO=ESP " */
+ nf_log_buf_add(m, "PROTO=ESP ");
+
+ if (ntohs(ih->frag_off) & IP_OFFSET)
+ break;
+
+ /* Max length: 25 "INCOMPLETE [65535 bytes] " */
+ eh = skb_header_pointer(skb, iphoff+ih->ihl*4,
+ sizeof(_esph), &_esph);
+ if (eh == NULL) {
+ nf_log_buf_add(m, "INCOMPLETE [%u bytes] ",
+ skb->len - iphoff - ih->ihl*4);
+ break;
+ }
+
+ /* Length: 15 "SPI=0xF1234567 " */
+ nf_log_buf_add(m, "SPI=0x%x ", ntohl(eh->spi));
+ break;
+ }
+ /* Max length: 10 "PROTO 255 " */
+ default:
+ nf_log_buf_add(m, "PROTO=%u ", ih->protocol);
+ }
+
+ /* Max length: 15 "UID=4294967295 " */
+ if ((logflags & XT_LOG_UID) && !iphoff)
+ nf_log_dump_sk_uid_gid(m, skb->sk);
+
+ /* Max length: 16 "MARK=0xFFFFFFFF " */
+ if (!iphoff && skb->mark)
+ nf_log_buf_add(m, "MARK=0x%x ", skb->mark);
+
+ /* Proto Max log string length */
+ /* IP: 40+46+6+11+127 = 230 */
+ /* TCP: 10+max(25,20+30+13+9+32+11+127) = 252 */
+ /* UDP: 10+max(25,20) = 35 */
+ /* UDPLITE: 14+max(25,20) = 39 */
+ /* ICMP: 11+max(25, 18+25+max(19,14,24+3+n+10,3+n+10)) = 91+n */
+ /* ESP: 10+max(25)+15 = 50 */
+ /* AH: 9+max(25)+15 = 49 */
+ /* unknown: 10 */
+
+ /* (ICMP allows recursion one level deep) */
+ /* maxlen = IP + ICMP + IP + max(TCP,UDP,ICMP,unknown) */
+ /* maxlen = 230+ 91 + 230 + 252 = 803 */
+}
+
+static void dump_ipv4_mac_header(struct nf_log_buf *m,
+ const struct nf_loginfo *info,
+ const struct sk_buff *skb)
+{
+ struct net_device *dev = skb->dev;
+ unsigned int logflags = 0;
+
+ if (info->type == NF_LOG_TYPE_LOG)
+ logflags = info->u.log.logflags;
+
+ if (!(logflags & XT_LOG_MACDECODE))
+ goto fallback;
+
+ switch (dev->type) {
+ case ARPHRD_ETHER:
+ nf_log_buf_add(m, "MACSRC=%pM MACDST=%pM MACPROTO=%04x ",
+ eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest,
+ ntohs(eth_hdr(skb)->h_proto));
+ return;
+ default:
+ break;
+ }
+
+fallback:
+ nf_log_buf_add(m, "MAC=");
+ if (dev->hard_header_len &&
+ skb->mac_header != skb->network_header) {
+ const unsigned char *p = skb_mac_header(skb);
+ unsigned int i;
+
+ nf_log_buf_add(m, "%02x", *p++);
+ for (i = 1; i < dev->hard_header_len; i++, p++)
+ nf_log_buf_add(m, ":%02x", *p);
+ }
+ nf_log_buf_add(m, " ");
+}
+
+static void nf_log_ip_packet(struct net *net, u_int8_t pf,
+ unsigned int hooknum, const struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ const struct nf_loginfo *loginfo,
+ const char *prefix)
+{
+ struct nf_log_buf *m;
+
+ /* FIXME: Disabled from containers until syslog ns is supported */
+ if (!net_eq(net, &init_net))
+ return;
+
+ m = nf_log_buf_open();
+
+ if (!loginfo)
+ loginfo = &default_loginfo;
+
+ nf_log_dump_packet_common(m, pf, hooknum, skb, in,
+ out, loginfo, prefix);
+
+ if (in != NULL)
+ dump_ipv4_mac_header(m, loginfo, skb);
+
+ dump_ipv4_packet(m, loginfo, skb, 0);
+
+ nf_log_buf_close(m);
+}
+
+static struct nf_logger nf_ip_logger __read_mostly = {
+ .name = "nf_log_ipv4",
+ .type = NF_LOG_TYPE_LOG,
+ .logfn = nf_log_ip_packet,
+ .me = THIS_MODULE,
+};
+
+static int __net_init nf_log_ipv4_net_init(struct net *net)
+{
+ nf_log_set(net, NFPROTO_IPV4, &nf_ip_logger);
+ return 0;
+}
+
+static void __net_exit nf_log_ipv4_net_exit(struct net *net)
+{
+ nf_log_unset(net, &nf_ip_logger);
+}
+
+static struct pernet_operations nf_log_ipv4_net_ops = {
+ .init = nf_log_ipv4_net_init,
+ .exit = nf_log_ipv4_net_exit,
+};
+
+static int __init nf_log_ipv4_init(void)
+{
+ int ret;
+
+ ret = register_pernet_subsys(&nf_log_ipv4_net_ops);
+ if (ret < 0)
+ return ret;
+
+ ret = nf_log_register(NFPROTO_IPV4, &nf_ip_logger);
+ if (ret < 0) {
+ pr_err("failed to register logger\n");
+ goto err1;
+ }
+
+ return 0;
+
+err1:
+ unregister_pernet_subsys(&nf_log_ipv4_net_ops);
+ return ret;
+}
+
+static void __exit nf_log_ipv4_exit(void)
+{
+ unregister_pernet_subsys(&nf_log_ipv4_net_ops);
+ nf_log_unregister(&nf_ip_logger);
+}
+
+module_init(nf_log_ipv4_init);
+module_exit(nf_log_ipv4_exit);
+
+MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
+MODULE_DESCRIPTION("Netfilter IPv4 packet logging");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NF_LOGGER(AF_INET, 0);
diff --git a/net/ipv4/netfilter/nf_nat_h323.c b/net/ipv4/netfilter/nf_nat_h323.c
new file mode 100644
index 000000000..574f7ebba
--- /dev/null
+++ b/net/ipv4/netfilter/nf_nat_h323.c
@@ -0,0 +1,631 @@
+/*
+ * H.323 extension for NAT alteration.
+ *
+ * Copyright (c) 2006 Jing Min Zhao <zhaojingmin@users.sourceforge.net>
+ * Copyright (c) 2006-2012 Patrick McHardy <kaber@trash.net>
+ *
+ * This source code is licensed under General Public License version 2.
+ *
+ * Based on the 'brute force' H.323 NAT module by
+ * Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ */
+
+#include <linux/module.h>
+#include <linux/tcp.h>
+#include <net/tcp.h>
+
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat_helper.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+#include <linux/netfilter/nf_conntrack_h323.h>
+
+/****************************************************************************/
+static int set_addr(struct sk_buff *skb, unsigned int protoff,
+ unsigned char **data, int dataoff,
+ unsigned int addroff, __be32 ip, __be16 port)
+{
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+ struct {
+ __be32 ip;
+ __be16 port;
+ } __attribute__ ((__packed__)) buf;
+ const struct tcphdr *th;
+ struct tcphdr _tcph;
+
+ buf.ip = ip;
+ buf.port = port;
+ addroff += dataoff;
+
+ if (ip_hdr(skb)->protocol == IPPROTO_TCP) {
+ if (!nf_nat_mangle_tcp_packet(skb, ct, ctinfo,
+ protoff, addroff, sizeof(buf),
+ (char *) &buf, sizeof(buf))) {
+ net_notice_ratelimited("nf_nat_h323: nf_nat_mangle_tcp_packet error\n");
+ return -1;
+ }
+
+ /* Relocate data pointer */
+ th = skb_header_pointer(skb, ip_hdrlen(skb),
+ sizeof(_tcph), &_tcph);
+ if (th == NULL)
+ return -1;
+ *data = skb->data + ip_hdrlen(skb) + th->doff * 4 + dataoff;
+ } else {
+ if (!nf_nat_mangle_udp_packet(skb, ct, ctinfo,
+ protoff, addroff, sizeof(buf),
+ (char *) &buf, sizeof(buf))) {
+ net_notice_ratelimited("nf_nat_h323: nf_nat_mangle_udp_packet error\n");
+ return -1;
+ }
+ /* nf_nat_mangle_udp_packet uses skb_make_writable() to copy
+ * or pull everything in a linear buffer, so we can safely
+ * use the skb pointers now */
+ *data = skb->data + ip_hdrlen(skb) + sizeof(struct udphdr);
+ }
+
+ return 0;
+}
+
+/****************************************************************************/
+static int set_h225_addr(struct sk_buff *skb, unsigned int protoff,
+ unsigned char **data, int dataoff,
+ TransportAddress *taddr,
+ union nf_inet_addr *addr, __be16 port)
+{
+ return set_addr(skb, protoff, data, dataoff, taddr->ipAddress.ip,
+ addr->ip, port);
+}
+
+/****************************************************************************/
+static int set_h245_addr(struct sk_buff *skb, unsigned protoff,
+ unsigned char **data, int dataoff,
+ H245_TransportAddress *taddr,
+ union nf_inet_addr *addr, __be16 port)
+{
+ return set_addr(skb, protoff, data, dataoff,
+ taddr->unicastAddress.iPAddress.network,
+ addr->ip, port);
+}
+
+/****************************************************************************/
+static int set_sig_addr(struct sk_buff *skb, struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ unsigned int protoff, unsigned char **data,
+ TransportAddress *taddr, int count)
+{
+ const struct nf_ct_h323_master *info = nfct_help_data(ct);
+ int dir = CTINFO2DIR(ctinfo);
+ int i;
+ __be16 port;
+ union nf_inet_addr addr;
+
+ for (i = 0; i < count; i++) {
+ if (get_h225_addr(ct, *data, &taddr[i], &addr, &port)) {
+ if (addr.ip == ct->tuplehash[dir].tuple.src.u3.ip &&
+ port == info->sig_port[dir]) {
+ /* GW->GK */
+
+ /* Fix for Gnomemeeting */
+ if (i > 0 &&
+ get_h225_addr(ct, *data, &taddr[0],
+ &addr, &port) &&
+ (ntohl(addr.ip) & 0xff000000) == 0x7f000000)
+ i = 0;
+
+ pr_debug("nf_nat_ras: set signal address %pI4:%hu->%pI4:%hu\n",
+ &addr.ip, port,
+ &ct->tuplehash[!dir].tuple.dst.u3.ip,
+ info->sig_port[!dir]);
+ return set_h225_addr(skb, protoff, data, 0,
+ &taddr[i],
+ &ct->tuplehash[!dir].
+ tuple.dst.u3,
+ info->sig_port[!dir]);
+ } else if (addr.ip == ct->tuplehash[dir].tuple.dst.u3.ip &&
+ port == info->sig_port[dir]) {
+ /* GK->GW */
+ pr_debug("nf_nat_ras: set signal address %pI4:%hu->%pI4:%hu\n",
+ &addr.ip, port,
+ &ct->tuplehash[!dir].tuple.src.u3.ip,
+ info->sig_port[!dir]);
+ return set_h225_addr(skb, protoff, data, 0,
+ &taddr[i],
+ &ct->tuplehash[!dir].
+ tuple.src.u3,
+ info->sig_port[!dir]);
+ }
+ }
+ }
+
+ return 0;
+}
+
+/****************************************************************************/
+static int set_ras_addr(struct sk_buff *skb, struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ unsigned int protoff, unsigned char **data,
+ TransportAddress *taddr, int count)
+{
+ int dir = CTINFO2DIR(ctinfo);
+ int i;
+ __be16 port;
+ union nf_inet_addr addr;
+
+ for (i = 0; i < count; i++) {
+ if (get_h225_addr(ct, *data, &taddr[i], &addr, &port) &&
+ addr.ip == ct->tuplehash[dir].tuple.src.u3.ip &&
+ port == ct->tuplehash[dir].tuple.src.u.udp.port) {
+ pr_debug("nf_nat_ras: set rasAddress %pI4:%hu->%pI4:%hu\n",
+ &addr.ip, ntohs(port),
+ &ct->tuplehash[!dir].tuple.dst.u3.ip,
+ ntohs(ct->tuplehash[!dir].tuple.dst.u.udp.port));
+ return set_h225_addr(skb, protoff, data, 0, &taddr[i],
+ &ct->tuplehash[!dir].tuple.dst.u3,
+ ct->tuplehash[!dir].tuple.
+ dst.u.udp.port);
+ }
+ }
+
+ return 0;
+}
+
+/****************************************************************************/
+static int nat_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ unsigned int protoff, unsigned char **data, int dataoff,
+ H245_TransportAddress *taddr,
+ __be16 port, __be16 rtp_port,
+ struct nf_conntrack_expect *rtp_exp,
+ struct nf_conntrack_expect *rtcp_exp)
+{
+ struct nf_ct_h323_master *info = nfct_help_data(ct);
+ int dir = CTINFO2DIR(ctinfo);
+ int i;
+ u_int16_t nated_port;
+
+ /* Set expectations for NAT */
+ rtp_exp->saved_proto.udp.port = rtp_exp->tuple.dst.u.udp.port;
+ rtp_exp->expectfn = nf_nat_follow_master;
+ rtp_exp->dir = !dir;
+ rtcp_exp->saved_proto.udp.port = rtcp_exp->tuple.dst.u.udp.port;
+ rtcp_exp->expectfn = nf_nat_follow_master;
+ rtcp_exp->dir = !dir;
+
+ /* Lookup existing expects */
+ for (i = 0; i < H323_RTP_CHANNEL_MAX; i++) {
+ if (info->rtp_port[i][dir] == rtp_port) {
+ /* Expected */
+
+ /* Use allocated ports first. This will refresh
+ * the expects */
+ rtp_exp->tuple.dst.u.udp.port = info->rtp_port[i][dir];
+ rtcp_exp->tuple.dst.u.udp.port =
+ htons(ntohs(info->rtp_port[i][dir]) + 1);
+ break;
+ } else if (info->rtp_port[i][dir] == 0) {
+ /* Not expected */
+ break;
+ }
+ }
+
+ /* Run out of expectations */
+ if (i >= H323_RTP_CHANNEL_MAX) {
+ net_notice_ratelimited("nf_nat_h323: out of expectations\n");
+ return 0;
+ }
+
+ /* Try to get a pair of ports. */
+ for (nated_port = ntohs(rtp_exp->tuple.dst.u.udp.port);
+ nated_port != 0; nated_port += 2) {
+ int ret;
+
+ rtp_exp->tuple.dst.u.udp.port = htons(nated_port);
+ ret = nf_ct_expect_related(rtp_exp);
+ if (ret == 0) {
+ rtcp_exp->tuple.dst.u.udp.port =
+ htons(nated_port + 1);
+ ret = nf_ct_expect_related(rtcp_exp);
+ if (ret == 0)
+ break;
+ else if (ret == -EBUSY) {
+ nf_ct_unexpect_related(rtp_exp);
+ continue;
+ } else if (ret < 0) {
+ nf_ct_unexpect_related(rtp_exp);
+ nated_port = 0;
+ break;
+ }
+ } else if (ret != -EBUSY) {
+ nated_port = 0;
+ break;
+ }
+ }
+
+ if (nated_port == 0) { /* No port available */
+ net_notice_ratelimited("nf_nat_h323: out of RTP ports\n");
+ return 0;
+ }
+
+ /* Modify signal */
+ if (set_h245_addr(skb, protoff, data, dataoff, taddr,
+ &ct->tuplehash[!dir].tuple.dst.u3,
+ htons((port & htons(1)) ? nated_port + 1 :
+ nated_port)) == 0) {
+ /* Save ports */
+ info->rtp_port[i][dir] = rtp_port;
+ info->rtp_port[i][!dir] = htons(nated_port);
+ } else {
+ nf_ct_unexpect_related(rtp_exp);
+ nf_ct_unexpect_related(rtcp_exp);
+ return -1;
+ }
+
+ /* Success */
+ pr_debug("nf_nat_h323: expect RTP %pI4:%hu->%pI4:%hu\n",
+ &rtp_exp->tuple.src.u3.ip,
+ ntohs(rtp_exp->tuple.src.u.udp.port),
+ &rtp_exp->tuple.dst.u3.ip,
+ ntohs(rtp_exp->tuple.dst.u.udp.port));
+ pr_debug("nf_nat_h323: expect RTCP %pI4:%hu->%pI4:%hu\n",
+ &rtcp_exp->tuple.src.u3.ip,
+ ntohs(rtcp_exp->tuple.src.u.udp.port),
+ &rtcp_exp->tuple.dst.u3.ip,
+ ntohs(rtcp_exp->tuple.dst.u.udp.port));
+
+ return 0;
+}
+
+/****************************************************************************/
+static int nat_t120(struct sk_buff *skb, struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ unsigned int protoff, unsigned char **data, int dataoff,
+ H245_TransportAddress *taddr, __be16 port,
+ struct nf_conntrack_expect *exp)
+{
+ int dir = CTINFO2DIR(ctinfo);
+ u_int16_t nated_port = ntohs(port);
+
+ /* Set expectations for NAT */
+ exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port;
+ exp->expectfn = nf_nat_follow_master;
+ exp->dir = !dir;
+
+ /* Try to get same port: if not, try to change it. */
+ for (; nated_port != 0; nated_port++) {
+ int ret;
+
+ exp->tuple.dst.u.tcp.port = htons(nated_port);
+ ret = nf_ct_expect_related(exp);
+ if (ret == 0)
+ break;
+ else if (ret != -EBUSY) {
+ nated_port = 0;
+ break;
+ }
+ }
+
+ if (nated_port == 0) { /* No port available */
+ net_notice_ratelimited("nf_nat_h323: out of TCP ports\n");
+ return 0;
+ }
+
+ /* Modify signal */
+ if (set_h245_addr(skb, protoff, data, dataoff, taddr,
+ &ct->tuplehash[!dir].tuple.dst.u3,
+ htons(nated_port)) < 0) {
+ nf_ct_unexpect_related(exp);
+ return -1;
+ }
+
+ pr_debug("nf_nat_h323: expect T.120 %pI4:%hu->%pI4:%hu\n",
+ &exp->tuple.src.u3.ip,
+ ntohs(exp->tuple.src.u.tcp.port),
+ &exp->tuple.dst.u3.ip,
+ ntohs(exp->tuple.dst.u.tcp.port));
+
+ return 0;
+}
+
+/****************************************************************************/
+static int nat_h245(struct sk_buff *skb, struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ unsigned int protoff, unsigned char **data, int dataoff,
+ TransportAddress *taddr, __be16 port,
+ struct nf_conntrack_expect *exp)
+{
+ struct nf_ct_h323_master *info = nfct_help_data(ct);
+ int dir = CTINFO2DIR(ctinfo);
+ u_int16_t nated_port = ntohs(port);
+
+ /* Set expectations for NAT */
+ exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port;
+ exp->expectfn = nf_nat_follow_master;
+ exp->dir = !dir;
+
+ /* Check existing expects */
+ if (info->sig_port[dir] == port)
+ nated_port = ntohs(info->sig_port[!dir]);
+
+ /* Try to get same port: if not, try to change it. */
+ for (; nated_port != 0; nated_port++) {
+ int ret;
+
+ exp->tuple.dst.u.tcp.port = htons(nated_port);
+ ret = nf_ct_expect_related(exp);
+ if (ret == 0)
+ break;
+ else if (ret != -EBUSY) {
+ nated_port = 0;
+ break;
+ }
+ }
+
+ if (nated_port == 0) { /* No port available */
+ net_notice_ratelimited("nf_nat_q931: out of TCP ports\n");
+ return 0;
+ }
+
+ /* Modify signal */
+ if (set_h225_addr(skb, protoff, data, dataoff, taddr,
+ &ct->tuplehash[!dir].tuple.dst.u3,
+ htons(nated_port)) == 0) {
+ /* Save ports */
+ info->sig_port[dir] = port;
+ info->sig_port[!dir] = htons(nated_port);
+ } else {
+ nf_ct_unexpect_related(exp);
+ return -1;
+ }
+
+ pr_debug("nf_nat_q931: expect H.245 %pI4:%hu->%pI4:%hu\n",
+ &exp->tuple.src.u3.ip,
+ ntohs(exp->tuple.src.u.tcp.port),
+ &exp->tuple.dst.u3.ip,
+ ntohs(exp->tuple.dst.u.tcp.port));
+
+ return 0;
+}
+
+/****************************************************************************
+ * This conntrack expect function replaces nf_conntrack_q931_expect()
+ * which was set by nf_conntrack_h323.c.
+ ****************************************************************************/
+static void ip_nat_q931_expect(struct nf_conn *new,
+ struct nf_conntrack_expect *this)
+{
+ struct nf_nat_range range;
+
+ if (this->tuple.src.u3.ip != 0) { /* Only accept calls from GK */
+ nf_nat_follow_master(new, this);
+ return;
+ }
+
+ /* This must be a fresh one. */
+ BUG_ON(new->status & IPS_NAT_DONE_MASK);
+
+ /* Change src to where master sends to */
+ range.flags = NF_NAT_RANGE_MAP_IPS;
+ range.min_addr = range.max_addr =
+ new->tuplehash[!this->dir].tuple.src.u3;
+ nf_nat_setup_info(new, &range, NF_NAT_MANIP_SRC);
+
+ /* For DST manip, map port here to where it's expected. */
+ range.flags = (NF_NAT_RANGE_MAP_IPS | NF_NAT_RANGE_PROTO_SPECIFIED);
+ range.min_proto = range.max_proto = this->saved_proto;
+ range.min_addr = range.max_addr =
+ new->master->tuplehash[!this->dir].tuple.src.u3;
+ nf_nat_setup_info(new, &range, NF_NAT_MANIP_DST);
+}
+
+/****************************************************************************/
+static int nat_q931(struct sk_buff *skb, struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ unsigned int protoff, unsigned char **data,
+ TransportAddress *taddr, int idx,
+ __be16 port, struct nf_conntrack_expect *exp)
+{
+ struct nf_ct_h323_master *info = nfct_help_data(ct);
+ int dir = CTINFO2DIR(ctinfo);
+ u_int16_t nated_port = ntohs(port);
+ union nf_inet_addr addr;
+
+ /* Set expectations for NAT */
+ exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port;
+ exp->expectfn = ip_nat_q931_expect;
+ exp->dir = !dir;
+
+ /* Check existing expects */
+ if (info->sig_port[dir] == port)
+ nated_port = ntohs(info->sig_port[!dir]);
+
+ /* Try to get same port: if not, try to change it. */
+ for (; nated_port != 0; nated_port++) {
+ int ret;
+
+ exp->tuple.dst.u.tcp.port = htons(nated_port);
+ ret = nf_ct_expect_related(exp);
+ if (ret == 0)
+ break;
+ else if (ret != -EBUSY) {
+ nated_port = 0;
+ break;
+ }
+ }
+
+ if (nated_port == 0) { /* No port available */
+ net_notice_ratelimited("nf_nat_ras: out of TCP ports\n");
+ return 0;
+ }
+
+ /* Modify signal */
+ if (set_h225_addr(skb, protoff, data, 0, &taddr[idx],
+ &ct->tuplehash[!dir].tuple.dst.u3,
+ htons(nated_port)) == 0) {
+ /* Save ports */
+ info->sig_port[dir] = port;
+ info->sig_port[!dir] = htons(nated_port);
+
+ /* Fix for Gnomemeeting */
+ if (idx > 0 &&
+ get_h225_addr(ct, *data, &taddr[0], &addr, &port) &&
+ (ntohl(addr.ip) & 0xff000000) == 0x7f000000) {
+ set_h225_addr(skb, protoff, data, 0, &taddr[0],
+ &ct->tuplehash[!dir].tuple.dst.u3,
+ info->sig_port[!dir]);
+ }
+ } else {
+ nf_ct_unexpect_related(exp);
+ return -1;
+ }
+
+ /* Success */
+ pr_debug("nf_nat_ras: expect Q.931 %pI4:%hu->%pI4:%hu\n",
+ &exp->tuple.src.u3.ip,
+ ntohs(exp->tuple.src.u.tcp.port),
+ &exp->tuple.dst.u3.ip,
+ ntohs(exp->tuple.dst.u.tcp.port));
+
+ return 0;
+}
+
+/****************************************************************************/
+static void ip_nat_callforwarding_expect(struct nf_conn *new,
+ struct nf_conntrack_expect *this)
+{
+ struct nf_nat_range range;
+
+ /* This must be a fresh one. */
+ BUG_ON(new->status & IPS_NAT_DONE_MASK);
+
+ /* Change src to where master sends to */
+ range.flags = NF_NAT_RANGE_MAP_IPS;
+ range.min_addr = range.max_addr =
+ new->tuplehash[!this->dir].tuple.src.u3;
+ nf_nat_setup_info(new, &range, NF_NAT_MANIP_SRC);
+
+ /* For DST manip, map port here to where it's expected. */
+ range.flags = (NF_NAT_RANGE_MAP_IPS | NF_NAT_RANGE_PROTO_SPECIFIED);
+ range.min_proto = range.max_proto = this->saved_proto;
+ range.min_addr = range.max_addr = this->saved_addr;
+ nf_nat_setup_info(new, &range, NF_NAT_MANIP_DST);
+}
+
+/****************************************************************************/
+static int nat_callforwarding(struct sk_buff *skb, struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ unsigned int protoff,
+ unsigned char **data, int dataoff,
+ TransportAddress *taddr, __be16 port,
+ struct nf_conntrack_expect *exp)
+{
+ int dir = CTINFO2DIR(ctinfo);
+ u_int16_t nated_port;
+
+ /* Set expectations for NAT */
+ exp->saved_addr = exp->tuple.dst.u3;
+ exp->tuple.dst.u3.ip = ct->tuplehash[!dir].tuple.dst.u3.ip;
+ exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port;
+ exp->expectfn = ip_nat_callforwarding_expect;
+ exp->dir = !dir;
+
+ /* Try to get same port: if not, try to change it. */
+ for (nated_port = ntohs(port); nated_port != 0; nated_port++) {
+ int ret;
+
+ exp->tuple.dst.u.tcp.port = htons(nated_port);
+ ret = nf_ct_expect_related(exp);
+ if (ret == 0)
+ break;
+ else if (ret != -EBUSY) {
+ nated_port = 0;
+ break;
+ }
+ }
+
+ if (nated_port == 0) { /* No port available */
+ net_notice_ratelimited("nf_nat_q931: out of TCP ports\n");
+ return 0;
+ }
+
+ /* Modify signal */
+ if (!set_h225_addr(skb, protoff, data, dataoff, taddr,
+ &ct->tuplehash[!dir].tuple.dst.u3,
+ htons(nated_port)) == 0) {
+ nf_ct_unexpect_related(exp);
+ return -1;
+ }
+
+ /* Success */
+ pr_debug("nf_nat_q931: expect Call Forwarding %pI4:%hu->%pI4:%hu\n",
+ &exp->tuple.src.u3.ip,
+ ntohs(exp->tuple.src.u.tcp.port),
+ &exp->tuple.dst.u3.ip,
+ ntohs(exp->tuple.dst.u.tcp.port));
+
+ return 0;
+}
+
+static struct nf_ct_helper_expectfn q931_nat = {
+ .name = "Q.931",
+ .expectfn = ip_nat_q931_expect,
+};
+
+static struct nf_ct_helper_expectfn callforwarding_nat = {
+ .name = "callforwarding",
+ .expectfn = ip_nat_callforwarding_expect,
+};
+
+/****************************************************************************/
+static int __init init(void)
+{
+ BUG_ON(set_h245_addr_hook != NULL);
+ BUG_ON(set_h225_addr_hook != NULL);
+ BUG_ON(set_sig_addr_hook != NULL);
+ BUG_ON(set_ras_addr_hook != NULL);
+ BUG_ON(nat_rtp_rtcp_hook != NULL);
+ BUG_ON(nat_t120_hook != NULL);
+ BUG_ON(nat_h245_hook != NULL);
+ BUG_ON(nat_callforwarding_hook != NULL);
+ BUG_ON(nat_q931_hook != NULL);
+
+ RCU_INIT_POINTER(set_h245_addr_hook, set_h245_addr);
+ RCU_INIT_POINTER(set_h225_addr_hook, set_h225_addr);
+ RCU_INIT_POINTER(set_sig_addr_hook, set_sig_addr);
+ RCU_INIT_POINTER(set_ras_addr_hook, set_ras_addr);
+ RCU_INIT_POINTER(nat_rtp_rtcp_hook, nat_rtp_rtcp);
+ RCU_INIT_POINTER(nat_t120_hook, nat_t120);
+ RCU_INIT_POINTER(nat_h245_hook, nat_h245);
+ RCU_INIT_POINTER(nat_callforwarding_hook, nat_callforwarding);
+ RCU_INIT_POINTER(nat_q931_hook, nat_q931);
+ nf_ct_helper_expectfn_register(&q931_nat);
+ nf_ct_helper_expectfn_register(&callforwarding_nat);
+ return 0;
+}
+
+/****************************************************************************/
+static void __exit fini(void)
+{
+ RCU_INIT_POINTER(set_h245_addr_hook, NULL);
+ RCU_INIT_POINTER(set_h225_addr_hook, NULL);
+ RCU_INIT_POINTER(set_sig_addr_hook, NULL);
+ RCU_INIT_POINTER(set_ras_addr_hook, NULL);
+ RCU_INIT_POINTER(nat_rtp_rtcp_hook, NULL);
+ RCU_INIT_POINTER(nat_t120_hook, NULL);
+ RCU_INIT_POINTER(nat_h245_hook, NULL);
+ RCU_INIT_POINTER(nat_callforwarding_hook, NULL);
+ RCU_INIT_POINTER(nat_q931_hook, NULL);
+ nf_ct_helper_expectfn_unregister(&q931_nat);
+ nf_ct_helper_expectfn_unregister(&callforwarding_nat);
+ synchronize_rcu();
+}
+
+/****************************************************************************/
+module_init(init);
+module_exit(fini);
+
+MODULE_AUTHOR("Jing Min Zhao <zhaojingmin@users.sourceforge.net>");
+MODULE_DESCRIPTION("H.323 NAT helper");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("ip_nat_h323");
diff --git a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
new file mode 100644
index 000000000..e59cc05c0
--- /dev/null
+++ b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
@@ -0,0 +1,481 @@
+/*
+ * (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
+ * (C) 2011 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <linux/icmp.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <net/secure_seq.h>
+#include <net/checksum.h>
+#include <net/route.h>
+#include <net/ip.h>
+
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_nat_core.h>
+#include <net/netfilter/nf_nat_l3proto.h>
+#include <net/netfilter/nf_nat_l4proto.h>
+
+static const struct nf_nat_l3proto nf_nat_l3proto_ipv4;
+
+#ifdef CONFIG_XFRM
+static void nf_nat_ipv4_decode_session(struct sk_buff *skb,
+ const struct nf_conn *ct,
+ enum ip_conntrack_dir dir,
+ unsigned long statusbit,
+ struct flowi *fl)
+{
+ const struct nf_conntrack_tuple *t = &ct->tuplehash[dir].tuple;
+ struct flowi4 *fl4 = &fl->u.ip4;
+
+ if (ct->status & statusbit) {
+ fl4->daddr = t->dst.u3.ip;
+ if (t->dst.protonum == IPPROTO_TCP ||
+ t->dst.protonum == IPPROTO_UDP ||
+ t->dst.protonum == IPPROTO_UDPLITE ||
+ t->dst.protonum == IPPROTO_DCCP ||
+ t->dst.protonum == IPPROTO_SCTP)
+ fl4->fl4_dport = t->dst.u.all;
+ }
+
+ statusbit ^= IPS_NAT_MASK;
+
+ if (ct->status & statusbit) {
+ fl4->saddr = t->src.u3.ip;
+ if (t->dst.protonum == IPPROTO_TCP ||
+ t->dst.protonum == IPPROTO_UDP ||
+ t->dst.protonum == IPPROTO_UDPLITE ||
+ t->dst.protonum == IPPROTO_DCCP ||
+ t->dst.protonum == IPPROTO_SCTP)
+ fl4->fl4_sport = t->src.u.all;
+ }
+}
+#endif /* CONFIG_XFRM */
+
+static bool nf_nat_ipv4_in_range(const struct nf_conntrack_tuple *t,
+ const struct nf_nat_range *range)
+{
+ return ntohl(t->src.u3.ip) >= ntohl(range->min_addr.ip) &&
+ ntohl(t->src.u3.ip) <= ntohl(range->max_addr.ip);
+}
+
+static u32 nf_nat_ipv4_secure_port(const struct nf_conntrack_tuple *t,
+ __be16 dport)
+{
+ return secure_ipv4_port_ephemeral(t->src.u3.ip, t->dst.u3.ip, dport);
+}
+
+static bool nf_nat_ipv4_manip_pkt(struct sk_buff *skb,
+ unsigned int iphdroff,
+ const struct nf_nat_l4proto *l4proto,
+ const struct nf_conntrack_tuple *target,
+ enum nf_nat_manip_type maniptype)
+{
+ struct iphdr *iph;
+ unsigned int hdroff;
+
+ if (!skb_make_writable(skb, iphdroff + sizeof(*iph)))
+ return false;
+
+ iph = (void *)skb->data + iphdroff;
+ hdroff = iphdroff + iph->ihl * 4;
+
+ if (!l4proto->manip_pkt(skb, &nf_nat_l3proto_ipv4, iphdroff, hdroff,
+ target, maniptype))
+ return false;
+ iph = (void *)skb->data + iphdroff;
+
+ if (maniptype == NF_NAT_MANIP_SRC) {
+ csum_replace4(&iph->check, iph->saddr, target->src.u3.ip);
+ iph->saddr = target->src.u3.ip;
+ } else {
+ csum_replace4(&iph->check, iph->daddr, target->dst.u3.ip);
+ iph->daddr = target->dst.u3.ip;
+ }
+ return true;
+}
+
+static void nf_nat_ipv4_csum_update(struct sk_buff *skb,
+ unsigned int iphdroff, __sum16 *check,
+ const struct nf_conntrack_tuple *t,
+ enum nf_nat_manip_type maniptype)
+{
+ struct iphdr *iph = (struct iphdr *)(skb->data + iphdroff);
+ __be32 oldip, newip;
+
+ if (maniptype == NF_NAT_MANIP_SRC) {
+ oldip = iph->saddr;
+ newip = t->src.u3.ip;
+ } else {
+ oldip = iph->daddr;
+ newip = t->dst.u3.ip;
+ }
+ inet_proto_csum_replace4(check, skb, oldip, newip, 1);
+}
+
+static void nf_nat_ipv4_csum_recalc(struct sk_buff *skb,
+ u8 proto, void *data, __sum16 *check,
+ int datalen, int oldlen)
+{
+ const struct iphdr *iph = ip_hdr(skb);
+ struct rtable *rt = skb_rtable(skb);
+
+ if (skb->ip_summed != CHECKSUM_PARTIAL) {
+ if (!(rt->rt_flags & RTCF_LOCAL) &&
+ (!skb->dev || skb->dev->features & NETIF_F_V4_CSUM)) {
+ skb->ip_summed = CHECKSUM_PARTIAL;
+ skb->csum_start = skb_headroom(skb) +
+ skb_network_offset(skb) +
+ ip_hdrlen(skb);
+ skb->csum_offset = (void *)check - data;
+ *check = ~csum_tcpudp_magic(iph->saddr, iph->daddr,
+ datalen, proto, 0);
+ } else {
+ *check = 0;
+ *check = csum_tcpudp_magic(iph->saddr, iph->daddr,
+ datalen, proto,
+ csum_partial(data, datalen,
+ 0));
+ if (proto == IPPROTO_UDP && !*check)
+ *check = CSUM_MANGLED_0;
+ }
+ } else
+ inet_proto_csum_replace2(check, skb,
+ htons(oldlen), htons(datalen), 1);
+}
+
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
+static int nf_nat_ipv4_nlattr_to_range(struct nlattr *tb[],
+ struct nf_nat_range *range)
+{
+ if (tb[CTA_NAT_V4_MINIP]) {
+ range->min_addr.ip = nla_get_be32(tb[CTA_NAT_V4_MINIP]);
+ range->flags |= NF_NAT_RANGE_MAP_IPS;
+ }
+
+ if (tb[CTA_NAT_V4_MAXIP])
+ range->max_addr.ip = nla_get_be32(tb[CTA_NAT_V4_MAXIP]);
+ else
+ range->max_addr.ip = range->min_addr.ip;
+
+ return 0;
+}
+#endif
+
+static const struct nf_nat_l3proto nf_nat_l3proto_ipv4 = {
+ .l3proto = NFPROTO_IPV4,
+ .in_range = nf_nat_ipv4_in_range,
+ .secure_port = nf_nat_ipv4_secure_port,
+ .manip_pkt = nf_nat_ipv4_manip_pkt,
+ .csum_update = nf_nat_ipv4_csum_update,
+ .csum_recalc = nf_nat_ipv4_csum_recalc,
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
+ .nlattr_to_range = nf_nat_ipv4_nlattr_to_range,
+#endif
+#ifdef CONFIG_XFRM
+ .decode_session = nf_nat_ipv4_decode_session,
+#endif
+};
+
+int nf_nat_icmp_reply_translation(struct sk_buff *skb,
+ struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ unsigned int hooknum)
+{
+ struct {
+ struct icmphdr icmp;
+ struct iphdr ip;
+ } *inside;
+ enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+ enum nf_nat_manip_type manip = HOOK2MANIP(hooknum);
+ unsigned int hdrlen = ip_hdrlen(skb);
+ const struct nf_nat_l4proto *l4proto;
+ struct nf_conntrack_tuple target;
+ unsigned long statusbit;
+
+ NF_CT_ASSERT(ctinfo == IP_CT_RELATED || ctinfo == IP_CT_RELATED_REPLY);
+
+ if (!skb_make_writable(skb, hdrlen + sizeof(*inside)))
+ return 0;
+ if (nf_ip_checksum(skb, hooknum, hdrlen, 0))
+ return 0;
+
+ inside = (void *)skb->data + hdrlen;
+ if (inside->icmp.type == ICMP_REDIRECT) {
+ if ((ct->status & IPS_NAT_DONE_MASK) != IPS_NAT_DONE_MASK)
+ return 0;
+ if (ct->status & IPS_NAT_MASK)
+ return 0;
+ }
+
+ if (manip == NF_NAT_MANIP_SRC)
+ statusbit = IPS_SRC_NAT;
+ else
+ statusbit = IPS_DST_NAT;
+
+ /* Invert if this is reply direction */
+ if (dir == IP_CT_DIR_REPLY)
+ statusbit ^= IPS_NAT_MASK;
+
+ if (!(ct->status & statusbit))
+ return 1;
+
+ l4proto = __nf_nat_l4proto_find(NFPROTO_IPV4, inside->ip.protocol);
+ if (!nf_nat_ipv4_manip_pkt(skb, hdrlen + sizeof(inside->icmp),
+ l4proto, &ct->tuplehash[!dir].tuple, !manip))
+ return 0;
+
+ if (skb->ip_summed != CHECKSUM_PARTIAL) {
+ /* Reloading "inside" here since manip_pkt may reallocate */
+ inside = (void *)skb->data + hdrlen;
+ inside->icmp.checksum = 0;
+ inside->icmp.checksum =
+ csum_fold(skb_checksum(skb, hdrlen,
+ skb->len - hdrlen, 0));
+ }
+
+ /* Change outer to look like the reply to an incoming packet */
+ nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple);
+ l4proto = __nf_nat_l4proto_find(NFPROTO_IPV4, 0);
+ if (!nf_nat_ipv4_manip_pkt(skb, 0, l4proto, &target, manip))
+ return 0;
+
+ return 1;
+}
+EXPORT_SYMBOL_GPL(nf_nat_icmp_reply_translation);
+
+unsigned int
+nf_nat_ipv4_fn(const struct nf_hook_ops *ops, struct sk_buff *skb,
+ const struct nf_hook_state *state,
+ unsigned int (*do_chain)(const struct nf_hook_ops *ops,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state,
+ struct nf_conn *ct))
+{
+ struct nf_conn *ct;
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn_nat *nat;
+ /* maniptype == SRC for postrouting. */
+ enum nf_nat_manip_type maniptype = HOOK2MANIP(ops->hooknum);
+
+ /* We never see fragments: conntrack defrags on pre-routing
+ * and local-out, and nf_nat_out protects post-routing.
+ */
+ NF_CT_ASSERT(!ip_is_fragment(ip_hdr(skb)));
+
+ ct = nf_ct_get(skb, &ctinfo);
+ /* Can't track? It's not due to stress, or conntrack would
+ * have dropped it. Hence it's the user's responsibilty to
+ * packet filter it out, or implement conntrack/NAT for that
+ * protocol. 8) --RR
+ */
+ if (!ct)
+ return NF_ACCEPT;
+
+ /* Don't try to NAT if this packet is not conntracked */
+ if (nf_ct_is_untracked(ct))
+ return NF_ACCEPT;
+
+ nat = nf_ct_nat_ext_add(ct);
+ if (nat == NULL)
+ return NF_ACCEPT;
+
+ switch (ctinfo) {
+ case IP_CT_RELATED:
+ case IP_CT_RELATED_REPLY:
+ if (ip_hdr(skb)->protocol == IPPROTO_ICMP) {
+ if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo,
+ ops->hooknum))
+ return NF_DROP;
+ else
+ return NF_ACCEPT;
+ }
+ /* Fall thru... (Only ICMPs can be IP_CT_IS_REPLY) */
+ case IP_CT_NEW:
+ /* Seen it before? This can happen for loopback, retrans,
+ * or local packets.
+ */
+ if (!nf_nat_initialized(ct, maniptype)) {
+ unsigned int ret;
+
+ ret = do_chain(ops, skb, state, ct);
+ if (ret != NF_ACCEPT)
+ return ret;
+
+ if (nf_nat_initialized(ct, HOOK2MANIP(ops->hooknum)))
+ break;
+
+ ret = nf_nat_alloc_null_binding(ct, ops->hooknum);
+ if (ret != NF_ACCEPT)
+ return ret;
+ } else {
+ pr_debug("Already setup manip %s for ct %p\n",
+ maniptype == NF_NAT_MANIP_SRC ? "SRC" : "DST",
+ ct);
+ if (nf_nat_oif_changed(ops->hooknum, ctinfo, nat,
+ state->out))
+ goto oif_changed;
+ }
+ break;
+
+ default:
+ /* ESTABLISHED */
+ NF_CT_ASSERT(ctinfo == IP_CT_ESTABLISHED ||
+ ctinfo == IP_CT_ESTABLISHED_REPLY);
+ if (nf_nat_oif_changed(ops->hooknum, ctinfo, nat, state->out))
+ goto oif_changed;
+ }
+
+ return nf_nat_packet(ct, ctinfo, ops->hooknum, skb);
+
+oif_changed:
+ nf_ct_kill_acct(ct, ctinfo, skb);
+ return NF_DROP;
+}
+EXPORT_SYMBOL_GPL(nf_nat_ipv4_fn);
+
+unsigned int
+nf_nat_ipv4_in(const struct nf_hook_ops *ops, struct sk_buff *skb,
+ const struct nf_hook_state *state,
+ unsigned int (*do_chain)(const struct nf_hook_ops *ops,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state,
+ struct nf_conn *ct))
+{
+ unsigned int ret;
+ __be32 daddr = ip_hdr(skb)->daddr;
+
+ ret = nf_nat_ipv4_fn(ops, skb, state, do_chain);
+ if (ret != NF_DROP && ret != NF_STOLEN &&
+ daddr != ip_hdr(skb)->daddr)
+ skb_dst_drop(skb);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(nf_nat_ipv4_in);
+
+unsigned int
+nf_nat_ipv4_out(const struct nf_hook_ops *ops, struct sk_buff *skb,
+ const struct nf_hook_state *state,
+ unsigned int (*do_chain)(const struct nf_hook_ops *ops,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state,
+ struct nf_conn *ct))
+{
+#ifdef CONFIG_XFRM
+ const struct nf_conn *ct;
+ enum ip_conntrack_info ctinfo;
+ int err;
+#endif
+ unsigned int ret;
+
+ /* root is playing with raw sockets. */
+ if (skb->len < sizeof(struct iphdr) ||
+ ip_hdrlen(skb) < sizeof(struct iphdr))
+ return NF_ACCEPT;
+
+ ret = nf_nat_ipv4_fn(ops, skb, state, do_chain);
+#ifdef CONFIG_XFRM
+ if (ret != NF_DROP && ret != NF_STOLEN &&
+ !(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) &&
+ (ct = nf_ct_get(skb, &ctinfo)) != NULL) {
+ enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+
+ if ((ct->tuplehash[dir].tuple.src.u3.ip !=
+ ct->tuplehash[!dir].tuple.dst.u3.ip) ||
+ (ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMP &&
+ ct->tuplehash[dir].tuple.src.u.all !=
+ ct->tuplehash[!dir].tuple.dst.u.all)) {
+ err = nf_xfrm_me_harder(skb, AF_INET);
+ if (err < 0)
+ ret = NF_DROP_ERR(err);
+ }
+ }
+#endif
+ return ret;
+}
+EXPORT_SYMBOL_GPL(nf_nat_ipv4_out);
+
+unsigned int
+nf_nat_ipv4_local_fn(const struct nf_hook_ops *ops, struct sk_buff *skb,
+ const struct nf_hook_state *state,
+ unsigned int (*do_chain)(const struct nf_hook_ops *ops,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state,
+ struct nf_conn *ct))
+{
+ const struct nf_conn *ct;
+ enum ip_conntrack_info ctinfo;
+ unsigned int ret;
+ int err;
+
+ /* root is playing with raw sockets. */
+ if (skb->len < sizeof(struct iphdr) ||
+ ip_hdrlen(skb) < sizeof(struct iphdr))
+ return NF_ACCEPT;
+
+ ret = nf_nat_ipv4_fn(ops, skb, state, do_chain);
+ if (ret != NF_DROP && ret != NF_STOLEN &&
+ (ct = nf_ct_get(skb, &ctinfo)) != NULL) {
+ enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+
+ if (ct->tuplehash[dir].tuple.dst.u3.ip !=
+ ct->tuplehash[!dir].tuple.src.u3.ip) {
+ err = ip_route_me_harder(skb, RTN_UNSPEC);
+ if (err < 0)
+ ret = NF_DROP_ERR(err);
+ }
+#ifdef CONFIG_XFRM
+ else if (!(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) &&
+ ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMP &&
+ ct->tuplehash[dir].tuple.dst.u.all !=
+ ct->tuplehash[!dir].tuple.src.u.all) {
+ err = nf_xfrm_me_harder(skb, AF_INET);
+ if (err < 0)
+ ret = NF_DROP_ERR(err);
+ }
+#endif
+ }
+ return ret;
+}
+EXPORT_SYMBOL_GPL(nf_nat_ipv4_local_fn);
+
+static int __init nf_nat_l3proto_ipv4_init(void)
+{
+ int err;
+
+ err = nf_nat_l4proto_register(NFPROTO_IPV4, &nf_nat_l4proto_icmp);
+ if (err < 0)
+ goto err1;
+ err = nf_nat_l3proto_register(&nf_nat_l3proto_ipv4);
+ if (err < 0)
+ goto err2;
+ return err;
+
+err2:
+ nf_nat_l4proto_unregister(NFPROTO_IPV4, &nf_nat_l4proto_icmp);
+err1:
+ return err;
+}
+
+static void __exit nf_nat_l3proto_ipv4_exit(void)
+{
+ nf_nat_l3proto_unregister(&nf_nat_l3proto_ipv4);
+ nf_nat_l4proto_unregister(NFPROTO_IPV4, &nf_nat_l4proto_icmp);
+}
+
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("nf-nat-" __stringify(AF_INET));
+
+module_init(nf_nat_l3proto_ipv4_init);
+module_exit(nf_nat_l3proto_ipv4_exit);
diff --git a/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c b/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c
new file mode 100644
index 000000000..c6eb42100
--- /dev/null
+++ b/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c
@@ -0,0 +1,153 @@
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/module.h>
+#include <linux/atomic.h>
+#include <linux/inetdevice.h>
+#include <linux/ip.h>
+#include <linux/timer.h>
+#include <linux/netfilter.h>
+#include <net/protocol.h>
+#include <net/ip.h>
+#include <net/checksum.h>
+#include <net/route.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter/x_tables.h>
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/ipv4/nf_nat_masquerade.h>
+
+unsigned int
+nf_nat_masquerade_ipv4(struct sk_buff *skb, unsigned int hooknum,
+ const struct nf_nat_range *range,
+ const struct net_device *out)
+{
+ struct nf_conn *ct;
+ struct nf_conn_nat *nat;
+ enum ip_conntrack_info ctinfo;
+ struct nf_nat_range newrange;
+ const struct rtable *rt;
+ __be32 newsrc, nh;
+
+ NF_CT_ASSERT(hooknum == NF_INET_POST_ROUTING);
+
+ ct = nf_ct_get(skb, &ctinfo);
+ nat = nfct_nat(ct);
+
+ NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED ||
+ ctinfo == IP_CT_RELATED_REPLY));
+
+ /* Source address is 0.0.0.0 - locally generated packet that is
+ * probably not supposed to be masqueraded.
+ */
+ if (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip == 0)
+ return NF_ACCEPT;
+
+ rt = skb_rtable(skb);
+ nh = rt_nexthop(rt, ip_hdr(skb)->daddr);
+ newsrc = inet_select_addr(out, nh, RT_SCOPE_UNIVERSE);
+ if (!newsrc) {
+ pr_info("%s ate my IP address\n", out->name);
+ return NF_DROP;
+ }
+
+ nat->masq_index = out->ifindex;
+
+ /* Transfer from original range. */
+ memset(&newrange.min_addr, 0, sizeof(newrange.min_addr));
+ memset(&newrange.max_addr, 0, sizeof(newrange.max_addr));
+ newrange.flags = range->flags | NF_NAT_RANGE_MAP_IPS;
+ newrange.min_addr.ip = newsrc;
+ newrange.max_addr.ip = newsrc;
+ newrange.min_proto = range->min_proto;
+ newrange.max_proto = range->max_proto;
+
+ /* Hand modified range to generic setup. */
+ return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_SRC);
+}
+EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv4);
+
+static int device_cmp(struct nf_conn *i, void *ifindex)
+{
+ const struct nf_conn_nat *nat = nfct_nat(i);
+
+ if (!nat)
+ return 0;
+ if (nf_ct_l3num(i) != NFPROTO_IPV4)
+ return 0;
+ return nat->masq_index == (int)(long)ifindex;
+}
+
+static int masq_device_event(struct notifier_block *this,
+ unsigned long event,
+ void *ptr)
+{
+ const struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct net *net = dev_net(dev);
+
+ if (event == NETDEV_DOWN) {
+ /* Device was downed. Search entire table for
+ * conntracks which were associated with that device,
+ * and forget them.
+ */
+ NF_CT_ASSERT(dev->ifindex != 0);
+
+ nf_ct_iterate_cleanup(net, device_cmp,
+ (void *)(long)dev->ifindex, 0, 0);
+ }
+
+ return NOTIFY_DONE;
+}
+
+static int masq_inet_event(struct notifier_block *this,
+ unsigned long event,
+ void *ptr)
+{
+ struct net_device *dev = ((struct in_ifaddr *)ptr)->ifa_dev->dev;
+ struct netdev_notifier_info info;
+
+ netdev_notifier_info_init(&info, dev);
+ return masq_device_event(this, event, &info);
+}
+
+static struct notifier_block masq_dev_notifier = {
+ .notifier_call = masq_device_event,
+};
+
+static struct notifier_block masq_inet_notifier = {
+ .notifier_call = masq_inet_event,
+};
+
+static atomic_t masquerade_notifier_refcount = ATOMIC_INIT(0);
+
+void nf_nat_masquerade_ipv4_register_notifier(void)
+{
+ /* check if the notifier was already set */
+ if (atomic_inc_return(&masquerade_notifier_refcount) > 1)
+ return;
+
+ /* Register for device down reports */
+ register_netdevice_notifier(&masq_dev_notifier);
+ /* Register IP address change reports */
+ register_inetaddr_notifier(&masq_inet_notifier);
+}
+EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv4_register_notifier);
+
+void nf_nat_masquerade_ipv4_unregister_notifier(void)
+{
+ /* check if the notifier still has clients */
+ if (atomic_dec_return(&masquerade_notifier_refcount) > 0)
+ return;
+
+ unregister_netdevice_notifier(&masq_dev_notifier);
+ unregister_inetaddr_notifier(&masq_inet_notifier);
+}
+EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv4_unregister_notifier);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>");
diff --git a/net/ipv4/netfilter/nf_nat_pptp.c b/net/ipv4/netfilter/nf_nat_pptp.c
new file mode 100644
index 000000000..657d2307f
--- /dev/null
+++ b/net/ipv4/netfilter/nf_nat_pptp.c
@@ -0,0 +1,311 @@
+/*
+ * nf_nat_pptp.c
+ *
+ * NAT support for PPTP (Point to Point Tunneling Protocol).
+ * PPTP is a a protocol for creating virtual private networks.
+ * It is a specification defined by Microsoft and some vendors
+ * working with Microsoft. PPTP is built on top of a modified
+ * version of the Internet Generic Routing Encapsulation Protocol.
+ * GRE is defined in RFC 1701 and RFC 1702. Documentation of
+ * PPTP can be found in RFC 2637
+ *
+ * (C) 2000-2005 by Harald Welte <laforge@gnumonks.org>
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ *
+ * (C) 2006-2012 Patrick McHardy <kaber@trash.net>
+ *
+ * TODO: - NAT to a unique tuple, not to TCP source port
+ * (needs netfilter tuple reservation)
+ */
+
+#include <linux/module.h>
+#include <linux/tcp.h>
+
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat_helper.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+#include <net/netfilter/nf_conntrack_zones.h>
+#include <linux/netfilter/nf_conntrack_proto_gre.h>
+#include <linux/netfilter/nf_conntrack_pptp.h>
+
+#define NF_NAT_PPTP_VERSION "3.0"
+
+#define REQ_CID(req, off) (*(__be16 *)((char *)(req) + (off)))
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Harald Welte <laforge@gnumonks.org>");
+MODULE_DESCRIPTION("Netfilter NAT helper module for PPTP");
+MODULE_ALIAS("ip_nat_pptp");
+
+static void pptp_nat_expected(struct nf_conn *ct,
+ struct nf_conntrack_expect *exp)
+{
+ struct net *net = nf_ct_net(ct);
+ const struct nf_conn *master = ct->master;
+ struct nf_conntrack_expect *other_exp;
+ struct nf_conntrack_tuple t;
+ const struct nf_ct_pptp_master *ct_pptp_info;
+ const struct nf_nat_pptp *nat_pptp_info;
+ struct nf_nat_range range;
+
+ ct_pptp_info = nfct_help_data(master);
+ nat_pptp_info = &nfct_nat(master)->help.nat_pptp_info;
+
+ /* And here goes the grand finale of corrosion... */
+ if (exp->dir == IP_CT_DIR_ORIGINAL) {
+ pr_debug("we are PNS->PAC\n");
+ /* therefore, build tuple for PAC->PNS */
+ t.src.l3num = AF_INET;
+ t.src.u3.ip = master->tuplehash[!exp->dir].tuple.src.u3.ip;
+ t.src.u.gre.key = ct_pptp_info->pac_call_id;
+ t.dst.u3.ip = master->tuplehash[!exp->dir].tuple.dst.u3.ip;
+ t.dst.u.gre.key = ct_pptp_info->pns_call_id;
+ t.dst.protonum = IPPROTO_GRE;
+ } else {
+ pr_debug("we are PAC->PNS\n");
+ /* build tuple for PNS->PAC */
+ t.src.l3num = AF_INET;
+ t.src.u3.ip = master->tuplehash[!exp->dir].tuple.src.u3.ip;
+ t.src.u.gre.key = nat_pptp_info->pns_call_id;
+ t.dst.u3.ip = master->tuplehash[!exp->dir].tuple.dst.u3.ip;
+ t.dst.u.gre.key = nat_pptp_info->pac_call_id;
+ t.dst.protonum = IPPROTO_GRE;
+ }
+
+ pr_debug("trying to unexpect other dir: ");
+ nf_ct_dump_tuple_ip(&t);
+ other_exp = nf_ct_expect_find_get(net, nf_ct_zone(ct), &t);
+ if (other_exp) {
+ nf_ct_unexpect_related(other_exp);
+ nf_ct_expect_put(other_exp);
+ pr_debug("success\n");
+ } else {
+ pr_debug("not found!\n");
+ }
+
+ /* This must be a fresh one. */
+ BUG_ON(ct->status & IPS_NAT_DONE_MASK);
+
+ /* Change src to where master sends to */
+ range.flags = NF_NAT_RANGE_MAP_IPS;
+ range.min_addr = range.max_addr
+ = ct->master->tuplehash[!exp->dir].tuple.dst.u3;
+ if (exp->dir == IP_CT_DIR_ORIGINAL) {
+ range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
+ range.min_proto = range.max_proto = exp->saved_proto;
+ }
+ nf_nat_setup_info(ct, &range, NF_NAT_MANIP_SRC);
+
+ /* For DST manip, map port here to where it's expected. */
+ range.flags = NF_NAT_RANGE_MAP_IPS;
+ range.min_addr = range.max_addr
+ = ct->master->tuplehash[!exp->dir].tuple.src.u3;
+ if (exp->dir == IP_CT_DIR_REPLY) {
+ range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
+ range.min_proto = range.max_proto = exp->saved_proto;
+ }
+ nf_nat_setup_info(ct, &range, NF_NAT_MANIP_DST);
+}
+
+/* outbound packets == from PNS to PAC */
+static int
+pptp_outbound_pkt(struct sk_buff *skb,
+ struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ unsigned int protoff,
+ struct PptpControlHeader *ctlh,
+ union pptp_ctrl_union *pptpReq)
+
+{
+ struct nf_ct_pptp_master *ct_pptp_info;
+ struct nf_nat_pptp *nat_pptp_info;
+ u_int16_t msg;
+ __be16 new_callid;
+ unsigned int cid_off;
+
+ ct_pptp_info = nfct_help_data(ct);
+ nat_pptp_info = &nfct_nat(ct)->help.nat_pptp_info;
+
+ new_callid = ct_pptp_info->pns_call_id;
+
+ switch (msg = ntohs(ctlh->messageType)) {
+ case PPTP_OUT_CALL_REQUEST:
+ cid_off = offsetof(union pptp_ctrl_union, ocreq.callID);
+ /* FIXME: ideally we would want to reserve a call ID
+ * here. current netfilter NAT core is not able to do
+ * this :( For now we use TCP source port. This breaks
+ * multiple calls within one control session */
+
+ /* save original call ID in nat_info */
+ nat_pptp_info->pns_call_id = ct_pptp_info->pns_call_id;
+
+ /* don't use tcph->source since we are at a DSTmanip
+ * hook (e.g. PREROUTING) and pkt is not mangled yet */
+ new_callid = ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u.tcp.port;
+
+ /* save new call ID in ct info */
+ ct_pptp_info->pns_call_id = new_callid;
+ break;
+ case PPTP_IN_CALL_REPLY:
+ cid_off = offsetof(union pptp_ctrl_union, icack.callID);
+ break;
+ case PPTP_CALL_CLEAR_REQUEST:
+ cid_off = offsetof(union pptp_ctrl_union, clrreq.callID);
+ break;
+ default:
+ pr_debug("unknown outbound packet 0x%04x:%s\n", msg,
+ msg <= PPTP_MSG_MAX ? pptp_msg_name[msg] :
+ pptp_msg_name[0]);
+ /* fall through */
+ case PPTP_SET_LINK_INFO:
+ /* only need to NAT in case PAC is behind NAT box */
+ case PPTP_START_SESSION_REQUEST:
+ case PPTP_START_SESSION_REPLY:
+ case PPTP_STOP_SESSION_REQUEST:
+ case PPTP_STOP_SESSION_REPLY:
+ case PPTP_ECHO_REQUEST:
+ case PPTP_ECHO_REPLY:
+ /* no need to alter packet */
+ return NF_ACCEPT;
+ }
+
+ /* only OUT_CALL_REQUEST, IN_CALL_REPLY, CALL_CLEAR_REQUEST pass
+ * down to here */
+ pr_debug("altering call id from 0x%04x to 0x%04x\n",
+ ntohs(REQ_CID(pptpReq, cid_off)), ntohs(new_callid));
+
+ /* mangle packet */
+ if (nf_nat_mangle_tcp_packet(skb, ct, ctinfo, protoff,
+ cid_off + sizeof(struct pptp_pkt_hdr) +
+ sizeof(struct PptpControlHeader),
+ sizeof(new_callid), (char *)&new_callid,
+ sizeof(new_callid)) == 0)
+ return NF_DROP;
+ return NF_ACCEPT;
+}
+
+static void
+pptp_exp_gre(struct nf_conntrack_expect *expect_orig,
+ struct nf_conntrack_expect *expect_reply)
+{
+ const struct nf_conn *ct = expect_orig->master;
+ struct nf_ct_pptp_master *ct_pptp_info;
+ struct nf_nat_pptp *nat_pptp_info;
+
+ ct_pptp_info = nfct_help_data(ct);
+ nat_pptp_info = &nfct_nat(ct)->help.nat_pptp_info;
+
+ /* save original PAC call ID in nat_info */
+ nat_pptp_info->pac_call_id = ct_pptp_info->pac_call_id;
+
+ /* alter expectation for PNS->PAC direction */
+ expect_orig->saved_proto.gre.key = ct_pptp_info->pns_call_id;
+ expect_orig->tuple.src.u.gre.key = nat_pptp_info->pns_call_id;
+ expect_orig->tuple.dst.u.gre.key = ct_pptp_info->pac_call_id;
+ expect_orig->dir = IP_CT_DIR_ORIGINAL;
+
+ /* alter expectation for PAC->PNS direction */
+ expect_reply->saved_proto.gre.key = nat_pptp_info->pns_call_id;
+ expect_reply->tuple.src.u.gre.key = nat_pptp_info->pac_call_id;
+ expect_reply->tuple.dst.u.gre.key = ct_pptp_info->pns_call_id;
+ expect_reply->dir = IP_CT_DIR_REPLY;
+}
+
+/* inbound packets == from PAC to PNS */
+static int
+pptp_inbound_pkt(struct sk_buff *skb,
+ struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ unsigned int protoff,
+ struct PptpControlHeader *ctlh,
+ union pptp_ctrl_union *pptpReq)
+{
+ const struct nf_nat_pptp *nat_pptp_info;
+ u_int16_t msg;
+ __be16 new_pcid;
+ unsigned int pcid_off;
+
+ nat_pptp_info = &nfct_nat(ct)->help.nat_pptp_info;
+ new_pcid = nat_pptp_info->pns_call_id;
+
+ switch (msg = ntohs(ctlh->messageType)) {
+ case PPTP_OUT_CALL_REPLY:
+ pcid_off = offsetof(union pptp_ctrl_union, ocack.peersCallID);
+ break;
+ case PPTP_IN_CALL_CONNECT:
+ pcid_off = offsetof(union pptp_ctrl_union, iccon.peersCallID);
+ break;
+ case PPTP_IN_CALL_REQUEST:
+ /* only need to nat in case PAC is behind NAT box */
+ return NF_ACCEPT;
+ case PPTP_WAN_ERROR_NOTIFY:
+ pcid_off = offsetof(union pptp_ctrl_union, wanerr.peersCallID);
+ break;
+ case PPTP_CALL_DISCONNECT_NOTIFY:
+ pcid_off = offsetof(union pptp_ctrl_union, disc.callID);
+ break;
+ case PPTP_SET_LINK_INFO:
+ pcid_off = offsetof(union pptp_ctrl_union, setlink.peersCallID);
+ break;
+ default:
+ pr_debug("unknown inbound packet %s\n",
+ msg <= PPTP_MSG_MAX ? pptp_msg_name[msg] :
+ pptp_msg_name[0]);
+ /* fall through */
+ case PPTP_START_SESSION_REQUEST:
+ case PPTP_START_SESSION_REPLY:
+ case PPTP_STOP_SESSION_REQUEST:
+ case PPTP_STOP_SESSION_REPLY:
+ case PPTP_ECHO_REQUEST:
+ case PPTP_ECHO_REPLY:
+ /* no need to alter packet */
+ return NF_ACCEPT;
+ }
+
+ /* only OUT_CALL_REPLY, IN_CALL_CONNECT, IN_CALL_REQUEST,
+ * WAN_ERROR_NOTIFY, CALL_DISCONNECT_NOTIFY pass down here */
+
+ /* mangle packet */
+ pr_debug("altering peer call id from 0x%04x to 0x%04x\n",
+ ntohs(REQ_CID(pptpReq, pcid_off)), ntohs(new_pcid));
+
+ if (nf_nat_mangle_tcp_packet(skb, ct, ctinfo, protoff,
+ pcid_off + sizeof(struct pptp_pkt_hdr) +
+ sizeof(struct PptpControlHeader),
+ sizeof(new_pcid), (char *)&new_pcid,
+ sizeof(new_pcid)) == 0)
+ return NF_DROP;
+ return NF_ACCEPT;
+}
+
+static int __init nf_nat_helper_pptp_init(void)
+{
+ nf_nat_need_gre();
+
+ BUG_ON(nf_nat_pptp_hook_outbound != NULL);
+ RCU_INIT_POINTER(nf_nat_pptp_hook_outbound, pptp_outbound_pkt);
+
+ BUG_ON(nf_nat_pptp_hook_inbound != NULL);
+ RCU_INIT_POINTER(nf_nat_pptp_hook_inbound, pptp_inbound_pkt);
+
+ BUG_ON(nf_nat_pptp_hook_exp_gre != NULL);
+ RCU_INIT_POINTER(nf_nat_pptp_hook_exp_gre, pptp_exp_gre);
+
+ BUG_ON(nf_nat_pptp_hook_expectfn != NULL);
+ RCU_INIT_POINTER(nf_nat_pptp_hook_expectfn, pptp_nat_expected);
+ return 0;
+}
+
+static void __exit nf_nat_helper_pptp_fini(void)
+{
+ RCU_INIT_POINTER(nf_nat_pptp_hook_expectfn, NULL);
+ RCU_INIT_POINTER(nf_nat_pptp_hook_exp_gre, NULL);
+ RCU_INIT_POINTER(nf_nat_pptp_hook_inbound, NULL);
+ RCU_INIT_POINTER(nf_nat_pptp_hook_outbound, NULL);
+ synchronize_rcu();
+}
+
+module_init(nf_nat_helper_pptp_init);
+module_exit(nf_nat_helper_pptp_fini);
diff --git a/net/ipv4/netfilter/nf_nat_proto_gre.c b/net/ipv4/netfilter/nf_nat_proto_gre.c
new file mode 100644
index 000000000..9414923f1
--- /dev/null
+++ b/net/ipv4/netfilter/nf_nat_proto_gre.c
@@ -0,0 +1,149 @@
+/*
+ * nf_nat_proto_gre.c
+ *
+ * NAT protocol helper module for GRE.
+ *
+ * GRE is a generic encapsulation protocol, which is generally not very
+ * suited for NAT, as it has no protocol-specific part as port numbers.
+ *
+ * It has an optional key field, which may help us distinguishing two
+ * connections between the same two hosts.
+ *
+ * GRE is defined in RFC 1701 and RFC 1702, as well as RFC 2784
+ *
+ * PPTP is built on top of a modified version of GRE, and has a mandatory
+ * field called "CallID", which serves us for the same purpose as the key
+ * field in plain GRE.
+ *
+ * Documentation about PPTP can be found in RFC 2637
+ *
+ * (C) 2000-2005 by Harald Welte <laforge@gnumonks.org>
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ *
+ * (C) 2006-2012 Patrick McHardy <kaber@trash.net>
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat_l4proto.h>
+#include <linux/netfilter/nf_conntrack_proto_gre.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Harald Welte <laforge@gnumonks.org>");
+MODULE_DESCRIPTION("Netfilter NAT protocol helper module for GRE");
+
+/* generate unique tuple ... */
+static void
+gre_unique_tuple(const struct nf_nat_l3proto *l3proto,
+ struct nf_conntrack_tuple *tuple,
+ const struct nf_nat_range *range,
+ enum nf_nat_manip_type maniptype,
+ const struct nf_conn *ct)
+{
+ static u_int16_t key;
+ __be16 *keyptr;
+ unsigned int min, i, range_size;
+
+ /* If there is no master conntrack we are not PPTP,
+ do not change tuples */
+ if (!ct->master)
+ return;
+
+ if (maniptype == NF_NAT_MANIP_SRC)
+ keyptr = &tuple->src.u.gre.key;
+ else
+ keyptr = &tuple->dst.u.gre.key;
+
+ if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) {
+ pr_debug("%p: NATing GRE PPTP\n", ct);
+ min = 1;
+ range_size = 0xffff;
+ } else {
+ min = ntohs(range->min_proto.gre.key);
+ range_size = ntohs(range->max_proto.gre.key) - min + 1;
+ }
+
+ pr_debug("min = %u, range_size = %u\n", min, range_size);
+
+ for (i = 0; ; ++key) {
+ *keyptr = htons(min + key % range_size);
+ if (++i == range_size || !nf_nat_used_tuple(tuple, ct))
+ return;
+ }
+
+ pr_debug("%p: no NAT mapping\n", ct);
+ return;
+}
+
+/* manipulate a GRE packet according to maniptype */
+static bool
+gre_manip_pkt(struct sk_buff *skb,
+ const struct nf_nat_l3proto *l3proto,
+ unsigned int iphdroff, unsigned int hdroff,
+ const struct nf_conntrack_tuple *tuple,
+ enum nf_nat_manip_type maniptype)
+{
+ const struct gre_hdr *greh;
+ struct gre_hdr_pptp *pgreh;
+
+ /* pgreh includes two optional 32bit fields which are not required
+ * to be there. That's where the magic '8' comes from */
+ if (!skb_make_writable(skb, hdroff + sizeof(*pgreh) - 8))
+ return false;
+
+ greh = (void *)skb->data + hdroff;
+ pgreh = (struct gre_hdr_pptp *)greh;
+
+ /* we only have destination manip of a packet, since 'source key'
+ * is not present in the packet itself */
+ if (maniptype != NF_NAT_MANIP_DST)
+ return true;
+ switch (greh->version) {
+ case GRE_VERSION_1701:
+ /* We do not currently NAT any GREv0 packets.
+ * Try to behave like "nf_nat_proto_unknown" */
+ break;
+ case GRE_VERSION_PPTP:
+ pr_debug("call_id -> 0x%04x\n", ntohs(tuple->dst.u.gre.key));
+ pgreh->call_id = tuple->dst.u.gre.key;
+ break;
+ default:
+ pr_debug("can't nat unknown GRE version\n");
+ return false;
+ }
+ return true;
+}
+
+static const struct nf_nat_l4proto gre = {
+ .l4proto = IPPROTO_GRE,
+ .manip_pkt = gre_manip_pkt,
+ .in_range = nf_nat_l4proto_in_range,
+ .unique_tuple = gre_unique_tuple,
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
+ .nlattr_to_range = nf_nat_l4proto_nlattr_to_range,
+#endif
+};
+
+static int __init nf_nat_proto_gre_init(void)
+{
+ return nf_nat_l4proto_register(NFPROTO_IPV4, &gre);
+}
+
+static void __exit nf_nat_proto_gre_fini(void)
+{
+ nf_nat_l4proto_unregister(NFPROTO_IPV4, &gre);
+}
+
+module_init(nf_nat_proto_gre_init);
+module_exit(nf_nat_proto_gre_fini);
+
+void nf_nat_need_gre(void)
+{
+ return;
+}
+EXPORT_SYMBOL_GPL(nf_nat_need_gre);
diff --git a/net/ipv4/netfilter/nf_nat_proto_icmp.c b/net/ipv4/netfilter/nf_nat_proto_icmp.c
new file mode 100644
index 000000000..4557b4ab8
--- /dev/null
+++ b/net/ipv4/netfilter/nf_nat_proto_icmp.c
@@ -0,0 +1,83 @@
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/export.h>
+#include <linux/ip.h>
+#include <linux/icmp.h>
+
+#include <linux/netfilter.h>
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat_core.h>
+#include <net/netfilter/nf_nat_l4proto.h>
+
+static bool
+icmp_in_range(const struct nf_conntrack_tuple *tuple,
+ enum nf_nat_manip_type maniptype,
+ const union nf_conntrack_man_proto *min,
+ const union nf_conntrack_man_proto *max)
+{
+ return ntohs(tuple->src.u.icmp.id) >= ntohs(min->icmp.id) &&
+ ntohs(tuple->src.u.icmp.id) <= ntohs(max->icmp.id);
+}
+
+static void
+icmp_unique_tuple(const struct nf_nat_l3proto *l3proto,
+ struct nf_conntrack_tuple *tuple,
+ const struct nf_nat_range *range,
+ enum nf_nat_manip_type maniptype,
+ const struct nf_conn *ct)
+{
+ static u_int16_t id;
+ unsigned int range_size;
+ unsigned int i;
+
+ range_size = ntohs(range->max_proto.icmp.id) -
+ ntohs(range->min_proto.icmp.id) + 1;
+ /* If no range specified... */
+ if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED))
+ range_size = 0xFFFF;
+
+ for (i = 0; ; ++id) {
+ tuple->src.u.icmp.id = htons(ntohs(range->min_proto.icmp.id) +
+ (id % range_size));
+ if (++i == range_size || !nf_nat_used_tuple(tuple, ct))
+ return;
+ }
+ return;
+}
+
+static bool
+icmp_manip_pkt(struct sk_buff *skb,
+ const struct nf_nat_l3proto *l3proto,
+ unsigned int iphdroff, unsigned int hdroff,
+ const struct nf_conntrack_tuple *tuple,
+ enum nf_nat_manip_type maniptype)
+{
+ struct icmphdr *hdr;
+
+ if (!skb_make_writable(skb, hdroff + sizeof(*hdr)))
+ return false;
+
+ hdr = (struct icmphdr *)(skb->data + hdroff);
+ inet_proto_csum_replace2(&hdr->checksum, skb,
+ hdr->un.echo.id, tuple->src.u.icmp.id, 0);
+ hdr->un.echo.id = tuple->src.u.icmp.id;
+ return true;
+}
+
+const struct nf_nat_l4proto nf_nat_l4proto_icmp = {
+ .l4proto = IPPROTO_ICMP,
+ .manip_pkt = icmp_manip_pkt,
+ .in_range = icmp_in_range,
+ .unique_tuple = icmp_unique_tuple,
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
+ .nlattr_to_range = nf_nat_l4proto_nlattr_to_range,
+#endif
+};
diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic.c b/net/ipv4/netfilter/nf_nat_snmp_basic.c
new file mode 100644
index 000000000..7c6766713
--- /dev/null
+++ b/net/ipv4/netfilter/nf_nat_snmp_basic.c
@@ -0,0 +1,1313 @@
+/*
+ * nf_nat_snmp_basic.c
+ *
+ * Basic SNMP Application Layer Gateway
+ *
+ * This IP NAT module is intended for use with SNMP network
+ * discovery and monitoring applications where target networks use
+ * conflicting private address realms.
+ *
+ * Static NAT is used to remap the networks from the view of the network
+ * management system at the IP layer, and this module remaps some application
+ * layer addresses to match.
+ *
+ * The simplest form of ALG is performed, where only tagged IP addresses
+ * are modified. The module does not need to be MIB aware and only scans
+ * messages at the ASN.1/BER level.
+ *
+ * Currently, only SNMPv1 and SNMPv2 are supported.
+ *
+ * More information on ALG and associated issues can be found in
+ * RFC 2962
+ *
+ * The ASB.1/BER parsing code is derived from the gxsnmp package by Gregory
+ * McLean & Jochen Friedrich, stripped down for use in the kernel.
+ *
+ * Copyright (c) 2000 RP Internet (www.rpi.net.au).
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: James Morris <jmorris@intercode.com.au>
+ *
+ * Copyright (c) 2006-2010 Patrick McHardy <kaber@trash.net>
+ */
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/udp.h>
+#include <net/checksum.h>
+#include <net/udp.h>
+
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_nat_helper.h>
+#include <linux/netfilter/nf_conntrack_snmp.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>");
+MODULE_DESCRIPTION("Basic SNMP Application Layer Gateway");
+MODULE_ALIAS("ip_nat_snmp_basic");
+
+#define SNMP_PORT 161
+#define SNMP_TRAP_PORT 162
+#define NOCT1(n) (*(u8 *)(n))
+
+static int debug;
+static DEFINE_SPINLOCK(snmp_lock);
+
+/*
+ * Application layer address mapping mimics the NAT mapping, but
+ * only for the first octet in this case (a more flexible system
+ * can be implemented if needed).
+ */
+struct oct1_map
+{
+ u_int8_t from;
+ u_int8_t to;
+};
+
+
+/*****************************************************************************
+ *
+ * Basic ASN.1 decoding routines (gxsnmp author Dirk Wisse)
+ *
+ *****************************************************************************/
+
+/* Class */
+#define ASN1_UNI 0 /* Universal */
+#define ASN1_APL 1 /* Application */
+#define ASN1_CTX 2 /* Context */
+#define ASN1_PRV 3 /* Private */
+
+/* Tag */
+#define ASN1_EOC 0 /* End Of Contents */
+#define ASN1_BOL 1 /* Boolean */
+#define ASN1_INT 2 /* Integer */
+#define ASN1_BTS 3 /* Bit String */
+#define ASN1_OTS 4 /* Octet String */
+#define ASN1_NUL 5 /* Null */
+#define ASN1_OJI 6 /* Object Identifier */
+#define ASN1_OJD 7 /* Object Description */
+#define ASN1_EXT 8 /* External */
+#define ASN1_SEQ 16 /* Sequence */
+#define ASN1_SET 17 /* Set */
+#define ASN1_NUMSTR 18 /* Numerical String */
+#define ASN1_PRNSTR 19 /* Printable String */
+#define ASN1_TEXSTR 20 /* Teletext String */
+#define ASN1_VIDSTR 21 /* Video String */
+#define ASN1_IA5STR 22 /* IA5 String */
+#define ASN1_UNITIM 23 /* Universal Time */
+#define ASN1_GENTIM 24 /* General Time */
+#define ASN1_GRASTR 25 /* Graphical String */
+#define ASN1_VISSTR 26 /* Visible String */
+#define ASN1_GENSTR 27 /* General String */
+
+/* Primitive / Constructed methods*/
+#define ASN1_PRI 0 /* Primitive */
+#define ASN1_CON 1 /* Constructed */
+
+/*
+ * Error codes.
+ */
+#define ASN1_ERR_NOERROR 0
+#define ASN1_ERR_DEC_EMPTY 2
+#define ASN1_ERR_DEC_EOC_MISMATCH 3
+#define ASN1_ERR_DEC_LENGTH_MISMATCH 4
+#define ASN1_ERR_DEC_BADVALUE 5
+
+/*
+ * ASN.1 context.
+ */
+struct asn1_ctx
+{
+ int error; /* Error condition */
+ unsigned char *pointer; /* Octet just to be decoded */
+ unsigned char *begin; /* First octet */
+ unsigned char *end; /* Octet after last octet */
+};
+
+/*
+ * Octet string (not null terminated)
+ */
+struct asn1_octstr
+{
+ unsigned char *data;
+ unsigned int len;
+};
+
+static void asn1_open(struct asn1_ctx *ctx,
+ unsigned char *buf,
+ unsigned int len)
+{
+ ctx->begin = buf;
+ ctx->end = buf + len;
+ ctx->pointer = buf;
+ ctx->error = ASN1_ERR_NOERROR;
+}
+
+static unsigned char asn1_octet_decode(struct asn1_ctx *ctx, unsigned char *ch)
+{
+ if (ctx->pointer >= ctx->end) {
+ ctx->error = ASN1_ERR_DEC_EMPTY;
+ return 0;
+ }
+ *ch = *(ctx->pointer)++;
+ return 1;
+}
+
+static unsigned char asn1_tag_decode(struct asn1_ctx *ctx, unsigned int *tag)
+{
+ unsigned char ch;
+
+ *tag = 0;
+
+ do
+ {
+ if (!asn1_octet_decode(ctx, &ch))
+ return 0;
+ *tag <<= 7;
+ *tag |= ch & 0x7F;
+ } while ((ch & 0x80) == 0x80);
+ return 1;
+}
+
+static unsigned char asn1_id_decode(struct asn1_ctx *ctx,
+ unsigned int *cls,
+ unsigned int *con,
+ unsigned int *tag)
+{
+ unsigned char ch;
+
+ if (!asn1_octet_decode(ctx, &ch))
+ return 0;
+
+ *cls = (ch & 0xC0) >> 6;
+ *con = (ch & 0x20) >> 5;
+ *tag = (ch & 0x1F);
+
+ if (*tag == 0x1F) {
+ if (!asn1_tag_decode(ctx, tag))
+ return 0;
+ }
+ return 1;
+}
+
+static unsigned char asn1_length_decode(struct asn1_ctx *ctx,
+ unsigned int *def,
+ unsigned int *len)
+{
+ unsigned char ch, cnt;
+
+ if (!asn1_octet_decode(ctx, &ch))
+ return 0;
+
+ if (ch == 0x80)
+ *def = 0;
+ else {
+ *def = 1;
+
+ if (ch < 0x80)
+ *len = ch;
+ else {
+ cnt = ch & 0x7F;
+ *len = 0;
+
+ while (cnt > 0) {
+ if (!asn1_octet_decode(ctx, &ch))
+ return 0;
+ *len <<= 8;
+ *len |= ch;
+ cnt--;
+ }
+ }
+ }
+
+ /* don't trust len bigger than ctx buffer */
+ if (*len > ctx->end - ctx->pointer)
+ return 0;
+
+ return 1;
+}
+
+static unsigned char asn1_header_decode(struct asn1_ctx *ctx,
+ unsigned char **eoc,
+ unsigned int *cls,
+ unsigned int *con,
+ unsigned int *tag)
+{
+ unsigned int def, len;
+
+ if (!asn1_id_decode(ctx, cls, con, tag))
+ return 0;
+
+ def = len = 0;
+ if (!asn1_length_decode(ctx, &def, &len))
+ return 0;
+
+ /* primitive shall be definite, indefinite shall be constructed */
+ if (*con == ASN1_PRI && !def)
+ return 0;
+
+ if (def)
+ *eoc = ctx->pointer + len;
+ else
+ *eoc = NULL;
+ return 1;
+}
+
+static unsigned char asn1_eoc_decode(struct asn1_ctx *ctx, unsigned char *eoc)
+{
+ unsigned char ch;
+
+ if (eoc == NULL) {
+ if (!asn1_octet_decode(ctx, &ch))
+ return 0;
+
+ if (ch != 0x00) {
+ ctx->error = ASN1_ERR_DEC_EOC_MISMATCH;
+ return 0;
+ }
+
+ if (!asn1_octet_decode(ctx, &ch))
+ return 0;
+
+ if (ch != 0x00) {
+ ctx->error = ASN1_ERR_DEC_EOC_MISMATCH;
+ return 0;
+ }
+ return 1;
+ } else {
+ if (ctx->pointer != eoc) {
+ ctx->error = ASN1_ERR_DEC_LENGTH_MISMATCH;
+ return 0;
+ }
+ return 1;
+ }
+}
+
+static unsigned char asn1_null_decode(struct asn1_ctx *ctx, unsigned char *eoc)
+{
+ ctx->pointer = eoc;
+ return 1;
+}
+
+static unsigned char asn1_long_decode(struct asn1_ctx *ctx,
+ unsigned char *eoc,
+ long *integer)
+{
+ unsigned char ch;
+ unsigned int len;
+
+ if (!asn1_octet_decode(ctx, &ch))
+ return 0;
+
+ *integer = (signed char) ch;
+ len = 1;
+
+ while (ctx->pointer < eoc) {
+ if (++len > sizeof (long)) {
+ ctx->error = ASN1_ERR_DEC_BADVALUE;
+ return 0;
+ }
+
+ if (!asn1_octet_decode(ctx, &ch))
+ return 0;
+
+ *integer <<= 8;
+ *integer |= ch;
+ }
+ return 1;
+}
+
+static unsigned char asn1_uint_decode(struct asn1_ctx *ctx,
+ unsigned char *eoc,
+ unsigned int *integer)
+{
+ unsigned char ch;
+ unsigned int len;
+
+ if (!asn1_octet_decode(ctx, &ch))
+ return 0;
+
+ *integer = ch;
+ if (ch == 0) len = 0;
+ else len = 1;
+
+ while (ctx->pointer < eoc) {
+ if (++len > sizeof (unsigned int)) {
+ ctx->error = ASN1_ERR_DEC_BADVALUE;
+ return 0;
+ }
+
+ if (!asn1_octet_decode(ctx, &ch))
+ return 0;
+
+ *integer <<= 8;
+ *integer |= ch;
+ }
+ return 1;
+}
+
+static unsigned char asn1_ulong_decode(struct asn1_ctx *ctx,
+ unsigned char *eoc,
+ unsigned long *integer)
+{
+ unsigned char ch;
+ unsigned int len;
+
+ if (!asn1_octet_decode(ctx, &ch))
+ return 0;
+
+ *integer = ch;
+ if (ch == 0) len = 0;
+ else len = 1;
+
+ while (ctx->pointer < eoc) {
+ if (++len > sizeof (unsigned long)) {
+ ctx->error = ASN1_ERR_DEC_BADVALUE;
+ return 0;
+ }
+
+ if (!asn1_octet_decode(ctx, &ch))
+ return 0;
+
+ *integer <<= 8;
+ *integer |= ch;
+ }
+ return 1;
+}
+
+static unsigned char asn1_octets_decode(struct asn1_ctx *ctx,
+ unsigned char *eoc,
+ unsigned char **octets,
+ unsigned int *len)
+{
+ unsigned char *ptr;
+
+ *len = 0;
+
+ *octets = kmalloc(eoc - ctx->pointer, GFP_ATOMIC);
+ if (*octets == NULL)
+ return 0;
+
+ ptr = *octets;
+ while (ctx->pointer < eoc) {
+ if (!asn1_octet_decode(ctx, ptr++)) {
+ kfree(*octets);
+ *octets = NULL;
+ return 0;
+ }
+ (*len)++;
+ }
+ return 1;
+}
+
+static unsigned char asn1_subid_decode(struct asn1_ctx *ctx,
+ unsigned long *subid)
+{
+ unsigned char ch;
+
+ *subid = 0;
+
+ do {
+ if (!asn1_octet_decode(ctx, &ch))
+ return 0;
+
+ *subid <<= 7;
+ *subid |= ch & 0x7F;
+ } while ((ch & 0x80) == 0x80);
+ return 1;
+}
+
+static unsigned char asn1_oid_decode(struct asn1_ctx *ctx,
+ unsigned char *eoc,
+ unsigned long **oid,
+ unsigned int *len)
+{
+ unsigned long subid;
+ unsigned long *optr;
+ size_t size;
+
+ size = eoc - ctx->pointer + 1;
+
+ /* first subid actually encodes first two subids */
+ if (size < 2 || size > ULONG_MAX/sizeof(unsigned long))
+ return 0;
+
+ *oid = kmalloc(size * sizeof(unsigned long), GFP_ATOMIC);
+ if (*oid == NULL)
+ return 0;
+
+ optr = *oid;
+
+ if (!asn1_subid_decode(ctx, &subid)) {
+ kfree(*oid);
+ *oid = NULL;
+ return 0;
+ }
+
+ if (subid < 40) {
+ optr[0] = 0;
+ optr[1] = subid;
+ } else if (subid < 80) {
+ optr[0] = 1;
+ optr[1] = subid - 40;
+ } else {
+ optr[0] = 2;
+ optr[1] = subid - 80;
+ }
+
+ *len = 2;
+ optr += 2;
+
+ while (ctx->pointer < eoc) {
+ if (++(*len) > size) {
+ ctx->error = ASN1_ERR_DEC_BADVALUE;
+ kfree(*oid);
+ *oid = NULL;
+ return 0;
+ }
+
+ if (!asn1_subid_decode(ctx, optr++)) {
+ kfree(*oid);
+ *oid = NULL;
+ return 0;
+ }
+ }
+ return 1;
+}
+
+/*****************************************************************************
+ *
+ * SNMP decoding routines (gxsnmp author Dirk Wisse)
+ *
+ *****************************************************************************/
+
+/* SNMP Versions */
+#define SNMP_V1 0
+#define SNMP_V2C 1
+#define SNMP_V2 2
+#define SNMP_V3 3
+
+/* Default Sizes */
+#define SNMP_SIZE_COMM 256
+#define SNMP_SIZE_OBJECTID 128
+#define SNMP_SIZE_BUFCHR 256
+#define SNMP_SIZE_BUFINT 128
+#define SNMP_SIZE_SMALLOBJECTID 16
+
+/* Requests */
+#define SNMP_PDU_GET 0
+#define SNMP_PDU_NEXT 1
+#define SNMP_PDU_RESPONSE 2
+#define SNMP_PDU_SET 3
+#define SNMP_PDU_TRAP1 4
+#define SNMP_PDU_BULK 5
+#define SNMP_PDU_INFORM 6
+#define SNMP_PDU_TRAP2 7
+
+/* Errors */
+#define SNMP_NOERROR 0
+#define SNMP_TOOBIG 1
+#define SNMP_NOSUCHNAME 2
+#define SNMP_BADVALUE 3
+#define SNMP_READONLY 4
+#define SNMP_GENERROR 5
+#define SNMP_NOACCESS 6
+#define SNMP_WRONGTYPE 7
+#define SNMP_WRONGLENGTH 8
+#define SNMP_WRONGENCODING 9
+#define SNMP_WRONGVALUE 10
+#define SNMP_NOCREATION 11
+#define SNMP_INCONSISTENTVALUE 12
+#define SNMP_RESOURCEUNAVAILABLE 13
+#define SNMP_COMMITFAILED 14
+#define SNMP_UNDOFAILED 15
+#define SNMP_AUTHORIZATIONERROR 16
+#define SNMP_NOTWRITABLE 17
+#define SNMP_INCONSISTENTNAME 18
+
+/* General SNMP V1 Traps */
+#define SNMP_TRAP_COLDSTART 0
+#define SNMP_TRAP_WARMSTART 1
+#define SNMP_TRAP_LINKDOWN 2
+#define SNMP_TRAP_LINKUP 3
+#define SNMP_TRAP_AUTFAILURE 4
+#define SNMP_TRAP_EQPNEIGHBORLOSS 5
+#define SNMP_TRAP_ENTSPECIFIC 6
+
+/* SNMPv1 Types */
+#define SNMP_NULL 0
+#define SNMP_INTEGER 1 /* l */
+#define SNMP_OCTETSTR 2 /* c */
+#define SNMP_DISPLAYSTR 2 /* c */
+#define SNMP_OBJECTID 3 /* ul */
+#define SNMP_IPADDR 4 /* uc */
+#define SNMP_COUNTER 5 /* ul */
+#define SNMP_GAUGE 6 /* ul */
+#define SNMP_TIMETICKS 7 /* ul */
+#define SNMP_OPAQUE 8 /* c */
+
+/* Additional SNMPv2 Types */
+#define SNMP_UINTEGER 5 /* ul */
+#define SNMP_BITSTR 9 /* uc */
+#define SNMP_NSAP 10 /* uc */
+#define SNMP_COUNTER64 11 /* ul */
+#define SNMP_NOSUCHOBJECT 12
+#define SNMP_NOSUCHINSTANCE 13
+#define SNMP_ENDOFMIBVIEW 14
+
+union snmp_syntax
+{
+ unsigned char uc[0]; /* 8 bit unsigned */
+ char c[0]; /* 8 bit signed */
+ unsigned long ul[0]; /* 32 bit unsigned */
+ long l[0]; /* 32 bit signed */
+};
+
+struct snmp_object
+{
+ unsigned long *id;
+ unsigned int id_len;
+ unsigned short type;
+ unsigned int syntax_len;
+ union snmp_syntax syntax;
+};
+
+struct snmp_request
+{
+ unsigned long id;
+ unsigned int error_status;
+ unsigned int error_index;
+};
+
+struct snmp_v1_trap
+{
+ unsigned long *id;
+ unsigned int id_len;
+ unsigned long ip_address; /* pointer */
+ unsigned int general;
+ unsigned int specific;
+ unsigned long time;
+};
+
+/* SNMP types */
+#define SNMP_IPA 0
+#define SNMP_CNT 1
+#define SNMP_GGE 2
+#define SNMP_TIT 3
+#define SNMP_OPQ 4
+#define SNMP_C64 6
+
+/* SNMP errors */
+#define SERR_NSO 0
+#define SERR_NSI 1
+#define SERR_EOM 2
+
+static inline void mangle_address(unsigned char *begin,
+ unsigned char *addr,
+ const struct oct1_map *map,
+ __sum16 *check);
+struct snmp_cnv
+{
+ unsigned int class;
+ unsigned int tag;
+ int syntax;
+};
+
+static const struct snmp_cnv snmp_conv[] = {
+ {ASN1_UNI, ASN1_NUL, SNMP_NULL},
+ {ASN1_UNI, ASN1_INT, SNMP_INTEGER},
+ {ASN1_UNI, ASN1_OTS, SNMP_OCTETSTR},
+ {ASN1_UNI, ASN1_OTS, SNMP_DISPLAYSTR},
+ {ASN1_UNI, ASN1_OJI, SNMP_OBJECTID},
+ {ASN1_APL, SNMP_IPA, SNMP_IPADDR},
+ {ASN1_APL, SNMP_CNT, SNMP_COUNTER}, /* Counter32 */
+ {ASN1_APL, SNMP_GGE, SNMP_GAUGE}, /* Gauge32 == Unsigned32 */
+ {ASN1_APL, SNMP_TIT, SNMP_TIMETICKS},
+ {ASN1_APL, SNMP_OPQ, SNMP_OPAQUE},
+
+ /* SNMPv2 data types and errors */
+ {ASN1_UNI, ASN1_BTS, SNMP_BITSTR},
+ {ASN1_APL, SNMP_C64, SNMP_COUNTER64},
+ {ASN1_CTX, SERR_NSO, SNMP_NOSUCHOBJECT},
+ {ASN1_CTX, SERR_NSI, SNMP_NOSUCHINSTANCE},
+ {ASN1_CTX, SERR_EOM, SNMP_ENDOFMIBVIEW},
+ {0, 0, -1}
+};
+
+static unsigned char snmp_tag_cls2syntax(unsigned int tag,
+ unsigned int cls,
+ unsigned short *syntax)
+{
+ const struct snmp_cnv *cnv;
+
+ cnv = snmp_conv;
+
+ while (cnv->syntax != -1) {
+ if (cnv->tag == tag && cnv->class == cls) {
+ *syntax = cnv->syntax;
+ return 1;
+ }
+ cnv++;
+ }
+ return 0;
+}
+
+static unsigned char snmp_object_decode(struct asn1_ctx *ctx,
+ struct snmp_object **obj)
+{
+ unsigned int cls, con, tag, len, idlen;
+ unsigned short type;
+ unsigned char *eoc, *end, *p;
+ unsigned long *lp, *id;
+ unsigned long ul;
+ long l;
+
+ *obj = NULL;
+ id = NULL;
+
+ if (!asn1_header_decode(ctx, &eoc, &cls, &con, &tag))
+ return 0;
+
+ if (cls != ASN1_UNI || con != ASN1_CON || tag != ASN1_SEQ)
+ return 0;
+
+ if (!asn1_header_decode(ctx, &end, &cls, &con, &tag))
+ return 0;
+
+ if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_OJI)
+ return 0;
+
+ if (!asn1_oid_decode(ctx, end, &id, &idlen))
+ return 0;
+
+ if (!asn1_header_decode(ctx, &end, &cls, &con, &tag)) {
+ kfree(id);
+ return 0;
+ }
+
+ if (con != ASN1_PRI) {
+ kfree(id);
+ return 0;
+ }
+
+ type = 0;
+ if (!snmp_tag_cls2syntax(tag, cls, &type)) {
+ kfree(id);
+ return 0;
+ }
+
+ l = 0;
+ switch (type) {
+ case SNMP_INTEGER:
+ len = sizeof(long);
+ if (!asn1_long_decode(ctx, end, &l)) {
+ kfree(id);
+ return 0;
+ }
+ *obj = kmalloc(sizeof(struct snmp_object) + len, GFP_ATOMIC);
+ if (*obj == NULL) {
+ kfree(id);
+ return 0;
+ }
+ (*obj)->syntax.l[0] = l;
+ break;
+ case SNMP_OCTETSTR:
+ case SNMP_OPAQUE:
+ if (!asn1_octets_decode(ctx, end, &p, &len)) {
+ kfree(id);
+ return 0;
+ }
+ *obj = kmalloc(sizeof(struct snmp_object) + len, GFP_ATOMIC);
+ if (*obj == NULL) {
+ kfree(p);
+ kfree(id);
+ return 0;
+ }
+ memcpy((*obj)->syntax.c, p, len);
+ kfree(p);
+ break;
+ case SNMP_NULL:
+ case SNMP_NOSUCHOBJECT:
+ case SNMP_NOSUCHINSTANCE:
+ case SNMP_ENDOFMIBVIEW:
+ len = 0;
+ *obj = kmalloc(sizeof(struct snmp_object), GFP_ATOMIC);
+ if (*obj == NULL) {
+ kfree(id);
+ return 0;
+ }
+ if (!asn1_null_decode(ctx, end)) {
+ kfree(id);
+ kfree(*obj);
+ *obj = NULL;
+ return 0;
+ }
+ break;
+ case SNMP_OBJECTID:
+ if (!asn1_oid_decode(ctx, end, &lp, &len)) {
+ kfree(id);
+ return 0;
+ }
+ len *= sizeof(unsigned long);
+ *obj = kmalloc(sizeof(struct snmp_object) + len, GFP_ATOMIC);
+ if (*obj == NULL) {
+ kfree(lp);
+ kfree(id);
+ return 0;
+ }
+ memcpy((*obj)->syntax.ul, lp, len);
+ kfree(lp);
+ break;
+ case SNMP_IPADDR:
+ if (!asn1_octets_decode(ctx, end, &p, &len)) {
+ kfree(id);
+ return 0;
+ }
+ if (len != 4) {
+ kfree(p);
+ kfree(id);
+ return 0;
+ }
+ *obj = kmalloc(sizeof(struct snmp_object) + len, GFP_ATOMIC);
+ if (*obj == NULL) {
+ kfree(p);
+ kfree(id);
+ return 0;
+ }
+ memcpy((*obj)->syntax.uc, p, len);
+ kfree(p);
+ break;
+ case SNMP_COUNTER:
+ case SNMP_GAUGE:
+ case SNMP_TIMETICKS:
+ len = sizeof(unsigned long);
+ if (!asn1_ulong_decode(ctx, end, &ul)) {
+ kfree(id);
+ return 0;
+ }
+ *obj = kmalloc(sizeof(struct snmp_object) + len, GFP_ATOMIC);
+ if (*obj == NULL) {
+ kfree(id);
+ return 0;
+ }
+ (*obj)->syntax.ul[0] = ul;
+ break;
+ default:
+ kfree(id);
+ return 0;
+ }
+
+ (*obj)->syntax_len = len;
+ (*obj)->type = type;
+ (*obj)->id = id;
+ (*obj)->id_len = idlen;
+
+ if (!asn1_eoc_decode(ctx, eoc)) {
+ kfree(id);
+ kfree(*obj);
+ *obj = NULL;
+ return 0;
+ }
+ return 1;
+}
+
+static unsigned char snmp_request_decode(struct asn1_ctx *ctx,
+ struct snmp_request *request)
+{
+ unsigned int cls, con, tag;
+ unsigned char *end;
+
+ if (!asn1_header_decode(ctx, &end, &cls, &con, &tag))
+ return 0;
+
+ if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_INT)
+ return 0;
+
+ if (!asn1_ulong_decode(ctx, end, &request->id))
+ return 0;
+
+ if (!asn1_header_decode(ctx, &end, &cls, &con, &tag))
+ return 0;
+
+ if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_INT)
+ return 0;
+
+ if (!asn1_uint_decode(ctx, end, &request->error_status))
+ return 0;
+
+ if (!asn1_header_decode(ctx, &end, &cls, &con, &tag))
+ return 0;
+
+ if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_INT)
+ return 0;
+
+ if (!asn1_uint_decode(ctx, end, &request->error_index))
+ return 0;
+
+ return 1;
+}
+
+/*
+ * Fast checksum update for possibly oddly-aligned UDP byte, from the
+ * code example in the draft.
+ */
+static void fast_csum(__sum16 *csum,
+ const unsigned char *optr,
+ const unsigned char *nptr,
+ int offset)
+{
+ unsigned char s[4];
+
+ if (offset & 1) {
+ s[0] = ~0;
+ s[1] = ~*optr;
+ s[2] = 0;
+ s[3] = *nptr;
+ } else {
+ s[0] = ~*optr;
+ s[1] = ~0;
+ s[2] = *nptr;
+ s[3] = 0;
+ }
+
+ *csum = csum_fold(csum_partial(s, 4, ~csum_unfold(*csum)));
+}
+
+/*
+ * Mangle IP address.
+ * - begin points to the start of the snmp messgae
+ * - addr points to the start of the address
+ */
+static inline void mangle_address(unsigned char *begin,
+ unsigned char *addr,
+ const struct oct1_map *map,
+ __sum16 *check)
+{
+ if (map->from == NOCT1(addr)) {
+ u_int32_t old;
+
+ if (debug)
+ memcpy(&old, addr, sizeof(old));
+
+ *addr = map->to;
+
+ /* Update UDP checksum if being used */
+ if (*check) {
+ fast_csum(check,
+ &map->from, &map->to, addr - begin);
+
+ }
+
+ if (debug)
+ printk(KERN_DEBUG "bsalg: mapped %pI4 to %pI4\n",
+ &old, addr);
+ }
+}
+
+static unsigned char snmp_trap_decode(struct asn1_ctx *ctx,
+ struct snmp_v1_trap *trap,
+ const struct oct1_map *map,
+ __sum16 *check)
+{
+ unsigned int cls, con, tag, len;
+ unsigned char *end;
+
+ if (!asn1_header_decode(ctx, &end, &cls, &con, &tag))
+ return 0;
+
+ if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_OJI)
+ return 0;
+
+ if (!asn1_oid_decode(ctx, end, &trap->id, &trap->id_len))
+ return 0;
+
+ if (!asn1_header_decode(ctx, &end, &cls, &con, &tag))
+ goto err_id_free;
+
+ if (!((cls == ASN1_APL && con == ASN1_PRI && tag == SNMP_IPA) ||
+ (cls == ASN1_UNI && con == ASN1_PRI && tag == ASN1_OTS)))
+ goto err_id_free;
+
+ if (!asn1_octets_decode(ctx, end, (unsigned char **)&trap->ip_address, &len))
+ goto err_id_free;
+
+ /* IPv4 only */
+ if (len != 4)
+ goto err_addr_free;
+
+ mangle_address(ctx->begin, ctx->pointer - 4, map, check);
+
+ if (!asn1_header_decode(ctx, &end, &cls, &con, &tag))
+ goto err_addr_free;
+
+ if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_INT)
+ goto err_addr_free;
+
+ if (!asn1_uint_decode(ctx, end, &trap->general))
+ goto err_addr_free;
+
+ if (!asn1_header_decode(ctx, &end, &cls, &con, &tag))
+ goto err_addr_free;
+
+ if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_INT)
+ goto err_addr_free;
+
+ if (!asn1_uint_decode(ctx, end, &trap->specific))
+ goto err_addr_free;
+
+ if (!asn1_header_decode(ctx, &end, &cls, &con, &tag))
+ goto err_addr_free;
+
+ if (!((cls == ASN1_APL && con == ASN1_PRI && tag == SNMP_TIT) ||
+ (cls == ASN1_UNI && con == ASN1_PRI && tag == ASN1_INT)))
+ goto err_addr_free;
+
+ if (!asn1_ulong_decode(ctx, end, &trap->time))
+ goto err_addr_free;
+
+ return 1;
+
+err_addr_free:
+ kfree((unsigned long *)trap->ip_address);
+
+err_id_free:
+ kfree(trap->id);
+
+ return 0;
+}
+
+/*****************************************************************************
+ *
+ * Misc. routines
+ *
+ *****************************************************************************/
+
+static void hex_dump(const unsigned char *buf, size_t len)
+{
+ size_t i;
+
+ for (i = 0; i < len; i++) {
+ if (i && !(i % 16))
+ printk("\n");
+ printk("%02x ", *(buf + i));
+ }
+ printk("\n");
+}
+
+/*
+ * Parse and mangle SNMP message according to mapping.
+ * (And this is the fucking 'basic' method).
+ */
+static int snmp_parse_mangle(unsigned char *msg,
+ u_int16_t len,
+ const struct oct1_map *map,
+ __sum16 *check)
+{
+ unsigned char *eoc, *end;
+ unsigned int cls, con, tag, vers, pdutype;
+ struct asn1_ctx ctx;
+ struct asn1_octstr comm;
+ struct snmp_object *obj;
+
+ if (debug > 1)
+ hex_dump(msg, len);
+
+ asn1_open(&ctx, msg, len);
+
+ /*
+ * Start of SNMP message.
+ */
+ if (!asn1_header_decode(&ctx, &eoc, &cls, &con, &tag))
+ return 0;
+ if (cls != ASN1_UNI || con != ASN1_CON || tag != ASN1_SEQ)
+ return 0;
+
+ /*
+ * Version 1 or 2 handled.
+ */
+ if (!asn1_header_decode(&ctx, &end, &cls, &con, &tag))
+ return 0;
+ if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_INT)
+ return 0;
+ if (!asn1_uint_decode (&ctx, end, &vers))
+ return 0;
+ if (debug > 1)
+ printk(KERN_DEBUG "bsalg: snmp version: %u\n", vers + 1);
+ if (vers > 1)
+ return 1;
+
+ /*
+ * Community.
+ */
+ if (!asn1_header_decode (&ctx, &end, &cls, &con, &tag))
+ return 0;
+ if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_OTS)
+ return 0;
+ if (!asn1_octets_decode(&ctx, end, &comm.data, &comm.len))
+ return 0;
+ if (debug > 1) {
+ unsigned int i;
+
+ printk(KERN_DEBUG "bsalg: community: ");
+ for (i = 0; i < comm.len; i++)
+ printk("%c", comm.data[i]);
+ printk("\n");
+ }
+ kfree(comm.data);
+
+ /*
+ * PDU type
+ */
+ if (!asn1_header_decode(&ctx, &eoc, &cls, &con, &pdutype))
+ return 0;
+ if (cls != ASN1_CTX || con != ASN1_CON)
+ return 0;
+ if (debug > 1) {
+ static const unsigned char *const pdus[] = {
+ [SNMP_PDU_GET] = "get",
+ [SNMP_PDU_NEXT] = "get-next",
+ [SNMP_PDU_RESPONSE] = "response",
+ [SNMP_PDU_SET] = "set",
+ [SNMP_PDU_TRAP1] = "trapv1",
+ [SNMP_PDU_BULK] = "bulk",
+ [SNMP_PDU_INFORM] = "inform",
+ [SNMP_PDU_TRAP2] = "trapv2"
+ };
+
+ if (pdutype > SNMP_PDU_TRAP2)
+ printk(KERN_DEBUG "bsalg: bad pdu type %u\n", pdutype);
+ else
+ printk(KERN_DEBUG "bsalg: pdu: %s\n", pdus[pdutype]);
+ }
+ if (pdutype != SNMP_PDU_RESPONSE &&
+ pdutype != SNMP_PDU_TRAP1 && pdutype != SNMP_PDU_TRAP2)
+ return 1;
+
+ /*
+ * Request header or v1 trap
+ */
+ if (pdutype == SNMP_PDU_TRAP1) {
+ struct snmp_v1_trap trap;
+ unsigned char ret = snmp_trap_decode(&ctx, &trap, map, check);
+
+ if (ret) {
+ kfree(trap.id);
+ kfree((unsigned long *)trap.ip_address);
+ } else
+ return ret;
+
+ } else {
+ struct snmp_request req;
+
+ if (!snmp_request_decode(&ctx, &req))
+ return 0;
+
+ if (debug > 1)
+ printk(KERN_DEBUG "bsalg: request: id=0x%lx error_status=%u "
+ "error_index=%u\n", req.id, req.error_status,
+ req.error_index);
+ }
+
+ /*
+ * Loop through objects, look for IP addresses to mangle.
+ */
+ if (!asn1_header_decode(&ctx, &eoc, &cls, &con, &tag))
+ return 0;
+
+ if (cls != ASN1_UNI || con != ASN1_CON || tag != ASN1_SEQ)
+ return 0;
+
+ while (!asn1_eoc_decode(&ctx, eoc)) {
+ unsigned int i;
+
+ if (!snmp_object_decode(&ctx, &obj)) {
+ if (obj) {
+ kfree(obj->id);
+ kfree(obj);
+ }
+ return 0;
+ }
+
+ if (debug > 1) {
+ printk(KERN_DEBUG "bsalg: object: ");
+ for (i = 0; i < obj->id_len; i++) {
+ if (i > 0)
+ printk(".");
+ printk("%lu", obj->id[i]);
+ }
+ printk(": type=%u\n", obj->type);
+
+ }
+
+ if (obj->type == SNMP_IPADDR)
+ mangle_address(ctx.begin, ctx.pointer - 4 , map, check);
+
+ kfree(obj->id);
+ kfree(obj);
+ }
+
+ if (!asn1_eoc_decode(&ctx, eoc))
+ return 0;
+
+ return 1;
+}
+
+/*****************************************************************************
+ *
+ * NAT routines.
+ *
+ *****************************************************************************/
+
+/*
+ * SNMP translation routine.
+ */
+static int snmp_translate(struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ struct sk_buff *skb)
+{
+ struct iphdr *iph = ip_hdr(skb);
+ struct udphdr *udph = (struct udphdr *)((__be32 *)iph + iph->ihl);
+ u_int16_t udplen = ntohs(udph->len);
+ u_int16_t paylen = udplen - sizeof(struct udphdr);
+ int dir = CTINFO2DIR(ctinfo);
+ struct oct1_map map;
+
+ /*
+ * Determine mappping for application layer addresses based
+ * on NAT manipulations for the packet.
+ */
+ if (dir == IP_CT_DIR_ORIGINAL) {
+ /* SNAT traps */
+ map.from = NOCT1(&ct->tuplehash[dir].tuple.src.u3.ip);
+ map.to = NOCT1(&ct->tuplehash[!dir].tuple.dst.u3.ip);
+ } else {
+ /* DNAT replies */
+ map.from = NOCT1(&ct->tuplehash[!dir].tuple.src.u3.ip);
+ map.to = NOCT1(&ct->tuplehash[dir].tuple.dst.u3.ip);
+ }
+
+ if (map.from == map.to)
+ return NF_ACCEPT;
+
+ if (!snmp_parse_mangle((unsigned char *)udph + sizeof(struct udphdr),
+ paylen, &map, &udph->check)) {
+ net_warn_ratelimited("bsalg: parser failed\n");
+ return NF_DROP;
+ }
+ return NF_ACCEPT;
+}
+
+/* We don't actually set up expectations, just adjust internal IP
+ * addresses if this is being NATted */
+static int help(struct sk_buff *skb, unsigned int protoff,
+ struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo)
+{
+ int dir = CTINFO2DIR(ctinfo);
+ unsigned int ret;
+ const struct iphdr *iph = ip_hdr(skb);
+ const struct udphdr *udph = (struct udphdr *)((__be32 *)iph + iph->ihl);
+
+ /* SNMP replies and originating SNMP traps get mangled */
+ if (udph->source == htons(SNMP_PORT) && dir != IP_CT_DIR_REPLY)
+ return NF_ACCEPT;
+ if (udph->dest == htons(SNMP_TRAP_PORT) && dir != IP_CT_DIR_ORIGINAL)
+ return NF_ACCEPT;
+
+ /* No NAT? */
+ if (!(ct->status & IPS_NAT_MASK))
+ return NF_ACCEPT;
+
+ /*
+ * Make sure the packet length is ok. So far, we were only guaranteed
+ * to have a valid length IP header plus 8 bytes, which means we have
+ * enough room for a UDP header. Just verify the UDP length field so we
+ * can mess around with the payload.
+ */
+ if (ntohs(udph->len) != skb->len - (iph->ihl << 2)) {
+ net_warn_ratelimited("SNMP: dropping malformed packet src=%pI4 dst=%pI4\n",
+ &iph->saddr, &iph->daddr);
+ return NF_DROP;
+ }
+
+ if (!skb_make_writable(skb, skb->len))
+ return NF_DROP;
+
+ spin_lock_bh(&snmp_lock);
+ ret = snmp_translate(ct, ctinfo, skb);
+ spin_unlock_bh(&snmp_lock);
+ return ret;
+}
+
+static const struct nf_conntrack_expect_policy snmp_exp_policy = {
+ .max_expected = 0,
+ .timeout = 180,
+};
+
+static struct nf_conntrack_helper snmp_helper __read_mostly = {
+ .me = THIS_MODULE,
+ .help = help,
+ .expect_policy = &snmp_exp_policy,
+ .name = "snmp",
+ .tuple.src.l3num = AF_INET,
+ .tuple.src.u.udp.port = cpu_to_be16(SNMP_PORT),
+ .tuple.dst.protonum = IPPROTO_UDP,
+};
+
+static struct nf_conntrack_helper snmp_trap_helper __read_mostly = {
+ .me = THIS_MODULE,
+ .help = help,
+ .expect_policy = &snmp_exp_policy,
+ .name = "snmp_trap",
+ .tuple.src.l3num = AF_INET,
+ .tuple.src.u.udp.port = cpu_to_be16(SNMP_TRAP_PORT),
+ .tuple.dst.protonum = IPPROTO_UDP,
+};
+
+/*****************************************************************************
+ *
+ * Module stuff.
+ *
+ *****************************************************************************/
+
+static int __init nf_nat_snmp_basic_init(void)
+{
+ int ret = 0;
+
+ BUG_ON(nf_nat_snmp_hook != NULL);
+ RCU_INIT_POINTER(nf_nat_snmp_hook, help);
+
+ ret = nf_conntrack_helper_register(&snmp_trap_helper);
+ if (ret < 0) {
+ nf_conntrack_helper_unregister(&snmp_helper);
+ return ret;
+ }
+ return ret;
+}
+
+static void __exit nf_nat_snmp_basic_fini(void)
+{
+ RCU_INIT_POINTER(nf_nat_snmp_hook, NULL);
+ nf_conntrack_helper_unregister(&snmp_trap_helper);
+}
+
+module_init(nf_nat_snmp_basic_init);
+module_exit(nf_nat_snmp_basic_fini);
+
+module_param(debug, int, 0600);
diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c b/net/ipv4/netfilter/nf_reject_ipv4.c
new file mode 100644
index 000000000..3262e41ff
--- /dev/null
+++ b/net/ipv4/netfilter/nf_reject_ipv4.c
@@ -0,0 +1,192 @@
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <net/ip.h>
+#include <net/tcp.h>
+#include <net/route.h>
+#include <net/dst.h>
+#include <net/netfilter/ipv4/nf_reject.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter_bridge.h>
+#include <net/netfilter/ipv4/nf_reject.h>
+
+const struct tcphdr *nf_reject_ip_tcphdr_get(struct sk_buff *oldskb,
+ struct tcphdr *_oth, int hook)
+{
+ const struct tcphdr *oth;
+
+ /* IP header checks: fragment. */
+ if (ip_hdr(oldskb)->frag_off & htons(IP_OFFSET))
+ return NULL;
+
+ oth = skb_header_pointer(oldskb, ip_hdrlen(oldskb),
+ sizeof(struct tcphdr), _oth);
+ if (oth == NULL)
+ return NULL;
+
+ /* No RST for RST. */
+ if (oth->rst)
+ return NULL;
+
+ /* Check checksum */
+ if (nf_ip_checksum(oldskb, hook, ip_hdrlen(oldskb), IPPROTO_TCP))
+ return NULL;
+
+ return oth;
+}
+EXPORT_SYMBOL_GPL(nf_reject_ip_tcphdr_get);
+
+struct iphdr *nf_reject_iphdr_put(struct sk_buff *nskb,
+ const struct sk_buff *oldskb,
+ __u8 protocol, int ttl)
+{
+ struct iphdr *niph, *oiph = ip_hdr(oldskb);
+
+ skb_reset_network_header(nskb);
+ niph = (struct iphdr *)skb_put(nskb, sizeof(struct iphdr));
+ niph->version = 4;
+ niph->ihl = sizeof(struct iphdr) / 4;
+ niph->tos = 0;
+ niph->id = 0;
+ niph->frag_off = htons(IP_DF);
+ niph->protocol = protocol;
+ niph->check = 0;
+ niph->saddr = oiph->daddr;
+ niph->daddr = oiph->saddr;
+ niph->ttl = ttl;
+
+ nskb->protocol = htons(ETH_P_IP);
+
+ return niph;
+}
+EXPORT_SYMBOL_GPL(nf_reject_iphdr_put);
+
+void nf_reject_ip_tcphdr_put(struct sk_buff *nskb, const struct sk_buff *oldskb,
+ const struct tcphdr *oth)
+{
+ struct iphdr *niph = ip_hdr(nskb);
+ struct tcphdr *tcph;
+
+ skb_reset_transport_header(nskb);
+ tcph = (struct tcphdr *)skb_put(nskb, sizeof(struct tcphdr));
+ memset(tcph, 0, sizeof(*tcph));
+ tcph->source = oth->dest;
+ tcph->dest = oth->source;
+ tcph->doff = sizeof(struct tcphdr) / 4;
+
+ if (oth->ack) {
+ tcph->seq = oth->ack_seq;
+ } else {
+ tcph->ack_seq = htonl(ntohl(oth->seq) + oth->syn + oth->fin +
+ oldskb->len - ip_hdrlen(oldskb) -
+ (oth->doff << 2));
+ tcph->ack = 1;
+ }
+
+ tcph->rst = 1;
+ tcph->check = ~tcp_v4_check(sizeof(struct tcphdr), niph->saddr,
+ niph->daddr, 0);
+ nskb->ip_summed = CHECKSUM_PARTIAL;
+ nskb->csum_start = (unsigned char *)tcph - nskb->head;
+ nskb->csum_offset = offsetof(struct tcphdr, check);
+}
+EXPORT_SYMBOL_GPL(nf_reject_ip_tcphdr_put);
+
+/* Send RST reply */
+void nf_send_reset(struct sk_buff *oldskb, int hook)
+{
+ struct sk_buff *nskb;
+ const struct iphdr *oiph;
+ struct iphdr *niph;
+ const struct tcphdr *oth;
+ struct tcphdr _oth;
+
+ oth = nf_reject_ip_tcphdr_get(oldskb, &_oth, hook);
+ if (!oth)
+ return;
+
+ if (skb_rtable(oldskb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
+ return;
+
+ oiph = ip_hdr(oldskb);
+
+ nskb = alloc_skb(sizeof(struct iphdr) + sizeof(struct tcphdr) +
+ LL_MAX_HEADER, GFP_ATOMIC);
+ if (!nskb)
+ return;
+
+ /* ip_route_me_harder expects skb->dst to be set */
+ skb_dst_set_noref(nskb, skb_dst(oldskb));
+
+ skb_reserve(nskb, LL_MAX_HEADER);
+ niph = nf_reject_iphdr_put(nskb, oldskb, IPPROTO_TCP,
+ ip4_dst_hoplimit(skb_dst(nskb)));
+ nf_reject_ip_tcphdr_put(nskb, oldskb, oth);
+
+ if (ip_route_me_harder(nskb, RTN_UNSPEC))
+ goto free_nskb;
+
+ /* "Never happens" */
+ if (nskb->len > dst_mtu(skb_dst(nskb)))
+ goto free_nskb;
+
+ nf_ct_attach(nskb, oldskb);
+
+#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
+ /* If we use ip_local_out for bridged traffic, the MAC source on
+ * the RST will be ours, instead of the destination's. This confuses
+ * some routers/firewalls, and they drop the packet. So we need to
+ * build the eth header using the original destination's MAC as the
+ * source, and send the RST packet directly.
+ */
+ if (oldskb->nf_bridge) {
+ struct ethhdr *oeth = eth_hdr(oldskb);
+
+ nskb->dev = nf_bridge_get_physindev(oldskb);
+ niph->tot_len = htons(nskb->len);
+ ip_send_check(niph);
+ if (dev_hard_header(nskb, nskb->dev, ntohs(nskb->protocol),
+ oeth->h_source, oeth->h_dest, nskb->len) < 0)
+ goto free_nskb;
+ dev_queue_xmit(nskb);
+ } else
+#endif
+ ip_local_out(nskb);
+
+ return;
+
+ free_nskb:
+ kfree_skb(nskb);
+}
+EXPORT_SYMBOL_GPL(nf_send_reset);
+
+void nf_send_unreach(struct sk_buff *skb_in, int code, int hook)
+{
+ struct iphdr *iph = ip_hdr(skb_in);
+ u8 proto;
+
+ if (skb_in->csum_bad || iph->frag_off & htons(IP_OFFSET))
+ return;
+
+ if (skb_csum_unnecessary(skb_in)) {
+ icmp_send(skb_in, ICMP_DEST_UNREACH, code, 0);
+ return;
+ }
+
+ if (iph->protocol == IPPROTO_TCP || iph->protocol == IPPROTO_UDP)
+ proto = iph->protocol;
+ else
+ proto = 0;
+
+ if (nf_ip_checksum(skb_in, hook, ip_hdrlen(skb_in), proto) == 0)
+ icmp_send(skb_in, ICMP_DEST_UNREACH, code, 0);
+}
+EXPORT_SYMBOL_GPL(nf_send_unreach);
+
+MODULE_LICENSE("GPL");
diff --git a/net/ipv4/netfilter/nf_tables_arp.c b/net/ipv4/netfilter/nf_tables_arp.c
new file mode 100644
index 000000000..8412268bb
--- /dev/null
+++ b/net/ipv4/netfilter/nf_tables_arp.c
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2008-2010 Patrick McHardy <kaber@trash.net>
+ * Copyright (c) 2013 Pablo Neira Ayuso <pablo@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/netfilter_arp.h>
+#include <net/netfilter/nf_tables.h>
+
+static unsigned int
+nft_do_chain_arp(const struct nf_hook_ops *ops,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ struct nft_pktinfo pkt;
+
+ nft_set_pktinfo(&pkt, ops, skb, state);
+
+ return nft_do_chain(&pkt, ops);
+}
+
+static struct nft_af_info nft_af_arp __read_mostly = {
+ .family = NFPROTO_ARP,
+ .nhooks = NF_ARP_NUMHOOKS,
+ .owner = THIS_MODULE,
+ .nops = 1,
+ .hooks = {
+ [NF_ARP_IN] = nft_do_chain_arp,
+ [NF_ARP_OUT] = nft_do_chain_arp,
+ [NF_ARP_FORWARD] = nft_do_chain_arp,
+ },
+};
+
+static int nf_tables_arp_init_net(struct net *net)
+{
+ net->nft.arp = kmalloc(sizeof(struct nft_af_info), GFP_KERNEL);
+ if (net->nft.arp== NULL)
+ return -ENOMEM;
+
+ memcpy(net->nft.arp, &nft_af_arp, sizeof(nft_af_arp));
+
+ if (nft_register_afinfo(net, net->nft.arp) < 0)
+ goto err;
+
+ return 0;
+err:
+ kfree(net->nft.arp);
+ return -ENOMEM;
+}
+
+static void nf_tables_arp_exit_net(struct net *net)
+{
+ nft_unregister_afinfo(net->nft.arp);
+ kfree(net->nft.arp);
+}
+
+static struct pernet_operations nf_tables_arp_net_ops = {
+ .init = nf_tables_arp_init_net,
+ .exit = nf_tables_arp_exit_net,
+};
+
+static const struct nf_chain_type filter_arp = {
+ .name = "filter",
+ .type = NFT_CHAIN_T_DEFAULT,
+ .family = NFPROTO_ARP,
+ .owner = THIS_MODULE,
+ .hook_mask = (1 << NF_ARP_IN) |
+ (1 << NF_ARP_OUT) |
+ (1 << NF_ARP_FORWARD),
+};
+
+static int __init nf_tables_arp_init(void)
+{
+ int ret;
+
+ nft_register_chain_type(&filter_arp);
+ ret = register_pernet_subsys(&nf_tables_arp_net_ops);
+ if (ret < 0)
+ nft_unregister_chain_type(&filter_arp);
+
+ return ret;
+}
+
+static void __exit nf_tables_arp_exit(void)
+{
+ unregister_pernet_subsys(&nf_tables_arp_net_ops);
+ nft_unregister_chain_type(&filter_arp);
+}
+
+module_init(nf_tables_arp_init);
+module_exit(nf_tables_arp_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_ALIAS_NFT_FAMILY(3); /* NFPROTO_ARP */
diff --git a/net/ipv4/netfilter/nf_tables_ipv4.c b/net/ipv4/netfilter/nf_tables_ipv4.c
new file mode 100644
index 000000000..aa180d3a6
--- /dev/null
+++ b/net/ipv4/netfilter/nf_tables_ipv4.c
@@ -0,0 +1,125 @@
+/*
+ * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
+ * Copyright (c) 2012-2013 Pablo Neira Ayuso <pablo@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/ip.h>
+#include <linux/netfilter_ipv4.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/net_namespace.h>
+#include <net/ip.h>
+#include <net/netfilter/nf_tables_ipv4.h>
+
+static unsigned int nft_do_chain_ipv4(const struct nf_hook_ops *ops,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ struct nft_pktinfo pkt;
+
+ nft_set_pktinfo_ipv4(&pkt, ops, skb, state);
+
+ return nft_do_chain(&pkt, ops);
+}
+
+static unsigned int nft_ipv4_output(const struct nf_hook_ops *ops,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ if (unlikely(skb->len < sizeof(struct iphdr) ||
+ ip_hdr(skb)->ihl < sizeof(struct iphdr) / 4)) {
+ if (net_ratelimit())
+ pr_info("nf_tables_ipv4: ignoring short SOCK_RAW "
+ "packet\n");
+ return NF_ACCEPT;
+ }
+
+ return nft_do_chain_ipv4(ops, skb, state);
+}
+
+struct nft_af_info nft_af_ipv4 __read_mostly = {
+ .family = NFPROTO_IPV4,
+ .nhooks = NF_INET_NUMHOOKS,
+ .owner = THIS_MODULE,
+ .nops = 1,
+ .hooks = {
+ [NF_INET_LOCAL_IN] = nft_do_chain_ipv4,
+ [NF_INET_LOCAL_OUT] = nft_ipv4_output,
+ [NF_INET_FORWARD] = nft_do_chain_ipv4,
+ [NF_INET_PRE_ROUTING] = nft_do_chain_ipv4,
+ [NF_INET_POST_ROUTING] = nft_do_chain_ipv4,
+ },
+};
+EXPORT_SYMBOL_GPL(nft_af_ipv4);
+
+static int nf_tables_ipv4_init_net(struct net *net)
+{
+ net->nft.ipv4 = kmalloc(sizeof(struct nft_af_info), GFP_KERNEL);
+ if (net->nft.ipv4 == NULL)
+ return -ENOMEM;
+
+ memcpy(net->nft.ipv4, &nft_af_ipv4, sizeof(nft_af_ipv4));
+
+ if (nft_register_afinfo(net, net->nft.ipv4) < 0)
+ goto err;
+
+ return 0;
+err:
+ kfree(net->nft.ipv4);
+ return -ENOMEM;
+}
+
+static void nf_tables_ipv4_exit_net(struct net *net)
+{
+ nft_unregister_afinfo(net->nft.ipv4);
+ kfree(net->nft.ipv4);
+}
+
+static struct pernet_operations nf_tables_ipv4_net_ops = {
+ .init = nf_tables_ipv4_init_net,
+ .exit = nf_tables_ipv4_exit_net,
+};
+
+static const struct nf_chain_type filter_ipv4 = {
+ .name = "filter",
+ .type = NFT_CHAIN_T_DEFAULT,
+ .family = NFPROTO_IPV4,
+ .owner = THIS_MODULE,
+ .hook_mask = (1 << NF_INET_LOCAL_IN) |
+ (1 << NF_INET_LOCAL_OUT) |
+ (1 << NF_INET_FORWARD) |
+ (1 << NF_INET_PRE_ROUTING) |
+ (1 << NF_INET_POST_ROUTING),
+};
+
+static int __init nf_tables_ipv4_init(void)
+{
+ int ret;
+
+ nft_register_chain_type(&filter_ipv4);
+ ret = register_pernet_subsys(&nf_tables_ipv4_net_ops);
+ if (ret < 0)
+ nft_unregister_chain_type(&filter_ipv4);
+
+ return ret;
+}
+
+static void __exit nf_tables_ipv4_exit(void)
+{
+ unregister_pernet_subsys(&nf_tables_ipv4_net_ops);
+ nft_unregister_chain_type(&filter_ipv4);
+}
+
+module_init(nf_tables_ipv4_init);
+module_exit(nf_tables_ipv4_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_ALIAS_NFT_FAMILY(AF_INET);
diff --git a/net/ipv4/netfilter/nft_chain_nat_ipv4.c b/net/ipv4/netfilter/nft_chain_nat_ipv4.c
new file mode 100644
index 000000000..bf5c30ae1
--- /dev/null
+++ b/net/ipv4/netfilter/nft_chain_nat_ipv4.c
@@ -0,0 +1,107 @@
+/*
+ * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
+ * Copyright (c) 2012 Pablo Neira Ayuso <pablo@netfilter.org>
+ * Copyright (c) 2012 Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat_core.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_ipv4.h>
+#include <net/netfilter/nf_nat_l3proto.h>
+#include <net/ip.h>
+
+static unsigned int nft_nat_do_chain(const struct nf_hook_ops *ops,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state,
+ struct nf_conn *ct)
+{
+ struct nft_pktinfo pkt;
+
+ nft_set_pktinfo_ipv4(&pkt, ops, skb, state);
+
+ return nft_do_chain(&pkt, ops);
+}
+
+static unsigned int nft_nat_ipv4_fn(const struct nf_hook_ops *ops,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ return nf_nat_ipv4_fn(ops, skb, state, nft_nat_do_chain);
+}
+
+static unsigned int nft_nat_ipv4_in(const struct nf_hook_ops *ops,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ return nf_nat_ipv4_in(ops, skb, state, nft_nat_do_chain);
+}
+
+static unsigned int nft_nat_ipv4_out(const struct nf_hook_ops *ops,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ return nf_nat_ipv4_out(ops, skb, state, nft_nat_do_chain);
+}
+
+static unsigned int nft_nat_ipv4_local_fn(const struct nf_hook_ops *ops,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ return nf_nat_ipv4_local_fn(ops, skb, state, nft_nat_do_chain);
+}
+
+static const struct nf_chain_type nft_chain_nat_ipv4 = {
+ .name = "nat",
+ .type = NFT_CHAIN_T_NAT,
+ .family = NFPROTO_IPV4,
+ .owner = THIS_MODULE,
+ .hook_mask = (1 << NF_INET_PRE_ROUTING) |
+ (1 << NF_INET_POST_ROUTING) |
+ (1 << NF_INET_LOCAL_OUT) |
+ (1 << NF_INET_LOCAL_IN),
+ .hooks = {
+ [NF_INET_PRE_ROUTING] = nft_nat_ipv4_in,
+ [NF_INET_POST_ROUTING] = nft_nat_ipv4_out,
+ [NF_INET_LOCAL_OUT] = nft_nat_ipv4_local_fn,
+ [NF_INET_LOCAL_IN] = nft_nat_ipv4_fn,
+ },
+};
+
+static int __init nft_chain_nat_init(void)
+{
+ int err;
+
+ err = nft_register_chain_type(&nft_chain_nat_ipv4);
+ if (err < 0)
+ return err;
+
+ return 0;
+}
+
+static void __exit nft_chain_nat_exit(void)
+{
+ nft_unregister_chain_type(&nft_chain_nat_ipv4);
+}
+
+module_init(nft_chain_nat_init);
+module_exit(nft_chain_nat_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_ALIAS_NFT_CHAIN(AF_INET, "nat");
diff --git a/net/ipv4/netfilter/nft_chain_route_ipv4.c b/net/ipv4/netfilter/nft_chain_route_ipv4.c
new file mode 100644
index 000000000..e335b0afd
--- /dev/null
+++ b/net/ipv4/netfilter/nft_chain_route_ipv4.c
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
+ * Copyright (c) 2012 Pablo Neira Ayuso <pablo@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/skbuff.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_ipv4.h>
+#include <net/route.h>
+#include <net/ip.h>
+
+static unsigned int nf_route_table_hook(const struct nf_hook_ops *ops,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ unsigned int ret;
+ struct nft_pktinfo pkt;
+ u32 mark;
+ __be32 saddr, daddr;
+ u_int8_t tos;
+ const struct iphdr *iph;
+
+ /* root is playing with raw sockets. */
+ if (skb->len < sizeof(struct iphdr) ||
+ ip_hdrlen(skb) < sizeof(struct iphdr))
+ return NF_ACCEPT;
+
+ nft_set_pktinfo_ipv4(&pkt, ops, skb, state);
+
+ mark = skb->mark;
+ iph = ip_hdr(skb);
+ saddr = iph->saddr;
+ daddr = iph->daddr;
+ tos = iph->tos;
+
+ ret = nft_do_chain(&pkt, ops);
+ if (ret != NF_DROP && ret != NF_QUEUE) {
+ iph = ip_hdr(skb);
+
+ if (iph->saddr != saddr ||
+ iph->daddr != daddr ||
+ skb->mark != mark ||
+ iph->tos != tos)
+ if (ip_route_me_harder(skb, RTN_UNSPEC))
+ ret = NF_DROP;
+ }
+ return ret;
+}
+
+static const struct nf_chain_type nft_chain_route_ipv4 = {
+ .name = "route",
+ .type = NFT_CHAIN_T_ROUTE,
+ .family = NFPROTO_IPV4,
+ .owner = THIS_MODULE,
+ .hook_mask = (1 << NF_INET_LOCAL_OUT),
+ .hooks = {
+ [NF_INET_LOCAL_OUT] = nf_route_table_hook,
+ },
+};
+
+static int __init nft_chain_route_init(void)
+{
+ return nft_register_chain_type(&nft_chain_route_ipv4);
+}
+
+static void __exit nft_chain_route_exit(void)
+{
+ nft_unregister_chain_type(&nft_chain_route_ipv4);
+}
+
+module_init(nft_chain_route_init);
+module_exit(nft_chain_route_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_ALIAS_NFT_CHAIN(AF_INET, "route");
diff --git a/net/ipv4/netfilter/nft_masq_ipv4.c b/net/ipv4/netfilter/nft_masq_ipv4.c
new file mode 100644
index 000000000..40e414c4c
--- /dev/null
+++ b/net/ipv4/netfilter/nft_masq_ipv4.c
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2014 Arturo Borrero Gonzalez <arturo.borrero.glez@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nft_masq.h>
+#include <net/netfilter/ipv4/nf_nat_masquerade.h>
+
+static void nft_masq_ipv4_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ struct nft_masq *priv = nft_expr_priv(expr);
+ struct nf_nat_range range;
+
+ memset(&range, 0, sizeof(range));
+ range.flags = priv->flags;
+
+ regs->verdict.code = nf_nat_masquerade_ipv4(pkt->skb, pkt->ops->hooknum,
+ &range, pkt->out);
+}
+
+static struct nft_expr_type nft_masq_ipv4_type;
+static const struct nft_expr_ops nft_masq_ipv4_ops = {
+ .type = &nft_masq_ipv4_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_masq)),
+ .eval = nft_masq_ipv4_eval,
+ .init = nft_masq_init,
+ .dump = nft_masq_dump,
+ .validate = nft_masq_validate,
+};
+
+static struct nft_expr_type nft_masq_ipv4_type __read_mostly = {
+ .family = NFPROTO_IPV4,
+ .name = "masq",
+ .ops = &nft_masq_ipv4_ops,
+ .policy = nft_masq_policy,
+ .maxattr = NFTA_MASQ_MAX,
+ .owner = THIS_MODULE,
+};
+
+static int __init nft_masq_ipv4_module_init(void)
+{
+ int ret;
+
+ ret = nft_register_expr(&nft_masq_ipv4_type);
+ if (ret < 0)
+ return ret;
+
+ nf_nat_masquerade_ipv4_register_notifier();
+
+ return ret;
+}
+
+static void __exit nft_masq_ipv4_module_exit(void)
+{
+ nft_unregister_expr(&nft_masq_ipv4_type);
+ nf_nat_masquerade_ipv4_unregister_notifier();
+}
+
+module_init(nft_masq_ipv4_module_init);
+module_exit(nft_masq_ipv4_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Arturo Borrero Gonzalez <arturo.borrero.glez@gmail.com>");
+MODULE_ALIAS_NFT_AF_EXPR(AF_INET, "masq");
diff --git a/net/ipv4/netfilter/nft_redir_ipv4.c b/net/ipv4/netfilter/nft_redir_ipv4.c
new file mode 100644
index 000000000..d8d795df9
--- /dev/null
+++ b/net/ipv4/netfilter/nft_redir_ipv4.c
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2014 Arturo Borrero Gonzalez <arturo.borrero.glez@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat_redirect.h>
+#include <net/netfilter/nft_redir.h>
+
+static void nft_redir_ipv4_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ struct nft_redir *priv = nft_expr_priv(expr);
+ struct nf_nat_ipv4_multi_range_compat mr;
+
+ memset(&mr, 0, sizeof(mr));
+ if (priv->sreg_proto_min) {
+ mr.range[0].min.all =
+ *(__be16 *)&regs->data[priv->sreg_proto_min];
+ mr.range[0].max.all =
+ *(__be16 *)&regs->data[priv->sreg_proto_max];
+ mr.range[0].flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
+ }
+
+ mr.range[0].flags |= priv->flags;
+
+ regs->verdict.code = nf_nat_redirect_ipv4(pkt->skb, &mr,
+ pkt->ops->hooknum);
+}
+
+static struct nft_expr_type nft_redir_ipv4_type;
+static const struct nft_expr_ops nft_redir_ipv4_ops = {
+ .type = &nft_redir_ipv4_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_redir)),
+ .eval = nft_redir_ipv4_eval,
+ .init = nft_redir_init,
+ .dump = nft_redir_dump,
+ .validate = nft_redir_validate,
+};
+
+static struct nft_expr_type nft_redir_ipv4_type __read_mostly = {
+ .family = NFPROTO_IPV4,
+ .name = "redir",
+ .ops = &nft_redir_ipv4_ops,
+ .policy = nft_redir_policy,
+ .maxattr = NFTA_REDIR_MAX,
+ .owner = THIS_MODULE,
+};
+
+static int __init nft_redir_ipv4_module_init(void)
+{
+ return nft_register_expr(&nft_redir_ipv4_type);
+}
+
+static void __exit nft_redir_ipv4_module_exit(void)
+{
+ nft_unregister_expr(&nft_redir_ipv4_type);
+}
+
+module_init(nft_redir_ipv4_module_init);
+module_exit(nft_redir_ipv4_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Arturo Borrero Gonzalez <arturo.borrero.glez@gmail.com>");
+MODULE_ALIAS_NFT_AF_EXPR(AF_INET, "redir");
diff --git a/net/ipv4/netfilter/nft_reject_ipv4.c b/net/ipv4/netfilter/nft_reject_ipv4.c
new file mode 100644
index 000000000..b07e58b51
--- /dev/null
+++ b/net/ipv4/netfilter/nft_reject_ipv4.c
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
+ * Copyright (c) 2013 Eric Leblond <eric@regit.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/ipv4/nf_reject.h>
+#include <net/netfilter/nft_reject.h>
+
+static void nft_reject_ipv4_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ struct nft_reject *priv = nft_expr_priv(expr);
+
+ switch (priv->type) {
+ case NFT_REJECT_ICMP_UNREACH:
+ nf_send_unreach(pkt->skb, priv->icmp_code,
+ pkt->ops->hooknum);
+ break;
+ case NFT_REJECT_TCP_RST:
+ nf_send_reset(pkt->skb, pkt->ops->hooknum);
+ break;
+ default:
+ break;
+ }
+
+ regs->verdict.code = NF_DROP;
+}
+
+static struct nft_expr_type nft_reject_ipv4_type;
+static const struct nft_expr_ops nft_reject_ipv4_ops = {
+ .type = &nft_reject_ipv4_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_reject)),
+ .eval = nft_reject_ipv4_eval,
+ .init = nft_reject_init,
+ .dump = nft_reject_dump,
+};
+
+static struct nft_expr_type nft_reject_ipv4_type __read_mostly = {
+ .family = NFPROTO_IPV4,
+ .name = "reject",
+ .ops = &nft_reject_ipv4_ops,
+ .policy = nft_reject_policy,
+ .maxattr = NFTA_REJECT_MAX,
+ .owner = THIS_MODULE,
+};
+
+static int __init nft_reject_ipv4_module_init(void)
+{
+ return nft_register_expr(&nft_reject_ipv4_type);
+}
+
+static void __exit nft_reject_ipv4_module_exit(void)
+{
+ nft_unregister_expr(&nft_reject_ipv4_type);
+}
+
+module_init(nft_reject_ipv4_module_init);
+module_exit(nft_reject_ipv4_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_ALIAS_NFT_AF_EXPR(AF_INET, "reject");
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
new file mode 100644
index 000000000..05ff44b75
--- /dev/null
+++ b/net/ipv4/ping.c
@@ -0,0 +1,1223 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * "Ping" sockets
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Based on ipv4/udp.c code.
+ *
+ * Authors: Vasiliy Kulikov / Openwall (for Linux 2.6),
+ * Pavel Kankovsky (for Linux 2.4.32)
+ *
+ * Pavel gave all rights to bugs to Vasiliy,
+ * none of the bugs are Pavel's now.
+ *
+ */
+
+#include <linux/uaccess.h>
+#include <linux/types.h>
+#include <linux/fcntl.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/in.h>
+#include <linux/errno.h>
+#include <linux/timer.h>
+#include <linux/mm.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <net/snmp.h>
+#include <net/ip.h>
+#include <net/icmp.h>
+#include <net/protocol.h>
+#include <linux/skbuff.h>
+#include <linux/proc_fs.h>
+#include <linux/export.h>
+#include <net/sock.h>
+#include <net/ping.h>
+#include <net/udp.h>
+#include <net/route.h>
+#include <net/inet_common.h>
+#include <net/checksum.h>
+
+#if IS_ENABLED(CONFIG_IPV6)
+#include <linux/in6.h>
+#include <linux/icmpv6.h>
+#include <net/addrconf.h>
+#include <net/ipv6.h>
+#include <net/transp_v6.h>
+#endif
+
+struct ping_table {
+ struct hlist_nulls_head hash[PING_HTABLE_SIZE];
+ rwlock_t lock;
+};
+
+static struct ping_table ping_table;
+struct pingv6_ops pingv6_ops;
+EXPORT_SYMBOL_GPL(pingv6_ops);
+
+static u16 ping_port_rover;
+
+static inline u32 ping_hashfn(const struct net *net, u32 num, u32 mask)
+{
+ u32 res = (num + net_hash_mix(net)) & mask;
+
+ pr_debug("hash(%u) = %u\n", num, res);
+ return res;
+}
+EXPORT_SYMBOL_GPL(ping_hash);
+
+static inline struct hlist_nulls_head *ping_hashslot(struct ping_table *table,
+ struct net *net, unsigned int num)
+{
+ return &table->hash[ping_hashfn(net, num, PING_HTABLE_MASK)];
+}
+
+int ping_get_port(struct sock *sk, unsigned short ident)
+{
+ struct hlist_nulls_node *node;
+ struct hlist_nulls_head *hlist;
+ struct inet_sock *isk, *isk2;
+ struct sock *sk2 = NULL;
+
+ isk = inet_sk(sk);
+ write_lock_bh(&ping_table.lock);
+ if (ident == 0) {
+ u32 i;
+ u16 result = ping_port_rover + 1;
+
+ for (i = 0; i < (1L << 16); i++, result++) {
+ if (!result)
+ result++; /* avoid zero */
+ hlist = ping_hashslot(&ping_table, sock_net(sk),
+ result);
+ ping_portaddr_for_each_entry(sk2, node, hlist) {
+ isk2 = inet_sk(sk2);
+
+ if (isk2->inet_num == result)
+ goto next_port;
+ }
+
+ /* found */
+ ping_port_rover = ident = result;
+ break;
+next_port:
+ ;
+ }
+ if (i >= (1L << 16))
+ goto fail;
+ } else {
+ hlist = ping_hashslot(&ping_table, sock_net(sk), ident);
+ ping_portaddr_for_each_entry(sk2, node, hlist) {
+ isk2 = inet_sk(sk2);
+
+ /* BUG? Why is this reuse and not reuseaddr? ping.c
+ * doesn't turn off SO_REUSEADDR, and it doesn't expect
+ * that other ping processes can steal its packets.
+ */
+ if ((isk2->inet_num == ident) &&
+ (sk2 != sk) &&
+ (!sk2->sk_reuse || !sk->sk_reuse))
+ goto fail;
+ }
+ }
+
+ pr_debug("found port/ident = %d\n", ident);
+ isk->inet_num = ident;
+ if (sk_unhashed(sk)) {
+ pr_debug("was not hashed\n");
+ sock_hold(sk);
+ hlist_nulls_add_head(&sk->sk_nulls_node, hlist);
+ sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
+ }
+ write_unlock_bh(&ping_table.lock);
+ return 0;
+
+fail:
+ write_unlock_bh(&ping_table.lock);
+ return 1;
+}
+EXPORT_SYMBOL_GPL(ping_get_port);
+
+void ping_hash(struct sock *sk)
+{
+ pr_debug("ping_hash(sk->port=%u)\n", inet_sk(sk)->inet_num);
+ BUG(); /* "Please do not press this button again." */
+}
+
+void ping_unhash(struct sock *sk)
+{
+ struct inet_sock *isk = inet_sk(sk);
+ pr_debug("ping_unhash(isk=%p,isk->num=%u)\n", isk, isk->inet_num);
+ if (sk_hashed(sk)) {
+ write_lock_bh(&ping_table.lock);
+ hlist_nulls_del(&sk->sk_nulls_node);
+ sk_nulls_node_init(&sk->sk_nulls_node);
+ sock_put(sk);
+ isk->inet_num = 0;
+ isk->inet_sport = 0;
+ sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
+ write_unlock_bh(&ping_table.lock);
+ }
+}
+EXPORT_SYMBOL_GPL(ping_unhash);
+
+static struct sock *ping_lookup(struct net *net, struct sk_buff *skb, u16 ident)
+{
+ struct hlist_nulls_head *hslot = ping_hashslot(&ping_table, net, ident);
+ struct sock *sk = NULL;
+ struct inet_sock *isk;
+ struct hlist_nulls_node *hnode;
+ int dif = skb->dev->ifindex;
+
+ if (skb->protocol == htons(ETH_P_IP)) {
+ pr_debug("try to find: num = %d, daddr = %pI4, dif = %d\n",
+ (int)ident, &ip_hdr(skb)->daddr, dif);
+#if IS_ENABLED(CONFIG_IPV6)
+ } else if (skb->protocol == htons(ETH_P_IPV6)) {
+ pr_debug("try to find: num = %d, daddr = %pI6c, dif = %d\n",
+ (int)ident, &ipv6_hdr(skb)->daddr, dif);
+#endif
+ }
+
+ read_lock_bh(&ping_table.lock);
+
+ ping_portaddr_for_each_entry(sk, hnode, hslot) {
+ isk = inet_sk(sk);
+
+ pr_debug("iterate\n");
+ if (isk->inet_num != ident)
+ continue;
+
+ if (skb->protocol == htons(ETH_P_IP) &&
+ sk->sk_family == AF_INET) {
+ pr_debug("found: %p: num=%d, daddr=%pI4, dif=%d\n", sk,
+ (int) isk->inet_num, &isk->inet_rcv_saddr,
+ sk->sk_bound_dev_if);
+
+ if (isk->inet_rcv_saddr &&
+ isk->inet_rcv_saddr != ip_hdr(skb)->daddr)
+ continue;
+#if IS_ENABLED(CONFIG_IPV6)
+ } else if (skb->protocol == htons(ETH_P_IPV6) &&
+ sk->sk_family == AF_INET6) {
+
+ pr_debug("found: %p: num=%d, daddr=%pI6c, dif=%d\n", sk,
+ (int) isk->inet_num,
+ &sk->sk_v6_rcv_saddr,
+ sk->sk_bound_dev_if);
+
+ if (!ipv6_addr_any(&sk->sk_v6_rcv_saddr) &&
+ !ipv6_addr_equal(&sk->sk_v6_rcv_saddr,
+ &ipv6_hdr(skb)->daddr))
+ continue;
+#endif
+ } else {
+ continue;
+ }
+
+ if (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif)
+ continue;
+
+ sock_hold(sk);
+ goto exit;
+ }
+
+ sk = NULL;
+exit:
+ read_unlock_bh(&ping_table.lock);
+
+ return sk;
+}
+
+static void inet_get_ping_group_range_net(struct net *net, kgid_t *low,
+ kgid_t *high)
+{
+ kgid_t *data = net->ipv4.ping_group_range.range;
+ unsigned int seq;
+
+ do {
+ seq = read_seqbegin(&net->ipv4.ping_group_range.lock);
+
+ *low = data[0];
+ *high = data[1];
+ } while (read_seqretry(&net->ipv4.ping_group_range.lock, seq));
+}
+
+
+int ping_init_sock(struct sock *sk)
+{
+ struct net *net = sock_net(sk);
+ kgid_t group = current_egid();
+ struct group_info *group_info;
+ int i, j, count;
+ kgid_t low, high;
+ int ret = 0;
+
+ if (sk->sk_family == AF_INET6)
+ sk->sk_ipv6only = 1;
+
+ inet_get_ping_group_range_net(net, &low, &high);
+ if (gid_lte(low, group) && gid_lte(group, high))
+ return 0;
+
+ group_info = get_current_groups();
+ count = group_info->ngroups;
+ for (i = 0; i < group_info->nblocks; i++) {
+ int cp_count = min_t(int, NGROUPS_PER_BLOCK, count);
+ for (j = 0; j < cp_count; j++) {
+ kgid_t gid = group_info->blocks[i][j];
+ if (gid_lte(low, gid) && gid_lte(gid, high))
+ goto out_release_group;
+ }
+
+ count -= cp_count;
+ }
+
+ ret = -EACCES;
+
+out_release_group:
+ put_group_info(group_info);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(ping_init_sock);
+
+void ping_close(struct sock *sk, long timeout)
+{
+ pr_debug("ping_close(sk=%p,sk->num=%u)\n",
+ inet_sk(sk), inet_sk(sk)->inet_num);
+ pr_debug("isk->refcnt = %d\n", sk->sk_refcnt.counter);
+
+ sk_common_release(sk);
+}
+EXPORT_SYMBOL_GPL(ping_close);
+
+/* Checks the bind address and possibly modifies sk->sk_bound_dev_if. */
+static int ping_check_bind_addr(struct sock *sk, struct inet_sock *isk,
+ struct sockaddr *uaddr, int addr_len) {
+ struct net *net = sock_net(sk);
+ if (sk->sk_family == AF_INET) {
+ struct sockaddr_in *addr = (struct sockaddr_in *) uaddr;
+ int chk_addr_ret;
+
+ if (addr_len < sizeof(*addr))
+ return -EINVAL;
+
+ if (addr->sin_family != AF_INET &&
+ !(addr->sin_family == AF_UNSPEC &&
+ addr->sin_addr.s_addr == htonl(INADDR_ANY)))
+ return -EAFNOSUPPORT;
+
+ pr_debug("ping_check_bind_addr(sk=%p,addr=%pI4,port=%d)\n",
+ sk, &addr->sin_addr.s_addr, ntohs(addr->sin_port));
+
+ chk_addr_ret = inet_addr_type(net, addr->sin_addr.s_addr);
+
+ if (addr->sin_addr.s_addr == htonl(INADDR_ANY))
+ chk_addr_ret = RTN_LOCAL;
+
+ if ((net->ipv4.sysctl_ip_nonlocal_bind == 0 &&
+ isk->freebind == 0 && isk->transparent == 0 &&
+ chk_addr_ret != RTN_LOCAL) ||
+ chk_addr_ret == RTN_MULTICAST ||
+ chk_addr_ret == RTN_BROADCAST)
+ return -EADDRNOTAVAIL;
+
+#if IS_ENABLED(CONFIG_IPV6)
+ } else if (sk->sk_family == AF_INET6) {
+ struct sockaddr_in6 *addr = (struct sockaddr_in6 *) uaddr;
+ int addr_type, scoped, has_addr;
+ struct net_device *dev = NULL;
+
+ if (addr_len < sizeof(*addr))
+ return -EINVAL;
+
+ if (addr->sin6_family != AF_INET6)
+ return -EAFNOSUPPORT;
+
+ pr_debug("ping_check_bind_addr(sk=%p,addr=%pI6c,port=%d)\n",
+ sk, addr->sin6_addr.s6_addr, ntohs(addr->sin6_port));
+
+ addr_type = ipv6_addr_type(&addr->sin6_addr);
+ scoped = __ipv6_addr_needs_scope_id(addr_type);
+ if ((addr_type != IPV6_ADDR_ANY &&
+ !(addr_type & IPV6_ADDR_UNICAST)) ||
+ (scoped && !addr->sin6_scope_id))
+ return -EINVAL;
+
+ rcu_read_lock();
+ if (addr->sin6_scope_id) {
+ dev = dev_get_by_index_rcu(net, addr->sin6_scope_id);
+ if (!dev) {
+ rcu_read_unlock();
+ return -ENODEV;
+ }
+ }
+ has_addr = pingv6_ops.ipv6_chk_addr(net, &addr->sin6_addr, dev,
+ scoped);
+ rcu_read_unlock();
+
+ if (!(isk->freebind || isk->transparent || has_addr ||
+ addr_type == IPV6_ADDR_ANY))
+ return -EADDRNOTAVAIL;
+
+ if (scoped)
+ sk->sk_bound_dev_if = addr->sin6_scope_id;
+#endif
+ } else {
+ return -EAFNOSUPPORT;
+ }
+ return 0;
+}
+
+static void ping_set_saddr(struct sock *sk, struct sockaddr *saddr)
+{
+ if (saddr->sa_family == AF_INET) {
+ struct inet_sock *isk = inet_sk(sk);
+ struct sockaddr_in *addr = (struct sockaddr_in *) saddr;
+ isk->inet_rcv_saddr = isk->inet_saddr = addr->sin_addr.s_addr;
+#if IS_ENABLED(CONFIG_IPV6)
+ } else if (saddr->sa_family == AF_INET6) {
+ struct sockaddr_in6 *addr = (struct sockaddr_in6 *) saddr;
+ struct ipv6_pinfo *np = inet6_sk(sk);
+ sk->sk_v6_rcv_saddr = np->saddr = addr->sin6_addr;
+#endif
+ }
+}
+
+static void ping_clear_saddr(struct sock *sk, int dif)
+{
+ sk->sk_bound_dev_if = dif;
+ if (sk->sk_family == AF_INET) {
+ struct inet_sock *isk = inet_sk(sk);
+ isk->inet_rcv_saddr = isk->inet_saddr = 0;
+#if IS_ENABLED(CONFIG_IPV6)
+ } else if (sk->sk_family == AF_INET6) {
+ struct ipv6_pinfo *np = inet6_sk(sk);
+ memset(&sk->sk_v6_rcv_saddr, 0, sizeof(sk->sk_v6_rcv_saddr));
+ memset(&np->saddr, 0, sizeof(np->saddr));
+#endif
+ }
+}
+/*
+ * We need our own bind because there are no privileged id's == local ports.
+ * Moreover, we don't allow binding to multi- and broadcast addresses.
+ */
+
+int ping_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+{
+ struct inet_sock *isk = inet_sk(sk);
+ unsigned short snum;
+ int err;
+ int dif = sk->sk_bound_dev_if;
+
+ err = ping_check_bind_addr(sk, isk, uaddr, addr_len);
+ if (err)
+ return err;
+
+ lock_sock(sk);
+
+ err = -EINVAL;
+ if (isk->inet_num != 0)
+ goto out;
+
+ err = -EADDRINUSE;
+ ping_set_saddr(sk, uaddr);
+ snum = ntohs(((struct sockaddr_in *)uaddr)->sin_port);
+ if (ping_get_port(sk, snum) != 0) {
+ ping_clear_saddr(sk, dif);
+ goto out;
+ }
+
+ pr_debug("after bind(): num = %d, dif = %d\n",
+ (int)isk->inet_num,
+ (int)sk->sk_bound_dev_if);
+
+ err = 0;
+ if (sk->sk_family == AF_INET && isk->inet_rcv_saddr)
+ sk->sk_userlocks |= SOCK_BINDADDR_LOCK;
+#if IS_ENABLED(CONFIG_IPV6)
+ if (sk->sk_family == AF_INET6 && !ipv6_addr_any(&sk->sk_v6_rcv_saddr))
+ sk->sk_userlocks |= SOCK_BINDADDR_LOCK;
+#endif
+
+ if (snum)
+ sk->sk_userlocks |= SOCK_BINDPORT_LOCK;
+ isk->inet_sport = htons(isk->inet_num);
+ isk->inet_daddr = 0;
+ isk->inet_dport = 0;
+
+#if IS_ENABLED(CONFIG_IPV6)
+ if (sk->sk_family == AF_INET6)
+ memset(&sk->sk_v6_daddr, 0, sizeof(sk->sk_v6_daddr));
+#endif
+
+ sk_dst_reset(sk);
+out:
+ release_sock(sk);
+ pr_debug("ping_v4_bind -> %d\n", err);
+ return err;
+}
+EXPORT_SYMBOL_GPL(ping_bind);
+
+/*
+ * Is this a supported type of ICMP message?
+ */
+
+static inline int ping_supported(int family, int type, int code)
+{
+ return (family == AF_INET && type == ICMP_ECHO && code == 0) ||
+ (family == AF_INET6 && type == ICMPV6_ECHO_REQUEST && code == 0);
+}
+
+/*
+ * This routine is called by the ICMP module when it gets some
+ * sort of error condition.
+ */
+
+void ping_err(struct sk_buff *skb, int offset, u32 info)
+{
+ int family;
+ struct icmphdr *icmph;
+ struct inet_sock *inet_sock;
+ int type;
+ int code;
+ struct net *net = dev_net(skb->dev);
+ struct sock *sk;
+ int harderr;
+ int err;
+
+ if (skb->protocol == htons(ETH_P_IP)) {
+ family = AF_INET;
+ type = icmp_hdr(skb)->type;
+ code = icmp_hdr(skb)->code;
+ icmph = (struct icmphdr *)(skb->data + offset);
+ } else if (skb->protocol == htons(ETH_P_IPV6)) {
+ family = AF_INET6;
+ type = icmp6_hdr(skb)->icmp6_type;
+ code = icmp6_hdr(skb)->icmp6_code;
+ icmph = (struct icmphdr *) (skb->data + offset);
+ } else {
+ BUG();
+ }
+
+ /* We assume the packet has already been checked by icmp_unreach */
+
+ if (!ping_supported(family, icmph->type, icmph->code))
+ return;
+
+ pr_debug("ping_err(proto=0x%x,type=%d,code=%d,id=%04x,seq=%04x)\n",
+ skb->protocol, type, code, ntohs(icmph->un.echo.id),
+ ntohs(icmph->un.echo.sequence));
+
+ sk = ping_lookup(net, skb, ntohs(icmph->un.echo.id));
+ if (!sk) {
+ pr_debug("no socket, dropping\n");
+ return; /* No socket for error */
+ }
+ pr_debug("err on socket %p\n", sk);
+
+ err = 0;
+ harderr = 0;
+ inet_sock = inet_sk(sk);
+
+ if (skb->protocol == htons(ETH_P_IP)) {
+ switch (type) {
+ default:
+ case ICMP_TIME_EXCEEDED:
+ err = EHOSTUNREACH;
+ break;
+ case ICMP_SOURCE_QUENCH:
+ /* This is not a real error but ping wants to see it.
+ * Report it with some fake errno.
+ */
+ err = EREMOTEIO;
+ break;
+ case ICMP_PARAMETERPROB:
+ err = EPROTO;
+ harderr = 1;
+ break;
+ case ICMP_DEST_UNREACH:
+ if (code == ICMP_FRAG_NEEDED) { /* Path MTU discovery */
+ ipv4_sk_update_pmtu(skb, sk, info);
+ if (inet_sock->pmtudisc != IP_PMTUDISC_DONT) {
+ err = EMSGSIZE;
+ harderr = 1;
+ break;
+ }
+ goto out;
+ }
+ err = EHOSTUNREACH;
+ if (code <= NR_ICMP_UNREACH) {
+ harderr = icmp_err_convert[code].fatal;
+ err = icmp_err_convert[code].errno;
+ }
+ break;
+ case ICMP_REDIRECT:
+ /* See ICMP_SOURCE_QUENCH */
+ ipv4_sk_redirect(skb, sk);
+ err = EREMOTEIO;
+ break;
+ }
+#if IS_ENABLED(CONFIG_IPV6)
+ } else if (skb->protocol == htons(ETH_P_IPV6)) {
+ harderr = pingv6_ops.icmpv6_err_convert(type, code, &err);
+#endif
+ }
+
+ /*
+ * RFC1122: OK. Passes ICMP errors back to application, as per
+ * 4.1.3.3.
+ */
+ if ((family == AF_INET && !inet_sock->recverr) ||
+ (family == AF_INET6 && !inet6_sk(sk)->recverr)) {
+ if (!harderr || sk->sk_state != TCP_ESTABLISHED)
+ goto out;
+ } else {
+ if (family == AF_INET) {
+ ip_icmp_error(sk, skb, err, 0 /* no remote port */,
+ info, (u8 *)icmph);
+#if IS_ENABLED(CONFIG_IPV6)
+ } else if (family == AF_INET6) {
+ pingv6_ops.ipv6_icmp_error(sk, skb, err, 0,
+ info, (u8 *)icmph);
+#endif
+ }
+ }
+ sk->sk_err = err;
+ sk->sk_error_report(sk);
+out:
+ sock_put(sk);
+}
+EXPORT_SYMBOL_GPL(ping_err);
+
+/*
+ * Copy and checksum an ICMP Echo packet from user space into a buffer
+ * starting from the payload.
+ */
+
+int ping_getfrag(void *from, char *to,
+ int offset, int fraglen, int odd, struct sk_buff *skb)
+{
+ struct pingfakehdr *pfh = (struct pingfakehdr *)from;
+
+ if (offset == 0) {
+ fraglen -= sizeof(struct icmphdr);
+ if (fraglen < 0)
+ BUG();
+ if (csum_and_copy_from_iter(to + sizeof(struct icmphdr),
+ fraglen, &pfh->wcheck,
+ &pfh->msg->msg_iter) != fraglen)
+ return -EFAULT;
+ } else if (offset < sizeof(struct icmphdr)) {
+ BUG();
+ } else {
+ if (csum_and_copy_from_iter(to, fraglen, &pfh->wcheck,
+ &pfh->msg->msg_iter) != fraglen)
+ return -EFAULT;
+ }
+
+#if IS_ENABLED(CONFIG_IPV6)
+ /* For IPv6, checksum each skb as we go along, as expected by
+ * icmpv6_push_pending_frames. For IPv4, accumulate the checksum in
+ * wcheck, it will be finalized in ping_v4_push_pending_frames.
+ */
+ if (pfh->family == AF_INET6) {
+ skb->csum = pfh->wcheck;
+ skb->ip_summed = CHECKSUM_NONE;
+ pfh->wcheck = 0;
+ }
+#endif
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(ping_getfrag);
+
+static int ping_v4_push_pending_frames(struct sock *sk, struct pingfakehdr *pfh,
+ struct flowi4 *fl4)
+{
+ struct sk_buff *skb = skb_peek(&sk->sk_write_queue);
+
+ pfh->wcheck = csum_partial((char *)&pfh->icmph,
+ sizeof(struct icmphdr), pfh->wcheck);
+ pfh->icmph.checksum = csum_fold(pfh->wcheck);
+ memcpy(icmp_hdr(skb), &pfh->icmph, sizeof(struct icmphdr));
+ skb->ip_summed = CHECKSUM_NONE;
+ return ip_push_pending_frames(sk, fl4);
+}
+
+int ping_common_sendmsg(int family, struct msghdr *msg, size_t len,
+ void *user_icmph, size_t icmph_len) {
+ u8 type, code;
+
+ if (len > 0xFFFF)
+ return -EMSGSIZE;
+
+ /*
+ * Check the flags.
+ */
+
+ /* Mirror BSD error message compatibility */
+ if (msg->msg_flags & MSG_OOB)
+ return -EOPNOTSUPP;
+
+ /*
+ * Fetch the ICMP header provided by the userland.
+ * iovec is modified! The ICMP header is consumed.
+ */
+ if (memcpy_from_msg(user_icmph, msg, icmph_len))
+ return -EFAULT;
+
+ if (family == AF_INET) {
+ type = ((struct icmphdr *) user_icmph)->type;
+ code = ((struct icmphdr *) user_icmph)->code;
+#if IS_ENABLED(CONFIG_IPV6)
+ } else if (family == AF_INET6) {
+ type = ((struct icmp6hdr *) user_icmph)->icmp6_type;
+ code = ((struct icmp6hdr *) user_icmph)->icmp6_code;
+#endif
+ } else {
+ BUG();
+ }
+
+ if (!ping_supported(family, type, code))
+ return -EINVAL;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(ping_common_sendmsg);
+
+static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
+{
+ struct net *net = sock_net(sk);
+ struct flowi4 fl4;
+ struct inet_sock *inet = inet_sk(sk);
+ struct ipcm_cookie ipc;
+ struct icmphdr user_icmph;
+ struct pingfakehdr pfh;
+ struct rtable *rt = NULL;
+ struct ip_options_data opt_copy;
+ int free = 0;
+ __be32 saddr, daddr, faddr;
+ u8 tos;
+ int err;
+
+ pr_debug("ping_v4_sendmsg(sk=%p,sk->num=%u)\n", inet, inet->inet_num);
+
+ err = ping_common_sendmsg(AF_INET, msg, len, &user_icmph,
+ sizeof(user_icmph));
+ if (err)
+ return err;
+
+ /*
+ * Get and verify the address.
+ */
+
+ if (msg->msg_name) {
+ DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name);
+ if (msg->msg_namelen < sizeof(*usin))
+ return -EINVAL;
+ if (usin->sin_family != AF_INET)
+ return -EAFNOSUPPORT;
+ daddr = usin->sin_addr.s_addr;
+ /* no remote port */
+ } else {
+ if (sk->sk_state != TCP_ESTABLISHED)
+ return -EDESTADDRREQ;
+ daddr = inet->inet_daddr;
+ /* no remote port */
+ }
+
+ ipc.addr = inet->inet_saddr;
+ ipc.opt = NULL;
+ ipc.oif = sk->sk_bound_dev_if;
+ ipc.tx_flags = 0;
+ ipc.ttl = 0;
+ ipc.tos = -1;
+
+ sock_tx_timestamp(sk, &ipc.tx_flags);
+
+ if (msg->msg_controllen) {
+ err = ip_cmsg_send(sock_net(sk), msg, &ipc, false);
+ if (err)
+ return err;
+ if (ipc.opt)
+ free = 1;
+ }
+ if (!ipc.opt) {
+ struct ip_options_rcu *inet_opt;
+
+ rcu_read_lock();
+ inet_opt = rcu_dereference(inet->inet_opt);
+ if (inet_opt) {
+ memcpy(&opt_copy, inet_opt,
+ sizeof(*inet_opt) + inet_opt->opt.optlen);
+ ipc.opt = &opt_copy.opt;
+ }
+ rcu_read_unlock();
+ }
+
+ saddr = ipc.addr;
+ ipc.addr = faddr = daddr;
+
+ if (ipc.opt && ipc.opt->opt.srr) {
+ if (!daddr)
+ return -EINVAL;
+ faddr = ipc.opt->opt.faddr;
+ }
+ tos = get_rttos(&ipc, inet);
+ if (sock_flag(sk, SOCK_LOCALROUTE) ||
+ (msg->msg_flags & MSG_DONTROUTE) ||
+ (ipc.opt && ipc.opt->opt.is_strictroute)) {
+ tos |= RTO_ONLINK;
+ }
+
+ if (ipv4_is_multicast(daddr)) {
+ if (!ipc.oif)
+ ipc.oif = inet->mc_index;
+ if (!saddr)
+ saddr = inet->mc_addr;
+ } else if (!ipc.oif)
+ ipc.oif = inet->uc_index;
+
+ flowi4_init_output(&fl4, ipc.oif, sk->sk_mark, tos,
+ RT_SCOPE_UNIVERSE, sk->sk_protocol,
+ inet_sk_flowi_flags(sk), faddr, saddr, 0, 0);
+
+ security_sk_classify_flow(sk, flowi4_to_flowi(&fl4));
+ rt = ip_route_output_flow(net, &fl4, sk);
+ if (IS_ERR(rt)) {
+ err = PTR_ERR(rt);
+ rt = NULL;
+ if (err == -ENETUNREACH)
+ IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
+ goto out;
+ }
+
+ err = -EACCES;
+ if ((rt->rt_flags & RTCF_BROADCAST) &&
+ !sock_flag(sk, SOCK_BROADCAST))
+ goto out;
+
+ if (msg->msg_flags & MSG_CONFIRM)
+ goto do_confirm;
+back_from_confirm:
+
+ if (!ipc.addr)
+ ipc.addr = fl4.daddr;
+
+ lock_sock(sk);
+
+ pfh.icmph.type = user_icmph.type; /* already checked */
+ pfh.icmph.code = user_icmph.code; /* ditto */
+ pfh.icmph.checksum = 0;
+ pfh.icmph.un.echo.id = inet->inet_sport;
+ pfh.icmph.un.echo.sequence = user_icmph.un.echo.sequence;
+ pfh.msg = msg;
+ pfh.wcheck = 0;
+ pfh.family = AF_INET;
+
+ err = ip_append_data(sk, &fl4, ping_getfrag, &pfh, len,
+ 0, &ipc, &rt, msg->msg_flags);
+ if (err)
+ ip_flush_pending_frames(sk);
+ else
+ err = ping_v4_push_pending_frames(sk, &pfh, &fl4);
+ release_sock(sk);
+
+out:
+ ip_rt_put(rt);
+ if (free)
+ kfree(ipc.opt);
+ if (!err) {
+ icmp_out_count(sock_net(sk), user_icmph.type);
+ return len;
+ }
+ return err;
+
+do_confirm:
+ dst_confirm(&rt->dst);
+ if (!(msg->msg_flags & MSG_PROBE) || len)
+ goto back_from_confirm;
+ err = 0;
+ goto out;
+}
+
+int ping_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock,
+ int flags, int *addr_len)
+{
+ struct inet_sock *isk = inet_sk(sk);
+ int family = sk->sk_family;
+ struct sk_buff *skb;
+ int copied, err;
+
+ pr_debug("ping_recvmsg(sk=%p,sk->num=%u)\n", isk, isk->inet_num);
+
+ err = -EOPNOTSUPP;
+ if (flags & MSG_OOB)
+ goto out;
+
+ if (flags & MSG_ERRQUEUE)
+ return inet_recv_error(sk, msg, len, addr_len);
+
+ skb = skb_recv_datagram(sk, flags, noblock, &err);
+ if (!skb)
+ goto out;
+
+ copied = skb->len;
+ if (copied > len) {
+ msg->msg_flags |= MSG_TRUNC;
+ copied = len;
+ }
+
+ /* Don't bother checking the checksum */
+ err = skb_copy_datagram_msg(skb, 0, msg, copied);
+ if (err)
+ goto done;
+
+ sock_recv_timestamp(msg, sk, skb);
+
+ /* Copy the address and add cmsg data. */
+ if (family == AF_INET) {
+ DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name);
+
+ if (sin) {
+ sin->sin_family = AF_INET;
+ sin->sin_port = 0 /* skb->h.uh->source */;
+ sin->sin_addr.s_addr = ip_hdr(skb)->saddr;
+ memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
+ *addr_len = sizeof(*sin);
+ }
+
+ if (isk->cmsg_flags)
+ ip_cmsg_recv(msg, skb);
+
+#if IS_ENABLED(CONFIG_IPV6)
+ } else if (family == AF_INET6) {
+ struct ipv6_pinfo *np = inet6_sk(sk);
+ struct ipv6hdr *ip6 = ipv6_hdr(skb);
+ DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name);
+
+ if (sin6) {
+ sin6->sin6_family = AF_INET6;
+ sin6->sin6_port = 0;
+ sin6->sin6_addr = ip6->saddr;
+ sin6->sin6_flowinfo = 0;
+ if (np->sndflow)
+ sin6->sin6_flowinfo = ip6_flowinfo(ip6);
+ sin6->sin6_scope_id =
+ ipv6_iface_scope_id(&sin6->sin6_addr,
+ inet6_iif(skb));
+ *addr_len = sizeof(*sin6);
+ }
+
+ if (inet6_sk(sk)->rxopt.all)
+ pingv6_ops.ip6_datagram_recv_common_ctl(sk, msg, skb);
+ if (skb->protocol == htons(ETH_P_IPV6) &&
+ inet6_sk(sk)->rxopt.all)
+ pingv6_ops.ip6_datagram_recv_specific_ctl(sk, msg, skb);
+ else if (skb->protocol == htons(ETH_P_IP) && isk->cmsg_flags)
+ ip_cmsg_recv(msg, skb);
+#endif
+ } else {
+ BUG();
+ }
+
+ err = copied;
+
+done:
+ skb_free_datagram(sk, skb);
+out:
+ pr_debug("ping_recvmsg -> %d\n", err);
+ return err;
+}
+EXPORT_SYMBOL_GPL(ping_recvmsg);
+
+int ping_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+{
+ pr_debug("ping_queue_rcv_skb(sk=%p,sk->num=%d,skb=%p)\n",
+ inet_sk(sk), inet_sk(sk)->inet_num, skb);
+ if (sock_queue_rcv_skb(sk, skb) < 0) {
+ kfree_skb(skb);
+ pr_debug("ping_queue_rcv_skb -> failed\n");
+ return -1;
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(ping_queue_rcv_skb);
+
+
+/*
+ * All we need to do is get the socket.
+ */
+
+bool ping_rcv(struct sk_buff *skb)
+{
+ struct sock *sk;
+ struct net *net = dev_net(skb->dev);
+ struct icmphdr *icmph = icmp_hdr(skb);
+
+ /* We assume the packet has already been checked by icmp_rcv */
+
+ pr_debug("ping_rcv(skb=%p,id=%04x,seq=%04x)\n",
+ skb, ntohs(icmph->un.echo.id), ntohs(icmph->un.echo.sequence));
+
+ /* Push ICMP header back */
+ skb_push(skb, skb->data - (u8 *)icmph);
+
+ sk = ping_lookup(net, skb, ntohs(icmph->un.echo.id));
+ if (sk) {
+ struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
+
+ pr_debug("rcv on socket %p\n", sk);
+ if (skb2)
+ ping_queue_rcv_skb(sk, skb2);
+ sock_put(sk);
+ return true;
+ }
+ pr_debug("no socket, dropping\n");
+
+ return false;
+}
+EXPORT_SYMBOL_GPL(ping_rcv);
+
+struct proto ping_prot = {
+ .name = "PING",
+ .owner = THIS_MODULE,
+ .init = ping_init_sock,
+ .close = ping_close,
+ .connect = ip4_datagram_connect,
+ .disconnect = udp_disconnect,
+ .setsockopt = ip_setsockopt,
+ .getsockopt = ip_getsockopt,
+ .sendmsg = ping_v4_sendmsg,
+ .recvmsg = ping_recvmsg,
+ .bind = ping_bind,
+ .backlog_rcv = ping_queue_rcv_skb,
+ .release_cb = ip4_datagram_release_cb,
+ .hash = ping_hash,
+ .unhash = ping_unhash,
+ .get_port = ping_get_port,
+ .obj_size = sizeof(struct inet_sock),
+};
+EXPORT_SYMBOL(ping_prot);
+
+#ifdef CONFIG_PROC_FS
+
+static struct sock *ping_get_first(struct seq_file *seq, int start)
+{
+ struct sock *sk;
+ struct ping_iter_state *state = seq->private;
+ struct net *net = seq_file_net(seq);
+
+ for (state->bucket = start; state->bucket < PING_HTABLE_SIZE;
+ ++state->bucket) {
+ struct hlist_nulls_node *node;
+ struct hlist_nulls_head *hslot;
+
+ hslot = &ping_table.hash[state->bucket];
+
+ if (hlist_nulls_empty(hslot))
+ continue;
+
+ sk_nulls_for_each(sk, node, hslot) {
+ if (net_eq(sock_net(sk), net) &&
+ sk->sk_family == state->family)
+ goto found;
+ }
+ }
+ sk = NULL;
+found:
+ return sk;
+}
+
+static struct sock *ping_get_next(struct seq_file *seq, struct sock *sk)
+{
+ struct ping_iter_state *state = seq->private;
+ struct net *net = seq_file_net(seq);
+
+ do {
+ sk = sk_nulls_next(sk);
+ } while (sk && (!net_eq(sock_net(sk), net)));
+
+ if (!sk)
+ return ping_get_first(seq, state->bucket + 1);
+ return sk;
+}
+
+static struct sock *ping_get_idx(struct seq_file *seq, loff_t pos)
+{
+ struct sock *sk = ping_get_first(seq, 0);
+
+ if (sk)
+ while (pos && (sk = ping_get_next(seq, sk)) != NULL)
+ --pos;
+ return pos ? NULL : sk;
+}
+
+void *ping_seq_start(struct seq_file *seq, loff_t *pos, sa_family_t family)
+{
+ struct ping_iter_state *state = seq->private;
+ state->bucket = 0;
+ state->family = family;
+
+ read_lock_bh(&ping_table.lock);
+
+ return *pos ? ping_get_idx(seq, *pos-1) : SEQ_START_TOKEN;
+}
+EXPORT_SYMBOL_GPL(ping_seq_start);
+
+static void *ping_v4_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ return ping_seq_start(seq, pos, AF_INET);
+}
+
+void *ping_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct sock *sk;
+
+ if (v == SEQ_START_TOKEN)
+ sk = ping_get_idx(seq, 0);
+ else
+ sk = ping_get_next(seq, v);
+
+ ++*pos;
+ return sk;
+}
+EXPORT_SYMBOL_GPL(ping_seq_next);
+
+void ping_seq_stop(struct seq_file *seq, void *v)
+{
+ read_unlock_bh(&ping_table.lock);
+}
+EXPORT_SYMBOL_GPL(ping_seq_stop);
+
+static void ping_v4_format_sock(struct sock *sp, struct seq_file *f,
+ int bucket)
+{
+ struct inet_sock *inet = inet_sk(sp);
+ __be32 dest = inet->inet_daddr;
+ __be32 src = inet->inet_rcv_saddr;
+ __u16 destp = ntohs(inet->inet_dport);
+ __u16 srcp = ntohs(inet->inet_sport);
+
+ seq_printf(f, "%5d: %08X:%04X %08X:%04X"
+ " %02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %d",
+ bucket, src, srcp, dest, destp, sp->sk_state,
+ sk_wmem_alloc_get(sp),
+ sk_rmem_alloc_get(sp),
+ 0, 0L, 0,
+ from_kuid_munged(seq_user_ns(f), sock_i_uid(sp)),
+ 0, sock_i_ino(sp),
+ atomic_read(&sp->sk_refcnt), sp,
+ atomic_read(&sp->sk_drops));
+}
+
+static int ping_v4_seq_show(struct seq_file *seq, void *v)
+{
+ seq_setwidth(seq, 127);
+ if (v == SEQ_START_TOKEN)
+ seq_puts(seq, " sl local_address rem_address st tx_queue "
+ "rx_queue tr tm->when retrnsmt uid timeout "
+ "inode ref pointer drops");
+ else {
+ struct ping_iter_state *state = seq->private;
+
+ ping_v4_format_sock(v, seq, state->bucket);
+ }
+ seq_pad(seq, '\n');
+ return 0;
+}
+
+static const struct seq_operations ping_v4_seq_ops = {
+ .show = ping_v4_seq_show,
+ .start = ping_v4_seq_start,
+ .next = ping_seq_next,
+ .stop = ping_seq_stop,
+};
+
+static int ping_seq_open(struct inode *inode, struct file *file)
+{
+ struct ping_seq_afinfo *afinfo = PDE_DATA(inode);
+ return seq_open_net(inode, file, &afinfo->seq_ops,
+ sizeof(struct ping_iter_state));
+}
+
+const struct file_operations ping_seq_fops = {
+ .open = ping_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_net,
+};
+EXPORT_SYMBOL_GPL(ping_seq_fops);
+
+static struct ping_seq_afinfo ping_v4_seq_afinfo = {
+ .name = "icmp",
+ .family = AF_INET,
+ .seq_fops = &ping_seq_fops,
+ .seq_ops = {
+ .start = ping_v4_seq_start,
+ .show = ping_v4_seq_show,
+ .next = ping_seq_next,
+ .stop = ping_seq_stop,
+ },
+};
+
+int ping_proc_register(struct net *net, struct ping_seq_afinfo *afinfo)
+{
+ struct proc_dir_entry *p;
+ p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
+ afinfo->seq_fops, afinfo);
+ if (!p)
+ return -ENOMEM;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(ping_proc_register);
+
+void ping_proc_unregister(struct net *net, struct ping_seq_afinfo *afinfo)
+{
+ remove_proc_entry(afinfo->name, net->proc_net);
+}
+EXPORT_SYMBOL_GPL(ping_proc_unregister);
+
+static int __net_init ping_v4_proc_init_net(struct net *net)
+{
+ return ping_proc_register(net, &ping_v4_seq_afinfo);
+}
+
+static void __net_exit ping_v4_proc_exit_net(struct net *net)
+{
+ ping_proc_unregister(net, &ping_v4_seq_afinfo);
+}
+
+static struct pernet_operations ping_v4_net_ops = {
+ .init = ping_v4_proc_init_net,
+ .exit = ping_v4_proc_exit_net,
+};
+
+int __init ping_proc_init(void)
+{
+ return register_pernet_subsys(&ping_v4_net_ops);
+}
+
+void ping_proc_exit(void)
+{
+ unregister_pernet_subsys(&ping_v4_net_ops);
+}
+
+#endif
+
+void __init ping_init(void)
+{
+ int i;
+
+ for (i = 0; i < PING_HTABLE_SIZE; i++)
+ INIT_HLIST_NULLS_HEAD(&ping_table.hash[i], i);
+ rwlock_init(&ping_table.lock);
+}
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
new file mode 100644
index 000000000..e1f3b911d
--- /dev/null
+++ b/net/ipv4/proc.c
@@ -0,0 +1,540 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * This file implements the various access functions for the
+ * PROC file system. It is mainly used for debugging and
+ * statistics.
+ *
+ * Authors: Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ * Gerald J. Heim, <heim@peanuts.informatik.uni-tuebingen.de>
+ * Fred Baumgarten, <dc6iq@insu1.etec.uni-karlsruhe.de>
+ * Erik Schoenfelder, <schoenfr@ibr.cs.tu-bs.de>
+ *
+ * Fixes:
+ * Alan Cox : UDP sockets show the rxqueue/txqueue
+ * using hint flag for the netinfo.
+ * Pauline Middelink : identd support
+ * Alan Cox : Make /proc safer.
+ * Erik Schoenfelder : /proc/net/snmp
+ * Alan Cox : Handle dead sockets properly.
+ * Gerhard Koerting : Show both timers
+ * Alan Cox : Allow inode to be NULL (kernel socket)
+ * Andi Kleen : Add support for open_requests and
+ * split functions for more readibility.
+ * Andi Kleen : Add support for /proc/net/netstat
+ * Arnaldo C. Melo : Convert to seq_file
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/types.h>
+#include <net/net_namespace.h>
+#include <net/icmp.h>
+#include <net/protocol.h>
+#include <net/tcp.h>
+#include <net/udp.h>
+#include <net/udplite.h>
+#include <linux/bottom_half.h>
+#include <linux/inetdevice.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/export.h>
+#include <net/sock.h>
+#include <net/raw.h>
+
+/*
+ * Report socket allocation statistics [mea@utu.fi]
+ */
+static int sockstat_seq_show(struct seq_file *seq, void *v)
+{
+ struct net *net = seq->private;
+ unsigned int frag_mem;
+ int orphans, sockets;
+
+ local_bh_disable();
+ orphans = percpu_counter_sum_positive(&tcp_orphan_count);
+ sockets = proto_sockets_allocated_sum_positive(&tcp_prot);
+ local_bh_enable();
+
+ socket_seq_show(seq);
+ seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %ld\n",
+ sock_prot_inuse_get(net, &tcp_prot), orphans,
+ atomic_read(&tcp_death_row.tw_count), sockets,
+ proto_memory_allocated(&tcp_prot));
+ seq_printf(seq, "UDP: inuse %d mem %ld\n",
+ sock_prot_inuse_get(net, &udp_prot),
+ proto_memory_allocated(&udp_prot));
+ seq_printf(seq, "UDPLITE: inuse %d\n",
+ sock_prot_inuse_get(net, &udplite_prot));
+ seq_printf(seq, "RAW: inuse %d\n",
+ sock_prot_inuse_get(net, &raw_prot));
+ frag_mem = ip_frag_mem(net);
+ seq_printf(seq, "FRAG: inuse %u memory %u\n", !!frag_mem, frag_mem);
+ return 0;
+}
+
+static int sockstat_seq_open(struct inode *inode, struct file *file)
+{
+ return single_open_net(inode, file, sockstat_seq_show);
+}
+
+static const struct file_operations sockstat_seq_fops = {
+ .owner = THIS_MODULE,
+ .open = sockstat_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release_net,
+};
+
+/* snmp items */
+static const struct snmp_mib snmp4_ipstats_list[] = {
+ SNMP_MIB_ITEM("InReceives", IPSTATS_MIB_INPKTS),
+ SNMP_MIB_ITEM("InHdrErrors", IPSTATS_MIB_INHDRERRORS),
+ SNMP_MIB_ITEM("InAddrErrors", IPSTATS_MIB_INADDRERRORS),
+ SNMP_MIB_ITEM("ForwDatagrams", IPSTATS_MIB_OUTFORWDATAGRAMS),
+ SNMP_MIB_ITEM("InUnknownProtos", IPSTATS_MIB_INUNKNOWNPROTOS),
+ SNMP_MIB_ITEM("InDiscards", IPSTATS_MIB_INDISCARDS),
+ SNMP_MIB_ITEM("InDelivers", IPSTATS_MIB_INDELIVERS),
+ SNMP_MIB_ITEM("OutRequests", IPSTATS_MIB_OUTPKTS),
+ SNMP_MIB_ITEM("OutDiscards", IPSTATS_MIB_OUTDISCARDS),
+ SNMP_MIB_ITEM("OutNoRoutes", IPSTATS_MIB_OUTNOROUTES),
+ SNMP_MIB_ITEM("ReasmTimeout", IPSTATS_MIB_REASMTIMEOUT),
+ SNMP_MIB_ITEM("ReasmReqds", IPSTATS_MIB_REASMREQDS),
+ SNMP_MIB_ITEM("ReasmOKs", IPSTATS_MIB_REASMOKS),
+ SNMP_MIB_ITEM("ReasmFails", IPSTATS_MIB_REASMFAILS),
+ SNMP_MIB_ITEM("FragOKs", IPSTATS_MIB_FRAGOKS),
+ SNMP_MIB_ITEM("FragFails", IPSTATS_MIB_FRAGFAILS),
+ SNMP_MIB_ITEM("FragCreates", IPSTATS_MIB_FRAGCREATES),
+ SNMP_MIB_SENTINEL
+};
+
+/* Following items are displayed in /proc/net/netstat */
+static const struct snmp_mib snmp4_ipextstats_list[] = {
+ SNMP_MIB_ITEM("InNoRoutes", IPSTATS_MIB_INNOROUTES),
+ SNMP_MIB_ITEM("InTruncatedPkts", IPSTATS_MIB_INTRUNCATEDPKTS),
+ SNMP_MIB_ITEM("InMcastPkts", IPSTATS_MIB_INMCASTPKTS),
+ SNMP_MIB_ITEM("OutMcastPkts", IPSTATS_MIB_OUTMCASTPKTS),
+ SNMP_MIB_ITEM("InBcastPkts", IPSTATS_MIB_INBCASTPKTS),
+ SNMP_MIB_ITEM("OutBcastPkts", IPSTATS_MIB_OUTBCASTPKTS),
+ SNMP_MIB_ITEM("InOctets", IPSTATS_MIB_INOCTETS),
+ SNMP_MIB_ITEM("OutOctets", IPSTATS_MIB_OUTOCTETS),
+ SNMP_MIB_ITEM("InMcastOctets", IPSTATS_MIB_INMCASTOCTETS),
+ SNMP_MIB_ITEM("OutMcastOctets", IPSTATS_MIB_OUTMCASTOCTETS),
+ SNMP_MIB_ITEM("InBcastOctets", IPSTATS_MIB_INBCASTOCTETS),
+ SNMP_MIB_ITEM("OutBcastOctets", IPSTATS_MIB_OUTBCASTOCTETS),
+ /* Non RFC4293 fields */
+ SNMP_MIB_ITEM("InCsumErrors", IPSTATS_MIB_CSUMERRORS),
+ SNMP_MIB_ITEM("InNoECTPkts", IPSTATS_MIB_NOECTPKTS),
+ SNMP_MIB_ITEM("InECT1Pkts", IPSTATS_MIB_ECT1PKTS),
+ SNMP_MIB_ITEM("InECT0Pkts", IPSTATS_MIB_ECT0PKTS),
+ SNMP_MIB_ITEM("InCEPkts", IPSTATS_MIB_CEPKTS),
+ SNMP_MIB_SENTINEL
+};
+
+static const struct {
+ const char *name;
+ int index;
+} icmpmibmap[] = {
+ { "DestUnreachs", ICMP_DEST_UNREACH },
+ { "TimeExcds", ICMP_TIME_EXCEEDED },
+ { "ParmProbs", ICMP_PARAMETERPROB },
+ { "SrcQuenchs", ICMP_SOURCE_QUENCH },
+ { "Redirects", ICMP_REDIRECT },
+ { "Echos", ICMP_ECHO },
+ { "EchoReps", ICMP_ECHOREPLY },
+ { "Timestamps", ICMP_TIMESTAMP },
+ { "TimestampReps", ICMP_TIMESTAMPREPLY },
+ { "AddrMasks", ICMP_ADDRESS },
+ { "AddrMaskReps", ICMP_ADDRESSREPLY },
+ { NULL, 0 }
+};
+
+
+static const struct snmp_mib snmp4_tcp_list[] = {
+ SNMP_MIB_ITEM("RtoAlgorithm", TCP_MIB_RTOALGORITHM),
+ SNMP_MIB_ITEM("RtoMin", TCP_MIB_RTOMIN),
+ SNMP_MIB_ITEM("RtoMax", TCP_MIB_RTOMAX),
+ SNMP_MIB_ITEM("MaxConn", TCP_MIB_MAXCONN),
+ SNMP_MIB_ITEM("ActiveOpens", TCP_MIB_ACTIVEOPENS),
+ SNMP_MIB_ITEM("PassiveOpens", TCP_MIB_PASSIVEOPENS),
+ SNMP_MIB_ITEM("AttemptFails", TCP_MIB_ATTEMPTFAILS),
+ SNMP_MIB_ITEM("EstabResets", TCP_MIB_ESTABRESETS),
+ SNMP_MIB_ITEM("CurrEstab", TCP_MIB_CURRESTAB),
+ SNMP_MIB_ITEM("InSegs", TCP_MIB_INSEGS),
+ SNMP_MIB_ITEM("OutSegs", TCP_MIB_OUTSEGS),
+ SNMP_MIB_ITEM("RetransSegs", TCP_MIB_RETRANSSEGS),
+ SNMP_MIB_ITEM("InErrs", TCP_MIB_INERRS),
+ SNMP_MIB_ITEM("OutRsts", TCP_MIB_OUTRSTS),
+ SNMP_MIB_ITEM("InCsumErrors", TCP_MIB_CSUMERRORS),
+ SNMP_MIB_SENTINEL
+};
+
+static const struct snmp_mib snmp4_udp_list[] = {
+ SNMP_MIB_ITEM("InDatagrams", UDP_MIB_INDATAGRAMS),
+ SNMP_MIB_ITEM("NoPorts", UDP_MIB_NOPORTS),
+ SNMP_MIB_ITEM("InErrors", UDP_MIB_INERRORS),
+ SNMP_MIB_ITEM("OutDatagrams", UDP_MIB_OUTDATAGRAMS),
+ SNMP_MIB_ITEM("RcvbufErrors", UDP_MIB_RCVBUFERRORS),
+ SNMP_MIB_ITEM("SndbufErrors", UDP_MIB_SNDBUFERRORS),
+ SNMP_MIB_ITEM("InCsumErrors", UDP_MIB_CSUMERRORS),
+ SNMP_MIB_ITEM("IgnoredMulti", UDP_MIB_IGNOREDMULTI),
+ SNMP_MIB_SENTINEL
+};
+
+static const struct snmp_mib snmp4_net_list[] = {
+ SNMP_MIB_ITEM("SyncookiesSent", LINUX_MIB_SYNCOOKIESSENT),
+ SNMP_MIB_ITEM("SyncookiesRecv", LINUX_MIB_SYNCOOKIESRECV),
+ SNMP_MIB_ITEM("SyncookiesFailed", LINUX_MIB_SYNCOOKIESFAILED),
+ SNMP_MIB_ITEM("EmbryonicRsts", LINUX_MIB_EMBRYONICRSTS),
+ SNMP_MIB_ITEM("PruneCalled", LINUX_MIB_PRUNECALLED),
+ SNMP_MIB_ITEM("RcvPruned", LINUX_MIB_RCVPRUNED),
+ SNMP_MIB_ITEM("OfoPruned", LINUX_MIB_OFOPRUNED),
+ SNMP_MIB_ITEM("OutOfWindowIcmps", LINUX_MIB_OUTOFWINDOWICMPS),
+ SNMP_MIB_ITEM("LockDroppedIcmps", LINUX_MIB_LOCKDROPPEDICMPS),
+ SNMP_MIB_ITEM("ArpFilter", LINUX_MIB_ARPFILTER),
+ SNMP_MIB_ITEM("TW", LINUX_MIB_TIMEWAITED),
+ SNMP_MIB_ITEM("TWRecycled", LINUX_MIB_TIMEWAITRECYCLED),
+ SNMP_MIB_ITEM("TWKilled", LINUX_MIB_TIMEWAITKILLED),
+ SNMP_MIB_ITEM("PAWSPassive", LINUX_MIB_PAWSPASSIVEREJECTED),
+ SNMP_MIB_ITEM("PAWSActive", LINUX_MIB_PAWSACTIVEREJECTED),
+ SNMP_MIB_ITEM("PAWSEstab", LINUX_MIB_PAWSESTABREJECTED),
+ SNMP_MIB_ITEM("DelayedACKs", LINUX_MIB_DELAYEDACKS),
+ SNMP_MIB_ITEM("DelayedACKLocked", LINUX_MIB_DELAYEDACKLOCKED),
+ SNMP_MIB_ITEM("DelayedACKLost", LINUX_MIB_DELAYEDACKLOST),
+ SNMP_MIB_ITEM("ListenOverflows", LINUX_MIB_LISTENOVERFLOWS),
+ SNMP_MIB_ITEM("ListenDrops", LINUX_MIB_LISTENDROPS),
+ SNMP_MIB_ITEM("TCPPrequeued", LINUX_MIB_TCPPREQUEUED),
+ SNMP_MIB_ITEM("TCPDirectCopyFromBacklog", LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG),
+ SNMP_MIB_ITEM("TCPDirectCopyFromPrequeue", LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE),
+ SNMP_MIB_ITEM("TCPPrequeueDropped", LINUX_MIB_TCPPREQUEUEDROPPED),
+ SNMP_MIB_ITEM("TCPHPHits", LINUX_MIB_TCPHPHITS),
+ SNMP_MIB_ITEM("TCPHPHitsToUser", LINUX_MIB_TCPHPHITSTOUSER),
+ SNMP_MIB_ITEM("TCPPureAcks", LINUX_MIB_TCPPUREACKS),
+ SNMP_MIB_ITEM("TCPHPAcks", LINUX_MIB_TCPHPACKS),
+ SNMP_MIB_ITEM("TCPRenoRecovery", LINUX_MIB_TCPRENORECOVERY),
+ SNMP_MIB_ITEM("TCPSackRecovery", LINUX_MIB_TCPSACKRECOVERY),
+ SNMP_MIB_ITEM("TCPSACKReneging", LINUX_MIB_TCPSACKRENEGING),
+ SNMP_MIB_ITEM("TCPFACKReorder", LINUX_MIB_TCPFACKREORDER),
+ SNMP_MIB_ITEM("TCPSACKReorder", LINUX_MIB_TCPSACKREORDER),
+ SNMP_MIB_ITEM("TCPRenoReorder", LINUX_MIB_TCPRENOREORDER),
+ SNMP_MIB_ITEM("TCPTSReorder", LINUX_MIB_TCPTSREORDER),
+ SNMP_MIB_ITEM("TCPFullUndo", LINUX_MIB_TCPFULLUNDO),
+ SNMP_MIB_ITEM("TCPPartialUndo", LINUX_MIB_TCPPARTIALUNDO),
+ SNMP_MIB_ITEM("TCPDSACKUndo", LINUX_MIB_TCPDSACKUNDO),
+ SNMP_MIB_ITEM("TCPLossUndo", LINUX_MIB_TCPLOSSUNDO),
+ SNMP_MIB_ITEM("TCPLostRetransmit", LINUX_MIB_TCPLOSTRETRANSMIT),
+ SNMP_MIB_ITEM("TCPRenoFailures", LINUX_MIB_TCPRENOFAILURES),
+ SNMP_MIB_ITEM("TCPSackFailures", LINUX_MIB_TCPSACKFAILURES),
+ SNMP_MIB_ITEM("TCPLossFailures", LINUX_MIB_TCPLOSSFAILURES),
+ SNMP_MIB_ITEM("TCPFastRetrans", LINUX_MIB_TCPFASTRETRANS),
+ SNMP_MIB_ITEM("TCPForwardRetrans", LINUX_MIB_TCPFORWARDRETRANS),
+ SNMP_MIB_ITEM("TCPSlowStartRetrans", LINUX_MIB_TCPSLOWSTARTRETRANS),
+ SNMP_MIB_ITEM("TCPTimeouts", LINUX_MIB_TCPTIMEOUTS),
+ SNMP_MIB_ITEM("TCPLossProbes", LINUX_MIB_TCPLOSSPROBES),
+ SNMP_MIB_ITEM("TCPLossProbeRecovery", LINUX_MIB_TCPLOSSPROBERECOVERY),
+ SNMP_MIB_ITEM("TCPRenoRecoveryFail", LINUX_MIB_TCPRENORECOVERYFAIL),
+ SNMP_MIB_ITEM("TCPSackRecoveryFail", LINUX_MIB_TCPSACKRECOVERYFAIL),
+ SNMP_MIB_ITEM("TCPSchedulerFailed", LINUX_MIB_TCPSCHEDULERFAILED),
+ SNMP_MIB_ITEM("TCPRcvCollapsed", LINUX_MIB_TCPRCVCOLLAPSED),
+ SNMP_MIB_ITEM("TCPDSACKOldSent", LINUX_MIB_TCPDSACKOLDSENT),
+ SNMP_MIB_ITEM("TCPDSACKOfoSent", LINUX_MIB_TCPDSACKOFOSENT),
+ SNMP_MIB_ITEM("TCPDSACKRecv", LINUX_MIB_TCPDSACKRECV),
+ SNMP_MIB_ITEM("TCPDSACKOfoRecv", LINUX_MIB_TCPDSACKOFORECV),
+ SNMP_MIB_ITEM("TCPAbortOnData", LINUX_MIB_TCPABORTONDATA),
+ SNMP_MIB_ITEM("TCPAbortOnClose", LINUX_MIB_TCPABORTONCLOSE),
+ SNMP_MIB_ITEM("TCPAbortOnMemory", LINUX_MIB_TCPABORTONMEMORY),
+ SNMP_MIB_ITEM("TCPAbortOnTimeout", LINUX_MIB_TCPABORTONTIMEOUT),
+ SNMP_MIB_ITEM("TCPAbortOnLinger", LINUX_MIB_TCPABORTONLINGER),
+ SNMP_MIB_ITEM("TCPAbortFailed", LINUX_MIB_TCPABORTFAILED),
+ SNMP_MIB_ITEM("TCPMemoryPressures", LINUX_MIB_TCPMEMORYPRESSURES),
+ SNMP_MIB_ITEM("TCPSACKDiscard", LINUX_MIB_TCPSACKDISCARD),
+ SNMP_MIB_ITEM("TCPDSACKIgnoredOld", LINUX_MIB_TCPDSACKIGNOREDOLD),
+ SNMP_MIB_ITEM("TCPDSACKIgnoredNoUndo", LINUX_MIB_TCPDSACKIGNOREDNOUNDO),
+ SNMP_MIB_ITEM("TCPSpuriousRTOs", LINUX_MIB_TCPSPURIOUSRTOS),
+ SNMP_MIB_ITEM("TCPMD5NotFound", LINUX_MIB_TCPMD5NOTFOUND),
+ SNMP_MIB_ITEM("TCPMD5Unexpected", LINUX_MIB_TCPMD5UNEXPECTED),
+ SNMP_MIB_ITEM("TCPSackShifted", LINUX_MIB_SACKSHIFTED),
+ SNMP_MIB_ITEM("TCPSackMerged", LINUX_MIB_SACKMERGED),
+ SNMP_MIB_ITEM("TCPSackShiftFallback", LINUX_MIB_SACKSHIFTFALLBACK),
+ SNMP_MIB_ITEM("TCPBacklogDrop", LINUX_MIB_TCPBACKLOGDROP),
+ SNMP_MIB_ITEM("TCPMinTTLDrop", LINUX_MIB_TCPMINTTLDROP),
+ SNMP_MIB_ITEM("TCPDeferAcceptDrop", LINUX_MIB_TCPDEFERACCEPTDROP),
+ SNMP_MIB_ITEM("IPReversePathFilter", LINUX_MIB_IPRPFILTER),
+ SNMP_MIB_ITEM("TCPTimeWaitOverflow", LINUX_MIB_TCPTIMEWAITOVERFLOW),
+ SNMP_MIB_ITEM("TCPReqQFullDoCookies", LINUX_MIB_TCPREQQFULLDOCOOKIES),
+ SNMP_MIB_ITEM("TCPReqQFullDrop", LINUX_MIB_TCPREQQFULLDROP),
+ SNMP_MIB_ITEM("TCPRetransFail", LINUX_MIB_TCPRETRANSFAIL),
+ SNMP_MIB_ITEM("TCPRcvCoalesce", LINUX_MIB_TCPRCVCOALESCE),
+ SNMP_MIB_ITEM("TCPOFOQueue", LINUX_MIB_TCPOFOQUEUE),
+ SNMP_MIB_ITEM("TCPOFODrop", LINUX_MIB_TCPOFODROP),
+ SNMP_MIB_ITEM("TCPOFOMerge", LINUX_MIB_TCPOFOMERGE),
+ SNMP_MIB_ITEM("TCPChallengeACK", LINUX_MIB_TCPCHALLENGEACK),
+ SNMP_MIB_ITEM("TCPSYNChallenge", LINUX_MIB_TCPSYNCHALLENGE),
+ SNMP_MIB_ITEM("TCPFastOpenActive", LINUX_MIB_TCPFASTOPENACTIVE),
+ SNMP_MIB_ITEM("TCPFastOpenActiveFail", LINUX_MIB_TCPFASTOPENACTIVEFAIL),
+ SNMP_MIB_ITEM("TCPFastOpenPassive", LINUX_MIB_TCPFASTOPENPASSIVE),
+ SNMP_MIB_ITEM("TCPFastOpenPassiveFail", LINUX_MIB_TCPFASTOPENPASSIVEFAIL),
+ SNMP_MIB_ITEM("TCPFastOpenListenOverflow", LINUX_MIB_TCPFASTOPENLISTENOVERFLOW),
+ SNMP_MIB_ITEM("TCPFastOpenCookieReqd", LINUX_MIB_TCPFASTOPENCOOKIEREQD),
+ SNMP_MIB_ITEM("TCPSpuriousRtxHostQueues", LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES),
+ SNMP_MIB_ITEM("BusyPollRxPackets", LINUX_MIB_BUSYPOLLRXPACKETS),
+ SNMP_MIB_ITEM("TCPAutoCorking", LINUX_MIB_TCPAUTOCORKING),
+ SNMP_MIB_ITEM("TCPFromZeroWindowAdv", LINUX_MIB_TCPFROMZEROWINDOWADV),
+ SNMP_MIB_ITEM("TCPToZeroWindowAdv", LINUX_MIB_TCPTOZEROWINDOWADV),
+ SNMP_MIB_ITEM("TCPWantZeroWindowAdv", LINUX_MIB_TCPWANTZEROWINDOWADV),
+ SNMP_MIB_ITEM("TCPSynRetrans", LINUX_MIB_TCPSYNRETRANS),
+ SNMP_MIB_ITEM("TCPOrigDataSent", LINUX_MIB_TCPORIGDATASENT),
+ SNMP_MIB_ITEM("TCPHystartTrainDetect", LINUX_MIB_TCPHYSTARTTRAINDETECT),
+ SNMP_MIB_ITEM("TCPHystartTrainCwnd", LINUX_MIB_TCPHYSTARTTRAINCWND),
+ SNMP_MIB_ITEM("TCPHystartDelayDetect", LINUX_MIB_TCPHYSTARTDELAYDETECT),
+ SNMP_MIB_ITEM("TCPHystartDelayCwnd", LINUX_MIB_TCPHYSTARTDELAYCWND),
+ SNMP_MIB_ITEM("TCPACKSkippedSynRecv", LINUX_MIB_TCPACKSKIPPEDSYNRECV),
+ SNMP_MIB_ITEM("TCPACKSkippedPAWS", LINUX_MIB_TCPACKSKIPPEDPAWS),
+ SNMP_MIB_ITEM("TCPACKSkippedSeq", LINUX_MIB_TCPACKSKIPPEDSEQ),
+ SNMP_MIB_ITEM("TCPACKSkippedFinWait2", LINUX_MIB_TCPACKSKIPPEDFINWAIT2),
+ SNMP_MIB_ITEM("TCPACKSkippedTimeWait", LINUX_MIB_TCPACKSKIPPEDTIMEWAIT),
+ SNMP_MIB_ITEM("TCPACKSkippedChallenge", LINUX_MIB_TCPACKSKIPPEDCHALLENGE),
+ SNMP_MIB_SENTINEL
+};
+
+static void icmpmsg_put_line(struct seq_file *seq, unsigned long *vals,
+ unsigned short *type, int count)
+{
+ int j;
+
+ if (count) {
+ seq_puts(seq, "\nIcmpMsg:");
+ for (j = 0; j < count; ++j)
+ seq_printf(seq, " %sType%u",
+ type[j] & 0x100 ? "Out" : "In",
+ type[j] & 0xff);
+ seq_puts(seq, "\nIcmpMsg:");
+ for (j = 0; j < count; ++j)
+ seq_printf(seq, " %lu", vals[j]);
+ }
+}
+
+static void icmpmsg_put(struct seq_file *seq)
+{
+#define PERLINE 16
+
+ int i, count;
+ unsigned short type[PERLINE];
+ unsigned long vals[PERLINE], val;
+ struct net *net = seq->private;
+
+ count = 0;
+ for (i = 0; i < ICMPMSG_MIB_MAX; i++) {
+ val = atomic_long_read(&net->mib.icmpmsg_statistics->mibs[i]);
+ if (val) {
+ type[count] = i;
+ vals[count++] = val;
+ }
+ if (count == PERLINE) {
+ icmpmsg_put_line(seq, vals, type, count);
+ count = 0;
+ }
+ }
+ icmpmsg_put_line(seq, vals, type, count);
+
+#undef PERLINE
+}
+
+static void icmp_put(struct seq_file *seq)
+{
+ int i;
+ struct net *net = seq->private;
+ atomic_long_t *ptr = net->mib.icmpmsg_statistics->mibs;
+
+ seq_puts(seq, "\nIcmp: InMsgs InErrors InCsumErrors");
+ for (i = 0; icmpmibmap[i].name != NULL; i++)
+ seq_printf(seq, " In%s", icmpmibmap[i].name);
+ seq_puts(seq, " OutMsgs OutErrors");
+ for (i = 0; icmpmibmap[i].name != NULL; i++)
+ seq_printf(seq, " Out%s", icmpmibmap[i].name);
+ seq_printf(seq, "\nIcmp: %lu %lu %lu",
+ snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_INMSGS),
+ snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_INERRORS),
+ snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_CSUMERRORS));
+ for (i = 0; icmpmibmap[i].name != NULL; i++)
+ seq_printf(seq, " %lu",
+ atomic_long_read(ptr + icmpmibmap[i].index));
+ seq_printf(seq, " %lu %lu",
+ snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_OUTMSGS),
+ snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_OUTERRORS));
+ for (i = 0; icmpmibmap[i].name != NULL; i++)
+ seq_printf(seq, " %lu",
+ atomic_long_read(ptr + (icmpmibmap[i].index | 0x100)));
+}
+
+/*
+ * Called from the PROCfs module. This outputs /proc/net/snmp.
+ */
+static int snmp_seq_show(struct seq_file *seq, void *v)
+{
+ int i;
+ struct net *net = seq->private;
+
+ seq_puts(seq, "Ip: Forwarding DefaultTTL");
+
+ for (i = 0; snmp4_ipstats_list[i].name != NULL; i++)
+ seq_printf(seq, " %s", snmp4_ipstats_list[i].name);
+
+ seq_printf(seq, "\nIp: %d %d",
+ IPV4_DEVCONF_ALL(net, FORWARDING) ? 1 : 2,
+ sysctl_ip_default_ttl);
+
+ BUILD_BUG_ON(offsetof(struct ipstats_mib, mibs) != 0);
+ for (i = 0; snmp4_ipstats_list[i].name != NULL; i++)
+ seq_printf(seq, " %llu",
+ snmp_fold_field64(net->mib.ip_statistics,
+ snmp4_ipstats_list[i].entry,
+ offsetof(struct ipstats_mib, syncp)));
+
+ icmp_put(seq); /* RFC 2011 compatibility */
+ icmpmsg_put(seq);
+
+ seq_puts(seq, "\nTcp:");
+ for (i = 0; snmp4_tcp_list[i].name != NULL; i++)
+ seq_printf(seq, " %s", snmp4_tcp_list[i].name);
+
+ seq_puts(seq, "\nTcp:");
+ for (i = 0; snmp4_tcp_list[i].name != NULL; i++) {
+ /* MaxConn field is signed, RFC 2012 */
+ if (snmp4_tcp_list[i].entry == TCP_MIB_MAXCONN)
+ seq_printf(seq, " %ld",
+ snmp_fold_field(net->mib.tcp_statistics,
+ snmp4_tcp_list[i].entry));
+ else
+ seq_printf(seq, " %lu",
+ snmp_fold_field(net->mib.tcp_statistics,
+ snmp4_tcp_list[i].entry));
+ }
+
+ seq_puts(seq, "\nUdp:");
+ for (i = 0; snmp4_udp_list[i].name != NULL; i++)
+ seq_printf(seq, " %s", snmp4_udp_list[i].name);
+
+ seq_puts(seq, "\nUdp:");
+ for (i = 0; snmp4_udp_list[i].name != NULL; i++)
+ seq_printf(seq, " %lu",
+ snmp_fold_field(net->mib.udp_statistics,
+ snmp4_udp_list[i].entry));
+
+ /* the UDP and UDP-Lite MIBs are the same */
+ seq_puts(seq, "\nUdpLite:");
+ for (i = 0; snmp4_udp_list[i].name != NULL; i++)
+ seq_printf(seq, " %s", snmp4_udp_list[i].name);
+
+ seq_puts(seq, "\nUdpLite:");
+ for (i = 0; snmp4_udp_list[i].name != NULL; i++)
+ seq_printf(seq, " %lu",
+ snmp_fold_field(net->mib.udplite_statistics,
+ snmp4_udp_list[i].entry));
+
+ seq_putc(seq, '\n');
+ return 0;
+}
+
+static int snmp_seq_open(struct inode *inode, struct file *file)
+{
+ return single_open_net(inode, file, snmp_seq_show);
+}
+
+static const struct file_operations snmp_seq_fops = {
+ .owner = THIS_MODULE,
+ .open = snmp_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release_net,
+};
+
+
+
+/*
+ * Output /proc/net/netstat
+ */
+static int netstat_seq_show(struct seq_file *seq, void *v)
+{
+ int i;
+ struct net *net = seq->private;
+
+ seq_puts(seq, "TcpExt:");
+ for (i = 0; snmp4_net_list[i].name != NULL; i++)
+ seq_printf(seq, " %s", snmp4_net_list[i].name);
+
+ seq_puts(seq, "\nTcpExt:");
+ for (i = 0; snmp4_net_list[i].name != NULL; i++)
+ seq_printf(seq, " %lu",
+ snmp_fold_field(net->mib.net_statistics,
+ snmp4_net_list[i].entry));
+
+ seq_puts(seq, "\nIpExt:");
+ for (i = 0; snmp4_ipextstats_list[i].name != NULL; i++)
+ seq_printf(seq, " %s", snmp4_ipextstats_list[i].name);
+
+ seq_puts(seq, "\nIpExt:");
+ for (i = 0; snmp4_ipextstats_list[i].name != NULL; i++)
+ seq_printf(seq, " %llu",
+ snmp_fold_field64(net->mib.ip_statistics,
+ snmp4_ipextstats_list[i].entry,
+ offsetof(struct ipstats_mib, syncp)));
+
+ seq_putc(seq, '\n');
+ return 0;
+}
+
+static int netstat_seq_open(struct inode *inode, struct file *file)
+{
+ return single_open_net(inode, file, netstat_seq_show);
+}
+
+static const struct file_operations netstat_seq_fops = {
+ .owner = THIS_MODULE,
+ .open = netstat_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release_net,
+};
+
+static __net_init int ip_proc_init_net(struct net *net)
+{
+ if (!proc_create("sockstat", S_IRUGO, net->proc_net,
+ &sockstat_seq_fops))
+ goto out_sockstat;
+ if (!proc_create("netstat", S_IRUGO, net->proc_net, &netstat_seq_fops))
+ goto out_netstat;
+ if (!proc_create("snmp", S_IRUGO, net->proc_net, &snmp_seq_fops))
+ goto out_snmp;
+
+ return 0;
+
+out_snmp:
+ remove_proc_entry("netstat", net->proc_net);
+out_netstat:
+ remove_proc_entry("sockstat", net->proc_net);
+out_sockstat:
+ return -ENOMEM;
+}
+
+static __net_exit void ip_proc_exit_net(struct net *net)
+{
+ remove_proc_entry("snmp", net->proc_net);
+ remove_proc_entry("netstat", net->proc_net);
+ remove_proc_entry("sockstat", net->proc_net);
+}
+
+static __net_initdata struct pernet_operations ip_proc_ops = {
+ .init = ip_proc_init_net,
+ .exit = ip_proc_exit_net,
+};
+
+int __init ip_misc_proc_init(void)
+{
+ return register_pernet_subsys(&ip_proc_ops);
+}
+
diff --git a/net/ipv4/protocol.c b/net/ipv4/protocol.c
new file mode 100644
index 000000000..4b7c0ec65
--- /dev/null
+++ b/net/ipv4/protocol.c
@@ -0,0 +1,79 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * INET protocol dispatch tables.
+ *
+ * Authors: Ross Biro
+ * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ *
+ * Fixes:
+ * Alan Cox : Ahah! udp icmp errors don't work because
+ * udp_err is never called!
+ * Alan Cox : Added new fields for init and ready for
+ * proper fragmentation (_NO_ 4K limits!)
+ * Richard Colella : Hang on hash collision
+ * Vince Laviano : Modified inet_del_protocol() to correctly
+ * maintain copy bit.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/cache.h>
+#include <linux/module.h>
+#include <linux/netdevice.h>
+#include <linux/spinlock.h>
+#include <net/protocol.h>
+
+const struct net_protocol __rcu *inet_protos[MAX_INET_PROTOS] __read_mostly;
+const struct net_offload __rcu *inet_offloads[MAX_INET_PROTOS] __read_mostly;
+EXPORT_SYMBOL(inet_offloads);
+
+int inet_add_protocol(const struct net_protocol *prot, unsigned char protocol)
+{
+ if (!prot->netns_ok) {
+ pr_err("Protocol %u is not namespace aware, cannot register.\n",
+ protocol);
+ return -EINVAL;
+ }
+
+ return !cmpxchg((const struct net_protocol **)&inet_protos[protocol],
+ NULL, prot) ? 0 : -1;
+}
+EXPORT_SYMBOL(inet_add_protocol);
+
+int inet_add_offload(const struct net_offload *prot, unsigned char protocol)
+{
+ return !cmpxchg((const struct net_offload **)&inet_offloads[protocol],
+ NULL, prot) ? 0 : -1;
+}
+EXPORT_SYMBOL(inet_add_offload);
+
+int inet_del_protocol(const struct net_protocol *prot, unsigned char protocol)
+{
+ int ret;
+
+ ret = (cmpxchg((const struct net_protocol **)&inet_protos[protocol],
+ prot, NULL) == prot) ? 0 : -1;
+
+ synchronize_net();
+
+ return ret;
+}
+EXPORT_SYMBOL(inet_del_protocol);
+
+int inet_del_offload(const struct net_offload *prot, unsigned char protocol)
+{
+ int ret;
+
+ ret = (cmpxchg((const struct net_offload **)&inet_offloads[protocol],
+ prot, NULL) == prot) ? 0 : -1;
+
+ synchronize_net();
+
+ return ret;
+}
+EXPORT_SYMBOL(inet_del_offload);
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
new file mode 100644
index 000000000..561cd4b8f
--- /dev/null
+++ b/net/ipv4/raw.c
@@ -0,0 +1,1100 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * RAW - implementation of IP "raw" sockets.
+ *
+ * Authors: Ross Biro
+ * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ *
+ * Fixes:
+ * Alan Cox : verify_area() fixed up
+ * Alan Cox : ICMP error handling
+ * Alan Cox : EMSGSIZE if you send too big a packet
+ * Alan Cox : Now uses generic datagrams and shared
+ * skbuff library. No more peek crashes,
+ * no more backlogs
+ * Alan Cox : Checks sk->broadcast.
+ * Alan Cox : Uses skb_free_datagram/skb_copy_datagram
+ * Alan Cox : Raw passes ip options too
+ * Alan Cox : Setsocketopt added
+ * Alan Cox : Fixed error return for broadcasts
+ * Alan Cox : Removed wake_up calls
+ * Alan Cox : Use ttl/tos
+ * Alan Cox : Cleaned up old debugging
+ * Alan Cox : Use new kernel side addresses
+ * Arnt Gulbrandsen : Fixed MSG_DONTROUTE in raw sockets.
+ * Alan Cox : BSD style RAW socket demultiplexing.
+ * Alan Cox : Beginnings of mrouted support.
+ * Alan Cox : Added IP_HDRINCL option.
+ * Alan Cox : Skip broadcast check if BSDism set.
+ * David S. Miller : New socket lookup architecture.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/types.h>
+#include <linux/atomic.h>
+#include <asm/byteorder.h>
+#include <asm/current.h>
+#include <asm/uaccess.h>
+#include <asm/ioctls.h>
+#include <linux/stddef.h>
+#include <linux/slab.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/export.h>
+#include <linux/spinlock.h>
+#include <linux/sockios.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/mroute.h>
+#include <linux/netdevice.h>
+#include <linux/in_route.h>
+#include <linux/route.h>
+#include <linux/skbuff.h>
+#include <linux/igmp.h>
+#include <net/net_namespace.h>
+#include <net/dst.h>
+#include <net/sock.h>
+#include <linux/ip.h>
+#include <linux/net.h>
+#include <net/ip.h>
+#include <net/icmp.h>
+#include <net/udp.h>
+#include <net/raw.h>
+#include <net/snmp.h>
+#include <net/tcp_states.h>
+#include <net/inet_common.h>
+#include <net/checksum.h>
+#include <net/xfrm.h>
+#include <linux/rtnetlink.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/compat.h>
+#include <linux/uio.h>
+
+struct raw_frag_vec {
+ struct msghdr *msg;
+ union {
+ struct icmphdr icmph;
+ char c[1];
+ } hdr;
+ int hlen;
+};
+
+static struct raw_hashinfo raw_v4_hashinfo = {
+ .lock = __RW_LOCK_UNLOCKED(raw_v4_hashinfo.lock),
+};
+
+void raw_hash_sk(struct sock *sk)
+{
+ struct raw_hashinfo *h = sk->sk_prot->h.raw_hash;
+ struct hlist_head *head;
+
+ head = &h->ht[inet_sk(sk)->inet_num & (RAW_HTABLE_SIZE - 1)];
+
+ write_lock_bh(&h->lock);
+ sk_add_node(sk, head);
+ sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
+ write_unlock_bh(&h->lock);
+}
+EXPORT_SYMBOL_GPL(raw_hash_sk);
+
+void raw_unhash_sk(struct sock *sk)
+{
+ struct raw_hashinfo *h = sk->sk_prot->h.raw_hash;
+
+ write_lock_bh(&h->lock);
+ if (sk_del_node_init(sk))
+ sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
+ write_unlock_bh(&h->lock);
+}
+EXPORT_SYMBOL_GPL(raw_unhash_sk);
+
+static struct sock *__raw_v4_lookup(struct net *net, struct sock *sk,
+ unsigned short num, __be32 raddr, __be32 laddr, int dif)
+{
+ sk_for_each_from(sk) {
+ struct inet_sock *inet = inet_sk(sk);
+
+ if (net_eq(sock_net(sk), net) && inet->inet_num == num &&
+ !(inet->inet_daddr && inet->inet_daddr != raddr) &&
+ !(inet->inet_rcv_saddr && inet->inet_rcv_saddr != laddr) &&
+ !(sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif))
+ goto found; /* gotcha */
+ }
+ sk = NULL;
+found:
+ return sk;
+}
+
+/*
+ * 0 - deliver
+ * 1 - block
+ */
+static int icmp_filter(const struct sock *sk, const struct sk_buff *skb)
+{
+ struct icmphdr _hdr;
+ const struct icmphdr *hdr;
+
+ hdr = skb_header_pointer(skb, skb_transport_offset(skb),
+ sizeof(_hdr), &_hdr);
+ if (!hdr)
+ return 1;
+
+ if (hdr->type < 32) {
+ __u32 data = raw_sk(sk)->filter.data;
+
+ return ((1U << hdr->type) & data) != 0;
+ }
+
+ /* Do not block unknown ICMP types */
+ return 0;
+}
+
+/* IP input processing comes here for RAW socket delivery.
+ * Caller owns SKB, so we must make clones.
+ *
+ * RFC 1122: SHOULD pass TOS value up to the transport layer.
+ * -> It does. And not only TOS, but all IP header.
+ */
+static int raw_v4_input(struct sk_buff *skb, const struct iphdr *iph, int hash)
+{
+ struct sock *sk;
+ struct hlist_head *head;
+ int delivered = 0;
+ struct net *net;
+
+ read_lock(&raw_v4_hashinfo.lock);
+ head = &raw_v4_hashinfo.ht[hash];
+ if (hlist_empty(head))
+ goto out;
+
+ net = dev_net(skb->dev);
+ sk = __raw_v4_lookup(net, __sk_head(head), iph->protocol,
+ iph->saddr, iph->daddr,
+ skb->dev->ifindex);
+
+ while (sk) {
+ delivered = 1;
+ if ((iph->protocol != IPPROTO_ICMP || !icmp_filter(sk, skb)) &&
+ ip_mc_sf_allow(sk, iph->daddr, iph->saddr,
+ skb->dev->ifindex)) {
+ struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC);
+
+ /* Not releasing hash table! */
+ if (clone)
+ raw_rcv(sk, clone);
+ }
+ sk = __raw_v4_lookup(net, sk_next(sk), iph->protocol,
+ iph->saddr, iph->daddr,
+ skb->dev->ifindex);
+ }
+out:
+ read_unlock(&raw_v4_hashinfo.lock);
+ return delivered;
+}
+
+int raw_local_deliver(struct sk_buff *skb, int protocol)
+{
+ int hash;
+ struct sock *raw_sk;
+
+ hash = protocol & (RAW_HTABLE_SIZE - 1);
+ raw_sk = sk_head(&raw_v4_hashinfo.ht[hash]);
+
+ /* If there maybe a raw socket we must check - if not we
+ * don't care less
+ */
+ if (raw_sk && !raw_v4_input(skb, ip_hdr(skb), hash))
+ raw_sk = NULL;
+
+ return raw_sk != NULL;
+
+}
+
+static void raw_err(struct sock *sk, struct sk_buff *skb, u32 info)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ const int type = icmp_hdr(skb)->type;
+ const int code = icmp_hdr(skb)->code;
+ int err = 0;
+ int harderr = 0;
+
+ if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
+ ipv4_sk_update_pmtu(skb, sk, info);
+ else if (type == ICMP_REDIRECT) {
+ ipv4_sk_redirect(skb, sk);
+ return;
+ }
+
+ /* Report error on raw socket, if:
+ 1. User requested ip_recverr.
+ 2. Socket is connected (otherwise the error indication
+ is useless without ip_recverr and error is hard.
+ */
+ if (!inet->recverr && sk->sk_state != TCP_ESTABLISHED)
+ return;
+
+ switch (type) {
+ default:
+ case ICMP_TIME_EXCEEDED:
+ err = EHOSTUNREACH;
+ break;
+ case ICMP_SOURCE_QUENCH:
+ return;
+ case ICMP_PARAMETERPROB:
+ err = EPROTO;
+ harderr = 1;
+ break;
+ case ICMP_DEST_UNREACH:
+ err = EHOSTUNREACH;
+ if (code > NR_ICMP_UNREACH)
+ break;
+ err = icmp_err_convert[code].errno;
+ harderr = icmp_err_convert[code].fatal;
+ if (code == ICMP_FRAG_NEEDED) {
+ harderr = inet->pmtudisc != IP_PMTUDISC_DONT;
+ err = EMSGSIZE;
+ }
+ }
+
+ if (inet->recverr) {
+ const struct iphdr *iph = (const struct iphdr *)skb->data;
+ u8 *payload = skb->data + (iph->ihl << 2);
+
+ if (inet->hdrincl)
+ payload = skb->data;
+ ip_icmp_error(sk, skb, err, 0, info, payload);
+ }
+
+ if (inet->recverr || harderr) {
+ sk->sk_err = err;
+ sk->sk_error_report(sk);
+ }
+}
+
+void raw_icmp_error(struct sk_buff *skb, int protocol, u32 info)
+{
+ int hash;
+ struct sock *raw_sk;
+ const struct iphdr *iph;
+ struct net *net;
+
+ hash = protocol & (RAW_HTABLE_SIZE - 1);
+
+ read_lock(&raw_v4_hashinfo.lock);
+ raw_sk = sk_head(&raw_v4_hashinfo.ht[hash]);
+ if (raw_sk) {
+ iph = (const struct iphdr *)skb->data;
+ net = dev_net(skb->dev);
+
+ while ((raw_sk = __raw_v4_lookup(net, raw_sk, protocol,
+ iph->daddr, iph->saddr,
+ skb->dev->ifindex)) != NULL) {
+ raw_err(raw_sk, skb, info);
+ raw_sk = sk_next(raw_sk);
+ iph = (const struct iphdr *)skb->data;
+ }
+ }
+ read_unlock(&raw_v4_hashinfo.lock);
+}
+
+static int raw_rcv_skb(struct sock *sk, struct sk_buff *skb)
+{
+ /* Charge it to the socket. */
+
+ ipv4_pktinfo_prepare(sk, skb);
+ if (sock_queue_rcv_skb(sk, skb) < 0) {
+ kfree_skb(skb);
+ return NET_RX_DROP;
+ }
+
+ return NET_RX_SUCCESS;
+}
+
+int raw_rcv(struct sock *sk, struct sk_buff *skb)
+{
+ if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
+ atomic_inc(&sk->sk_drops);
+ kfree_skb(skb);
+ return NET_RX_DROP;
+ }
+ nf_reset(skb);
+
+ skb_push(skb, skb->data - skb_network_header(skb));
+
+ raw_rcv_skb(sk, skb);
+ return 0;
+}
+
+static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4,
+ struct msghdr *msg, size_t length,
+ struct rtable **rtp,
+ unsigned int flags)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ struct net *net = sock_net(sk);
+ struct iphdr *iph;
+ struct sk_buff *skb;
+ unsigned int iphlen;
+ int err;
+ struct rtable *rt = *rtp;
+ int hlen, tlen;
+
+ if (length > rt->dst.dev->mtu) {
+ ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport,
+ rt->dst.dev->mtu);
+ return -EMSGSIZE;
+ }
+ if (flags&MSG_PROBE)
+ goto out;
+
+ hlen = LL_RESERVED_SPACE(rt->dst.dev);
+ tlen = rt->dst.dev->needed_tailroom;
+ skb = sock_alloc_send_skb(sk,
+ length + hlen + tlen + 15,
+ flags & MSG_DONTWAIT, &err);
+ if (!skb)
+ goto error;
+ skb_reserve(skb, hlen);
+
+ skb->priority = sk->sk_priority;
+ skb->mark = sk->sk_mark;
+ skb_dst_set(skb, &rt->dst);
+ *rtp = NULL;
+
+ skb_reset_network_header(skb);
+ iph = ip_hdr(skb);
+ skb_put(skb, length);
+
+ skb->ip_summed = CHECKSUM_NONE;
+
+ sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
+
+ skb->transport_header = skb->network_header;
+ err = -EFAULT;
+ if (memcpy_from_msg(iph, msg, length))
+ goto error_free;
+
+ iphlen = iph->ihl * 4;
+
+ /*
+ * We don't want to modify the ip header, but we do need to
+ * be sure that it won't cause problems later along the network
+ * stack. Specifically we want to make sure that iph->ihl is a
+ * sane value. If ihl points beyond the length of the buffer passed
+ * in, reject the frame as invalid
+ */
+ err = -EINVAL;
+ if (iphlen > length)
+ goto error_free;
+
+ if (iphlen >= sizeof(*iph)) {
+ if (!iph->saddr)
+ iph->saddr = fl4->saddr;
+ iph->check = 0;
+ iph->tot_len = htons(length);
+ if (!iph->id)
+ ip_select_ident(net, skb, NULL);
+
+ iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
+ }
+ if (iph->protocol == IPPROTO_ICMP)
+ icmp_out_count(net, ((struct icmphdr *)
+ skb_transport_header(skb))->type);
+
+ err = NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_OUT, sk, skb,
+ NULL, rt->dst.dev, dst_output_sk);
+ if (err > 0)
+ err = net_xmit_errno(err);
+ if (err)
+ goto error;
+out:
+ return 0;
+
+error_free:
+ kfree_skb(skb);
+error:
+ IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS);
+ if (err == -ENOBUFS && !inet->recverr)
+ err = 0;
+ return err;
+}
+
+static int raw_probe_proto_opt(struct raw_frag_vec *rfv, struct flowi4 *fl4)
+{
+ int err;
+
+ if (fl4->flowi4_proto != IPPROTO_ICMP)
+ return 0;
+
+ /* We only need the first two bytes. */
+ rfv->hlen = 2;
+
+ err = memcpy_from_msg(rfv->hdr.c, rfv->msg, rfv->hlen);
+ if (err)
+ return err;
+
+ fl4->fl4_icmp_type = rfv->hdr.icmph.type;
+ fl4->fl4_icmp_code = rfv->hdr.icmph.code;
+
+ return 0;
+}
+
+static int raw_getfrag(void *from, char *to, int offset, int len, int odd,
+ struct sk_buff *skb)
+{
+ struct raw_frag_vec *rfv = from;
+
+ if (offset < rfv->hlen) {
+ int copy = min(rfv->hlen - offset, len);
+
+ if (skb->ip_summed == CHECKSUM_PARTIAL)
+ memcpy(to, rfv->hdr.c + offset, copy);
+ else
+ skb->csum = csum_block_add(
+ skb->csum,
+ csum_partial_copy_nocheck(rfv->hdr.c + offset,
+ to, copy, 0),
+ odd);
+
+ odd = 0;
+ offset += copy;
+ to += copy;
+ len -= copy;
+
+ if (!len)
+ return 0;
+ }
+
+ offset -= rfv->hlen;
+
+ return ip_generic_getfrag(rfv->msg, to, offset, len, odd, skb);
+}
+
+static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ struct ipcm_cookie ipc;
+ struct rtable *rt = NULL;
+ struct flowi4 fl4;
+ int free = 0;
+ __be32 daddr;
+ __be32 saddr;
+ u8 tos;
+ int err;
+ struct ip_options_data opt_copy;
+ struct raw_frag_vec rfv;
+
+ err = -EMSGSIZE;
+ if (len > 0xFFFF)
+ goto out;
+
+ /*
+ * Check the flags.
+ */
+
+ err = -EOPNOTSUPP;
+ if (msg->msg_flags & MSG_OOB) /* Mirror BSD error message */
+ goto out; /* compatibility */
+
+ /*
+ * Get and verify the address.
+ */
+
+ if (msg->msg_namelen) {
+ DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name);
+ err = -EINVAL;
+ if (msg->msg_namelen < sizeof(*usin))
+ goto out;
+ if (usin->sin_family != AF_INET) {
+ pr_info_once("%s: %s forgot to set AF_INET. Fix it!\n",
+ __func__, current->comm);
+ err = -EAFNOSUPPORT;
+ if (usin->sin_family)
+ goto out;
+ }
+ daddr = usin->sin_addr.s_addr;
+ /* ANK: I did not forget to get protocol from port field.
+ * I just do not know, who uses this weirdness.
+ * IP_HDRINCL is much more convenient.
+ */
+ } else {
+ err = -EDESTADDRREQ;
+ if (sk->sk_state != TCP_ESTABLISHED)
+ goto out;
+ daddr = inet->inet_daddr;
+ }
+
+ ipc.addr = inet->inet_saddr;
+ ipc.opt = NULL;
+ ipc.tx_flags = 0;
+ ipc.ttl = 0;
+ ipc.tos = -1;
+ ipc.oif = sk->sk_bound_dev_if;
+
+ if (msg->msg_controllen) {
+ err = ip_cmsg_send(sock_net(sk), msg, &ipc, false);
+ if (err)
+ goto out;
+ if (ipc.opt)
+ free = 1;
+ }
+
+ saddr = ipc.addr;
+ ipc.addr = daddr;
+
+ if (!ipc.opt) {
+ struct ip_options_rcu *inet_opt;
+
+ rcu_read_lock();
+ inet_opt = rcu_dereference(inet->inet_opt);
+ if (inet_opt) {
+ memcpy(&opt_copy, inet_opt,
+ sizeof(*inet_opt) + inet_opt->opt.optlen);
+ ipc.opt = &opt_copy.opt;
+ }
+ rcu_read_unlock();
+ }
+
+ if (ipc.opt) {
+ err = -EINVAL;
+ /* Linux does not mangle headers on raw sockets,
+ * so that IP options + IP_HDRINCL is non-sense.
+ */
+ if (inet->hdrincl)
+ goto done;
+ if (ipc.opt->opt.srr) {
+ if (!daddr)
+ goto done;
+ daddr = ipc.opt->opt.faddr;
+ }
+ }
+ tos = get_rtconn_flags(&ipc, sk);
+ if (msg->msg_flags & MSG_DONTROUTE)
+ tos |= RTO_ONLINK;
+
+ if (ipv4_is_multicast(daddr)) {
+ if (!ipc.oif)
+ ipc.oif = inet->mc_index;
+ if (!saddr)
+ saddr = inet->mc_addr;
+ } else if (!ipc.oif)
+ ipc.oif = inet->uc_index;
+
+ flowi4_init_output(&fl4, ipc.oif, sk->sk_mark, tos,
+ RT_SCOPE_UNIVERSE,
+ inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
+ inet_sk_flowi_flags(sk) |
+ (inet->hdrincl ? FLOWI_FLAG_KNOWN_NH : 0),
+ daddr, saddr, 0, 0);
+
+ if (!inet->hdrincl) {
+ rfv.msg = msg;
+ rfv.hlen = 0;
+
+ err = raw_probe_proto_opt(&rfv, &fl4);
+ if (err)
+ goto done;
+ }
+
+ security_sk_classify_flow(sk, flowi4_to_flowi(&fl4));
+ rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
+ if (IS_ERR(rt)) {
+ err = PTR_ERR(rt);
+ rt = NULL;
+ goto done;
+ }
+
+ err = -EACCES;
+ if (rt->rt_flags & RTCF_BROADCAST && !sock_flag(sk, SOCK_BROADCAST))
+ goto done;
+
+ if (msg->msg_flags & MSG_CONFIRM)
+ goto do_confirm;
+back_from_confirm:
+
+ if (inet->hdrincl)
+ err = raw_send_hdrinc(sk, &fl4, msg, len,
+ &rt, msg->msg_flags);
+
+ else {
+ sock_tx_timestamp(sk, &ipc.tx_flags);
+
+ if (!ipc.addr)
+ ipc.addr = fl4.daddr;
+ lock_sock(sk);
+ err = ip_append_data(sk, &fl4, raw_getfrag,
+ &rfv, len, 0,
+ &ipc, &rt, msg->msg_flags);
+ if (err)
+ ip_flush_pending_frames(sk);
+ else if (!(msg->msg_flags & MSG_MORE)) {
+ err = ip_push_pending_frames(sk, &fl4);
+ if (err == -ENOBUFS && !inet->recverr)
+ err = 0;
+ }
+ release_sock(sk);
+ }
+done:
+ if (free)
+ kfree(ipc.opt);
+ ip_rt_put(rt);
+
+out:
+ if (err < 0)
+ return err;
+ return len;
+
+do_confirm:
+ dst_confirm(&rt->dst);
+ if (!(msg->msg_flags & MSG_PROBE) || len)
+ goto back_from_confirm;
+ err = 0;
+ goto done;
+}
+
+static void raw_close(struct sock *sk, long timeout)
+{
+ /*
+ * Raw sockets may have direct kernel references. Kill them.
+ */
+ ip_ra_control(sk, 0, NULL);
+
+ sk_common_release(sk);
+}
+
+static void raw_destroy(struct sock *sk)
+{
+ lock_sock(sk);
+ ip_flush_pending_frames(sk);
+ release_sock(sk);
+}
+
+/* This gets rid of all the nasties in af_inet. -DaveM */
+static int raw_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ struct sockaddr_in *addr = (struct sockaddr_in *) uaddr;
+ int ret = -EINVAL;
+ int chk_addr_ret;
+
+ if (sk->sk_state != TCP_CLOSE || addr_len < sizeof(struct sockaddr_in))
+ goto out;
+ chk_addr_ret = inet_addr_type(sock_net(sk), addr->sin_addr.s_addr);
+ ret = -EADDRNOTAVAIL;
+ if (addr->sin_addr.s_addr && chk_addr_ret != RTN_LOCAL &&
+ chk_addr_ret != RTN_MULTICAST && chk_addr_ret != RTN_BROADCAST)
+ goto out;
+ inet->inet_rcv_saddr = inet->inet_saddr = addr->sin_addr.s_addr;
+ if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST)
+ inet->inet_saddr = 0; /* Use device */
+ sk_dst_reset(sk);
+ ret = 0;
+out: return ret;
+}
+
+/*
+ * This should be easy, if there is something there
+ * we return it, otherwise we block.
+ */
+
+static int raw_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
+ int noblock, int flags, int *addr_len)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ size_t copied = 0;
+ int err = -EOPNOTSUPP;
+ DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name);
+ struct sk_buff *skb;
+
+ if (flags & MSG_OOB)
+ goto out;
+
+ if (flags & MSG_ERRQUEUE) {
+ err = ip_recv_error(sk, msg, len, addr_len);
+ goto out;
+ }
+
+ skb = skb_recv_datagram(sk, flags, noblock, &err);
+ if (!skb)
+ goto out;
+
+ copied = skb->len;
+ if (len < copied) {
+ msg->msg_flags |= MSG_TRUNC;
+ copied = len;
+ }
+
+ err = skb_copy_datagram_msg(skb, 0, msg, copied);
+ if (err)
+ goto done;
+
+ sock_recv_ts_and_drops(msg, sk, skb);
+
+ /* Copy the address. */
+ if (sin) {
+ sin->sin_family = AF_INET;
+ sin->sin_addr.s_addr = ip_hdr(skb)->saddr;
+ sin->sin_port = 0;
+ memset(&sin->sin_zero, 0, sizeof(sin->sin_zero));
+ *addr_len = sizeof(*sin);
+ }
+ if (inet->cmsg_flags)
+ ip_cmsg_recv(msg, skb);
+ if (flags & MSG_TRUNC)
+ copied = skb->len;
+done:
+ skb_free_datagram(sk, skb);
+out:
+ if (err)
+ return err;
+ return copied;
+}
+
+static int raw_init(struct sock *sk)
+{
+ struct raw_sock *rp = raw_sk(sk);
+
+ if (inet_sk(sk)->inet_num == IPPROTO_ICMP)
+ memset(&rp->filter, 0, sizeof(rp->filter));
+ return 0;
+}
+
+static int raw_seticmpfilter(struct sock *sk, char __user *optval, int optlen)
+{
+ if (optlen > sizeof(struct icmp_filter))
+ optlen = sizeof(struct icmp_filter);
+ if (copy_from_user(&raw_sk(sk)->filter, optval, optlen))
+ return -EFAULT;
+ return 0;
+}
+
+static int raw_geticmpfilter(struct sock *sk, char __user *optval, int __user *optlen)
+{
+ int len, ret = -EFAULT;
+
+ if (get_user(len, optlen))
+ goto out;
+ ret = -EINVAL;
+ if (len < 0)
+ goto out;
+ if (len > sizeof(struct icmp_filter))
+ len = sizeof(struct icmp_filter);
+ ret = -EFAULT;
+ if (put_user(len, optlen) ||
+ copy_to_user(optval, &raw_sk(sk)->filter, len))
+ goto out;
+ ret = 0;
+out: return ret;
+}
+
+static int do_raw_setsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, unsigned int optlen)
+{
+ if (optname == ICMP_FILTER) {
+ if (inet_sk(sk)->inet_num != IPPROTO_ICMP)
+ return -EOPNOTSUPP;
+ else
+ return raw_seticmpfilter(sk, optval, optlen);
+ }
+ return -ENOPROTOOPT;
+}
+
+static int raw_setsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, unsigned int optlen)
+{
+ if (level != SOL_RAW)
+ return ip_setsockopt(sk, level, optname, optval, optlen);
+ return do_raw_setsockopt(sk, level, optname, optval, optlen);
+}
+
+#ifdef CONFIG_COMPAT
+static int compat_raw_setsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, unsigned int optlen)
+{
+ if (level != SOL_RAW)
+ return compat_ip_setsockopt(sk, level, optname, optval, optlen);
+ return do_raw_setsockopt(sk, level, optname, optval, optlen);
+}
+#endif
+
+static int do_raw_getsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, int __user *optlen)
+{
+ if (optname == ICMP_FILTER) {
+ if (inet_sk(sk)->inet_num != IPPROTO_ICMP)
+ return -EOPNOTSUPP;
+ else
+ return raw_geticmpfilter(sk, optval, optlen);
+ }
+ return -ENOPROTOOPT;
+}
+
+static int raw_getsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, int __user *optlen)
+{
+ if (level != SOL_RAW)
+ return ip_getsockopt(sk, level, optname, optval, optlen);
+ return do_raw_getsockopt(sk, level, optname, optval, optlen);
+}
+
+#ifdef CONFIG_COMPAT
+static int compat_raw_getsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, int __user *optlen)
+{
+ if (level != SOL_RAW)
+ return compat_ip_getsockopt(sk, level, optname, optval, optlen);
+ return do_raw_getsockopt(sk, level, optname, optval, optlen);
+}
+#endif
+
+static int raw_ioctl(struct sock *sk, int cmd, unsigned long arg)
+{
+ switch (cmd) {
+ case SIOCOUTQ: {
+ int amount = sk_wmem_alloc_get(sk);
+
+ return put_user(amount, (int __user *)arg);
+ }
+ case SIOCINQ: {
+ struct sk_buff *skb;
+ int amount = 0;
+
+ spin_lock_bh(&sk->sk_receive_queue.lock);
+ skb = skb_peek(&sk->sk_receive_queue);
+ if (skb)
+ amount = skb->len;
+ spin_unlock_bh(&sk->sk_receive_queue.lock);
+ return put_user(amount, (int __user *)arg);
+ }
+
+ default:
+#ifdef CONFIG_IP_MROUTE
+ return ipmr_ioctl(sk, cmd, (void __user *)arg);
+#else
+ return -ENOIOCTLCMD;
+#endif
+ }
+}
+
+#ifdef CONFIG_COMPAT
+static int compat_raw_ioctl(struct sock *sk, unsigned int cmd, unsigned long arg)
+{
+ switch (cmd) {
+ case SIOCOUTQ:
+ case SIOCINQ:
+ return -ENOIOCTLCMD;
+ default:
+#ifdef CONFIG_IP_MROUTE
+ return ipmr_compat_ioctl(sk, cmd, compat_ptr(arg));
+#else
+ return -ENOIOCTLCMD;
+#endif
+ }
+}
+#endif
+
+struct proto raw_prot = {
+ .name = "RAW",
+ .owner = THIS_MODULE,
+ .close = raw_close,
+ .destroy = raw_destroy,
+ .connect = ip4_datagram_connect,
+ .disconnect = udp_disconnect,
+ .ioctl = raw_ioctl,
+ .init = raw_init,
+ .setsockopt = raw_setsockopt,
+ .getsockopt = raw_getsockopt,
+ .sendmsg = raw_sendmsg,
+ .recvmsg = raw_recvmsg,
+ .bind = raw_bind,
+ .backlog_rcv = raw_rcv_skb,
+ .release_cb = ip4_datagram_release_cb,
+ .hash = raw_hash_sk,
+ .unhash = raw_unhash_sk,
+ .obj_size = sizeof(struct raw_sock),
+ .h.raw_hash = &raw_v4_hashinfo,
+#ifdef CONFIG_COMPAT
+ .compat_setsockopt = compat_raw_setsockopt,
+ .compat_getsockopt = compat_raw_getsockopt,
+ .compat_ioctl = compat_raw_ioctl,
+#endif
+};
+
+#ifdef CONFIG_PROC_FS
+static struct sock *raw_get_first(struct seq_file *seq)
+{
+ struct sock *sk;
+ struct raw_iter_state *state = raw_seq_private(seq);
+
+ for (state->bucket = 0; state->bucket < RAW_HTABLE_SIZE;
+ ++state->bucket) {
+ sk_for_each(sk, &state->h->ht[state->bucket])
+ if (sock_net(sk) == seq_file_net(seq))
+ goto found;
+ }
+ sk = NULL;
+found:
+ return sk;
+}
+
+static struct sock *raw_get_next(struct seq_file *seq, struct sock *sk)
+{
+ struct raw_iter_state *state = raw_seq_private(seq);
+
+ do {
+ sk = sk_next(sk);
+try_again:
+ ;
+ } while (sk && sock_net(sk) != seq_file_net(seq));
+
+ if (!sk && ++state->bucket < RAW_HTABLE_SIZE) {
+ sk = sk_head(&state->h->ht[state->bucket]);
+ goto try_again;
+ }
+ return sk;
+}
+
+static struct sock *raw_get_idx(struct seq_file *seq, loff_t pos)
+{
+ struct sock *sk = raw_get_first(seq);
+
+ if (sk)
+ while (pos && (sk = raw_get_next(seq, sk)) != NULL)
+ --pos;
+ return pos ? NULL : sk;
+}
+
+void *raw_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ struct raw_iter_state *state = raw_seq_private(seq);
+
+ read_lock(&state->h->lock);
+ return *pos ? raw_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
+}
+EXPORT_SYMBOL_GPL(raw_seq_start);
+
+void *raw_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct sock *sk;
+
+ if (v == SEQ_START_TOKEN)
+ sk = raw_get_first(seq);
+ else
+ sk = raw_get_next(seq, v);
+ ++*pos;
+ return sk;
+}
+EXPORT_SYMBOL_GPL(raw_seq_next);
+
+void raw_seq_stop(struct seq_file *seq, void *v)
+{
+ struct raw_iter_state *state = raw_seq_private(seq);
+
+ read_unlock(&state->h->lock);
+}
+EXPORT_SYMBOL_GPL(raw_seq_stop);
+
+static void raw_sock_seq_show(struct seq_file *seq, struct sock *sp, int i)
+{
+ struct inet_sock *inet = inet_sk(sp);
+ __be32 dest = inet->inet_daddr,
+ src = inet->inet_rcv_saddr;
+ __u16 destp = 0,
+ srcp = inet->inet_num;
+
+ seq_printf(seq, "%4d: %08X:%04X %08X:%04X"
+ " %02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %d\n",
+ i, src, srcp, dest, destp, sp->sk_state,
+ sk_wmem_alloc_get(sp),
+ sk_rmem_alloc_get(sp),
+ 0, 0L, 0,
+ from_kuid_munged(seq_user_ns(seq), sock_i_uid(sp)),
+ 0, sock_i_ino(sp),
+ atomic_read(&sp->sk_refcnt), sp, atomic_read(&sp->sk_drops));
+}
+
+static int raw_seq_show(struct seq_file *seq, void *v)
+{
+ if (v == SEQ_START_TOKEN)
+ seq_printf(seq, " sl local_address rem_address st tx_queue "
+ "rx_queue tr tm->when retrnsmt uid timeout "
+ "inode ref pointer drops\n");
+ else
+ raw_sock_seq_show(seq, v, raw_seq_private(seq)->bucket);
+ return 0;
+}
+
+static const struct seq_operations raw_seq_ops = {
+ .start = raw_seq_start,
+ .next = raw_seq_next,
+ .stop = raw_seq_stop,
+ .show = raw_seq_show,
+};
+
+int raw_seq_open(struct inode *ino, struct file *file,
+ struct raw_hashinfo *h, const struct seq_operations *ops)
+{
+ int err;
+ struct raw_iter_state *i;
+
+ err = seq_open_net(ino, file, ops, sizeof(struct raw_iter_state));
+ if (err < 0)
+ return err;
+
+ i = raw_seq_private((struct seq_file *)file->private_data);
+ i->h = h;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(raw_seq_open);
+
+static int raw_v4_seq_open(struct inode *inode, struct file *file)
+{
+ return raw_seq_open(inode, file, &raw_v4_hashinfo, &raw_seq_ops);
+}
+
+static const struct file_operations raw_seq_fops = {
+ .owner = THIS_MODULE,
+ .open = raw_v4_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_net,
+};
+
+static __net_init int raw_init_net(struct net *net)
+{
+ if (!proc_create("raw", S_IRUGO, net->proc_net, &raw_seq_fops))
+ return -ENOMEM;
+
+ return 0;
+}
+
+static __net_exit void raw_exit_net(struct net *net)
+{
+ remove_proc_entry("raw", net->proc_net);
+}
+
+static __net_initdata struct pernet_operations raw_net_ops = {
+ .init = raw_init_net,
+ .exit = raw_exit_net,
+};
+
+int __init raw_proc_init(void)
+{
+ return register_pernet_subsys(&raw_net_ops);
+}
+
+void __init raw_proc_exit(void)
+{
+ unregister_pernet_subsys(&raw_net_ops);
+}
+#endif /* CONFIG_PROC_FS */
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
new file mode 100644
index 000000000..f45f2a12f
--- /dev/null
+++ b/net/ipv4/route.c
@@ -0,0 +1,2800 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * ROUTE - implementation of the IP router.
+ *
+ * Authors: Ross Biro
+ * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ * Alan Cox, <gw4pts@gw4pts.ampr.org>
+ * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
+ * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ * Fixes:
+ * Alan Cox : Verify area fixes.
+ * Alan Cox : cli() protects routing changes
+ * Rui Oliveira : ICMP routing table updates
+ * (rco@di.uminho.pt) Routing table insertion and update
+ * Linus Torvalds : Rewrote bits to be sensible
+ * Alan Cox : Added BSD route gw semantics
+ * Alan Cox : Super /proc >4K
+ * Alan Cox : MTU in route table
+ * Alan Cox : MSS actually. Also added the window
+ * clamper.
+ * Sam Lantinga : Fixed route matching in rt_del()
+ * Alan Cox : Routing cache support.
+ * Alan Cox : Removed compatibility cruft.
+ * Alan Cox : RTF_REJECT support.
+ * Alan Cox : TCP irtt support.
+ * Jonathan Naylor : Added Metric support.
+ * Miquel van Smoorenburg : BSD API fixes.
+ * Miquel van Smoorenburg : Metrics.
+ * Alan Cox : Use __u32 properly
+ * Alan Cox : Aligned routing errors more closely with BSD
+ * our system is still very different.
+ * Alan Cox : Faster /proc handling
+ * Alexey Kuznetsov : Massive rework to support tree based routing,
+ * routing caches and better behaviour.
+ *
+ * Olaf Erb : irtt wasn't being copied right.
+ * Bjorn Ekwall : Kerneld route support.
+ * Alan Cox : Multicast fixed (I hope)
+ * Pavel Krauz : Limited broadcast fixed
+ * Mike McLagan : Routing by source
+ * Alexey Kuznetsov : End of old history. Split to fib.c and
+ * route.c and rewritten from scratch.
+ * Andi Kleen : Load-limit warning messages.
+ * Vitaly E. Lavrov : Transparent proxy revived after year coma.
+ * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
+ * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
+ * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
+ * Marc Boucher : routing by fwmark
+ * Robert Olsson : Added rt_cache statistics
+ * Arnaldo C. Melo : Convert proc stuff to seq_file
+ * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
+ * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
+ * Ilia Sotnikov : Removed TOS from hash calculations
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define pr_fmt(fmt) "IPv4: " fmt
+
+#include <linux/module.h>
+#include <asm/uaccess.h>
+#include <linux/bitops.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/errno.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/proc_fs.h>
+#include <linux/init.h>
+#include <linux/skbuff.h>
+#include <linux/inetdevice.h>
+#include <linux/igmp.h>
+#include <linux/pkt_sched.h>
+#include <linux/mroute.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/random.h>
+#include <linux/rcupdate.h>
+#include <linux/times.h>
+#include <linux/slab.h>
+#include <linux/jhash.h>
+#include <net/dst.h>
+#include <net/net_namespace.h>
+#include <net/protocol.h>
+#include <net/ip.h>
+#include <net/route.h>
+#include <net/inetpeer.h>
+#include <net/sock.h>
+#include <net/ip_fib.h>
+#include <net/arp.h>
+#include <net/tcp.h>
+#include <net/icmp.h>
+#include <net/xfrm.h>
+#include <net/netevent.h>
+#include <net/rtnetlink.h>
+#ifdef CONFIG_SYSCTL
+#include <linux/sysctl.h>
+#include <linux/kmemleak.h>
+#endif
+#include <net/secure_seq.h>
+
+#define RT_FL_TOS(oldflp4) \
+ ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
+
+#define RT_GC_TIMEOUT (300*HZ)
+
+static int ip_rt_max_size;
+static int ip_rt_redirect_number __read_mostly = 9;
+static int ip_rt_redirect_load __read_mostly = HZ / 50;
+static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
+static int ip_rt_error_cost __read_mostly = HZ;
+static int ip_rt_error_burst __read_mostly = 5 * HZ;
+static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
+static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
+static int ip_rt_min_advmss __read_mostly = 256;
+
+/*
+ * Interface to generic destination cache.
+ */
+
+static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
+static unsigned int ipv4_default_advmss(const struct dst_entry *dst);
+static unsigned int ipv4_mtu(const struct dst_entry *dst);
+static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
+static void ipv4_link_failure(struct sk_buff *skb);
+static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
+ struct sk_buff *skb, u32 mtu);
+static void ip_do_redirect(struct dst_entry *dst, struct sock *sk,
+ struct sk_buff *skb);
+static void ipv4_dst_destroy(struct dst_entry *dst);
+
+static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
+{
+ WARN_ON(1);
+ return NULL;
+}
+
+static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
+ struct sk_buff *skb,
+ const void *daddr);
+
+static struct dst_ops ipv4_dst_ops = {
+ .family = AF_INET,
+ .check = ipv4_dst_check,
+ .default_advmss = ipv4_default_advmss,
+ .mtu = ipv4_mtu,
+ .cow_metrics = ipv4_cow_metrics,
+ .destroy = ipv4_dst_destroy,
+ .negative_advice = ipv4_negative_advice,
+ .link_failure = ipv4_link_failure,
+ .update_pmtu = ip_rt_update_pmtu,
+ .redirect = ip_do_redirect,
+ .local_out = __ip_local_out,
+ .neigh_lookup = ipv4_neigh_lookup,
+};
+
+#define ECN_OR_COST(class) TC_PRIO_##class
+
+const __u8 ip_tos2prio[16] = {
+ TC_PRIO_BESTEFFORT,
+ ECN_OR_COST(BESTEFFORT),
+ TC_PRIO_BESTEFFORT,
+ ECN_OR_COST(BESTEFFORT),
+ TC_PRIO_BULK,
+ ECN_OR_COST(BULK),
+ TC_PRIO_BULK,
+ ECN_OR_COST(BULK),
+ TC_PRIO_INTERACTIVE,
+ ECN_OR_COST(INTERACTIVE),
+ TC_PRIO_INTERACTIVE,
+ ECN_OR_COST(INTERACTIVE),
+ TC_PRIO_INTERACTIVE_BULK,
+ ECN_OR_COST(INTERACTIVE_BULK),
+ TC_PRIO_INTERACTIVE_BULK,
+ ECN_OR_COST(INTERACTIVE_BULK)
+};
+EXPORT_SYMBOL(ip_tos2prio);
+
+static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
+#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
+
+#ifdef CONFIG_PROC_FS
+static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ if (*pos)
+ return NULL;
+ return SEQ_START_TOKEN;
+}
+
+static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ ++*pos;
+ return NULL;
+}
+
+static void rt_cache_seq_stop(struct seq_file *seq, void *v)
+{
+}
+
+static int rt_cache_seq_show(struct seq_file *seq, void *v)
+{
+ if (v == SEQ_START_TOKEN)
+ seq_printf(seq, "%-127s\n",
+ "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
+ "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
+ "HHUptod\tSpecDst");
+ return 0;
+}
+
+static const struct seq_operations rt_cache_seq_ops = {
+ .start = rt_cache_seq_start,
+ .next = rt_cache_seq_next,
+ .stop = rt_cache_seq_stop,
+ .show = rt_cache_seq_show,
+};
+
+static int rt_cache_seq_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &rt_cache_seq_ops);
+}
+
+static const struct file_operations rt_cache_seq_fops = {
+ .owner = THIS_MODULE,
+ .open = rt_cache_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+
+static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ int cpu;
+
+ if (*pos == 0)
+ return SEQ_START_TOKEN;
+
+ for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
+ if (!cpu_possible(cpu))
+ continue;
+ *pos = cpu+1;
+ return &per_cpu(rt_cache_stat, cpu);
+ }
+ return NULL;
+}
+
+static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ int cpu;
+
+ for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
+ if (!cpu_possible(cpu))
+ continue;
+ *pos = cpu+1;
+ return &per_cpu(rt_cache_stat, cpu);
+ }
+ return NULL;
+
+}
+
+static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
+{
+
+}
+
+static int rt_cpu_seq_show(struct seq_file *seq, void *v)
+{
+ struct rt_cache_stat *st = v;
+
+ if (v == SEQ_START_TOKEN) {
+ seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
+ return 0;
+ }
+
+ seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
+ " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
+ dst_entries_get_slow(&ipv4_dst_ops),
+ 0, /* st->in_hit */
+ st->in_slow_tot,
+ st->in_slow_mc,
+ st->in_no_route,
+ st->in_brd,
+ st->in_martian_dst,
+ st->in_martian_src,
+
+ 0, /* st->out_hit */
+ st->out_slow_tot,
+ st->out_slow_mc,
+
+ 0, /* st->gc_total */
+ 0, /* st->gc_ignored */
+ 0, /* st->gc_goal_miss */
+ 0, /* st->gc_dst_overflow */
+ 0, /* st->in_hlist_search */
+ 0 /* st->out_hlist_search */
+ );
+ return 0;
+}
+
+static const struct seq_operations rt_cpu_seq_ops = {
+ .start = rt_cpu_seq_start,
+ .next = rt_cpu_seq_next,
+ .stop = rt_cpu_seq_stop,
+ .show = rt_cpu_seq_show,
+};
+
+
+static int rt_cpu_seq_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &rt_cpu_seq_ops);
+}
+
+static const struct file_operations rt_cpu_seq_fops = {
+ .owner = THIS_MODULE,
+ .open = rt_cpu_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+#ifdef CONFIG_IP_ROUTE_CLASSID
+static int rt_acct_proc_show(struct seq_file *m, void *v)
+{
+ struct ip_rt_acct *dst, *src;
+ unsigned int i, j;
+
+ dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
+ if (!dst)
+ return -ENOMEM;
+
+ for_each_possible_cpu(i) {
+ src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
+ for (j = 0; j < 256; j++) {
+ dst[j].o_bytes += src[j].o_bytes;
+ dst[j].o_packets += src[j].o_packets;
+ dst[j].i_bytes += src[j].i_bytes;
+ dst[j].i_packets += src[j].i_packets;
+ }
+ }
+
+ seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
+ kfree(dst);
+ return 0;
+}
+
+static int rt_acct_proc_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, rt_acct_proc_show, NULL);
+}
+
+static const struct file_operations rt_acct_proc_fops = {
+ .owner = THIS_MODULE,
+ .open = rt_acct_proc_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+#endif
+
+static int __net_init ip_rt_do_proc_init(struct net *net)
+{
+ struct proc_dir_entry *pde;
+
+ pde = proc_create("rt_cache", S_IRUGO, net->proc_net,
+ &rt_cache_seq_fops);
+ if (!pde)
+ goto err1;
+
+ pde = proc_create("rt_cache", S_IRUGO,
+ net->proc_net_stat, &rt_cpu_seq_fops);
+ if (!pde)
+ goto err2;
+
+#ifdef CONFIG_IP_ROUTE_CLASSID
+ pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
+ if (!pde)
+ goto err3;
+#endif
+ return 0;
+
+#ifdef CONFIG_IP_ROUTE_CLASSID
+err3:
+ remove_proc_entry("rt_cache", net->proc_net_stat);
+#endif
+err2:
+ remove_proc_entry("rt_cache", net->proc_net);
+err1:
+ return -ENOMEM;
+}
+
+static void __net_exit ip_rt_do_proc_exit(struct net *net)
+{
+ remove_proc_entry("rt_cache", net->proc_net_stat);
+ remove_proc_entry("rt_cache", net->proc_net);
+#ifdef CONFIG_IP_ROUTE_CLASSID
+ remove_proc_entry("rt_acct", net->proc_net);
+#endif
+}
+
+static struct pernet_operations ip_rt_proc_ops __net_initdata = {
+ .init = ip_rt_do_proc_init,
+ .exit = ip_rt_do_proc_exit,
+};
+
+static int __init ip_rt_proc_init(void)
+{
+ return register_pernet_subsys(&ip_rt_proc_ops);
+}
+
+#else
+static inline int ip_rt_proc_init(void)
+{
+ return 0;
+}
+#endif /* CONFIG_PROC_FS */
+
+static inline bool rt_is_expired(const struct rtable *rth)
+{
+ return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
+}
+
+void rt_cache_flush(struct net *net)
+{
+ rt_genid_bump_ipv4(net);
+}
+
+static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
+ struct sk_buff *skb,
+ const void *daddr)
+{
+ struct net_device *dev = dst->dev;
+ const __be32 *pkey = daddr;
+ const struct rtable *rt;
+ struct neighbour *n;
+
+ rt = (const struct rtable *) dst;
+ if (rt->rt_gateway)
+ pkey = (const __be32 *) &rt->rt_gateway;
+ else if (skb)
+ pkey = &ip_hdr(skb)->daddr;
+
+ n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
+ if (n)
+ return n;
+ return neigh_create(&arp_tbl, pkey, dev);
+}
+
+#define IP_IDENTS_SZ 2048u
+struct ip_ident_bucket {
+ atomic_t id;
+ u32 stamp32;
+};
+
+static struct ip_ident_bucket *ip_idents __read_mostly;
+
+/* In order to protect privacy, we add a perturbation to identifiers
+ * if one generator is seldom used. This makes hard for an attacker
+ * to infer how many packets were sent between two points in time.
+ */
+u32 ip_idents_reserve(u32 hash, int segs)
+{
+ struct ip_ident_bucket *bucket = ip_idents + hash % IP_IDENTS_SZ;
+ u32 old = ACCESS_ONCE(bucket->stamp32);
+ u32 now = (u32)jiffies;
+ u32 delta = 0;
+
+ if (old != now && cmpxchg(&bucket->stamp32, old, now) == old)
+ delta = prandom_u32_max(now - old);
+
+ return atomic_add_return(segs + delta, &bucket->id) - segs;
+}
+EXPORT_SYMBOL(ip_idents_reserve);
+
+void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
+{
+ static u32 ip_idents_hashrnd __read_mostly;
+ u32 hash, id;
+
+ net_get_random_once(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd));
+
+ hash = jhash_3words((__force u32)iph->daddr,
+ (__force u32)iph->saddr,
+ iph->protocol ^ net_hash_mix(net),
+ ip_idents_hashrnd);
+ id = ip_idents_reserve(hash, segs);
+ iph->id = htons(id);
+}
+EXPORT_SYMBOL(__ip_select_ident);
+
+static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk,
+ const struct iphdr *iph,
+ int oif, u8 tos,
+ u8 prot, u32 mark, int flow_flags)
+{
+ if (sk) {
+ const struct inet_sock *inet = inet_sk(sk);
+
+ oif = sk->sk_bound_dev_if;
+ mark = sk->sk_mark;
+ tos = RT_CONN_FLAGS(sk);
+ prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
+ }
+ flowi4_init_output(fl4, oif, mark, tos,
+ RT_SCOPE_UNIVERSE, prot,
+ flow_flags,
+ iph->daddr, iph->saddr, 0, 0);
+}
+
+static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
+ const struct sock *sk)
+{
+ const struct iphdr *iph = ip_hdr(skb);
+ int oif = skb->dev->ifindex;
+ u8 tos = RT_TOS(iph->tos);
+ u8 prot = iph->protocol;
+ u32 mark = skb->mark;
+
+ __build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0);
+}
+
+static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
+{
+ const struct inet_sock *inet = inet_sk(sk);
+ const struct ip_options_rcu *inet_opt;
+ __be32 daddr = inet->inet_daddr;
+
+ rcu_read_lock();
+ inet_opt = rcu_dereference(inet->inet_opt);
+ if (inet_opt && inet_opt->opt.srr)
+ daddr = inet_opt->opt.faddr;
+ flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
+ RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
+ inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
+ inet_sk_flowi_flags(sk),
+ daddr, inet->inet_saddr, 0, 0);
+ rcu_read_unlock();
+}
+
+static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
+ const struct sk_buff *skb)
+{
+ if (skb)
+ build_skb_flow_key(fl4, skb, sk);
+ else
+ build_sk_flow_key(fl4, sk);
+}
+
+static inline void rt_free(struct rtable *rt)
+{
+ call_rcu(&rt->dst.rcu_head, dst_rcu_free);
+}
+
+static DEFINE_SPINLOCK(fnhe_lock);
+
+static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
+{
+ struct rtable *rt;
+
+ rt = rcu_dereference(fnhe->fnhe_rth_input);
+ if (rt) {
+ RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
+ rt_free(rt);
+ }
+ rt = rcu_dereference(fnhe->fnhe_rth_output);
+ if (rt) {
+ RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
+ rt_free(rt);
+ }
+}
+
+static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
+{
+ struct fib_nh_exception *fnhe, *oldest;
+
+ oldest = rcu_dereference(hash->chain);
+ for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
+ fnhe = rcu_dereference(fnhe->fnhe_next)) {
+ if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
+ oldest = fnhe;
+ }
+ fnhe_flush_routes(oldest);
+ return oldest;
+}
+
+static inline u32 fnhe_hashfun(__be32 daddr)
+{
+ static u32 fnhe_hashrnd __read_mostly;
+ u32 hval;
+
+ net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
+ hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
+ return hash_32(hval, FNHE_HASH_SHIFT);
+}
+
+static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
+{
+ rt->rt_pmtu = fnhe->fnhe_pmtu;
+ rt->dst.expires = fnhe->fnhe_expires;
+
+ if (fnhe->fnhe_gw) {
+ rt->rt_flags |= RTCF_REDIRECTED;
+ rt->rt_gateway = fnhe->fnhe_gw;
+ rt->rt_uses_gateway = 1;
+ }
+}
+
+static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
+ u32 pmtu, unsigned long expires)
+{
+ struct fnhe_hash_bucket *hash;
+ struct fib_nh_exception *fnhe;
+ struct rtable *rt;
+ unsigned int i;
+ int depth;
+ u32 hval = fnhe_hashfun(daddr);
+
+ spin_lock_bh(&fnhe_lock);
+
+ hash = rcu_dereference(nh->nh_exceptions);
+ if (!hash) {
+ hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
+ if (!hash)
+ goto out_unlock;
+ rcu_assign_pointer(nh->nh_exceptions, hash);
+ }
+
+ hash += hval;
+
+ depth = 0;
+ for (fnhe = rcu_dereference(hash->chain); fnhe;
+ fnhe = rcu_dereference(fnhe->fnhe_next)) {
+ if (fnhe->fnhe_daddr == daddr)
+ break;
+ depth++;
+ }
+
+ if (fnhe) {
+ if (gw)
+ fnhe->fnhe_gw = gw;
+ if (pmtu) {
+ fnhe->fnhe_pmtu = pmtu;
+ fnhe->fnhe_expires = max(1UL, expires);
+ }
+ /* Update all cached dsts too */
+ rt = rcu_dereference(fnhe->fnhe_rth_input);
+ if (rt)
+ fill_route_from_fnhe(rt, fnhe);
+ rt = rcu_dereference(fnhe->fnhe_rth_output);
+ if (rt)
+ fill_route_from_fnhe(rt, fnhe);
+ } else {
+ if (depth > FNHE_RECLAIM_DEPTH)
+ fnhe = fnhe_oldest(hash);
+ else {
+ fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
+ if (!fnhe)
+ goto out_unlock;
+
+ fnhe->fnhe_next = hash->chain;
+ rcu_assign_pointer(hash->chain, fnhe);
+ }
+ fnhe->fnhe_genid = fnhe_genid(dev_net(nh->nh_dev));
+ fnhe->fnhe_daddr = daddr;
+ fnhe->fnhe_gw = gw;
+ fnhe->fnhe_pmtu = pmtu;
+ fnhe->fnhe_expires = expires;
+
+ /* Exception created; mark the cached routes for the nexthop
+ * stale, so anyone caching it rechecks if this exception
+ * applies to them.
+ */
+ rt = rcu_dereference(nh->nh_rth_input);
+ if (rt)
+ rt->dst.obsolete = DST_OBSOLETE_KILL;
+
+ for_each_possible_cpu(i) {
+ struct rtable __rcu **prt;
+ prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
+ rt = rcu_dereference(*prt);
+ if (rt)
+ rt->dst.obsolete = DST_OBSOLETE_KILL;
+ }
+ }
+
+ fnhe->fnhe_stamp = jiffies;
+
+out_unlock:
+ spin_unlock_bh(&fnhe_lock);
+}
+
+static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
+ bool kill_route)
+{
+ __be32 new_gw = icmp_hdr(skb)->un.gateway;
+ __be32 old_gw = ip_hdr(skb)->saddr;
+ struct net_device *dev = skb->dev;
+ struct in_device *in_dev;
+ struct fib_result res;
+ struct neighbour *n;
+ struct net *net;
+
+ switch (icmp_hdr(skb)->code & 7) {
+ case ICMP_REDIR_NET:
+ case ICMP_REDIR_NETTOS:
+ case ICMP_REDIR_HOST:
+ case ICMP_REDIR_HOSTTOS:
+ break;
+
+ default:
+ return;
+ }
+
+ if (rt->rt_gateway != old_gw)
+ return;
+
+ in_dev = __in_dev_get_rcu(dev);
+ if (!in_dev)
+ return;
+
+ net = dev_net(dev);
+ if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
+ ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
+ ipv4_is_zeronet(new_gw))
+ goto reject_redirect;
+
+ if (!IN_DEV_SHARED_MEDIA(in_dev)) {
+ if (!inet_addr_onlink(in_dev, new_gw, old_gw))
+ goto reject_redirect;
+ if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
+ goto reject_redirect;
+ } else {
+ if (inet_addr_type(net, new_gw) != RTN_UNICAST)
+ goto reject_redirect;
+ }
+
+ n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
+ if (!IS_ERR(n)) {
+ if (!(n->nud_state & NUD_VALID)) {
+ neigh_event_send(n, NULL);
+ } else {
+ if (fib_lookup(net, fl4, &res) == 0) {
+ struct fib_nh *nh = &FIB_RES_NH(res);
+
+ update_or_create_fnhe(nh, fl4->daddr, new_gw,
+ 0, 0);
+ }
+ if (kill_route)
+ rt->dst.obsolete = DST_OBSOLETE_KILL;
+ call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
+ }
+ neigh_release(n);
+ }
+ return;
+
+reject_redirect:
+#ifdef CONFIG_IP_ROUTE_VERBOSE
+ if (IN_DEV_LOG_MARTIANS(in_dev)) {
+ const struct iphdr *iph = (const struct iphdr *) skb->data;
+ __be32 daddr = iph->daddr;
+ __be32 saddr = iph->saddr;
+
+ net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
+ " Advised path = %pI4 -> %pI4\n",
+ &old_gw, dev->name, &new_gw,
+ &saddr, &daddr);
+ }
+#endif
+ ;
+}
+
+static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
+{
+ struct rtable *rt;
+ struct flowi4 fl4;
+ const struct iphdr *iph = (const struct iphdr *) skb->data;
+ int oif = skb->dev->ifindex;
+ u8 tos = RT_TOS(iph->tos);
+ u8 prot = iph->protocol;
+ u32 mark = skb->mark;
+
+ rt = (struct rtable *) dst;
+
+ __build_flow_key(&fl4, sk, iph, oif, tos, prot, mark, 0);
+ __ip_do_redirect(rt, skb, &fl4, true);
+}
+
+static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
+{
+ struct rtable *rt = (struct rtable *)dst;
+ struct dst_entry *ret = dst;
+
+ if (rt) {
+ if (dst->obsolete > 0) {
+ ip_rt_put(rt);
+ ret = NULL;
+ } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
+ rt->dst.expires) {
+ ip_rt_put(rt);
+ ret = NULL;
+ }
+ }
+ return ret;
+}
+
+/*
+ * Algorithm:
+ * 1. The first ip_rt_redirect_number redirects are sent
+ * with exponential backoff, then we stop sending them at all,
+ * assuming that the host ignores our redirects.
+ * 2. If we did not see packets requiring redirects
+ * during ip_rt_redirect_silence, we assume that the host
+ * forgot redirected route and start to send redirects again.
+ *
+ * This algorithm is much cheaper and more intelligent than dumb load limiting
+ * in icmp.c.
+ *
+ * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
+ * and "frag. need" (breaks PMTU discovery) in icmp.c.
+ */
+
+void ip_rt_send_redirect(struct sk_buff *skb)
+{
+ struct rtable *rt = skb_rtable(skb);
+ struct in_device *in_dev;
+ struct inet_peer *peer;
+ struct net *net;
+ int log_martians;
+
+ rcu_read_lock();
+ in_dev = __in_dev_get_rcu(rt->dst.dev);
+ if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
+ rcu_read_unlock();
+ return;
+ }
+ log_martians = IN_DEV_LOG_MARTIANS(in_dev);
+ rcu_read_unlock();
+
+ net = dev_net(rt->dst.dev);
+ peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
+ if (!peer) {
+ icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
+ rt_nexthop(rt, ip_hdr(skb)->daddr));
+ return;
+ }
+
+ /* No redirected packets during ip_rt_redirect_silence;
+ * reset the algorithm.
+ */
+ if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
+ peer->rate_tokens = 0;
+
+ /* Too many ignored redirects; do not send anything
+ * set dst.rate_last to the last seen redirected packet.
+ */
+ if (peer->rate_tokens >= ip_rt_redirect_number) {
+ peer->rate_last = jiffies;
+ goto out_put_peer;
+ }
+
+ /* Check for load limit; set rate_last to the latest sent
+ * redirect.
+ */
+ if (peer->rate_tokens == 0 ||
+ time_after(jiffies,
+ (peer->rate_last +
+ (ip_rt_redirect_load << peer->rate_tokens)))) {
+ __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
+
+ icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
+ peer->rate_last = jiffies;
+ ++peer->rate_tokens;
+#ifdef CONFIG_IP_ROUTE_VERBOSE
+ if (log_martians &&
+ peer->rate_tokens == ip_rt_redirect_number)
+ net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
+ &ip_hdr(skb)->saddr, inet_iif(skb),
+ &ip_hdr(skb)->daddr, &gw);
+#endif
+ }
+out_put_peer:
+ inet_putpeer(peer);
+}
+
+static int ip_error(struct sk_buff *skb)
+{
+ struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
+ struct rtable *rt = skb_rtable(skb);
+ struct inet_peer *peer;
+ unsigned long now;
+ struct net *net;
+ bool send;
+ int code;
+
+ /* IP on this device is disabled. */
+ if (!in_dev)
+ goto out;
+
+ net = dev_net(rt->dst.dev);
+ if (!IN_DEV_FORWARD(in_dev)) {
+ switch (rt->dst.error) {
+ case EHOSTUNREACH:
+ IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);
+ break;
+
+ case ENETUNREACH:
+ IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
+ break;
+ }
+ goto out;
+ }
+
+ switch (rt->dst.error) {
+ case EINVAL:
+ default:
+ goto out;
+ case EHOSTUNREACH:
+ code = ICMP_HOST_UNREACH;
+ break;
+ case ENETUNREACH:
+ code = ICMP_NET_UNREACH;
+ IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
+ break;
+ case EACCES:
+ code = ICMP_PKT_FILTERED;
+ break;
+ }
+
+ peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
+
+ send = true;
+ if (peer) {
+ now = jiffies;
+ peer->rate_tokens += now - peer->rate_last;
+ if (peer->rate_tokens > ip_rt_error_burst)
+ peer->rate_tokens = ip_rt_error_burst;
+ peer->rate_last = now;
+ if (peer->rate_tokens >= ip_rt_error_cost)
+ peer->rate_tokens -= ip_rt_error_cost;
+ else
+ send = false;
+ inet_putpeer(peer);
+ }
+ if (send)
+ icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
+
+out: kfree_skb(skb);
+ return 0;
+}
+
+static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
+{
+ struct dst_entry *dst = &rt->dst;
+ struct fib_result res;
+
+ if (dst_metric_locked(dst, RTAX_MTU))
+ return;
+
+ if (ipv4_mtu(dst) < mtu)
+ return;
+
+ if (mtu < ip_rt_min_pmtu)
+ mtu = ip_rt_min_pmtu;
+
+ if (rt->rt_pmtu == mtu &&
+ time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
+ return;
+
+ rcu_read_lock();
+ if (fib_lookup(dev_net(dst->dev), fl4, &res) == 0) {
+ struct fib_nh *nh = &FIB_RES_NH(res);
+
+ update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
+ jiffies + ip_rt_mtu_expires);
+ }
+ rcu_read_unlock();
+}
+
+static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
+ struct sk_buff *skb, u32 mtu)
+{
+ struct rtable *rt = (struct rtable *) dst;
+ struct flowi4 fl4;
+
+ ip_rt_build_flow_key(&fl4, sk, skb);
+ __ip_rt_update_pmtu(rt, &fl4, mtu);
+}
+
+void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
+ int oif, u32 mark, u8 protocol, int flow_flags)
+{
+ const struct iphdr *iph = (const struct iphdr *) skb->data;
+ struct flowi4 fl4;
+ struct rtable *rt;
+
+ if (!mark)
+ mark = IP4_REPLY_MARK(net, skb->mark);
+
+ __build_flow_key(&fl4, NULL, iph, oif,
+ RT_TOS(iph->tos), protocol, mark, flow_flags);
+ rt = __ip_route_output_key(net, &fl4);
+ if (!IS_ERR(rt)) {
+ __ip_rt_update_pmtu(rt, &fl4, mtu);
+ ip_rt_put(rt);
+ }
+}
+EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
+
+static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
+{
+ const struct iphdr *iph = (const struct iphdr *) skb->data;
+ struct flowi4 fl4;
+ struct rtable *rt;
+
+ __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
+
+ if (!fl4.flowi4_mark)
+ fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
+
+ rt = __ip_route_output_key(sock_net(sk), &fl4);
+ if (!IS_ERR(rt)) {
+ __ip_rt_update_pmtu(rt, &fl4, mtu);
+ ip_rt_put(rt);
+ }
+}
+
+void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
+{
+ const struct iphdr *iph = (const struct iphdr *) skb->data;
+ struct flowi4 fl4;
+ struct rtable *rt;
+ struct dst_entry *odst = NULL;
+ bool new = false;
+
+ bh_lock_sock(sk);
+
+ if (!ip_sk_accept_pmtu(sk))
+ goto out;
+
+ odst = sk_dst_get(sk);
+
+ if (sock_owned_by_user(sk) || !odst) {
+ __ipv4_sk_update_pmtu(skb, sk, mtu);
+ goto out;
+ }
+
+ __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
+
+ rt = (struct rtable *)odst;
+ if (odst->obsolete && !odst->ops->check(odst, 0)) {
+ rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
+ if (IS_ERR(rt))
+ goto out;
+
+ new = true;
+ }
+
+ __ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu);
+
+ if (!dst_check(&rt->dst, 0)) {
+ if (new)
+ dst_release(&rt->dst);
+
+ rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
+ if (IS_ERR(rt))
+ goto out;
+
+ new = true;
+ }
+
+ if (new)
+ sk_dst_set(sk, &rt->dst);
+
+out:
+ bh_unlock_sock(sk);
+ dst_release(odst);
+}
+EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
+
+void ipv4_redirect(struct sk_buff *skb, struct net *net,
+ int oif, u32 mark, u8 protocol, int flow_flags)
+{
+ const struct iphdr *iph = (const struct iphdr *) skb->data;
+ struct flowi4 fl4;
+ struct rtable *rt;
+
+ __build_flow_key(&fl4, NULL, iph, oif,
+ RT_TOS(iph->tos), protocol, mark, flow_flags);
+ rt = __ip_route_output_key(net, &fl4);
+ if (!IS_ERR(rt)) {
+ __ip_do_redirect(rt, skb, &fl4, false);
+ ip_rt_put(rt);
+ }
+}
+EXPORT_SYMBOL_GPL(ipv4_redirect);
+
+void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
+{
+ const struct iphdr *iph = (const struct iphdr *) skb->data;
+ struct flowi4 fl4;
+ struct rtable *rt;
+
+ __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
+ rt = __ip_route_output_key(sock_net(sk), &fl4);
+ if (!IS_ERR(rt)) {
+ __ip_do_redirect(rt, skb, &fl4, false);
+ ip_rt_put(rt);
+ }
+}
+EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
+
+static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
+{
+ struct rtable *rt = (struct rtable *) dst;
+
+ /* All IPV4 dsts are created with ->obsolete set to the value
+ * DST_OBSOLETE_FORCE_CHK which forces validation calls down
+ * into this function always.
+ *
+ * When a PMTU/redirect information update invalidates a route,
+ * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
+ * DST_OBSOLETE_DEAD by dst_free().
+ */
+ if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
+ return NULL;
+ return dst;
+}
+
+static void ipv4_link_failure(struct sk_buff *skb)
+{
+ struct rtable *rt;
+
+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
+
+ rt = skb_rtable(skb);
+ if (rt)
+ dst_set_expires(&rt->dst, 0);
+}
+
+static int ip_rt_bug(struct sock *sk, struct sk_buff *skb)
+{
+ pr_debug("%s: %pI4 -> %pI4, %s\n",
+ __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
+ skb->dev ? skb->dev->name : "?");
+ kfree_skb(skb);
+ WARN_ON(1);
+ return 0;
+}
+
+/*
+ We do not cache source address of outgoing interface,
+ because it is used only by IP RR, TS and SRR options,
+ so that it out of fast path.
+
+ BTW remember: "addr" is allowed to be not aligned
+ in IP options!
+ */
+
+void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
+{
+ __be32 src;
+
+ if (rt_is_output_route(rt))
+ src = ip_hdr(skb)->saddr;
+ else {
+ struct fib_result res;
+ struct flowi4 fl4;
+ struct iphdr *iph;
+
+ iph = ip_hdr(skb);
+
+ memset(&fl4, 0, sizeof(fl4));
+ fl4.daddr = iph->daddr;
+ fl4.saddr = iph->saddr;
+ fl4.flowi4_tos = RT_TOS(iph->tos);
+ fl4.flowi4_oif = rt->dst.dev->ifindex;
+ fl4.flowi4_iif = skb->dev->ifindex;
+ fl4.flowi4_mark = skb->mark;
+
+ rcu_read_lock();
+ if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
+ src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
+ else
+ src = inet_select_addr(rt->dst.dev,
+ rt_nexthop(rt, iph->daddr),
+ RT_SCOPE_UNIVERSE);
+ rcu_read_unlock();
+ }
+ memcpy(addr, &src, 4);
+}
+
+#ifdef CONFIG_IP_ROUTE_CLASSID
+static void set_class_tag(struct rtable *rt, u32 tag)
+{
+ if (!(rt->dst.tclassid & 0xFFFF))
+ rt->dst.tclassid |= tag & 0xFFFF;
+ if (!(rt->dst.tclassid & 0xFFFF0000))
+ rt->dst.tclassid |= tag & 0xFFFF0000;
+}
+#endif
+
+static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
+{
+ unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
+
+ if (advmss == 0) {
+ advmss = max_t(unsigned int, dst->dev->mtu - 40,
+ ip_rt_min_advmss);
+ if (advmss > 65535 - 40)
+ advmss = 65535 - 40;
+ }
+ return advmss;
+}
+
+static unsigned int ipv4_mtu(const struct dst_entry *dst)
+{
+ const struct rtable *rt = (const struct rtable *) dst;
+ unsigned int mtu = rt->rt_pmtu;
+
+ if (!mtu || time_after_eq(jiffies, rt->dst.expires))
+ mtu = dst_metric_raw(dst, RTAX_MTU);
+
+ if (mtu)
+ return mtu;
+
+ mtu = dst->dev->mtu;
+
+ if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
+ if (rt->rt_uses_gateway && mtu > 576)
+ mtu = 576;
+ }
+
+ return min_t(unsigned int, mtu, IP_MAX_MTU);
+}
+
+static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
+{
+ struct fnhe_hash_bucket *hash = rcu_dereference(nh->nh_exceptions);
+ struct fib_nh_exception *fnhe;
+ u32 hval;
+
+ if (!hash)
+ return NULL;
+
+ hval = fnhe_hashfun(daddr);
+
+ for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
+ fnhe = rcu_dereference(fnhe->fnhe_next)) {
+ if (fnhe->fnhe_daddr == daddr)
+ return fnhe;
+ }
+ return NULL;
+}
+
+static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
+ __be32 daddr)
+{
+ bool ret = false;
+
+ spin_lock_bh(&fnhe_lock);
+
+ if (daddr == fnhe->fnhe_daddr) {
+ struct rtable __rcu **porig;
+ struct rtable *orig;
+ int genid = fnhe_genid(dev_net(rt->dst.dev));
+
+ if (rt_is_input_route(rt))
+ porig = &fnhe->fnhe_rth_input;
+ else
+ porig = &fnhe->fnhe_rth_output;
+ orig = rcu_dereference(*porig);
+
+ if (fnhe->fnhe_genid != genid) {
+ fnhe->fnhe_genid = genid;
+ fnhe->fnhe_gw = 0;
+ fnhe->fnhe_pmtu = 0;
+ fnhe->fnhe_expires = 0;
+ fnhe_flush_routes(fnhe);
+ orig = NULL;
+ }
+ fill_route_from_fnhe(rt, fnhe);
+ if (!rt->rt_gateway)
+ rt->rt_gateway = daddr;
+
+ if (!(rt->dst.flags & DST_NOCACHE)) {
+ rcu_assign_pointer(*porig, rt);
+ if (orig)
+ rt_free(orig);
+ ret = true;
+ }
+
+ fnhe->fnhe_stamp = jiffies;
+ }
+ spin_unlock_bh(&fnhe_lock);
+
+ return ret;
+}
+
+static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
+{
+ struct rtable *orig, *prev, **p;
+ bool ret = true;
+
+ if (rt_is_input_route(rt)) {
+ p = (struct rtable **)&nh->nh_rth_input;
+ } else {
+ p = (struct rtable **)raw_cpu_ptr(nh->nh_pcpu_rth_output);
+ }
+ orig = *p;
+
+ prev = cmpxchg(p, orig, rt);
+ if (prev == orig) {
+ if (orig)
+ rt_free(orig);
+ } else
+ ret = false;
+
+ return ret;
+}
+
+struct uncached_list {
+ spinlock_t lock;
+ struct list_head head;
+};
+
+static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
+
+static void rt_add_uncached_list(struct rtable *rt)
+{
+ struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
+
+ rt->rt_uncached_list = ul;
+
+ spin_lock_bh(&ul->lock);
+ list_add_tail(&rt->rt_uncached, &ul->head);
+ spin_unlock_bh(&ul->lock);
+}
+
+static void ipv4_dst_destroy(struct dst_entry *dst)
+{
+ struct rtable *rt = (struct rtable *) dst;
+
+ if (!list_empty(&rt->rt_uncached)) {
+ struct uncached_list *ul = rt->rt_uncached_list;
+
+ spin_lock_bh(&ul->lock);
+ list_del(&rt->rt_uncached);
+ spin_unlock_bh(&ul->lock);
+ }
+}
+
+void rt_flush_dev(struct net_device *dev)
+{
+ struct net *net = dev_net(dev);
+ struct rtable *rt;
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
+
+ spin_lock_bh(&ul->lock);
+ list_for_each_entry(rt, &ul->head, rt_uncached) {
+ if (rt->dst.dev != dev)
+ continue;
+ rt->dst.dev = net->loopback_dev;
+ dev_hold(rt->dst.dev);
+ dev_put(dev);
+ }
+ spin_unlock_bh(&ul->lock);
+ }
+}
+
+static bool rt_cache_valid(const struct rtable *rt)
+{
+ return rt &&
+ rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
+ !rt_is_expired(rt);
+}
+
+static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
+ const struct fib_result *res,
+ struct fib_nh_exception *fnhe,
+ struct fib_info *fi, u16 type, u32 itag)
+{
+ bool cached = false;
+
+ if (fi) {
+ struct fib_nh *nh = &FIB_RES_NH(*res);
+
+ if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
+ rt->rt_gateway = nh->nh_gw;
+ rt->rt_uses_gateway = 1;
+ }
+ dst_init_metrics(&rt->dst, fi->fib_metrics, true);
+#ifdef CONFIG_IP_ROUTE_CLASSID
+ rt->dst.tclassid = nh->nh_tclassid;
+#endif
+ if (unlikely(fnhe))
+ cached = rt_bind_exception(rt, fnhe, daddr);
+ else if (!(rt->dst.flags & DST_NOCACHE))
+ cached = rt_cache_route(nh, rt);
+ if (unlikely(!cached)) {
+ /* Routes we intend to cache in nexthop exception or
+ * FIB nexthop have the DST_NOCACHE bit clear.
+ * However, if we are unsuccessful at storing this
+ * route into the cache we really need to set it.
+ */
+ rt->dst.flags |= DST_NOCACHE;
+ if (!rt->rt_gateway)
+ rt->rt_gateway = daddr;
+ rt_add_uncached_list(rt);
+ }
+ } else
+ rt_add_uncached_list(rt);
+
+#ifdef CONFIG_IP_ROUTE_CLASSID
+#ifdef CONFIG_IP_MULTIPLE_TABLES
+ set_class_tag(rt, res->tclassid);
+#endif
+ set_class_tag(rt, itag);
+#endif
+}
+
+static struct rtable *rt_dst_alloc(struct net_device *dev,
+ bool nopolicy, bool noxfrm, bool will_cache)
+{
+ return dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
+ (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) |
+ (nopolicy ? DST_NOPOLICY : 0) |
+ (noxfrm ? DST_NOXFRM : 0));
+}
+
+/* called in rcu_read_lock() section */
+static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
+ u8 tos, struct net_device *dev, int our)
+{
+ struct rtable *rth;
+ struct in_device *in_dev = __in_dev_get_rcu(dev);
+ u32 itag = 0;
+ int err;
+
+ /* Primary sanity checks. */
+
+ if (!in_dev)
+ return -EINVAL;
+
+ if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
+ skb->protocol != htons(ETH_P_IP))
+ goto e_inval;
+
+ if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
+ if (ipv4_is_loopback(saddr))
+ goto e_inval;
+
+ if (ipv4_is_zeronet(saddr)) {
+ if (!ipv4_is_local_multicast(daddr))
+ goto e_inval;
+ } else {
+ err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
+ in_dev, &itag);
+ if (err < 0)
+ goto e_err;
+ }
+ rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
+ IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
+ if (!rth)
+ goto e_nobufs;
+
+#ifdef CONFIG_IP_ROUTE_CLASSID
+ rth->dst.tclassid = itag;
+#endif
+ rth->dst.output = ip_rt_bug;
+
+ rth->rt_genid = rt_genid_ipv4(dev_net(dev));
+ rth->rt_flags = RTCF_MULTICAST;
+ rth->rt_type = RTN_MULTICAST;
+ rth->rt_is_input= 1;
+ rth->rt_iif = 0;
+ rth->rt_pmtu = 0;
+ rth->rt_gateway = 0;
+ rth->rt_uses_gateway = 0;
+ INIT_LIST_HEAD(&rth->rt_uncached);
+ if (our) {
+ rth->dst.input= ip_local_deliver;
+ rth->rt_flags |= RTCF_LOCAL;
+ }
+
+#ifdef CONFIG_IP_MROUTE
+ if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
+ rth->dst.input = ip_mr_input;
+#endif
+ RT_CACHE_STAT_INC(in_slow_mc);
+
+ skb_dst_set(skb, &rth->dst);
+ return 0;
+
+e_nobufs:
+ return -ENOBUFS;
+e_inval:
+ return -EINVAL;
+e_err:
+ return err;
+}
+
+
+static void ip_handle_martian_source(struct net_device *dev,
+ struct in_device *in_dev,
+ struct sk_buff *skb,
+ __be32 daddr,
+ __be32 saddr)
+{
+ RT_CACHE_STAT_INC(in_martian_src);
+#ifdef CONFIG_IP_ROUTE_VERBOSE
+ if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
+ /*
+ * RFC1812 recommendation, if source is martian,
+ * the only hint is MAC header.
+ */
+ pr_warn("martian source %pI4 from %pI4, on dev %s\n",
+ &daddr, &saddr, dev->name);
+ if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
+ print_hex_dump(KERN_WARNING, "ll header: ",
+ DUMP_PREFIX_OFFSET, 16, 1,
+ skb_mac_header(skb),
+ dev->hard_header_len, true);
+ }
+ }
+#endif
+}
+
+/* called in rcu_read_lock() section */
+static int __mkroute_input(struct sk_buff *skb,
+ const struct fib_result *res,
+ struct in_device *in_dev,
+ __be32 daddr, __be32 saddr, u32 tos)
+{
+ struct fib_nh_exception *fnhe;
+ struct rtable *rth;
+ int err;
+ struct in_device *out_dev;
+ unsigned int flags = 0;
+ bool do_cache;
+ u32 itag = 0;
+
+ /* get a working reference to the output device */
+ out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
+ if (!out_dev) {
+ net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
+ return -EINVAL;
+ }
+
+ err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
+ in_dev->dev, in_dev, &itag);
+ if (err < 0) {
+ ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
+ saddr);
+
+ goto cleanup;
+ }
+
+ do_cache = res->fi && !itag;
+ if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
+ skb->protocol == htons(ETH_P_IP) &&
+ (IN_DEV_SHARED_MEDIA(out_dev) ||
+ inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
+ IPCB(skb)->flags |= IPSKB_DOREDIRECT;
+
+ if (skb->protocol != htons(ETH_P_IP)) {
+ /* Not IP (i.e. ARP). Do not create route, if it is
+ * invalid for proxy arp. DNAT routes are always valid.
+ *
+ * Proxy arp feature have been extended to allow, ARP
+ * replies back to the same interface, to support
+ * Private VLAN switch technologies. See arp.c.
+ */
+ if (out_dev == in_dev &&
+ IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
+ err = -EINVAL;
+ goto cleanup;
+ }
+ }
+
+ fnhe = find_exception(&FIB_RES_NH(*res), daddr);
+ if (do_cache) {
+ if (fnhe)
+ rth = rcu_dereference(fnhe->fnhe_rth_input);
+ else
+ rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
+
+ if (rt_cache_valid(rth)) {
+ skb_dst_set_noref(skb, &rth->dst);
+ goto out;
+ }
+ }
+
+ rth = rt_dst_alloc(out_dev->dev,
+ IN_DEV_CONF_GET(in_dev, NOPOLICY),
+ IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
+ if (!rth) {
+ err = -ENOBUFS;
+ goto cleanup;
+ }
+
+ rth->rt_genid = rt_genid_ipv4(dev_net(rth->dst.dev));
+ rth->rt_flags = flags;
+ rth->rt_type = res->type;
+ rth->rt_is_input = 1;
+ rth->rt_iif = 0;
+ rth->rt_pmtu = 0;
+ rth->rt_gateway = 0;
+ rth->rt_uses_gateway = 0;
+ INIT_LIST_HEAD(&rth->rt_uncached);
+ RT_CACHE_STAT_INC(in_slow_tot);
+
+ rth->dst.input = ip_forward;
+ rth->dst.output = ip_output;
+
+ rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag);
+ skb_dst_set(skb, &rth->dst);
+out:
+ err = 0;
+ cleanup:
+ return err;
+}
+
+static int ip_mkroute_input(struct sk_buff *skb,
+ struct fib_result *res,
+ const struct flowi4 *fl4,
+ struct in_device *in_dev,
+ __be32 daddr, __be32 saddr, u32 tos)
+{
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+ if (res->fi && res->fi->fib_nhs > 1)
+ fib_select_multipath(res);
+#endif
+
+ /* create a routing cache entry */
+ return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
+}
+
+/*
+ * NOTE. We drop all the packets that has local source
+ * addresses, because every properly looped back packet
+ * must have correct destination already attached by output routine.
+ *
+ * Such approach solves two big problems:
+ * 1. Not simplex devices are handled properly.
+ * 2. IP spoofing attempts are filtered with 100% of guarantee.
+ * called with rcu_read_lock()
+ */
+
+static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
+ u8 tos, struct net_device *dev)
+{
+ struct fib_result res;
+ struct in_device *in_dev = __in_dev_get_rcu(dev);
+ struct flowi4 fl4;
+ unsigned int flags = 0;
+ u32 itag = 0;
+ struct rtable *rth;
+ int err = -EINVAL;
+ struct net *net = dev_net(dev);
+ bool do_cache;
+
+ /* IP on this device is disabled. */
+
+ if (!in_dev)
+ goto out;
+
+ /* Check for the most weird martians, which can be not detected
+ by fib_lookup.
+ */
+
+ if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
+ goto martian_source;
+
+ res.fi = NULL;
+ if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
+ goto brd_input;
+
+ /* Accept zero addresses only to limited broadcast;
+ * I even do not know to fix it or not. Waiting for complains :-)
+ */
+ if (ipv4_is_zeronet(saddr))
+ goto martian_source;
+
+ if (ipv4_is_zeronet(daddr))
+ goto martian_destination;
+
+ /* Following code try to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
+ * and call it once if daddr or/and saddr are loopback addresses
+ */
+ if (ipv4_is_loopback(daddr)) {
+ if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
+ goto martian_destination;
+ } else if (ipv4_is_loopback(saddr)) {
+ if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
+ goto martian_source;
+ }
+
+ /*
+ * Now we are ready to route packet.
+ */
+ fl4.flowi4_oif = 0;
+ fl4.flowi4_iif = dev->ifindex;
+ fl4.flowi4_mark = skb->mark;
+ fl4.flowi4_tos = tos;
+ fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
+ fl4.daddr = daddr;
+ fl4.saddr = saddr;
+ err = fib_lookup(net, &fl4, &res);
+ if (err != 0) {
+ if (!IN_DEV_FORWARD(in_dev))
+ err = -EHOSTUNREACH;
+ goto no_route;
+ }
+
+ if (res.type == RTN_BROADCAST)
+ goto brd_input;
+
+ if (res.type == RTN_LOCAL) {
+ err = fib_validate_source(skb, saddr, daddr, tos,
+ 0, dev, in_dev, &itag);
+ if (err < 0)
+ goto martian_source_keep_err;
+ goto local_input;
+ }
+
+ if (!IN_DEV_FORWARD(in_dev)) {
+ err = -EHOSTUNREACH;
+ goto no_route;
+ }
+ if (res.type != RTN_UNICAST)
+ goto martian_destination;
+
+ err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
+out: return err;
+
+brd_input:
+ if (skb->protocol != htons(ETH_P_IP))
+ goto e_inval;
+
+ if (!ipv4_is_zeronet(saddr)) {
+ err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
+ in_dev, &itag);
+ if (err < 0)
+ goto martian_source_keep_err;
+ }
+ flags |= RTCF_BROADCAST;
+ res.type = RTN_BROADCAST;
+ RT_CACHE_STAT_INC(in_brd);
+
+local_input:
+ do_cache = false;
+ if (res.fi) {
+ if (!itag) {
+ rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input);
+ if (rt_cache_valid(rth)) {
+ skb_dst_set_noref(skb, &rth->dst);
+ err = 0;
+ goto out;
+ }
+ do_cache = true;
+ }
+ }
+
+ rth = rt_dst_alloc(net->loopback_dev,
+ IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
+ if (!rth)
+ goto e_nobufs;
+
+ rth->dst.input= ip_local_deliver;
+ rth->dst.output= ip_rt_bug;
+#ifdef CONFIG_IP_ROUTE_CLASSID
+ rth->dst.tclassid = itag;
+#endif
+
+ rth->rt_genid = rt_genid_ipv4(net);
+ rth->rt_flags = flags|RTCF_LOCAL;
+ rth->rt_type = res.type;
+ rth->rt_is_input = 1;
+ rth->rt_iif = 0;
+ rth->rt_pmtu = 0;
+ rth->rt_gateway = 0;
+ rth->rt_uses_gateway = 0;
+ INIT_LIST_HEAD(&rth->rt_uncached);
+ RT_CACHE_STAT_INC(in_slow_tot);
+ if (res.type == RTN_UNREACHABLE) {
+ rth->dst.input= ip_error;
+ rth->dst.error= -err;
+ rth->rt_flags &= ~RTCF_LOCAL;
+ }
+ if (do_cache) {
+ if (unlikely(!rt_cache_route(&FIB_RES_NH(res), rth))) {
+ rth->dst.flags |= DST_NOCACHE;
+ rt_add_uncached_list(rth);
+ }
+ }
+ skb_dst_set(skb, &rth->dst);
+ err = 0;
+ goto out;
+
+no_route:
+ RT_CACHE_STAT_INC(in_no_route);
+ res.type = RTN_UNREACHABLE;
+ res.fi = NULL;
+ goto local_input;
+
+ /*
+ * Do not cache martian addresses: they should be logged (RFC1812)
+ */
+martian_destination:
+ RT_CACHE_STAT_INC(in_martian_dst);
+#ifdef CONFIG_IP_ROUTE_VERBOSE
+ if (IN_DEV_LOG_MARTIANS(in_dev))
+ net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
+ &daddr, &saddr, dev->name);
+#endif
+
+e_inval:
+ err = -EINVAL;
+ goto out;
+
+e_nobufs:
+ err = -ENOBUFS;
+ goto out;
+
+martian_source:
+ err = -EINVAL;
+martian_source_keep_err:
+ ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
+ goto out;
+}
+
+int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
+ u8 tos, struct net_device *dev)
+{
+ int res;
+
+ rcu_read_lock();
+
+ /* Multicast recognition logic is moved from route cache to here.
+ The problem was that too many Ethernet cards have broken/missing
+ hardware multicast filters :-( As result the host on multicasting
+ network acquires a lot of useless route cache entries, sort of
+ SDR messages from all the world. Now we try to get rid of them.
+ Really, provided software IP multicast filter is organized
+ reasonably (at least, hashed), it does not result in a slowdown
+ comparing with route cache reject entries.
+ Note, that multicast routers are not affected, because
+ route cache entry is created eventually.
+ */
+ if (ipv4_is_multicast(daddr)) {
+ struct in_device *in_dev = __in_dev_get_rcu(dev);
+
+ if (in_dev) {
+ int our = ip_check_mc_rcu(in_dev, daddr, saddr,
+ ip_hdr(skb)->protocol);
+ if (our
+#ifdef CONFIG_IP_MROUTE
+ ||
+ (!ipv4_is_local_multicast(daddr) &&
+ IN_DEV_MFORWARD(in_dev))
+#endif
+ ) {
+ int res = ip_route_input_mc(skb, daddr, saddr,
+ tos, dev, our);
+ rcu_read_unlock();
+ return res;
+ }
+ }
+ rcu_read_unlock();
+ return -EINVAL;
+ }
+ res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
+ rcu_read_unlock();
+ return res;
+}
+EXPORT_SYMBOL(ip_route_input_noref);
+
+/* called with rcu_read_lock() */
+static struct rtable *__mkroute_output(const struct fib_result *res,
+ const struct flowi4 *fl4, int orig_oif,
+ struct net_device *dev_out,
+ unsigned int flags)
+{
+ struct fib_info *fi = res->fi;
+ struct fib_nh_exception *fnhe;
+ struct in_device *in_dev;
+ u16 type = res->type;
+ struct rtable *rth;
+ bool do_cache;
+
+ in_dev = __in_dev_get_rcu(dev_out);
+ if (!in_dev)
+ return ERR_PTR(-EINVAL);
+
+ if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
+ if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
+ return ERR_PTR(-EINVAL);
+
+ if (ipv4_is_lbcast(fl4->daddr))
+ type = RTN_BROADCAST;
+ else if (ipv4_is_multicast(fl4->daddr))
+ type = RTN_MULTICAST;
+ else if (ipv4_is_zeronet(fl4->daddr))
+ return ERR_PTR(-EINVAL);
+
+ if (dev_out->flags & IFF_LOOPBACK)
+ flags |= RTCF_LOCAL;
+
+ do_cache = true;
+ if (type == RTN_BROADCAST) {
+ flags |= RTCF_BROADCAST | RTCF_LOCAL;
+ fi = NULL;
+ } else if (type == RTN_MULTICAST) {
+ flags |= RTCF_MULTICAST | RTCF_LOCAL;
+ if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
+ fl4->flowi4_proto))
+ flags &= ~RTCF_LOCAL;
+ else
+ do_cache = false;
+ /* If multicast route do not exist use
+ * default one, but do not gateway in this case.
+ * Yes, it is hack.
+ */
+ if (fi && res->prefixlen < 4)
+ fi = NULL;
+ }
+
+ fnhe = NULL;
+ do_cache &= fi != NULL;
+ if (do_cache) {
+ struct rtable __rcu **prth;
+ struct fib_nh *nh = &FIB_RES_NH(*res);
+
+ fnhe = find_exception(nh, fl4->daddr);
+ if (fnhe)
+ prth = &fnhe->fnhe_rth_output;
+ else {
+ if (unlikely(fl4->flowi4_flags &
+ FLOWI_FLAG_KNOWN_NH &&
+ !(nh->nh_gw &&
+ nh->nh_scope == RT_SCOPE_LINK))) {
+ do_cache = false;
+ goto add;
+ }
+ prth = raw_cpu_ptr(nh->nh_pcpu_rth_output);
+ }
+ rth = rcu_dereference(*prth);
+ if (rt_cache_valid(rth)) {
+ dst_hold(&rth->dst);
+ return rth;
+ }
+ }
+
+add:
+ rth = rt_dst_alloc(dev_out,
+ IN_DEV_CONF_GET(in_dev, NOPOLICY),
+ IN_DEV_CONF_GET(in_dev, NOXFRM),
+ do_cache);
+ if (!rth)
+ return ERR_PTR(-ENOBUFS);
+
+ rth->dst.output = ip_output;
+
+ rth->rt_genid = rt_genid_ipv4(dev_net(dev_out));
+ rth->rt_flags = flags;
+ rth->rt_type = type;
+ rth->rt_is_input = 0;
+ rth->rt_iif = orig_oif ? : 0;
+ rth->rt_pmtu = 0;
+ rth->rt_gateway = 0;
+ rth->rt_uses_gateway = 0;
+ INIT_LIST_HEAD(&rth->rt_uncached);
+
+ RT_CACHE_STAT_INC(out_slow_tot);
+
+ if (flags & RTCF_LOCAL)
+ rth->dst.input = ip_local_deliver;
+ if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
+ if (flags & RTCF_LOCAL &&
+ !(dev_out->flags & IFF_LOOPBACK)) {
+ rth->dst.output = ip_mc_output;
+ RT_CACHE_STAT_INC(out_slow_mc);
+ }
+#ifdef CONFIG_IP_MROUTE
+ if (type == RTN_MULTICAST) {
+ if (IN_DEV_MFORWARD(in_dev) &&
+ !ipv4_is_local_multicast(fl4->daddr)) {
+ rth->dst.input = ip_mr_input;
+ rth->dst.output = ip_mc_output;
+ }
+ }
+#endif
+ }
+
+ rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);
+
+ return rth;
+}
+
+/*
+ * Major route resolver routine.
+ */
+
+struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
+{
+ struct net_device *dev_out = NULL;
+ __u8 tos = RT_FL_TOS(fl4);
+ unsigned int flags = 0;
+ struct fib_result res;
+ struct rtable *rth;
+ int orig_oif;
+
+ res.tclassid = 0;
+ res.fi = NULL;
+ res.table = NULL;
+
+ orig_oif = fl4->flowi4_oif;
+
+ fl4->flowi4_iif = LOOPBACK_IFINDEX;
+ fl4->flowi4_tos = tos & IPTOS_RT_MASK;
+ fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
+ RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
+
+ rcu_read_lock();
+ if (fl4->saddr) {
+ rth = ERR_PTR(-EINVAL);
+ if (ipv4_is_multicast(fl4->saddr) ||
+ ipv4_is_lbcast(fl4->saddr) ||
+ ipv4_is_zeronet(fl4->saddr))
+ goto out;
+
+ /* I removed check for oif == dev_out->oif here.
+ It was wrong for two reasons:
+ 1. ip_dev_find(net, saddr) can return wrong iface, if saddr
+ is assigned to multiple interfaces.
+ 2. Moreover, we are allowed to send packets with saddr
+ of another iface. --ANK
+ */
+
+ if (fl4->flowi4_oif == 0 &&
+ (ipv4_is_multicast(fl4->daddr) ||
+ ipv4_is_lbcast(fl4->daddr))) {
+ /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
+ dev_out = __ip_dev_find(net, fl4->saddr, false);
+ if (!dev_out)
+ goto out;
+
+ /* Special hack: user can direct multicasts
+ and limited broadcast via necessary interface
+ without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
+ This hack is not just for fun, it allows
+ vic,vat and friends to work.
+ They bind socket to loopback, set ttl to zero
+ and expect that it will work.
+ From the viewpoint of routing cache they are broken,
+ because we are not allowed to build multicast path
+ with loopback source addr (look, routing cache
+ cannot know, that ttl is zero, so that packet
+ will not leave this host and route is valid).
+ Luckily, this hack is good workaround.
+ */
+
+ fl4->flowi4_oif = dev_out->ifindex;
+ goto make_route;
+ }
+
+ if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
+ /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
+ if (!__ip_dev_find(net, fl4->saddr, false))
+ goto out;
+ }
+ }
+
+
+ if (fl4->flowi4_oif) {
+ dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
+ rth = ERR_PTR(-ENODEV);
+ if (!dev_out)
+ goto out;
+
+ /* RACE: Check return value of inet_select_addr instead. */
+ if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
+ rth = ERR_PTR(-ENETUNREACH);
+ goto out;
+ }
+ if (ipv4_is_local_multicast(fl4->daddr) ||
+ ipv4_is_lbcast(fl4->daddr)) {
+ if (!fl4->saddr)
+ fl4->saddr = inet_select_addr(dev_out, 0,
+ RT_SCOPE_LINK);
+ goto make_route;
+ }
+ if (!fl4->saddr) {
+ if (ipv4_is_multicast(fl4->daddr))
+ fl4->saddr = inet_select_addr(dev_out, 0,
+ fl4->flowi4_scope);
+ else if (!fl4->daddr)
+ fl4->saddr = inet_select_addr(dev_out, 0,
+ RT_SCOPE_HOST);
+ }
+ }
+
+ if (!fl4->daddr) {
+ fl4->daddr = fl4->saddr;
+ if (!fl4->daddr)
+ fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
+ dev_out = net->loopback_dev;
+ fl4->flowi4_oif = LOOPBACK_IFINDEX;
+ res.type = RTN_LOCAL;
+ flags |= RTCF_LOCAL;
+ goto make_route;
+ }
+
+ if (fib_lookup(net, fl4, &res)) {
+ res.fi = NULL;
+ res.table = NULL;
+ if (fl4->flowi4_oif) {
+ /* Apparently, routing tables are wrong. Assume,
+ that the destination is on link.
+
+ WHY? DW.
+ Because we are allowed to send to iface
+ even if it has NO routes and NO assigned
+ addresses. When oif is specified, routing
+ tables are looked up with only one purpose:
+ to catch if destination is gatewayed, rather than
+ direct. Moreover, if MSG_DONTROUTE is set,
+ we send packet, ignoring both routing tables
+ and ifaddr state. --ANK
+
+
+ We could make it even if oif is unknown,
+ likely IPv6, but we do not.
+ */
+
+ if (fl4->saddr == 0)
+ fl4->saddr = inet_select_addr(dev_out, 0,
+ RT_SCOPE_LINK);
+ res.type = RTN_UNICAST;
+ goto make_route;
+ }
+ rth = ERR_PTR(-ENETUNREACH);
+ goto out;
+ }
+
+ if (res.type == RTN_LOCAL) {
+ if (!fl4->saddr) {
+ if (res.fi->fib_prefsrc)
+ fl4->saddr = res.fi->fib_prefsrc;
+ else
+ fl4->saddr = fl4->daddr;
+ }
+ dev_out = net->loopback_dev;
+ fl4->flowi4_oif = dev_out->ifindex;
+ flags |= RTCF_LOCAL;
+ goto make_route;
+ }
+
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+ if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
+ fib_select_multipath(&res);
+ else
+#endif
+ if (!res.prefixlen &&
+ res.table->tb_num_default > 1 &&
+ res.type == RTN_UNICAST && !fl4->flowi4_oif)
+ fib_select_default(&res);
+
+ if (!fl4->saddr)
+ fl4->saddr = FIB_RES_PREFSRC(net, res);
+
+ dev_out = FIB_RES_DEV(res);
+ fl4->flowi4_oif = dev_out->ifindex;
+
+
+make_route:
+ rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags);
+
+out:
+ rcu_read_unlock();
+ return rth;
+}
+EXPORT_SYMBOL_GPL(__ip_route_output_key);
+
+static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
+{
+ return NULL;
+}
+
+static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
+{
+ unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
+
+ return mtu ? : dst->dev->mtu;
+}
+
+static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
+ struct sk_buff *skb, u32 mtu)
+{
+}
+
+static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
+ struct sk_buff *skb)
+{
+}
+
+static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
+ unsigned long old)
+{
+ return NULL;
+}
+
+static struct dst_ops ipv4_dst_blackhole_ops = {
+ .family = AF_INET,
+ .check = ipv4_blackhole_dst_check,
+ .mtu = ipv4_blackhole_mtu,
+ .default_advmss = ipv4_default_advmss,
+ .update_pmtu = ipv4_rt_blackhole_update_pmtu,
+ .redirect = ipv4_rt_blackhole_redirect,
+ .cow_metrics = ipv4_rt_blackhole_cow_metrics,
+ .neigh_lookup = ipv4_neigh_lookup,
+};
+
+struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
+{
+ struct rtable *ort = (struct rtable *) dst_orig;
+ struct rtable *rt;
+
+ rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
+ if (rt) {
+ struct dst_entry *new = &rt->dst;
+
+ new->__use = 1;
+ new->input = dst_discard;
+ new->output = dst_discard_sk;
+
+ new->dev = ort->dst.dev;
+ if (new->dev)
+ dev_hold(new->dev);
+
+ rt->rt_is_input = ort->rt_is_input;
+ rt->rt_iif = ort->rt_iif;
+ rt->rt_pmtu = ort->rt_pmtu;
+
+ rt->rt_genid = rt_genid_ipv4(net);
+ rt->rt_flags = ort->rt_flags;
+ rt->rt_type = ort->rt_type;
+ rt->rt_gateway = ort->rt_gateway;
+ rt->rt_uses_gateway = ort->rt_uses_gateway;
+
+ INIT_LIST_HEAD(&rt->rt_uncached);
+
+ dst_free(new);
+ }
+
+ dst_release(dst_orig);
+
+ return rt ? &rt->dst : ERR_PTR(-ENOMEM);
+}
+
+struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
+ struct sock *sk)
+{
+ struct rtable *rt = __ip_route_output_key(net, flp4);
+
+ if (IS_ERR(rt))
+ return rt;
+
+ if (flp4->flowi4_proto)
+ rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
+ flowi4_to_flowi(flp4),
+ sk, 0);
+
+ return rt;
+}
+EXPORT_SYMBOL_GPL(ip_route_output_flow);
+
+static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
+ struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
+ u32 seq, int event, int nowait, unsigned int flags)
+{
+ struct rtable *rt = skb_rtable(skb);
+ struct rtmsg *r;
+ struct nlmsghdr *nlh;
+ unsigned long expires = 0;
+ u32 error;
+ u32 metrics[RTAX_MAX];
+
+ nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), flags);
+ if (!nlh)
+ return -EMSGSIZE;
+
+ r = nlmsg_data(nlh);
+ r->rtm_family = AF_INET;
+ r->rtm_dst_len = 32;
+ r->rtm_src_len = 0;
+ r->rtm_tos = fl4->flowi4_tos;
+ r->rtm_table = RT_TABLE_MAIN;
+ if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
+ goto nla_put_failure;
+ r->rtm_type = rt->rt_type;
+ r->rtm_scope = RT_SCOPE_UNIVERSE;
+ r->rtm_protocol = RTPROT_UNSPEC;
+ r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
+ if (rt->rt_flags & RTCF_NOTIFY)
+ r->rtm_flags |= RTM_F_NOTIFY;
+ if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
+ r->rtm_flags |= RTCF_DOREDIRECT;
+
+ if (nla_put_in_addr(skb, RTA_DST, dst))
+ goto nla_put_failure;
+ if (src) {
+ r->rtm_src_len = 32;
+ if (nla_put_in_addr(skb, RTA_SRC, src))
+ goto nla_put_failure;
+ }
+ if (rt->dst.dev &&
+ nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
+ goto nla_put_failure;
+#ifdef CONFIG_IP_ROUTE_CLASSID
+ if (rt->dst.tclassid &&
+ nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
+ goto nla_put_failure;
+#endif
+ if (!rt_is_input_route(rt) &&
+ fl4->saddr != src) {
+ if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
+ goto nla_put_failure;
+ }
+ if (rt->rt_uses_gateway &&
+ nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gateway))
+ goto nla_put_failure;
+
+ expires = rt->dst.expires;
+ if (expires) {
+ unsigned long now = jiffies;
+
+ if (time_before(now, expires))
+ expires -= now;
+ else
+ expires = 0;
+ }
+
+ memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
+ if (rt->rt_pmtu && expires)
+ metrics[RTAX_MTU - 1] = rt->rt_pmtu;
+ if (rtnetlink_put_metrics(skb, metrics) < 0)
+ goto nla_put_failure;
+
+ if (fl4->flowi4_mark &&
+ nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
+ goto nla_put_failure;
+
+ error = rt->dst.error;
+
+ if (rt_is_input_route(rt)) {
+#ifdef CONFIG_IP_MROUTE
+ if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
+ IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
+ int err = ipmr_get_route(net, skb,
+ fl4->saddr, fl4->daddr,
+ r, nowait);
+ if (err <= 0) {
+ if (!nowait) {
+ if (err == 0)
+ return 0;
+ goto nla_put_failure;
+ } else {
+ if (err == -EMSGSIZE)
+ goto nla_put_failure;
+ error = err;
+ }
+ }
+ } else
+#endif
+ if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex))
+ goto nla_put_failure;
+ }
+
+ if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
+ goto nla_put_failure;
+
+ nlmsg_end(skb, nlh);
+ return 0;
+
+nla_put_failure:
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+}
+
+static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
+{
+ struct net *net = sock_net(in_skb->sk);
+ struct rtmsg *rtm;
+ struct nlattr *tb[RTA_MAX+1];
+ struct rtable *rt = NULL;
+ struct flowi4 fl4;
+ __be32 dst = 0;
+ __be32 src = 0;
+ u32 iif;
+ int err;
+ int mark;
+ struct sk_buff *skb;
+
+ err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
+ if (err < 0)
+ goto errout;
+
+ rtm = nlmsg_data(nlh);
+
+ skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+ if (!skb) {
+ err = -ENOBUFS;
+ goto errout;
+ }
+
+ /* Reserve room for dummy headers, this skb can pass
+ through good chunk of routing engine.
+ */
+ skb_reset_mac_header(skb);
+ skb_reset_network_header(skb);
+
+ /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
+ ip_hdr(skb)->protocol = IPPROTO_ICMP;
+ skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
+
+ src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
+ dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
+ iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
+ mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
+
+ memset(&fl4, 0, sizeof(fl4));
+ fl4.daddr = dst;
+ fl4.saddr = src;
+ fl4.flowi4_tos = rtm->rtm_tos;
+ fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
+ fl4.flowi4_mark = mark;
+
+ if (iif) {
+ struct net_device *dev;
+
+ dev = __dev_get_by_index(net, iif);
+ if (!dev) {
+ err = -ENODEV;
+ goto errout_free;
+ }
+
+ skb->protocol = htons(ETH_P_IP);
+ skb->dev = dev;
+ skb->mark = mark;
+ local_bh_disable();
+ err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
+ local_bh_enable();
+
+ rt = skb_rtable(skb);
+ if (err == 0 && rt->dst.error)
+ err = -rt->dst.error;
+ } else {
+ rt = ip_route_output_key(net, &fl4);
+
+ err = 0;
+ if (IS_ERR(rt))
+ err = PTR_ERR(rt);
+ }
+
+ if (err)
+ goto errout_free;
+
+ skb_dst_set(skb, &rt->dst);
+ if (rtm->rtm_flags & RTM_F_NOTIFY)
+ rt->rt_flags |= RTCF_NOTIFY;
+
+ err = rt_fill_info(net, dst, src, &fl4, skb,
+ NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
+ RTM_NEWROUTE, 0, 0);
+ if (err < 0)
+ goto errout_free;
+
+ err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
+errout:
+ return err;
+
+errout_free:
+ kfree_skb(skb);
+ goto errout;
+}
+
+void ip_rt_multicast_event(struct in_device *in_dev)
+{
+ rt_cache_flush(dev_net(in_dev->dev));
+}
+
+#ifdef CONFIG_SYSCTL
+static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
+static int ip_rt_gc_interval __read_mostly = 60 * HZ;
+static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
+static int ip_rt_gc_elasticity __read_mostly = 8;
+
+static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
+ void __user *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ struct net *net = (struct net *)__ctl->extra1;
+
+ if (write) {
+ rt_cache_flush(net);
+ fnhe_genid_bump(net);
+ return 0;
+ }
+
+ return -EINVAL;
+}
+
+static struct ctl_table ipv4_route_table[] = {
+ {
+ .procname = "gc_thresh",
+ .data = &ipv4_dst_ops.gc_thresh,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "max_size",
+ .data = &ip_rt_max_size,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ /* Deprecated. Use gc_min_interval_ms */
+
+ .procname = "gc_min_interval",
+ .data = &ip_rt_gc_min_interval,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "gc_min_interval_ms",
+ .data = &ip_rt_gc_min_interval,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_ms_jiffies,
+ },
+ {
+ .procname = "gc_timeout",
+ .data = &ip_rt_gc_timeout,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "gc_interval",
+ .data = &ip_rt_gc_interval,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "redirect_load",
+ .data = &ip_rt_redirect_load,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "redirect_number",
+ .data = &ip_rt_redirect_number,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "redirect_silence",
+ .data = &ip_rt_redirect_silence,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "error_cost",
+ .data = &ip_rt_error_cost,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "error_burst",
+ .data = &ip_rt_error_burst,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "gc_elasticity",
+ .data = &ip_rt_gc_elasticity,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "mtu_expires",
+ .data = &ip_rt_mtu_expires,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "min_pmtu",
+ .data = &ip_rt_min_pmtu,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "min_adv_mss",
+ .data = &ip_rt_min_advmss,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ { }
+};
+
+static struct ctl_table ipv4_route_flush_table[] = {
+ {
+ .procname = "flush",
+ .maxlen = sizeof(int),
+ .mode = 0200,
+ .proc_handler = ipv4_sysctl_rtcache_flush,
+ },
+ { },
+};
+
+static __net_init int sysctl_route_net_init(struct net *net)
+{
+ struct ctl_table *tbl;
+
+ tbl = ipv4_route_flush_table;
+ if (!net_eq(net, &init_net)) {
+ tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
+ if (!tbl)
+ goto err_dup;
+
+ /* Don't export sysctls to unprivileged users */
+ if (net->user_ns != &init_user_ns)
+ tbl[0].procname = NULL;
+ }
+ tbl[0].extra1 = net;
+
+ net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
+ if (!net->ipv4.route_hdr)
+ goto err_reg;
+ return 0;
+
+err_reg:
+ if (tbl != ipv4_route_flush_table)
+ kfree(tbl);
+err_dup:
+ return -ENOMEM;
+}
+
+static __net_exit void sysctl_route_net_exit(struct net *net)
+{
+ struct ctl_table *tbl;
+
+ tbl = net->ipv4.route_hdr->ctl_table_arg;
+ unregister_net_sysctl_table(net->ipv4.route_hdr);
+ BUG_ON(tbl == ipv4_route_flush_table);
+ kfree(tbl);
+}
+
+static __net_initdata struct pernet_operations sysctl_route_ops = {
+ .init = sysctl_route_net_init,
+ .exit = sysctl_route_net_exit,
+};
+#endif
+
+static __net_init int rt_genid_init(struct net *net)
+{
+ atomic_set(&net->ipv4.rt_genid, 0);
+ atomic_set(&net->fnhe_genid, 0);
+ get_random_bytes(&net->ipv4.dev_addr_genid,
+ sizeof(net->ipv4.dev_addr_genid));
+ return 0;
+}
+
+static __net_initdata struct pernet_operations rt_genid_ops = {
+ .init = rt_genid_init,
+};
+
+static int __net_init ipv4_inetpeer_init(struct net *net)
+{
+ struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
+
+ if (!bp)
+ return -ENOMEM;
+ inet_peer_base_init(bp);
+ net->ipv4.peers = bp;
+ return 0;
+}
+
+static void __net_exit ipv4_inetpeer_exit(struct net *net)
+{
+ struct inet_peer_base *bp = net->ipv4.peers;
+
+ net->ipv4.peers = NULL;
+ inetpeer_invalidate_tree(bp);
+ kfree(bp);
+}
+
+static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
+ .init = ipv4_inetpeer_init,
+ .exit = ipv4_inetpeer_exit,
+};
+
+#ifdef CONFIG_IP_ROUTE_CLASSID
+struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
+#endif /* CONFIG_IP_ROUTE_CLASSID */
+
+int __init ip_rt_init(void)
+{
+ int rc = 0;
+ int cpu;
+
+ ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL);
+ if (!ip_idents)
+ panic("IP: failed to allocate ip_idents\n");
+
+ prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
+
+ for_each_possible_cpu(cpu) {
+ struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
+
+ INIT_LIST_HEAD(&ul->head);
+ spin_lock_init(&ul->lock);
+ }
+#ifdef CONFIG_IP_ROUTE_CLASSID
+ ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
+ if (!ip_rt_acct)
+ panic("IP: failed to allocate ip_rt_acct\n");
+#endif
+
+ ipv4_dst_ops.kmem_cachep =
+ kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
+
+ ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
+
+ if (dst_entries_init(&ipv4_dst_ops) < 0)
+ panic("IP: failed to allocate ipv4_dst_ops counter\n");
+
+ if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
+ panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
+
+ ipv4_dst_ops.gc_thresh = ~0;
+ ip_rt_max_size = INT_MAX;
+
+ devinet_init();
+ ip_fib_init();
+
+ if (ip_rt_proc_init())
+ pr_err("Unable to create route proc files\n");
+#ifdef CONFIG_XFRM
+ xfrm_init();
+ xfrm4_init();
+#endif
+ rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
+
+#ifdef CONFIG_SYSCTL
+ register_pernet_subsys(&sysctl_route_ops);
+#endif
+ register_pernet_subsys(&rt_genid_ops);
+ register_pernet_subsys(&ipv4_inetpeer_ops);
+ return rc;
+}
+
+#ifdef CONFIG_SYSCTL
+/*
+ * We really need to sanitize the damn ipv4 init order, then all
+ * this nonsense will go away.
+ */
+void __init ip_static_sysctl_init(void)
+{
+ register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
+}
+#endif
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
new file mode 100644
index 000000000..df849e5a1
--- /dev/null
+++ b/net/ipv4/syncookies.c
@@ -0,0 +1,401 @@
+/*
+ * Syncookies implementation for the Linux kernel
+ *
+ * Copyright (C) 1997 Andi Kleen
+ * Based on ideas by D.J.Bernstein and Eric Schenk.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/tcp.h>
+#include <linux/slab.h>
+#include <linux/random.h>
+#include <linux/cryptohash.h>
+#include <linux/kernel.h>
+#include <linux/export.h>
+#include <net/tcp.h>
+#include <net/route.h>
+
+extern int sysctl_tcp_syncookies;
+
+static u32 syncookie_secret[2][16-4+SHA_DIGEST_WORDS] __read_mostly;
+
+#define COOKIEBITS 24 /* Upper bits store count */
+#define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1)
+
+/* TCP Timestamp: 6 lowest bits of timestamp sent in the cookie SYN-ACK
+ * stores TCP options:
+ *
+ * MSB LSB
+ * | 31 ... 6 | 5 | 4 | 3 2 1 0 |
+ * | Timestamp | ECN | SACK | WScale |
+ *
+ * When we receive a valid cookie-ACK, we look at the echoed tsval (if
+ * any) to figure out which TCP options we should use for the rebuilt
+ * connection.
+ *
+ * A WScale setting of '0xf' (which is an invalid scaling value)
+ * means that original syn did not include the TCP window scaling option.
+ */
+#define TS_OPT_WSCALE_MASK 0xf
+#define TS_OPT_SACK BIT(4)
+#define TS_OPT_ECN BIT(5)
+/* There is no TS_OPT_TIMESTAMP:
+ * if ACK contains timestamp option, we already know it was
+ * requested/supported by the syn/synack exchange.
+ */
+#define TSBITS 6
+#define TSMASK (((__u32)1 << TSBITS) - 1)
+
+static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS],
+ ipv4_cookie_scratch);
+
+static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport,
+ u32 count, int c)
+{
+ __u32 *tmp;
+
+ net_get_random_once(syncookie_secret, sizeof(syncookie_secret));
+
+ tmp = this_cpu_ptr(ipv4_cookie_scratch);
+ memcpy(tmp + 4, syncookie_secret[c], sizeof(syncookie_secret[c]));
+ tmp[0] = (__force u32)saddr;
+ tmp[1] = (__force u32)daddr;
+ tmp[2] = ((__force u32)sport << 16) + (__force u32)dport;
+ tmp[3] = count;
+ sha_transform(tmp + 16, (__u8 *)tmp, tmp + 16 + 5);
+
+ return tmp[17];
+}
+
+
+/*
+ * when syncookies are in effect and tcp timestamps are enabled we encode
+ * tcp options in the lower bits of the timestamp value that will be
+ * sent in the syn-ack.
+ * Since subsequent timestamps use the normal tcp_time_stamp value, we
+ * must make sure that the resulting initial timestamp is <= tcp_time_stamp.
+ */
+__u32 cookie_init_timestamp(struct request_sock *req)
+{
+ struct inet_request_sock *ireq;
+ u32 ts, ts_now = tcp_time_stamp;
+ u32 options = 0;
+
+ ireq = inet_rsk(req);
+
+ options = ireq->wscale_ok ? ireq->snd_wscale : TS_OPT_WSCALE_MASK;
+ if (ireq->sack_ok)
+ options |= TS_OPT_SACK;
+ if (ireq->ecn_ok)
+ options |= TS_OPT_ECN;
+
+ ts = ts_now & ~TSMASK;
+ ts |= options;
+ if (ts > ts_now) {
+ ts >>= TSBITS;
+ ts--;
+ ts <<= TSBITS;
+ ts |= options;
+ }
+ return ts;
+}
+
+
+static __u32 secure_tcp_syn_cookie(__be32 saddr, __be32 daddr, __be16 sport,
+ __be16 dport, __u32 sseq, __u32 data)
+{
+ /*
+ * Compute the secure sequence number.
+ * The output should be:
+ * HASH(sec1,saddr,sport,daddr,dport,sec1) + sseq + (count * 2^24)
+ * + (HASH(sec2,saddr,sport,daddr,dport,count,sec2) % 2^24).
+ * Where sseq is their sequence number and count increases every
+ * minute by 1.
+ * As an extra hack, we add a small "data" value that encodes the
+ * MSS into the second hash value.
+ */
+ u32 count = tcp_cookie_time();
+ return (cookie_hash(saddr, daddr, sport, dport, 0, 0) +
+ sseq + (count << COOKIEBITS) +
+ ((cookie_hash(saddr, daddr, sport, dport, count, 1) + data)
+ & COOKIEMASK));
+}
+
+/*
+ * This retrieves the small "data" value from the syncookie.
+ * If the syncookie is bad, the data returned will be out of
+ * range. This must be checked by the caller.
+ *
+ * The count value used to generate the cookie must be less than
+ * MAX_SYNCOOKIE_AGE minutes in the past.
+ * The return value (__u32)-1 if this test fails.
+ */
+static __u32 check_tcp_syn_cookie(__u32 cookie, __be32 saddr, __be32 daddr,
+ __be16 sport, __be16 dport, __u32 sseq)
+{
+ u32 diff, count = tcp_cookie_time();
+
+ /* Strip away the layers from the cookie */
+ cookie -= cookie_hash(saddr, daddr, sport, dport, 0, 0) + sseq;
+
+ /* Cookie is now reduced to (count * 2^24) ^ (hash % 2^24) */
+ diff = (count - (cookie >> COOKIEBITS)) & ((__u32) -1 >> COOKIEBITS);
+ if (diff >= MAX_SYNCOOKIE_AGE)
+ return (__u32)-1;
+
+ return (cookie -
+ cookie_hash(saddr, daddr, sport, dport, count - diff, 1))
+ & COOKIEMASK; /* Leaving the data behind */
+}
+
+/*
+ * MSS Values are chosen based on the 2011 paper
+ * 'An Analysis of TCP Maximum Segement Sizes' by S. Alcock and R. Nelson.
+ * Values ..
+ * .. lower than 536 are rare (< 0.2%)
+ * .. between 537 and 1299 account for less than < 1.5% of observed values
+ * .. in the 1300-1349 range account for about 15 to 20% of observed mss values
+ * .. exceeding 1460 are very rare (< 0.04%)
+ *
+ * 1460 is the single most frequently announced mss value (30 to 46% depending
+ * on monitor location). Table must be sorted.
+ */
+static __u16 const msstab[] = {
+ 536,
+ 1300,
+ 1440, /* 1440, 1452: PPPoE */
+ 1460,
+};
+
+/*
+ * Generate a syncookie. mssp points to the mss, which is returned
+ * rounded down to the value encoded in the cookie.
+ */
+u32 __cookie_v4_init_sequence(const struct iphdr *iph, const struct tcphdr *th,
+ u16 *mssp)
+{
+ int mssind;
+ const __u16 mss = *mssp;
+
+ for (mssind = ARRAY_SIZE(msstab) - 1; mssind ; mssind--)
+ if (mss >= msstab[mssind])
+ break;
+ *mssp = msstab[mssind];
+
+ return secure_tcp_syn_cookie(iph->saddr, iph->daddr,
+ th->source, th->dest, ntohl(th->seq),
+ mssind);
+}
+EXPORT_SYMBOL_GPL(__cookie_v4_init_sequence);
+
+__u32 cookie_v4_init_sequence(struct sock *sk, const struct sk_buff *skb,
+ __u16 *mssp)
+{
+ const struct iphdr *iph = ip_hdr(skb);
+ const struct tcphdr *th = tcp_hdr(skb);
+
+ tcp_synq_overflow(sk);
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESSENT);
+
+ return __cookie_v4_init_sequence(iph, th, mssp);
+}
+
+/*
+ * Check if a ack sequence number is a valid syncookie.
+ * Return the decoded mss if it is, or 0 if not.
+ */
+int __cookie_v4_check(const struct iphdr *iph, const struct tcphdr *th,
+ u32 cookie)
+{
+ __u32 seq = ntohl(th->seq) - 1;
+ __u32 mssind = check_tcp_syn_cookie(cookie, iph->saddr, iph->daddr,
+ th->source, th->dest, seq);
+
+ return mssind < ARRAY_SIZE(msstab) ? msstab[mssind] : 0;
+}
+EXPORT_SYMBOL_GPL(__cookie_v4_check);
+
+static struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb,
+ struct request_sock *req,
+ struct dst_entry *dst)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct sock *child;
+
+ child = icsk->icsk_af_ops->syn_recv_sock(sk, skb, req, dst);
+ if (child) {
+ atomic_set(&req->rsk_refcnt, 1);
+ inet_csk_reqsk_queue_add(sk, req, child);
+ } else {
+ reqsk_free(req);
+ }
+ return child;
+}
+
+
+/*
+ * when syncookies are in effect and tcp timestamps are enabled we stored
+ * additional tcp options in the timestamp.
+ * This extracts these options from the timestamp echo.
+ *
+ * return false if we decode a tcp option that is disabled
+ * on the host.
+ */
+bool cookie_timestamp_decode(struct tcp_options_received *tcp_opt)
+{
+ /* echoed timestamp, lowest bits contain options */
+ u32 options = tcp_opt->rcv_tsecr;
+
+ if (!tcp_opt->saw_tstamp) {
+ tcp_clear_options(tcp_opt);
+ return true;
+ }
+
+ if (!sysctl_tcp_timestamps)
+ return false;
+
+ tcp_opt->sack_ok = (options & TS_OPT_SACK) ? TCP_SACK_SEEN : 0;
+
+ if (tcp_opt->sack_ok && !sysctl_tcp_sack)
+ return false;
+
+ if ((options & TS_OPT_WSCALE_MASK) == TS_OPT_WSCALE_MASK)
+ return true; /* no window scaling */
+
+ tcp_opt->wscale_ok = 1;
+ tcp_opt->snd_wscale = options & TS_OPT_WSCALE_MASK;
+
+ return sysctl_tcp_window_scaling != 0;
+}
+EXPORT_SYMBOL(cookie_timestamp_decode);
+
+bool cookie_ecn_ok(const struct tcp_options_received *tcp_opt,
+ const struct net *net, const struct dst_entry *dst)
+{
+ bool ecn_ok = tcp_opt->rcv_tsecr & TS_OPT_ECN;
+
+ if (!ecn_ok)
+ return false;
+
+ if (net->ipv4.sysctl_tcp_ecn)
+ return true;
+
+ return dst_feature(dst, RTAX_FEATURE_ECN);
+}
+EXPORT_SYMBOL(cookie_ecn_ok);
+
+struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
+{
+ struct ip_options *opt = &TCP_SKB_CB(skb)->header.h4.opt;
+ struct tcp_options_received tcp_opt;
+ struct inet_request_sock *ireq;
+ struct tcp_request_sock *treq;
+ struct tcp_sock *tp = tcp_sk(sk);
+ const struct tcphdr *th = tcp_hdr(skb);
+ __u32 cookie = ntohl(th->ack_seq) - 1;
+ struct sock *ret = sk;
+ struct request_sock *req;
+ int mss;
+ struct rtable *rt;
+ __u8 rcv_wscale;
+ struct flowi4 fl4;
+
+ if (!sysctl_tcp_syncookies || !th->ack || th->rst)
+ goto out;
+
+ if (tcp_synq_no_recent_overflow(sk))
+ goto out;
+
+ mss = __cookie_v4_check(ip_hdr(skb), th, cookie);
+ if (mss == 0) {
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESFAILED);
+ goto out;
+ }
+
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESRECV);
+
+ /* check for timestamp cookie support */
+ memset(&tcp_opt, 0, sizeof(tcp_opt));
+ tcp_parse_options(skb, &tcp_opt, 0, NULL);
+
+ if (!cookie_timestamp_decode(&tcp_opt))
+ goto out;
+
+ ret = NULL;
+ req = inet_reqsk_alloc(&tcp_request_sock_ops, sk); /* for safety */
+ if (!req)
+ goto out;
+
+ ireq = inet_rsk(req);
+ treq = tcp_rsk(req);
+ treq->rcv_isn = ntohl(th->seq) - 1;
+ treq->snt_isn = cookie;
+ req->mss = mss;
+ ireq->ir_num = ntohs(th->dest);
+ ireq->ir_rmt_port = th->source;
+ sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
+ sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
+ ireq->ir_mark = inet_request_mark(sk, skb);
+ ireq->snd_wscale = tcp_opt.snd_wscale;
+ ireq->sack_ok = tcp_opt.sack_ok;
+ ireq->wscale_ok = tcp_opt.wscale_ok;
+ ireq->tstamp_ok = tcp_opt.saw_tstamp;
+ req->ts_recent = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsval : 0;
+ treq->snt_synack = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsecr : 0;
+ treq->tfo_listener = false;
+
+ ireq->ir_iif = sk->sk_bound_dev_if;
+
+ /* We throwed the options of the initial SYN away, so we hope
+ * the ACK carries the same options again (see RFC1122 4.2.3.8)
+ */
+ ireq->opt = tcp_v4_save_options(skb);
+
+ if (security_inet_conn_request(sk, skb, req)) {
+ reqsk_free(req);
+ goto out;
+ }
+
+ req->num_retrans = 0;
+
+ /*
+ * We need to lookup the route here to get at the correct
+ * window size. We should better make sure that the window size
+ * hasn't changed since we received the original syn, but I see
+ * no easy way to do this.
+ */
+ flowi4_init_output(&fl4, sk->sk_bound_dev_if, ireq->ir_mark,
+ RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, IPPROTO_TCP,
+ inet_sk_flowi_flags(sk),
+ opt->srr ? opt->faddr : ireq->ir_rmt_addr,
+ ireq->ir_loc_addr, th->source, th->dest);
+ security_req_classify_flow(req, flowi4_to_flowi(&fl4));
+ rt = ip_route_output_key(sock_net(sk), &fl4);
+ if (IS_ERR(rt)) {
+ reqsk_free(req);
+ goto out;
+ }
+
+ /* Try to redo what tcp_v4_send_synack did. */
+ req->window_clamp = tp->window_clamp ? :dst_metric(&rt->dst, RTAX_WINDOW);
+
+ tcp_select_initial_window(tcp_full_space(sk), req->mss,
+ &req->rcv_wnd, &req->window_clamp,
+ ireq->wscale_ok, &rcv_wscale,
+ dst_metric(&rt->dst, RTAX_INITRWND));
+
+ ireq->rcv_wscale = rcv_wscale;
+ ireq->ecn_ok = cookie_ecn_ok(&tcp_opt, sock_net(sk), &rt->dst);
+
+ ret = get_cookie_sock(sk, skb, req, &rt->dst);
+ /* ip_queue_xmit() depends on our flow being setup
+ * Normal sockets get it right from inet_csk_route_child_sock()
+ */
+ if (ret)
+ inet_sk(ret)->cork.fl.u.ip4 = fl4;
+out: return ret;
+}
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
new file mode 100644
index 000000000..c3852a7ff
--- /dev/null
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -0,0 +1,970 @@
+/*
+ * sysctl_net_ipv4.c: sysctl interface to net IPV4 subsystem.
+ *
+ * Begun April 1, 1996, Mike Shaver.
+ * Added /proc/sys/net/ipv4 directory entry (empty =) ). [MS]
+ */
+
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/sysctl.h>
+#include <linux/igmp.h>
+#include <linux/inetdevice.h>
+#include <linux/seqlock.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/nsproxy.h>
+#include <linux/swap.h>
+#include <net/snmp.h>
+#include <net/icmp.h>
+#include <net/ip.h>
+#include <net/route.h>
+#include <net/tcp.h>
+#include <net/udp.h>
+#include <net/cipso_ipv4.h>
+#include <net/inet_frag.h>
+#include <net/ping.h>
+#include <net/tcp_memcontrol.h>
+
+static int zero;
+static int one = 1;
+static int four = 4;
+static int gso_max_segs = GSO_MAX_SEGS;
+static int tcp_retr1_max = 255;
+static int ip_local_port_range_min[] = { 1, 1 };
+static int ip_local_port_range_max[] = { 65535, 65535 };
+static int tcp_adv_win_scale_min = -31;
+static int tcp_adv_win_scale_max = 31;
+static int ip_ttl_min = 1;
+static int ip_ttl_max = 255;
+static int tcp_syn_retries_min = 1;
+static int tcp_syn_retries_max = MAX_TCP_SYNCNT;
+static int ip_ping_group_range_min[] = { 0, 0 };
+static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX };
+
+/* Update system visible IP port range */
+static void set_local_port_range(struct net *net, int range[2])
+{
+ write_seqlock(&net->ipv4.ip_local_ports.lock);
+ net->ipv4.ip_local_ports.range[0] = range[0];
+ net->ipv4.ip_local_ports.range[1] = range[1];
+ write_sequnlock(&net->ipv4.ip_local_ports.lock);
+}
+
+/* Validate changes from /proc interface. */
+static int ipv4_local_port_range(struct ctl_table *table, int write,
+ void __user *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ struct net *net =
+ container_of(table->data, struct net, ipv4.ip_local_ports.range);
+ int ret;
+ int range[2];
+ struct ctl_table tmp = {
+ .data = &range,
+ .maxlen = sizeof(range),
+ .mode = table->mode,
+ .extra1 = &ip_local_port_range_min,
+ .extra2 = &ip_local_port_range_max,
+ };
+
+ inet_get_local_port_range(net, &range[0], &range[1]);
+
+ ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
+
+ if (write && ret == 0) {
+ if (range[1] < range[0])
+ ret = -EINVAL;
+ else
+ set_local_port_range(net, range);
+ }
+
+ return ret;
+}
+
+
+static void inet_get_ping_group_range_table(struct ctl_table *table, kgid_t *low, kgid_t *high)
+{
+ kgid_t *data = table->data;
+ struct net *net =
+ container_of(table->data, struct net, ipv4.ping_group_range.range);
+ unsigned int seq;
+ do {
+ seq = read_seqbegin(&net->ipv4.ip_local_ports.lock);
+
+ *low = data[0];
+ *high = data[1];
+ } while (read_seqretry(&net->ipv4.ip_local_ports.lock, seq));
+}
+
+/* Update system visible IP port range */
+static void set_ping_group_range(struct ctl_table *table, kgid_t low, kgid_t high)
+{
+ kgid_t *data = table->data;
+ struct net *net =
+ container_of(table->data, struct net, ipv4.ping_group_range.range);
+ write_seqlock(&net->ipv4.ip_local_ports.lock);
+ data[0] = low;
+ data[1] = high;
+ write_sequnlock(&net->ipv4.ip_local_ports.lock);
+}
+
+/* Validate changes from /proc interface. */
+static int ipv4_ping_group_range(struct ctl_table *table, int write,
+ void __user *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ struct user_namespace *user_ns = current_user_ns();
+ int ret;
+ gid_t urange[2];
+ kgid_t low, high;
+ struct ctl_table tmp = {
+ .data = &urange,
+ .maxlen = sizeof(urange),
+ .mode = table->mode,
+ .extra1 = &ip_ping_group_range_min,
+ .extra2 = &ip_ping_group_range_max,
+ };
+
+ inet_get_ping_group_range_table(table, &low, &high);
+ urange[0] = from_kgid_munged(user_ns, low);
+ urange[1] = from_kgid_munged(user_ns, high);
+ ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
+
+ if (write && ret == 0) {
+ low = make_kgid(user_ns, urange[0]);
+ high = make_kgid(user_ns, urange[1]);
+ if (!gid_valid(low) || !gid_valid(high) ||
+ (urange[1] < urange[0]) || gid_lt(high, low)) {
+ low = make_kgid(&init_user_ns, 1);
+ high = make_kgid(&init_user_ns, 0);
+ }
+ set_ping_group_range(table, low, high);
+ }
+
+ return ret;
+}
+
+static int proc_tcp_congestion_control(struct ctl_table *ctl, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ char val[TCP_CA_NAME_MAX];
+ struct ctl_table tbl = {
+ .data = val,
+ .maxlen = TCP_CA_NAME_MAX,
+ };
+ int ret;
+
+ tcp_get_default_congestion_control(val);
+
+ ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
+ if (write && ret == 0)
+ ret = tcp_set_default_congestion_control(val);
+ return ret;
+}
+
+static int proc_tcp_available_congestion_control(struct ctl_table *ctl,
+ int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ struct ctl_table tbl = { .maxlen = TCP_CA_BUF_MAX, };
+ int ret;
+
+ tbl.data = kmalloc(tbl.maxlen, GFP_USER);
+ if (!tbl.data)
+ return -ENOMEM;
+ tcp_get_available_congestion_control(tbl.data, TCP_CA_BUF_MAX);
+ ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
+ kfree(tbl.data);
+ return ret;
+}
+
+static int proc_allowed_congestion_control(struct ctl_table *ctl,
+ int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ struct ctl_table tbl = { .maxlen = TCP_CA_BUF_MAX };
+ int ret;
+
+ tbl.data = kmalloc(tbl.maxlen, GFP_USER);
+ if (!tbl.data)
+ return -ENOMEM;
+
+ tcp_get_allowed_congestion_control(tbl.data, tbl.maxlen);
+ ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
+ if (write && ret == 0)
+ ret = tcp_set_allowed_congestion_control(tbl.data);
+ kfree(tbl.data);
+ return ret;
+}
+
+static int proc_tcp_fastopen_key(struct ctl_table *ctl, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ struct ctl_table tbl = { .maxlen = (TCP_FASTOPEN_KEY_LENGTH * 2 + 10) };
+ struct tcp_fastopen_context *ctxt;
+ int ret;
+ u32 user_key[4]; /* 16 bytes, matching TCP_FASTOPEN_KEY_LENGTH */
+
+ tbl.data = kmalloc(tbl.maxlen, GFP_KERNEL);
+ if (!tbl.data)
+ return -ENOMEM;
+
+ rcu_read_lock();
+ ctxt = rcu_dereference(tcp_fastopen_ctx);
+ if (ctxt)
+ memcpy(user_key, ctxt->key, TCP_FASTOPEN_KEY_LENGTH);
+ else
+ memset(user_key, 0, sizeof(user_key));
+ rcu_read_unlock();
+
+ snprintf(tbl.data, tbl.maxlen, "%08x-%08x-%08x-%08x",
+ user_key[0], user_key[1], user_key[2], user_key[3]);
+ ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
+
+ if (write && ret == 0) {
+ if (sscanf(tbl.data, "%x-%x-%x-%x", user_key, user_key + 1,
+ user_key + 2, user_key + 3) != 4) {
+ ret = -EINVAL;
+ goto bad_key;
+ }
+ /* Generate a dummy secret but don't publish it. This
+ * is needed so we don't regenerate a new key on the
+ * first invocation of tcp_fastopen_cookie_gen
+ */
+ tcp_fastopen_init_key_once(false);
+ tcp_fastopen_reset_cipher(user_key, TCP_FASTOPEN_KEY_LENGTH);
+ }
+
+bad_key:
+ pr_debug("proc FO key set 0x%x-%x-%x-%x <- 0x%s: %u\n",
+ user_key[0], user_key[1], user_key[2], user_key[3],
+ (char *)tbl.data, ret);
+ kfree(tbl.data);
+ return ret;
+}
+
+static struct ctl_table ipv4_table[] = {
+ {
+ .procname = "tcp_timestamps",
+ .data = &sysctl_tcp_timestamps,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "tcp_window_scaling",
+ .data = &sysctl_tcp_window_scaling,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "tcp_sack",
+ .data = &sysctl_tcp_sack,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "tcp_retrans_collapse",
+ .data = &sysctl_tcp_retrans_collapse,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "ip_default_ttl",
+ .data = &sysctl_ip_default_ttl,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &ip_ttl_min,
+ .extra2 = &ip_ttl_max,
+ },
+ {
+ .procname = "tcp_syn_retries",
+ .data = &sysctl_tcp_syn_retries,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &tcp_syn_retries_min,
+ .extra2 = &tcp_syn_retries_max
+ },
+ {
+ .procname = "tcp_synack_retries",
+ .data = &sysctl_tcp_synack_retries,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "tcp_max_orphans",
+ .data = &sysctl_tcp_max_orphans,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "tcp_max_tw_buckets",
+ .data = &tcp_death_row.sysctl_max_tw_buckets,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "ip_early_demux",
+ .data = &sysctl_ip_early_demux,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "ip_dynaddr",
+ .data = &sysctl_ip_dynaddr,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "tcp_keepalive_time",
+ .data = &sysctl_tcp_keepalive_time,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "tcp_keepalive_probes",
+ .data = &sysctl_tcp_keepalive_probes,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "tcp_keepalive_intvl",
+ .data = &sysctl_tcp_keepalive_intvl,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "tcp_retries1",
+ .data = &sysctl_tcp_retries1,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra2 = &tcp_retr1_max
+ },
+ {
+ .procname = "tcp_retries2",
+ .data = &sysctl_tcp_retries2,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "tcp_fin_timeout",
+ .data = &sysctl_tcp_fin_timeout,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+#ifdef CONFIG_SYN_COOKIES
+ {
+ .procname = "tcp_syncookies",
+ .data = &sysctl_tcp_syncookies,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+#endif
+ {
+ .procname = "tcp_fastopen",
+ .data = &sysctl_tcp_fastopen,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "tcp_fastopen_key",
+ .mode = 0600,
+ .maxlen = ((TCP_FASTOPEN_KEY_LENGTH * 2) + 10),
+ .proc_handler = proc_tcp_fastopen_key,
+ },
+ {
+ .procname = "tcp_tw_recycle",
+ .data = &tcp_death_row.sysctl_tw_recycle,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "tcp_abort_on_overflow",
+ .data = &sysctl_tcp_abort_on_overflow,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "tcp_stdurg",
+ .data = &sysctl_tcp_stdurg,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "tcp_rfc1337",
+ .data = &sysctl_tcp_rfc1337,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "tcp_max_syn_backlog",
+ .data = &sysctl_max_syn_backlog,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "igmp_max_memberships",
+ .data = &sysctl_igmp_max_memberships,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "igmp_max_msf",
+ .data = &sysctl_igmp_max_msf,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+#ifdef CONFIG_IP_MULTICAST
+ {
+ .procname = "igmp_qrv",
+ .data = &sysctl_igmp_qrv,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &one
+ },
+#endif
+ {
+ .procname = "inet_peer_threshold",
+ .data = &inet_peer_threshold,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "inet_peer_minttl",
+ .data = &inet_peer_minttl,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "inet_peer_maxttl",
+ .data = &inet_peer_maxttl,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "tcp_orphan_retries",
+ .data = &sysctl_tcp_orphan_retries,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "tcp_fack",
+ .data = &sysctl_tcp_fack,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "tcp_reordering",
+ .data = &sysctl_tcp_reordering,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "tcp_max_reordering",
+ .data = &sysctl_tcp_max_reordering,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "tcp_dsack",
+ .data = &sysctl_tcp_dsack,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "tcp_mem",
+ .maxlen = sizeof(sysctl_tcp_mem),
+ .data = &sysctl_tcp_mem,
+ .mode = 0644,
+ .proc_handler = proc_doulongvec_minmax,
+ },
+ {
+ .procname = "tcp_wmem",
+ .data = &sysctl_tcp_wmem,
+ .maxlen = sizeof(sysctl_tcp_wmem),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &one,
+ },
+ {
+ .procname = "tcp_notsent_lowat",
+ .data = &sysctl_tcp_notsent_lowat,
+ .maxlen = sizeof(sysctl_tcp_notsent_lowat),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "tcp_rmem",
+ .data = &sysctl_tcp_rmem,
+ .maxlen = sizeof(sysctl_tcp_rmem),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &one,
+ },
+ {
+ .procname = "tcp_app_win",
+ .data = &sysctl_tcp_app_win,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "tcp_adv_win_scale",
+ .data = &sysctl_tcp_adv_win_scale,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &tcp_adv_win_scale_min,
+ .extra2 = &tcp_adv_win_scale_max,
+ },
+ {
+ .procname = "tcp_tw_reuse",
+ .data = &sysctl_tcp_tw_reuse,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "tcp_frto",
+ .data = &sysctl_tcp_frto,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "tcp_low_latency",
+ .data = &sysctl_tcp_low_latency,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "tcp_no_metrics_save",
+ .data = &sysctl_tcp_nometrics_save,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "tcp_moderate_rcvbuf",
+ .data = &sysctl_tcp_moderate_rcvbuf,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "tcp_tso_win_divisor",
+ .data = &sysctl_tcp_tso_win_divisor,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "tcp_congestion_control",
+ .mode = 0644,
+ .maxlen = TCP_CA_NAME_MAX,
+ .proc_handler = proc_tcp_congestion_control,
+ },
+ {
+ .procname = "tcp_workaround_signed_windows",
+ .data = &sysctl_tcp_workaround_signed_windows,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "tcp_limit_output_bytes",
+ .data = &sysctl_tcp_limit_output_bytes,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "tcp_challenge_ack_limit",
+ .data = &sysctl_tcp_challenge_ack_limit,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "tcp_slow_start_after_idle",
+ .data = &sysctl_tcp_slow_start_after_idle,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+#ifdef CONFIG_NETLABEL
+ {
+ .procname = "cipso_cache_enable",
+ .data = &cipso_v4_cache_enabled,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "cipso_cache_bucket_size",
+ .data = &cipso_v4_cache_bucketsize,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "cipso_rbm_optfmt",
+ .data = &cipso_v4_rbm_optfmt,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "cipso_rbm_strictvalid",
+ .data = &cipso_v4_rbm_strictvalid,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+#endif /* CONFIG_NETLABEL */
+ {
+ .procname = "tcp_available_congestion_control",
+ .maxlen = TCP_CA_BUF_MAX,
+ .mode = 0444,
+ .proc_handler = proc_tcp_available_congestion_control,
+ },
+ {
+ .procname = "tcp_allowed_congestion_control",
+ .maxlen = TCP_CA_BUF_MAX,
+ .mode = 0644,
+ .proc_handler = proc_allowed_congestion_control,
+ },
+ {
+ .procname = "tcp_thin_linear_timeouts",
+ .data = &sysctl_tcp_thin_linear_timeouts,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "tcp_thin_dupack",
+ .data = &sysctl_tcp_thin_dupack,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "tcp_early_retrans",
+ .data = &sysctl_tcp_early_retrans,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &four,
+ },
+ {
+ .procname = "tcp_min_tso_segs",
+ .data = &sysctl_tcp_min_tso_segs,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &gso_max_segs,
+ },
+ {
+ .procname = "tcp_autocorking",
+ .data = &sysctl_tcp_autocorking,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
+ {
+ .procname = "tcp_invalid_ratelimit",
+ .data = &sysctl_tcp_invalid_ratelimit,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_ms_jiffies,
+ },
+ {
+ .procname = "icmp_msgs_per_sec",
+ .data = &sysctl_icmp_msgs_per_sec,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ },
+ {
+ .procname = "icmp_msgs_burst",
+ .data = &sysctl_icmp_msgs_burst,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ },
+ {
+ .procname = "udp_mem",
+ .data = &sysctl_udp_mem,
+ .maxlen = sizeof(sysctl_udp_mem),
+ .mode = 0644,
+ .proc_handler = proc_doulongvec_minmax,
+ },
+ {
+ .procname = "udp_rmem_min",
+ .data = &sysctl_udp_rmem_min,
+ .maxlen = sizeof(sysctl_udp_rmem_min),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &one
+ },
+ {
+ .procname = "udp_wmem_min",
+ .data = &sysctl_udp_wmem_min,
+ .maxlen = sizeof(sysctl_udp_wmem_min),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &one
+ },
+ { }
+};
+
+static struct ctl_table ipv4_net_table[] = {
+ {
+ .procname = "icmp_echo_ignore_all",
+ .data = &init_net.ipv4.sysctl_icmp_echo_ignore_all,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "icmp_echo_ignore_broadcasts",
+ .data = &init_net.ipv4.sysctl_icmp_echo_ignore_broadcasts,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "icmp_ignore_bogus_error_responses",
+ .data = &init_net.ipv4.sysctl_icmp_ignore_bogus_error_responses,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "icmp_errors_use_inbound_ifaddr",
+ .data = &init_net.ipv4.sysctl_icmp_errors_use_inbound_ifaddr,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "icmp_ratelimit",
+ .data = &init_net.ipv4.sysctl_icmp_ratelimit,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_ms_jiffies,
+ },
+ {
+ .procname = "icmp_ratemask",
+ .data = &init_net.ipv4.sysctl_icmp_ratemask,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "ping_group_range",
+ .data = &init_net.ipv4.ping_group_range.range,
+ .maxlen = sizeof(gid_t)*2,
+ .mode = 0644,
+ .proc_handler = ipv4_ping_group_range,
+ },
+ {
+ .procname = "tcp_ecn",
+ .data = &init_net.ipv4.sysctl_tcp_ecn,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "ip_local_port_range",
+ .maxlen = sizeof(init_net.ipv4.ip_local_ports.range),
+ .data = &init_net.ipv4.ip_local_ports.range,
+ .mode = 0644,
+ .proc_handler = ipv4_local_port_range,
+ },
+ {
+ .procname = "ip_local_reserved_ports",
+ .data = &init_net.ipv4.sysctl_local_reserved_ports,
+ .maxlen = 65536,
+ .mode = 0644,
+ .proc_handler = proc_do_large_bitmap,
+ },
+ {
+ .procname = "ip_no_pmtu_disc",
+ .data = &init_net.ipv4.sysctl_ip_no_pmtu_disc,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "ip_forward_use_pmtu",
+ .data = &init_net.ipv4.sysctl_ip_fwd_use_pmtu,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "ip_nonlocal_bind",
+ .data = &init_net.ipv4.sysctl_ip_nonlocal_bind,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "fwmark_reflect",
+ .data = &init_net.ipv4.sysctl_fwmark_reflect,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "tcp_fwmark_accept",
+ .data = &init_net.ipv4.sysctl_tcp_fwmark_accept,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "tcp_mtu_probing",
+ .data = &init_net.ipv4.sysctl_tcp_mtu_probing,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "tcp_base_mss",
+ .data = &init_net.ipv4.sysctl_tcp_base_mss,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "tcp_probe_threshold",
+ .data = &init_net.ipv4.sysctl_tcp_probe_threshold,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "tcp_probe_interval",
+ .data = &init_net.ipv4.sysctl_tcp_probe_interval,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ { }
+};
+
+static __net_init int ipv4_sysctl_init_net(struct net *net)
+{
+ struct ctl_table *table;
+
+ table = ipv4_net_table;
+ if (!net_eq(net, &init_net)) {
+ int i;
+
+ table = kmemdup(table, sizeof(ipv4_net_table), GFP_KERNEL);
+ if (!table)
+ goto err_alloc;
+
+ /* Update the variables to point into the current struct net */
+ for (i = 0; i < ARRAY_SIZE(ipv4_net_table) - 1; i++)
+ table[i].data += (void *)net - (void *)&init_net;
+ }
+
+ net->ipv4.ipv4_hdr = register_net_sysctl(net, "net/ipv4", table);
+ if (!net->ipv4.ipv4_hdr)
+ goto err_reg;
+
+ net->ipv4.sysctl_local_reserved_ports = kzalloc(65536 / 8, GFP_KERNEL);
+ if (!net->ipv4.sysctl_local_reserved_ports)
+ goto err_ports;
+
+ return 0;
+
+err_ports:
+ unregister_net_sysctl_table(net->ipv4.ipv4_hdr);
+err_reg:
+ if (!net_eq(net, &init_net))
+ kfree(table);
+err_alloc:
+ return -ENOMEM;
+}
+
+static __net_exit void ipv4_sysctl_exit_net(struct net *net)
+{
+ struct ctl_table *table;
+
+ kfree(net->ipv4.sysctl_local_reserved_ports);
+ table = net->ipv4.ipv4_hdr->ctl_table_arg;
+ unregister_net_sysctl_table(net->ipv4.ipv4_hdr);
+ kfree(table);
+}
+
+static __net_initdata struct pernet_operations ipv4_sysctl_ops = {
+ .init = ipv4_sysctl_init_net,
+ .exit = ipv4_sysctl_exit_net,
+};
+
+static __init int sysctl_ipv4_init(void)
+{
+ struct ctl_table_header *hdr;
+
+ hdr = register_net_sysctl(&init_net, "net/ipv4", ipv4_table);
+ if (!hdr)
+ return -ENOMEM;
+
+ if (register_pernet_subsys(&ipv4_sysctl_ops)) {
+ unregister_net_sysctl_table(hdr);
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+__initcall(sysctl_ipv4_init);
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
new file mode 100644
index 000000000..ca6faeb44
--- /dev/null
+++ b/net/ipv4/tcp.c
@@ -0,0 +1,3225 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * Implementation of the Transmission Control Protocol(TCP).
+ *
+ * Authors: Ross Biro
+ * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ * Mark Evans, <evansmp@uhura.aston.ac.uk>
+ * Corey Minyard <wf-rch!minyard@relay.EU.net>
+ * Florian La Roche, <flla@stud.uni-sb.de>
+ * Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
+ * Linus Torvalds, <torvalds@cs.helsinki.fi>
+ * Alan Cox, <gw4pts@gw4pts.ampr.org>
+ * Matthew Dillon, <dillon@apollo.west.oic.com>
+ * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
+ * Jorge Cwik, <jorge@laser.satlink.net>
+ *
+ * Fixes:
+ * Alan Cox : Numerous verify_area() calls
+ * Alan Cox : Set the ACK bit on a reset
+ * Alan Cox : Stopped it crashing if it closed while
+ * sk->inuse=1 and was trying to connect
+ * (tcp_err()).
+ * Alan Cox : All icmp error handling was broken
+ * pointers passed where wrong and the
+ * socket was looked up backwards. Nobody
+ * tested any icmp error code obviously.
+ * Alan Cox : tcp_err() now handled properly. It
+ * wakes people on errors. poll
+ * behaves and the icmp error race
+ * has gone by moving it into sock.c
+ * Alan Cox : tcp_send_reset() fixed to work for
+ * everything not just packets for
+ * unknown sockets.
+ * Alan Cox : tcp option processing.
+ * Alan Cox : Reset tweaked (still not 100%) [Had
+ * syn rule wrong]
+ * Herp Rosmanith : More reset fixes
+ * Alan Cox : No longer acks invalid rst frames.
+ * Acking any kind of RST is right out.
+ * Alan Cox : Sets an ignore me flag on an rst
+ * receive otherwise odd bits of prattle
+ * escape still
+ * Alan Cox : Fixed another acking RST frame bug.
+ * Should stop LAN workplace lockups.
+ * Alan Cox : Some tidyups using the new skb list
+ * facilities
+ * Alan Cox : sk->keepopen now seems to work
+ * Alan Cox : Pulls options out correctly on accepts
+ * Alan Cox : Fixed assorted sk->rqueue->next errors
+ * Alan Cox : PSH doesn't end a TCP read. Switched a
+ * bit to skb ops.
+ * Alan Cox : Tidied tcp_data to avoid a potential
+ * nasty.
+ * Alan Cox : Added some better commenting, as the
+ * tcp is hard to follow
+ * Alan Cox : Removed incorrect check for 20 * psh
+ * Michael O'Reilly : ack < copied bug fix.
+ * Johannes Stille : Misc tcp fixes (not all in yet).
+ * Alan Cox : FIN with no memory -> CRASH
+ * Alan Cox : Added socket option proto entries.
+ * Also added awareness of them to accept.
+ * Alan Cox : Added TCP options (SOL_TCP)
+ * Alan Cox : Switched wakeup calls to callbacks,
+ * so the kernel can layer network
+ * sockets.
+ * Alan Cox : Use ip_tos/ip_ttl settings.
+ * Alan Cox : Handle FIN (more) properly (we hope).
+ * Alan Cox : RST frames sent on unsynchronised
+ * state ack error.
+ * Alan Cox : Put in missing check for SYN bit.
+ * Alan Cox : Added tcp_select_window() aka NET2E
+ * window non shrink trick.
+ * Alan Cox : Added a couple of small NET2E timer
+ * fixes
+ * Charles Hedrick : TCP fixes
+ * Toomas Tamm : TCP window fixes
+ * Alan Cox : Small URG fix to rlogin ^C ack fight
+ * Charles Hedrick : Rewrote most of it to actually work
+ * Linus : Rewrote tcp_read() and URG handling
+ * completely
+ * Gerhard Koerting: Fixed some missing timer handling
+ * Matthew Dillon : Reworked TCP machine states as per RFC
+ * Gerhard Koerting: PC/TCP workarounds
+ * Adam Caldwell : Assorted timer/timing errors
+ * Matthew Dillon : Fixed another RST bug
+ * Alan Cox : Move to kernel side addressing changes.
+ * Alan Cox : Beginning work on TCP fastpathing
+ * (not yet usable)
+ * Arnt Gulbrandsen: Turbocharged tcp_check() routine.
+ * Alan Cox : TCP fast path debugging
+ * Alan Cox : Window clamping
+ * Michael Riepe : Bug in tcp_check()
+ * Matt Dillon : More TCP improvements and RST bug fixes
+ * Matt Dillon : Yet more small nasties remove from the
+ * TCP code (Be very nice to this man if
+ * tcp finally works 100%) 8)
+ * Alan Cox : BSD accept semantics.
+ * Alan Cox : Reset on closedown bug.
+ * Peter De Schrijver : ENOTCONN check missing in tcp_sendto().
+ * Michael Pall : Handle poll() after URG properly in
+ * all cases.
+ * Michael Pall : Undo the last fix in tcp_read_urg()
+ * (multi URG PUSH broke rlogin).
+ * Michael Pall : Fix the multi URG PUSH problem in
+ * tcp_readable(), poll() after URG
+ * works now.
+ * Michael Pall : recv(...,MSG_OOB) never blocks in the
+ * BSD api.
+ * Alan Cox : Changed the semantics of sk->socket to
+ * fix a race and a signal problem with
+ * accept() and async I/O.
+ * Alan Cox : Relaxed the rules on tcp_sendto().
+ * Yury Shevchuk : Really fixed accept() blocking problem.
+ * Craig I. Hagan : Allow for BSD compatible TIME_WAIT for
+ * clients/servers which listen in on
+ * fixed ports.
+ * Alan Cox : Cleaned the above up and shrank it to
+ * a sensible code size.
+ * Alan Cox : Self connect lockup fix.
+ * Alan Cox : No connect to multicast.
+ * Ross Biro : Close unaccepted children on master
+ * socket close.
+ * Alan Cox : Reset tracing code.
+ * Alan Cox : Spurious resets on shutdown.
+ * Alan Cox : Giant 15 minute/60 second timer error
+ * Alan Cox : Small whoops in polling before an
+ * accept.
+ * Alan Cox : Kept the state trace facility since
+ * it's handy for debugging.
+ * Alan Cox : More reset handler fixes.
+ * Alan Cox : Started rewriting the code based on
+ * the RFC's for other useful protocol
+ * references see: Comer, KA9Q NOS, and
+ * for a reference on the difference
+ * between specifications and how BSD
+ * works see the 4.4lite source.
+ * A.N.Kuznetsov : Don't time wait on completion of tidy
+ * close.
+ * Linus Torvalds : Fin/Shutdown & copied_seq changes.
+ * Linus Torvalds : Fixed BSD port reuse to work first syn
+ * Alan Cox : Reimplemented timers as per the RFC
+ * and using multiple timers for sanity.
+ * Alan Cox : Small bug fixes, and a lot of new
+ * comments.
+ * Alan Cox : Fixed dual reader crash by locking
+ * the buffers (much like datagram.c)
+ * Alan Cox : Fixed stuck sockets in probe. A probe
+ * now gets fed up of retrying without
+ * (even a no space) answer.
+ * Alan Cox : Extracted closing code better
+ * Alan Cox : Fixed the closing state machine to
+ * resemble the RFC.
+ * Alan Cox : More 'per spec' fixes.
+ * Jorge Cwik : Even faster checksumming.
+ * Alan Cox : tcp_data() doesn't ack illegal PSH
+ * only frames. At least one pc tcp stack
+ * generates them.
+ * Alan Cox : Cache last socket.
+ * Alan Cox : Per route irtt.
+ * Matt Day : poll()->select() match BSD precisely on error
+ * Alan Cox : New buffers
+ * Marc Tamsky : Various sk->prot->retransmits and
+ * sk->retransmits misupdating fixed.
+ * Fixed tcp_write_timeout: stuck close,
+ * and TCP syn retries gets used now.
+ * Mark Yarvis : In tcp_read_wakeup(), don't send an
+ * ack if state is TCP_CLOSED.
+ * Alan Cox : Look up device on a retransmit - routes may
+ * change. Doesn't yet cope with MSS shrink right
+ * but it's a start!
+ * Marc Tamsky : Closing in closing fixes.
+ * Mike Shaver : RFC1122 verifications.
+ * Alan Cox : rcv_saddr errors.
+ * Alan Cox : Block double connect().
+ * Alan Cox : Small hooks for enSKIP.
+ * Alexey Kuznetsov: Path MTU discovery.
+ * Alan Cox : Support soft errors.
+ * Alan Cox : Fix MTU discovery pathological case
+ * when the remote claims no mtu!
+ * Marc Tamsky : TCP_CLOSE fix.
+ * Colin (G3TNE) : Send a reset on syn ack replies in
+ * window but wrong (fixes NT lpd problems)
+ * Pedro Roque : Better TCP window handling, delayed ack.
+ * Joerg Reuter : No modification of locked buffers in
+ * tcp_do_retransmit()
+ * Eric Schenk : Changed receiver side silly window
+ * avoidance algorithm to BSD style
+ * algorithm. This doubles throughput
+ * against machines running Solaris,
+ * and seems to result in general
+ * improvement.
+ * Stefan Magdalinski : adjusted tcp_readable() to fix FIONREAD
+ * Willy Konynenberg : Transparent proxying support.
+ * Mike McLagan : Routing by source
+ * Keith Owens : Do proper merging with partial SKB's in
+ * tcp_do_sendmsg to avoid burstiness.
+ * Eric Schenk : Fix fast close down bug with
+ * shutdown() followed by close().
+ * Andi Kleen : Make poll agree with SIGIO
+ * Salvatore Sanfilippo : Support SO_LINGER with linger == 1 and
+ * lingertime == 0 (RFC 793 ABORT Call)
+ * Hirokazu Takahashi : Use copy_from_user() instead of
+ * csum_and_copy_from_user() if possible.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or(at your option) any later version.
+ *
+ * Description of States:
+ *
+ * TCP_SYN_SENT sent a connection request, waiting for ack
+ *
+ * TCP_SYN_RECV received a connection request, sent ack,
+ * waiting for final ack in three-way handshake.
+ *
+ * TCP_ESTABLISHED connection established
+ *
+ * TCP_FIN_WAIT1 our side has shutdown, waiting to complete
+ * transmission of remaining buffered data
+ *
+ * TCP_FIN_WAIT2 all buffered data sent, waiting for remote
+ * to shutdown
+ *
+ * TCP_CLOSING both sides have shutdown but we still have
+ * data we have to finish sending
+ *
+ * TCP_TIME_WAIT timeout to catch resent junk before entering
+ * closed, can only be entered from FIN_WAIT2
+ * or CLOSING. Required because the other end
+ * may not have gotten our last ACK causing it
+ * to retransmit the data packet (which we ignore)
+ *
+ * TCP_CLOSE_WAIT remote side has shutdown and is waiting for
+ * us to finish writing our data and to shutdown
+ * (we have to close() to move on to LAST_ACK)
+ *
+ * TCP_LAST_ACK out side has shutdown after remote has
+ * shutdown. There may still be data in our
+ * buffer that we have to finish sending
+ *
+ * TCP_CLOSE socket is finished
+ */
+
+#define pr_fmt(fmt) "TCP: " fmt
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/fcntl.h>
+#include <linux/poll.h>
+#include <linux/inet_diag.h>
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/skbuff.h>
+#include <linux/scatterlist.h>
+#include <linux/splice.h>
+#include <linux/net.h>
+#include <linux/socket.h>
+#include <linux/random.h>
+#include <linux/bootmem.h>
+#include <linux/highmem.h>
+#include <linux/swap.h>
+#include <linux/cache.h>
+#include <linux/err.h>
+#include <linux/crypto.h>
+#include <linux/time.h>
+#include <linux/slab.h>
+
+#include <net/icmp.h>
+#include <net/inet_common.h>
+#include <net/tcp.h>
+#include <net/xfrm.h>
+#include <net/ip.h>
+#include <net/sock.h>
+
+#include <asm/uaccess.h>
+#include <asm/ioctls.h>
+#include <net/busy_poll.h>
+
+int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT;
+
+int sysctl_tcp_min_tso_segs __read_mostly = 2;
+
+int sysctl_tcp_autocorking __read_mostly = 1;
+
+struct percpu_counter tcp_orphan_count;
+EXPORT_SYMBOL_GPL(tcp_orphan_count);
+
+long sysctl_tcp_mem[3] __read_mostly;
+int sysctl_tcp_wmem[3] __read_mostly;
+int sysctl_tcp_rmem[3] __read_mostly;
+
+EXPORT_SYMBOL(sysctl_tcp_mem);
+EXPORT_SYMBOL(sysctl_tcp_rmem);
+EXPORT_SYMBOL(sysctl_tcp_wmem);
+
+atomic_long_t tcp_memory_allocated; /* Current allocated memory. */
+EXPORT_SYMBOL(tcp_memory_allocated);
+
+/*
+ * Current number of TCP sockets.
+ */
+struct percpu_counter tcp_sockets_allocated;
+EXPORT_SYMBOL(tcp_sockets_allocated);
+
+/*
+ * TCP splice context
+ */
+struct tcp_splice_state {
+ struct pipe_inode_info *pipe;
+ size_t len;
+ unsigned int flags;
+};
+
+/*
+ * Pressure flag: try to collapse.
+ * Technical note: it is used by multiple contexts non atomically.
+ * All the __sk_mem_schedule() is of this nature: accounting
+ * is strict, actions are advisory and have some latency.
+ */
+int tcp_memory_pressure __read_mostly;
+EXPORT_SYMBOL(tcp_memory_pressure);
+
+void tcp_enter_memory_pressure(struct sock *sk)
+{
+ if (!tcp_memory_pressure) {
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURES);
+ tcp_memory_pressure = 1;
+ }
+}
+EXPORT_SYMBOL(tcp_enter_memory_pressure);
+
+/* Convert seconds to retransmits based on initial and max timeout */
+static u8 secs_to_retrans(int seconds, int timeout, int rto_max)
+{
+ u8 res = 0;
+
+ if (seconds > 0) {
+ int period = timeout;
+
+ res = 1;
+ while (seconds > period && res < 255) {
+ res++;
+ timeout <<= 1;
+ if (timeout > rto_max)
+ timeout = rto_max;
+ period += timeout;
+ }
+ }
+ return res;
+}
+
+/* Convert retransmits to seconds based on initial and max timeout */
+static int retrans_to_secs(u8 retrans, int timeout, int rto_max)
+{
+ int period = 0;
+
+ if (retrans > 0) {
+ period = timeout;
+ while (--retrans) {
+ timeout <<= 1;
+ if (timeout > rto_max)
+ timeout = rto_max;
+ period += timeout;
+ }
+ }
+ return period;
+}
+
+/* Address-family independent initialization for a tcp_sock.
+ *
+ * NOTE: A lot of things set to zero explicitly by call to
+ * sk_alloc() so need not be done here.
+ */
+void tcp_init_sock(struct sock *sk)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ __skb_queue_head_init(&tp->out_of_order_queue);
+ tcp_init_xmit_timers(sk);
+ tcp_prequeue_init(tp);
+ INIT_LIST_HEAD(&tp->tsq_node);
+
+ icsk->icsk_rto = TCP_TIMEOUT_INIT;
+ tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
+
+ /* So many TCP implementations out there (incorrectly) count the
+ * initial SYN frame in their delayed-ACK and congestion control
+ * algorithms that we must have the following bandaid to talk
+ * efficiently to them. -DaveM
+ */
+ tp->snd_cwnd = TCP_INIT_CWND;
+
+ /* See draft-stevens-tcpca-spec-01 for discussion of the
+ * initialization of these values.
+ */
+ tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
+ tp->snd_cwnd_clamp = ~0;
+ tp->mss_cache = TCP_MSS_DEFAULT;
+ u64_stats_init(&tp->syncp);
+
+ tp->reordering = sysctl_tcp_reordering;
+ tcp_enable_early_retrans(tp);
+ tcp_assign_congestion_control(sk);
+
+ tp->tsoffset = 0;
+
+ sk->sk_state = TCP_CLOSE;
+
+ sk->sk_write_space = sk_stream_write_space;
+ sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
+
+ icsk->icsk_sync_mss = tcp_sync_mss;
+
+ sk->sk_sndbuf = sysctl_tcp_wmem[1];
+ sk->sk_rcvbuf = sysctl_tcp_rmem[1];
+
+ local_bh_disable();
+ sock_update_memcg(sk);
+ sk_sockets_allocated_inc(sk);
+ local_bh_enable();
+}
+EXPORT_SYMBOL(tcp_init_sock);
+
+static void tcp_tx_timestamp(struct sock *sk, struct sk_buff *skb)
+{
+ if (sk->sk_tsflags) {
+ struct skb_shared_info *shinfo = skb_shinfo(skb);
+
+ sock_tx_timestamp(sk, &shinfo->tx_flags);
+ if (shinfo->tx_flags & SKBTX_ANY_TSTAMP)
+ shinfo->tskey = TCP_SKB_CB(skb)->seq + skb->len - 1;
+ }
+}
+
+/*
+ * Wait for a TCP event.
+ *
+ * Note that we don't need to lock the socket, as the upper poll layers
+ * take care of normal races (between the test and the event) and we don't
+ * go look at any of the socket buffers directly.
+ */
+unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
+{
+ unsigned int mask;
+ struct sock *sk = sock->sk;
+ const struct tcp_sock *tp = tcp_sk(sk);
+
+ sock_rps_record_flow(sk);
+
+ sock_poll_wait(file, sk_sleep(sk), wait);
+ if (sk->sk_state == TCP_LISTEN)
+ return inet_csk_listen_poll(sk);
+
+ /* Socket is not locked. We are protected from async events
+ * by poll logic and correct handling of state changes
+ * made by other threads is impossible in any case.
+ */
+
+ mask = 0;
+
+ /*
+ * POLLHUP is certainly not done right. But poll() doesn't
+ * have a notion of HUP in just one direction, and for a
+ * socket the read side is more interesting.
+ *
+ * Some poll() documentation says that POLLHUP is incompatible
+ * with the POLLOUT/POLLWR flags, so somebody should check this
+ * all. But careful, it tends to be safer to return too many
+ * bits than too few, and you can easily break real applications
+ * if you don't tell them that something has hung up!
+ *
+ * Check-me.
+ *
+ * Check number 1. POLLHUP is _UNMASKABLE_ event (see UNIX98 and
+ * our fs/select.c). It means that after we received EOF,
+ * poll always returns immediately, making impossible poll() on write()
+ * in state CLOSE_WAIT. One solution is evident --- to set POLLHUP
+ * if and only if shutdown has been made in both directions.
+ * Actually, it is interesting to look how Solaris and DUX
+ * solve this dilemma. I would prefer, if POLLHUP were maskable,
+ * then we could set it on SND_SHUTDOWN. BTW examples given
+ * in Stevens' books assume exactly this behaviour, it explains
+ * why POLLHUP is incompatible with POLLOUT. --ANK
+ *
+ * NOTE. Check for TCP_CLOSE is added. The goal is to prevent
+ * blocking on fresh not-connected or disconnected socket. --ANK
+ */
+ if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE)
+ mask |= POLLHUP;
+ if (sk->sk_shutdown & RCV_SHUTDOWN)
+ mask |= POLLIN | POLLRDNORM | POLLRDHUP;
+
+ /* Connected or passive Fast Open socket? */
+ if (sk->sk_state != TCP_SYN_SENT &&
+ (sk->sk_state != TCP_SYN_RECV || tp->fastopen_rsk)) {
+ int target = sock_rcvlowat(sk, 0, INT_MAX);
+
+ if (tp->urg_seq == tp->copied_seq &&
+ !sock_flag(sk, SOCK_URGINLINE) &&
+ tp->urg_data)
+ target++;
+
+ /* Potential race condition. If read of tp below will
+ * escape above sk->sk_state, we can be illegally awaken
+ * in SYN_* states. */
+ if (tp->rcv_nxt - tp->copied_seq >= target)
+ mask |= POLLIN | POLLRDNORM;
+
+ if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
+ if (sk_stream_is_writeable(sk)) {
+ mask |= POLLOUT | POLLWRNORM;
+ } else { /* send SIGIO later */
+ set_bit(SOCK_ASYNC_NOSPACE,
+ &sk->sk_socket->flags);
+ set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+
+ /* Race breaker. If space is freed after
+ * wspace test but before the flags are set,
+ * IO signal will be lost. Memory barrier
+ * pairs with the input side.
+ */
+ smp_mb__after_atomic();
+ if (sk_stream_is_writeable(sk))
+ mask |= POLLOUT | POLLWRNORM;
+ }
+ } else
+ mask |= POLLOUT | POLLWRNORM;
+
+ if (tp->urg_data & TCP_URG_VALID)
+ mask |= POLLPRI;
+ }
+ /* This barrier is coupled with smp_wmb() in tcp_reset() */
+ smp_rmb();
+ if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
+ mask |= POLLERR;
+
+ return mask;
+}
+EXPORT_SYMBOL(tcp_poll);
+
+int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ int answ;
+ bool slow;
+
+ switch (cmd) {
+ case SIOCINQ:
+ if (sk->sk_state == TCP_LISTEN)
+ return -EINVAL;
+
+ slow = lock_sock_fast(sk);
+ if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
+ answ = 0;
+ else if (sock_flag(sk, SOCK_URGINLINE) ||
+ !tp->urg_data ||
+ before(tp->urg_seq, tp->copied_seq) ||
+ !before(tp->urg_seq, tp->rcv_nxt)) {
+
+ answ = tp->rcv_nxt - tp->copied_seq;
+
+ /* Subtract 1, if FIN was received */
+ if (answ && sock_flag(sk, SOCK_DONE))
+ answ--;
+ } else
+ answ = tp->urg_seq - tp->copied_seq;
+ unlock_sock_fast(sk, slow);
+ break;
+ case SIOCATMARK:
+ answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
+ break;
+ case SIOCOUTQ:
+ if (sk->sk_state == TCP_LISTEN)
+ return -EINVAL;
+
+ if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
+ answ = 0;
+ else
+ answ = tp->write_seq - tp->snd_una;
+ break;
+ case SIOCOUTQNSD:
+ if (sk->sk_state == TCP_LISTEN)
+ return -EINVAL;
+
+ if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
+ answ = 0;
+ else
+ answ = tp->write_seq - tp->snd_nxt;
+ break;
+ default:
+ return -ENOIOCTLCMD;
+ }
+
+ return put_user(answ, (int __user *)arg);
+}
+EXPORT_SYMBOL(tcp_ioctl);
+
+static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
+{
+ TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
+ tp->pushed_seq = tp->write_seq;
+}
+
+static inline bool forced_push(const struct tcp_sock *tp)
+{
+ return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
+}
+
+static void skb_entail(struct sock *sk, struct sk_buff *skb)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
+
+ skb->csum = 0;
+ tcb->seq = tcb->end_seq = tp->write_seq;
+ tcb->tcp_flags = TCPHDR_ACK;
+ tcb->sacked = 0;
+ __skb_header_release(skb);
+ tcp_add_write_queue_tail(sk, skb);
+ sk->sk_wmem_queued += skb->truesize;
+ sk_mem_charge(sk, skb->truesize);
+ if (tp->nonagle & TCP_NAGLE_PUSH)
+ tp->nonagle &= ~TCP_NAGLE_PUSH;
+}
+
+static inline void tcp_mark_urg(struct tcp_sock *tp, int flags)
+{
+ if (flags & MSG_OOB)
+ tp->snd_up = tp->write_seq;
+}
+
+/* If a not yet filled skb is pushed, do not send it if
+ * we have data packets in Qdisc or NIC queues :
+ * Because TX completion will happen shortly, it gives a chance
+ * to coalesce future sendmsg() payload into this skb, without
+ * need for a timer, and with no latency trade off.
+ * As packets containing data payload have a bigger truesize
+ * than pure acks (dataless) packets, the last checks prevent
+ * autocorking if we only have an ACK in Qdisc/NIC queues,
+ * or if TX completion was delayed after we processed ACK packet.
+ */
+static bool tcp_should_autocork(struct sock *sk, struct sk_buff *skb,
+ int size_goal)
+{
+ return skb->len < size_goal &&
+ sysctl_tcp_autocorking &&
+ skb != tcp_write_queue_head(sk) &&
+ atomic_read(&sk->sk_wmem_alloc) > skb->truesize;
+}
+
+static void tcp_push(struct sock *sk, int flags, int mss_now,
+ int nonagle, int size_goal)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *skb;
+
+ if (!tcp_send_head(sk))
+ return;
+
+ skb = tcp_write_queue_tail(sk);
+ if (!(flags & MSG_MORE) || forced_push(tp))
+ tcp_mark_push(tp, skb);
+
+ tcp_mark_urg(tp, flags);
+
+ if (tcp_should_autocork(sk, skb, size_goal)) {
+
+ /* avoid atomic op if TSQ_THROTTLED bit is already set */
+ if (!test_bit(TSQ_THROTTLED, &tp->tsq_flags)) {
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAUTOCORKING);
+ set_bit(TSQ_THROTTLED, &tp->tsq_flags);
+ }
+ /* It is possible TX completion already happened
+ * before we set TSQ_THROTTLED.
+ */
+ if (atomic_read(&sk->sk_wmem_alloc) > skb->truesize)
+ return;
+ }
+
+ if (flags & MSG_MORE)
+ nonagle = TCP_NAGLE_CORK;
+
+ __tcp_push_pending_frames(sk, mss_now, nonagle);
+}
+
+static int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb,
+ unsigned int offset, size_t len)
+{
+ struct tcp_splice_state *tss = rd_desc->arg.data;
+ int ret;
+
+ ret = skb_splice_bits(skb, offset, tss->pipe, min(rd_desc->count, len),
+ tss->flags);
+ if (ret > 0)
+ rd_desc->count -= ret;
+ return ret;
+}
+
+static int __tcp_splice_read(struct sock *sk, struct tcp_splice_state *tss)
+{
+ /* Store TCP splice context information in read_descriptor_t. */
+ read_descriptor_t rd_desc = {
+ .arg.data = tss,
+ .count = tss->len,
+ };
+
+ return tcp_read_sock(sk, &rd_desc, tcp_splice_data_recv);
+}
+
+/**
+ * tcp_splice_read - splice data from TCP socket to a pipe
+ * @sock: socket to splice from
+ * @ppos: position (not valid)
+ * @pipe: pipe to splice to
+ * @len: number of bytes to splice
+ * @flags: splice modifier flags
+ *
+ * Description:
+ * Will read pages from given socket and fill them into a pipe.
+ *
+ **/
+ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos,
+ struct pipe_inode_info *pipe, size_t len,
+ unsigned int flags)
+{
+ struct sock *sk = sock->sk;
+ struct tcp_splice_state tss = {
+ .pipe = pipe,
+ .len = len,
+ .flags = flags,
+ };
+ long timeo;
+ ssize_t spliced;
+ int ret;
+
+ sock_rps_record_flow(sk);
+ /*
+ * We can't seek on a socket input
+ */
+ if (unlikely(*ppos))
+ return -ESPIPE;
+
+ ret = spliced = 0;
+
+ lock_sock(sk);
+
+ timeo = sock_rcvtimeo(sk, sock->file->f_flags & O_NONBLOCK);
+ while (tss.len) {
+ ret = __tcp_splice_read(sk, &tss);
+ if (ret < 0)
+ break;
+ else if (!ret) {
+ if (spliced)
+ break;
+ if (sock_flag(sk, SOCK_DONE))
+ break;
+ if (sk->sk_err) {
+ ret = sock_error(sk);
+ break;
+ }
+ if (sk->sk_shutdown & RCV_SHUTDOWN)
+ break;
+ if (sk->sk_state == TCP_CLOSE) {
+ /*
+ * This occurs when user tries to read
+ * from never connected socket.
+ */
+ if (!sock_flag(sk, SOCK_DONE))
+ ret = -ENOTCONN;
+ break;
+ }
+ if (!timeo) {
+ ret = -EAGAIN;
+ break;
+ }
+ sk_wait_data(sk, &timeo);
+ if (signal_pending(current)) {
+ ret = sock_intr_errno(timeo);
+ break;
+ }
+ continue;
+ }
+ tss.len -= ret;
+ spliced += ret;
+
+ if (!timeo)
+ break;
+ release_sock(sk);
+ lock_sock(sk);
+
+ if (sk->sk_err || sk->sk_state == TCP_CLOSE ||
+ (sk->sk_shutdown & RCV_SHUTDOWN) ||
+ signal_pending(current))
+ break;
+ }
+
+ release_sock(sk);
+
+ if (spliced)
+ return spliced;
+
+ return ret;
+}
+EXPORT_SYMBOL(tcp_splice_read);
+
+struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp)
+{
+ struct sk_buff *skb;
+
+ /* The TCP header must be at least 32-bit aligned. */
+ size = ALIGN(size, 4);
+
+ skb = alloc_skb_fclone(size + sk->sk_prot->max_header, gfp);
+ if (skb) {
+ if (sk_wmem_schedule(sk, skb->truesize)) {
+ skb_reserve(skb, sk->sk_prot->max_header);
+ /*
+ * Make sure that we have exactly size bytes
+ * available to the caller, no more, no less.
+ */
+ skb->reserved_tailroom = skb->end - skb->tail - size;
+ return skb;
+ }
+ __kfree_skb(skb);
+ } else {
+ sk->sk_prot->enter_memory_pressure(sk);
+ sk_stream_moderate_sndbuf(sk);
+ }
+ return NULL;
+}
+
+static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
+ int large_allowed)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ u32 new_size_goal, size_goal;
+
+ if (!large_allowed || !sk_can_gso(sk))
+ return mss_now;
+
+ /* Note : tcp_tso_autosize() will eventually split this later */
+ new_size_goal = sk->sk_gso_max_size - 1 - MAX_TCP_HEADER;
+ new_size_goal = tcp_bound_to_half_wnd(tp, new_size_goal);
+
+ /* We try hard to avoid divides here */
+ size_goal = tp->gso_segs * mss_now;
+ if (unlikely(new_size_goal < size_goal ||
+ new_size_goal >= size_goal + mss_now)) {
+ tp->gso_segs = min_t(u16, new_size_goal / mss_now,
+ sk->sk_gso_max_segs);
+ size_goal = tp->gso_segs * mss_now;
+ }
+
+ return max(size_goal, mss_now);
+}
+
+static int tcp_send_mss(struct sock *sk, int *size_goal, int flags)
+{
+ int mss_now;
+
+ mss_now = tcp_current_mss(sk);
+ *size_goal = tcp_xmit_size_goal(sk, mss_now, !(flags & MSG_OOB));
+
+ return mss_now;
+}
+
+static ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
+ size_t size, int flags)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ int mss_now, size_goal;
+ int err;
+ ssize_t copied;
+ long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
+
+ /* Wait for a connection to finish. One exception is TCP Fast Open
+ * (passive side) where data is allowed to be sent before a connection
+ * is fully established.
+ */
+ if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
+ !tcp_passive_fastopen(sk)) {
+ if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
+ goto out_err;
+ }
+
+ clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+
+ mss_now = tcp_send_mss(sk, &size_goal, flags);
+ copied = 0;
+
+ err = -EPIPE;
+ if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
+ goto out_err;
+
+ while (size > 0) {
+ struct sk_buff *skb = tcp_write_queue_tail(sk);
+ int copy, i;
+ bool can_coalesce;
+
+ if (!tcp_send_head(sk) || (copy = size_goal - skb->len) <= 0) {
+new_segment:
+ if (!sk_stream_memory_free(sk))
+ goto wait_for_sndbuf;
+
+ skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation);
+ if (!skb)
+ goto wait_for_memory;
+
+ skb_entail(sk, skb);
+ copy = size_goal;
+ }
+
+ if (copy > size)
+ copy = size;
+
+ i = skb_shinfo(skb)->nr_frags;
+ can_coalesce = skb_can_coalesce(skb, i, page, offset);
+ if (!can_coalesce && i >= MAX_SKB_FRAGS) {
+ tcp_mark_push(tp, skb);
+ goto new_segment;
+ }
+ if (!sk_wmem_schedule(sk, copy))
+ goto wait_for_memory;
+
+ if (can_coalesce) {
+ skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
+ } else {
+ get_page(page);
+ skb_fill_page_desc(skb, i, page, offset, copy);
+ }
+ skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG;
+
+ skb->len += copy;
+ skb->data_len += copy;
+ skb->truesize += copy;
+ sk->sk_wmem_queued += copy;
+ sk_mem_charge(sk, copy);
+ skb->ip_summed = CHECKSUM_PARTIAL;
+ tp->write_seq += copy;
+ TCP_SKB_CB(skb)->end_seq += copy;
+ tcp_skb_pcount_set(skb, 0);
+
+ if (!copied)
+ TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;
+
+ copied += copy;
+ offset += copy;
+ if (!(size -= copy)) {
+ tcp_tx_timestamp(sk, skb);
+ goto out;
+ }
+
+ if (skb->len < size_goal || (flags & MSG_OOB))
+ continue;
+
+ if (forced_push(tp)) {
+ tcp_mark_push(tp, skb);
+ __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
+ } else if (skb == tcp_send_head(sk))
+ tcp_push_one(sk, mss_now);
+ continue;
+
+wait_for_sndbuf:
+ set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+wait_for_memory:
+ tcp_push(sk, flags & ~MSG_MORE, mss_now,
+ TCP_NAGLE_PUSH, size_goal);
+
+ if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
+ goto do_error;
+
+ mss_now = tcp_send_mss(sk, &size_goal, flags);
+ }
+
+out:
+ if (copied && !(flags & MSG_SENDPAGE_NOTLAST))
+ tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);
+ return copied;
+
+do_error:
+ if (copied)
+ goto out;
+out_err:
+ return sk_stream_error(sk, flags, err);
+}
+
+int tcp_sendpage(struct sock *sk, struct page *page, int offset,
+ size_t size, int flags)
+{
+ ssize_t res;
+
+ if (!(sk->sk_route_caps & NETIF_F_SG) ||
+ !(sk->sk_route_caps & NETIF_F_ALL_CSUM))
+ return sock_no_sendpage(sk->sk_socket, page, offset, size,
+ flags);
+
+ lock_sock(sk);
+ res = do_tcp_sendpages(sk, page, offset, size, flags);
+ release_sock(sk);
+ return res;
+}
+EXPORT_SYMBOL(tcp_sendpage);
+
+static inline int select_size(const struct sock *sk, bool sg)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ int tmp = tp->mss_cache;
+
+ if (sg) {
+ if (sk_can_gso(sk)) {
+ /* Small frames wont use a full page:
+ * Payload will immediately follow tcp header.
+ */
+ tmp = SKB_WITH_OVERHEAD(2048 - MAX_TCP_HEADER);
+ } else {
+ int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
+
+ if (tmp >= pgbreak &&
+ tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE)
+ tmp = pgbreak;
+ }
+ }
+
+ return tmp;
+}
+
+void tcp_free_fastopen_req(struct tcp_sock *tp)
+{
+ if (tp->fastopen_req) {
+ kfree(tp->fastopen_req);
+ tp->fastopen_req = NULL;
+ }
+}
+
+static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg,
+ int *copied, size_t size)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ int err, flags;
+
+ if (!(sysctl_tcp_fastopen & TFO_CLIENT_ENABLE))
+ return -EOPNOTSUPP;
+ if (tp->fastopen_req)
+ return -EALREADY; /* Another Fast Open is in progress */
+
+ tp->fastopen_req = kzalloc(sizeof(struct tcp_fastopen_request),
+ sk->sk_allocation);
+ if (unlikely(!tp->fastopen_req))
+ return -ENOBUFS;
+ tp->fastopen_req->data = msg;
+ tp->fastopen_req->size = size;
+
+ flags = (msg->msg_flags & MSG_DONTWAIT) ? O_NONBLOCK : 0;
+ err = __inet_stream_connect(sk->sk_socket, msg->msg_name,
+ msg->msg_namelen, flags);
+ *copied = tp->fastopen_req->copied;
+ tcp_free_fastopen_req(tp);
+ return err;
+}
+
+int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *skb;
+ int flags, err, copied = 0;
+ int mss_now = 0, size_goal, copied_syn = 0;
+ bool sg;
+ long timeo;
+
+ lock_sock(sk);
+
+ flags = msg->msg_flags;
+ if (flags & MSG_FASTOPEN) {
+ err = tcp_sendmsg_fastopen(sk, msg, &copied_syn, size);
+ if (err == -EINPROGRESS && copied_syn > 0)
+ goto out;
+ else if (err)
+ goto out_err;
+ }
+
+ timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
+
+ /* Wait for a connection to finish. One exception is TCP Fast Open
+ * (passive side) where data is allowed to be sent before a connection
+ * is fully established.
+ */
+ if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
+ !tcp_passive_fastopen(sk)) {
+ if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
+ goto do_error;
+ }
+
+ if (unlikely(tp->repair)) {
+ if (tp->repair_queue == TCP_RECV_QUEUE) {
+ copied = tcp_send_rcvq(sk, msg, size);
+ goto out_nopush;
+ }
+
+ err = -EINVAL;
+ if (tp->repair_queue == TCP_NO_QUEUE)
+ goto out_err;
+
+ /* 'common' sending to sendq */
+ }
+
+ /* This should be in poll */
+ clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+
+ mss_now = tcp_send_mss(sk, &size_goal, flags);
+
+ /* Ok commence sending. */
+ copied = 0;
+
+ err = -EPIPE;
+ if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
+ goto out_err;
+
+ sg = !!(sk->sk_route_caps & NETIF_F_SG);
+
+ while (msg_data_left(msg)) {
+ int copy = 0;
+ int max = size_goal;
+
+ skb = tcp_write_queue_tail(sk);
+ if (tcp_send_head(sk)) {
+ if (skb->ip_summed == CHECKSUM_NONE)
+ max = mss_now;
+ copy = max - skb->len;
+ }
+
+ if (copy <= 0) {
+new_segment:
+ /* Allocate new segment. If the interface is SG,
+ * allocate skb fitting to single page.
+ */
+ if (!sk_stream_memory_free(sk))
+ goto wait_for_sndbuf;
+
+ skb = sk_stream_alloc_skb(sk,
+ select_size(sk, sg),
+ sk->sk_allocation);
+ if (!skb)
+ goto wait_for_memory;
+
+ /*
+ * Check whether we can use HW checksum.
+ */
+ if (sk->sk_route_caps & NETIF_F_ALL_CSUM)
+ skb->ip_summed = CHECKSUM_PARTIAL;
+
+ skb_entail(sk, skb);
+ copy = size_goal;
+ max = size_goal;
+
+ /* All packets are restored as if they have
+ * already been sent. skb_mstamp isn't set to
+ * avoid wrong rtt estimation.
+ */
+ if (tp->repair)
+ TCP_SKB_CB(skb)->sacked |= TCPCB_REPAIRED;
+ }
+
+ /* Try to append data to the end of skb. */
+ if (copy > msg_data_left(msg))
+ copy = msg_data_left(msg);
+
+ /* Where to copy to? */
+ if (skb_availroom(skb) > 0) {
+ /* We have some space in skb head. Superb! */
+ copy = min_t(int, copy, skb_availroom(skb));
+ err = skb_add_data_nocache(sk, skb, &msg->msg_iter, copy);
+ if (err)
+ goto do_fault;
+ } else {
+ bool merge = true;
+ int i = skb_shinfo(skb)->nr_frags;
+ struct page_frag *pfrag = sk_page_frag(sk);
+
+ if (!sk_page_frag_refill(sk, pfrag))
+ goto wait_for_memory;
+
+ if (!skb_can_coalesce(skb, i, pfrag->page,
+ pfrag->offset)) {
+ if (i == MAX_SKB_FRAGS || !sg) {
+ tcp_mark_push(tp, skb);
+ goto new_segment;
+ }
+ merge = false;
+ }
+
+ copy = min_t(int, copy, pfrag->size - pfrag->offset);
+
+ if (!sk_wmem_schedule(sk, copy))
+ goto wait_for_memory;
+
+ err = skb_copy_to_page_nocache(sk, &msg->msg_iter, skb,
+ pfrag->page,
+ pfrag->offset,
+ copy);
+ if (err)
+ goto do_error;
+
+ /* Update the skb. */
+ if (merge) {
+ skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
+ } else {
+ skb_fill_page_desc(skb, i, pfrag->page,
+ pfrag->offset, copy);
+ get_page(pfrag->page);
+ }
+ pfrag->offset += copy;
+ }
+
+ if (!copied)
+ TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;
+
+ tp->write_seq += copy;
+ TCP_SKB_CB(skb)->end_seq += copy;
+ tcp_skb_pcount_set(skb, 0);
+
+ copied += copy;
+ if (!msg_data_left(msg)) {
+ tcp_tx_timestamp(sk, skb);
+ goto out;
+ }
+
+ if (skb->len < max || (flags & MSG_OOB) || unlikely(tp->repair))
+ continue;
+
+ if (forced_push(tp)) {
+ tcp_mark_push(tp, skb);
+ __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
+ } else if (skb == tcp_send_head(sk))
+ tcp_push_one(sk, mss_now);
+ continue;
+
+wait_for_sndbuf:
+ set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+wait_for_memory:
+ if (copied)
+ tcp_push(sk, flags & ~MSG_MORE, mss_now,
+ TCP_NAGLE_PUSH, size_goal);
+
+ if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
+ goto do_error;
+
+ mss_now = tcp_send_mss(sk, &size_goal, flags);
+ }
+
+out:
+ if (copied)
+ tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);
+out_nopush:
+ release_sock(sk);
+ return copied + copied_syn;
+
+do_fault:
+ if (!skb->len) {
+ tcp_unlink_write_queue(skb, sk);
+ /* It is the one place in all of TCP, except connection
+ * reset, where we can be unlinking the send_head.
+ */
+ tcp_check_send_head(sk, skb);
+ sk_wmem_free_skb(sk, skb);
+ }
+
+do_error:
+ if (copied + copied_syn)
+ goto out;
+out_err:
+ err = sk_stream_error(sk, flags, err);
+ release_sock(sk);
+ return err;
+}
+EXPORT_SYMBOL(tcp_sendmsg);
+
+/*
+ * Handle reading urgent data. BSD has very simple semantics for
+ * this, no blocking and very strange errors 8)
+ */
+
+static int tcp_recv_urg(struct sock *sk, struct msghdr *msg, int len, int flags)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ /* No URG data to read. */
+ if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data ||
+ tp->urg_data == TCP_URG_READ)
+ return -EINVAL; /* Yes this is right ! */
+
+ if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE))
+ return -ENOTCONN;
+
+ if (tp->urg_data & TCP_URG_VALID) {
+ int err = 0;
+ char c = tp->urg_data;
+
+ if (!(flags & MSG_PEEK))
+ tp->urg_data = TCP_URG_READ;
+
+ /* Read urgent data. */
+ msg->msg_flags |= MSG_OOB;
+
+ if (len > 0) {
+ if (!(flags & MSG_TRUNC))
+ err = memcpy_to_msg(msg, &c, 1);
+ len = 1;
+ } else
+ msg->msg_flags |= MSG_TRUNC;
+
+ return err ? -EFAULT : len;
+ }
+
+ if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN))
+ return 0;
+
+ /* Fixed the recv(..., MSG_OOB) behaviour. BSD docs and
+ * the available implementations agree in this case:
+ * this call should never block, independent of the
+ * blocking state of the socket.
+ * Mike <pall@rz.uni-karlsruhe.de>
+ */
+ return -EAGAIN;
+}
+
+static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len)
+{
+ struct sk_buff *skb;
+ int copied = 0, err = 0;
+
+ /* XXX -- need to support SO_PEEK_OFF */
+
+ skb_queue_walk(&sk->sk_write_queue, skb) {
+ err = skb_copy_datagram_msg(skb, 0, msg, skb->len);
+ if (err)
+ break;
+
+ copied += skb->len;
+ }
+
+ return err ?: copied;
+}
+
+/* Clean up the receive buffer for full frames taken by the user,
+ * then send an ACK if necessary. COPIED is the number of bytes
+ * tcp_recvmsg has given to the user so far, it speeds up the
+ * calculation of whether or not we must ACK for the sake of
+ * a window update.
+ */
+static void tcp_cleanup_rbuf(struct sock *sk, int copied)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ bool time_to_ack = false;
+
+ struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
+
+ WARN(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq),
+ "cleanup rbuf bug: copied %X seq %X rcvnxt %X\n",
+ tp->copied_seq, TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt);
+
+ if (inet_csk_ack_scheduled(sk)) {
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+ /* Delayed ACKs frequently hit locked sockets during bulk
+ * receive. */
+ if (icsk->icsk_ack.blocked ||
+ /* Once-per-two-segments ACK was not sent by tcp_input.c */
+ tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss ||
+ /*
+ * If this read emptied read buffer, we send ACK, if
+ * connection is not bidirectional, user drained
+ * receive buffer and there was a small segment
+ * in queue.
+ */
+ (copied > 0 &&
+ ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED2) ||
+ ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED) &&
+ !icsk->icsk_ack.pingpong)) &&
+ !atomic_read(&sk->sk_rmem_alloc)))
+ time_to_ack = true;
+ }
+
+ /* We send an ACK if we can now advertise a non-zero window
+ * which has been raised "significantly".
+ *
+ * Even if window raised up to infinity, do not send window open ACK
+ * in states, where we will not receive more. It is useless.
+ */
+ if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
+ __u32 rcv_window_now = tcp_receive_window(tp);
+
+ /* Optimize, __tcp_select_window() is not cheap. */
+ if (2*rcv_window_now <= tp->window_clamp) {
+ __u32 new_window = __tcp_select_window(sk);
+
+ /* Send ACK now, if this read freed lots of space
+ * in our buffer. Certainly, new_window is new window.
+ * We can advertise it now, if it is not less than current one.
+ * "Lots" means "at least twice" here.
+ */
+ if (new_window && new_window >= 2 * rcv_window_now)
+ time_to_ack = true;
+ }
+ }
+ if (time_to_ack)
+ tcp_send_ack(sk);
+}
+
+static void tcp_prequeue_process(struct sock *sk)
+{
+ struct sk_buff *skb;
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPPREQUEUED);
+
+ /* RX process wants to run with disabled BHs, though it is not
+ * necessary */
+ local_bh_disable();
+ while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
+ sk_backlog_rcv(sk, skb);
+ local_bh_enable();
+
+ /* Clear memory counter. */
+ tp->ucopy.memory = 0;
+}
+
+static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
+{
+ struct sk_buff *skb;
+ u32 offset;
+
+ while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) {
+ offset = seq - TCP_SKB_CB(skb)->seq;
+ if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
+ offset--;
+ if (offset < skb->len || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)) {
+ *off = offset;
+ return skb;
+ }
+ /* This looks weird, but this can happen if TCP collapsing
+ * splitted a fat GRO packet, while we released socket lock
+ * in skb_splice_bits()
+ */
+ sk_eat_skb(sk, skb);
+ }
+ return NULL;
+}
+
+/*
+ * This routine provides an alternative to tcp_recvmsg() for routines
+ * that would like to handle copying from skbuffs directly in 'sendfile'
+ * fashion.
+ * Note:
+ * - It is assumed that the socket was locked by the caller.
+ * - The routine does not block.
+ * - At present, there is no support for reading OOB data
+ * or for 'peeking' the socket using this routine
+ * (although both would be easy to implement).
+ */
+int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
+ sk_read_actor_t recv_actor)
+{
+ struct sk_buff *skb;
+ struct tcp_sock *tp = tcp_sk(sk);
+ u32 seq = tp->copied_seq;
+ u32 offset;
+ int copied = 0;
+
+ if (sk->sk_state == TCP_LISTEN)
+ return -ENOTCONN;
+ while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
+ if (offset < skb->len) {
+ int used;
+ size_t len;
+
+ len = skb->len - offset;
+ /* Stop reading if we hit a patch of urgent data */
+ if (tp->urg_data) {
+ u32 urg_offset = tp->urg_seq - seq;
+ if (urg_offset < len)
+ len = urg_offset;
+ if (!len)
+ break;
+ }
+ used = recv_actor(desc, skb, offset, len);
+ if (used <= 0) {
+ if (!copied)
+ copied = used;
+ break;
+ } else if (used <= len) {
+ seq += used;
+ copied += used;
+ offset += used;
+ }
+ /* If recv_actor drops the lock (e.g. TCP splice
+ * receive) the skb pointer might be invalid when
+ * getting here: tcp_collapse might have deleted it
+ * while aggregating skbs from the socket queue.
+ */
+ skb = tcp_recv_skb(sk, seq - 1, &offset);
+ if (!skb)
+ break;
+ /* TCP coalescing might have appended data to the skb.
+ * Try to splice more frags
+ */
+ if (offset + 1 != skb->len)
+ continue;
+ }
+ if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) {
+ sk_eat_skb(sk, skb);
+ ++seq;
+ break;
+ }
+ sk_eat_skb(sk, skb);
+ if (!desc->count)
+ break;
+ tp->copied_seq = seq;
+ }
+ tp->copied_seq = seq;
+
+ tcp_rcv_space_adjust(sk);
+
+ /* Clean up data we have read: This will do ACK frames. */
+ if (copied > 0) {
+ tcp_recv_skb(sk, seq, &offset);
+ tcp_cleanup_rbuf(sk, copied);
+ }
+ return copied;
+}
+EXPORT_SYMBOL(tcp_read_sock);
+
+/*
+ * This routine copies from a sock struct into the user buffer.
+ *
+ * Technical note: in 2.3 we work on _locked_ socket, so that
+ * tricks with *seq access order and skb->users are not required.
+ * Probably, code can be easily improved even more.
+ */
+
+int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
+ int flags, int *addr_len)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ int copied = 0;
+ u32 peek_seq;
+ u32 *seq;
+ unsigned long used;
+ int err;
+ int target; /* Read at least this many bytes */
+ long timeo;
+ struct task_struct *user_recv = NULL;
+ struct sk_buff *skb;
+ u32 urg_hole = 0;
+
+ if (unlikely(flags & MSG_ERRQUEUE))
+ return inet_recv_error(sk, msg, len, addr_len);
+
+ if (sk_can_busy_loop(sk) && skb_queue_empty(&sk->sk_receive_queue) &&
+ (sk->sk_state == TCP_ESTABLISHED))
+ sk_busy_loop(sk, nonblock);
+
+ lock_sock(sk);
+
+ err = -ENOTCONN;
+ if (sk->sk_state == TCP_LISTEN)
+ goto out;
+
+ timeo = sock_rcvtimeo(sk, nonblock);
+
+ /* Urgent data needs to be handled specially. */
+ if (flags & MSG_OOB)
+ goto recv_urg;
+
+ if (unlikely(tp->repair)) {
+ err = -EPERM;
+ if (!(flags & MSG_PEEK))
+ goto out;
+
+ if (tp->repair_queue == TCP_SEND_QUEUE)
+ goto recv_sndq;
+
+ err = -EINVAL;
+ if (tp->repair_queue == TCP_NO_QUEUE)
+ goto out;
+
+ /* 'common' recv queue MSG_PEEK-ing */
+ }
+
+ seq = &tp->copied_seq;
+ if (flags & MSG_PEEK) {
+ peek_seq = tp->copied_seq;
+ seq = &peek_seq;
+ }
+
+ target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
+
+ do {
+ u32 offset;
+
+ /* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */
+ if (tp->urg_data && tp->urg_seq == *seq) {
+ if (copied)
+ break;
+ if (signal_pending(current)) {
+ copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
+ break;
+ }
+ }
+
+ /* Next get a buffer. */
+
+ skb_queue_walk(&sk->sk_receive_queue, skb) {
+ /* Now that we have two receive queues this
+ * shouldn't happen.
+ */
+ if (WARN(before(*seq, TCP_SKB_CB(skb)->seq),
+ "recvmsg bug: copied %X seq %X rcvnxt %X fl %X\n",
+ *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt,
+ flags))
+ break;
+
+ offset = *seq - TCP_SKB_CB(skb)->seq;
+ if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
+ offset--;
+ if (offset < skb->len)
+ goto found_ok_skb;
+ if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
+ goto found_fin_ok;
+ WARN(!(flags & MSG_PEEK),
+ "recvmsg bug 2: copied %X seq %X rcvnxt %X fl %X\n",
+ *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt, flags);
+ }
+
+ /* Well, if we have backlog, try to process it now yet. */
+
+ if (copied >= target && !sk->sk_backlog.tail)
+ break;
+
+ if (copied) {
+ if (sk->sk_err ||
+ sk->sk_state == TCP_CLOSE ||
+ (sk->sk_shutdown & RCV_SHUTDOWN) ||
+ !timeo ||
+ signal_pending(current))
+ break;
+ } else {
+ if (sock_flag(sk, SOCK_DONE))
+ break;
+
+ if (sk->sk_err) {
+ copied = sock_error(sk);
+ break;
+ }
+
+ if (sk->sk_shutdown & RCV_SHUTDOWN)
+ break;
+
+ if (sk->sk_state == TCP_CLOSE) {
+ if (!sock_flag(sk, SOCK_DONE)) {
+ /* This occurs when user tries to read
+ * from never connected socket.
+ */
+ copied = -ENOTCONN;
+ break;
+ }
+ break;
+ }
+
+ if (!timeo) {
+ copied = -EAGAIN;
+ break;
+ }
+
+ if (signal_pending(current)) {
+ copied = sock_intr_errno(timeo);
+ break;
+ }
+ }
+
+ tcp_cleanup_rbuf(sk, copied);
+
+ if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) {
+ /* Install new reader */
+ if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) {
+ user_recv = current;
+ tp->ucopy.task = user_recv;
+ tp->ucopy.msg = msg;
+ }
+
+ tp->ucopy.len = len;
+
+ WARN_ON(tp->copied_seq != tp->rcv_nxt &&
+ !(flags & (MSG_PEEK | MSG_TRUNC)));
+
+ /* Ugly... If prequeue is not empty, we have to
+ * process it before releasing socket, otherwise
+ * order will be broken at second iteration.
+ * More elegant solution is required!!!
+ *
+ * Look: we have the following (pseudo)queues:
+ *
+ * 1. packets in flight
+ * 2. backlog
+ * 3. prequeue
+ * 4. receive_queue
+ *
+ * Each queue can be processed only if the next ones
+ * are empty. At this point we have empty receive_queue.
+ * But prequeue _can_ be not empty after 2nd iteration,
+ * when we jumped to start of loop because backlog
+ * processing added something to receive_queue.
+ * We cannot release_sock(), because backlog contains
+ * packets arrived _after_ prequeued ones.
+ *
+ * Shortly, algorithm is clear --- to process all
+ * the queues in order. We could make it more directly,
+ * requeueing packets from backlog to prequeue, if
+ * is not empty. It is more elegant, but eats cycles,
+ * unfortunately.
+ */
+ if (!skb_queue_empty(&tp->ucopy.prequeue))
+ goto do_prequeue;
+
+ /* __ Set realtime policy in scheduler __ */
+ }
+
+ if (copied >= target) {
+ /* Do not sleep, just process backlog. */
+ release_sock(sk);
+ lock_sock(sk);
+ } else
+ sk_wait_data(sk, &timeo);
+
+ if (user_recv) {
+ int chunk;
+
+ /* __ Restore normal policy in scheduler __ */
+
+ if ((chunk = len - tp->ucopy.len) != 0) {
+ NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk);
+ len -= chunk;
+ copied += chunk;
+ }
+
+ if (tp->rcv_nxt == tp->copied_seq &&
+ !skb_queue_empty(&tp->ucopy.prequeue)) {
+do_prequeue:
+ tcp_prequeue_process(sk);
+
+ if ((chunk = len - tp->ucopy.len) != 0) {
+ NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
+ len -= chunk;
+ copied += chunk;
+ }
+ }
+ }
+ if ((flags & MSG_PEEK) &&
+ (peek_seq - copied - urg_hole != tp->copied_seq)) {
+ net_dbg_ratelimited("TCP(%s:%d): Application bug, race in MSG_PEEK\n",
+ current->comm,
+ task_pid_nr(current));
+ peek_seq = tp->copied_seq;
+ }
+ continue;
+
+ found_ok_skb:
+ /* Ok so how much can we use? */
+ used = skb->len - offset;
+ if (len < used)
+ used = len;
+
+ /* Do we have urgent data here? */
+ if (tp->urg_data) {
+ u32 urg_offset = tp->urg_seq - *seq;
+ if (urg_offset < used) {
+ if (!urg_offset) {
+ if (!sock_flag(sk, SOCK_URGINLINE)) {
+ ++*seq;
+ urg_hole++;
+ offset++;
+ used--;
+ if (!used)
+ goto skip_copy;
+ }
+ } else
+ used = urg_offset;
+ }
+ }
+
+ if (!(flags & MSG_TRUNC)) {
+ err = skb_copy_datagram_msg(skb, offset, msg, used);
+ if (err) {
+ /* Exception. Bailout! */
+ if (!copied)
+ copied = -EFAULT;
+ break;
+ }
+ }
+
+ *seq += used;
+ copied += used;
+ len -= used;
+
+ tcp_rcv_space_adjust(sk);
+
+skip_copy:
+ if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
+ tp->urg_data = 0;
+ tcp_fast_path_check(sk);
+ }
+ if (used + offset < skb->len)
+ continue;
+
+ if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
+ goto found_fin_ok;
+ if (!(flags & MSG_PEEK))
+ sk_eat_skb(sk, skb);
+ continue;
+
+ found_fin_ok:
+ /* Process the FIN. */
+ ++*seq;
+ if (!(flags & MSG_PEEK))
+ sk_eat_skb(sk, skb);
+ break;
+ } while (len > 0);
+
+ if (user_recv) {
+ if (!skb_queue_empty(&tp->ucopy.prequeue)) {
+ int chunk;
+
+ tp->ucopy.len = copied > 0 ? len : 0;
+
+ tcp_prequeue_process(sk);
+
+ if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
+ NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
+ len -= chunk;
+ copied += chunk;
+ }
+ }
+
+ tp->ucopy.task = NULL;
+ tp->ucopy.len = 0;
+ }
+
+ /* According to UNIX98, msg_name/msg_namelen are ignored
+ * on connected socket. I was just happy when found this 8) --ANK
+ */
+
+ /* Clean up data we have read: This will do ACK frames. */
+ tcp_cleanup_rbuf(sk, copied);
+
+ release_sock(sk);
+ return copied;
+
+out:
+ release_sock(sk);
+ return err;
+
+recv_urg:
+ err = tcp_recv_urg(sk, msg, len, flags);
+ goto out;
+
+recv_sndq:
+ err = tcp_peek_sndq(sk, msg, len);
+ goto out;
+}
+EXPORT_SYMBOL(tcp_recvmsg);
+
+void tcp_set_state(struct sock *sk, int state)
+{
+ int oldstate = sk->sk_state;
+
+ switch (state) {
+ case TCP_ESTABLISHED:
+ if (oldstate != TCP_ESTABLISHED)
+ TCP_INC_STATS(sock_net(sk), TCP_MIB_CURRESTAB);
+ break;
+
+ case TCP_CLOSE:
+ if (oldstate == TCP_CLOSE_WAIT || oldstate == TCP_ESTABLISHED)
+ TCP_INC_STATS(sock_net(sk), TCP_MIB_ESTABRESETS);
+
+ sk->sk_prot->unhash(sk);
+ if (inet_csk(sk)->icsk_bind_hash &&
+ !(sk->sk_userlocks & SOCK_BINDPORT_LOCK))
+ inet_put_port(sk);
+ /* fall through */
+ default:
+ if (oldstate == TCP_ESTABLISHED)
+ TCP_DEC_STATS(sock_net(sk), TCP_MIB_CURRESTAB);
+ }
+
+ /* Change state AFTER socket is unhashed to avoid closed
+ * socket sitting in hash tables.
+ */
+ sk->sk_state = state;
+
+#ifdef STATE_TRACE
+ SOCK_DEBUG(sk, "TCP sk=%p, State %s -> %s\n", sk, statename[oldstate], statename[state]);
+#endif
+}
+EXPORT_SYMBOL_GPL(tcp_set_state);
+
+/*
+ * State processing on a close. This implements the state shift for
+ * sending our FIN frame. Note that we only send a FIN for some
+ * states. A shutdown() may have already sent the FIN, or we may be
+ * closed.
+ */
+
+static const unsigned char new_state[16] = {
+ /* current state: new state: action: */
+ [0 /* (Invalid) */] = TCP_CLOSE,
+ [TCP_ESTABLISHED] = TCP_FIN_WAIT1 | TCP_ACTION_FIN,
+ [TCP_SYN_SENT] = TCP_CLOSE,
+ [TCP_SYN_RECV] = TCP_FIN_WAIT1 | TCP_ACTION_FIN,
+ [TCP_FIN_WAIT1] = TCP_FIN_WAIT1,
+ [TCP_FIN_WAIT2] = TCP_FIN_WAIT2,
+ [TCP_TIME_WAIT] = TCP_CLOSE,
+ [TCP_CLOSE] = TCP_CLOSE,
+ [TCP_CLOSE_WAIT] = TCP_LAST_ACK | TCP_ACTION_FIN,
+ [TCP_LAST_ACK] = TCP_LAST_ACK,
+ [TCP_LISTEN] = TCP_CLOSE,
+ [TCP_CLOSING] = TCP_CLOSING,
+ [TCP_NEW_SYN_RECV] = TCP_CLOSE, /* should not happen ! */
+};
+
+static int tcp_close_state(struct sock *sk)
+{
+ int next = (int)new_state[sk->sk_state];
+ int ns = next & TCP_STATE_MASK;
+
+ tcp_set_state(sk, ns);
+
+ return next & TCP_ACTION_FIN;
+}
+
+/*
+ * Shutdown the sending side of a connection. Much like close except
+ * that we don't receive shut down or sock_set_flag(sk, SOCK_DEAD).
+ */
+
+void tcp_shutdown(struct sock *sk, int how)
+{
+ /* We need to grab some memory, and put together a FIN,
+ * and then put it into the queue to be sent.
+ * Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
+ */
+ if (!(how & SEND_SHUTDOWN))
+ return;
+
+ /* If we've already sent a FIN, or it's a closed state, skip this. */
+ if ((1 << sk->sk_state) &
+ (TCPF_ESTABLISHED | TCPF_SYN_SENT |
+ TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) {
+ /* Clear out any half completed packets. FIN if needed. */
+ if (tcp_close_state(sk))
+ tcp_send_fin(sk);
+ }
+}
+EXPORT_SYMBOL(tcp_shutdown);
+
+bool tcp_check_oom(struct sock *sk, int shift)
+{
+ bool too_many_orphans, out_of_socket_memory;
+
+ too_many_orphans = tcp_too_many_orphans(sk, shift);
+ out_of_socket_memory = tcp_out_of_memory(sk);
+
+ if (too_many_orphans)
+ net_info_ratelimited("too many orphaned sockets\n");
+ if (out_of_socket_memory)
+ net_info_ratelimited("out of memory -- consider tuning tcp_mem\n");
+ return too_many_orphans || out_of_socket_memory;
+}
+
+void tcp_close(struct sock *sk, long timeout)
+{
+ struct sk_buff *skb;
+ int data_was_unread = 0;
+ int state;
+
+ lock_sock(sk);
+ sk->sk_shutdown = SHUTDOWN_MASK;
+
+ if (sk->sk_state == TCP_LISTEN) {
+ tcp_set_state(sk, TCP_CLOSE);
+
+ /* Special case. */
+ inet_csk_listen_stop(sk);
+
+ goto adjudge_to_death;
+ }
+
+ /* We need to flush the recv. buffs. We do this only on the
+ * descriptor close, not protocol-sourced closes, because the
+ * reader process may not have drained the data yet!
+ */
+ while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
+ u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq;
+
+ if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
+ len--;
+ data_was_unread += len;
+ __kfree_skb(skb);
+ }
+
+ sk_mem_reclaim(sk);
+
+ /* If socket has been already reset (e.g. in tcp_reset()) - kill it. */
+ if (sk->sk_state == TCP_CLOSE)
+ goto adjudge_to_death;
+
+ /* As outlined in RFC 2525, section 2.17, we send a RST here because
+ * data was lost. To witness the awful effects of the old behavior of
+ * always doing a FIN, run an older 2.1.x kernel or 2.0.x, start a bulk
+ * GET in an FTP client, suspend the process, wait for the client to
+ * advertise a zero window, then kill -9 the FTP client, wheee...
+ * Note: timeout is always zero in such a case.
+ */
+ if (unlikely(tcp_sk(sk)->repair)) {
+ sk->sk_prot->disconnect(sk, 0);
+ } else if (data_was_unread) {
+ /* Unread data was tossed, zap the connection. */
+ NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE);
+ tcp_set_state(sk, TCP_CLOSE);
+ tcp_send_active_reset(sk, sk->sk_allocation);
+ } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
+ /* Check zero linger _after_ checking for unread data. */
+ sk->sk_prot->disconnect(sk, 0);
+ NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
+ } else if (tcp_close_state(sk)) {
+ /* We FIN if the application ate all the data before
+ * zapping the connection.
+ */
+
+ /* RED-PEN. Formally speaking, we have broken TCP state
+ * machine. State transitions:
+ *
+ * TCP_ESTABLISHED -> TCP_FIN_WAIT1
+ * TCP_SYN_RECV -> TCP_FIN_WAIT1 (forget it, it's impossible)
+ * TCP_CLOSE_WAIT -> TCP_LAST_ACK
+ *
+ * are legal only when FIN has been sent (i.e. in window),
+ * rather than queued out of window. Purists blame.
+ *
+ * F.e. "RFC state" is ESTABLISHED,
+ * if Linux state is FIN-WAIT-1, but FIN is still not sent.
+ *
+ * The visible declinations are that sometimes
+ * we enter time-wait state, when it is not required really
+ * (harmless), do not send active resets, when they are
+ * required by specs (TCP_ESTABLISHED, TCP_CLOSE_WAIT, when
+ * they look as CLOSING or LAST_ACK for Linux)
+ * Probably, I missed some more holelets.
+ * --ANK
+ * XXX (TFO) - To start off we don't support SYN+ACK+FIN
+ * in a single packet! (May consider it later but will
+ * probably need API support or TCP_CORK SYN-ACK until
+ * data is written and socket is closed.)
+ */
+ tcp_send_fin(sk);
+ }
+
+ sk_stream_wait_close(sk, timeout);
+
+adjudge_to_death:
+ state = sk->sk_state;
+ sock_hold(sk);
+ sock_orphan(sk);
+
+ /* It is the last release_sock in its life. It will remove backlog. */
+ release_sock(sk);
+
+
+ /* Now socket is owned by kernel and we acquire BH lock
+ to finish close. No need to check for user refs.
+ */
+ local_bh_disable();
+ bh_lock_sock(sk);
+ WARN_ON(sock_owned_by_user(sk));
+
+ percpu_counter_inc(sk->sk_prot->orphan_count);
+
+ /* Have we already been destroyed by a softirq or backlog? */
+ if (state != TCP_CLOSE && sk->sk_state == TCP_CLOSE)
+ goto out;
+
+ /* This is a (useful) BSD violating of the RFC. There is a
+ * problem with TCP as specified in that the other end could
+ * keep a socket open forever with no application left this end.
+ * We use a 1 minute timeout (about the same as BSD) then kill
+ * our end. If they send after that then tough - BUT: long enough
+ * that we won't make the old 4*rto = almost no time - whoops
+ * reset mistake.
+ *
+ * Nope, it was not mistake. It is really desired behaviour
+ * f.e. on http servers, when such sockets are useless, but
+ * consume significant resources. Let's do it with special
+ * linger2 option. --ANK
+ */
+
+ if (sk->sk_state == TCP_FIN_WAIT2) {
+ struct tcp_sock *tp = tcp_sk(sk);
+ if (tp->linger2 < 0) {
+ tcp_set_state(sk, TCP_CLOSE);
+ tcp_send_active_reset(sk, GFP_ATOMIC);
+ NET_INC_STATS_BH(sock_net(sk),
+ LINUX_MIB_TCPABORTONLINGER);
+ } else {
+ const int tmo = tcp_fin_time(sk);
+
+ if (tmo > TCP_TIMEWAIT_LEN) {
+ inet_csk_reset_keepalive_timer(sk,
+ tmo - TCP_TIMEWAIT_LEN);
+ } else {
+ tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
+ goto out;
+ }
+ }
+ }
+ if (sk->sk_state != TCP_CLOSE) {
+ sk_mem_reclaim(sk);
+ if (tcp_check_oom(sk, 0)) {
+ tcp_set_state(sk, TCP_CLOSE);
+ tcp_send_active_reset(sk, GFP_ATOMIC);
+ NET_INC_STATS_BH(sock_net(sk),
+ LINUX_MIB_TCPABORTONMEMORY);
+ }
+ }
+
+ if (sk->sk_state == TCP_CLOSE) {
+ struct request_sock *req = tcp_sk(sk)->fastopen_rsk;
+ /* We could get here with a non-NULL req if the socket is
+ * aborted (e.g., closed with unread data) before 3WHS
+ * finishes.
+ */
+ if (req)
+ reqsk_fastopen_remove(sk, req, false);
+ inet_csk_destroy_sock(sk);
+ }
+ /* Otherwise, socket is reprieved until protocol close. */
+
+out:
+ bh_unlock_sock(sk);
+ local_bh_enable();
+ sock_put(sk);
+}
+EXPORT_SYMBOL(tcp_close);
+
+/* These states need RST on ABORT according to RFC793 */
+
+static inline bool tcp_need_reset(int state)
+{
+ return (1 << state) &
+ (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
+ TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
+}
+
+int tcp_disconnect(struct sock *sk, int flags)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+ int err = 0;
+ int old_state = sk->sk_state;
+
+ if (old_state != TCP_CLOSE)
+ tcp_set_state(sk, TCP_CLOSE);
+
+ /* ABORT function of RFC793 */
+ if (old_state == TCP_LISTEN) {
+ inet_csk_listen_stop(sk);
+ } else if (unlikely(tp->repair)) {
+ sk->sk_err = ECONNABORTED;
+ } else if (tcp_need_reset(old_state) ||
+ (tp->snd_nxt != tp->write_seq &&
+ (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
+ /* The last check adjusts for discrepancy of Linux wrt. RFC
+ * states
+ */
+ tcp_send_active_reset(sk, gfp_any());
+ sk->sk_err = ECONNRESET;
+ } else if (old_state == TCP_SYN_SENT)
+ sk->sk_err = ECONNRESET;
+
+ tcp_clear_xmit_timers(sk);
+ __skb_queue_purge(&sk->sk_receive_queue);
+ tcp_write_queue_purge(sk);
+ __skb_queue_purge(&tp->out_of_order_queue);
+
+ inet->inet_dport = 0;
+
+ if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
+ inet_reset_saddr(sk);
+
+ sk->sk_shutdown = 0;
+ sock_reset_flag(sk, SOCK_DONE);
+ tp->srtt_us = 0;
+ if ((tp->write_seq += tp->max_window + 2) == 0)
+ tp->write_seq = 1;
+ icsk->icsk_backoff = 0;
+ tp->snd_cwnd = 2;
+ icsk->icsk_probes_out = 0;
+ tp->packets_out = 0;
+ tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
+ tp->snd_cwnd_cnt = 0;
+ tp->window_clamp = 0;
+ tcp_set_ca_state(sk, TCP_CA_Open);
+ tcp_clear_retrans(tp);
+ inet_csk_delack_init(sk);
+ tcp_init_send_head(sk);
+ memset(&tp->rx_opt, 0, sizeof(tp->rx_opt));
+ __sk_dst_reset(sk);
+
+ WARN_ON(inet->inet_num && !icsk->icsk_bind_hash);
+
+ sk->sk_error_report(sk);
+ return err;
+}
+EXPORT_SYMBOL(tcp_disconnect);
+
+void tcp_sock_destruct(struct sock *sk)
+{
+ inet_sock_destruct(sk);
+
+ kfree(inet_csk(sk)->icsk_accept_queue.fastopenq);
+}
+
+static inline bool tcp_can_repair_sock(const struct sock *sk)
+{
+ return ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN) &&
+ ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_ESTABLISHED));
+}
+
+static int tcp_repair_options_est(struct tcp_sock *tp,
+ struct tcp_repair_opt __user *optbuf, unsigned int len)
+{
+ struct tcp_repair_opt opt;
+
+ while (len >= sizeof(opt)) {
+ if (copy_from_user(&opt, optbuf, sizeof(opt)))
+ return -EFAULT;
+
+ optbuf++;
+ len -= sizeof(opt);
+
+ switch (opt.opt_code) {
+ case TCPOPT_MSS:
+ tp->rx_opt.mss_clamp = opt.opt_val;
+ break;
+ case TCPOPT_WINDOW:
+ {
+ u16 snd_wscale = opt.opt_val & 0xFFFF;
+ u16 rcv_wscale = opt.opt_val >> 16;
+
+ if (snd_wscale > 14 || rcv_wscale > 14)
+ return -EFBIG;
+
+ tp->rx_opt.snd_wscale = snd_wscale;
+ tp->rx_opt.rcv_wscale = rcv_wscale;
+ tp->rx_opt.wscale_ok = 1;
+ }
+ break;
+ case TCPOPT_SACK_PERM:
+ if (opt.opt_val != 0)
+ return -EINVAL;
+
+ tp->rx_opt.sack_ok |= TCP_SACK_SEEN;
+ if (sysctl_tcp_fack)
+ tcp_enable_fack(tp);
+ break;
+ case TCPOPT_TIMESTAMP:
+ if (opt.opt_val != 0)
+ return -EINVAL;
+
+ tp->rx_opt.tstamp_ok = 1;
+ break;
+ }
+ }
+
+ return 0;
+}
+
+#ifdef CONFIG_TCP_STEALTH
+int tcp_stealth_integrity(__be16 *hash, u8 *secret, u8 *payload, int len)
+{
+ struct scatterlist sg[2];
+ struct crypto_hash *tfm;
+ struct hash_desc desc;
+ __be16 h[MD5_DIGEST_WORDS * 2];
+ int i;
+ int err = 0;
+
+ tfm = crypto_alloc_hash("md5", 0, CRYPTO_ALG_ASYNC);
+ if (IS_ERR(tfm)) {
+ err = -PTR_ERR(tfm);
+ goto out;
+ }
+ desc.tfm = tfm;
+ desc.flags = 0;
+
+ sg_init_table(sg, 2);
+ sg_set_buf(&sg[0], secret, MD5_MESSAGE_BYTES);
+ sg_set_buf(&sg[1], payload, len);
+
+ if (crypto_hash_digest(&desc, sg, MD5_MESSAGE_BYTES + len, (u8 *)h)) {
+ err = -EFAULT;
+ goto out;
+ }
+
+ *hash = be16_to_cpu(h[0]);
+ for (i = 1; i < MD5_DIGEST_WORDS * 2; i++)
+ *hash ^= be16_to_cpu(h[i]);
+
+out:
+ crypto_free_hash(tfm);
+ return err;
+}
+#endif
+
+/*
+ * Socket option code for TCP.
+ */
+static int do_tcp_setsockopt(struct sock *sk, int level,
+ int optname, char __user *optval, unsigned int optlen)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ int val;
+ int err = 0;
+
+ /* These are data/string values, all the others are ints */
+ switch (optname) {
+ case TCP_CONGESTION: {
+ char name[TCP_CA_NAME_MAX];
+
+ if (optlen < 1)
+ return -EINVAL;
+
+ val = strncpy_from_user(name, optval,
+ min_t(long, TCP_CA_NAME_MAX-1, optlen));
+ if (val < 0)
+ return -EFAULT;
+ name[val] = 0;
+
+ lock_sock(sk);
+ err = tcp_set_congestion_control(sk, name);
+ release_sock(sk);
+ return err;
+ }
+#ifdef CONFIG_TCP_STEALTH
+ case TCP_STEALTH: {
+ u8 secret[MD5_MESSAGE_BYTES] = { 0 };
+
+ val = copy_from_user(secret, optval,
+ min_t(unsigned int, optlen,
+ MD5_MESSAGE_BYTES));
+ if (val != 0)
+ return -EFAULT;
+
+ lock_sock(sk);
+ memcpy(tp->stealth.secret, secret, MD5_MESSAGE_BYTES);
+ tp->stealth.mode = TCP_STEALTH_MODE_AUTH;
+ tp->stealth.mstamp.v64 = 0;
+ tp->stealth.saw_tsval = false;
+ release_sock(sk);
+ return err;
+ }
+ case TCP_STEALTH_INTEGRITY: {
+ u8 *payload;
+
+ lock_sock(sk);
+
+ if (!(tp->stealth.mode & TCP_STEALTH_MODE_AUTH)) {
+ err = -EOPNOTSUPP;
+ goto stealth_integrity_out_1;
+ }
+
+ if (optlen < 1 || optlen > USHRT_MAX) {
+ err = -EINVAL;
+ goto stealth_integrity_out_1;
+ }
+
+ payload = vmalloc(optlen);
+ if (!payload) {
+ err = -ENOMEM;
+ goto stealth_integrity_out_1;
+ }
+
+ val = copy_from_user(payload, optval, optlen);
+ if (val != 0) {
+ err = -EFAULT;
+ goto stealth_integrity_out_2;
+ }
+
+ err = tcp_stealth_integrity(&tp->stealth.integrity_hash,
+ tp->stealth.secret, payload,
+ optlen);
+ if (err)
+ goto stealth_integrity_out_2;
+
+ tp->stealth.mode |= TCP_STEALTH_MODE_INTEGRITY;
+
+stealth_integrity_out_2:
+ vfree(payload);
+stealth_integrity_out_1:
+ release_sock(sk);
+ return err;
+ }
+#endif
+ default:
+ /* fallthru */
+ break;
+ }
+
+ if (optlen < sizeof(int))
+ return -EINVAL;
+
+ if (get_user(val, (int __user *)optval))
+ return -EFAULT;
+
+ lock_sock(sk);
+
+ switch (optname) {
+ case TCP_MAXSEG:
+ /* Values greater than interface MTU won't take effect. However
+ * at the point when this call is done we typically don't yet
+ * know which interface is going to be used */
+ if (val < TCP_MIN_MSS || val > MAX_TCP_WINDOW) {
+ err = -EINVAL;
+ break;
+ }
+ tp->rx_opt.user_mss = val;
+ break;
+
+ case TCP_NODELAY:
+ if (val) {
+ /* TCP_NODELAY is weaker than TCP_CORK, so that
+ * this option on corked socket is remembered, but
+ * it is not activated until cork is cleared.
+ *
+ * However, when TCP_NODELAY is set we make
+ * an explicit push, which overrides even TCP_CORK
+ * for currently queued segments.
+ */
+ tp->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
+ tcp_push_pending_frames(sk);
+ } else {
+ tp->nonagle &= ~TCP_NAGLE_OFF;
+ }
+ break;
+
+ case TCP_THIN_LINEAR_TIMEOUTS:
+ if (val < 0 || val > 1)
+ err = -EINVAL;
+ else
+ tp->thin_lto = val;
+ break;
+
+ case TCP_THIN_DUPACK:
+ if (val < 0 || val > 1)
+ err = -EINVAL;
+ else {
+ tp->thin_dupack = val;
+ if (tp->thin_dupack)
+ tcp_disable_early_retrans(tp);
+ }
+ break;
+
+ case TCP_REPAIR:
+ if (!tcp_can_repair_sock(sk))
+ err = -EPERM;
+ else if (val == 1) {
+ tp->repair = 1;
+ sk->sk_reuse = SK_FORCE_REUSE;
+ tp->repair_queue = TCP_NO_QUEUE;
+ } else if (val == 0) {
+ tp->repair = 0;
+ sk->sk_reuse = SK_NO_REUSE;
+ tcp_send_window_probe(sk);
+ } else
+ err = -EINVAL;
+
+ break;
+
+ case TCP_REPAIR_QUEUE:
+ if (!tp->repair)
+ err = -EPERM;
+ else if (val < TCP_QUEUES_NR)
+ tp->repair_queue = val;
+ else
+ err = -EINVAL;
+ break;
+
+ case TCP_QUEUE_SEQ:
+ if (sk->sk_state != TCP_CLOSE)
+ err = -EPERM;
+ else if (tp->repair_queue == TCP_SEND_QUEUE)
+ tp->write_seq = val;
+ else if (tp->repair_queue == TCP_RECV_QUEUE)
+ tp->rcv_nxt = val;
+ else
+ err = -EINVAL;
+ break;
+
+ case TCP_REPAIR_OPTIONS:
+ if (!tp->repair)
+ err = -EINVAL;
+ else if (sk->sk_state == TCP_ESTABLISHED)
+ err = tcp_repair_options_est(tp,
+ (struct tcp_repair_opt __user *)optval,
+ optlen);
+ else
+ err = -EPERM;
+ break;
+
+ case TCP_CORK:
+ /* When set indicates to always queue non-full frames.
+ * Later the user clears this option and we transmit
+ * any pending partial frames in the queue. This is
+ * meant to be used alongside sendfile() to get properly
+ * filled frames when the user (for example) must write
+ * out headers with a write() call first and then use
+ * sendfile to send out the data parts.
+ *
+ * TCP_CORK can be set together with TCP_NODELAY and it is
+ * stronger than TCP_NODELAY.
+ */
+ if (val) {
+ tp->nonagle |= TCP_NAGLE_CORK;
+ } else {
+ tp->nonagle &= ~TCP_NAGLE_CORK;
+ if (tp->nonagle&TCP_NAGLE_OFF)
+ tp->nonagle |= TCP_NAGLE_PUSH;
+ tcp_push_pending_frames(sk);
+ }
+ break;
+
+ case TCP_KEEPIDLE:
+ if (val < 1 || val > MAX_TCP_KEEPIDLE)
+ err = -EINVAL;
+ else {
+ tp->keepalive_time = val * HZ;
+ if (sock_flag(sk, SOCK_KEEPOPEN) &&
+ !((1 << sk->sk_state) &
+ (TCPF_CLOSE | TCPF_LISTEN))) {
+ u32 elapsed = keepalive_time_elapsed(tp);
+ if (tp->keepalive_time > elapsed)
+ elapsed = tp->keepalive_time - elapsed;
+ else
+ elapsed = 0;
+ inet_csk_reset_keepalive_timer(sk, elapsed);
+ }
+ }
+ break;
+ case TCP_KEEPINTVL:
+ if (val < 1 || val > MAX_TCP_KEEPINTVL)
+ err = -EINVAL;
+ else
+ tp->keepalive_intvl = val * HZ;
+ break;
+ case TCP_KEEPCNT:
+ if (val < 1 || val > MAX_TCP_KEEPCNT)
+ err = -EINVAL;
+ else
+ tp->keepalive_probes = val;
+ break;
+ case TCP_SYNCNT:
+ if (val < 1 || val > MAX_TCP_SYNCNT)
+ err = -EINVAL;
+ else
+ icsk->icsk_syn_retries = val;
+ break;
+
+ case TCP_LINGER2:
+ if (val < 0)
+ tp->linger2 = -1;
+ else if (val > sysctl_tcp_fin_timeout / HZ)
+ tp->linger2 = 0;
+ else
+ tp->linger2 = val * HZ;
+ break;
+
+ case TCP_DEFER_ACCEPT:
+ /* Translate value in seconds to number of retransmits */
+ icsk->icsk_accept_queue.rskq_defer_accept =
+ secs_to_retrans(val, TCP_TIMEOUT_INIT / HZ,
+ TCP_RTO_MAX / HZ);
+ break;
+
+ case TCP_WINDOW_CLAMP:
+ if (!val) {
+ if (sk->sk_state != TCP_CLOSE) {
+ err = -EINVAL;
+ break;
+ }
+ tp->window_clamp = 0;
+ } else
+ tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
+ SOCK_MIN_RCVBUF / 2 : val;
+ break;
+
+ case TCP_QUICKACK:
+ if (!val) {
+ icsk->icsk_ack.pingpong = 1;
+ } else {
+ icsk->icsk_ack.pingpong = 0;
+ if ((1 << sk->sk_state) &
+ (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
+ inet_csk_ack_scheduled(sk)) {
+ icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
+ tcp_cleanup_rbuf(sk, 1);
+ if (!(val & 1))
+ icsk->icsk_ack.pingpong = 1;
+ }
+ }
+ break;
+
+#ifdef CONFIG_TCP_MD5SIG
+ case TCP_MD5SIG:
+ /* Read the IP->Key mappings from userspace */
+ err = tp->af_specific->md5_parse(sk, optval, optlen);
+ break;
+#endif
+ case TCP_USER_TIMEOUT:
+ /* Cap the max time in ms TCP will retry or probe the window
+ * before giving up and aborting (ETIMEDOUT) a connection.
+ */
+ if (val < 0)
+ err = -EINVAL;
+ else
+ icsk->icsk_user_timeout = msecs_to_jiffies(val);
+ break;
+
+ case TCP_FASTOPEN:
+ if (val >= 0 && ((1 << sk->sk_state) & (TCPF_CLOSE |
+ TCPF_LISTEN))) {
+ tcp_fastopen_init_key_once(true);
+
+ err = fastopen_init_queue(sk, val);
+ } else {
+ err = -EINVAL;
+ }
+ break;
+ case TCP_TIMESTAMP:
+ if (!tp->repair)
+ err = -EPERM;
+ else
+ tp->tsoffset = val - tcp_time_stamp;
+ break;
+ case TCP_NOTSENT_LOWAT:
+ tp->notsent_lowat = val;
+ sk->sk_write_space(sk);
+ break;
+#ifdef CONFIG_TCP_STEALTH
+ case TCP_STEALTH_INTEGRITY_LEN:
+ if (!(tp->stealth.mode & TCP_STEALTH_MODE_AUTH)) {
+ err = -EOPNOTSUPP;
+ } else if (val < 1 || val > USHRT_MAX) {
+ err = -EINVAL;
+ } else {
+ tp->stealth.integrity_len = val;
+ tp->stealth.mode |= TCP_STEALTH_MODE_INTEGRITY_LEN;
+ }
+ break;
+#endif
+ default:
+ err = -ENOPROTOOPT;
+ break;
+ }
+
+ release_sock(sk);
+ return err;
+}
+
+int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
+ unsigned int optlen)
+{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+
+ if (level != SOL_TCP)
+ return icsk->icsk_af_ops->setsockopt(sk, level, optname,
+ optval, optlen);
+ return do_tcp_setsockopt(sk, level, optname, optval, optlen);
+}
+EXPORT_SYMBOL(tcp_setsockopt);
+
+#ifdef CONFIG_COMPAT
+int compat_tcp_setsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, unsigned int optlen)
+{
+ if (level != SOL_TCP)
+ return inet_csk_compat_setsockopt(sk, level, optname,
+ optval, optlen);
+ return do_tcp_setsockopt(sk, level, optname, optval, optlen);
+}
+EXPORT_SYMBOL(compat_tcp_setsockopt);
+#endif
+
+/* Return information about state of tcp endpoint in API format. */
+void tcp_get_info(struct sock *sk, struct tcp_info *info)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+ u32 now = tcp_time_stamp;
+ unsigned int start;
+ u32 rate;
+
+ memset(info, 0, sizeof(*info));
+
+ info->tcpi_state = sk->sk_state;
+ info->tcpi_ca_state = icsk->icsk_ca_state;
+ info->tcpi_retransmits = icsk->icsk_retransmits;
+ info->tcpi_probes = icsk->icsk_probes_out;
+ info->tcpi_backoff = icsk->icsk_backoff;
+
+ if (tp->rx_opt.tstamp_ok)
+ info->tcpi_options |= TCPI_OPT_TIMESTAMPS;
+ if (tcp_is_sack(tp))
+ info->tcpi_options |= TCPI_OPT_SACK;
+ if (tp->rx_opt.wscale_ok) {
+ info->tcpi_options |= TCPI_OPT_WSCALE;
+ info->tcpi_snd_wscale = tp->rx_opt.snd_wscale;
+ info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale;
+ }
+
+ if (tp->ecn_flags & TCP_ECN_OK)
+ info->tcpi_options |= TCPI_OPT_ECN;
+ if (tp->ecn_flags & TCP_ECN_SEEN)
+ info->tcpi_options |= TCPI_OPT_ECN_SEEN;
+ if (tp->syn_data_acked)
+ info->tcpi_options |= TCPI_OPT_SYN_DATA;
+
+ info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto);
+ info->tcpi_ato = jiffies_to_usecs(icsk->icsk_ack.ato);
+ info->tcpi_snd_mss = tp->mss_cache;
+ info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss;
+
+ if (sk->sk_state == TCP_LISTEN) {
+ info->tcpi_unacked = sk->sk_ack_backlog;
+ info->tcpi_sacked = sk->sk_max_ack_backlog;
+ } else {
+ info->tcpi_unacked = tp->packets_out;
+ info->tcpi_sacked = tp->sacked_out;
+ }
+ info->tcpi_lost = tp->lost_out;
+ info->tcpi_retrans = tp->retrans_out;
+ info->tcpi_fackets = tp->fackets_out;
+
+ info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime);
+ info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime);
+ info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp);
+
+ info->tcpi_pmtu = icsk->icsk_pmtu_cookie;
+ info->tcpi_rcv_ssthresh = tp->rcv_ssthresh;
+ info->tcpi_rtt = tp->srtt_us >> 3;
+ info->tcpi_rttvar = tp->mdev_us >> 2;
+ info->tcpi_snd_ssthresh = tp->snd_ssthresh;
+ info->tcpi_snd_cwnd = tp->snd_cwnd;
+ info->tcpi_advmss = tp->advmss;
+ info->tcpi_reordering = tp->reordering;
+
+ info->tcpi_rcv_rtt = jiffies_to_usecs(tp->rcv_rtt_est.rtt)>>3;
+ info->tcpi_rcv_space = tp->rcvq_space.space;
+
+ info->tcpi_total_retrans = tp->total_retrans;
+
+ rate = READ_ONCE(sk->sk_pacing_rate);
+ info->tcpi_pacing_rate = rate != ~0U ? rate : ~0ULL;
+
+ rate = READ_ONCE(sk->sk_max_pacing_rate);
+ info->tcpi_max_pacing_rate = rate != ~0U ? rate : ~0ULL;
+
+ do {
+ start = u64_stats_fetch_begin_irq(&tp->syncp);
+ info->tcpi_bytes_acked = tp->bytes_acked;
+ info->tcpi_bytes_received = tp->bytes_received;
+ } while (u64_stats_fetch_retry_irq(&tp->syncp, start));
+}
+EXPORT_SYMBOL_GPL(tcp_get_info);
+
+static int do_tcp_getsockopt(struct sock *sk, int level,
+ int optname, char __user *optval, int __user *optlen)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+ int val, len;
+
+ if (get_user(len, optlen))
+ return -EFAULT;
+
+ len = min_t(unsigned int, len, sizeof(int));
+
+ if (len < 0)
+ return -EINVAL;
+
+ switch (optname) {
+ case TCP_MAXSEG:
+ val = tp->mss_cache;
+ if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
+ val = tp->rx_opt.user_mss;
+ if (tp->repair)
+ val = tp->rx_opt.mss_clamp;
+ break;
+ case TCP_NODELAY:
+ val = !!(tp->nonagle&TCP_NAGLE_OFF);
+ break;
+ case TCP_CORK:
+ val = !!(tp->nonagle&TCP_NAGLE_CORK);
+ break;
+ case TCP_KEEPIDLE:
+ val = keepalive_time_when(tp) / HZ;
+ break;
+ case TCP_KEEPINTVL:
+ val = keepalive_intvl_when(tp) / HZ;
+ break;
+ case TCP_KEEPCNT:
+ val = keepalive_probes(tp);
+ break;
+ case TCP_SYNCNT:
+ val = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
+ break;
+ case TCP_LINGER2:
+ val = tp->linger2;
+ if (val >= 0)
+ val = (val ? : sysctl_tcp_fin_timeout) / HZ;
+ break;
+ case TCP_DEFER_ACCEPT:
+ val = retrans_to_secs(icsk->icsk_accept_queue.rskq_defer_accept,
+ TCP_TIMEOUT_INIT / HZ, TCP_RTO_MAX / HZ);
+ break;
+ case TCP_WINDOW_CLAMP:
+ val = tp->window_clamp;
+ break;
+ case TCP_INFO: {
+ struct tcp_info info;
+
+ if (get_user(len, optlen))
+ return -EFAULT;
+
+ tcp_get_info(sk, &info);
+
+ len = min_t(unsigned int, len, sizeof(info));
+ if (put_user(len, optlen))
+ return -EFAULT;
+ if (copy_to_user(optval, &info, len))
+ return -EFAULT;
+ return 0;
+ }
+ case TCP_CC_INFO: {
+ const struct tcp_congestion_ops *ca_ops;
+ union tcp_cc_info info;
+ size_t sz = 0;
+ int attr;
+
+ if (get_user(len, optlen))
+ return -EFAULT;
+
+ ca_ops = icsk->icsk_ca_ops;
+ if (ca_ops && ca_ops->get_info)
+ sz = ca_ops->get_info(sk, ~0U, &attr, &info);
+
+ len = min_t(unsigned int, len, sz);
+ if (put_user(len, optlen))
+ return -EFAULT;
+ if (copy_to_user(optval, &info, len))
+ return -EFAULT;
+ return 0;
+ }
+ case TCP_QUICKACK:
+ val = !icsk->icsk_ack.pingpong;
+ break;
+
+ case TCP_CONGESTION:
+ if (get_user(len, optlen))
+ return -EFAULT;
+ len = min_t(unsigned int, len, TCP_CA_NAME_MAX);
+ if (put_user(len, optlen))
+ return -EFAULT;
+ if (copy_to_user(optval, icsk->icsk_ca_ops->name, len))
+ return -EFAULT;
+ return 0;
+
+ case TCP_THIN_LINEAR_TIMEOUTS:
+ val = tp->thin_lto;
+ break;
+ case TCP_THIN_DUPACK:
+ val = tp->thin_dupack;
+ break;
+
+ case TCP_REPAIR:
+ val = tp->repair;
+ break;
+
+ case TCP_REPAIR_QUEUE:
+ if (tp->repair)
+ val = tp->repair_queue;
+ else
+ return -EINVAL;
+ break;
+
+ case TCP_QUEUE_SEQ:
+ if (tp->repair_queue == TCP_SEND_QUEUE)
+ val = tp->write_seq;
+ else if (tp->repair_queue == TCP_RECV_QUEUE)
+ val = tp->rcv_nxt;
+ else
+ return -EINVAL;
+ break;
+
+ case TCP_USER_TIMEOUT:
+ val = jiffies_to_msecs(icsk->icsk_user_timeout);
+ break;
+
+ case TCP_FASTOPEN:
+ if (icsk->icsk_accept_queue.fastopenq)
+ val = icsk->icsk_accept_queue.fastopenq->max_qlen;
+ else
+ val = 0;
+ break;
+
+ case TCP_TIMESTAMP:
+ val = tcp_time_stamp + tp->tsoffset;
+ break;
+ case TCP_NOTSENT_LOWAT:
+ val = tp->notsent_lowat;
+ break;
+ default:
+ return -ENOPROTOOPT;
+ }
+
+ if (put_user(len, optlen))
+ return -EFAULT;
+ if (copy_to_user(optval, &val, len))
+ return -EFAULT;
+ return 0;
+}
+
+int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
+ int __user *optlen)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
+ if (level != SOL_TCP)
+ return icsk->icsk_af_ops->getsockopt(sk, level, optname,
+ optval, optlen);
+ return do_tcp_getsockopt(sk, level, optname, optval, optlen);
+}
+EXPORT_SYMBOL(tcp_getsockopt);
+
+#ifdef CONFIG_COMPAT
+int compat_tcp_getsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, int __user *optlen)
+{
+ if (level != SOL_TCP)
+ return inet_csk_compat_getsockopt(sk, level, optname,
+ optval, optlen);
+ return do_tcp_getsockopt(sk, level, optname, optval, optlen);
+}
+EXPORT_SYMBOL(compat_tcp_getsockopt);
+#endif
+
+#ifdef CONFIG_TCP_MD5SIG
+static DEFINE_PER_CPU(struct tcp_md5sig_pool, tcp_md5sig_pool);
+static DEFINE_MUTEX(tcp_md5sig_mutex);
+static bool tcp_md5sig_pool_populated = false;
+
+static void __tcp_alloc_md5sig_pool(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ if (!per_cpu(tcp_md5sig_pool, cpu).md5_desc.tfm) {
+ struct crypto_hash *hash;
+
+ hash = crypto_alloc_hash("md5", 0, CRYPTO_ALG_ASYNC);
+ if (IS_ERR_OR_NULL(hash))
+ return;
+ per_cpu(tcp_md5sig_pool, cpu).md5_desc.tfm = hash;
+ }
+ }
+ /* before setting tcp_md5sig_pool_populated, we must commit all writes
+ * to memory. See smp_rmb() in tcp_get_md5sig_pool()
+ */
+ smp_wmb();
+ tcp_md5sig_pool_populated = true;
+}
+
+bool tcp_alloc_md5sig_pool(void)
+{
+ if (unlikely(!tcp_md5sig_pool_populated)) {
+ mutex_lock(&tcp_md5sig_mutex);
+
+ if (!tcp_md5sig_pool_populated)
+ __tcp_alloc_md5sig_pool();
+
+ mutex_unlock(&tcp_md5sig_mutex);
+ }
+ return tcp_md5sig_pool_populated;
+}
+EXPORT_SYMBOL(tcp_alloc_md5sig_pool);
+
+
+/**
+ * tcp_get_md5sig_pool - get md5sig_pool for this user
+ *
+ * We use percpu structure, so if we succeed, we exit with preemption
+ * and BH disabled, to make sure another thread or softirq handling
+ * wont try to get same context.
+ */
+struct tcp_md5sig_pool *tcp_get_md5sig_pool(void)
+{
+ local_bh_disable();
+
+ if (tcp_md5sig_pool_populated) {
+ /* coupled with smp_wmb() in __tcp_alloc_md5sig_pool() */
+ smp_rmb();
+ return this_cpu_ptr(&tcp_md5sig_pool);
+ }
+ local_bh_enable();
+ return NULL;
+}
+EXPORT_SYMBOL(tcp_get_md5sig_pool);
+
+int tcp_md5_hash_header(struct tcp_md5sig_pool *hp,
+ const struct tcphdr *th)
+{
+ struct scatterlist sg;
+ struct tcphdr hdr;
+ int err;
+
+ /* We are not allowed to change tcphdr, make a local copy */
+ memcpy(&hdr, th, sizeof(hdr));
+ hdr.check = 0;
+
+ /* options aren't included in the hash */
+ sg_init_one(&sg, &hdr, sizeof(hdr));
+ err = crypto_hash_update(&hp->md5_desc, &sg, sizeof(hdr));
+ return err;
+}
+EXPORT_SYMBOL(tcp_md5_hash_header);
+
+int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp,
+ const struct sk_buff *skb, unsigned int header_len)
+{
+ struct scatterlist sg;
+ const struct tcphdr *tp = tcp_hdr(skb);
+ struct hash_desc *desc = &hp->md5_desc;
+ unsigned int i;
+ const unsigned int head_data_len = skb_headlen(skb) > header_len ?
+ skb_headlen(skb) - header_len : 0;
+ const struct skb_shared_info *shi = skb_shinfo(skb);
+ struct sk_buff *frag_iter;
+
+ sg_init_table(&sg, 1);
+
+ sg_set_buf(&sg, ((u8 *) tp) + header_len, head_data_len);
+ if (crypto_hash_update(desc, &sg, head_data_len))
+ return 1;
+
+ for (i = 0; i < shi->nr_frags; ++i) {
+ const struct skb_frag_struct *f = &shi->frags[i];
+ unsigned int offset = f->page_offset;
+ struct page *page = skb_frag_page(f) + (offset >> PAGE_SHIFT);
+
+ sg_set_page(&sg, page, skb_frag_size(f),
+ offset_in_page(offset));
+ if (crypto_hash_update(desc, &sg, skb_frag_size(f)))
+ return 1;
+ }
+
+ skb_walk_frags(skb, frag_iter)
+ if (tcp_md5_hash_skb_data(hp, frag_iter, 0))
+ return 1;
+
+ return 0;
+}
+EXPORT_SYMBOL(tcp_md5_hash_skb_data);
+
+int tcp_md5_hash_key(struct tcp_md5sig_pool *hp, const struct tcp_md5sig_key *key)
+{
+ struct scatterlist sg;
+
+ sg_init_one(&sg, key->key, key->keylen);
+ return crypto_hash_update(&hp->md5_desc, &sg, key->keylen);
+}
+EXPORT_SYMBOL(tcp_md5_hash_key);
+
+#endif
+
+void tcp_done(struct sock *sk)
+{
+ struct request_sock *req = tcp_sk(sk)->fastopen_rsk;
+
+ if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV)
+ TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
+
+ tcp_set_state(sk, TCP_CLOSE);
+ tcp_clear_xmit_timers(sk);
+ if (req)
+ reqsk_fastopen_remove(sk, req, false);
+
+ sk->sk_shutdown = SHUTDOWN_MASK;
+
+ if (!sock_flag(sk, SOCK_DEAD))
+ sk->sk_state_change(sk);
+ else
+ inet_csk_destroy_sock(sk);
+}
+EXPORT_SYMBOL_GPL(tcp_done);
+
+extern struct tcp_congestion_ops tcp_reno;
+
+static __initdata unsigned long thash_entries;
+static int __init set_thash_entries(char *str)
+{
+ ssize_t ret;
+
+ if (!str)
+ return 0;
+
+ ret = kstrtoul(str, 0, &thash_entries);
+ if (ret)
+ return 0;
+
+ return 1;
+}
+__setup("thash_entries=", set_thash_entries);
+
+static void __init tcp_init_mem(void)
+{
+ unsigned long limit = nr_free_buffer_pages() / 8;
+ limit = max(limit, 128UL);
+ sysctl_tcp_mem[0] = limit / 4 * 3;
+ sysctl_tcp_mem[1] = limit;
+ sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2;
+}
+
+void __init tcp_init(void)
+{
+ unsigned long limit;
+ int max_rshare, max_wshare, cnt;
+ unsigned int i;
+
+ sock_skb_cb_check_size(sizeof(struct tcp_skb_cb));
+
+ percpu_counter_init(&tcp_sockets_allocated, 0, GFP_KERNEL);
+ percpu_counter_init(&tcp_orphan_count, 0, GFP_KERNEL);
+ tcp_hashinfo.bind_bucket_cachep =
+ kmem_cache_create("tcp_bind_bucket",
+ sizeof(struct inet_bind_bucket), 0,
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
+
+ /* Size and allocate the main established and bind bucket
+ * hash tables.
+ *
+ * The methodology is similar to that of the buffer cache.
+ */
+ tcp_hashinfo.ehash =
+ alloc_large_system_hash("TCP established",
+ sizeof(struct inet_ehash_bucket),
+ thash_entries,
+ 17, /* one slot per 128 KB of memory */
+ 0,
+ NULL,
+ &tcp_hashinfo.ehash_mask,
+ 0,
+ thash_entries ? 0 : 512 * 1024);
+ for (i = 0; i <= tcp_hashinfo.ehash_mask; i++)
+ INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].chain, i);
+
+ if (inet_ehash_locks_alloc(&tcp_hashinfo))
+ panic("TCP: failed to alloc ehash_locks");
+ tcp_hashinfo.bhash =
+ alloc_large_system_hash("TCP bind",
+ sizeof(struct inet_bind_hashbucket),
+ tcp_hashinfo.ehash_mask + 1,
+ 17, /* one slot per 128 KB of memory */
+ 0,
+ &tcp_hashinfo.bhash_size,
+ NULL,
+ 0,
+ 64 * 1024);
+ tcp_hashinfo.bhash_size = 1U << tcp_hashinfo.bhash_size;
+ for (i = 0; i < tcp_hashinfo.bhash_size; i++) {
+ spin_lock_init(&tcp_hashinfo.bhash[i].lock);
+ INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain);
+ }
+
+
+ cnt = tcp_hashinfo.ehash_mask + 1;
+
+ tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
+ sysctl_tcp_max_orphans = cnt / 2;
+ sysctl_max_syn_backlog = max(128, cnt / 256);
+
+ tcp_init_mem();
+ /* Set per-socket limits to no more than 1/128 the pressure threshold */
+ limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7);
+ max_wshare = min(4UL*1024*1024, limit);
+ max_rshare = min(6UL*1024*1024, limit);
+
+ sysctl_tcp_wmem[0] = SK_MEM_QUANTUM;
+ sysctl_tcp_wmem[1] = 16*1024;
+ sysctl_tcp_wmem[2] = max(64*1024, max_wshare);
+
+ sysctl_tcp_rmem[0] = SK_MEM_QUANTUM;
+ sysctl_tcp_rmem[1] = 87380;
+ sysctl_tcp_rmem[2] = max(87380, max_rshare);
+
+ pr_info("Hash tables configured (established %u bind %u)\n",
+ tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size);
+
+ tcp_metrics_init();
+ BUG_ON(tcp_register_congestion_control(&tcp_reno) != 0);
+ tcp_tasklet_init();
+}
diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c
new file mode 100644
index 000000000..c037644ea
--- /dev/null
+++ b/net/ipv4/tcp_bic.c
@@ -0,0 +1,239 @@
+/*
+ * Binary Increase Congestion control for TCP
+ * Home page:
+ * http://netsrv.csc.ncsu.edu/twiki/bin/view/Main/BIC
+ * This is from the implementation of BICTCP in
+ * Lison-Xu, Kahaled Harfoush, and Injong Rhee.
+ * "Binary Increase Congestion Control for Fast, Long Distance
+ * Networks" in InfoComm 2004
+ * Available from:
+ * http://netsrv.csc.ncsu.edu/export/bitcp.pdf
+ *
+ * Unless BIC is enabled and congestion window is large
+ * this behaves the same as the original Reno.
+ */
+
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <net/tcp.h>
+
+#define BICTCP_BETA_SCALE 1024 /* Scale factor beta calculation
+ * max_cwnd = snd_cwnd * beta
+ */
+#define BICTCP_B 4 /*
+ * In binary search,
+ * go to point (max+min)/N
+ */
+
+static int fast_convergence = 1;
+static int max_increment = 16;
+static int low_window = 14;
+static int beta = 819; /* = 819/1024 (BICTCP_BETA_SCALE) */
+static int initial_ssthresh;
+static int smooth_part = 20;
+
+module_param(fast_convergence, int, 0644);
+MODULE_PARM_DESC(fast_convergence, "turn on/off fast convergence");
+module_param(max_increment, int, 0644);
+MODULE_PARM_DESC(max_increment, "Limit on increment allowed during binary search");
+module_param(low_window, int, 0644);
+MODULE_PARM_DESC(low_window, "lower bound on congestion window (for TCP friendliness)");
+module_param(beta, int, 0644);
+MODULE_PARM_DESC(beta, "beta for multiplicative increase");
+module_param(initial_ssthresh, int, 0644);
+MODULE_PARM_DESC(initial_ssthresh, "initial value of slow start threshold");
+module_param(smooth_part, int, 0644);
+MODULE_PARM_DESC(smooth_part, "log(B/(B*Smin))/log(B/(B-1))+B, # of RTT from Wmax-B to Wmax");
+
+/* BIC TCP Parameters */
+struct bictcp {
+ u32 cnt; /* increase cwnd by 1 after ACKs */
+ u32 last_max_cwnd; /* last maximum snd_cwnd */
+ u32 loss_cwnd; /* congestion window at last loss */
+ u32 last_cwnd; /* the last snd_cwnd */
+ u32 last_time; /* time when updated last_cwnd */
+ u32 epoch_start; /* beginning of an epoch */
+#define ACK_RATIO_SHIFT 4
+ u32 delayed_ack; /* estimate the ratio of Packets/ACKs << 4 */
+};
+
+static inline void bictcp_reset(struct bictcp *ca)
+{
+ ca->cnt = 0;
+ ca->last_max_cwnd = 0;
+ ca->last_cwnd = 0;
+ ca->last_time = 0;
+ ca->epoch_start = 0;
+ ca->delayed_ack = 2 << ACK_RATIO_SHIFT;
+}
+
+static void bictcp_init(struct sock *sk)
+{
+ struct bictcp *ca = inet_csk_ca(sk);
+
+ bictcp_reset(ca);
+ ca->loss_cwnd = 0;
+
+ if (initial_ssthresh)
+ tcp_sk(sk)->snd_ssthresh = initial_ssthresh;
+}
+
+/*
+ * Compute congestion window to use.
+ */
+static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
+{
+ if (ca->last_cwnd == cwnd &&
+ (s32)(tcp_time_stamp - ca->last_time) <= HZ / 32)
+ return;
+
+ ca->last_cwnd = cwnd;
+ ca->last_time = tcp_time_stamp;
+
+ if (ca->epoch_start == 0) /* record the beginning of an epoch */
+ ca->epoch_start = tcp_time_stamp;
+
+ /* start off normal */
+ if (cwnd <= low_window) {
+ ca->cnt = cwnd;
+ return;
+ }
+
+ /* binary increase */
+ if (cwnd < ca->last_max_cwnd) {
+ __u32 dist = (ca->last_max_cwnd - cwnd)
+ / BICTCP_B;
+
+ if (dist > max_increment)
+ /* linear increase */
+ ca->cnt = cwnd / max_increment;
+ else if (dist <= 1U)
+ /* binary search increase */
+ ca->cnt = (cwnd * smooth_part) / BICTCP_B;
+ else
+ /* binary search increase */
+ ca->cnt = cwnd / dist;
+ } else {
+ /* slow start AMD linear increase */
+ if (cwnd < ca->last_max_cwnd + BICTCP_B)
+ /* slow start */
+ ca->cnt = (cwnd * smooth_part) / BICTCP_B;
+ else if (cwnd < ca->last_max_cwnd + max_increment*(BICTCP_B-1))
+ /* slow start */
+ ca->cnt = (cwnd * (BICTCP_B-1))
+ / (cwnd - ca->last_max_cwnd);
+ else
+ /* linear increase */
+ ca->cnt = cwnd / max_increment;
+ }
+
+ /* if in slow start or link utilization is very low */
+ if (ca->last_max_cwnd == 0) {
+ if (ca->cnt > 20) /* increase cwnd 5% per RTT */
+ ca->cnt = 20;
+ }
+
+ ca->cnt = (ca->cnt << ACK_RATIO_SHIFT) / ca->delayed_ack;
+ if (ca->cnt == 0) /* cannot be zero */
+ ca->cnt = 1;
+}
+
+static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct bictcp *ca = inet_csk_ca(sk);
+
+ if (!tcp_is_cwnd_limited(sk))
+ return;
+
+ if (tp->snd_cwnd <= tp->snd_ssthresh)
+ tcp_slow_start(tp, acked);
+ else {
+ bictcp_update(ca, tp->snd_cwnd);
+ tcp_cong_avoid_ai(tp, ca->cnt, 1);
+ }
+}
+
+/*
+ * behave like Reno until low_window is reached,
+ * then increase congestion window slowly
+ */
+static u32 bictcp_recalc_ssthresh(struct sock *sk)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ struct bictcp *ca = inet_csk_ca(sk);
+
+ ca->epoch_start = 0; /* end of epoch */
+
+ /* Wmax and fast convergence */
+ if (tp->snd_cwnd < ca->last_max_cwnd && fast_convergence)
+ ca->last_max_cwnd = (tp->snd_cwnd * (BICTCP_BETA_SCALE + beta))
+ / (2 * BICTCP_BETA_SCALE);
+ else
+ ca->last_max_cwnd = tp->snd_cwnd;
+
+ ca->loss_cwnd = tp->snd_cwnd;
+
+ if (tp->snd_cwnd <= low_window)
+ return max(tp->snd_cwnd >> 1U, 2U);
+ else
+ return max((tp->snd_cwnd * beta) / BICTCP_BETA_SCALE, 2U);
+}
+
+static u32 bictcp_undo_cwnd(struct sock *sk)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ const struct bictcp *ca = inet_csk_ca(sk);
+
+ return max(tp->snd_cwnd, ca->loss_cwnd);
+}
+
+static void bictcp_state(struct sock *sk, u8 new_state)
+{
+ if (new_state == TCP_CA_Loss)
+ bictcp_reset(inet_csk_ca(sk));
+}
+
+/* Track delayed acknowledgment ratio using sliding window
+ * ratio = (15*ratio + sample) / 16
+ */
+static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt)
+{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+
+ if (icsk->icsk_ca_state == TCP_CA_Open) {
+ struct bictcp *ca = inet_csk_ca(sk);
+
+ cnt -= ca->delayed_ack >> ACK_RATIO_SHIFT;
+ ca->delayed_ack += cnt;
+ }
+}
+
+static struct tcp_congestion_ops bictcp __read_mostly = {
+ .init = bictcp_init,
+ .ssthresh = bictcp_recalc_ssthresh,
+ .cong_avoid = bictcp_cong_avoid,
+ .set_state = bictcp_state,
+ .undo_cwnd = bictcp_undo_cwnd,
+ .pkts_acked = bictcp_acked,
+ .owner = THIS_MODULE,
+ .name = "bic",
+};
+
+static int __init bictcp_register(void)
+{
+ BUILD_BUG_ON(sizeof(struct bictcp) > ICSK_CA_PRIV_SIZE);
+ return tcp_register_congestion_control(&bictcp);
+}
+
+static void __exit bictcp_unregister(void)
+{
+ tcp_unregister_congestion_control(&bictcp);
+}
+
+module_init(bictcp_register);
+module_exit(bictcp_unregister);
+
+MODULE_AUTHOR("Stephen Hemminger");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("BIC TCP");
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
new file mode 100644
index 000000000..84be008c9
--- /dev/null
+++ b/net/ipv4/tcp_cong.c
@@ -0,0 +1,441 @@
+/*
+ * Pluggable TCP congestion control support and newReno
+ * congestion control.
+ * Based on ideas from I/O scheduler support and Web100.
+ *
+ * Copyright (C) 2005 Stephen Hemminger <shemminger@osdl.org>
+ */
+
+#define pr_fmt(fmt) "TCP: " fmt
+
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/gfp.h>
+#include <linux/jhash.h>
+#include <net/tcp.h>
+
+static DEFINE_SPINLOCK(tcp_cong_list_lock);
+static LIST_HEAD(tcp_cong_list);
+
+/* Simple linear search, don't expect many entries! */
+static struct tcp_congestion_ops *tcp_ca_find(const char *name)
+{
+ struct tcp_congestion_ops *e;
+
+ list_for_each_entry_rcu(e, &tcp_cong_list, list) {
+ if (strcmp(e->name, name) == 0)
+ return e;
+ }
+
+ return NULL;
+}
+
+/* Must be called with rcu lock held */
+static const struct tcp_congestion_ops *__tcp_ca_find_autoload(const char *name)
+{
+ const struct tcp_congestion_ops *ca = tcp_ca_find(name);
+#ifdef CONFIG_MODULES
+ if (!ca && capable(CAP_NET_ADMIN)) {
+ rcu_read_unlock();
+ request_module("tcp_%s", name);
+ rcu_read_lock();
+ ca = tcp_ca_find(name);
+ }
+#endif
+ return ca;
+}
+
+/* Simple linear search, not much in here. */
+struct tcp_congestion_ops *tcp_ca_find_key(u32 key)
+{
+ struct tcp_congestion_ops *e;
+
+ list_for_each_entry_rcu(e, &tcp_cong_list, list) {
+ if (e->key == key)
+ return e;
+ }
+
+ return NULL;
+}
+
+/*
+ * Attach new congestion control algorithm to the list
+ * of available options.
+ */
+int tcp_register_congestion_control(struct tcp_congestion_ops *ca)
+{
+ int ret = 0;
+
+ /* all algorithms must implement ssthresh and cong_avoid ops */
+ if (!ca->ssthresh || !ca->cong_avoid) {
+ pr_err("%s does not implement required ops\n", ca->name);
+ return -EINVAL;
+ }
+
+ ca->key = jhash(ca->name, sizeof(ca->name), strlen(ca->name));
+
+ spin_lock(&tcp_cong_list_lock);
+ if (ca->key == TCP_CA_UNSPEC || tcp_ca_find_key(ca->key)) {
+ pr_notice("%s already registered or non-unique key\n",
+ ca->name);
+ ret = -EEXIST;
+ } else {
+ list_add_tail_rcu(&ca->list, &tcp_cong_list);
+ pr_debug("%s registered\n", ca->name);
+ }
+ spin_unlock(&tcp_cong_list_lock);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(tcp_register_congestion_control);
+
+/*
+ * Remove congestion control algorithm, called from
+ * the module's remove function. Module ref counts are used
+ * to ensure that this can't be done till all sockets using
+ * that method are closed.
+ */
+void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca)
+{
+ spin_lock(&tcp_cong_list_lock);
+ list_del_rcu(&ca->list);
+ spin_unlock(&tcp_cong_list_lock);
+
+ /* Wait for outstanding readers to complete before the
+ * module gets removed entirely.
+ *
+ * A try_module_get() should fail by now as our module is
+ * in "going" state since no refs are held anymore and
+ * module_exit() handler being called.
+ */
+ synchronize_rcu();
+}
+EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control);
+
+u32 tcp_ca_get_key_by_name(const char *name)
+{
+ const struct tcp_congestion_ops *ca;
+ u32 key;
+
+ might_sleep();
+
+ rcu_read_lock();
+ ca = __tcp_ca_find_autoload(name);
+ key = ca ? ca->key : TCP_CA_UNSPEC;
+ rcu_read_unlock();
+
+ return key;
+}
+EXPORT_SYMBOL_GPL(tcp_ca_get_key_by_name);
+
+char *tcp_ca_get_name_by_key(u32 key, char *buffer)
+{
+ const struct tcp_congestion_ops *ca;
+ char *ret = NULL;
+
+ rcu_read_lock();
+ ca = tcp_ca_find_key(key);
+ if (ca)
+ ret = strncpy(buffer, ca->name,
+ TCP_CA_NAME_MAX);
+ rcu_read_unlock();
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(tcp_ca_get_name_by_key);
+
+/* Assign choice of congestion control. */
+void tcp_assign_congestion_control(struct sock *sk)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct tcp_congestion_ops *ca;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(ca, &tcp_cong_list, list) {
+ if (likely(try_module_get(ca->owner))) {
+ icsk->icsk_ca_ops = ca;
+ goto out;
+ }
+ /* Fallback to next available. The last really
+ * guaranteed fallback is Reno from this list.
+ */
+ }
+out:
+ rcu_read_unlock();
+
+ /* Clear out private data before diag gets it and
+ * the ca has not been initialized.
+ */
+ if (ca->get_info)
+ memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv));
+}
+
+void tcp_init_congestion_control(struct sock *sk)
+{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+
+ if (icsk->icsk_ca_ops->init)
+ icsk->icsk_ca_ops->init(sk);
+}
+
+static void tcp_reinit_congestion_control(struct sock *sk,
+ const struct tcp_congestion_ops *ca)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
+ tcp_cleanup_congestion_control(sk);
+ icsk->icsk_ca_ops = ca;
+ icsk->icsk_ca_setsockopt = 1;
+
+ if (sk->sk_state != TCP_CLOSE && icsk->icsk_ca_ops->init)
+ icsk->icsk_ca_ops->init(sk);
+}
+
+/* Manage refcounts on socket close. */
+void tcp_cleanup_congestion_control(struct sock *sk)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
+ if (icsk->icsk_ca_ops->release)
+ icsk->icsk_ca_ops->release(sk);
+ module_put(icsk->icsk_ca_ops->owner);
+}
+
+/* Used by sysctl to change default congestion control */
+int tcp_set_default_congestion_control(const char *name)
+{
+ struct tcp_congestion_ops *ca;
+ int ret = -ENOENT;
+
+ spin_lock(&tcp_cong_list_lock);
+ ca = tcp_ca_find(name);
+#ifdef CONFIG_MODULES
+ if (!ca && capable(CAP_NET_ADMIN)) {
+ spin_unlock(&tcp_cong_list_lock);
+
+ request_module("tcp_%s", name);
+ spin_lock(&tcp_cong_list_lock);
+ ca = tcp_ca_find(name);
+ }
+#endif
+
+ if (ca) {
+ ca->flags |= TCP_CONG_NON_RESTRICTED; /* default is always allowed */
+ list_move(&ca->list, &tcp_cong_list);
+ ret = 0;
+ }
+ spin_unlock(&tcp_cong_list_lock);
+
+ return ret;
+}
+
+/* Set default value from kernel configuration at bootup */
+static int __init tcp_congestion_default(void)
+{
+ return tcp_set_default_congestion_control(CONFIG_DEFAULT_TCP_CONG);
+}
+late_initcall(tcp_congestion_default);
+
+/* Build string with list of available congestion control values */
+void tcp_get_available_congestion_control(char *buf, size_t maxlen)
+{
+ struct tcp_congestion_ops *ca;
+ size_t offs = 0;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(ca, &tcp_cong_list, list) {
+ offs += snprintf(buf + offs, maxlen - offs,
+ "%s%s",
+ offs == 0 ? "" : " ", ca->name);
+ }
+ rcu_read_unlock();
+}
+
+/* Get current default congestion control */
+void tcp_get_default_congestion_control(char *name)
+{
+ struct tcp_congestion_ops *ca;
+ /* We will always have reno... */
+ BUG_ON(list_empty(&tcp_cong_list));
+
+ rcu_read_lock();
+ ca = list_entry(tcp_cong_list.next, struct tcp_congestion_ops, list);
+ strncpy(name, ca->name, TCP_CA_NAME_MAX);
+ rcu_read_unlock();
+}
+
+/* Built list of non-restricted congestion control values */
+void tcp_get_allowed_congestion_control(char *buf, size_t maxlen)
+{
+ struct tcp_congestion_ops *ca;
+ size_t offs = 0;
+
+ *buf = '\0';
+ rcu_read_lock();
+ list_for_each_entry_rcu(ca, &tcp_cong_list, list) {
+ if (!(ca->flags & TCP_CONG_NON_RESTRICTED))
+ continue;
+ offs += snprintf(buf + offs, maxlen - offs,
+ "%s%s",
+ offs == 0 ? "" : " ", ca->name);
+ }
+ rcu_read_unlock();
+}
+
+/* Change list of non-restricted congestion control */
+int tcp_set_allowed_congestion_control(char *val)
+{
+ struct tcp_congestion_ops *ca;
+ char *saved_clone, *clone, *name;
+ int ret = 0;
+
+ saved_clone = clone = kstrdup(val, GFP_USER);
+ if (!clone)
+ return -ENOMEM;
+
+ spin_lock(&tcp_cong_list_lock);
+ /* pass 1 check for bad entries */
+ while ((name = strsep(&clone, " ")) && *name) {
+ ca = tcp_ca_find(name);
+ if (!ca) {
+ ret = -ENOENT;
+ goto out;
+ }
+ }
+
+ /* pass 2 clear old values */
+ list_for_each_entry_rcu(ca, &tcp_cong_list, list)
+ ca->flags &= ~TCP_CONG_NON_RESTRICTED;
+
+ /* pass 3 mark as allowed */
+ while ((name = strsep(&val, " ")) && *name) {
+ ca = tcp_ca_find(name);
+ WARN_ON(!ca);
+ if (ca)
+ ca->flags |= TCP_CONG_NON_RESTRICTED;
+ }
+out:
+ spin_unlock(&tcp_cong_list_lock);
+ kfree(saved_clone);
+
+ return ret;
+}
+
+/* Change congestion control for socket */
+int tcp_set_congestion_control(struct sock *sk, const char *name)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ const struct tcp_congestion_ops *ca;
+ int err = 0;
+
+ if (icsk->icsk_ca_dst_locked)
+ return -EPERM;
+
+ rcu_read_lock();
+ ca = __tcp_ca_find_autoload(name);
+ /* No change asking for existing value */
+ if (ca == icsk->icsk_ca_ops) {
+ icsk->icsk_ca_setsockopt = 1;
+ goto out;
+ }
+ if (!ca)
+ err = -ENOENT;
+ else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) ||
+ ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)))
+ err = -EPERM;
+ else if (!try_module_get(ca->owner))
+ err = -EBUSY;
+ else
+ tcp_reinit_congestion_control(sk, ca);
+ out:
+ rcu_read_unlock();
+ return err;
+}
+
+/* Slow start is used when congestion window is no greater than the slow start
+ * threshold. We base on RFC2581 and also handle stretch ACKs properly.
+ * We do not implement RFC3465 Appropriate Byte Counting (ABC) per se but
+ * something better;) a packet is only considered (s)acked in its entirety to
+ * defend the ACK attacks described in the RFC. Slow start processes a stretch
+ * ACK of degree N as if N acks of degree 1 are received back to back except
+ * ABC caps N to 2. Slow start exits when cwnd grows over ssthresh and
+ * returns the leftover acks to adjust cwnd in congestion avoidance mode.
+ */
+u32 tcp_slow_start(struct tcp_sock *tp, u32 acked)
+{
+ u32 cwnd = tp->snd_cwnd + acked;
+
+ if (cwnd > tp->snd_ssthresh)
+ cwnd = tp->snd_ssthresh + 1;
+ acked -= cwnd - tp->snd_cwnd;
+ tp->snd_cwnd = min(cwnd, tp->snd_cwnd_clamp);
+
+ return acked;
+}
+EXPORT_SYMBOL_GPL(tcp_slow_start);
+
+/* In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd (or alternative w),
+ * for every packet that was ACKed.
+ */
+void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked)
+{
+ /* If credits accumulated at a higher w, apply them gently now. */
+ if (tp->snd_cwnd_cnt >= w) {
+ tp->snd_cwnd_cnt = 0;
+ tp->snd_cwnd++;
+ }
+
+ tp->snd_cwnd_cnt += acked;
+ if (tp->snd_cwnd_cnt >= w) {
+ u32 delta = tp->snd_cwnd_cnt / w;
+
+ tp->snd_cwnd_cnt -= delta * w;
+ tp->snd_cwnd += delta;
+ }
+ tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_cwnd_clamp);
+}
+EXPORT_SYMBOL_GPL(tcp_cong_avoid_ai);
+
+/*
+ * TCP Reno congestion control
+ * This is special case used for fallback as well.
+ */
+/* This is Jacobson's slow start and congestion avoidance.
+ * SIGCOMM '88, p. 328.
+ */
+void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (!tcp_is_cwnd_limited(sk))
+ return;
+
+ /* In "safe" area, increase. */
+ if (tp->snd_cwnd <= tp->snd_ssthresh) {
+ acked = tcp_slow_start(tp, acked);
+ if (!acked)
+ return;
+ }
+ /* In dangerous area, increase slowly. */
+ tcp_cong_avoid_ai(tp, tp->snd_cwnd, acked);
+}
+EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid);
+
+/* Slow start threshold is half the congestion window (min 2) */
+u32 tcp_reno_ssthresh(struct sock *sk)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+
+ return max(tp->snd_cwnd >> 1U, 2U);
+}
+EXPORT_SYMBOL_GPL(tcp_reno_ssthresh);
+
+struct tcp_congestion_ops tcp_reno = {
+ .flags = TCP_CONG_NON_RESTRICTED,
+ .name = "reno",
+ .owner = THIS_MODULE,
+ .ssthresh = tcp_reno_ssthresh,
+ .cong_avoid = tcp_reno_cong_avoid,
+};
diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c
new file mode 100644
index 000000000..06d3d665a
--- /dev/null
+++ b/net/ipv4/tcp_cubic.c
@@ -0,0 +1,504 @@
+/*
+ * TCP CUBIC: Binary Increase Congestion control for TCP v2.3
+ * Home page:
+ * http://netsrv.csc.ncsu.edu/twiki/bin/view/Main/BIC
+ * This is from the implementation of CUBIC TCP in
+ * Sangtae Ha, Injong Rhee and Lisong Xu,
+ * "CUBIC: A New TCP-Friendly High-Speed TCP Variant"
+ * in ACM SIGOPS Operating System Review, July 2008.
+ * Available from:
+ * http://netsrv.csc.ncsu.edu/export/cubic_a_new_tcp_2008.pdf
+ *
+ * CUBIC integrates a new slow start algorithm, called HyStart.
+ * The details of HyStart are presented in
+ * Sangtae Ha and Injong Rhee,
+ * "Taming the Elephants: New TCP Slow Start", NCSU TechReport 2008.
+ * Available from:
+ * http://netsrv.csc.ncsu.edu/export/hystart_techreport_2008.pdf
+ *
+ * All testing results are available from:
+ * http://netsrv.csc.ncsu.edu/wiki/index.php/TCP_Testing
+ *
+ * Unless CUBIC is enabled and congestion window is large
+ * this behaves the same as the original Reno.
+ */
+
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/math64.h>
+#include <net/tcp.h>
+
+#define BICTCP_BETA_SCALE 1024 /* Scale factor beta calculation
+ * max_cwnd = snd_cwnd * beta
+ */
+#define BICTCP_HZ 10 /* BIC HZ 2^10 = 1024 */
+
+/* Two methods of hybrid slow start */
+#define HYSTART_ACK_TRAIN 0x1
+#define HYSTART_DELAY 0x2
+
+/* Number of delay samples for detecting the increase of delay */
+#define HYSTART_MIN_SAMPLES 8
+#define HYSTART_DELAY_MIN (4U<<3)
+#define HYSTART_DELAY_MAX (16U<<3)
+#define HYSTART_DELAY_THRESH(x) clamp(x, HYSTART_DELAY_MIN, HYSTART_DELAY_MAX)
+
+static int fast_convergence __read_mostly = 1;
+static int beta __read_mostly = 717; /* = 717/1024 (BICTCP_BETA_SCALE) */
+static int initial_ssthresh __read_mostly;
+static int bic_scale __read_mostly = 41;
+static int tcp_friendliness __read_mostly = 1;
+
+static int hystart __read_mostly = 1;
+static int hystart_detect __read_mostly = HYSTART_ACK_TRAIN | HYSTART_DELAY;
+static int hystart_low_window __read_mostly = 16;
+static int hystart_ack_delta __read_mostly = 2;
+
+static u32 cube_rtt_scale __read_mostly;
+static u32 beta_scale __read_mostly;
+static u64 cube_factor __read_mostly;
+
+/* Note parameters that are used for precomputing scale factors are read-only */
+module_param(fast_convergence, int, 0644);
+MODULE_PARM_DESC(fast_convergence, "turn on/off fast convergence");
+module_param(beta, int, 0644);
+MODULE_PARM_DESC(beta, "beta for multiplicative increase");
+module_param(initial_ssthresh, int, 0644);
+MODULE_PARM_DESC(initial_ssthresh, "initial value of slow start threshold");
+module_param(bic_scale, int, 0444);
+MODULE_PARM_DESC(bic_scale, "scale (scaled by 1024) value for bic function (bic_scale/1024)");
+module_param(tcp_friendliness, int, 0644);
+MODULE_PARM_DESC(tcp_friendliness, "turn on/off tcp friendliness");
+module_param(hystart, int, 0644);
+MODULE_PARM_DESC(hystart, "turn on/off hybrid slow start algorithm");
+module_param(hystart_detect, int, 0644);
+MODULE_PARM_DESC(hystart_detect, "hyrbrid slow start detection mechanisms"
+ " 1: packet-train 2: delay 3: both packet-train and delay");
+module_param(hystart_low_window, int, 0644);
+MODULE_PARM_DESC(hystart_low_window, "lower bound cwnd for hybrid slow start");
+module_param(hystart_ack_delta, int, 0644);
+MODULE_PARM_DESC(hystart_ack_delta, "spacing between ack's indicating train (msecs)");
+
+/* BIC TCP Parameters */
+struct bictcp {
+ u32 cnt; /* increase cwnd by 1 after ACKs */
+ u32 last_max_cwnd; /* last maximum snd_cwnd */
+ u32 loss_cwnd; /* congestion window at last loss */
+ u32 last_cwnd; /* the last snd_cwnd */
+ u32 last_time; /* time when updated last_cwnd */
+ u32 bic_origin_point;/* origin point of bic function */
+ u32 bic_K; /* time to origin point
+ from the beginning of the current epoch */
+ u32 delay_min; /* min delay (msec << 3) */
+ u32 epoch_start; /* beginning of an epoch */
+ u32 ack_cnt; /* number of acks */
+ u32 tcp_cwnd; /* estimated tcp cwnd */
+ u16 unused;
+ u8 sample_cnt; /* number of samples to decide curr_rtt */
+ u8 found; /* the exit point is found? */
+ u32 round_start; /* beginning of each round */
+ u32 end_seq; /* end_seq of the round */
+ u32 last_ack; /* last time when the ACK spacing is close */
+ u32 curr_rtt; /* the minimum rtt of current round */
+};
+
+static inline void bictcp_reset(struct bictcp *ca)
+{
+ ca->cnt = 0;
+ ca->last_max_cwnd = 0;
+ ca->last_cwnd = 0;
+ ca->last_time = 0;
+ ca->bic_origin_point = 0;
+ ca->bic_K = 0;
+ ca->delay_min = 0;
+ ca->epoch_start = 0;
+ ca->ack_cnt = 0;
+ ca->tcp_cwnd = 0;
+ ca->found = 0;
+}
+
+static inline u32 bictcp_clock(void)
+{
+#if HZ < 1000
+ return ktime_to_ms(ktime_get_real());
+#else
+ return jiffies_to_msecs(jiffies);
+#endif
+}
+
+static inline void bictcp_hystart_reset(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct bictcp *ca = inet_csk_ca(sk);
+
+ ca->round_start = ca->last_ack = bictcp_clock();
+ ca->end_seq = tp->snd_nxt;
+ ca->curr_rtt = 0;
+ ca->sample_cnt = 0;
+}
+
+static void bictcp_init(struct sock *sk)
+{
+ struct bictcp *ca = inet_csk_ca(sk);
+
+ bictcp_reset(ca);
+ ca->loss_cwnd = 0;
+
+ if (hystart)
+ bictcp_hystart_reset(sk);
+
+ if (!hystart && initial_ssthresh)
+ tcp_sk(sk)->snd_ssthresh = initial_ssthresh;
+}
+
+/* calculate the cubic root of x using a table lookup followed by one
+ * Newton-Raphson iteration.
+ * Avg err ~= 0.195%
+ */
+static u32 cubic_root(u64 a)
+{
+ u32 x, b, shift;
+ /*
+ * cbrt(x) MSB values for x MSB values in [0..63].
+ * Precomputed then refined by hand - Willy Tarreau
+ *
+ * For x in [0..63],
+ * v = cbrt(x << 18) - 1
+ * cbrt(x) = (v[x] + 10) >> 6
+ */
+ static const u8 v[] = {
+ /* 0x00 */ 0, 54, 54, 54, 118, 118, 118, 118,
+ /* 0x08 */ 123, 129, 134, 138, 143, 147, 151, 156,
+ /* 0x10 */ 157, 161, 164, 168, 170, 173, 176, 179,
+ /* 0x18 */ 181, 185, 187, 190, 192, 194, 197, 199,
+ /* 0x20 */ 200, 202, 204, 206, 209, 211, 213, 215,
+ /* 0x28 */ 217, 219, 221, 222, 224, 225, 227, 229,
+ /* 0x30 */ 231, 232, 234, 236, 237, 239, 240, 242,
+ /* 0x38 */ 244, 245, 246, 248, 250, 251, 252, 254,
+ };
+
+ b = fls64(a);
+ if (b < 7) {
+ /* a in [0..63] */
+ return ((u32)v[(u32)a] + 35) >> 6;
+ }
+
+ b = ((b * 84) >> 8) - 1;
+ shift = (a >> (b * 3));
+
+ x = ((u32)(((u32)v[shift] + 10) << b)) >> 6;
+
+ /*
+ * Newton-Raphson iteration
+ * 2
+ * x = ( 2 * x + a / x ) / 3
+ * k+1 k k
+ */
+ x = (2 * x + (u32)div64_u64(a, (u64)x * (u64)(x - 1)));
+ x = ((x * 341) >> 10);
+ return x;
+}
+
+/*
+ * Compute congestion window to use.
+ */
+static inline void bictcp_update(struct bictcp *ca, u32 cwnd, u32 acked)
+{
+ u32 delta, bic_target, max_cnt;
+ u64 offs, t;
+
+ ca->ack_cnt += acked; /* count the number of ACKed packets */
+
+ if (ca->last_cwnd == cwnd &&
+ (s32)(tcp_time_stamp - ca->last_time) <= HZ / 32)
+ return;
+
+ /* The CUBIC function can update ca->cnt at most once per jiffy.
+ * On all cwnd reduction events, ca->epoch_start is set to 0,
+ * which will force a recalculation of ca->cnt.
+ */
+ if (ca->epoch_start && tcp_time_stamp == ca->last_time)
+ goto tcp_friendliness;
+
+ ca->last_cwnd = cwnd;
+ ca->last_time = tcp_time_stamp;
+
+ if (ca->epoch_start == 0) {
+ ca->epoch_start = tcp_time_stamp; /* record beginning */
+ ca->ack_cnt = acked; /* start counting */
+ ca->tcp_cwnd = cwnd; /* syn with cubic */
+
+ if (ca->last_max_cwnd <= cwnd) {
+ ca->bic_K = 0;
+ ca->bic_origin_point = cwnd;
+ } else {
+ /* Compute new K based on
+ * (wmax-cwnd) * (srtt>>3 / HZ) / c * 2^(3*bictcp_HZ)
+ */
+ ca->bic_K = cubic_root(cube_factor
+ * (ca->last_max_cwnd - cwnd));
+ ca->bic_origin_point = ca->last_max_cwnd;
+ }
+ }
+
+ /* cubic function - calc*/
+ /* calculate c * time^3 / rtt,
+ * while considering overflow in calculation of time^3
+ * (so time^3 is done by using 64 bit)
+ * and without the support of division of 64bit numbers
+ * (so all divisions are done by using 32 bit)
+ * also NOTE the unit of those veriables
+ * time = (t - K) / 2^bictcp_HZ
+ * c = bic_scale >> 10
+ * rtt = (srtt >> 3) / HZ
+ * !!! The following code does not have overflow problems,
+ * if the cwnd < 1 million packets !!!
+ */
+
+ t = (s32)(tcp_time_stamp - ca->epoch_start);
+ t += msecs_to_jiffies(ca->delay_min >> 3);
+ /* change the unit from HZ to bictcp_HZ */
+ t <<= BICTCP_HZ;
+ do_div(t, HZ);
+
+ if (t < ca->bic_K) /* t - K */
+ offs = ca->bic_K - t;
+ else
+ offs = t - ca->bic_K;
+
+ /* c/rtt * (t-K)^3 */
+ delta = (cube_rtt_scale * offs * offs * offs) >> (10+3*BICTCP_HZ);
+ if (t < ca->bic_K) /* below origin*/
+ bic_target = ca->bic_origin_point - delta;
+ else /* above origin*/
+ bic_target = ca->bic_origin_point + delta;
+
+ /* cubic function - calc bictcp_cnt*/
+ if (bic_target > cwnd) {
+ ca->cnt = cwnd / (bic_target - cwnd);
+ } else {
+ ca->cnt = 100 * cwnd; /* very small increment*/
+ }
+
+ /*
+ * The initial growth of cubic function may be too conservative
+ * when the available bandwidth is still unknown.
+ */
+ if (ca->last_max_cwnd == 0 && ca->cnt > 20)
+ ca->cnt = 20; /* increase cwnd 5% per RTT */
+
+tcp_friendliness:
+ /* TCP Friendly */
+ if (tcp_friendliness) {
+ u32 scale = beta_scale;
+
+ delta = (cwnd * scale) >> 3;
+ while (ca->ack_cnt > delta) { /* update tcp cwnd */
+ ca->ack_cnt -= delta;
+ ca->tcp_cwnd++;
+ }
+
+ if (ca->tcp_cwnd > cwnd) { /* if bic is slower than tcp */
+ delta = ca->tcp_cwnd - cwnd;
+ max_cnt = cwnd / delta;
+ if (ca->cnt > max_cnt)
+ ca->cnt = max_cnt;
+ }
+ }
+
+ /* The maximum rate of cwnd increase CUBIC allows is 1 packet per
+ * 2 packets ACKed, meaning cwnd grows at 1.5x per RTT.
+ */
+ ca->cnt = max(ca->cnt, 2U);
+}
+
+static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct bictcp *ca = inet_csk_ca(sk);
+
+ if (!tcp_is_cwnd_limited(sk))
+ return;
+
+ if (tp->snd_cwnd <= tp->snd_ssthresh) {
+ if (hystart && after(ack, ca->end_seq))
+ bictcp_hystart_reset(sk);
+ acked = tcp_slow_start(tp, acked);
+ if (!acked)
+ return;
+ }
+ bictcp_update(ca, tp->snd_cwnd, acked);
+ tcp_cong_avoid_ai(tp, ca->cnt, acked);
+}
+
+static u32 bictcp_recalc_ssthresh(struct sock *sk)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ struct bictcp *ca = inet_csk_ca(sk);
+
+ ca->epoch_start = 0; /* end of epoch */
+
+ /* Wmax and fast convergence */
+ if (tp->snd_cwnd < ca->last_max_cwnd && fast_convergence)
+ ca->last_max_cwnd = (tp->snd_cwnd * (BICTCP_BETA_SCALE + beta))
+ / (2 * BICTCP_BETA_SCALE);
+ else
+ ca->last_max_cwnd = tp->snd_cwnd;
+
+ ca->loss_cwnd = tp->snd_cwnd;
+
+ return max((tp->snd_cwnd * beta) / BICTCP_BETA_SCALE, 2U);
+}
+
+static u32 bictcp_undo_cwnd(struct sock *sk)
+{
+ struct bictcp *ca = inet_csk_ca(sk);
+
+ return max(tcp_sk(sk)->snd_cwnd, ca->loss_cwnd);
+}
+
+static void bictcp_state(struct sock *sk, u8 new_state)
+{
+ if (new_state == TCP_CA_Loss) {
+ bictcp_reset(inet_csk_ca(sk));
+ bictcp_hystart_reset(sk);
+ }
+}
+
+static void hystart_update(struct sock *sk, u32 delay)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct bictcp *ca = inet_csk_ca(sk);
+
+ if (ca->found & hystart_detect)
+ return;
+
+ if (hystart_detect & HYSTART_ACK_TRAIN) {
+ u32 now = bictcp_clock();
+
+ /* first detection parameter - ack-train detection */
+ if ((s32)(now - ca->last_ack) <= hystart_ack_delta) {
+ ca->last_ack = now;
+ if ((s32)(now - ca->round_start) > ca->delay_min >> 4) {
+ ca->found |= HYSTART_ACK_TRAIN;
+ NET_INC_STATS_BH(sock_net(sk),
+ LINUX_MIB_TCPHYSTARTTRAINDETECT);
+ NET_ADD_STATS_BH(sock_net(sk),
+ LINUX_MIB_TCPHYSTARTTRAINCWND,
+ tp->snd_cwnd);
+ tp->snd_ssthresh = tp->snd_cwnd;
+ }
+ }
+ }
+
+ if (hystart_detect & HYSTART_DELAY) {
+ /* obtain the minimum delay of more than sampling packets */
+ if (ca->sample_cnt < HYSTART_MIN_SAMPLES) {
+ if (ca->curr_rtt == 0 || ca->curr_rtt > delay)
+ ca->curr_rtt = delay;
+
+ ca->sample_cnt++;
+ } else {
+ if (ca->curr_rtt > ca->delay_min +
+ HYSTART_DELAY_THRESH(ca->delay_min >> 3)) {
+ ca->found |= HYSTART_DELAY;
+ NET_INC_STATS_BH(sock_net(sk),
+ LINUX_MIB_TCPHYSTARTDELAYDETECT);
+ NET_ADD_STATS_BH(sock_net(sk),
+ LINUX_MIB_TCPHYSTARTDELAYCWND,
+ tp->snd_cwnd);
+ tp->snd_ssthresh = tp->snd_cwnd;
+ }
+ }
+ }
+}
+
+/* Track delayed acknowledgment ratio using sliding window
+ * ratio = (15*ratio + sample) / 16
+ */
+static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt_us)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ struct bictcp *ca = inet_csk_ca(sk);
+ u32 delay;
+
+ /* Some calls are for duplicates without timetamps */
+ if (rtt_us < 0)
+ return;
+
+ /* Discard delay samples right after fast recovery */
+ if (ca->epoch_start && (s32)(tcp_time_stamp - ca->epoch_start) < HZ)
+ return;
+
+ delay = (rtt_us << 3) / USEC_PER_MSEC;
+ if (delay == 0)
+ delay = 1;
+
+ /* first time call or link delay decreases */
+ if (ca->delay_min == 0 || ca->delay_min > delay)
+ ca->delay_min = delay;
+
+ /* hystart triggers when cwnd is larger than some threshold */
+ if (hystart && tp->snd_cwnd <= tp->snd_ssthresh &&
+ tp->snd_cwnd >= hystart_low_window)
+ hystart_update(sk, delay);
+}
+
+static struct tcp_congestion_ops cubictcp __read_mostly = {
+ .init = bictcp_init,
+ .ssthresh = bictcp_recalc_ssthresh,
+ .cong_avoid = bictcp_cong_avoid,
+ .set_state = bictcp_state,
+ .undo_cwnd = bictcp_undo_cwnd,
+ .pkts_acked = bictcp_acked,
+ .owner = THIS_MODULE,
+ .name = "cubic",
+};
+
+static int __init cubictcp_register(void)
+{
+ BUILD_BUG_ON(sizeof(struct bictcp) > ICSK_CA_PRIV_SIZE);
+
+ /* Precompute a bunch of the scaling factors that are used per-packet
+ * based on SRTT of 100ms
+ */
+
+ beta_scale = 8*(BICTCP_BETA_SCALE+beta) / 3
+ / (BICTCP_BETA_SCALE - beta);
+
+ cube_rtt_scale = (bic_scale * 10); /* 1024*c/rtt */
+
+ /* calculate the "K" for (wmax-cwnd) = c/rtt * K^3
+ * so K = cubic_root( (wmax-cwnd)*rtt/c )
+ * the unit of K is bictcp_HZ=2^10, not HZ
+ *
+ * c = bic_scale >> 10
+ * rtt = 100ms
+ *
+ * the following code has been designed and tested for
+ * cwnd < 1 million packets
+ * RTT < 100 seconds
+ * HZ < 1,000,00 (corresponding to 10 nano-second)
+ */
+
+ /* 1/c * 2^2*bictcp_HZ * srtt */
+ cube_factor = 1ull << (10+3*BICTCP_HZ); /* 2^40 */
+
+ /* divide by bic_scale and by constant Srtt (100ms) */
+ do_div(cube_factor, bic_scale * 10);
+
+ return tcp_register_congestion_control(&cubictcp);
+}
+
+static void __exit cubictcp_unregister(void)
+{
+ tcp_unregister_congestion_control(&cubictcp);
+}
+
+module_init(cubictcp_register);
+module_exit(cubictcp_unregister);
+
+MODULE_AUTHOR("Sangtae Ha, Stephen Hemminger");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("CUBIC TCP");
+MODULE_VERSION("2.3");
diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c
new file mode 100644
index 000000000..4c41c1287
--- /dev/null
+++ b/net/ipv4/tcp_dctcp.c
@@ -0,0 +1,345 @@
+/* DataCenter TCP (DCTCP) congestion control.
+ *
+ * http://simula.stanford.edu/~alizade/Site/DCTCP.html
+ *
+ * This is an implementation of DCTCP over Reno, an enhancement to the
+ * TCP congestion control algorithm designed for data centers. DCTCP
+ * leverages Explicit Congestion Notification (ECN) in the network to
+ * provide multi-bit feedback to the end hosts. DCTCP's goal is to meet
+ * the following three data center transport requirements:
+ *
+ * - High burst tolerance (incast due to partition/aggregate)
+ * - Low latency (short flows, queries)
+ * - High throughput (continuous data updates, large file transfers)
+ * with commodity shallow buffered switches
+ *
+ * The algorithm is described in detail in the following two papers:
+ *
+ * 1) Mohammad Alizadeh, Albert Greenberg, David A. Maltz, Jitendra Padhye,
+ * Parveen Patel, Balaji Prabhakar, Sudipta Sengupta, and Murari Sridharan:
+ * "Data Center TCP (DCTCP)", Data Center Networks session
+ * Proc. ACM SIGCOMM, New Delhi, 2010.
+ * http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp-final.pdf
+ *
+ * 2) Mohammad Alizadeh, Adel Javanmard, and Balaji Prabhakar:
+ * "Analysis of DCTCP: Stability, Convergence, and Fairness"
+ * Proc. ACM SIGMETRICS, San Jose, 2011.
+ * http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp_analysis-full.pdf
+ *
+ * Initial prototype from Abdul Kabbani, Masato Yasuda and Mohammad Alizadeh.
+ *
+ * Authors:
+ *
+ * Daniel Borkmann <dborkman@redhat.com>
+ * Florian Westphal <fw@strlen.de>
+ * Glenn Judd <glenn.judd@morganstanley.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or (at
+ * your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <net/tcp.h>
+#include <linux/inet_diag.h>
+
+#define DCTCP_MAX_ALPHA 1024U
+
+struct dctcp {
+ u32 acked_bytes_ecn;
+ u32 acked_bytes_total;
+ u32 prior_snd_una;
+ u32 prior_rcv_nxt;
+ u32 dctcp_alpha;
+ u32 next_seq;
+ u32 ce_state;
+ u32 delayed_ack_reserved;
+};
+
+static unsigned int dctcp_shift_g __read_mostly = 4; /* g = 1/2^4 */
+module_param(dctcp_shift_g, uint, 0644);
+MODULE_PARM_DESC(dctcp_shift_g, "parameter g for updating dctcp_alpha");
+
+static unsigned int dctcp_alpha_on_init __read_mostly = DCTCP_MAX_ALPHA;
+module_param(dctcp_alpha_on_init, uint, 0644);
+MODULE_PARM_DESC(dctcp_alpha_on_init, "parameter for initial alpha value");
+
+static unsigned int dctcp_clamp_alpha_on_loss __read_mostly;
+module_param(dctcp_clamp_alpha_on_loss, uint, 0644);
+MODULE_PARM_DESC(dctcp_clamp_alpha_on_loss,
+ "parameter for clamping alpha on loss");
+
+static struct tcp_congestion_ops dctcp_reno;
+
+static void dctcp_reset(const struct tcp_sock *tp, struct dctcp *ca)
+{
+ ca->next_seq = tp->snd_nxt;
+
+ ca->acked_bytes_ecn = 0;
+ ca->acked_bytes_total = 0;
+}
+
+static void dctcp_init(struct sock *sk)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+
+ if ((tp->ecn_flags & TCP_ECN_OK) ||
+ (sk->sk_state == TCP_LISTEN ||
+ sk->sk_state == TCP_CLOSE)) {
+ struct dctcp *ca = inet_csk_ca(sk);
+
+ ca->prior_snd_una = tp->snd_una;
+ ca->prior_rcv_nxt = tp->rcv_nxt;
+
+ ca->dctcp_alpha = min(dctcp_alpha_on_init, DCTCP_MAX_ALPHA);
+
+ ca->delayed_ack_reserved = 0;
+ ca->ce_state = 0;
+
+ dctcp_reset(tp, ca);
+ return;
+ }
+
+ /* No ECN support? Fall back to Reno. Also need to clear
+ * ECT from sk since it is set during 3WHS for DCTCP.
+ */
+ inet_csk(sk)->icsk_ca_ops = &dctcp_reno;
+ INET_ECN_dontxmit(sk);
+}
+
+static u32 dctcp_ssthresh(struct sock *sk)
+{
+ const struct dctcp *ca = inet_csk_ca(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ return max(tp->snd_cwnd - ((tp->snd_cwnd * ca->dctcp_alpha) >> 11U), 2U);
+}
+
+/* Minimal DCTP CE state machine:
+ *
+ * S: 0 <- last pkt was non-CE
+ * 1 <- last pkt was CE
+ */
+
+static void dctcp_ce_state_0_to_1(struct sock *sk)
+{
+ struct dctcp *ca = inet_csk_ca(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ /* State has changed from CE=0 to CE=1 and delayed
+ * ACK has not sent yet.
+ */
+ if (!ca->ce_state && ca->delayed_ack_reserved) {
+ u32 tmp_rcv_nxt;
+
+ /* Save current rcv_nxt. */
+ tmp_rcv_nxt = tp->rcv_nxt;
+
+ /* Generate previous ack with CE=0. */
+ tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
+ tp->rcv_nxt = ca->prior_rcv_nxt;
+
+ tcp_send_ack(sk);
+
+ /* Recover current rcv_nxt. */
+ tp->rcv_nxt = tmp_rcv_nxt;
+ }
+
+ ca->prior_rcv_nxt = tp->rcv_nxt;
+ ca->ce_state = 1;
+
+ tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
+}
+
+static void dctcp_ce_state_1_to_0(struct sock *sk)
+{
+ struct dctcp *ca = inet_csk_ca(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ /* State has changed from CE=1 to CE=0 and delayed
+ * ACK has not sent yet.
+ */
+ if (ca->ce_state && ca->delayed_ack_reserved) {
+ u32 tmp_rcv_nxt;
+
+ /* Save current rcv_nxt. */
+ tmp_rcv_nxt = tp->rcv_nxt;
+
+ /* Generate previous ack with CE=1. */
+ tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
+ tp->rcv_nxt = ca->prior_rcv_nxt;
+
+ tcp_send_ack(sk);
+
+ /* Recover current rcv_nxt. */
+ tp->rcv_nxt = tmp_rcv_nxt;
+ }
+
+ ca->prior_rcv_nxt = tp->rcv_nxt;
+ ca->ce_state = 0;
+
+ tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
+}
+
+static void dctcp_update_alpha(struct sock *sk, u32 flags)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ struct dctcp *ca = inet_csk_ca(sk);
+ u32 acked_bytes = tp->snd_una - ca->prior_snd_una;
+
+ /* If ack did not advance snd_una, count dupack as MSS size.
+ * If ack did update window, do not count it at all.
+ */
+ if (acked_bytes == 0 && !(flags & CA_ACK_WIN_UPDATE))
+ acked_bytes = inet_csk(sk)->icsk_ack.rcv_mss;
+ if (acked_bytes) {
+ ca->acked_bytes_total += acked_bytes;
+ ca->prior_snd_una = tp->snd_una;
+
+ if (flags & CA_ACK_ECE)
+ ca->acked_bytes_ecn += acked_bytes;
+ }
+
+ /* Expired RTT */
+ if (!before(tp->snd_una, ca->next_seq)) {
+ /* For avoiding denominator == 1. */
+ if (ca->acked_bytes_total == 0)
+ ca->acked_bytes_total = 1;
+
+ /* alpha = (1 - g) * alpha + g * F */
+ ca->dctcp_alpha = ca->dctcp_alpha -
+ (ca->dctcp_alpha >> dctcp_shift_g) +
+ (ca->acked_bytes_ecn << (10U - dctcp_shift_g)) /
+ ca->acked_bytes_total;
+
+ if (ca->dctcp_alpha > DCTCP_MAX_ALPHA)
+ /* Clamp dctcp_alpha to max. */
+ ca->dctcp_alpha = DCTCP_MAX_ALPHA;
+
+ dctcp_reset(tp, ca);
+ }
+}
+
+static void dctcp_state(struct sock *sk, u8 new_state)
+{
+ if (dctcp_clamp_alpha_on_loss && new_state == TCP_CA_Loss) {
+ struct dctcp *ca = inet_csk_ca(sk);
+
+ /* If this extension is enabled, we clamp dctcp_alpha to
+ * max on packet loss; the motivation is that dctcp_alpha
+ * is an indicator to the extend of congestion and packet
+ * loss is an indicator of extreme congestion; setting
+ * this in practice turned out to be beneficial, and
+ * effectively assumes total congestion which reduces the
+ * window by half.
+ */
+ ca->dctcp_alpha = DCTCP_MAX_ALPHA;
+ }
+}
+
+static void dctcp_update_ack_reserved(struct sock *sk, enum tcp_ca_event ev)
+{
+ struct dctcp *ca = inet_csk_ca(sk);
+
+ switch (ev) {
+ case CA_EVENT_DELAYED_ACK:
+ if (!ca->delayed_ack_reserved)
+ ca->delayed_ack_reserved = 1;
+ break;
+ case CA_EVENT_NON_DELAYED_ACK:
+ if (ca->delayed_ack_reserved)
+ ca->delayed_ack_reserved = 0;
+ break;
+ default:
+ /* Don't care for the rest. */
+ break;
+ }
+}
+
+static void dctcp_cwnd_event(struct sock *sk, enum tcp_ca_event ev)
+{
+ switch (ev) {
+ case CA_EVENT_ECN_IS_CE:
+ dctcp_ce_state_0_to_1(sk);
+ break;
+ case CA_EVENT_ECN_NO_CE:
+ dctcp_ce_state_1_to_0(sk);
+ break;
+ case CA_EVENT_DELAYED_ACK:
+ case CA_EVENT_NON_DELAYED_ACK:
+ dctcp_update_ack_reserved(sk, ev);
+ break;
+ default:
+ /* Don't care for the rest. */
+ break;
+ }
+}
+
+static size_t dctcp_get_info(struct sock *sk, u32 ext, int *attr,
+ union tcp_cc_info *info)
+{
+ const struct dctcp *ca = inet_csk_ca(sk);
+
+ /* Fill it also in case of VEGASINFO due to req struct limits.
+ * We can still correctly retrieve it later.
+ */
+ if (ext & (1 << (INET_DIAG_DCTCPINFO - 1)) ||
+ ext & (1 << (INET_DIAG_VEGASINFO - 1))) {
+ memset(info, 0, sizeof(struct tcp_dctcp_info));
+ if (inet_csk(sk)->icsk_ca_ops != &dctcp_reno) {
+ info->dctcp.dctcp_enabled = 1;
+ info->dctcp.dctcp_ce_state = (u16) ca->ce_state;
+ info->dctcp.dctcp_alpha = ca->dctcp_alpha;
+ info->dctcp.dctcp_ab_ecn = ca->acked_bytes_ecn;
+ info->dctcp.dctcp_ab_tot = ca->acked_bytes_total;
+ }
+
+ *attr = INET_DIAG_DCTCPINFO;
+ return sizeof(*info);
+ }
+ return 0;
+}
+
+static struct tcp_congestion_ops dctcp __read_mostly = {
+ .init = dctcp_init,
+ .in_ack_event = dctcp_update_alpha,
+ .cwnd_event = dctcp_cwnd_event,
+ .ssthresh = dctcp_ssthresh,
+ .cong_avoid = tcp_reno_cong_avoid,
+ .set_state = dctcp_state,
+ .get_info = dctcp_get_info,
+ .flags = TCP_CONG_NEEDS_ECN,
+ .owner = THIS_MODULE,
+ .name = "dctcp",
+};
+
+static struct tcp_congestion_ops dctcp_reno __read_mostly = {
+ .ssthresh = tcp_reno_ssthresh,
+ .cong_avoid = tcp_reno_cong_avoid,
+ .get_info = dctcp_get_info,
+ .owner = THIS_MODULE,
+ .name = "dctcp-reno",
+};
+
+static int __init dctcp_register(void)
+{
+ BUILD_BUG_ON(sizeof(struct dctcp) > ICSK_CA_PRIV_SIZE);
+ return tcp_register_congestion_control(&dctcp);
+}
+
+static void __exit dctcp_unregister(void)
+{
+ tcp_unregister_congestion_control(&dctcp);
+}
+
+module_init(dctcp_register);
+module_exit(dctcp_unregister);
+
+MODULE_AUTHOR("Daniel Borkmann <dborkman@redhat.com>");
+MODULE_AUTHOR("Florian Westphal <fw@strlen.de>");
+MODULE_AUTHOR("Glenn Judd <glenn.judd@morganstanley.com>");
+
+MODULE_LICENSE("GPL v2");
+MODULE_DESCRIPTION("DataCenter TCP (DCTCP)");
diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c
new file mode 100644
index 000000000..79b34a0f4
--- /dev/null
+++ b/net/ipv4/tcp_diag.c
@@ -0,0 +1,68 @@
+/*
+ * tcp_diag.c Module for monitoring TCP transport protocols sockets.
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/inet_diag.h>
+
+#include <linux/tcp.h>
+
+#include <net/tcp.h>
+
+static void tcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
+ void *_info)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ struct tcp_info *info = _info;
+
+ if (sk->sk_state == TCP_LISTEN) {
+ r->idiag_rqueue = sk->sk_ack_backlog;
+ r->idiag_wqueue = sk->sk_max_ack_backlog;
+ } else {
+ r->idiag_rqueue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
+ r->idiag_wqueue = tp->write_seq - tp->snd_una;
+ }
+ if (info)
+ tcp_get_info(sk, info);
+}
+
+static void tcp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
+ const struct inet_diag_req_v2 *r, struct nlattr *bc)
+{
+ inet_diag_dump_icsk(&tcp_hashinfo, skb, cb, r, bc);
+}
+
+static int tcp_diag_dump_one(struct sk_buff *in_skb, const struct nlmsghdr *nlh,
+ const struct inet_diag_req_v2 *req)
+{
+ return inet_diag_dump_one_icsk(&tcp_hashinfo, in_skb, nlh, req);
+}
+
+static const struct inet_diag_handler tcp_diag_handler = {
+ .dump = tcp_diag_dump,
+ .dump_one = tcp_diag_dump_one,
+ .idiag_get_info = tcp_diag_get_info,
+ .idiag_type = IPPROTO_TCP,
+};
+
+static int __init tcp_diag_init(void)
+{
+ return inet_diag_register(&tcp_diag_handler);
+}
+
+static void __exit tcp_diag_exit(void)
+{
+ inet_diag_unregister(&tcp_diag_handler);
+}
+
+module_init(tcp_diag_init);
+module_exit(tcp_diag_exit);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-6 /* AF_INET - IPPROTO_TCP */);
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
new file mode 100644
index 000000000..f9c0fb84e
--- /dev/null
+++ b/net/ipv4/tcp_fastopen.c
@@ -0,0 +1,313 @@
+#include <linux/err.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/tcp.h>
+#include <linux/rcupdate.h>
+#include <linux/rculist.h>
+#include <net/inetpeer.h>
+#include <net/tcp.h>
+
+int sysctl_tcp_fastopen __read_mostly = TFO_CLIENT_ENABLE;
+
+struct tcp_fastopen_context __rcu *tcp_fastopen_ctx;
+
+static DEFINE_SPINLOCK(tcp_fastopen_ctx_lock);
+
+void tcp_fastopen_init_key_once(bool publish)
+{
+ static u8 key[TCP_FASTOPEN_KEY_LENGTH];
+
+ /* tcp_fastopen_reset_cipher publishes the new context
+ * atomically, so we allow this race happening here.
+ *
+ * All call sites of tcp_fastopen_cookie_gen also check
+ * for a valid cookie, so this is an acceptable risk.
+ */
+ if (net_get_random_once(key, sizeof(key)) && publish)
+ tcp_fastopen_reset_cipher(key, sizeof(key));
+}
+
+static void tcp_fastopen_ctx_free(struct rcu_head *head)
+{
+ struct tcp_fastopen_context *ctx =
+ container_of(head, struct tcp_fastopen_context, rcu);
+ crypto_free_cipher(ctx->tfm);
+ kfree(ctx);
+}
+
+int tcp_fastopen_reset_cipher(void *key, unsigned int len)
+{
+ int err;
+ struct tcp_fastopen_context *ctx, *octx;
+
+ ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+ ctx->tfm = crypto_alloc_cipher("aes", 0, 0);
+
+ if (IS_ERR(ctx->tfm)) {
+ err = PTR_ERR(ctx->tfm);
+error: kfree(ctx);
+ pr_err("TCP: TFO aes cipher alloc error: %d\n", err);
+ return err;
+ }
+ err = crypto_cipher_setkey(ctx->tfm, key, len);
+ if (err) {
+ pr_err("TCP: TFO cipher key error: %d\n", err);
+ crypto_free_cipher(ctx->tfm);
+ goto error;
+ }
+ memcpy(ctx->key, key, len);
+
+ spin_lock(&tcp_fastopen_ctx_lock);
+
+ octx = rcu_dereference_protected(tcp_fastopen_ctx,
+ lockdep_is_held(&tcp_fastopen_ctx_lock));
+ rcu_assign_pointer(tcp_fastopen_ctx, ctx);
+ spin_unlock(&tcp_fastopen_ctx_lock);
+
+ if (octx)
+ call_rcu(&octx->rcu, tcp_fastopen_ctx_free);
+ return err;
+}
+
+static bool __tcp_fastopen_cookie_gen(const void *path,
+ struct tcp_fastopen_cookie *foc)
+{
+ struct tcp_fastopen_context *ctx;
+ bool ok = false;
+
+ rcu_read_lock();
+ ctx = rcu_dereference(tcp_fastopen_ctx);
+ if (ctx) {
+ crypto_cipher_encrypt_one(ctx->tfm, foc->val, path);
+ foc->len = TCP_FASTOPEN_COOKIE_SIZE;
+ ok = true;
+ }
+ rcu_read_unlock();
+ return ok;
+}
+
+/* Generate the fastopen cookie by doing aes128 encryption on both
+ * the source and destination addresses. Pad 0s for IPv4 or IPv4-mapped-IPv6
+ * addresses. For the longer IPv6 addresses use CBC-MAC.
+ *
+ * XXX (TFO) - refactor when TCP_FASTOPEN_COOKIE_SIZE != AES_BLOCK_SIZE.
+ */
+static bool tcp_fastopen_cookie_gen(struct request_sock *req,
+ struct sk_buff *syn,
+ struct tcp_fastopen_cookie *foc)
+{
+ if (req->rsk_ops->family == AF_INET) {
+ const struct iphdr *iph = ip_hdr(syn);
+
+ __be32 path[4] = { iph->saddr, iph->daddr, 0, 0 };
+ return __tcp_fastopen_cookie_gen(path, foc);
+ }
+
+#if IS_ENABLED(CONFIG_IPV6)
+ if (req->rsk_ops->family == AF_INET6) {
+ const struct ipv6hdr *ip6h = ipv6_hdr(syn);
+ struct tcp_fastopen_cookie tmp;
+
+ if (__tcp_fastopen_cookie_gen(&ip6h->saddr, &tmp)) {
+ struct in6_addr *buf = (struct in6_addr *) tmp.val;
+ int i;
+
+ for (i = 0; i < 4; i++)
+ buf->s6_addr32[i] ^= ip6h->daddr.s6_addr32[i];
+ return __tcp_fastopen_cookie_gen(buf, foc);
+ }
+ }
+#endif
+ return false;
+}
+
+static bool tcp_fastopen_create_child(struct sock *sk,
+ struct sk_buff *skb,
+ struct dst_entry *dst,
+ struct request_sock *req)
+{
+ struct tcp_sock *tp;
+ struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
+ struct sock *child;
+ u32 end_seq;
+
+ req->num_retrans = 0;
+ req->num_timeout = 0;
+ req->sk = NULL;
+
+ child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL);
+ if (!child)
+ return false;
+
+ spin_lock(&queue->fastopenq->lock);
+ queue->fastopenq->qlen++;
+ spin_unlock(&queue->fastopenq->lock);
+
+ /* Initialize the child socket. Have to fix some values to take
+ * into account the child is a Fast Open socket and is created
+ * only out of the bits carried in the SYN packet.
+ */
+ tp = tcp_sk(child);
+
+ tp->fastopen_rsk = req;
+ tcp_rsk(req)->tfo_listener = true;
+
+ /* RFC1323: The window in SYN & SYN/ACK segments is never
+ * scaled. So correct it appropriately.
+ */
+ tp->snd_wnd = ntohs(tcp_hdr(skb)->window);
+
+ /* Activate the retrans timer so that SYNACK can be retransmitted.
+ * The request socket is not added to the SYN table of the parent
+ * because it's been added to the accept queue directly.
+ */
+ inet_csk_reset_xmit_timer(child, ICSK_TIME_RETRANS,
+ TCP_TIMEOUT_INIT, TCP_RTO_MAX);
+
+ atomic_set(&req->rsk_refcnt, 1);
+ /* Add the child socket directly into the accept queue */
+ inet_csk_reqsk_queue_add(sk, req, child);
+
+ /* Now finish processing the fastopen child socket. */
+ inet_csk(child)->icsk_af_ops->rebuild_header(child);
+ tcp_init_congestion_control(child);
+ tcp_mtup_init(child);
+ tcp_init_metrics(child);
+ tcp_init_buffer_space(child);
+
+ /* Queue the data carried in the SYN packet. We need to first
+ * bump skb's refcnt because the caller will attempt to free it.
+ * Note that IPv6 might also have used skb_get() trick
+ * in tcp_v6_conn_request() to keep this SYN around (treq->pktopts)
+ * So we need to eventually get a clone of the packet,
+ * before inserting it in sk_receive_queue.
+ *
+ * XXX (TFO) - we honor a zero-payload TFO request for now,
+ * (any reason not to?) but no need to queue the skb since
+ * there is no data. How about SYN+FIN?
+ */
+ end_seq = TCP_SKB_CB(skb)->end_seq;
+ if (end_seq != TCP_SKB_CB(skb)->seq + 1) {
+ struct sk_buff *skb2;
+
+ if (unlikely(skb_shared(skb)))
+ skb2 = skb_clone(skb, GFP_ATOMIC);
+ else
+ skb2 = skb_get(skb);
+
+ if (likely(skb2)) {
+ skb_dst_drop(skb2);
+ __skb_pull(skb2, tcp_hdrlen(skb));
+ skb_set_owner_r(skb2, child);
+ __skb_queue_tail(&child->sk_receive_queue, skb2);
+ tp->syn_data_acked = 1;
+
+ /* u64_stats_update_begin(&tp->syncp) not needed here,
+ * as we certainly are not changing upper 32bit value (0)
+ */
+ tp->bytes_received = end_seq - TCP_SKB_CB(skb)->seq - 1;
+ } else {
+ end_seq = TCP_SKB_CB(skb)->seq + 1;
+ }
+ }
+ tcp_rsk(req)->rcv_nxt = tp->rcv_nxt = end_seq;
+ sk->sk_data_ready(sk);
+ bh_unlock_sock(child);
+ sock_put(child);
+ WARN_ON(!req->sk);
+ return true;
+}
+
+static bool tcp_fastopen_queue_check(struct sock *sk)
+{
+ struct fastopen_queue *fastopenq;
+
+ /* Make sure the listener has enabled fastopen, and we don't
+ * exceed the max # of pending TFO requests allowed before trying
+ * to validating the cookie in order to avoid burning CPU cycles
+ * unnecessarily.
+ *
+ * XXX (TFO) - The implication of checking the max_qlen before
+ * processing a cookie request is that clients can't differentiate
+ * between qlen overflow causing Fast Open to be disabled
+ * temporarily vs a server not supporting Fast Open at all.
+ */
+ fastopenq = inet_csk(sk)->icsk_accept_queue.fastopenq;
+ if (!fastopenq || fastopenq->max_qlen == 0)
+ return false;
+
+ if (fastopenq->qlen >= fastopenq->max_qlen) {
+ struct request_sock *req1;
+ spin_lock(&fastopenq->lock);
+ req1 = fastopenq->rskq_rst_head;
+ if (!req1 || time_after(req1->rsk_timer.expires, jiffies)) {
+ spin_unlock(&fastopenq->lock);
+ NET_INC_STATS_BH(sock_net(sk),
+ LINUX_MIB_TCPFASTOPENLISTENOVERFLOW);
+ return false;
+ }
+ fastopenq->rskq_rst_head = req1->dl_next;
+ fastopenq->qlen--;
+ spin_unlock(&fastopenq->lock);
+ reqsk_put(req1);
+ }
+ return true;
+}
+
+/* Returns true if we should perform Fast Open on the SYN. The cookie (foc)
+ * may be updated and return the client in the SYN-ACK later. E.g., Fast Open
+ * cookie request (foc->len == 0).
+ */
+bool tcp_try_fastopen(struct sock *sk, struct sk_buff *skb,
+ struct request_sock *req,
+ struct tcp_fastopen_cookie *foc,
+ struct dst_entry *dst)
+{
+ struct tcp_fastopen_cookie valid_foc = { .len = -1 };
+ bool syn_data = TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1;
+
+ if (foc->len == 0) /* Client requests a cookie */
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENCOOKIEREQD);
+
+ if (!((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) &&
+ (syn_data || foc->len >= 0) &&
+ tcp_fastopen_queue_check(sk))) {
+ foc->len = -1;
+ return false;
+ }
+
+ if (syn_data && (sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_REQD))
+ goto fastopen;
+
+ if (foc->len >= 0 && /* Client presents or requests a cookie */
+ tcp_fastopen_cookie_gen(req, skb, &valid_foc) &&
+ foc->len == TCP_FASTOPEN_COOKIE_SIZE &&
+ foc->len == valid_foc.len &&
+ !memcmp(foc->val, valid_foc.val, foc->len)) {
+ /* Cookie is valid. Create a (full) child socket to accept
+ * the data in SYN before returning a SYN-ACK to ack the
+ * data. If we fail to create the socket, fall back and
+ * ack the ISN only but includes the same cookie.
+ *
+ * Note: Data-less SYN with valid cookie is allowed to send
+ * data in SYN_RECV state.
+ */
+fastopen:
+ if (tcp_fastopen_create_child(sk, skb, dst, req)) {
+ foc->len = -1;
+ NET_INC_STATS_BH(sock_net(sk),
+ LINUX_MIB_TCPFASTOPENPASSIVE);
+ return true;
+ }
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
+ } else if (foc->len > 0) /* Client presents an invalid cookie */
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
+
+ valid_foc.exp = foc->exp;
+ *foc = valid_foc;
+ return false;
+}
+EXPORT_SYMBOL(tcp_try_fastopen);
diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c
new file mode 100644
index 000000000..882c08aae
--- /dev/null
+++ b/net/ipv4/tcp_highspeed.c
@@ -0,0 +1,185 @@
+/*
+ * Sally Floyd's High Speed TCP (RFC 3649) congestion control
+ *
+ * See http://www.icir.org/floyd/hstcp.html
+ *
+ * John Heffner <jheffner@psc.edu>
+ */
+
+#include <linux/module.h>
+#include <net/tcp.h>
+
+/* From AIMD tables from RFC 3649 appendix B,
+ * with fixed-point MD scaled <<8.
+ */
+static const struct hstcp_aimd_val {
+ unsigned int cwnd;
+ unsigned int md;
+} hstcp_aimd_vals[] = {
+ { 38, 128, /* 0.50 */ },
+ { 118, 112, /* 0.44 */ },
+ { 221, 104, /* 0.41 */ },
+ { 347, 98, /* 0.38 */ },
+ { 495, 93, /* 0.37 */ },
+ { 663, 89, /* 0.35 */ },
+ { 851, 86, /* 0.34 */ },
+ { 1058, 83, /* 0.33 */ },
+ { 1284, 81, /* 0.32 */ },
+ { 1529, 78, /* 0.31 */ },
+ { 1793, 76, /* 0.30 */ },
+ { 2076, 74, /* 0.29 */ },
+ { 2378, 72, /* 0.28 */ },
+ { 2699, 71, /* 0.28 */ },
+ { 3039, 69, /* 0.27 */ },
+ { 3399, 68, /* 0.27 */ },
+ { 3778, 66, /* 0.26 */ },
+ { 4177, 65, /* 0.26 */ },
+ { 4596, 64, /* 0.25 */ },
+ { 5036, 62, /* 0.25 */ },
+ { 5497, 61, /* 0.24 */ },
+ { 5979, 60, /* 0.24 */ },
+ { 6483, 59, /* 0.23 */ },
+ { 7009, 58, /* 0.23 */ },
+ { 7558, 57, /* 0.22 */ },
+ { 8130, 56, /* 0.22 */ },
+ { 8726, 55, /* 0.22 */ },
+ { 9346, 54, /* 0.21 */ },
+ { 9991, 53, /* 0.21 */ },
+ { 10661, 52, /* 0.21 */ },
+ { 11358, 52, /* 0.20 */ },
+ { 12082, 51, /* 0.20 */ },
+ { 12834, 50, /* 0.20 */ },
+ { 13614, 49, /* 0.19 */ },
+ { 14424, 48, /* 0.19 */ },
+ { 15265, 48, /* 0.19 */ },
+ { 16137, 47, /* 0.19 */ },
+ { 17042, 46, /* 0.18 */ },
+ { 17981, 45, /* 0.18 */ },
+ { 18955, 45, /* 0.18 */ },
+ { 19965, 44, /* 0.17 */ },
+ { 21013, 43, /* 0.17 */ },
+ { 22101, 43, /* 0.17 */ },
+ { 23230, 42, /* 0.17 */ },
+ { 24402, 41, /* 0.16 */ },
+ { 25618, 41, /* 0.16 */ },
+ { 26881, 40, /* 0.16 */ },
+ { 28193, 39, /* 0.16 */ },
+ { 29557, 39, /* 0.15 */ },
+ { 30975, 38, /* 0.15 */ },
+ { 32450, 38, /* 0.15 */ },
+ { 33986, 37, /* 0.15 */ },
+ { 35586, 36, /* 0.14 */ },
+ { 37253, 36, /* 0.14 */ },
+ { 38992, 35, /* 0.14 */ },
+ { 40808, 35, /* 0.14 */ },
+ { 42707, 34, /* 0.13 */ },
+ { 44694, 33, /* 0.13 */ },
+ { 46776, 33, /* 0.13 */ },
+ { 48961, 32, /* 0.13 */ },
+ { 51258, 32, /* 0.13 */ },
+ { 53677, 31, /* 0.12 */ },
+ { 56230, 30, /* 0.12 */ },
+ { 58932, 30, /* 0.12 */ },
+ { 61799, 29, /* 0.12 */ },
+ { 64851, 28, /* 0.11 */ },
+ { 68113, 28, /* 0.11 */ },
+ { 71617, 27, /* 0.11 */ },
+ { 75401, 26, /* 0.10 */ },
+ { 79517, 26, /* 0.10 */ },
+ { 84035, 25, /* 0.10 */ },
+ { 89053, 24, /* 0.10 */ },
+};
+
+#define HSTCP_AIMD_MAX ARRAY_SIZE(hstcp_aimd_vals)
+
+struct hstcp {
+ u32 ai;
+};
+
+static void hstcp_init(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct hstcp *ca = inet_csk_ca(sk);
+
+ ca->ai = 0;
+
+ /* Ensure the MD arithmetic works. This is somewhat pedantic,
+ * since I don't think we will see a cwnd this large. :) */
+ tp->snd_cwnd_clamp = min_t(u32, tp->snd_cwnd_clamp, 0xffffffff/128);
+}
+
+static void hstcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct hstcp *ca = inet_csk_ca(sk);
+
+ if (!tcp_is_cwnd_limited(sk))
+ return;
+
+ if (tp->snd_cwnd <= tp->snd_ssthresh)
+ tcp_slow_start(tp, acked);
+ else {
+ /* Update AIMD parameters.
+ *
+ * We want to guarantee that:
+ * hstcp_aimd_vals[ca->ai-1].cwnd <
+ * snd_cwnd <=
+ * hstcp_aimd_vals[ca->ai].cwnd
+ */
+ if (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd) {
+ while (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd &&
+ ca->ai < HSTCP_AIMD_MAX - 1)
+ ca->ai++;
+ } else if (ca->ai && tp->snd_cwnd <= hstcp_aimd_vals[ca->ai-1].cwnd) {
+ while (ca->ai && tp->snd_cwnd <= hstcp_aimd_vals[ca->ai-1].cwnd)
+ ca->ai--;
+ }
+
+ /* Do additive increase */
+ if (tp->snd_cwnd < tp->snd_cwnd_clamp) {
+ /* cwnd = cwnd + a(w) / cwnd */
+ tp->snd_cwnd_cnt += ca->ai + 1;
+ if (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
+ tp->snd_cwnd_cnt -= tp->snd_cwnd;
+ tp->snd_cwnd++;
+ }
+ }
+ }
+}
+
+static u32 hstcp_ssthresh(struct sock *sk)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ const struct hstcp *ca = inet_csk_ca(sk);
+
+ /* Do multiplicative decrease */
+ return max(tp->snd_cwnd - ((tp->snd_cwnd * hstcp_aimd_vals[ca->ai].md) >> 8), 2U);
+}
+
+
+static struct tcp_congestion_ops tcp_highspeed __read_mostly = {
+ .init = hstcp_init,
+ .ssthresh = hstcp_ssthresh,
+ .cong_avoid = hstcp_cong_avoid,
+
+ .owner = THIS_MODULE,
+ .name = "highspeed"
+};
+
+static int __init hstcp_register(void)
+{
+ BUILD_BUG_ON(sizeof(struct hstcp) > ICSK_CA_PRIV_SIZE);
+ return tcp_register_congestion_control(&tcp_highspeed);
+}
+
+static void __exit hstcp_unregister(void)
+{
+ tcp_unregister_congestion_control(&tcp_highspeed);
+}
+
+module_init(hstcp_register);
+module_exit(hstcp_unregister);
+
+MODULE_AUTHOR("John Heffner");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("High Speed TCP");
diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c
new file mode 100644
index 000000000..58469fff6
--- /dev/null
+++ b/net/ipv4/tcp_htcp.c
@@ -0,0 +1,317 @@
+/*
+ * H-TCP congestion control. The algorithm is detailed in:
+ * R.N.Shorten, D.J.Leith:
+ * "H-TCP: TCP for high-speed and long-distance networks"
+ * Proc. PFLDnet, Argonne, 2004.
+ * http://www.hamilton.ie/net/htcp3.pdf
+ */
+
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <net/tcp.h>
+
+#define ALPHA_BASE (1<<7) /* 1.0 with shift << 7 */
+#define BETA_MIN (1<<6) /* 0.5 with shift << 7 */
+#define BETA_MAX 102 /* 0.8 with shift << 7 */
+
+static int use_rtt_scaling __read_mostly = 1;
+module_param(use_rtt_scaling, int, 0644);
+MODULE_PARM_DESC(use_rtt_scaling, "turn on/off RTT scaling");
+
+static int use_bandwidth_switch __read_mostly = 1;
+module_param(use_bandwidth_switch, int, 0644);
+MODULE_PARM_DESC(use_bandwidth_switch, "turn on/off bandwidth switcher");
+
+struct htcp {
+ u32 alpha; /* Fixed point arith, << 7 */
+ u8 beta; /* Fixed point arith, << 7 */
+ u8 modeswitch; /* Delay modeswitch
+ until we had at least one congestion event */
+ u16 pkts_acked;
+ u32 packetcount;
+ u32 minRTT;
+ u32 maxRTT;
+ u32 last_cong; /* Time since last congestion event end */
+ u32 undo_last_cong;
+
+ u32 undo_maxRTT;
+ u32 undo_old_maxB;
+
+ /* Bandwidth estimation */
+ u32 minB;
+ u32 maxB;
+ u32 old_maxB;
+ u32 Bi;
+ u32 lasttime;
+};
+
+static inline u32 htcp_cong_time(const struct htcp *ca)
+{
+ return jiffies - ca->last_cong;
+}
+
+static inline u32 htcp_ccount(const struct htcp *ca)
+{
+ return htcp_cong_time(ca) / ca->minRTT;
+}
+
+static inline void htcp_reset(struct htcp *ca)
+{
+ ca->undo_last_cong = ca->last_cong;
+ ca->undo_maxRTT = ca->maxRTT;
+ ca->undo_old_maxB = ca->old_maxB;
+
+ ca->last_cong = jiffies;
+}
+
+static u32 htcp_cwnd_undo(struct sock *sk)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ struct htcp *ca = inet_csk_ca(sk);
+
+ if (ca->undo_last_cong) {
+ ca->last_cong = ca->undo_last_cong;
+ ca->maxRTT = ca->undo_maxRTT;
+ ca->old_maxB = ca->undo_old_maxB;
+ ca->undo_last_cong = 0;
+ }
+
+ return max(tp->snd_cwnd, (tp->snd_ssthresh << 7) / ca->beta);
+}
+
+static inline void measure_rtt(struct sock *sk, u32 srtt)
+{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+ struct htcp *ca = inet_csk_ca(sk);
+
+ /* keep track of minimum RTT seen so far, minRTT is zero at first */
+ if (ca->minRTT > srtt || !ca->minRTT)
+ ca->minRTT = srtt;
+
+ /* max RTT */
+ if (icsk->icsk_ca_state == TCP_CA_Open) {
+ if (ca->maxRTT < ca->minRTT)
+ ca->maxRTT = ca->minRTT;
+ if (ca->maxRTT < srtt &&
+ srtt <= ca->maxRTT + msecs_to_jiffies(20))
+ ca->maxRTT = srtt;
+ }
+}
+
+static void measure_achieved_throughput(struct sock *sk,
+ u32 pkts_acked, s32 rtt)
+{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+ const struct tcp_sock *tp = tcp_sk(sk);
+ struct htcp *ca = inet_csk_ca(sk);
+ u32 now = tcp_time_stamp;
+
+ if (icsk->icsk_ca_state == TCP_CA_Open)
+ ca->pkts_acked = pkts_acked;
+
+ if (rtt > 0)
+ measure_rtt(sk, usecs_to_jiffies(rtt));
+
+ if (!use_bandwidth_switch)
+ return;
+
+ /* achieved throughput calculations */
+ if (!((1 << icsk->icsk_ca_state) & (TCPF_CA_Open | TCPF_CA_Disorder))) {
+ ca->packetcount = 0;
+ ca->lasttime = now;
+ return;
+ }
+
+ ca->packetcount += pkts_acked;
+
+ if (ca->packetcount >= tp->snd_cwnd - (ca->alpha >> 7 ? : 1) &&
+ now - ca->lasttime >= ca->minRTT &&
+ ca->minRTT > 0) {
+ __u32 cur_Bi = ca->packetcount * HZ / (now - ca->lasttime);
+
+ if (htcp_ccount(ca) <= 3) {
+ /* just after backoff */
+ ca->minB = ca->maxB = ca->Bi = cur_Bi;
+ } else {
+ ca->Bi = (3 * ca->Bi + cur_Bi) / 4;
+ if (ca->Bi > ca->maxB)
+ ca->maxB = ca->Bi;
+ if (ca->minB > ca->maxB)
+ ca->minB = ca->maxB;
+ }
+ ca->packetcount = 0;
+ ca->lasttime = now;
+ }
+}
+
+static inline void htcp_beta_update(struct htcp *ca, u32 minRTT, u32 maxRTT)
+{
+ if (use_bandwidth_switch) {
+ u32 maxB = ca->maxB;
+ u32 old_maxB = ca->old_maxB;
+
+ ca->old_maxB = ca->maxB;
+ if (!between(5 * maxB, 4 * old_maxB, 6 * old_maxB)) {
+ ca->beta = BETA_MIN;
+ ca->modeswitch = 0;
+ return;
+ }
+ }
+
+ if (ca->modeswitch && minRTT > msecs_to_jiffies(10) && maxRTT) {
+ ca->beta = (minRTT << 7) / maxRTT;
+ if (ca->beta < BETA_MIN)
+ ca->beta = BETA_MIN;
+ else if (ca->beta > BETA_MAX)
+ ca->beta = BETA_MAX;
+ } else {
+ ca->beta = BETA_MIN;
+ ca->modeswitch = 1;
+ }
+}
+
+static inline void htcp_alpha_update(struct htcp *ca)
+{
+ u32 minRTT = ca->minRTT;
+ u32 factor = 1;
+ u32 diff = htcp_cong_time(ca);
+
+ if (diff > HZ) {
+ diff -= HZ;
+ factor = 1 + (10 * diff + ((diff / 2) * (diff / 2) / HZ)) / HZ;
+ }
+
+ if (use_rtt_scaling && minRTT) {
+ u32 scale = (HZ << 3) / (10 * minRTT);
+
+ /* clamping ratio to interval [0.5,10]<<3 */
+ scale = min(max(scale, 1U << 2), 10U << 3);
+ factor = (factor << 3) / scale;
+ if (!factor)
+ factor = 1;
+ }
+
+ ca->alpha = 2 * factor * ((1 << 7) - ca->beta);
+ if (!ca->alpha)
+ ca->alpha = ALPHA_BASE;
+}
+
+/*
+ * After we have the rtt data to calculate beta, we'd still prefer to wait one
+ * rtt before we adjust our beta to ensure we are working from a consistent
+ * data.
+ *
+ * This function should be called when we hit a congestion event since only at
+ * that point do we really have a real sense of maxRTT (the queues en route
+ * were getting just too full now).
+ */
+static void htcp_param_update(struct sock *sk)
+{
+ struct htcp *ca = inet_csk_ca(sk);
+ u32 minRTT = ca->minRTT;
+ u32 maxRTT = ca->maxRTT;
+
+ htcp_beta_update(ca, minRTT, maxRTT);
+ htcp_alpha_update(ca);
+
+ /* add slowly fading memory for maxRTT to accommodate routing changes */
+ if (minRTT > 0 && maxRTT > minRTT)
+ ca->maxRTT = minRTT + ((maxRTT - minRTT) * 95) / 100;
+}
+
+static u32 htcp_recalc_ssthresh(struct sock *sk)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ const struct htcp *ca = inet_csk_ca(sk);
+
+ htcp_param_update(sk);
+ return max((tp->snd_cwnd * ca->beta) >> 7, 2U);
+}
+
+static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct htcp *ca = inet_csk_ca(sk);
+
+ if (!tcp_is_cwnd_limited(sk))
+ return;
+
+ if (tp->snd_cwnd <= tp->snd_ssthresh)
+ tcp_slow_start(tp, acked);
+ else {
+ /* In dangerous area, increase slowly.
+ * In theory this is tp->snd_cwnd += alpha / tp->snd_cwnd
+ */
+ if ((tp->snd_cwnd_cnt * ca->alpha)>>7 >= tp->snd_cwnd) {
+ if (tp->snd_cwnd < tp->snd_cwnd_clamp)
+ tp->snd_cwnd++;
+ tp->snd_cwnd_cnt = 0;
+ htcp_alpha_update(ca);
+ } else
+ tp->snd_cwnd_cnt += ca->pkts_acked;
+
+ ca->pkts_acked = 1;
+ }
+}
+
+static void htcp_init(struct sock *sk)
+{
+ struct htcp *ca = inet_csk_ca(sk);
+
+ memset(ca, 0, sizeof(struct htcp));
+ ca->alpha = ALPHA_BASE;
+ ca->beta = BETA_MIN;
+ ca->pkts_acked = 1;
+ ca->last_cong = jiffies;
+}
+
+static void htcp_state(struct sock *sk, u8 new_state)
+{
+ switch (new_state) {
+ case TCP_CA_Open:
+ {
+ struct htcp *ca = inet_csk_ca(sk);
+
+ if (ca->undo_last_cong) {
+ ca->last_cong = jiffies;
+ ca->undo_last_cong = 0;
+ }
+ }
+ break;
+ case TCP_CA_CWR:
+ case TCP_CA_Recovery:
+ case TCP_CA_Loss:
+ htcp_reset(inet_csk_ca(sk));
+ break;
+ }
+}
+
+static struct tcp_congestion_ops htcp __read_mostly = {
+ .init = htcp_init,
+ .ssthresh = htcp_recalc_ssthresh,
+ .cong_avoid = htcp_cong_avoid,
+ .set_state = htcp_state,
+ .undo_cwnd = htcp_cwnd_undo,
+ .pkts_acked = measure_achieved_throughput,
+ .owner = THIS_MODULE,
+ .name = "htcp",
+};
+
+static int __init htcp_register(void)
+{
+ BUILD_BUG_ON(sizeof(struct htcp) > ICSK_CA_PRIV_SIZE);
+ BUILD_BUG_ON(BETA_MIN >= BETA_MAX);
+ return tcp_register_congestion_control(&htcp);
+}
+
+static void __exit htcp_unregister(void)
+{
+ tcp_unregister_congestion_control(&htcp);
+}
+
+module_init(htcp_register);
+module_exit(htcp_unregister);
+
+MODULE_AUTHOR("Baruch Even");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("H-TCP");
diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c
new file mode 100644
index 000000000..f963b274f
--- /dev/null
+++ b/net/ipv4/tcp_hybla.c
@@ -0,0 +1,192 @@
+/*
+ * TCP HYBLA
+ *
+ * TCP-HYBLA Congestion control algorithm, based on:
+ * C.Caini, R.Firrincieli, "TCP-Hybla: A TCP Enhancement
+ * for Heterogeneous Networks",
+ * International Journal on satellite Communications,
+ * September 2004
+ * Daniele Lacamera
+ * root at danielinux.net
+ */
+
+#include <linux/module.h>
+#include <net/tcp.h>
+
+/* Tcp Hybla structure. */
+struct hybla {
+ bool hybla_en;
+ u32 snd_cwnd_cents; /* Keeps increment values when it is <1, <<7 */
+ u32 rho; /* Rho parameter, integer part */
+ u32 rho2; /* Rho * Rho, integer part */
+ u32 rho_3ls; /* Rho parameter, <<3 */
+ u32 rho2_7ls; /* Rho^2, <<7 */
+ u32 minrtt_us; /* Minimum smoothed round trip time value seen */
+};
+
+/* Hybla reference round trip time (default= 1/40 sec = 25 ms), in ms */
+static int rtt0 = 25;
+module_param(rtt0, int, 0644);
+MODULE_PARM_DESC(rtt0, "reference rout trip time (ms)");
+
+/* This is called to refresh values for hybla parameters */
+static inline void hybla_recalc_param (struct sock *sk)
+{
+ struct hybla *ca = inet_csk_ca(sk);
+
+ ca->rho_3ls = max_t(u32,
+ tcp_sk(sk)->srtt_us / (rtt0 * USEC_PER_MSEC),
+ 8U);
+ ca->rho = ca->rho_3ls >> 3;
+ ca->rho2_7ls = (ca->rho_3ls * ca->rho_3ls) << 1;
+ ca->rho2 = ca->rho2_7ls >> 7;
+}
+
+static void hybla_init(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct hybla *ca = inet_csk_ca(sk);
+
+ ca->rho = 0;
+ ca->rho2 = 0;
+ ca->rho_3ls = 0;
+ ca->rho2_7ls = 0;
+ ca->snd_cwnd_cents = 0;
+ ca->hybla_en = true;
+ tp->snd_cwnd = 2;
+ tp->snd_cwnd_clamp = 65535;
+
+ /* 1st Rho measurement based on initial srtt */
+ hybla_recalc_param(sk);
+
+ /* set minimum rtt as this is the 1st ever seen */
+ ca->minrtt_us = tp->srtt_us;
+ tp->snd_cwnd = ca->rho;
+}
+
+static void hybla_state(struct sock *sk, u8 ca_state)
+{
+ struct hybla *ca = inet_csk_ca(sk);
+
+ ca->hybla_en = (ca_state == TCP_CA_Open);
+}
+
+static inline u32 hybla_fraction(u32 odds)
+{
+ static const u32 fractions[] = {
+ 128, 139, 152, 165, 181, 197, 215, 234,
+ };
+
+ return (odds < ARRAY_SIZE(fractions)) ? fractions[odds] : 128;
+}
+
+/* TCP Hybla main routine.
+ * This is the algorithm behavior:
+ * o Recalc Hybla parameters if min_rtt has changed
+ * o Give cwnd a new value based on the model proposed
+ * o remember increments <1
+ */
+static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 acked)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct hybla *ca = inet_csk_ca(sk);
+ u32 increment, odd, rho_fractions;
+ int is_slowstart = 0;
+
+ /* Recalculate rho only if this srtt is the lowest */
+ if (tp->srtt_us < ca->minrtt_us) {
+ hybla_recalc_param(sk);
+ ca->minrtt_us = tp->srtt_us;
+ }
+
+ if (!tcp_is_cwnd_limited(sk))
+ return;
+
+ if (!ca->hybla_en) {
+ tcp_reno_cong_avoid(sk, ack, acked);
+ return;
+ }
+
+ if (ca->rho == 0)
+ hybla_recalc_param(sk);
+
+ rho_fractions = ca->rho_3ls - (ca->rho << 3);
+
+ if (tp->snd_cwnd < tp->snd_ssthresh) {
+ /*
+ * slow start
+ * INC = 2^RHO - 1
+ * This is done by splitting the rho parameter
+ * into 2 parts: an integer part and a fraction part.
+ * Inrement<<7 is estimated by doing:
+ * [2^(int+fract)]<<7
+ * that is equal to:
+ * (2^int) * [(2^fract) <<7]
+ * 2^int is straightly computed as 1<<int,
+ * while we will use hybla_slowstart_fraction_increment() to
+ * calculate 2^fract in a <<7 value.
+ */
+ is_slowstart = 1;
+ increment = ((1 << min(ca->rho, 16U)) *
+ hybla_fraction(rho_fractions)) - 128;
+ } else {
+ /*
+ * congestion avoidance
+ * INC = RHO^2 / W
+ * as long as increment is estimated as (rho<<7)/window
+ * it already is <<7 and we can easily count its fractions.
+ */
+ increment = ca->rho2_7ls / tp->snd_cwnd;
+ if (increment < 128)
+ tp->snd_cwnd_cnt++;
+ }
+
+ odd = increment % 128;
+ tp->snd_cwnd += increment >> 7;
+ ca->snd_cwnd_cents += odd;
+
+ /* check when fractions goes >=128 and increase cwnd by 1. */
+ while (ca->snd_cwnd_cents >= 128) {
+ tp->snd_cwnd++;
+ ca->snd_cwnd_cents -= 128;
+ tp->snd_cwnd_cnt = 0;
+ }
+ /* check when cwnd has not been incremented for a while */
+ if (increment == 0 && odd == 0 && tp->snd_cwnd_cnt >= tp->snd_cwnd) {
+ tp->snd_cwnd++;
+ tp->snd_cwnd_cnt = 0;
+ }
+ /* clamp down slowstart cwnd to ssthresh value. */
+ if (is_slowstart)
+ tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh);
+
+ tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp);
+}
+
+static struct tcp_congestion_ops tcp_hybla __read_mostly = {
+ .init = hybla_init,
+ .ssthresh = tcp_reno_ssthresh,
+ .cong_avoid = hybla_cong_avoid,
+ .set_state = hybla_state,
+
+ .owner = THIS_MODULE,
+ .name = "hybla"
+};
+
+static int __init hybla_register(void)
+{
+ BUILD_BUG_ON(sizeof(struct hybla) > ICSK_CA_PRIV_SIZE);
+ return tcp_register_congestion_control(&tcp_hybla);
+}
+
+static void __exit hybla_unregister(void)
+{
+ tcp_unregister_congestion_control(&tcp_hybla);
+}
+
+module_init(hybla_register);
+module_exit(hybla_unregister);
+
+MODULE_AUTHOR("Daniele Lacamera");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("TCP Hybla");
diff --git a/net/ipv4/tcp_illinois.c b/net/ipv4/tcp_illinois.c
new file mode 100644
index 000000000..f71002e4d
--- /dev/null
+++ b/net/ipv4/tcp_illinois.c
@@ -0,0 +1,355 @@
+/*
+ * TCP Illinois congestion control.
+ * Home page:
+ * http://www.ews.uiuc.edu/~shaoliu/tcpillinois/index.html
+ *
+ * The algorithm is described in:
+ * "TCP-Illinois: A Loss and Delay-Based Congestion Control Algorithm
+ * for High-Speed Networks"
+ * http://www.ifp.illinois.edu/~srikant/Papers/liubassri06perf.pdf
+ *
+ * Implemented from description in paper and ns-2 simulation.
+ * Copyright (C) 2007 Stephen Hemminger <shemminger@linux-foundation.org>
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/inet_diag.h>
+#include <asm/div64.h>
+#include <net/tcp.h>
+
+#define ALPHA_SHIFT 7
+#define ALPHA_SCALE (1u<<ALPHA_SHIFT)
+#define ALPHA_MIN ((3*ALPHA_SCALE)/10) /* ~0.3 */
+#define ALPHA_MAX (10*ALPHA_SCALE) /* 10.0 */
+#define ALPHA_BASE ALPHA_SCALE /* 1.0 */
+#define RTT_MAX (U32_MAX / ALPHA_MAX) /* 3.3 secs */
+
+#define BETA_SHIFT 6
+#define BETA_SCALE (1u<<BETA_SHIFT)
+#define BETA_MIN (BETA_SCALE/8) /* 0.125 */
+#define BETA_MAX (BETA_SCALE/2) /* 0.5 */
+#define BETA_BASE BETA_MAX
+
+static int win_thresh __read_mostly = 15;
+module_param(win_thresh, int, 0);
+MODULE_PARM_DESC(win_thresh, "Window threshold for starting adaptive sizing");
+
+static int theta __read_mostly = 5;
+module_param(theta, int, 0);
+MODULE_PARM_DESC(theta, "# of fast RTT's before full growth");
+
+/* TCP Illinois Parameters */
+struct illinois {
+ u64 sum_rtt; /* sum of rtt's measured within last rtt */
+ u16 cnt_rtt; /* # of rtts measured within last rtt */
+ u32 base_rtt; /* min of all rtt in usec */
+ u32 max_rtt; /* max of all rtt in usec */
+ u32 end_seq; /* right edge of current RTT */
+ u32 alpha; /* Additive increase */
+ u32 beta; /* Muliplicative decrease */
+ u16 acked; /* # packets acked by current ACK */
+ u8 rtt_above; /* average rtt has gone above threshold */
+ u8 rtt_low; /* # of rtts measurements below threshold */
+};
+
+static void rtt_reset(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct illinois *ca = inet_csk_ca(sk);
+
+ ca->end_seq = tp->snd_nxt;
+ ca->cnt_rtt = 0;
+ ca->sum_rtt = 0;
+
+ /* TODO: age max_rtt? */
+}
+
+static void tcp_illinois_init(struct sock *sk)
+{
+ struct illinois *ca = inet_csk_ca(sk);
+
+ ca->alpha = ALPHA_MAX;
+ ca->beta = BETA_BASE;
+ ca->base_rtt = 0x7fffffff;
+ ca->max_rtt = 0;
+
+ ca->acked = 0;
+ ca->rtt_low = 0;
+ ca->rtt_above = 0;
+
+ rtt_reset(sk);
+}
+
+/* Measure RTT for each ack. */
+static void tcp_illinois_acked(struct sock *sk, u32 pkts_acked, s32 rtt)
+{
+ struct illinois *ca = inet_csk_ca(sk);
+
+ ca->acked = pkts_acked;
+
+ /* dup ack, no rtt sample */
+ if (rtt < 0)
+ return;
+
+ /* ignore bogus values, this prevents wraparound in alpha math */
+ if (rtt > RTT_MAX)
+ rtt = RTT_MAX;
+
+ /* keep track of minimum RTT seen so far */
+ if (ca->base_rtt > rtt)
+ ca->base_rtt = rtt;
+
+ /* and max */
+ if (ca->max_rtt < rtt)
+ ca->max_rtt = rtt;
+
+ ++ca->cnt_rtt;
+ ca->sum_rtt += rtt;
+}
+
+/* Maximum queuing delay */
+static inline u32 max_delay(const struct illinois *ca)
+{
+ return ca->max_rtt - ca->base_rtt;
+}
+
+/* Average queuing delay */
+static inline u32 avg_delay(const struct illinois *ca)
+{
+ u64 t = ca->sum_rtt;
+
+ do_div(t, ca->cnt_rtt);
+ return t - ca->base_rtt;
+}
+
+/*
+ * Compute value of alpha used for additive increase.
+ * If small window then use 1.0, equivalent to Reno.
+ *
+ * For larger windows, adjust based on average delay.
+ * A. If average delay is at minimum (we are uncongested),
+ * then use large alpha (10.0) to increase faster.
+ * B. If average delay is at maximum (getting congested)
+ * then use small alpha (0.3)
+ *
+ * The result is a convex window growth curve.
+ */
+static u32 alpha(struct illinois *ca, u32 da, u32 dm)
+{
+ u32 d1 = dm / 100; /* Low threshold */
+
+ if (da <= d1) {
+ /* If never got out of low delay zone, then use max */
+ if (!ca->rtt_above)
+ return ALPHA_MAX;
+
+ /* Wait for 5 good RTT's before allowing alpha to go alpha max.
+ * This prevents one good RTT from causing sudden window increase.
+ */
+ if (++ca->rtt_low < theta)
+ return ca->alpha;
+
+ ca->rtt_low = 0;
+ ca->rtt_above = 0;
+ return ALPHA_MAX;
+ }
+
+ ca->rtt_above = 1;
+
+ /*
+ * Based on:
+ *
+ * (dm - d1) amin amax
+ * k1 = -------------------
+ * amax - amin
+ *
+ * (dm - d1) amin
+ * k2 = ---------------- - d1
+ * amax - amin
+ *
+ * k1
+ * alpha = ----------
+ * k2 + da
+ */
+
+ dm -= d1;
+ da -= d1;
+ return (dm * ALPHA_MAX) /
+ (dm + (da * (ALPHA_MAX - ALPHA_MIN)) / ALPHA_MIN);
+}
+
+/*
+ * Beta used for multiplicative decrease.
+ * For small window sizes returns same value as Reno (0.5)
+ *
+ * If delay is small (10% of max) then beta = 1/8
+ * If delay is up to 80% of max then beta = 1/2
+ * In between is a linear function
+ */
+static u32 beta(u32 da, u32 dm)
+{
+ u32 d2, d3;
+
+ d2 = dm / 10;
+ if (da <= d2)
+ return BETA_MIN;
+
+ d3 = (8 * dm) / 10;
+ if (da >= d3 || d3 <= d2)
+ return BETA_MAX;
+
+ /*
+ * Based on:
+ *
+ * bmin d3 - bmax d2
+ * k3 = -------------------
+ * d3 - d2
+ *
+ * bmax - bmin
+ * k4 = -------------
+ * d3 - d2
+ *
+ * b = k3 + k4 da
+ */
+ return (BETA_MIN * d3 - BETA_MAX * d2 + (BETA_MAX - BETA_MIN) * da)
+ / (d3 - d2);
+}
+
+/* Update alpha and beta values once per RTT */
+static void update_params(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct illinois *ca = inet_csk_ca(sk);
+
+ if (tp->snd_cwnd < win_thresh) {
+ ca->alpha = ALPHA_BASE;
+ ca->beta = BETA_BASE;
+ } else if (ca->cnt_rtt > 0) {
+ u32 dm = max_delay(ca);
+ u32 da = avg_delay(ca);
+
+ ca->alpha = alpha(ca, da, dm);
+ ca->beta = beta(da, dm);
+ }
+
+ rtt_reset(sk);
+}
+
+/*
+ * In case of loss, reset to default values
+ */
+static void tcp_illinois_state(struct sock *sk, u8 new_state)
+{
+ struct illinois *ca = inet_csk_ca(sk);
+
+ if (new_state == TCP_CA_Loss) {
+ ca->alpha = ALPHA_BASE;
+ ca->beta = BETA_BASE;
+ ca->rtt_low = 0;
+ ca->rtt_above = 0;
+ rtt_reset(sk);
+ }
+}
+
+/*
+ * Increase window in response to successful acknowledgment.
+ */
+static void tcp_illinois_cong_avoid(struct sock *sk, u32 ack, u32 acked)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct illinois *ca = inet_csk_ca(sk);
+
+ if (after(ack, ca->end_seq))
+ update_params(sk);
+
+ /* RFC2861 only increase cwnd if fully utilized */
+ if (!tcp_is_cwnd_limited(sk))
+ return;
+
+ /* In slow start */
+ if (tp->snd_cwnd <= tp->snd_ssthresh)
+ tcp_slow_start(tp, acked);
+
+ else {
+ u32 delta;
+
+ /* snd_cwnd_cnt is # of packets since last cwnd increment */
+ tp->snd_cwnd_cnt += ca->acked;
+ ca->acked = 1;
+
+ /* This is close approximation of:
+ * tp->snd_cwnd += alpha/tp->snd_cwnd
+ */
+ delta = (tp->snd_cwnd_cnt * ca->alpha) >> ALPHA_SHIFT;
+ if (delta >= tp->snd_cwnd) {
+ tp->snd_cwnd = min(tp->snd_cwnd + delta / tp->snd_cwnd,
+ (u32)tp->snd_cwnd_clamp);
+ tp->snd_cwnd_cnt = 0;
+ }
+ }
+}
+
+static u32 tcp_illinois_ssthresh(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct illinois *ca = inet_csk_ca(sk);
+
+ /* Multiplicative decrease */
+ return max(tp->snd_cwnd - ((tp->snd_cwnd * ca->beta) >> BETA_SHIFT), 2U);
+}
+
+/* Extract info for Tcp socket info provided via netlink. */
+static size_t tcp_illinois_info(struct sock *sk, u32 ext, int *attr,
+ union tcp_cc_info *info)
+{
+ const struct illinois *ca = inet_csk_ca(sk);
+
+ if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) {
+ info->vegas.tcpv_enabled = 1;
+ info->vegas.tcpv_rttcnt = ca->cnt_rtt;
+ info->vegas.tcpv_minrtt = ca->base_rtt;
+ info->vegas.tcpv_rtt = 0;
+
+ if (info->vegas.tcpv_rttcnt > 0) {
+ u64 t = ca->sum_rtt;
+
+ do_div(t, info->vegas.tcpv_rttcnt);
+ info->vegas.tcpv_rtt = t;
+ }
+ *attr = INET_DIAG_VEGASINFO;
+ return sizeof(struct tcpvegas_info);
+ }
+ return 0;
+}
+
+static struct tcp_congestion_ops tcp_illinois __read_mostly = {
+ .init = tcp_illinois_init,
+ .ssthresh = tcp_illinois_ssthresh,
+ .cong_avoid = tcp_illinois_cong_avoid,
+ .set_state = tcp_illinois_state,
+ .get_info = tcp_illinois_info,
+ .pkts_acked = tcp_illinois_acked,
+
+ .owner = THIS_MODULE,
+ .name = "illinois",
+};
+
+static int __init tcp_illinois_register(void)
+{
+ BUILD_BUG_ON(sizeof(struct illinois) > ICSK_CA_PRIV_SIZE);
+ return tcp_register_congestion_control(&tcp_illinois);
+}
+
+static void __exit tcp_illinois_unregister(void)
+{
+ tcp_unregister_congestion_control(&tcp_illinois);
+}
+
+module_init(tcp_illinois_register);
+module_exit(tcp_illinois_unregister);
+
+MODULE_AUTHOR("Stephen Hemminger, Shao Liu");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("TCP Illinois");
+MODULE_VERSION("1.0");
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
new file mode 100644
index 000000000..8e5d1bcbd
--- /dev/null
+++ b/net/ipv4/tcp_input.c
@@ -0,0 +1,6299 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * Implementation of the Transmission Control Protocol(TCP).
+ *
+ * Authors: Ross Biro
+ * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ * Mark Evans, <evansmp@uhura.aston.ac.uk>
+ * Corey Minyard <wf-rch!minyard@relay.EU.net>
+ * Florian La Roche, <flla@stud.uni-sb.de>
+ * Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
+ * Linus Torvalds, <torvalds@cs.helsinki.fi>
+ * Alan Cox, <gw4pts@gw4pts.ampr.org>
+ * Matthew Dillon, <dillon@apollo.west.oic.com>
+ * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
+ * Jorge Cwik, <jorge@laser.satlink.net>
+ */
+
+/*
+ * Changes:
+ * Pedro Roque : Fast Retransmit/Recovery.
+ * Two receive queues.
+ * Retransmit queue handled by TCP.
+ * Better retransmit timer handling.
+ * New congestion avoidance.
+ * Header prediction.
+ * Variable renaming.
+ *
+ * Eric : Fast Retransmit.
+ * Randy Scott : MSS option defines.
+ * Eric Schenk : Fixes to slow start algorithm.
+ * Eric Schenk : Yet another double ACK bug.
+ * Eric Schenk : Delayed ACK bug fixes.
+ * Eric Schenk : Floyd style fast retrans war avoidance.
+ * David S. Miller : Don't allow zero congestion window.
+ * Eric Schenk : Fix retransmitter so that it sends
+ * next packet on ack of previous packet.
+ * Andi Kleen : Moved open_request checking here
+ * and process RSTs for open_requests.
+ * Andi Kleen : Better prune_queue, and other fixes.
+ * Andrey Savochkin: Fix RTT measurements in the presence of
+ * timestamps.
+ * Andrey Savochkin: Check sequence numbers correctly when
+ * removing SACKs due to in sequence incoming
+ * data segments.
+ * Andi Kleen: Make sure we never ack data there is not
+ * enough room for. Also make this condition
+ * a fatal error if it might still happen.
+ * Andi Kleen: Add tcp_measure_rcv_mss to make
+ * connections with MSS<min(MTU,ann. MSS)
+ * work without delayed acks.
+ * Andi Kleen: Process packets with PSH set in the
+ * fast path.
+ * J Hadi Salim: ECN support
+ * Andrei Gurtov,
+ * Pasi Sarolahti,
+ * Panu Kuhlberg: Experimental audit of TCP (re)transmission
+ * engine. Lots of bugs are found.
+ * Pasi Sarolahti: F-RTO for dealing with spurious RTOs
+ */
+
+#define pr_fmt(fmt) "TCP: " fmt
+
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/sysctl.h>
+#include <linux/kernel.h>
+#include <linux/prefetch.h>
+#include <net/dst.h>
+#include <net/tcp.h>
+#include <net/inet_common.h>
+#include <linux/ipsec.h>
+#include <asm/unaligned.h>
+#include <linux/errqueue.h>
+
+int sysctl_tcp_timestamps __read_mostly = 1;
+#ifdef CONFIG_TCP_STEALTH
+EXPORT_SYMBOL(sysctl_tcp_timestamps);
+#endif
+int sysctl_tcp_window_scaling __read_mostly = 1;
+int sysctl_tcp_sack __read_mostly = 1;
+int sysctl_tcp_fack __read_mostly = 1;
+int sysctl_tcp_reordering __read_mostly = TCP_FASTRETRANS_THRESH;
+int sysctl_tcp_max_reordering __read_mostly = 300;
+EXPORT_SYMBOL(sysctl_tcp_reordering);
+int sysctl_tcp_dsack __read_mostly = 1;
+int sysctl_tcp_app_win __read_mostly = 31;
+int sysctl_tcp_adv_win_scale __read_mostly = 1;
+EXPORT_SYMBOL(sysctl_tcp_adv_win_scale);
+
+/* rfc5961 challenge ack rate limiting */
+int sysctl_tcp_challenge_ack_limit = 100;
+
+int sysctl_tcp_stdurg __read_mostly;
+int sysctl_tcp_rfc1337 __read_mostly;
+int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
+int sysctl_tcp_frto __read_mostly = 2;
+
+int sysctl_tcp_thin_dupack __read_mostly;
+
+int sysctl_tcp_moderate_rcvbuf __read_mostly = 1;
+int sysctl_tcp_early_retrans __read_mostly = 3;
+int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2;
+
+#define FLAG_DATA 0x01 /* Incoming frame contained data. */
+#define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */
+#define FLAG_DATA_ACKED 0x04 /* This ACK acknowledged new data. */
+#define FLAG_RETRANS_DATA_ACKED 0x08 /* "" "" some of which was retransmitted. */
+#define FLAG_SYN_ACKED 0x10 /* This ACK acknowledged SYN. */
+#define FLAG_DATA_SACKED 0x20 /* New SACK. */
+#define FLAG_ECE 0x40 /* ECE in this ACK */
+#define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/
+#define FLAG_ORIG_SACK_ACKED 0x200 /* Never retransmitted data are (s)acked */
+#define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */
+#define FLAG_DSACKING_ACK 0x800 /* SACK blocks contained D-SACK info */
+#define FLAG_SACK_RENEGING 0x2000 /* snd_una advanced to a sacked seq */
+#define FLAG_UPDATE_TS_RECENT 0x4000 /* tcp_replace_ts_recent() */
+
+#define FLAG_ACKED (FLAG_DATA_ACKED|FLAG_SYN_ACKED)
+#define FLAG_NOT_DUP (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED)
+#define FLAG_CA_ALERT (FLAG_DATA_SACKED|FLAG_ECE)
+#define FLAG_FORWARD_PROGRESS (FLAG_ACKED|FLAG_DATA_SACKED)
+
+#define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH)
+#define TCP_HP_BITS (~(TCP_RESERVED_BITS|TCP_FLAG_PSH))
+
+/* Adapt the MSS value used to make delayed ack decision to the
+ * real world.
+ */
+static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ const unsigned int lss = icsk->icsk_ack.last_seg_size;
+ unsigned int len;
+
+ icsk->icsk_ack.last_seg_size = 0;
+
+ /* skb->len may jitter because of SACKs, even if peer
+ * sends good full-sized frames.
+ */
+ len = skb_shinfo(skb)->gso_size ? : skb->len;
+ if (len >= icsk->icsk_ack.rcv_mss) {
+ icsk->icsk_ack.rcv_mss = len;
+ } else {
+ /* Otherwise, we make more careful check taking into account,
+ * that SACKs block is variable.
+ *
+ * "len" is invariant segment length, including TCP header.
+ */
+ len += skb->data - skb_transport_header(skb);
+ if (len >= TCP_MSS_DEFAULT + sizeof(struct tcphdr) ||
+ /* If PSH is not set, packet should be
+ * full sized, provided peer TCP is not badly broken.
+ * This observation (if it is correct 8)) allows
+ * to handle super-low mtu links fairly.
+ */
+ (len >= TCP_MIN_MSS + sizeof(struct tcphdr) &&
+ !(tcp_flag_word(tcp_hdr(skb)) & TCP_REMNANT))) {
+ /* Subtract also invariant (if peer is RFC compliant),
+ * tcp header plus fixed timestamp option length.
+ * Resulting "len" is MSS free of SACK jitter.
+ */
+ len -= tcp_sk(sk)->tcp_header_len;
+ icsk->icsk_ack.last_seg_size = len;
+ if (len == lss) {
+ icsk->icsk_ack.rcv_mss = len;
+ return;
+ }
+ }
+ if (icsk->icsk_ack.pending & ICSK_ACK_PUSHED)
+ icsk->icsk_ack.pending |= ICSK_ACK_PUSHED2;
+ icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
+ }
+}
+
+static void tcp_incr_quickack(struct sock *sk)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ unsigned int quickacks = tcp_sk(sk)->rcv_wnd / (2 * icsk->icsk_ack.rcv_mss);
+
+ if (quickacks == 0)
+ quickacks = 2;
+ if (quickacks > icsk->icsk_ack.quick)
+ icsk->icsk_ack.quick = min(quickacks, TCP_MAX_QUICKACKS);
+}
+
+static void tcp_enter_quickack_mode(struct sock *sk)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ tcp_incr_quickack(sk);
+ icsk->icsk_ack.pingpong = 0;
+ icsk->icsk_ack.ato = TCP_ATO_MIN;
+}
+
+/* Send ACKs quickly, if "quick" count is not exhausted
+ * and the session is not interactive.
+ */
+
+static inline bool tcp_in_quickack_mode(const struct sock *sk)
+{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+
+ return icsk->icsk_ack.quick && !icsk->icsk_ack.pingpong;
+}
+
+static void tcp_ecn_queue_cwr(struct tcp_sock *tp)
+{
+ if (tp->ecn_flags & TCP_ECN_OK)
+ tp->ecn_flags |= TCP_ECN_QUEUE_CWR;
+}
+
+static void tcp_ecn_accept_cwr(struct tcp_sock *tp, const struct sk_buff *skb)
+{
+ if (tcp_hdr(skb)->cwr)
+ tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
+}
+
+static void tcp_ecn_withdraw_cwr(struct tcp_sock *tp)
+{
+ tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
+}
+
+static void __tcp_ecn_check_ce(struct tcp_sock *tp, const struct sk_buff *skb)
+{
+ switch (TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK) {
+ case INET_ECN_NOT_ECT:
+ /* Funny extension: if ECT is not set on a segment,
+ * and we already seen ECT on a previous segment,
+ * it is probably a retransmit.
+ */
+ if (tp->ecn_flags & TCP_ECN_SEEN)
+ tcp_enter_quickack_mode((struct sock *)tp);
+ break;
+ case INET_ECN_CE:
+ if (tcp_ca_needs_ecn((struct sock *)tp))
+ tcp_ca_event((struct sock *)tp, CA_EVENT_ECN_IS_CE);
+
+ if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) {
+ /* Better not delay acks, sender can have a very low cwnd */
+ tcp_enter_quickack_mode((struct sock *)tp);
+ tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
+ }
+ tp->ecn_flags |= TCP_ECN_SEEN;
+ break;
+ default:
+ if (tcp_ca_needs_ecn((struct sock *)tp))
+ tcp_ca_event((struct sock *)tp, CA_EVENT_ECN_NO_CE);
+ tp->ecn_flags |= TCP_ECN_SEEN;
+ break;
+ }
+}
+
+static void tcp_ecn_check_ce(struct tcp_sock *tp, const struct sk_buff *skb)
+{
+ if (tp->ecn_flags & TCP_ECN_OK)
+ __tcp_ecn_check_ce(tp, skb);
+}
+
+static void tcp_ecn_rcv_synack(struct tcp_sock *tp, const struct tcphdr *th)
+{
+ if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || th->cwr))
+ tp->ecn_flags &= ~TCP_ECN_OK;
+}
+
+static void tcp_ecn_rcv_syn(struct tcp_sock *tp, const struct tcphdr *th)
+{
+ if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || !th->cwr))
+ tp->ecn_flags &= ~TCP_ECN_OK;
+}
+
+static bool tcp_ecn_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr *th)
+{
+ if (th->ece && !th->syn && (tp->ecn_flags & TCP_ECN_OK))
+ return true;
+ return false;
+}
+
+/* Buffer size and advertised window tuning.
+ *
+ * 1. Tuning sk->sk_sndbuf, when connection enters established state.
+ */
+
+static void tcp_sndbuf_expand(struct sock *sk)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ int sndmem, per_mss;
+ u32 nr_segs;
+
+ /* Worst case is non GSO/TSO : each frame consumes one skb
+ * and skb->head is kmalloced using power of two area of memory
+ */
+ per_mss = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache) +
+ MAX_TCP_HEADER +
+ SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+
+ per_mss = roundup_pow_of_two(per_mss) +
+ SKB_DATA_ALIGN(sizeof(struct sk_buff));
+
+ nr_segs = max_t(u32, TCP_INIT_CWND, tp->snd_cwnd);
+ nr_segs = max_t(u32, nr_segs, tp->reordering + 1);
+
+ /* Fast Recovery (RFC 5681 3.2) :
+ * Cubic needs 1.7 factor, rounded to 2 to include
+ * extra cushion (application might react slowly to POLLOUT)
+ */
+ sndmem = 2 * nr_segs * per_mss;
+
+ if (sk->sk_sndbuf < sndmem)
+ sk->sk_sndbuf = min(sndmem, sysctl_tcp_wmem[2]);
+}
+
+/* 2. Tuning advertised window (window_clamp, rcv_ssthresh)
+ *
+ * All tcp_full_space() is split to two parts: "network" buffer, allocated
+ * forward and advertised in receiver window (tp->rcv_wnd) and
+ * "application buffer", required to isolate scheduling/application
+ * latencies from network.
+ * window_clamp is maximal advertised window. It can be less than
+ * tcp_full_space(), in this case tcp_full_space() - window_clamp
+ * is reserved for "application" buffer. The less window_clamp is
+ * the smoother our behaviour from viewpoint of network, but the lower
+ * throughput and the higher sensitivity of the connection to losses. 8)
+ *
+ * rcv_ssthresh is more strict window_clamp used at "slow start"
+ * phase to predict further behaviour of this connection.
+ * It is used for two goals:
+ * - to enforce header prediction at sender, even when application
+ * requires some significant "application buffer". It is check #1.
+ * - to prevent pruning of receive queue because of misprediction
+ * of receiver window. Check #2.
+ *
+ * The scheme does not work when sender sends good segments opening
+ * window and then starts to feed us spaghetti. But it should work
+ * in common situations. Otherwise, we have to rely on queue collapsing.
+ */
+
+/* Slow part of check#2. */
+static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ /* Optimize this! */
+ int truesize = tcp_win_from_space(skb->truesize) >> 1;
+ int window = tcp_win_from_space(sysctl_tcp_rmem[2]) >> 1;
+
+ while (tp->rcv_ssthresh <= window) {
+ if (truesize <= skb->len)
+ return 2 * inet_csk(sk)->icsk_ack.rcv_mss;
+
+ truesize >>= 1;
+ window >>= 1;
+ }
+ return 0;
+}
+
+static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ /* Check #1 */
+ if (tp->rcv_ssthresh < tp->window_clamp &&
+ (int)tp->rcv_ssthresh < tcp_space(sk) &&
+ !sk_under_memory_pressure(sk)) {
+ int incr;
+
+ /* Check #2. Increase window, if skb with such overhead
+ * will fit to rcvbuf in future.
+ */
+ if (tcp_win_from_space(skb->truesize) <= skb->len)
+ incr = 2 * tp->advmss;
+ else
+ incr = __tcp_grow_window(sk, skb);
+
+ if (incr) {
+ incr = max_t(int, incr, 2 * skb->len);
+ tp->rcv_ssthresh = min(tp->rcv_ssthresh + incr,
+ tp->window_clamp);
+ inet_csk(sk)->icsk_ack.quick |= 1;
+ }
+ }
+}
+
+/* 3. Tuning rcvbuf, when connection enters established state. */
+static void tcp_fixup_rcvbuf(struct sock *sk)
+{
+ u32 mss = tcp_sk(sk)->advmss;
+ int rcvmem;
+
+ rcvmem = 2 * SKB_TRUESIZE(mss + MAX_TCP_HEADER) *
+ tcp_default_init_rwnd(mss);
+
+ /* Dynamic Right Sizing (DRS) has 2 to 3 RTT latency
+ * Allow enough cushion so that sender is not limited by our window
+ */
+ if (sysctl_tcp_moderate_rcvbuf)
+ rcvmem <<= 2;
+
+ if (sk->sk_rcvbuf < rcvmem)
+ sk->sk_rcvbuf = min(rcvmem, sysctl_tcp_rmem[2]);
+}
+
+/* 4. Try to fixup all. It is made immediately after connection enters
+ * established state.
+ */
+void tcp_init_buffer_space(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ int maxwin;
+
+ if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK))
+ tcp_fixup_rcvbuf(sk);
+ if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK))
+ tcp_sndbuf_expand(sk);
+
+ tp->rcvq_space.space = tp->rcv_wnd;
+ tp->rcvq_space.time = tcp_time_stamp;
+ tp->rcvq_space.seq = tp->copied_seq;
+
+ maxwin = tcp_full_space(sk);
+
+ if (tp->window_clamp >= maxwin) {
+ tp->window_clamp = maxwin;
+
+ if (sysctl_tcp_app_win && maxwin > 4 * tp->advmss)
+ tp->window_clamp = max(maxwin -
+ (maxwin >> sysctl_tcp_app_win),
+ 4 * tp->advmss);
+ }
+
+ /* Force reservation of one segment. */
+ if (sysctl_tcp_app_win &&
+ tp->window_clamp > 2 * tp->advmss &&
+ tp->window_clamp + tp->advmss > maxwin)
+ tp->window_clamp = max(2 * tp->advmss, maxwin - tp->advmss);
+
+ tp->rcv_ssthresh = min(tp->rcv_ssthresh, tp->window_clamp);
+ tp->snd_cwnd_stamp = tcp_time_stamp;
+}
+
+/* 5. Recalculate window clamp after socket hit its memory bounds. */
+static void tcp_clamp_window(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
+ icsk->icsk_ack.quick = 0;
+
+ if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] &&
+ !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) &&
+ !sk_under_memory_pressure(sk) &&
+ sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)) {
+ sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc),
+ sysctl_tcp_rmem[2]);
+ }
+ if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf)
+ tp->rcv_ssthresh = min(tp->window_clamp, 2U * tp->advmss);
+}
+
+/* Initialize RCV_MSS value.
+ * RCV_MSS is an our guess about MSS used by the peer.
+ * We haven't any direct information about the MSS.
+ * It's better to underestimate the RCV_MSS rather than overestimate.
+ * Overestimations make us ACKing less frequently than needed.
+ * Underestimations are more easy to detect and fix by tcp_measure_rcv_mss().
+ */
+void tcp_initialize_rcv_mss(struct sock *sk)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ unsigned int hint = min_t(unsigned int, tp->advmss, tp->mss_cache);
+
+ hint = min(hint, tp->rcv_wnd / 2);
+ hint = min(hint, TCP_MSS_DEFAULT);
+ hint = max(hint, TCP_MIN_MSS);
+
+ inet_csk(sk)->icsk_ack.rcv_mss = hint;
+}
+EXPORT_SYMBOL(tcp_initialize_rcv_mss);
+
+/* Receiver "autotuning" code.
+ *
+ * The algorithm for RTT estimation w/o timestamps is based on
+ * Dynamic Right-Sizing (DRS) by Wu Feng and Mike Fisk of LANL.
+ * <http://public.lanl.gov/radiant/pubs.html#DRS>
+ *
+ * More detail on this code can be found at
+ * <http://staff.psc.edu/jheffner/>,
+ * though this reference is out of date. A new paper
+ * is pending.
+ */
+static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep)
+{
+ u32 new_sample = tp->rcv_rtt_est.rtt;
+ long m = sample;
+
+ if (m == 0)
+ m = 1;
+
+ if (new_sample != 0) {
+ /* If we sample in larger samples in the non-timestamp
+ * case, we could grossly overestimate the RTT especially
+ * with chatty applications or bulk transfer apps which
+ * are stalled on filesystem I/O.
+ *
+ * Also, since we are only going for a minimum in the
+ * non-timestamp case, we do not smooth things out
+ * else with timestamps disabled convergence takes too
+ * long.
+ */
+ if (!win_dep) {
+ m -= (new_sample >> 3);
+ new_sample += m;
+ } else {
+ m <<= 3;
+ if (m < new_sample)
+ new_sample = m;
+ }
+ } else {
+ /* No previous measure. */
+ new_sample = m << 3;
+ }
+
+ if (tp->rcv_rtt_est.rtt != new_sample)
+ tp->rcv_rtt_est.rtt = new_sample;
+}
+
+static inline void tcp_rcv_rtt_measure(struct tcp_sock *tp)
+{
+ if (tp->rcv_rtt_est.time == 0)
+ goto new_measure;
+ if (before(tp->rcv_nxt, tp->rcv_rtt_est.seq))
+ return;
+ tcp_rcv_rtt_update(tp, tcp_time_stamp - tp->rcv_rtt_est.time, 1);
+
+new_measure:
+ tp->rcv_rtt_est.seq = tp->rcv_nxt + tp->rcv_wnd;
+ tp->rcv_rtt_est.time = tcp_time_stamp;
+}
+
+static inline void tcp_rcv_rtt_measure_ts(struct sock *sk,
+ const struct sk_buff *skb)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ if (tp->rx_opt.rcv_tsecr &&
+ (TCP_SKB_CB(skb)->end_seq -
+ TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss))
+ tcp_rcv_rtt_update(tp, tcp_time_stamp - tp->rx_opt.rcv_tsecr, 0);
+}
+
+/*
+ * This function should be called every time data is copied to user space.
+ * It calculates the appropriate TCP receive buffer space.
+ */
+void tcp_rcv_space_adjust(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ int time;
+ int copied;
+
+ time = tcp_time_stamp - tp->rcvq_space.time;
+ if (time < (tp->rcv_rtt_est.rtt >> 3) || tp->rcv_rtt_est.rtt == 0)
+ return;
+
+ /* Number of bytes copied to user in last RTT */
+ copied = tp->copied_seq - tp->rcvq_space.seq;
+ if (copied <= tp->rcvq_space.space)
+ goto new_measure;
+
+ /* A bit of theory :
+ * copied = bytes received in previous RTT, our base window
+ * To cope with packet losses, we need a 2x factor
+ * To cope with slow start, and sender growing its cwin by 100 %
+ * every RTT, we need a 4x factor, because the ACK we are sending
+ * now is for the next RTT, not the current one :
+ * <prev RTT . ><current RTT .. ><next RTT .... >
+ */
+
+ if (sysctl_tcp_moderate_rcvbuf &&
+ !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
+ int rcvwin, rcvmem, rcvbuf;
+
+ /* minimal window to cope with packet losses, assuming
+ * steady state. Add some cushion because of small variations.
+ */
+ rcvwin = (copied << 1) + 16 * tp->advmss;
+
+ /* If rate increased by 25%,
+ * assume slow start, rcvwin = 3 * copied
+ * If rate increased by 50%,
+ * assume sender can use 2x growth, rcvwin = 4 * copied
+ */
+ if (copied >=
+ tp->rcvq_space.space + (tp->rcvq_space.space >> 2)) {
+ if (copied >=
+ tp->rcvq_space.space + (tp->rcvq_space.space >> 1))
+ rcvwin <<= 1;
+ else
+ rcvwin += (rcvwin >> 1);
+ }
+
+ rcvmem = SKB_TRUESIZE(tp->advmss + MAX_TCP_HEADER);
+ while (tcp_win_from_space(rcvmem) < tp->advmss)
+ rcvmem += 128;
+
+ rcvbuf = min(rcvwin / tp->advmss * rcvmem, sysctl_tcp_rmem[2]);
+ if (rcvbuf > sk->sk_rcvbuf) {
+ sk->sk_rcvbuf = rcvbuf;
+
+ /* Make the window clamp follow along. */
+ tp->window_clamp = rcvwin;
+ }
+ }
+ tp->rcvq_space.space = copied;
+
+new_measure:
+ tp->rcvq_space.seq = tp->copied_seq;
+ tp->rcvq_space.time = tcp_time_stamp;
+}
+
+/* There is something which you must keep in mind when you analyze the
+ * behavior of the tp->ato delayed ack timeout interval. When a
+ * connection starts up, we want to ack as quickly as possible. The
+ * problem is that "good" TCP's do slow start at the beginning of data
+ * transmission. The means that until we send the first few ACK's the
+ * sender will sit on his end and only queue most of his data, because
+ * he can only send snd_cwnd unacked packets at any given time. For
+ * each ACK we send, he increments snd_cwnd and transmits more of his
+ * queue. -DaveM
+ */
+static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ u32 now;
+
+ inet_csk_schedule_ack(sk);
+
+ tcp_measure_rcv_mss(sk, skb);
+
+ tcp_rcv_rtt_measure(tp);
+
+ now = tcp_time_stamp;
+
+ if (!icsk->icsk_ack.ato) {
+ /* The _first_ data packet received, initialize
+ * delayed ACK engine.
+ */
+ tcp_incr_quickack(sk);
+ icsk->icsk_ack.ato = TCP_ATO_MIN;
+ } else {
+ int m = now - icsk->icsk_ack.lrcvtime;
+
+ if (m <= TCP_ATO_MIN / 2) {
+ /* The fastest case is the first. */
+ icsk->icsk_ack.ato = (icsk->icsk_ack.ato >> 1) + TCP_ATO_MIN / 2;
+ } else if (m < icsk->icsk_ack.ato) {
+ icsk->icsk_ack.ato = (icsk->icsk_ack.ato >> 1) + m;
+ if (icsk->icsk_ack.ato > icsk->icsk_rto)
+ icsk->icsk_ack.ato = icsk->icsk_rto;
+ } else if (m > icsk->icsk_rto) {
+ /* Too long gap. Apparently sender failed to
+ * restart window, so that we send ACKs quickly.
+ */
+ tcp_incr_quickack(sk);
+ sk_mem_reclaim(sk);
+ }
+ }
+ icsk->icsk_ack.lrcvtime = now;
+
+ tcp_ecn_check_ce(tp, skb);
+
+ if (skb->len >= 128)
+ tcp_grow_window(sk, skb);
+}
+
+/* Called to compute a smoothed rtt estimate. The data fed to this
+ * routine either comes from timestamps, or from segments that were
+ * known _not_ to have been retransmitted [see Karn/Partridge
+ * Proceedings SIGCOMM 87]. The algorithm is from the SIGCOMM 88
+ * piece by Van Jacobson.
+ * NOTE: the next three routines used to be one big routine.
+ * To save cycles in the RFC 1323 implementation it was better to break
+ * it up into three procedures. -- erics
+ */
+static void tcp_rtt_estimator(struct sock *sk, long mrtt_us)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ long m = mrtt_us; /* RTT */
+ u32 srtt = tp->srtt_us;
+
+ /* The following amusing code comes from Jacobson's
+ * article in SIGCOMM '88. Note that rtt and mdev
+ * are scaled versions of rtt and mean deviation.
+ * This is designed to be as fast as possible
+ * m stands for "measurement".
+ *
+ * On a 1990 paper the rto value is changed to:
+ * RTO = rtt + 4 * mdev
+ *
+ * Funny. This algorithm seems to be very broken.
+ * These formulae increase RTO, when it should be decreased, increase
+ * too slowly, when it should be increased quickly, decrease too quickly
+ * etc. I guess in BSD RTO takes ONE value, so that it is absolutely
+ * does not matter how to _calculate_ it. Seems, it was trap
+ * that VJ failed to avoid. 8)
+ */
+ if (srtt != 0) {
+ m -= (srtt >> 3); /* m is now error in rtt est */
+ srtt += m; /* rtt = 7/8 rtt + 1/8 new */
+ if (m < 0) {
+ m = -m; /* m is now abs(error) */
+ m -= (tp->mdev_us >> 2); /* similar update on mdev */
+ /* This is similar to one of Eifel findings.
+ * Eifel blocks mdev updates when rtt decreases.
+ * This solution is a bit different: we use finer gain
+ * for mdev in this case (alpha*beta).
+ * Like Eifel it also prevents growth of rto,
+ * but also it limits too fast rto decreases,
+ * happening in pure Eifel.
+ */
+ if (m > 0)
+ m >>= 3;
+ } else {
+ m -= (tp->mdev_us >> 2); /* similar update on mdev */
+ }
+ tp->mdev_us += m; /* mdev = 3/4 mdev + 1/4 new */
+ if (tp->mdev_us > tp->mdev_max_us) {
+ tp->mdev_max_us = tp->mdev_us;
+ if (tp->mdev_max_us > tp->rttvar_us)
+ tp->rttvar_us = tp->mdev_max_us;
+ }
+ if (after(tp->snd_una, tp->rtt_seq)) {
+ if (tp->mdev_max_us < tp->rttvar_us)
+ tp->rttvar_us -= (tp->rttvar_us - tp->mdev_max_us) >> 2;
+ tp->rtt_seq = tp->snd_nxt;
+ tp->mdev_max_us = tcp_rto_min_us(sk);
+ }
+ } else {
+ /* no previous measure. */
+ srtt = m << 3; /* take the measured time to be rtt */
+ tp->mdev_us = m << 1; /* make sure rto = 3*rtt */
+ tp->rttvar_us = max(tp->mdev_us, tcp_rto_min_us(sk));
+ tp->mdev_max_us = tp->rttvar_us;
+ tp->rtt_seq = tp->snd_nxt;
+ }
+ tp->srtt_us = max(1U, srtt);
+}
+
+/* Set the sk_pacing_rate to allow proper sizing of TSO packets.
+ * Note: TCP stack does not yet implement pacing.
+ * FQ packet scheduler can be used to implement cheap but effective
+ * TCP pacing, to smooth the burst on large writes when packets
+ * in flight is significantly lower than cwnd (or rwin)
+ */
+static void tcp_update_pacing_rate(struct sock *sk)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ u64 rate;
+
+ /* set sk_pacing_rate to 200 % of current rate (mss * cwnd / srtt) */
+ rate = (u64)tp->mss_cache * 2 * (USEC_PER_SEC << 3);
+
+ rate *= max(tp->snd_cwnd, tp->packets_out);
+
+ if (likely(tp->srtt_us))
+ do_div(rate, tp->srtt_us);
+
+ /* ACCESS_ONCE() is needed because sch_fq fetches sk_pacing_rate
+ * without any lock. We want to make sure compiler wont store
+ * intermediate values in this location.
+ */
+ ACCESS_ONCE(sk->sk_pacing_rate) = min_t(u64, rate,
+ sk->sk_max_pacing_rate);
+}
+
+/* Calculate rto without backoff. This is the second half of Van Jacobson's
+ * routine referred to above.
+ */
+static void tcp_set_rto(struct sock *sk)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ /* Old crap is replaced with new one. 8)
+ *
+ * More seriously:
+ * 1. If rtt variance happened to be less 50msec, it is hallucination.
+ * It cannot be less due to utterly erratic ACK generation made
+ * at least by solaris and freebsd. "Erratic ACKs" has _nothing_
+ * to do with delayed acks, because at cwnd>2 true delack timeout
+ * is invisible. Actually, Linux-2.4 also generates erratic
+ * ACKs in some circumstances.
+ */
+ inet_csk(sk)->icsk_rto = __tcp_set_rto(tp);
+
+ /* 2. Fixups made earlier cannot be right.
+ * If we do not estimate RTO correctly without them,
+ * all the algo is pure shit and should be replaced
+ * with correct one. It is exactly, which we pretend to do.
+ */
+
+ /* NOTE: clamping at TCP_RTO_MIN is not required, current algo
+ * guarantees that rto is higher.
+ */
+ tcp_bound_rto(sk);
+}
+
+__u32 tcp_init_cwnd(const struct tcp_sock *tp, const struct dst_entry *dst)
+{
+ __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0);
+
+ if (!cwnd)
+ cwnd = TCP_INIT_CWND;
+ return min_t(__u32, cwnd, tp->snd_cwnd_clamp);
+}
+
+/*
+ * Packet counting of FACK is based on in-order assumptions, therefore TCP
+ * disables it when reordering is detected
+ */
+void tcp_disable_fack(struct tcp_sock *tp)
+{
+ /* RFC3517 uses different metric in lost marker => reset on change */
+ if (tcp_is_fack(tp))
+ tp->lost_skb_hint = NULL;
+ tp->rx_opt.sack_ok &= ~TCP_FACK_ENABLED;
+}
+
+/* Take a notice that peer is sending D-SACKs */
+static void tcp_dsack_seen(struct tcp_sock *tp)
+{
+ tp->rx_opt.sack_ok |= TCP_DSACK_SEEN;
+}
+
+static void tcp_update_reordering(struct sock *sk, const int metric,
+ const int ts)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ if (metric > tp->reordering) {
+ int mib_idx;
+
+ tp->reordering = min(sysctl_tcp_max_reordering, metric);
+
+ /* This exciting event is worth to be remembered. 8) */
+ if (ts)
+ mib_idx = LINUX_MIB_TCPTSREORDER;
+ else if (tcp_is_reno(tp))
+ mib_idx = LINUX_MIB_TCPRENOREORDER;
+ else if (tcp_is_fack(tp))
+ mib_idx = LINUX_MIB_TCPFACKREORDER;
+ else
+ mib_idx = LINUX_MIB_TCPSACKREORDER;
+
+ NET_INC_STATS_BH(sock_net(sk), mib_idx);
+#if FASTRETRANS_DEBUG > 1
+ pr_debug("Disorder%d %d %u f%u s%u rr%d\n",
+ tp->rx_opt.sack_ok, inet_csk(sk)->icsk_ca_state,
+ tp->reordering,
+ tp->fackets_out,
+ tp->sacked_out,
+ tp->undo_marker ? tp->undo_retrans : 0);
+#endif
+ tcp_disable_fack(tp);
+ }
+
+ if (metric > 0)
+ tcp_disable_early_retrans(tp);
+}
+
+/* This must be called before lost_out is incremented */
+static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb)
+{
+ if (!tp->retransmit_skb_hint ||
+ before(TCP_SKB_CB(skb)->seq,
+ TCP_SKB_CB(tp->retransmit_skb_hint)->seq))
+ tp->retransmit_skb_hint = skb;
+
+ if (!tp->lost_out ||
+ after(TCP_SKB_CB(skb)->end_seq, tp->retransmit_high))
+ tp->retransmit_high = TCP_SKB_CB(skb)->end_seq;
+}
+
+static void tcp_skb_mark_lost(struct tcp_sock *tp, struct sk_buff *skb)
+{
+ if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) {
+ tcp_verify_retransmit_hint(tp, skb);
+
+ tp->lost_out += tcp_skb_pcount(skb);
+ TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
+ }
+}
+
+static void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp,
+ struct sk_buff *skb)
+{
+ tcp_verify_retransmit_hint(tp, skb);
+
+ if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) {
+ tp->lost_out += tcp_skb_pcount(skb);
+ TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
+ }
+}
+
+/* This procedure tags the retransmission queue when SACKs arrive.
+ *
+ * We have three tag bits: SACKED(S), RETRANS(R) and LOST(L).
+ * Packets in queue with these bits set are counted in variables
+ * sacked_out, retrans_out and lost_out, correspondingly.
+ *
+ * Valid combinations are:
+ * Tag InFlight Description
+ * 0 1 - orig segment is in flight.
+ * S 0 - nothing flies, orig reached receiver.
+ * L 0 - nothing flies, orig lost by net.
+ * R 2 - both orig and retransmit are in flight.
+ * L|R 1 - orig is lost, retransmit is in flight.
+ * S|R 1 - orig reached receiver, retrans is still in flight.
+ * (L|S|R is logically valid, it could occur when L|R is sacked,
+ * but it is equivalent to plain S and code short-curcuits it to S.
+ * L|S is logically invalid, it would mean -1 packet in flight 8))
+ *
+ * These 6 states form finite state machine, controlled by the following events:
+ * 1. New ACK (+SACK) arrives. (tcp_sacktag_write_queue())
+ * 2. Retransmission. (tcp_retransmit_skb(), tcp_xmit_retransmit_queue())
+ * 3. Loss detection event of two flavors:
+ * A. Scoreboard estimator decided the packet is lost.
+ * A'. Reno "three dupacks" marks head of queue lost.
+ * A''. Its FACK modification, head until snd.fack is lost.
+ * B. SACK arrives sacking SND.NXT at the moment, when the
+ * segment was retransmitted.
+ * 4. D-SACK added new rule: D-SACK changes any tag to S.
+ *
+ * It is pleasant to note, that state diagram turns out to be commutative,
+ * so that we are allowed not to be bothered by order of our actions,
+ * when multiple events arrive simultaneously. (see the function below).
+ *
+ * Reordering detection.
+ * --------------------
+ * Reordering metric is maximal distance, which a packet can be displaced
+ * in packet stream. With SACKs we can estimate it:
+ *
+ * 1. SACK fills old hole and the corresponding segment was not
+ * ever retransmitted -> reordering. Alas, we cannot use it
+ * when segment was retransmitted.
+ * 2. The last flaw is solved with D-SACK. D-SACK arrives
+ * for retransmitted and already SACKed segment -> reordering..
+ * Both of these heuristics are not used in Loss state, when we cannot
+ * account for retransmits accurately.
+ *
+ * SACK block validation.
+ * ----------------------
+ *
+ * SACK block range validation checks that the received SACK block fits to
+ * the expected sequence limits, i.e., it is between SND.UNA and SND.NXT.
+ * Note that SND.UNA is not included to the range though being valid because
+ * it means that the receiver is rather inconsistent with itself reporting
+ * SACK reneging when it should advance SND.UNA. Such SACK block this is
+ * perfectly valid, however, in light of RFC2018 which explicitly states
+ * that "SACK block MUST reflect the newest segment. Even if the newest
+ * segment is going to be discarded ...", not that it looks very clever
+ * in case of head skb. Due to potentional receiver driven attacks, we
+ * choose to avoid immediate execution of a walk in write queue due to
+ * reneging and defer head skb's loss recovery to standard loss recovery
+ * procedure that will eventually trigger (nothing forbids us doing this).
+ *
+ * Implements also blockage to start_seq wrap-around. Problem lies in the
+ * fact that though start_seq (s) is before end_seq (i.e., not reversed),
+ * there's no guarantee that it will be before snd_nxt (n). The problem
+ * happens when start_seq resides between end_seq wrap (e_w) and snd_nxt
+ * wrap (s_w):
+ *
+ * <- outs wnd -> <- wrapzone ->
+ * u e n u_w e_w s n_w
+ * | | | | | | |
+ * |<------------+------+----- TCP seqno space --------------+---------->|
+ * ...-- <2^31 ->| |<--------...
+ * ...---- >2^31 ------>| |<--------...
+ *
+ * Current code wouldn't be vulnerable but it's better still to discard such
+ * crazy SACK blocks. Doing this check for start_seq alone closes somewhat
+ * similar case (end_seq after snd_nxt wrap) as earlier reversed check in
+ * snd_nxt wrap -> snd_una region will then become "well defined", i.e.,
+ * equal to the ideal case (infinite seqno space without wrap caused issues).
+ *
+ * With D-SACK the lower bound is extended to cover sequence space below
+ * SND.UNA down to undo_marker, which is the last point of interest. Yet
+ * again, D-SACK block must not to go across snd_una (for the same reason as
+ * for the normal SACK blocks, explained above). But there all simplicity
+ * ends, TCP might receive valid D-SACKs below that. As long as they reside
+ * fully below undo_marker they do not affect behavior in anyway and can
+ * therefore be safely ignored. In rare cases (which are more or less
+ * theoretical ones), the D-SACK will nicely cross that boundary due to skb
+ * fragmentation and packet reordering past skb's retransmission. To consider
+ * them correctly, the acceptable range must be extended even more though
+ * the exact amount is rather hard to quantify. However, tp->max_window can
+ * be used as an exaggerated estimate.
+ */
+static bool tcp_is_sackblock_valid(struct tcp_sock *tp, bool is_dsack,
+ u32 start_seq, u32 end_seq)
+{
+ /* Too far in future, or reversed (interpretation is ambiguous) */
+ if (after(end_seq, tp->snd_nxt) || !before(start_seq, end_seq))
+ return false;
+
+ /* Nasty start_seq wrap-around check (see comments above) */
+ if (!before(start_seq, tp->snd_nxt))
+ return false;
+
+ /* In outstanding window? ...This is valid exit for D-SACKs too.
+ * start_seq == snd_una is non-sensical (see comments above)
+ */
+ if (after(start_seq, tp->snd_una))
+ return true;
+
+ if (!is_dsack || !tp->undo_marker)
+ return false;
+
+ /* ...Then it's D-SACK, and must reside below snd_una completely */
+ if (after(end_seq, tp->snd_una))
+ return false;
+
+ if (!before(start_seq, tp->undo_marker))
+ return true;
+
+ /* Too old */
+ if (!after(end_seq, tp->undo_marker))
+ return false;
+
+ /* Undo_marker boundary crossing (overestimates a lot). Known already:
+ * start_seq < undo_marker and end_seq >= undo_marker.
+ */
+ return !before(start_seq, end_seq - tp->max_window);
+}
+
+/* Check for lost retransmit. This superb idea is borrowed from "ratehalving".
+ * Event "B". Later note: FACK people cheated me again 8), we have to account
+ * for reordering! Ugly, but should help.
+ *
+ * Search retransmitted skbs from write_queue that were sent when snd_nxt was
+ * less than what is now known to be received by the other end (derived from
+ * highest SACK block). Also calculate the lowest snd_nxt among the remaining
+ * retransmitted skbs to avoid some costly processing per ACKs.
+ */
+static void tcp_mark_lost_retrans(struct sock *sk)
+{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *skb;
+ int cnt = 0;
+ u32 new_low_seq = tp->snd_nxt;
+ u32 received_upto = tcp_highest_sack_seq(tp);
+
+ if (!tcp_is_fack(tp) || !tp->retrans_out ||
+ !after(received_upto, tp->lost_retrans_low) ||
+ icsk->icsk_ca_state != TCP_CA_Recovery)
+ return;
+
+ tcp_for_write_queue(skb, sk) {
+ u32 ack_seq = TCP_SKB_CB(skb)->ack_seq;
+
+ if (skb == tcp_send_head(sk))
+ break;
+ if (cnt == tp->retrans_out)
+ break;
+ if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
+ continue;
+
+ if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS))
+ continue;
+
+ /* TODO: We would like to get rid of tcp_is_fack(tp) only
+ * constraint here (see above) but figuring out that at
+ * least tp->reordering SACK blocks reside between ack_seq
+ * and received_upto is not easy task to do cheaply with
+ * the available datastructures.
+ *
+ * Whether FACK should check here for tp->reordering segs
+ * in-between one could argue for either way (it would be
+ * rather simple to implement as we could count fack_count
+ * during the walk and do tp->fackets_out - fack_count).
+ */
+ if (after(received_upto, ack_seq)) {
+ TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
+ tp->retrans_out -= tcp_skb_pcount(skb);
+
+ tcp_skb_mark_lost_uncond_verify(tp, skb);
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSTRETRANSMIT);
+ } else {
+ if (before(ack_seq, new_low_seq))
+ new_low_seq = ack_seq;
+ cnt += tcp_skb_pcount(skb);
+ }
+ }
+
+ if (tp->retrans_out)
+ tp->lost_retrans_low = new_low_seq;
+}
+
+static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb,
+ struct tcp_sack_block_wire *sp, int num_sacks,
+ u32 prior_snd_una)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ u32 start_seq_0 = get_unaligned_be32(&sp[0].start_seq);
+ u32 end_seq_0 = get_unaligned_be32(&sp[0].end_seq);
+ bool dup_sack = false;
+
+ if (before(start_seq_0, TCP_SKB_CB(ack_skb)->ack_seq)) {
+ dup_sack = true;
+ tcp_dsack_seen(tp);
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDSACKRECV);
+ } else if (num_sacks > 1) {
+ u32 end_seq_1 = get_unaligned_be32(&sp[1].end_seq);
+ u32 start_seq_1 = get_unaligned_be32(&sp[1].start_seq);
+
+ if (!after(end_seq_0, end_seq_1) &&
+ !before(start_seq_0, start_seq_1)) {
+ dup_sack = true;
+ tcp_dsack_seen(tp);
+ NET_INC_STATS_BH(sock_net(sk),
+ LINUX_MIB_TCPDSACKOFORECV);
+ }
+ }
+
+ /* D-SACK for already forgotten data... Do dumb counting. */
+ if (dup_sack && tp->undo_marker && tp->undo_retrans > 0 &&
+ !after(end_seq_0, prior_snd_una) &&
+ after(end_seq_0, tp->undo_marker))
+ tp->undo_retrans--;
+
+ return dup_sack;
+}
+
+struct tcp_sacktag_state {
+ int reord;
+ int fack_count;
+ long rtt_us; /* RTT measured by SACKing never-retransmitted data */
+ int flag;
+};
+
+/* Check if skb is fully within the SACK block. In presence of GSO skbs,
+ * the incoming SACK may not exactly match but we can find smaller MSS
+ * aligned portion of it that matches. Therefore we might need to fragment
+ * which may fail and creates some hassle (caller must handle error case
+ * returns).
+ *
+ * FIXME: this could be merged to shift decision code
+ */
+static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb,
+ u32 start_seq, u32 end_seq)
+{
+ int err;
+ bool in_sack;
+ unsigned int pkt_len;
+ unsigned int mss;
+
+ in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) &&
+ !before(end_seq, TCP_SKB_CB(skb)->end_seq);
+
+ if (tcp_skb_pcount(skb) > 1 && !in_sack &&
+ after(TCP_SKB_CB(skb)->end_seq, start_seq)) {
+ mss = tcp_skb_mss(skb);
+ in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq);
+
+ if (!in_sack) {
+ pkt_len = start_seq - TCP_SKB_CB(skb)->seq;
+ if (pkt_len < mss)
+ pkt_len = mss;
+ } else {
+ pkt_len = end_seq - TCP_SKB_CB(skb)->seq;
+ if (pkt_len < mss)
+ return -EINVAL;
+ }
+
+ /* Round if necessary so that SACKs cover only full MSSes
+ * and/or the remaining small portion (if present)
+ */
+ if (pkt_len > mss) {
+ unsigned int new_len = (pkt_len / mss) * mss;
+ if (!in_sack && new_len < pkt_len) {
+ new_len += mss;
+ if (new_len >= skb->len)
+ return 0;
+ }
+ pkt_len = new_len;
+ }
+ err = tcp_fragment(sk, skb, pkt_len, mss, GFP_ATOMIC);
+ if (err < 0)
+ return err;
+ }
+
+ return in_sack;
+}
+
+/* Mark the given newly-SACKed range as such, adjusting counters and hints. */
+static u8 tcp_sacktag_one(struct sock *sk,
+ struct tcp_sacktag_state *state, u8 sacked,
+ u32 start_seq, u32 end_seq,
+ int dup_sack, int pcount,
+ const struct skb_mstamp *xmit_time)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ int fack_count = state->fack_count;
+
+ /* Account D-SACK for retransmitted packet. */
+ if (dup_sack && (sacked & TCPCB_RETRANS)) {
+ if (tp->undo_marker && tp->undo_retrans > 0 &&
+ after(end_seq, tp->undo_marker))
+ tp->undo_retrans--;
+ if (sacked & TCPCB_SACKED_ACKED)
+ state->reord = min(fack_count, state->reord);
+ }
+
+ /* Nothing to do; acked frame is about to be dropped (was ACKed). */
+ if (!after(end_seq, tp->snd_una))
+ return sacked;
+
+ if (!(sacked & TCPCB_SACKED_ACKED)) {
+ if (sacked & TCPCB_SACKED_RETRANS) {
+ /* If the segment is not tagged as lost,
+ * we do not clear RETRANS, believing
+ * that retransmission is still in flight.
+ */
+ if (sacked & TCPCB_LOST) {
+ sacked &= ~(TCPCB_LOST|TCPCB_SACKED_RETRANS);
+ tp->lost_out -= pcount;
+ tp->retrans_out -= pcount;
+ }
+ } else {
+ if (!(sacked & TCPCB_RETRANS)) {
+ /* New sack for not retransmitted frame,
+ * which was in hole. It is reordering.
+ */
+ if (before(start_seq,
+ tcp_highest_sack_seq(tp)))
+ state->reord = min(fack_count,
+ state->reord);
+ if (!after(end_seq, tp->high_seq))
+ state->flag |= FLAG_ORIG_SACK_ACKED;
+ /* Pick the earliest sequence sacked for RTT */
+ if (state->rtt_us < 0) {
+ struct skb_mstamp now;
+
+ skb_mstamp_get(&now);
+ state->rtt_us = skb_mstamp_us_delta(&now,
+ xmit_time);
+ }
+ }
+
+ if (sacked & TCPCB_LOST) {
+ sacked &= ~TCPCB_LOST;
+ tp->lost_out -= pcount;
+ }
+ }
+
+ sacked |= TCPCB_SACKED_ACKED;
+ state->flag |= FLAG_DATA_SACKED;
+ tp->sacked_out += pcount;
+
+ fack_count += pcount;
+
+ /* Lost marker hint past SACKed? Tweak RFC3517 cnt */
+ if (!tcp_is_fack(tp) && tp->lost_skb_hint &&
+ before(start_seq, TCP_SKB_CB(tp->lost_skb_hint)->seq))
+ tp->lost_cnt_hint += pcount;
+
+ if (fack_count > tp->fackets_out)
+ tp->fackets_out = fack_count;
+ }
+
+ /* D-SACK. We can detect redundant retransmission in S|R and plain R
+ * frames and clear it. undo_retrans is decreased above, L|R frames
+ * are accounted above as well.
+ */
+ if (dup_sack && (sacked & TCPCB_SACKED_RETRANS)) {
+ sacked &= ~TCPCB_SACKED_RETRANS;
+ tp->retrans_out -= pcount;
+ }
+
+ return sacked;
+}
+
+/* Shift newly-SACKed bytes from this skb to the immediately previous
+ * already-SACKed sk_buff. Mark the newly-SACKed bytes as such.
+ */
+static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
+ struct tcp_sacktag_state *state,
+ unsigned int pcount, int shifted, int mss,
+ bool dup_sack)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *prev = tcp_write_queue_prev(sk, skb);
+ u32 start_seq = TCP_SKB_CB(skb)->seq; /* start of newly-SACKed */
+ u32 end_seq = start_seq + shifted; /* end of newly-SACKed */
+
+ BUG_ON(!pcount);
+
+ /* Adjust counters and hints for the newly sacked sequence
+ * range but discard the return value since prev is already
+ * marked. We must tag the range first because the seq
+ * advancement below implicitly advances
+ * tcp_highest_sack_seq() when skb is highest_sack.
+ */
+ tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked,
+ start_seq, end_seq, dup_sack, pcount,
+ &skb->skb_mstamp);
+
+ if (skb == tp->lost_skb_hint)
+ tp->lost_cnt_hint += pcount;
+
+ TCP_SKB_CB(prev)->end_seq += shifted;
+ TCP_SKB_CB(skb)->seq += shifted;
+
+ tcp_skb_pcount_add(prev, pcount);
+ BUG_ON(tcp_skb_pcount(skb) < pcount);
+ tcp_skb_pcount_add(skb, -pcount);
+
+ /* When we're adding to gso_segs == 1, gso_size will be zero,
+ * in theory this shouldn't be necessary but as long as DSACK
+ * code can come after this skb later on it's better to keep
+ * setting gso_size to something.
+ */
+ if (!skb_shinfo(prev)->gso_size) {
+ skb_shinfo(prev)->gso_size = mss;
+ skb_shinfo(prev)->gso_type = sk->sk_gso_type;
+ }
+
+ /* CHECKME: To clear or not to clear? Mimics normal skb currently */
+ if (tcp_skb_pcount(skb) <= 1) {
+ skb_shinfo(skb)->gso_size = 0;
+ skb_shinfo(skb)->gso_type = 0;
+ }
+
+ /* Difference in this won't matter, both ACKed by the same cumul. ACK */
+ TCP_SKB_CB(prev)->sacked |= (TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS);
+
+ if (skb->len > 0) {
+ BUG_ON(!tcp_skb_pcount(skb));
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SACKSHIFTED);
+ return false;
+ }
+
+ /* Whole SKB was eaten :-) */
+
+ if (skb == tp->retransmit_skb_hint)
+ tp->retransmit_skb_hint = prev;
+ if (skb == tp->lost_skb_hint) {
+ tp->lost_skb_hint = prev;
+ tp->lost_cnt_hint -= tcp_skb_pcount(prev);
+ }
+
+ TCP_SKB_CB(prev)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
+ if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
+ TCP_SKB_CB(prev)->end_seq++;
+
+ if (skb == tcp_highest_sack(sk))
+ tcp_advance_highest_sack(sk, skb);
+
+ tcp_unlink_write_queue(skb, sk);
+ sk_wmem_free_skb(sk, skb);
+
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SACKMERGED);
+
+ return true;
+}
+
+/* I wish gso_size would have a bit more sane initialization than
+ * something-or-zero which complicates things
+ */
+static int tcp_skb_seglen(const struct sk_buff *skb)
+{
+ return tcp_skb_pcount(skb) == 1 ? skb->len : tcp_skb_mss(skb);
+}
+
+/* Shifting pages past head area doesn't work */
+static int skb_can_shift(const struct sk_buff *skb)
+{
+ return !skb_headlen(skb) && skb_is_nonlinear(skb);
+}
+
+/* Try collapsing SACK blocks spanning across multiple skbs to a single
+ * skb.
+ */
+static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,
+ struct tcp_sacktag_state *state,
+ u32 start_seq, u32 end_seq,
+ bool dup_sack)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *prev;
+ int mss;
+ int pcount = 0;
+ int len;
+ int in_sack;
+
+ if (!sk_can_gso(sk))
+ goto fallback;
+
+ /* Normally R but no L won't result in plain S */
+ if (!dup_sack &&
+ (TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_RETRANS)) == TCPCB_SACKED_RETRANS)
+ goto fallback;
+ if (!skb_can_shift(skb))
+ goto fallback;
+ /* This frame is about to be dropped (was ACKed). */
+ if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
+ goto fallback;
+
+ /* Can only happen with delayed DSACK + discard craziness */
+ if (unlikely(skb == tcp_write_queue_head(sk)))
+ goto fallback;
+ prev = tcp_write_queue_prev(sk, skb);
+
+ if ((TCP_SKB_CB(prev)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED)
+ goto fallback;
+
+ in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) &&
+ !before(end_seq, TCP_SKB_CB(skb)->end_seq);
+
+ if (in_sack) {
+ len = skb->len;
+ pcount = tcp_skb_pcount(skb);
+ mss = tcp_skb_seglen(skb);
+
+ /* TODO: Fix DSACKs to not fragment already SACKed and we can
+ * drop this restriction as unnecessary
+ */
+ if (mss != tcp_skb_seglen(prev))
+ goto fallback;
+ } else {
+ if (!after(TCP_SKB_CB(skb)->end_seq, start_seq))
+ goto noop;
+ /* CHECKME: This is non-MSS split case only?, this will
+ * cause skipped skbs due to advancing loop btw, original
+ * has that feature too
+ */
+ if (tcp_skb_pcount(skb) <= 1)
+ goto noop;
+
+ in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq);
+ if (!in_sack) {
+ /* TODO: head merge to next could be attempted here
+ * if (!after(TCP_SKB_CB(skb)->end_seq, end_seq)),
+ * though it might not be worth of the additional hassle
+ *
+ * ...we can probably just fallback to what was done
+ * previously. We could try merging non-SACKed ones
+ * as well but it probably isn't going to buy off
+ * because later SACKs might again split them, and
+ * it would make skb timestamp tracking considerably
+ * harder problem.
+ */
+ goto fallback;
+ }
+
+ len = end_seq - TCP_SKB_CB(skb)->seq;
+ BUG_ON(len < 0);
+ BUG_ON(len > skb->len);
+
+ /* MSS boundaries should be honoured or else pcount will
+ * severely break even though it makes things bit trickier.
+ * Optimize common case to avoid most of the divides
+ */
+ mss = tcp_skb_mss(skb);
+
+ /* TODO: Fix DSACKs to not fragment already SACKed and we can
+ * drop this restriction as unnecessary
+ */
+ if (mss != tcp_skb_seglen(prev))
+ goto fallback;
+
+ if (len == mss) {
+ pcount = 1;
+ } else if (len < mss) {
+ goto noop;
+ } else {
+ pcount = len / mss;
+ len = pcount * mss;
+ }
+ }
+
+ /* tcp_sacktag_one() won't SACK-tag ranges below snd_una */
+ if (!after(TCP_SKB_CB(skb)->seq + len, tp->snd_una))
+ goto fallback;
+
+ if (!skb_shift(prev, skb, len))
+ goto fallback;
+ if (!tcp_shifted_skb(sk, skb, state, pcount, len, mss, dup_sack))
+ goto out;
+
+ /* Hole filled allows collapsing with the next as well, this is very
+ * useful when hole on every nth skb pattern happens
+ */
+ if (prev == tcp_write_queue_tail(sk))
+ goto out;
+ skb = tcp_write_queue_next(sk, prev);
+
+ if (!skb_can_shift(skb) ||
+ (skb == tcp_send_head(sk)) ||
+ ((TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED) ||
+ (mss != tcp_skb_seglen(skb)))
+ goto out;
+
+ len = skb->len;
+ if (skb_shift(prev, skb, len)) {
+ pcount += tcp_skb_pcount(skb);
+ tcp_shifted_skb(sk, skb, state, tcp_skb_pcount(skb), len, mss, 0);
+ }
+
+out:
+ state->fack_count += pcount;
+ return prev;
+
+noop:
+ return skb;
+
+fallback:
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SACKSHIFTFALLBACK);
+ return NULL;
+}
+
+static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
+ struct tcp_sack_block *next_dup,
+ struct tcp_sacktag_state *state,
+ u32 start_seq, u32 end_seq,
+ bool dup_sack_in)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *tmp;
+
+ tcp_for_write_queue_from(skb, sk) {
+ int in_sack = 0;
+ bool dup_sack = dup_sack_in;
+
+ if (skb == tcp_send_head(sk))
+ break;
+
+ /* queue is in-order => we can short-circuit the walk early */
+ if (!before(TCP_SKB_CB(skb)->seq, end_seq))
+ break;
+
+ if (next_dup &&
+ before(TCP_SKB_CB(skb)->seq, next_dup->end_seq)) {
+ in_sack = tcp_match_skb_to_sack(sk, skb,
+ next_dup->start_seq,
+ next_dup->end_seq);
+ if (in_sack > 0)
+ dup_sack = true;
+ }
+
+ /* skb reference here is a bit tricky to get right, since
+ * shifting can eat and free both this skb and the next,
+ * so not even _safe variant of the loop is enough.
+ */
+ if (in_sack <= 0) {
+ tmp = tcp_shift_skb_data(sk, skb, state,
+ start_seq, end_seq, dup_sack);
+ if (tmp) {
+ if (tmp != skb) {
+ skb = tmp;
+ continue;
+ }
+
+ in_sack = 0;
+ } else {
+ in_sack = tcp_match_skb_to_sack(sk, skb,
+ start_seq,
+ end_seq);
+ }
+ }
+
+ if (unlikely(in_sack < 0))
+ break;
+
+ if (in_sack) {
+ TCP_SKB_CB(skb)->sacked =
+ tcp_sacktag_one(sk,
+ state,
+ TCP_SKB_CB(skb)->sacked,
+ TCP_SKB_CB(skb)->seq,
+ TCP_SKB_CB(skb)->end_seq,
+ dup_sack,
+ tcp_skb_pcount(skb),
+ &skb->skb_mstamp);
+
+ if (!before(TCP_SKB_CB(skb)->seq,
+ tcp_highest_sack_seq(tp)))
+ tcp_advance_highest_sack(sk, skb);
+ }
+
+ state->fack_count += tcp_skb_pcount(skb);
+ }
+ return skb;
+}
+
+/* Avoid all extra work that is being done by sacktag while walking in
+ * a normal way
+ */
+static struct sk_buff *tcp_sacktag_skip(struct sk_buff *skb, struct sock *sk,
+ struct tcp_sacktag_state *state,
+ u32 skip_to_seq)
+{
+ tcp_for_write_queue_from(skb, sk) {
+ if (skb == tcp_send_head(sk))
+ break;
+
+ if (after(TCP_SKB_CB(skb)->end_seq, skip_to_seq))
+ break;
+
+ state->fack_count += tcp_skb_pcount(skb);
+ }
+ return skb;
+}
+
+static struct sk_buff *tcp_maybe_skipping_dsack(struct sk_buff *skb,
+ struct sock *sk,
+ struct tcp_sack_block *next_dup,
+ struct tcp_sacktag_state *state,
+ u32 skip_to_seq)
+{
+ if (!next_dup)
+ return skb;
+
+ if (before(next_dup->start_seq, skip_to_seq)) {
+ skb = tcp_sacktag_skip(skb, sk, state, next_dup->start_seq);
+ skb = tcp_sacktag_walk(skb, sk, NULL, state,
+ next_dup->start_seq, next_dup->end_seq,
+ 1);
+ }
+
+ return skb;
+}
+
+static int tcp_sack_cache_ok(const struct tcp_sock *tp, const struct tcp_sack_block *cache)
+{
+ return cache < tp->recv_sack_cache + ARRAY_SIZE(tp->recv_sack_cache);
+}
+
+static int
+tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
+ u32 prior_snd_una, long *sack_rtt_us)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ const unsigned char *ptr = (skb_transport_header(ack_skb) +
+ TCP_SKB_CB(ack_skb)->sacked);
+ struct tcp_sack_block_wire *sp_wire = (struct tcp_sack_block_wire *)(ptr+2);
+ struct tcp_sack_block sp[TCP_NUM_SACKS];
+ struct tcp_sack_block *cache;
+ struct tcp_sacktag_state state;
+ struct sk_buff *skb;
+ int num_sacks = min(TCP_NUM_SACKS, (ptr[1] - TCPOLEN_SACK_BASE) >> 3);
+ int used_sacks;
+ bool found_dup_sack = false;
+ int i, j;
+ int first_sack_index;
+
+ state.flag = 0;
+ state.reord = tp->packets_out;
+ state.rtt_us = -1L;
+
+ if (!tp->sacked_out) {
+ if (WARN_ON(tp->fackets_out))
+ tp->fackets_out = 0;
+ tcp_highest_sack_reset(sk);
+ }
+
+ found_dup_sack = tcp_check_dsack(sk, ack_skb, sp_wire,
+ num_sacks, prior_snd_una);
+ if (found_dup_sack)
+ state.flag |= FLAG_DSACKING_ACK;
+
+ /* Eliminate too old ACKs, but take into
+ * account more or less fresh ones, they can
+ * contain valid SACK info.
+ */
+ if (before(TCP_SKB_CB(ack_skb)->ack_seq, prior_snd_una - tp->max_window))
+ return 0;
+
+ if (!tp->packets_out)
+ goto out;
+
+ used_sacks = 0;
+ first_sack_index = 0;
+ for (i = 0; i < num_sacks; i++) {
+ bool dup_sack = !i && found_dup_sack;
+
+ sp[used_sacks].start_seq = get_unaligned_be32(&sp_wire[i].start_seq);
+ sp[used_sacks].end_seq = get_unaligned_be32(&sp_wire[i].end_seq);
+
+ if (!tcp_is_sackblock_valid(tp, dup_sack,
+ sp[used_sacks].start_seq,
+ sp[used_sacks].end_seq)) {
+ int mib_idx;
+
+ if (dup_sack) {
+ if (!tp->undo_marker)
+ mib_idx = LINUX_MIB_TCPDSACKIGNOREDNOUNDO;
+ else
+ mib_idx = LINUX_MIB_TCPDSACKIGNOREDOLD;
+ } else {
+ /* Don't count olds caused by ACK reordering */
+ if ((TCP_SKB_CB(ack_skb)->ack_seq != tp->snd_una) &&
+ !after(sp[used_sacks].end_seq, tp->snd_una))
+ continue;
+ mib_idx = LINUX_MIB_TCPSACKDISCARD;
+ }
+
+ NET_INC_STATS_BH(sock_net(sk), mib_idx);
+ if (i == 0)
+ first_sack_index = -1;
+ continue;
+ }
+
+ /* Ignore very old stuff early */
+ if (!after(sp[used_sacks].end_seq, prior_snd_una))
+ continue;
+
+ used_sacks++;
+ }
+
+ /* order SACK blocks to allow in order walk of the retrans queue */
+ for (i = used_sacks - 1; i > 0; i--) {
+ for (j = 0; j < i; j++) {
+ if (after(sp[j].start_seq, sp[j + 1].start_seq)) {
+ swap(sp[j], sp[j + 1]);
+
+ /* Track where the first SACK block goes to */
+ if (j == first_sack_index)
+ first_sack_index = j + 1;
+ }
+ }
+ }
+
+ skb = tcp_write_queue_head(sk);
+ state.fack_count = 0;
+ i = 0;
+
+ if (!tp->sacked_out) {
+ /* It's already past, so skip checking against it */
+ cache = tp->recv_sack_cache + ARRAY_SIZE(tp->recv_sack_cache);
+ } else {
+ cache = tp->recv_sack_cache;
+ /* Skip empty blocks in at head of the cache */
+ while (tcp_sack_cache_ok(tp, cache) && !cache->start_seq &&
+ !cache->end_seq)
+ cache++;
+ }
+
+ while (i < used_sacks) {
+ u32 start_seq = sp[i].start_seq;
+ u32 end_seq = sp[i].end_seq;
+ bool dup_sack = (found_dup_sack && (i == first_sack_index));
+ struct tcp_sack_block *next_dup = NULL;
+
+ if (found_dup_sack && ((i + 1) == first_sack_index))
+ next_dup = &sp[i + 1];
+
+ /* Skip too early cached blocks */
+ while (tcp_sack_cache_ok(tp, cache) &&
+ !before(start_seq, cache->end_seq))
+ cache++;
+
+ /* Can skip some work by looking recv_sack_cache? */
+ if (tcp_sack_cache_ok(tp, cache) && !dup_sack &&
+ after(end_seq, cache->start_seq)) {
+
+ /* Head todo? */
+ if (before(start_seq, cache->start_seq)) {
+ skb = tcp_sacktag_skip(skb, sk, &state,
+ start_seq);
+ skb = tcp_sacktag_walk(skb, sk, next_dup,
+ &state,
+ start_seq,
+ cache->start_seq,
+ dup_sack);
+ }
+
+ /* Rest of the block already fully processed? */
+ if (!after(end_seq, cache->end_seq))
+ goto advance_sp;
+
+ skb = tcp_maybe_skipping_dsack(skb, sk, next_dup,
+ &state,
+ cache->end_seq);
+
+ /* ...tail remains todo... */
+ if (tcp_highest_sack_seq(tp) == cache->end_seq) {
+ /* ...but better entrypoint exists! */
+ skb = tcp_highest_sack(sk);
+ if (!skb)
+ break;
+ state.fack_count = tp->fackets_out;
+ cache++;
+ goto walk;
+ }
+
+ skb = tcp_sacktag_skip(skb, sk, &state, cache->end_seq);
+ /* Check overlap against next cached too (past this one already) */
+ cache++;
+ continue;
+ }
+
+ if (!before(start_seq, tcp_highest_sack_seq(tp))) {
+ skb = tcp_highest_sack(sk);
+ if (!skb)
+ break;
+ state.fack_count = tp->fackets_out;
+ }
+ skb = tcp_sacktag_skip(skb, sk, &state, start_seq);
+
+walk:
+ skb = tcp_sacktag_walk(skb, sk, next_dup, &state,
+ start_seq, end_seq, dup_sack);
+
+advance_sp:
+ i++;
+ }
+
+ /* Clear the head of the cache sack blocks so we can skip it next time */
+ for (i = 0; i < ARRAY_SIZE(tp->recv_sack_cache) - used_sacks; i++) {
+ tp->recv_sack_cache[i].start_seq = 0;
+ tp->recv_sack_cache[i].end_seq = 0;
+ }
+ for (j = 0; j < used_sacks; j++)
+ tp->recv_sack_cache[i++] = sp[j];
+
+ if ((state.reord < tp->fackets_out) &&
+ ((inet_csk(sk)->icsk_ca_state != TCP_CA_Loss) || tp->undo_marker))
+ tcp_update_reordering(sk, tp->fackets_out - state.reord, 0);
+
+ tcp_mark_lost_retrans(sk);
+ tcp_verify_left_out(tp);
+out:
+
+#if FASTRETRANS_DEBUG > 0
+ WARN_ON((int)tp->sacked_out < 0);
+ WARN_ON((int)tp->lost_out < 0);
+ WARN_ON((int)tp->retrans_out < 0);
+ WARN_ON((int)tcp_packets_in_flight(tp) < 0);
+#endif
+ *sack_rtt_us = state.rtt_us;
+ return state.flag;
+}
+
+/* Limits sacked_out so that sum with lost_out isn't ever larger than
+ * packets_out. Returns false if sacked_out adjustement wasn't necessary.
+ */
+static bool tcp_limit_reno_sacked(struct tcp_sock *tp)
+{
+ u32 holes;
+
+ holes = max(tp->lost_out, 1U);
+ holes = min(holes, tp->packets_out);
+
+ if ((tp->sacked_out + holes) > tp->packets_out) {
+ tp->sacked_out = tp->packets_out - holes;
+ return true;
+ }
+ return false;
+}
+
+/* If we receive more dupacks than we expected counting segments
+ * in assumption of absent reordering, interpret this as reordering.
+ * The only another reason could be bug in receiver TCP.
+ */
+static void tcp_check_reno_reordering(struct sock *sk, const int addend)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ if (tcp_limit_reno_sacked(tp))
+ tcp_update_reordering(sk, tp->packets_out + addend, 0);
+}
+
+/* Emulate SACKs for SACKless connection: account for a new dupack. */
+
+static void tcp_add_reno_sack(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ tp->sacked_out++;
+ tcp_check_reno_reordering(sk, 0);
+ tcp_verify_left_out(tp);
+}
+
+/* Account for ACK, ACKing some data in Reno Recovery phase. */
+
+static void tcp_remove_reno_sacks(struct sock *sk, int acked)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (acked > 0) {
+ /* One ACK acked hole. The rest eat duplicate ACKs. */
+ if (acked - 1 >= tp->sacked_out)
+ tp->sacked_out = 0;
+ else
+ tp->sacked_out -= acked - 1;
+ }
+ tcp_check_reno_reordering(sk, acked);
+ tcp_verify_left_out(tp);
+}
+
+static inline void tcp_reset_reno_sack(struct tcp_sock *tp)
+{
+ tp->sacked_out = 0;
+}
+
+void tcp_clear_retrans(struct tcp_sock *tp)
+{
+ tp->retrans_out = 0;
+ tp->lost_out = 0;
+ tp->undo_marker = 0;
+ tp->undo_retrans = -1;
+ tp->fackets_out = 0;
+ tp->sacked_out = 0;
+}
+
+static inline void tcp_init_undo(struct tcp_sock *tp)
+{
+ tp->undo_marker = tp->snd_una;
+ /* Retransmission still in flight may cause DSACKs later. */
+ tp->undo_retrans = tp->retrans_out ? : -1;
+}
+
+/* Enter Loss state. If we detect SACK reneging, forget all SACK information
+ * and reset tags completely, otherwise preserve SACKs. If receiver
+ * dropped its ofo queue, we will know this due to reneging detection.
+ */
+void tcp_enter_loss(struct sock *sk)
+{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *skb;
+ bool new_recovery = false;
+ bool is_reneg; /* is receiver reneging on SACKs? */
+
+ /* Reduce ssthresh if it has not yet been made inside this window. */
+ if (icsk->icsk_ca_state <= TCP_CA_Disorder ||
+ !after(tp->high_seq, tp->snd_una) ||
+ (icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) {
+ new_recovery = true;
+ tp->prior_ssthresh = tcp_current_ssthresh(sk);
+ tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
+ tcp_ca_event(sk, CA_EVENT_LOSS);
+ tcp_init_undo(tp);
+ }
+ tp->snd_cwnd = 1;
+ tp->snd_cwnd_cnt = 0;
+ tp->snd_cwnd_stamp = tcp_time_stamp;
+
+ tp->retrans_out = 0;
+ tp->lost_out = 0;
+
+ if (tcp_is_reno(tp))
+ tcp_reset_reno_sack(tp);
+
+ skb = tcp_write_queue_head(sk);
+ is_reneg = skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED);
+ if (is_reneg) {
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSACKRENEGING);
+ tp->sacked_out = 0;
+ tp->fackets_out = 0;
+ }
+ tcp_clear_all_retrans_hints(tp);
+
+ tcp_for_write_queue(skb, sk) {
+ if (skb == tcp_send_head(sk))
+ break;
+
+ TCP_SKB_CB(skb)->sacked &= (~TCPCB_TAGBITS)|TCPCB_SACKED_ACKED;
+ if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED) || is_reneg) {
+ TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED;
+ TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
+ tp->lost_out += tcp_skb_pcount(skb);
+ tp->retransmit_high = TCP_SKB_CB(skb)->end_seq;
+ }
+ }
+ tcp_verify_left_out(tp);
+
+ /* Timeout in disordered state after receiving substantial DUPACKs
+ * suggests that the degree of reordering is over-estimated.
+ */
+ if (icsk->icsk_ca_state <= TCP_CA_Disorder &&
+ tp->sacked_out >= sysctl_tcp_reordering)
+ tp->reordering = min_t(unsigned int, tp->reordering,
+ sysctl_tcp_reordering);
+ tcp_set_ca_state(sk, TCP_CA_Loss);
+ tp->high_seq = tp->snd_nxt;
+ tcp_ecn_queue_cwr(tp);
+
+ /* F-RTO RFC5682 sec 3.1 step 1: retransmit SND.UNA if no previous
+ * loss recovery is underway except recurring timeout(s) on
+ * the same SND.UNA (sec 3.2). Disable F-RTO on path MTU probing
+ */
+ tp->frto = sysctl_tcp_frto &&
+ (new_recovery || icsk->icsk_retransmits) &&
+ !inet_csk(sk)->icsk_mtup.probe_size;
+}
+
+/* If ACK arrived pointing to a remembered SACK, it means that our
+ * remembered SACKs do not reflect real state of receiver i.e.
+ * receiver _host_ is heavily congested (or buggy).
+ *
+ * To avoid big spurious retransmission bursts due to transient SACK
+ * scoreboard oddities that look like reneging, we give the receiver a
+ * little time (max(RTT/2, 10ms)) to send us some more ACKs that will
+ * restore sanity to the SACK scoreboard. If the apparent reneging
+ * persists until this RTO then we'll clear the SACK scoreboard.
+ */
+static bool tcp_check_sack_reneging(struct sock *sk, int flag)
+{
+ if (flag & FLAG_SACK_RENEGING) {
+ struct tcp_sock *tp = tcp_sk(sk);
+ unsigned long delay = max(usecs_to_jiffies(tp->srtt_us >> 4),
+ msecs_to_jiffies(10));
+
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+ delay, TCP_RTO_MAX);
+ return true;
+ }
+ return false;
+}
+
+static inline int tcp_fackets_out(const struct tcp_sock *tp)
+{
+ return tcp_is_reno(tp) ? tp->sacked_out + 1 : tp->fackets_out;
+}
+
+/* Heurestics to calculate number of duplicate ACKs. There's no dupACKs
+ * counter when SACK is enabled (without SACK, sacked_out is used for
+ * that purpose).
+ *
+ * Instead, with FACK TCP uses fackets_out that includes both SACKed
+ * segments up to the highest received SACK block so far and holes in
+ * between them.
+ *
+ * With reordering, holes may still be in flight, so RFC3517 recovery
+ * uses pure sacked_out (total number of SACKed segments) even though
+ * it violates the RFC that uses duplicate ACKs, often these are equal
+ * but when e.g. out-of-window ACKs or packet duplication occurs,
+ * they differ. Since neither occurs due to loss, TCP should really
+ * ignore them.
+ */
+static inline int tcp_dupack_heuristics(const struct tcp_sock *tp)
+{
+ return tcp_is_fack(tp) ? tp->fackets_out : tp->sacked_out + 1;
+}
+
+static bool tcp_pause_early_retransmit(struct sock *sk, int flag)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ unsigned long delay;
+
+ /* Delay early retransmit and entering fast recovery for
+ * max(RTT/4, 2msec) unless ack has ECE mark, no RTT samples
+ * available, or RTO is scheduled to fire first.
+ */
+ if (sysctl_tcp_early_retrans < 2 || sysctl_tcp_early_retrans > 3 ||
+ (flag & FLAG_ECE) || !tp->srtt_us)
+ return false;
+
+ delay = max(usecs_to_jiffies(tp->srtt_us >> 5),
+ msecs_to_jiffies(2));
+
+ if (!time_after(inet_csk(sk)->icsk_timeout, (jiffies + delay)))
+ return false;
+
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_EARLY_RETRANS, delay,
+ TCP_RTO_MAX);
+ return true;
+}
+
+/* Linux NewReno/SACK/FACK/ECN state machine.
+ * --------------------------------------
+ *
+ * "Open" Normal state, no dubious events, fast path.
+ * "Disorder" In all the respects it is "Open",
+ * but requires a bit more attention. It is entered when
+ * we see some SACKs or dupacks. It is split of "Open"
+ * mainly to move some processing from fast path to slow one.
+ * "CWR" CWND was reduced due to some Congestion Notification event.
+ * It can be ECN, ICMP source quench, local device congestion.
+ * "Recovery" CWND was reduced, we are fast-retransmitting.
+ * "Loss" CWND was reduced due to RTO timeout or SACK reneging.
+ *
+ * tcp_fastretrans_alert() is entered:
+ * - each incoming ACK, if state is not "Open"
+ * - when arrived ACK is unusual, namely:
+ * * SACK
+ * * Duplicate ACK.
+ * * ECN ECE.
+ *
+ * Counting packets in flight is pretty simple.
+ *
+ * in_flight = packets_out - left_out + retrans_out
+ *
+ * packets_out is SND.NXT-SND.UNA counted in packets.
+ *
+ * retrans_out is number of retransmitted segments.
+ *
+ * left_out is number of segments left network, but not ACKed yet.
+ *
+ * left_out = sacked_out + lost_out
+ *
+ * sacked_out: Packets, which arrived to receiver out of order
+ * and hence not ACKed. With SACKs this number is simply
+ * amount of SACKed data. Even without SACKs
+ * it is easy to give pretty reliable estimate of this number,
+ * counting duplicate ACKs.
+ *
+ * lost_out: Packets lost by network. TCP has no explicit
+ * "loss notification" feedback from network (for now).
+ * It means that this number can be only _guessed_.
+ * Actually, it is the heuristics to predict lossage that
+ * distinguishes different algorithms.
+ *
+ * F.e. after RTO, when all the queue is considered as lost,
+ * lost_out = packets_out and in_flight = retrans_out.
+ *
+ * Essentially, we have now two algorithms counting
+ * lost packets.
+ *
+ * FACK: It is the simplest heuristics. As soon as we decided
+ * that something is lost, we decide that _all_ not SACKed
+ * packets until the most forward SACK are lost. I.e.
+ * lost_out = fackets_out - sacked_out and left_out = fackets_out.
+ * It is absolutely correct estimate, if network does not reorder
+ * packets. And it loses any connection to reality when reordering
+ * takes place. We use FACK by default until reordering
+ * is suspected on the path to this destination.
+ *
+ * NewReno: when Recovery is entered, we assume that one segment
+ * is lost (classic Reno). While we are in Recovery and
+ * a partial ACK arrives, we assume that one more packet
+ * is lost (NewReno). This heuristics are the same in NewReno
+ * and SACK.
+ *
+ * Imagine, that's all! Forget about all this shamanism about CWND inflation
+ * deflation etc. CWND is real congestion window, never inflated, changes
+ * only according to classic VJ rules.
+ *
+ * Really tricky (and requiring careful tuning) part of algorithm
+ * is hidden in functions tcp_time_to_recover() and tcp_xmit_retransmit_queue().
+ * The first determines the moment _when_ we should reduce CWND and,
+ * hence, slow down forward transmission. In fact, it determines the moment
+ * when we decide that hole is caused by loss, rather than by a reorder.
+ *
+ * tcp_xmit_retransmit_queue() decides, _what_ we should retransmit to fill
+ * holes, caused by lost packets.
+ *
+ * And the most logically complicated part of algorithm is undo
+ * heuristics. We detect false retransmits due to both too early
+ * fast retransmit (reordering) and underestimated RTO, analyzing
+ * timestamps and D-SACKs. When we detect that some segments were
+ * retransmitted by mistake and CWND reduction was wrong, we undo
+ * window reduction and abort recovery phase. This logic is hidden
+ * inside several functions named tcp_try_undo_<something>.
+ */
+
+/* This function decides, when we should leave Disordered state
+ * and enter Recovery phase, reducing congestion window.
+ *
+ * Main question: may we further continue forward transmission
+ * with the same cwnd?
+ */
+static bool tcp_time_to_recover(struct sock *sk, int flag)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ __u32 packets_out;
+
+ /* Trick#1: The loss is proven. */
+ if (tp->lost_out)
+ return true;
+
+ /* Not-A-Trick#2 : Classic rule... */
+ if (tcp_dupack_heuristics(tp) > tp->reordering)
+ return true;
+
+ /* Trick#4: It is still not OK... But will it be useful to delay
+ * recovery more?
+ */
+ packets_out = tp->packets_out;
+ if (packets_out <= tp->reordering &&
+ tp->sacked_out >= max_t(__u32, packets_out/2, sysctl_tcp_reordering) &&
+ !tcp_may_send_now(sk)) {
+ /* We have nothing to send. This connection is limited
+ * either by receiver window or by application.
+ */
+ return true;
+ }
+
+ /* If a thin stream is detected, retransmit after first
+ * received dupack. Employ only if SACK is supported in order
+ * to avoid possible corner-case series of spurious retransmissions
+ * Use only if there are no unsent data.
+ */
+ if ((tp->thin_dupack || sysctl_tcp_thin_dupack) &&
+ tcp_stream_is_thin(tp) && tcp_dupack_heuristics(tp) > 1 &&
+ tcp_is_sack(tp) && !tcp_send_head(sk))
+ return true;
+
+ /* Trick#6: TCP early retransmit, per RFC5827. To avoid spurious
+ * retransmissions due to small network reorderings, we implement
+ * Mitigation A.3 in the RFC and delay the retransmission for a short
+ * interval if appropriate.
+ */
+ if (tp->do_early_retrans && !tp->retrans_out && tp->sacked_out &&
+ (tp->packets_out >= (tp->sacked_out + 1) && tp->packets_out < 4) &&
+ !tcp_may_send_now(sk))
+ return !tcp_pause_early_retransmit(sk, flag);
+
+ return false;
+}
+
+/* Detect loss in event "A" above by marking head of queue up as lost.
+ * For FACK or non-SACK(Reno) senders, the first "packets" number of segments
+ * are considered lost. For RFC3517 SACK, a segment is considered lost if it
+ * has at least tp->reordering SACKed seqments above it; "packets" refers to
+ * the maximum SACKed segments to pass before reaching this limit.
+ */
+static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *skb;
+ int cnt, oldcnt;
+ int err;
+ unsigned int mss;
+ /* Use SACK to deduce losses of new sequences sent during recovery */
+ const u32 loss_high = tcp_is_sack(tp) ? tp->snd_nxt : tp->high_seq;
+
+ WARN_ON(packets > tp->packets_out);
+ if (tp->lost_skb_hint) {
+ skb = tp->lost_skb_hint;
+ cnt = tp->lost_cnt_hint;
+ /* Head already handled? */
+ if (mark_head && skb != tcp_write_queue_head(sk))
+ return;
+ } else {
+ skb = tcp_write_queue_head(sk);
+ cnt = 0;
+ }
+
+ tcp_for_write_queue_from(skb, sk) {
+ if (skb == tcp_send_head(sk))
+ break;
+ /* TODO: do this better */
+ /* this is not the most efficient way to do this... */
+ tp->lost_skb_hint = skb;
+ tp->lost_cnt_hint = cnt;
+
+ if (after(TCP_SKB_CB(skb)->end_seq, loss_high))
+ break;
+
+ oldcnt = cnt;
+ if (tcp_is_fack(tp) || tcp_is_reno(tp) ||
+ (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
+ cnt += tcp_skb_pcount(skb);
+
+ if (cnt > packets) {
+ if ((tcp_is_sack(tp) && !tcp_is_fack(tp)) ||
+ (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) ||
+ (oldcnt >= packets))
+ break;
+
+ mss = skb_shinfo(skb)->gso_size;
+ err = tcp_fragment(sk, skb, (packets - oldcnt) * mss,
+ mss, GFP_ATOMIC);
+ if (err < 0)
+ break;
+ cnt = packets;
+ }
+
+ tcp_skb_mark_lost(tp, skb);
+
+ if (mark_head)
+ break;
+ }
+ tcp_verify_left_out(tp);
+}
+
+/* Account newly detected lost packet(s) */
+
+static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (tcp_is_reno(tp)) {
+ tcp_mark_head_lost(sk, 1, 1);
+ } else if (tcp_is_fack(tp)) {
+ int lost = tp->fackets_out - tp->reordering;
+ if (lost <= 0)
+ lost = 1;
+ tcp_mark_head_lost(sk, lost, 0);
+ } else {
+ int sacked_upto = tp->sacked_out - tp->reordering;
+ if (sacked_upto >= 0)
+ tcp_mark_head_lost(sk, sacked_upto, 0);
+ else if (fast_rexmit)
+ tcp_mark_head_lost(sk, 1, 1);
+ }
+}
+
+/* CWND moderation, preventing bursts due to too big ACKs
+ * in dubious situations.
+ */
+static inline void tcp_moderate_cwnd(struct tcp_sock *tp)
+{
+ tp->snd_cwnd = min(tp->snd_cwnd,
+ tcp_packets_in_flight(tp) + tcp_max_burst(tp));
+ tp->snd_cwnd_stamp = tcp_time_stamp;
+}
+
+/* Nothing was retransmitted or returned timestamp is less
+ * than timestamp of the first retransmission.
+ */
+static inline bool tcp_packet_delayed(const struct tcp_sock *tp)
+{
+ return !tp->retrans_stamp ||
+ (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
+ before(tp->rx_opt.rcv_tsecr, tp->retrans_stamp));
+}
+
+/* Undo procedures. */
+
+/* We can clear retrans_stamp when there are no retransmissions in the
+ * window. It would seem that it is trivially available for us in
+ * tp->retrans_out, however, that kind of assumptions doesn't consider
+ * what will happen if errors occur when sending retransmission for the
+ * second time. ...It could the that such segment has only
+ * TCPCB_EVER_RETRANS set at the present time. It seems that checking
+ * the head skb is enough except for some reneging corner cases that
+ * are not worth the effort.
+ *
+ * Main reason for all this complexity is the fact that connection dying
+ * time now depends on the validity of the retrans_stamp, in particular,
+ * that successive retransmissions of a segment must not advance
+ * retrans_stamp under any conditions.
+ */
+static bool tcp_any_retrans_done(const struct sock *sk)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *skb;
+
+ if (tp->retrans_out)
+ return true;
+
+ skb = tcp_write_queue_head(sk);
+ if (unlikely(skb && TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS))
+ return true;
+
+ return false;
+}
+
+#if FASTRETRANS_DEBUG > 1
+static void DBGUNDO(struct sock *sk, const char *msg)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct inet_sock *inet = inet_sk(sk);
+
+ if (sk->sk_family == AF_INET) {
+ pr_debug("Undo %s %pI4/%u c%u l%u ss%u/%u p%u\n",
+ msg,
+ &inet->inet_daddr, ntohs(inet->inet_dport),
+ tp->snd_cwnd, tcp_left_out(tp),
+ tp->snd_ssthresh, tp->prior_ssthresh,
+ tp->packets_out);
+ }
+#if IS_ENABLED(CONFIG_IPV6)
+ else if (sk->sk_family == AF_INET6) {
+ struct ipv6_pinfo *np = inet6_sk(sk);
+ pr_debug("Undo %s %pI6/%u c%u l%u ss%u/%u p%u\n",
+ msg,
+ &np->daddr, ntohs(inet->inet_dport),
+ tp->snd_cwnd, tcp_left_out(tp),
+ tp->snd_ssthresh, tp->prior_ssthresh,
+ tp->packets_out);
+ }
+#endif
+}
+#else
+#define DBGUNDO(x...) do { } while (0)
+#endif
+
+static void tcp_undo_cwnd_reduction(struct sock *sk, bool unmark_loss)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (unmark_loss) {
+ struct sk_buff *skb;
+
+ tcp_for_write_queue(skb, sk) {
+ if (skb == tcp_send_head(sk))
+ break;
+ TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST;
+ }
+ tp->lost_out = 0;
+ tcp_clear_all_retrans_hints(tp);
+ }
+
+ if (tp->prior_ssthresh) {
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+
+ if (icsk->icsk_ca_ops->undo_cwnd)
+ tp->snd_cwnd = icsk->icsk_ca_ops->undo_cwnd(sk);
+ else
+ tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh << 1);
+
+ if (tp->prior_ssthresh > tp->snd_ssthresh) {
+ tp->snd_ssthresh = tp->prior_ssthresh;
+ tcp_ecn_withdraw_cwr(tp);
+ }
+ } else {
+ tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh);
+ }
+ tp->snd_cwnd_stamp = tcp_time_stamp;
+ tp->undo_marker = 0;
+}
+
+static inline bool tcp_may_undo(const struct tcp_sock *tp)
+{
+ return tp->undo_marker && (!tp->undo_retrans || tcp_packet_delayed(tp));
+}
+
+/* People celebrate: "We love our President!" */
+static bool tcp_try_undo_recovery(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (tcp_may_undo(tp)) {
+ int mib_idx;
+
+ /* Happy end! We did not retransmit anything
+ * or our original transmission succeeded.
+ */
+ DBGUNDO(sk, inet_csk(sk)->icsk_ca_state == TCP_CA_Loss ? "loss" : "retrans");
+ tcp_undo_cwnd_reduction(sk, false);
+ if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss)
+ mib_idx = LINUX_MIB_TCPLOSSUNDO;
+ else
+ mib_idx = LINUX_MIB_TCPFULLUNDO;
+
+ NET_INC_STATS_BH(sock_net(sk), mib_idx);
+ }
+ if (tp->snd_una == tp->high_seq && tcp_is_reno(tp)) {
+ /* Hold old state until something *above* high_seq
+ * is ACKed. For Reno it is MUST to prevent false
+ * fast retransmits (RFC2582). SACK TCP is safe. */
+ tcp_moderate_cwnd(tp);
+ if (!tcp_any_retrans_done(sk))
+ tp->retrans_stamp = 0;
+ return true;
+ }
+ tcp_set_ca_state(sk, TCP_CA_Open);
+ return false;
+}
+
+/* Try to undo cwnd reduction, because D-SACKs acked all retransmitted data */
+static bool tcp_try_undo_dsack(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (tp->undo_marker && !tp->undo_retrans) {
+ DBGUNDO(sk, "D-SACK");
+ tcp_undo_cwnd_reduction(sk, false);
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDSACKUNDO);
+ return true;
+ }
+ return false;
+}
+
+/* Undo during loss recovery after partial ACK or using F-RTO. */
+static bool tcp_try_undo_loss(struct sock *sk, bool frto_undo)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (frto_undo || tcp_may_undo(tp)) {
+ tcp_undo_cwnd_reduction(sk, true);
+
+ DBGUNDO(sk, "partial loss");
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSSUNDO);
+ if (frto_undo)
+ NET_INC_STATS_BH(sock_net(sk),
+ LINUX_MIB_TCPSPURIOUSRTOS);
+ inet_csk(sk)->icsk_retransmits = 0;
+ if (frto_undo || tcp_is_sack(tp))
+ tcp_set_ca_state(sk, TCP_CA_Open);
+ return true;
+ }
+ return false;
+}
+
+/* The cwnd reduction in CWR and Recovery use the PRR algorithm
+ * https://datatracker.ietf.org/doc/draft-ietf-tcpm-proportional-rate-reduction/
+ * It computes the number of packets to send (sndcnt) based on packets newly
+ * delivered:
+ * 1) If the packets in flight is larger than ssthresh, PRR spreads the
+ * cwnd reductions across a full RTT.
+ * 2) If packets in flight is lower than ssthresh (such as due to excess
+ * losses and/or application stalls), do not perform any further cwnd
+ * reductions, but instead slow start up to ssthresh.
+ */
+static void tcp_init_cwnd_reduction(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ tp->high_seq = tp->snd_nxt;
+ tp->tlp_high_seq = 0;
+ tp->snd_cwnd_cnt = 0;
+ tp->prior_cwnd = tp->snd_cwnd;
+ tp->prr_delivered = 0;
+ tp->prr_out = 0;
+ tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk);
+ tcp_ecn_queue_cwr(tp);
+}
+
+static void tcp_cwnd_reduction(struct sock *sk, const int prior_unsacked,
+ int fast_rexmit)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ int sndcnt = 0;
+ int delta = tp->snd_ssthresh - tcp_packets_in_flight(tp);
+ int newly_acked_sacked = prior_unsacked -
+ (tp->packets_out - tp->sacked_out);
+
+ tp->prr_delivered += newly_acked_sacked;
+ if (tcp_packets_in_flight(tp) > tp->snd_ssthresh) {
+ u64 dividend = (u64)tp->snd_ssthresh * tp->prr_delivered +
+ tp->prior_cwnd - 1;
+ sndcnt = div_u64(dividend, tp->prior_cwnd) - tp->prr_out;
+ } else {
+ sndcnt = min_t(int, delta,
+ max_t(int, tp->prr_delivered - tp->prr_out,
+ newly_acked_sacked) + 1);
+ }
+
+ sndcnt = max(sndcnt, (fast_rexmit ? 1 : 0));
+ tp->snd_cwnd = tcp_packets_in_flight(tp) + sndcnt;
+}
+
+static inline void tcp_end_cwnd_reduction(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ /* Reset cwnd to ssthresh in CWR or Recovery (unless it's undone) */
+ if (inet_csk(sk)->icsk_ca_state == TCP_CA_CWR ||
+ (tp->undo_marker && tp->snd_ssthresh < TCP_INFINITE_SSTHRESH)) {
+ tp->snd_cwnd = tp->snd_ssthresh;
+ tp->snd_cwnd_stamp = tcp_time_stamp;
+ }
+ tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR);
+}
+
+/* Enter CWR state. Disable cwnd undo since congestion is proven with ECN */
+void tcp_enter_cwr(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ tp->prior_ssthresh = 0;
+ if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) {
+ tp->undo_marker = 0;
+ tcp_init_cwnd_reduction(sk);
+ tcp_set_ca_state(sk, TCP_CA_CWR);
+ }
+}
+
+static void tcp_try_keep_open(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ int state = TCP_CA_Open;
+
+ if (tcp_left_out(tp) || tcp_any_retrans_done(sk))
+ state = TCP_CA_Disorder;
+
+ if (inet_csk(sk)->icsk_ca_state != state) {
+ tcp_set_ca_state(sk, state);
+ tp->high_seq = tp->snd_nxt;
+ }
+}
+
+static void tcp_try_to_open(struct sock *sk, int flag, const int prior_unsacked)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ tcp_verify_left_out(tp);
+
+ if (!tcp_any_retrans_done(sk))
+ tp->retrans_stamp = 0;
+
+ if (flag & FLAG_ECE)
+ tcp_enter_cwr(sk);
+
+ if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) {
+ tcp_try_keep_open(sk);
+ } else {
+ tcp_cwnd_reduction(sk, prior_unsacked, 0);
+ }
+}
+
+static void tcp_mtup_probe_failed(struct sock *sk)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
+ icsk->icsk_mtup.search_high = icsk->icsk_mtup.probe_size - 1;
+ icsk->icsk_mtup.probe_size = 0;
+}
+
+static void tcp_mtup_probe_success(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
+ /* FIXME: breaks with very large cwnd */
+ tp->prior_ssthresh = tcp_current_ssthresh(sk);
+ tp->snd_cwnd = tp->snd_cwnd *
+ tcp_mss_to_mtu(sk, tp->mss_cache) /
+ icsk->icsk_mtup.probe_size;
+ tp->snd_cwnd_cnt = 0;
+ tp->snd_cwnd_stamp = tcp_time_stamp;
+ tp->snd_ssthresh = tcp_current_ssthresh(sk);
+
+ icsk->icsk_mtup.search_low = icsk->icsk_mtup.probe_size;
+ icsk->icsk_mtup.probe_size = 0;
+ tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
+}
+
+/* Do a simple retransmit without using the backoff mechanisms in
+ * tcp_timer. This is used for path mtu discovery.
+ * The socket is already locked here.
+ */
+void tcp_simple_retransmit(struct sock *sk)
+{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *skb;
+ unsigned int mss = tcp_current_mss(sk);
+ u32 prior_lost = tp->lost_out;
+
+ tcp_for_write_queue(skb, sk) {
+ if (skb == tcp_send_head(sk))
+ break;
+ if (tcp_skb_seglen(skb) > mss &&
+ !(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) {
+ if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
+ TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
+ tp->retrans_out -= tcp_skb_pcount(skb);
+ }
+ tcp_skb_mark_lost_uncond_verify(tp, skb);
+ }
+ }
+
+ tcp_clear_retrans_hints_partial(tp);
+
+ if (prior_lost == tp->lost_out)
+ return;
+
+ if (tcp_is_reno(tp))
+ tcp_limit_reno_sacked(tp);
+
+ tcp_verify_left_out(tp);
+
+ /* Don't muck with the congestion window here.
+ * Reason is that we do not increase amount of _data_
+ * in network, but units changed and effective
+ * cwnd/ssthresh really reduced now.
+ */
+ if (icsk->icsk_ca_state != TCP_CA_Loss) {
+ tp->high_seq = tp->snd_nxt;
+ tp->snd_ssthresh = tcp_current_ssthresh(sk);
+ tp->prior_ssthresh = 0;
+ tp->undo_marker = 0;
+ tcp_set_ca_state(sk, TCP_CA_Loss);
+ }
+ tcp_xmit_retransmit_queue(sk);
+}
+EXPORT_SYMBOL(tcp_simple_retransmit);
+
+static void tcp_enter_recovery(struct sock *sk, bool ece_ack)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ int mib_idx;
+
+ if (tcp_is_reno(tp))
+ mib_idx = LINUX_MIB_TCPRENORECOVERY;
+ else
+ mib_idx = LINUX_MIB_TCPSACKRECOVERY;
+
+ NET_INC_STATS_BH(sock_net(sk), mib_idx);
+
+ tp->prior_ssthresh = 0;
+ tcp_init_undo(tp);
+
+ if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) {
+ if (!ece_ack)
+ tp->prior_ssthresh = tcp_current_ssthresh(sk);
+ tcp_init_cwnd_reduction(sk);
+ }
+ tcp_set_ca_state(sk, TCP_CA_Recovery);
+}
+
+/* Process an ACK in CA_Loss state. Move to CA_Open if lost data are
+ * recovered or spurious. Otherwise retransmits more on partial ACKs.
+ */
+static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ bool recovered = !before(tp->snd_una, tp->high_seq);
+
+ if ((flag & FLAG_SND_UNA_ADVANCED) &&
+ tcp_try_undo_loss(sk, false))
+ return;
+
+ if (tp->frto) { /* F-RTO RFC5682 sec 3.1 (sack enhanced version). */
+ /* Step 3.b. A timeout is spurious if not all data are
+ * lost, i.e., never-retransmitted data are (s)acked.
+ */
+ if ((flag & FLAG_ORIG_SACK_ACKED) &&
+ tcp_try_undo_loss(sk, true))
+ return;
+
+ if (after(tp->snd_nxt, tp->high_seq)) {
+ if (flag & FLAG_DATA_SACKED || is_dupack)
+ tp->frto = 0; /* Step 3.a. loss was real */
+ } else if (flag & FLAG_SND_UNA_ADVANCED && !recovered) {
+ tp->high_seq = tp->snd_nxt;
+ __tcp_push_pending_frames(sk, tcp_current_mss(sk),
+ TCP_NAGLE_OFF);
+ if (after(tp->snd_nxt, tp->high_seq))
+ return; /* Step 2.b */
+ tp->frto = 0;
+ }
+ }
+
+ if (recovered) {
+ /* F-RTO RFC5682 sec 3.1 step 2.a and 1st part of step 3.a */
+ tcp_try_undo_recovery(sk);
+ return;
+ }
+ if (tcp_is_reno(tp)) {
+ /* A Reno DUPACK means new data in F-RTO step 2.b above are
+ * delivered. Lower inflight to clock out (re)tranmissions.
+ */
+ if (after(tp->snd_nxt, tp->high_seq) && is_dupack)
+ tcp_add_reno_sack(sk);
+ else if (flag & FLAG_SND_UNA_ADVANCED)
+ tcp_reset_reno_sack(tp);
+ }
+ tcp_xmit_retransmit_queue(sk);
+}
+
+/* Undo during fast recovery after partial ACK. */
+static bool tcp_try_undo_partial(struct sock *sk, const int acked,
+ const int prior_unsacked)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (tp->undo_marker && tcp_packet_delayed(tp)) {
+ /* Plain luck! Hole if filled with delayed
+ * packet, rather than with a retransmit.
+ */
+ tcp_update_reordering(sk, tcp_fackets_out(tp) + acked, 1);
+
+ /* We are getting evidence that the reordering degree is higher
+ * than we realized. If there are no retransmits out then we
+ * can undo. Otherwise we clock out new packets but do not
+ * mark more packets lost or retransmit more.
+ */
+ if (tp->retrans_out) {
+ tcp_cwnd_reduction(sk, prior_unsacked, 0);
+ return true;
+ }
+
+ if (!tcp_any_retrans_done(sk))
+ tp->retrans_stamp = 0;
+
+ DBGUNDO(sk, "partial recovery");
+ tcp_undo_cwnd_reduction(sk, true);
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPPARTIALUNDO);
+ tcp_try_keep_open(sk);
+ return true;
+ }
+ return false;
+}
+
+/* Process an event, which can update packets-in-flight not trivially.
+ * Main goal of this function is to calculate new estimate for left_out,
+ * taking into account both packets sitting in receiver's buffer and
+ * packets lost by network.
+ *
+ * Besides that it does CWND reduction, when packet loss is detected
+ * and changes state of machine.
+ *
+ * It does _not_ decide what to send, it is made in function
+ * tcp_xmit_retransmit_queue().
+ */
+static void tcp_fastretrans_alert(struct sock *sk, const int acked,
+ const int prior_unsacked,
+ bool is_dupack, int flag)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+ bool do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) &&
+ (tcp_fackets_out(tp) > tp->reordering));
+ int fast_rexmit = 0;
+
+ if (WARN_ON(!tp->packets_out && tp->sacked_out))
+ tp->sacked_out = 0;
+ if (WARN_ON(!tp->sacked_out && tp->fackets_out))
+ tp->fackets_out = 0;
+
+ /* Now state machine starts.
+ * A. ECE, hence prohibit cwnd undoing, the reduction is required. */
+ if (flag & FLAG_ECE)
+ tp->prior_ssthresh = 0;
+
+ /* B. In all the states check for reneging SACKs. */
+ if (tcp_check_sack_reneging(sk, flag))
+ return;
+
+ /* C. Check consistency of the current state. */
+ tcp_verify_left_out(tp);
+
+ /* D. Check state exit conditions. State can be terminated
+ * when high_seq is ACKed. */
+ if (icsk->icsk_ca_state == TCP_CA_Open) {
+ WARN_ON(tp->retrans_out != 0);
+ tp->retrans_stamp = 0;
+ } else if (!before(tp->snd_una, tp->high_seq)) {
+ switch (icsk->icsk_ca_state) {
+ case TCP_CA_CWR:
+ /* CWR is to be held something *above* high_seq
+ * is ACKed for CWR bit to reach receiver. */
+ if (tp->snd_una != tp->high_seq) {
+ tcp_end_cwnd_reduction(sk);
+ tcp_set_ca_state(sk, TCP_CA_Open);
+ }
+ break;
+
+ case TCP_CA_Recovery:
+ if (tcp_is_reno(tp))
+ tcp_reset_reno_sack(tp);
+ if (tcp_try_undo_recovery(sk))
+ return;
+ tcp_end_cwnd_reduction(sk);
+ break;
+ }
+ }
+
+ /* E. Process state. */
+ switch (icsk->icsk_ca_state) {
+ case TCP_CA_Recovery:
+ if (!(flag & FLAG_SND_UNA_ADVANCED)) {
+ if (tcp_is_reno(tp) && is_dupack)
+ tcp_add_reno_sack(sk);
+ } else {
+ if (tcp_try_undo_partial(sk, acked, prior_unsacked))
+ return;
+ /* Partial ACK arrived. Force fast retransmit. */
+ do_lost = tcp_is_reno(tp) ||
+ tcp_fackets_out(tp) > tp->reordering;
+ }
+ if (tcp_try_undo_dsack(sk)) {
+ tcp_try_keep_open(sk);
+ return;
+ }
+ break;
+ case TCP_CA_Loss:
+ tcp_process_loss(sk, flag, is_dupack);
+ if (icsk->icsk_ca_state != TCP_CA_Open)
+ return;
+ /* Fall through to processing in Open state. */
+ default:
+ if (tcp_is_reno(tp)) {
+ if (flag & FLAG_SND_UNA_ADVANCED)
+ tcp_reset_reno_sack(tp);
+ if (is_dupack)
+ tcp_add_reno_sack(sk);
+ }
+
+ if (icsk->icsk_ca_state <= TCP_CA_Disorder)
+ tcp_try_undo_dsack(sk);
+
+ if (!tcp_time_to_recover(sk, flag)) {
+ tcp_try_to_open(sk, flag, prior_unsacked);
+ return;
+ }
+
+ /* MTU probe failure: don't reduce cwnd */
+ if (icsk->icsk_ca_state < TCP_CA_CWR &&
+ icsk->icsk_mtup.probe_size &&
+ tp->snd_una == tp->mtu_probe.probe_seq_start) {
+ tcp_mtup_probe_failed(sk);
+ /* Restores the reduction we did in tcp_mtup_probe() */
+ tp->snd_cwnd++;
+ tcp_simple_retransmit(sk);
+ return;
+ }
+
+ /* Otherwise enter Recovery state */
+ tcp_enter_recovery(sk, (flag & FLAG_ECE));
+ fast_rexmit = 1;
+ }
+
+ if (do_lost)
+ tcp_update_scoreboard(sk, fast_rexmit);
+ tcp_cwnd_reduction(sk, prior_unsacked, fast_rexmit);
+ tcp_xmit_retransmit_queue(sk);
+}
+
+static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag,
+ long seq_rtt_us, long sack_rtt_us)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+
+ /* Prefer RTT measured from ACK's timing to TS-ECR. This is because
+ * broken middle-boxes or peers may corrupt TS-ECR fields. But
+ * Karn's algorithm forbids taking RTT if some retransmitted data
+ * is acked (RFC6298).
+ */
+ if (flag & FLAG_RETRANS_DATA_ACKED)
+ seq_rtt_us = -1L;
+
+ if (seq_rtt_us < 0)
+ seq_rtt_us = sack_rtt_us;
+
+ /* RTTM Rule: A TSecr value received in a segment is used to
+ * update the averaged RTT measurement only if the segment
+ * acknowledges some new data, i.e., only if it advances the
+ * left edge of the send window.
+ * See draft-ietf-tcplw-high-performance-00, section 3.3.
+ */
+ if (seq_rtt_us < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
+ flag & FLAG_ACKED)
+ seq_rtt_us = jiffies_to_usecs(tcp_time_stamp - tp->rx_opt.rcv_tsecr);
+
+ if (seq_rtt_us < 0)
+ return false;
+
+ tcp_rtt_estimator(sk, seq_rtt_us);
+ tcp_set_rto(sk);
+
+ /* RFC6298: only reset backoff on valid RTT measurement. */
+ inet_csk(sk)->icsk_backoff = 0;
+ return true;
+}
+
+/* Compute time elapsed between (last) SYNACK and the ACK completing 3WHS. */
+static void tcp_synack_rtt_meas(struct sock *sk, const u32 synack_stamp)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ long seq_rtt_us = -1L;
+
+ if (synack_stamp && !tp->total_retrans)
+ seq_rtt_us = jiffies_to_usecs(tcp_time_stamp - synack_stamp);
+
+ /* If the ACK acks both the SYNACK and the (Fast Open'd) data packets
+ * sent in SYN_RECV, SYNACK RTT is the smooth RTT computed in tcp_ack()
+ */
+ if (!tp->srtt_us)
+ tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, seq_rtt_us, -1L);
+}
+
+static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
+{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+
+ icsk->icsk_ca_ops->cong_avoid(sk, ack, acked);
+ tcp_sk(sk)->snd_cwnd_stamp = tcp_time_stamp;
+}
+
+/* Restart timer after forward progress on connection.
+ * RFC2988 recommends to restart timer to now+rto.
+ */
+void tcp_rearm_rto(struct sock *sk)
+{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ /* If the retrans timer is currently being used by Fast Open
+ * for SYN-ACK retrans purpose, stay put.
+ */
+ if (tp->fastopen_rsk)
+ return;
+
+ if (!tp->packets_out) {
+ inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS);
+ } else {
+ u32 rto = inet_csk(sk)->icsk_rto;
+ /* Offset the time elapsed after installing regular RTO */
+ if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
+ icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
+ struct sk_buff *skb = tcp_write_queue_head(sk);
+ const u32 rto_time_stamp =
+ tcp_skb_timestamp(skb) + rto;
+ s32 delta = (s32)(rto_time_stamp - tcp_time_stamp);
+ /* delta may not be positive if the socket is locked
+ * when the retrans timer fires and is rescheduled.
+ */
+ if (delta > 0)
+ rto = delta;
+ }
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto,
+ TCP_RTO_MAX);
+ }
+}
+
+/* This function is called when the delayed ER timer fires. TCP enters
+ * fast recovery and performs fast-retransmit.
+ */
+void tcp_resume_early_retransmit(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ tcp_rearm_rto(sk);
+
+ /* Stop if ER is disabled after the delayed ER timer is scheduled */
+ if (!tp->do_early_retrans)
+ return;
+
+ tcp_enter_recovery(sk, false);
+ tcp_update_scoreboard(sk, 1);
+ tcp_xmit_retransmit_queue(sk);
+}
+
+/* If we get here, the whole TSO packet has not been acked. */
+static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ u32 packets_acked;
+
+ BUG_ON(!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una));
+
+ packets_acked = tcp_skb_pcount(skb);
+ if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq))
+ return 0;
+ packets_acked -= tcp_skb_pcount(skb);
+
+ if (packets_acked) {
+ BUG_ON(tcp_skb_pcount(skb) == 0);
+ BUG_ON(!before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq));
+ }
+
+ return packets_acked;
+}
+
+static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb,
+ u32 prior_snd_una)
+{
+ const struct skb_shared_info *shinfo;
+
+ /* Avoid cache line misses to get skb_shinfo() and shinfo->tx_flags */
+ if (likely(!(sk->sk_tsflags & SOF_TIMESTAMPING_TX_ACK)))
+ return;
+
+ shinfo = skb_shinfo(skb);
+ if ((shinfo->tx_flags & SKBTX_ACK_TSTAMP) &&
+ between(shinfo->tskey, prior_snd_una, tcp_sk(sk)->snd_una - 1))
+ __skb_tstamp_tx(skb, NULL, sk, SCM_TSTAMP_ACK);
+}
+
+/* Remove acknowledged frames from the retransmission queue. If our packet
+ * is before the ack sequence we can discard it as it's confirmed to have
+ * arrived at the other end.
+ */
+static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
+ u32 prior_snd_una, long sack_rtt_us)
+{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+ struct skb_mstamp first_ackt, last_ackt, now;
+ struct tcp_sock *tp = tcp_sk(sk);
+ u32 prior_sacked = tp->sacked_out;
+ u32 reord = tp->packets_out;
+ bool fully_acked = true;
+ long ca_seq_rtt_us = -1L;
+ long seq_rtt_us = -1L;
+ struct sk_buff *skb;
+ u32 pkts_acked = 0;
+ bool rtt_update;
+ int flag = 0;
+
+ first_ackt.v64 = 0;
+
+ while ((skb = tcp_write_queue_head(sk)) && skb != tcp_send_head(sk)) {
+ struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
+ u8 sacked = scb->sacked;
+ u32 acked_pcount;
+
+ tcp_ack_tstamp(sk, skb, prior_snd_una);
+
+ /* Determine how many packets and what bytes were acked, tso and else */
+ if (after(scb->end_seq, tp->snd_una)) {
+ if (tcp_skb_pcount(skb) == 1 ||
+ !after(tp->snd_una, scb->seq))
+ break;
+
+ acked_pcount = tcp_tso_acked(sk, skb);
+ if (!acked_pcount)
+ break;
+
+ fully_acked = false;
+ } else {
+ /* Speedup tcp_unlink_write_queue() and next loop */
+ prefetchw(skb->next);
+ acked_pcount = tcp_skb_pcount(skb);
+ }
+
+ if (unlikely(sacked & TCPCB_RETRANS)) {
+ if (sacked & TCPCB_SACKED_RETRANS)
+ tp->retrans_out -= acked_pcount;
+ flag |= FLAG_RETRANS_DATA_ACKED;
+ } else if (!(sacked & TCPCB_SACKED_ACKED)) {
+ last_ackt = skb->skb_mstamp;
+ WARN_ON_ONCE(last_ackt.v64 == 0);
+ if (!first_ackt.v64)
+ first_ackt = last_ackt;
+
+ reord = min(pkts_acked, reord);
+ if (!after(scb->end_seq, tp->high_seq))
+ flag |= FLAG_ORIG_SACK_ACKED;
+ }
+
+ if (sacked & TCPCB_SACKED_ACKED)
+ tp->sacked_out -= acked_pcount;
+ if (sacked & TCPCB_LOST)
+ tp->lost_out -= acked_pcount;
+
+ tp->packets_out -= acked_pcount;
+ pkts_acked += acked_pcount;
+
+ /* Initial outgoing SYN's get put onto the write_queue
+ * just like anything else we transmit. It is not
+ * true data, and if we misinform our callers that
+ * this ACK acks real data, we will erroneously exit
+ * connection startup slow start one packet too
+ * quickly. This is severely frowned upon behavior.
+ */
+ if (likely(!(scb->tcp_flags & TCPHDR_SYN))) {
+ flag |= FLAG_DATA_ACKED;
+ } else {
+ flag |= FLAG_SYN_ACKED;
+ tp->retrans_stamp = 0;
+ }
+
+ if (!fully_acked)
+ break;
+
+ tcp_unlink_write_queue(skb, sk);
+ sk_wmem_free_skb(sk, skb);
+ if (unlikely(skb == tp->retransmit_skb_hint))
+ tp->retransmit_skb_hint = NULL;
+ if (unlikely(skb == tp->lost_skb_hint))
+ tp->lost_skb_hint = NULL;
+ }
+
+ if (likely(between(tp->snd_up, prior_snd_una, tp->snd_una)))
+ tp->snd_up = tp->snd_una;
+
+ if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
+ flag |= FLAG_SACK_RENEGING;
+
+ skb_mstamp_get(&now);
+ if (likely(first_ackt.v64)) {
+ seq_rtt_us = skb_mstamp_us_delta(&now, &first_ackt);
+ ca_seq_rtt_us = skb_mstamp_us_delta(&now, &last_ackt);
+ }
+
+ rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt_us, sack_rtt_us);
+
+ if (flag & FLAG_ACKED) {
+ const struct tcp_congestion_ops *ca_ops
+ = inet_csk(sk)->icsk_ca_ops;
+
+ tcp_rearm_rto(sk);
+ if (unlikely(icsk->icsk_mtup.probe_size &&
+ !after(tp->mtu_probe.probe_seq_end, tp->snd_una))) {
+ tcp_mtup_probe_success(sk);
+ }
+
+ if (tcp_is_reno(tp)) {
+ tcp_remove_reno_sacks(sk, pkts_acked);
+ } else {
+ int delta;
+
+ /* Non-retransmitted hole got filled? That's reordering */
+ if (reord < prior_fackets)
+ tcp_update_reordering(sk, tp->fackets_out - reord, 0);
+
+ delta = tcp_is_fack(tp) ? pkts_acked :
+ prior_sacked - tp->sacked_out;
+ tp->lost_cnt_hint -= min(tp->lost_cnt_hint, delta);
+ }
+
+ tp->fackets_out -= min(pkts_acked, tp->fackets_out);
+
+ if (ca_ops->pkts_acked) {
+ long rtt_us = min_t(ulong, ca_seq_rtt_us, sack_rtt_us);
+ ca_ops->pkts_acked(sk, pkts_acked, rtt_us);
+ }
+
+ } else if (skb && rtt_update && sack_rtt_us >= 0 &&
+ sack_rtt_us > skb_mstamp_us_delta(&now, &skb->skb_mstamp)) {
+ /* Do not re-arm RTO if the sack RTT is measured from data sent
+ * after when the head was last (re)transmitted. Otherwise the
+ * timeout may continue to extend in loss recovery.
+ */
+ tcp_rearm_rto(sk);
+ }
+
+#if FASTRETRANS_DEBUG > 0
+ WARN_ON((int)tp->sacked_out < 0);
+ WARN_ON((int)tp->lost_out < 0);
+ WARN_ON((int)tp->retrans_out < 0);
+ if (!tp->packets_out && tcp_is_sack(tp)) {
+ icsk = inet_csk(sk);
+ if (tp->lost_out) {
+ pr_debug("Leak l=%u %d\n",
+ tp->lost_out, icsk->icsk_ca_state);
+ tp->lost_out = 0;
+ }
+ if (tp->sacked_out) {
+ pr_debug("Leak s=%u %d\n",
+ tp->sacked_out, icsk->icsk_ca_state);
+ tp->sacked_out = 0;
+ }
+ if (tp->retrans_out) {
+ pr_debug("Leak r=%u %d\n",
+ tp->retrans_out, icsk->icsk_ca_state);
+ tp->retrans_out = 0;
+ }
+ }
+#endif
+ return flag;
+}
+
+static void tcp_ack_probe(struct sock *sk)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
+ /* Was it a usable window open? */
+
+ if (!after(TCP_SKB_CB(tcp_send_head(sk))->end_seq, tcp_wnd_end(tp))) {
+ icsk->icsk_backoff = 0;
+ inet_csk_clear_xmit_timer(sk, ICSK_TIME_PROBE0);
+ /* Socket must be waked up by subsequent tcp_data_snd_check().
+ * This function is not for random using!
+ */
+ } else {
+ unsigned long when = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
+
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
+ when, TCP_RTO_MAX);
+ }
+}
+
+static inline bool tcp_ack_is_dubious(const struct sock *sk, const int flag)
+{
+ return !(flag & FLAG_NOT_DUP) || (flag & FLAG_CA_ALERT) ||
+ inet_csk(sk)->icsk_ca_state != TCP_CA_Open;
+}
+
+/* Decide wheather to run the increase function of congestion control. */
+static inline bool tcp_may_raise_cwnd(const struct sock *sk, const int flag)
+{
+ if (tcp_in_cwnd_reduction(sk))
+ return false;
+
+ /* If reordering is high then always grow cwnd whenever data is
+ * delivered regardless of its ordering. Otherwise stay conservative
+ * and only grow cwnd on in-order delivery (RFC5681). A stretched ACK w/
+ * new SACK or ECE mark may first advance cwnd here and later reduce
+ * cwnd in tcp_fastretrans_alert() based on more states.
+ */
+ if (tcp_sk(sk)->reordering > sysctl_tcp_reordering)
+ return flag & FLAG_FORWARD_PROGRESS;
+
+ return flag & FLAG_DATA_ACKED;
+}
+
+/* Check that window update is acceptable.
+ * The function assumes that snd_una<=ack<=snd_next.
+ */
+static inline bool tcp_may_update_window(const struct tcp_sock *tp,
+ const u32 ack, const u32 ack_seq,
+ const u32 nwin)
+{
+ return after(ack, tp->snd_una) ||
+ after(ack_seq, tp->snd_wl1) ||
+ (ack_seq == tp->snd_wl1 && nwin > tp->snd_wnd);
+}
+
+/* If we update tp->snd_una, also update tp->bytes_acked */
+static void tcp_snd_una_update(struct tcp_sock *tp, u32 ack)
+{
+ u32 delta = ack - tp->snd_una;
+
+ u64_stats_update_begin(&tp->syncp);
+ tp->bytes_acked += delta;
+ u64_stats_update_end(&tp->syncp);
+ tp->snd_una = ack;
+}
+
+/* If we update tp->rcv_nxt, also update tp->bytes_received */
+static void tcp_rcv_nxt_update(struct tcp_sock *tp, u32 seq)
+{
+ u32 delta = seq - tp->rcv_nxt;
+
+ u64_stats_update_begin(&tp->syncp);
+ tp->bytes_received += delta;
+ u64_stats_update_end(&tp->syncp);
+ tp->rcv_nxt = seq;
+}
+
+/* Update our send window.
+ *
+ * Window update algorithm, described in RFC793/RFC1122 (used in linux-2.2
+ * and in FreeBSD. NetBSD's one is even worse.) is wrong.
+ */
+static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32 ack,
+ u32 ack_seq)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ int flag = 0;
+ u32 nwin = ntohs(tcp_hdr(skb)->window);
+
+ if (likely(!tcp_hdr(skb)->syn))
+ nwin <<= tp->rx_opt.snd_wscale;
+
+ if (tcp_may_update_window(tp, ack, ack_seq, nwin)) {
+ flag |= FLAG_WIN_UPDATE;
+ tcp_update_wl(tp, ack_seq);
+
+ if (tp->snd_wnd != nwin) {
+ tp->snd_wnd = nwin;
+
+ /* Note, it is the only place, where
+ * fast path is recovered for sending TCP.
+ */
+ tp->pred_flags = 0;
+ tcp_fast_path_check(sk);
+
+ if (nwin > tp->max_window) {
+ tp->max_window = nwin;
+ tcp_sync_mss(sk, inet_csk(sk)->icsk_pmtu_cookie);
+ }
+ }
+ }
+
+ tcp_snd_una_update(tp, ack);
+
+ return flag;
+}
+
+/* Return true if we're currently rate-limiting out-of-window ACKs and
+ * thus shouldn't send a dupack right now. We rate-limit dupacks in
+ * response to out-of-window SYNs or ACKs to mitigate ACK loops or DoS
+ * attacks that send repeated SYNs or ACKs for the same connection. To
+ * do this, we do not send a duplicate SYNACK or ACK if the remote
+ * endpoint is sending out-of-window SYNs or pure ACKs at a high rate.
+ */
+bool tcp_oow_rate_limited(struct net *net, const struct sk_buff *skb,
+ int mib_idx, u32 *last_oow_ack_time)
+{
+ /* Data packets without SYNs are not likely part of an ACK loop. */
+ if ((TCP_SKB_CB(skb)->seq != TCP_SKB_CB(skb)->end_seq) &&
+ !tcp_hdr(skb)->syn)
+ goto not_rate_limited;
+
+ if (*last_oow_ack_time) {
+ s32 elapsed = (s32)(tcp_time_stamp - *last_oow_ack_time);
+
+ if (0 <= elapsed && elapsed < sysctl_tcp_invalid_ratelimit) {
+ NET_INC_STATS_BH(net, mib_idx);
+ return true; /* rate-limited: don't send yet! */
+ }
+ }
+
+ *last_oow_ack_time = tcp_time_stamp;
+
+not_rate_limited:
+ return false; /* not rate-limited: go ahead, send dupack now! */
+}
+
+/* RFC 5961 7 [ACK Throttling] */
+static void tcp_send_challenge_ack(struct sock *sk, const struct sk_buff *skb)
+{
+ /* unprotected vars, we dont care of overwrites */
+ static u32 challenge_timestamp;
+ static unsigned int challenge_count;
+ struct tcp_sock *tp = tcp_sk(sk);
+ u32 now;
+
+ /* First check our per-socket dupack rate limit. */
+ if (tcp_oow_rate_limited(sock_net(sk), skb,
+ LINUX_MIB_TCPACKSKIPPEDCHALLENGE,
+ &tp->last_oow_ack_time))
+ return;
+
+ /* Then check the check host-wide RFC 5961 rate limit. */
+ now = jiffies / HZ;
+ if (now != challenge_timestamp) {
+ challenge_timestamp = now;
+ challenge_count = 0;
+ }
+ if (++challenge_count <= sysctl_tcp_challenge_ack_limit) {
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPCHALLENGEACK);
+ tcp_send_ack(sk);
+ }
+}
+
+static void tcp_store_ts_recent(struct tcp_sock *tp)
+{
+ tp->rx_opt.ts_recent = tp->rx_opt.rcv_tsval;
+ tp->rx_opt.ts_recent_stamp = get_seconds();
+}
+
+static void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq)
+{
+ if (tp->rx_opt.saw_tstamp && !after(seq, tp->rcv_wup)) {
+ /* PAWS bug workaround wrt. ACK frames, the PAWS discard
+ * extra check below makes sure this can only happen
+ * for pure ACK frames. -DaveM
+ *
+ * Not only, also it occurs for expired timestamps.
+ */
+
+ if (tcp_paws_check(&tp->rx_opt, 0))
+ tcp_store_ts_recent(tp);
+ }
+}
+
+/* This routine deals with acks during a TLP episode.
+ * We mark the end of a TLP episode on receiving TLP dupack or when
+ * ack is after tlp_high_seq.
+ * Ref: loss detection algorithm in draft-dukkipati-tcpm-tcp-loss-probe.
+ */
+static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (before(ack, tp->tlp_high_seq))
+ return;
+
+ if (flag & FLAG_DSACKING_ACK) {
+ /* This DSACK means original and TLP probe arrived; no loss */
+ tp->tlp_high_seq = 0;
+ } else if (after(ack, tp->tlp_high_seq)) {
+ /* ACK advances: there was a loss, so reduce cwnd. Reset
+ * tlp_high_seq in tcp_init_cwnd_reduction()
+ */
+ tcp_init_cwnd_reduction(sk);
+ tcp_set_ca_state(sk, TCP_CA_CWR);
+ tcp_end_cwnd_reduction(sk);
+ tcp_try_keep_open(sk);
+ NET_INC_STATS_BH(sock_net(sk),
+ LINUX_MIB_TCPLOSSPROBERECOVERY);
+ } else if (!(flag & (FLAG_SND_UNA_ADVANCED |
+ FLAG_NOT_DUP | FLAG_DATA_SACKED))) {
+ /* Pure dupack: original and TLP probe arrived; no loss */
+ tp->tlp_high_seq = 0;
+ }
+}
+
+static inline void tcp_in_ack_event(struct sock *sk, u32 flags)
+{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+
+ if (icsk->icsk_ca_ops->in_ack_event)
+ icsk->icsk_ca_ops->in_ack_event(sk, flags);
+}
+
+/* This routine deals with incoming acks, but not outgoing ones. */
+static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+ u32 prior_snd_una = tp->snd_una;
+ u32 ack_seq = TCP_SKB_CB(skb)->seq;
+ u32 ack = TCP_SKB_CB(skb)->ack_seq;
+ bool is_dupack = false;
+ u32 prior_fackets;
+ int prior_packets = tp->packets_out;
+ const int prior_unsacked = tp->packets_out - tp->sacked_out;
+ int acked = 0; /* Number of packets newly acked */
+ long sack_rtt_us = -1L;
+
+ /* We very likely will need to access write queue head. */
+ prefetchw(sk->sk_write_queue.next);
+
+ /* If the ack is older than previous acks
+ * then we can probably ignore it.
+ */
+ if (before(ack, prior_snd_una)) {
+ /* RFC 5961 5.2 [Blind Data Injection Attack].[Mitigation] */
+ if (before(ack, prior_snd_una - tp->max_window)) {
+ tcp_send_challenge_ack(sk, skb);
+ return -1;
+ }
+ goto old_ack;
+ }
+
+ /* If the ack includes data we haven't sent yet, discard
+ * this segment (RFC793 Section 3.9).
+ */
+ if (after(ack, tp->snd_nxt))
+ goto invalid_ack;
+
+ if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
+ icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
+ tcp_rearm_rto(sk);
+
+ if (after(ack, prior_snd_una)) {
+ flag |= FLAG_SND_UNA_ADVANCED;
+ icsk->icsk_retransmits = 0;
+ }
+
+ prior_fackets = tp->fackets_out;
+
+ /* ts_recent update must be made after we are sure that the packet
+ * is in window.
+ */
+ if (flag & FLAG_UPDATE_TS_RECENT)
+ tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq);
+
+ if (!(flag & FLAG_SLOWPATH) && after(ack, prior_snd_una)) {
+ /* Window is constant, pure forward advance.
+ * No more checks are required.
+ * Note, we use the fact that SND.UNA>=SND.WL2.
+ */
+ tcp_update_wl(tp, ack_seq);
+ tcp_snd_una_update(tp, ack);
+ flag |= FLAG_WIN_UPDATE;
+
+ tcp_in_ack_event(sk, CA_ACK_WIN_UPDATE);
+
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPACKS);
+ } else {
+ u32 ack_ev_flags = CA_ACK_SLOWPATH;
+
+ if (ack_seq != TCP_SKB_CB(skb)->end_seq)
+ flag |= FLAG_DATA;
+ else
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPPUREACKS);
+
+ flag |= tcp_ack_update_window(sk, skb, ack, ack_seq);
+
+ if (TCP_SKB_CB(skb)->sacked)
+ flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
+ &sack_rtt_us);
+
+ if (tcp_ecn_rcv_ecn_echo(tp, tcp_hdr(skb))) {
+ flag |= FLAG_ECE;
+ ack_ev_flags |= CA_ACK_ECE;
+ }
+
+ if (flag & FLAG_WIN_UPDATE)
+ ack_ev_flags |= CA_ACK_WIN_UPDATE;
+
+ tcp_in_ack_event(sk, ack_ev_flags);
+ }
+
+ /* We passed data and got it acked, remove any soft error
+ * log. Something worked...
+ */
+ sk->sk_err_soft = 0;
+ icsk->icsk_probes_out = 0;
+ tp->rcv_tstamp = tcp_time_stamp;
+ if (!prior_packets)
+ goto no_queue;
+
+ /* See if we can take anything off of the retransmit queue. */
+ acked = tp->packets_out;
+ flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una,
+ sack_rtt_us);
+ acked -= tp->packets_out;
+
+ /* Advance cwnd if state allows */
+ if (tcp_may_raise_cwnd(sk, flag))
+ tcp_cong_avoid(sk, ack, acked);
+
+ if (tcp_ack_is_dubious(sk, flag)) {
+ is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));
+ tcp_fastretrans_alert(sk, acked, prior_unsacked,
+ is_dupack, flag);
+ }
+ if (tp->tlp_high_seq)
+ tcp_process_tlp_ack(sk, ack, flag);
+
+ if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) {
+ struct dst_entry *dst = __sk_dst_get(sk);
+ if (dst)
+ dst_confirm(dst);
+ }
+
+ if (icsk->icsk_pending == ICSK_TIME_RETRANS)
+ tcp_schedule_loss_probe(sk);
+ tcp_update_pacing_rate(sk);
+ return 1;
+
+no_queue:
+ /* If data was DSACKed, see if we can undo a cwnd reduction. */
+ if (flag & FLAG_DSACKING_ACK)
+ tcp_fastretrans_alert(sk, acked, prior_unsacked,
+ is_dupack, flag);
+ /* If this ack opens up a zero window, clear backoff. It was
+ * being used to time the probes, and is probably far higher than
+ * it needs to be for normal retransmission.
+ */
+ if (tcp_send_head(sk))
+ tcp_ack_probe(sk);
+
+ if (tp->tlp_high_seq)
+ tcp_process_tlp_ack(sk, ack, flag);
+ return 1;
+
+invalid_ack:
+ SOCK_DEBUG(sk, "Ack %u after %u:%u\n", ack, tp->snd_una, tp->snd_nxt);
+ return -1;
+
+old_ack:
+ /* If data was SACKed, tag it and see if we should send more data.
+ * If data was DSACKed, see if we can undo a cwnd reduction.
+ */
+ if (TCP_SKB_CB(skb)->sacked) {
+ flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
+ &sack_rtt_us);
+ tcp_fastretrans_alert(sk, acked, prior_unsacked,
+ is_dupack, flag);
+ }
+
+ SOCK_DEBUG(sk, "Ack %u before %u:%u\n", ack, tp->snd_una, tp->snd_nxt);
+ return 0;
+}
+
+static void tcp_parse_fastopen_option(int len, const unsigned char *cookie,
+ bool syn, struct tcp_fastopen_cookie *foc,
+ bool exp_opt)
+{
+ /* Valid only in SYN or SYN-ACK with an even length. */
+ if (!foc || !syn || len < 0 || (len & 1))
+ return;
+
+ if (len >= TCP_FASTOPEN_COOKIE_MIN &&
+ len <= TCP_FASTOPEN_COOKIE_MAX)
+ memcpy(foc->val, cookie, len);
+ else if (len != 0)
+ len = -1;
+ foc->len = len;
+ foc->exp = exp_opt;
+}
+
+/* Look for tcp options. Normally only called on SYN and SYNACK packets.
+ * But, this can also be called on packets in the established flow when
+ * the fast version below fails.
+ */
+void tcp_parse_options(const struct sk_buff *skb,
+ struct tcp_options_received *opt_rx, int estab,
+ struct tcp_fastopen_cookie *foc)
+{
+ const unsigned char *ptr;
+ const struct tcphdr *th = tcp_hdr(skb);
+ int length = (th->doff * 4) - sizeof(struct tcphdr);
+
+ ptr = (const unsigned char *)(th + 1);
+ opt_rx->saw_tstamp = 0;
+
+ while (length > 0) {
+ int opcode = *ptr++;
+ int opsize;
+
+ switch (opcode) {
+ case TCPOPT_EOL:
+ return;
+ case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */
+ length--;
+ continue;
+ default:
+ opsize = *ptr++;
+ if (opsize < 2) /* "silly options" */
+ return;
+ if (opsize > length)
+ return; /* don't parse partial options */
+ switch (opcode) {
+ case TCPOPT_MSS:
+ if (opsize == TCPOLEN_MSS && th->syn && !estab) {
+ u16 in_mss = get_unaligned_be16(ptr);
+ if (in_mss) {
+ if (opt_rx->user_mss &&
+ opt_rx->user_mss < in_mss)
+ in_mss = opt_rx->user_mss;
+ opt_rx->mss_clamp = in_mss;
+ }
+ }
+ break;
+ case TCPOPT_WINDOW:
+ if (opsize == TCPOLEN_WINDOW && th->syn &&
+ !estab && sysctl_tcp_window_scaling) {
+ __u8 snd_wscale = *(__u8 *)ptr;
+ opt_rx->wscale_ok = 1;
+ if (snd_wscale > 14) {
+ net_info_ratelimited("%s: Illegal window scaling value %d >14 received\n",
+ __func__,
+ snd_wscale);
+ snd_wscale = 14;
+ }
+ opt_rx->snd_wscale = snd_wscale;
+ }
+ break;
+ case TCPOPT_TIMESTAMP:
+ if ((opsize == TCPOLEN_TIMESTAMP) &&
+ ((estab && opt_rx->tstamp_ok) ||
+ (!estab && sysctl_tcp_timestamps))) {
+ opt_rx->saw_tstamp = 1;
+ opt_rx->rcv_tsval = get_unaligned_be32(ptr);
+ opt_rx->rcv_tsecr = get_unaligned_be32(ptr + 4);
+ }
+ break;
+ case TCPOPT_SACK_PERM:
+ if (opsize == TCPOLEN_SACK_PERM && th->syn &&
+ !estab && sysctl_tcp_sack) {
+ opt_rx->sack_ok = TCP_SACK_SEEN;
+ tcp_sack_reset(opt_rx);
+ }
+ break;
+
+ case TCPOPT_SACK:
+ if ((opsize >= (TCPOLEN_SACK_BASE + TCPOLEN_SACK_PERBLOCK)) &&
+ !((opsize - TCPOLEN_SACK_BASE) % TCPOLEN_SACK_PERBLOCK) &&
+ opt_rx->sack_ok) {
+ TCP_SKB_CB(skb)->sacked = (ptr - 2) - (unsigned char *)th;
+ }
+ break;
+#ifdef CONFIG_TCP_MD5SIG
+ case TCPOPT_MD5SIG:
+ /*
+ * The MD5 Hash has already been
+ * checked (see tcp_v{4,6}_do_rcv()).
+ */
+ break;
+#endif
+ case TCPOPT_FASTOPEN:
+ tcp_parse_fastopen_option(
+ opsize - TCPOLEN_FASTOPEN_BASE,
+ ptr, th->syn, foc, false);
+ break;
+
+ case TCPOPT_EXP:
+ /* Fast Open option shares code 254 using a
+ * 16 bits magic number.
+ */
+ if (opsize >= TCPOLEN_EXP_FASTOPEN_BASE &&
+ get_unaligned_be16(ptr) ==
+ TCPOPT_FASTOPEN_MAGIC)
+ tcp_parse_fastopen_option(opsize -
+ TCPOLEN_EXP_FASTOPEN_BASE,
+ ptr + 2, th->syn, foc, true);
+ break;
+
+ }
+ ptr += opsize-2;
+ length -= opsize;
+ }
+ }
+}
+EXPORT_SYMBOL(tcp_parse_options);
+
+static bool tcp_parse_aligned_timestamp(struct tcp_sock *tp, const struct tcphdr *th)
+{
+ const __be32 *ptr = (const __be32 *)(th + 1);
+
+ if (*ptr == htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16)
+ | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)) {
+ tp->rx_opt.saw_tstamp = 1;
+ ++ptr;
+ tp->rx_opt.rcv_tsval = ntohl(*ptr);
+ ++ptr;
+ if (*ptr)
+ tp->rx_opt.rcv_tsecr = ntohl(*ptr) - tp->tsoffset;
+ else
+ tp->rx_opt.rcv_tsecr = 0;
+ return true;
+ }
+ return false;
+}
+
+/* Fast parse options. This hopes to only see timestamps.
+ * If it is wrong it falls back on tcp_parse_options().
+ */
+static bool tcp_fast_parse_options(const struct sk_buff *skb,
+ const struct tcphdr *th, struct tcp_sock *tp)
+{
+ /* In the spirit of fast parsing, compare doff directly to constant
+ * values. Because equality is used, short doff can be ignored here.
+ */
+ if (th->doff == (sizeof(*th) / 4)) {
+ tp->rx_opt.saw_tstamp = 0;
+ return false;
+ } else if (tp->rx_opt.tstamp_ok &&
+ th->doff == ((sizeof(*th) + TCPOLEN_TSTAMP_ALIGNED) / 4)) {
+ if (tcp_parse_aligned_timestamp(tp, th))
+ return true;
+ }
+
+ tcp_parse_options(skb, &tp->rx_opt, 1, NULL);
+ if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
+ tp->rx_opt.rcv_tsecr -= tp->tsoffset;
+
+ return true;
+}
+
+#ifdef CONFIG_TCP_STEALTH
+/* Parse only the TSVal field of the TCP Timestamp option header.
+ */
+const bool tcp_parse_tsval_option(u32 *tsval, const struct tcphdr *th)
+{
+ int length = (th->doff << 2) - sizeof(*th);
+ const u8 *ptr = (const u8 *)(th + 1);
+
+ /* If the TCP option is too short, we can short cut */
+ if (length < TCPOLEN_TIMESTAMP)
+ return false;
+
+ while (length > 0) {
+ int opcode = *ptr++;
+ int opsize;
+
+ switch (opcode) {
+ case TCPOPT_EOL:
+ return false;
+ case TCPOPT_NOP:
+ length--;
+ continue;
+ case TCPOPT_TIMESTAMP:
+ opsize = *ptr++;
+ if (opsize != TCPOLEN_TIMESTAMP || opsize > length)
+ return false;
+ *tsval = get_unaligned_be32(ptr);
+ return true;
+ default:
+ opsize = *ptr++;
+ if (opsize < 2 || opsize > length)
+ return false;
+ }
+ ptr += opsize - 2;
+ length -= opsize;
+ }
+ return false;
+}
+EXPORT_SYMBOL(tcp_parse_tsval_option);
+#endif
+
+#ifdef CONFIG_TCP_MD5SIG
+/*
+ * Parse MD5 Signature option
+ */
+const u8 *tcp_parse_md5sig_option(const struct tcphdr *th)
+{
+ int length = (th->doff << 2) - sizeof(*th);
+ const u8 *ptr = (const u8 *)(th + 1);
+
+ /* If the TCP option is too short, we can short cut */
+ if (length < TCPOLEN_MD5SIG)
+ return NULL;
+
+ while (length > 0) {
+ int opcode = *ptr++;
+ int opsize;
+
+ switch (opcode) {
+ case TCPOPT_EOL:
+ return NULL;
+ case TCPOPT_NOP:
+ length--;
+ continue;
+ default:
+ opsize = *ptr++;
+ if (opsize < 2 || opsize > length)
+ return NULL;
+ if (opcode == TCPOPT_MD5SIG)
+ return opsize == TCPOLEN_MD5SIG ? ptr : NULL;
+ }
+ ptr += opsize - 2;
+ length -= opsize;
+ }
+ return NULL;
+}
+EXPORT_SYMBOL(tcp_parse_md5sig_option);
+#endif
+
+/* Sorry, PAWS as specified is broken wrt. pure-ACKs -DaveM
+ *
+ * It is not fatal. If this ACK does _not_ change critical state (seqs, window)
+ * it can pass through stack. So, the following predicate verifies that
+ * this segment is not used for anything but congestion avoidance or
+ * fast retransmit. Moreover, we even are able to eliminate most of such
+ * second order effects, if we apply some small "replay" window (~RTO)
+ * to timestamp space.
+ *
+ * All these measures still do not guarantee that we reject wrapped ACKs
+ * on networks with high bandwidth, when sequence space is recycled fastly,
+ * but it guarantees that such events will be very rare and do not affect
+ * connection seriously. This doesn't look nice, but alas, PAWS is really
+ * buggy extension.
+ *
+ * [ Later note. Even worse! It is buggy for segments _with_ data. RFC
+ * states that events when retransmit arrives after original data are rare.
+ * It is a blatant lie. VJ forgot about fast retransmit! 8)8) It is
+ * the biggest problem on large power networks even with minor reordering.
+ * OK, let's give it small replay window. If peer clock is even 1hz, it is safe
+ * up to bandwidth of 18Gigabit/sec. 8) ]
+ */
+
+static int tcp_disordered_ack(const struct sock *sk, const struct sk_buff *skb)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ const struct tcphdr *th = tcp_hdr(skb);
+ u32 seq = TCP_SKB_CB(skb)->seq;
+ u32 ack = TCP_SKB_CB(skb)->ack_seq;
+
+ return (/* 1. Pure ACK with correct sequence number. */
+ (th->ack && seq == TCP_SKB_CB(skb)->end_seq && seq == tp->rcv_nxt) &&
+
+ /* 2. ... and duplicate ACK. */
+ ack == tp->snd_una &&
+
+ /* 3. ... and does not update window. */
+ !tcp_may_update_window(tp, ack, seq, ntohs(th->window) << tp->rx_opt.snd_wscale) &&
+
+ /* 4. ... and sits in replay window. */
+ (s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) <= (inet_csk(sk)->icsk_rto * 1024) / HZ);
+}
+
+static inline bool tcp_paws_discard(const struct sock *sk,
+ const struct sk_buff *skb)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+
+ return !tcp_paws_check(&tp->rx_opt, TCP_PAWS_WINDOW) &&
+ !tcp_disordered_ack(sk, skb);
+}
+
+/* Check segment sequence number for validity.
+ *
+ * Segment controls are considered valid, if the segment
+ * fits to the window after truncation to the window. Acceptability
+ * of data (and SYN, FIN, of course) is checked separately.
+ * See tcp_data_queue(), for example.
+ *
+ * Also, controls (RST is main one) are accepted using RCV.WUP instead
+ * of RCV.NXT. Peer still did not advance his SND.UNA when we
+ * delayed ACK, so that hisSND.UNA<=ourRCV.WUP.
+ * (borrowed from freebsd)
+ */
+
+static inline bool tcp_sequence(const struct tcp_sock *tp, u32 seq, u32 end_seq)
+{
+ return !before(end_seq, tp->rcv_wup) &&
+ !after(seq, tp->rcv_nxt + tcp_receive_window(tp));
+}
+
+/* When we get a reset we do this. */
+void tcp_reset(struct sock *sk)
+{
+ /* We want the right error as BSD sees it (and indeed as we do). */
+ switch (sk->sk_state) {
+ case TCP_SYN_SENT:
+ sk->sk_err = ECONNREFUSED;
+ break;
+ case TCP_CLOSE_WAIT:
+ sk->sk_err = EPIPE;
+ break;
+ case TCP_CLOSE:
+ return;
+ default:
+ sk->sk_err = ECONNRESET;
+ }
+ /* This barrier is coupled with smp_rmb() in tcp_poll() */
+ smp_wmb();
+
+ if (!sock_flag(sk, SOCK_DEAD))
+ sk->sk_error_report(sk);
+
+ tcp_done(sk);
+}
+
+/*
+ * Process the FIN bit. This now behaves as it is supposed to work
+ * and the FIN takes effect when it is validly part of sequence
+ * space. Not before when we get holes.
+ *
+ * If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT
+ * (and thence onto LAST-ACK and finally, CLOSE, we never enter
+ * TIME-WAIT)
+ *
+ * If we are in FINWAIT-1, a received FIN indicates simultaneous
+ * close and we go into CLOSING (and later onto TIME-WAIT)
+ *
+ * If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
+ */
+static void tcp_fin(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ const struct dst_entry *dst;
+
+ inet_csk_schedule_ack(sk);
+
+ sk->sk_shutdown |= RCV_SHUTDOWN;
+ sock_set_flag(sk, SOCK_DONE);
+
+ switch (sk->sk_state) {
+ case TCP_SYN_RECV:
+ case TCP_ESTABLISHED:
+ /* Move to CLOSE_WAIT */
+ tcp_set_state(sk, TCP_CLOSE_WAIT);
+ dst = __sk_dst_get(sk);
+ if (!dst || !dst_metric(dst, RTAX_QUICKACK))
+ inet_csk(sk)->icsk_ack.pingpong = 1;
+ break;
+
+ case TCP_CLOSE_WAIT:
+ case TCP_CLOSING:
+ /* Received a retransmission of the FIN, do
+ * nothing.
+ */
+ break;
+ case TCP_LAST_ACK:
+ /* RFC793: Remain in the LAST-ACK state. */
+ break;
+
+ case TCP_FIN_WAIT1:
+ /* This case occurs when a simultaneous close
+ * happens, we must ack the received FIN and
+ * enter the CLOSING state.
+ */
+ tcp_send_ack(sk);
+ tcp_set_state(sk, TCP_CLOSING);
+ break;
+ case TCP_FIN_WAIT2:
+ /* Received a FIN -- send ACK and enter TIME_WAIT. */
+ tcp_send_ack(sk);
+ tcp_time_wait(sk, TCP_TIME_WAIT, 0);
+ break;
+ default:
+ /* Only TCP_LISTEN and TCP_CLOSE are left, in these
+ * cases we should never reach this piece of code.
+ */
+ pr_err("%s: Impossible, sk->sk_state=%d\n",
+ __func__, sk->sk_state);
+ break;
+ }
+
+ /* It _is_ possible, that we have something out-of-order _after_ FIN.
+ * Probably, we should reset in this case. For now drop them.
+ */
+ __skb_queue_purge(&tp->out_of_order_queue);
+ if (tcp_is_sack(tp))
+ tcp_sack_reset(&tp->rx_opt);
+ sk_mem_reclaim(sk);
+
+ if (!sock_flag(sk, SOCK_DEAD)) {
+ sk->sk_state_change(sk);
+
+ /* Do not send POLL_HUP for half duplex close. */
+ if (sk->sk_shutdown == SHUTDOWN_MASK ||
+ sk->sk_state == TCP_CLOSE)
+ sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP);
+ else
+ sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
+ }
+}
+
+static inline bool tcp_sack_extend(struct tcp_sack_block *sp, u32 seq,
+ u32 end_seq)
+{
+ if (!after(seq, sp->end_seq) && !after(sp->start_seq, end_seq)) {
+ if (before(seq, sp->start_seq))
+ sp->start_seq = seq;
+ if (after(end_seq, sp->end_seq))
+ sp->end_seq = end_seq;
+ return true;
+ }
+ return false;
+}
+
+static void tcp_dsack_set(struct sock *sk, u32 seq, u32 end_seq)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (tcp_is_sack(tp) && sysctl_tcp_dsack) {
+ int mib_idx;
+
+ if (before(seq, tp->rcv_nxt))
+ mib_idx = LINUX_MIB_TCPDSACKOLDSENT;
+ else
+ mib_idx = LINUX_MIB_TCPDSACKOFOSENT;
+
+ NET_INC_STATS_BH(sock_net(sk), mib_idx);
+
+ tp->rx_opt.dsack = 1;
+ tp->duplicate_sack[0].start_seq = seq;
+ tp->duplicate_sack[0].end_seq = end_seq;
+ }
+}
+
+static void tcp_dsack_extend(struct sock *sk, u32 seq, u32 end_seq)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (!tp->rx_opt.dsack)
+ tcp_dsack_set(sk, seq, end_seq);
+ else
+ tcp_sack_extend(tp->duplicate_sack, seq, end_seq);
+}
+
+static void tcp_send_dupack(struct sock *sk, const struct sk_buff *skb)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
+ before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
+ tcp_enter_quickack_mode(sk);
+
+ if (tcp_is_sack(tp) && sysctl_tcp_dsack) {
+ u32 end_seq = TCP_SKB_CB(skb)->end_seq;
+
+ if (after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))
+ end_seq = tp->rcv_nxt;
+ tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, end_seq);
+ }
+ }
+
+ tcp_send_ack(sk);
+}
+
+/* These routines update the SACK block as out-of-order packets arrive or
+ * in-order packets close up the sequence space.
+ */
+static void tcp_sack_maybe_coalesce(struct tcp_sock *tp)
+{
+ int this_sack;
+ struct tcp_sack_block *sp = &tp->selective_acks[0];
+ struct tcp_sack_block *swalk = sp + 1;
+
+ /* See if the recent change to the first SACK eats into
+ * or hits the sequence space of other SACK blocks, if so coalesce.
+ */
+ for (this_sack = 1; this_sack < tp->rx_opt.num_sacks;) {
+ if (tcp_sack_extend(sp, swalk->start_seq, swalk->end_seq)) {
+ int i;
+
+ /* Zap SWALK, by moving every further SACK up by one slot.
+ * Decrease num_sacks.
+ */
+ tp->rx_opt.num_sacks--;
+ for (i = this_sack; i < tp->rx_opt.num_sacks; i++)
+ sp[i] = sp[i + 1];
+ continue;
+ }
+ this_sack++, swalk++;
+ }
+}
+
+static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct tcp_sack_block *sp = &tp->selective_acks[0];
+ int cur_sacks = tp->rx_opt.num_sacks;
+ int this_sack;
+
+ if (!cur_sacks)
+ goto new_sack;
+
+ for (this_sack = 0; this_sack < cur_sacks; this_sack++, sp++) {
+ if (tcp_sack_extend(sp, seq, end_seq)) {
+ /* Rotate this_sack to the first one. */
+ for (; this_sack > 0; this_sack--, sp--)
+ swap(*sp, *(sp - 1));
+ if (cur_sacks > 1)
+ tcp_sack_maybe_coalesce(tp);
+ return;
+ }
+ }
+
+ /* Could not find an adjacent existing SACK, build a new one,
+ * put it at the front, and shift everyone else down. We
+ * always know there is at least one SACK present already here.
+ *
+ * If the sack array is full, forget about the last one.
+ */
+ if (this_sack >= TCP_NUM_SACKS) {
+ this_sack--;
+ tp->rx_opt.num_sacks--;
+ sp--;
+ }
+ for (; this_sack > 0; this_sack--, sp--)
+ *sp = *(sp - 1);
+
+new_sack:
+ /* Build the new head SACK, and we're done. */
+ sp->start_seq = seq;
+ sp->end_seq = end_seq;
+ tp->rx_opt.num_sacks++;
+}
+
+/* RCV.NXT advances, some SACKs should be eaten. */
+
+static void tcp_sack_remove(struct tcp_sock *tp)
+{
+ struct tcp_sack_block *sp = &tp->selective_acks[0];
+ int num_sacks = tp->rx_opt.num_sacks;
+ int this_sack;
+
+ /* Empty ofo queue, hence, all the SACKs are eaten. Clear. */
+ if (skb_queue_empty(&tp->out_of_order_queue)) {
+ tp->rx_opt.num_sacks = 0;
+ return;
+ }
+
+ for (this_sack = 0; this_sack < num_sacks;) {
+ /* Check if the start of the sack is covered by RCV.NXT. */
+ if (!before(tp->rcv_nxt, sp->start_seq)) {
+ int i;
+
+ /* RCV.NXT must cover all the block! */
+ WARN_ON(before(tp->rcv_nxt, sp->end_seq));
+
+ /* Zap this SACK, by moving forward any other SACKS. */
+ for (i = this_sack+1; i < num_sacks; i++)
+ tp->selective_acks[i-1] = tp->selective_acks[i];
+ num_sacks--;
+ continue;
+ }
+ this_sack++;
+ sp++;
+ }
+ tp->rx_opt.num_sacks = num_sacks;
+}
+
+/**
+ * tcp_try_coalesce - try to merge skb to prior one
+ * @sk: socket
+ * @to: prior buffer
+ * @from: buffer to add in queue
+ * @fragstolen: pointer to boolean
+ *
+ * Before queueing skb @from after @to, try to merge them
+ * to reduce overall memory use and queue lengths, if cost is small.
+ * Packets in ofo or receive queues can stay a long time.
+ * Better try to coalesce them right now to avoid future collapses.
+ * Returns true if caller should free @from instead of queueing it
+ */
+static bool tcp_try_coalesce(struct sock *sk,
+ struct sk_buff *to,
+ struct sk_buff *from,
+ bool *fragstolen)
+{
+ int delta;
+
+ *fragstolen = false;
+
+ /* Its possible this segment overlaps with prior segment in queue */
+ if (TCP_SKB_CB(from)->seq != TCP_SKB_CB(to)->end_seq)
+ return false;
+
+ if (!skb_try_coalesce(to, from, fragstolen, &delta))
+ return false;
+
+ atomic_add(delta, &sk->sk_rmem_alloc);
+ sk_mem_charge(sk, delta);
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOALESCE);
+ TCP_SKB_CB(to)->end_seq = TCP_SKB_CB(from)->end_seq;
+ TCP_SKB_CB(to)->ack_seq = TCP_SKB_CB(from)->ack_seq;
+ TCP_SKB_CB(to)->tcp_flags |= TCP_SKB_CB(from)->tcp_flags;
+ return true;
+}
+
+/* This one checks to see if we can put data from the
+ * out_of_order queue into the receive_queue.
+ */
+static void tcp_ofo_queue(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ __u32 dsack_high = tp->rcv_nxt;
+ struct sk_buff *skb, *tail;
+ bool fragstolen, eaten;
+
+ while ((skb = skb_peek(&tp->out_of_order_queue)) != NULL) {
+ if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
+ break;
+
+ if (before(TCP_SKB_CB(skb)->seq, dsack_high)) {
+ __u32 dsack = dsack_high;
+ if (before(TCP_SKB_CB(skb)->end_seq, dsack_high))
+ dsack_high = TCP_SKB_CB(skb)->end_seq;
+ tcp_dsack_extend(sk, TCP_SKB_CB(skb)->seq, dsack);
+ }
+
+ __skb_unlink(skb, &tp->out_of_order_queue);
+ if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
+ SOCK_DEBUG(sk, "ofo packet was already received\n");
+ __kfree_skb(skb);
+ continue;
+ }
+ SOCK_DEBUG(sk, "ofo requeuing : rcv_next %X seq %X - %X\n",
+ tp->rcv_nxt, TCP_SKB_CB(skb)->seq,
+ TCP_SKB_CB(skb)->end_seq);
+
+ tail = skb_peek_tail(&sk->sk_receive_queue);
+ eaten = tail && tcp_try_coalesce(sk, tail, skb, &fragstolen);
+ tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq);
+ if (!eaten)
+ __skb_queue_tail(&sk->sk_receive_queue, skb);
+ if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
+ tcp_fin(sk);
+ if (eaten)
+ kfree_skb_partial(skb, fragstolen);
+ }
+}
+
+static bool tcp_prune_ofo_queue(struct sock *sk);
+static int tcp_prune_queue(struct sock *sk);
+
+static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb,
+ unsigned int size)
+{
+ if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
+ !sk_rmem_schedule(sk, skb, size)) {
+
+ if (tcp_prune_queue(sk) < 0)
+ return -1;
+
+ if (!sk_rmem_schedule(sk, skb, size)) {
+ if (!tcp_prune_ofo_queue(sk))
+ return -1;
+
+ if (!sk_rmem_schedule(sk, skb, size))
+ return -1;
+ }
+ }
+ return 0;
+}
+
+static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *skb1;
+ u32 seq, end_seq;
+
+ tcp_ecn_check_ce(tp, skb);
+
+ if (unlikely(tcp_try_rmem_schedule(sk, skb, skb->truesize))) {
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFODROP);
+ __kfree_skb(skb);
+ return;
+ }
+
+ /* Disable header prediction. */
+ tp->pred_flags = 0;
+ inet_csk_schedule_ack(sk);
+
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOQUEUE);
+ SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n",
+ tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
+
+ skb1 = skb_peek_tail(&tp->out_of_order_queue);
+ if (!skb1) {
+ /* Initial out of order segment, build 1 SACK. */
+ if (tcp_is_sack(tp)) {
+ tp->rx_opt.num_sacks = 1;
+ tp->selective_acks[0].start_seq = TCP_SKB_CB(skb)->seq;
+ tp->selective_acks[0].end_seq =
+ TCP_SKB_CB(skb)->end_seq;
+ }
+ __skb_queue_head(&tp->out_of_order_queue, skb);
+ goto end;
+ }
+
+ seq = TCP_SKB_CB(skb)->seq;
+ end_seq = TCP_SKB_CB(skb)->end_seq;
+
+ if (seq == TCP_SKB_CB(skb1)->end_seq) {
+ bool fragstolen;
+
+ if (!tcp_try_coalesce(sk, skb1, skb, &fragstolen)) {
+ __skb_queue_after(&tp->out_of_order_queue, skb1, skb);
+ } else {
+ tcp_grow_window(sk, skb);
+ kfree_skb_partial(skb, fragstolen);
+ skb = NULL;
+ }
+
+ if (!tp->rx_opt.num_sacks ||
+ tp->selective_acks[0].end_seq != seq)
+ goto add_sack;
+
+ /* Common case: data arrive in order after hole. */
+ tp->selective_acks[0].end_seq = end_seq;
+ goto end;
+ }
+
+ /* Find place to insert this segment. */
+ while (1) {
+ if (!after(TCP_SKB_CB(skb1)->seq, seq))
+ break;
+ if (skb_queue_is_first(&tp->out_of_order_queue, skb1)) {
+ skb1 = NULL;
+ break;
+ }
+ skb1 = skb_queue_prev(&tp->out_of_order_queue, skb1);
+ }
+
+ /* Do skb overlap to previous one? */
+ if (skb1 && before(seq, TCP_SKB_CB(skb1)->end_seq)) {
+ if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
+ /* All the bits are present. Drop. */
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOMERGE);
+ __kfree_skb(skb);
+ skb = NULL;
+ tcp_dsack_set(sk, seq, end_seq);
+ goto add_sack;
+ }
+ if (after(seq, TCP_SKB_CB(skb1)->seq)) {
+ /* Partial overlap. */
+ tcp_dsack_set(sk, seq,
+ TCP_SKB_CB(skb1)->end_seq);
+ } else {
+ if (skb_queue_is_first(&tp->out_of_order_queue,
+ skb1))
+ skb1 = NULL;
+ else
+ skb1 = skb_queue_prev(
+ &tp->out_of_order_queue,
+ skb1);
+ }
+ }
+ if (!skb1)
+ __skb_queue_head(&tp->out_of_order_queue, skb);
+ else
+ __skb_queue_after(&tp->out_of_order_queue, skb1, skb);
+
+ /* And clean segments covered by new one as whole. */
+ while (!skb_queue_is_last(&tp->out_of_order_queue, skb)) {
+ skb1 = skb_queue_next(&tp->out_of_order_queue, skb);
+
+ if (!after(end_seq, TCP_SKB_CB(skb1)->seq))
+ break;
+ if (before(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
+ tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
+ end_seq);
+ break;
+ }
+ __skb_unlink(skb1, &tp->out_of_order_queue);
+ tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
+ TCP_SKB_CB(skb1)->end_seq);
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOMERGE);
+ __kfree_skb(skb1);
+ }
+
+add_sack:
+ if (tcp_is_sack(tp))
+ tcp_sack_new_ofo_skb(sk, seq, end_seq);
+end:
+ if (skb) {
+ tcp_grow_window(sk, skb);
+ skb_set_owner_r(skb, sk);
+ }
+}
+
+static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int hdrlen,
+ bool *fragstolen)
+{
+ int eaten;
+ struct sk_buff *tail = skb_peek_tail(&sk->sk_receive_queue);
+
+ __skb_pull(skb, hdrlen);
+ eaten = (tail &&
+ tcp_try_coalesce(sk, tail, skb, fragstolen)) ? 1 : 0;
+ tcp_rcv_nxt_update(tcp_sk(sk), TCP_SKB_CB(skb)->end_seq);
+ if (!eaten) {
+ __skb_queue_tail(&sk->sk_receive_queue, skb);
+ skb_set_owner_r(skb, sk);
+ }
+ return eaten;
+}
+
+int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size)
+{
+ struct sk_buff *skb;
+ bool fragstolen;
+
+ if (size == 0)
+ return 0;
+
+ skb = alloc_skb(size, sk->sk_allocation);
+ if (!skb)
+ goto err;
+
+ if (tcp_try_rmem_schedule(sk, skb, skb->truesize))
+ goto err_free;
+
+ if (memcpy_from_msg(skb_put(skb, size), msg, size))
+ goto err_free;
+
+ TCP_SKB_CB(skb)->seq = tcp_sk(sk)->rcv_nxt;
+ TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + size;
+ TCP_SKB_CB(skb)->ack_seq = tcp_sk(sk)->snd_una - 1;
+
+ if (tcp_queue_rcv(sk, skb, 0, &fragstolen)) {
+ WARN_ON_ONCE(fragstolen); /* should not happen */
+ __kfree_skb(skb);
+ }
+ return size;
+
+err_free:
+ kfree_skb(skb);
+err:
+ return -ENOMEM;
+}
+
+#ifdef CONFIG_TCP_STEALTH
+static int __tcp_stealth_integrity_check(struct sock *sk, struct sk_buff *skb)
+{
+ struct tcphdr *th = tcp_hdr(skb);
+ struct tcp_sock *tp = tcp_sk(sk);
+ u16 hash;
+ __be32 seq = cpu_to_be32(TCP_SKB_CB(skb)->seq - 1);
+ char *data = skb->data + th->doff * 4;
+ int len = skb->len - th->doff * 4;
+
+ if (len < tp->stealth.integrity_len)
+ return 1;
+
+ if (tcp_stealth_integrity(&hash, tp->stealth.secret, data,
+ tp->stealth.integrity_len))
+ return 1;
+
+ if (be32_isn_to_be16_ih(seq) != cpu_to_be16(hash))
+ return 1;
+
+ tp->stealth.mode &= ~TCP_STEALTH_MODE_INTEGRITY_LEN;
+ return 0;
+}
+#endif
+
+static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ int eaten = -1;
+ bool fragstolen = false;
+
+ if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq)
+ goto drop;
+
+#ifdef CONFIG_TCP_STEALTH
+ if (unlikely(tp->stealth.mode & TCP_STEALTH_MODE_INTEGRITY_LEN) &&
+ __tcp_stealth_integrity_check(sk, skb)) {
+ tcp_reset(sk);
+ goto drop;
+ }
+#endif
+
+ skb_dst_drop(skb);
+ __skb_pull(skb, tcp_hdr(skb)->doff * 4);
+
+ tcp_ecn_accept_cwr(tp, skb);
+
+ tp->rx_opt.dsack = 0;
+
+ /* Queue data for delivery to the user.
+ * Packets in sequence go to the receive queue.
+ * Out of sequence packets to the out_of_order_queue.
+ */
+ if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) {
+ if (tcp_receive_window(tp) == 0)
+ goto out_of_window;
+
+ /* Ok. In sequence. In window. */
+ if (tp->ucopy.task == current &&
+ tp->copied_seq == tp->rcv_nxt && tp->ucopy.len &&
+ sock_owned_by_user(sk) && !tp->urg_data) {
+ int chunk = min_t(unsigned int, skb->len,
+ tp->ucopy.len);
+
+ __set_current_state(TASK_RUNNING);
+
+ local_bh_enable();
+ if (!skb_copy_datagram_msg(skb, 0, tp->ucopy.msg, chunk)) {
+ tp->ucopy.len -= chunk;
+ tp->copied_seq += chunk;
+ eaten = (chunk == skb->len);
+ tcp_rcv_space_adjust(sk);
+ }
+ local_bh_disable();
+ }
+
+ if (eaten <= 0) {
+queue_and_out:
+ if (eaten < 0 &&
+ tcp_try_rmem_schedule(sk, skb, skb->truesize))
+ goto drop;
+
+ eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen);
+ }
+ tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq);
+ if (skb->len)
+ tcp_event_data_recv(sk, skb);
+ if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
+ tcp_fin(sk);
+
+ if (!skb_queue_empty(&tp->out_of_order_queue)) {
+ tcp_ofo_queue(sk);
+
+ /* RFC2581. 4.2. SHOULD send immediate ACK, when
+ * gap in queue is filled.
+ */
+ if (skb_queue_empty(&tp->out_of_order_queue))
+ inet_csk(sk)->icsk_ack.pingpong = 0;
+ }
+
+ if (tp->rx_opt.num_sacks)
+ tcp_sack_remove(tp);
+
+ tcp_fast_path_check(sk);
+
+ if (eaten > 0)
+ kfree_skb_partial(skb, fragstolen);
+ if (!sock_flag(sk, SOCK_DEAD))
+ sk->sk_data_ready(sk);
+ return;
+ }
+
+ if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
+ /* A retransmit, 2nd most common case. Force an immediate ack. */
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
+ tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
+
+out_of_window:
+ tcp_enter_quickack_mode(sk);
+ inet_csk_schedule_ack(sk);
+drop:
+ __kfree_skb(skb);
+ return;
+ }
+
+ /* Out of window. F.e. zero window probe. */
+ if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt + tcp_receive_window(tp)))
+ goto out_of_window;
+
+ tcp_enter_quickack_mode(sk);
+
+ if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
+ /* Partial packet, seq < rcv_next < end_seq */
+ SOCK_DEBUG(sk, "partial packet: rcv_next %X seq %X - %X\n",
+ tp->rcv_nxt, TCP_SKB_CB(skb)->seq,
+ TCP_SKB_CB(skb)->end_seq);
+
+ tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, tp->rcv_nxt);
+
+ /* If window is closed, drop tail of packet. But after
+ * remembering D-SACK for its head made in previous line.
+ */
+ if (!tcp_receive_window(tp))
+ goto out_of_window;
+ goto queue_and_out;
+ }
+
+ tcp_data_queue_ofo(sk, skb);
+}
+
+static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb,
+ struct sk_buff_head *list)
+{
+ struct sk_buff *next = NULL;
+
+ if (!skb_queue_is_last(list, skb))
+ next = skb_queue_next(list, skb);
+
+ __skb_unlink(skb, list);
+ __kfree_skb(skb);
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOLLAPSED);
+
+ return next;
+}
+
+/* Collapse contiguous sequence of skbs head..tail with
+ * sequence numbers start..end.
+ *
+ * If tail is NULL, this means until the end of the list.
+ *
+ * Segments with FIN/SYN are not collapsed (only because this
+ * simplifies code)
+ */
+static void
+tcp_collapse(struct sock *sk, struct sk_buff_head *list,
+ struct sk_buff *head, struct sk_buff *tail,
+ u32 start, u32 end)
+{
+ struct sk_buff *skb, *n;
+ bool end_of_skbs;
+
+ /* First, check that queue is collapsible and find
+ * the point where collapsing can be useful. */
+ skb = head;
+restart:
+ end_of_skbs = true;
+ skb_queue_walk_from_safe(list, skb, n) {
+ if (skb == tail)
+ break;
+ /* No new bits? It is possible on ofo queue. */
+ if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
+ skb = tcp_collapse_one(sk, skb, list);
+ if (!skb)
+ break;
+ goto restart;
+ }
+
+ /* The first skb to collapse is:
+ * - not SYN/FIN and
+ * - bloated or contains data before "start" or
+ * overlaps to the next one.
+ */
+ if (!(TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)) &&
+ (tcp_win_from_space(skb->truesize) > skb->len ||
+ before(TCP_SKB_CB(skb)->seq, start))) {
+ end_of_skbs = false;
+ break;
+ }
+
+ if (!skb_queue_is_last(list, skb)) {
+ struct sk_buff *next = skb_queue_next(list, skb);
+ if (next != tail &&
+ TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(next)->seq) {
+ end_of_skbs = false;
+ break;
+ }
+ }
+
+ /* Decided to skip this, advance start seq. */
+ start = TCP_SKB_CB(skb)->end_seq;
+ }
+ if (end_of_skbs ||
+ (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)))
+ return;
+
+ while (before(start, end)) {
+ int copy = min_t(int, SKB_MAX_ORDER(0, 0), end - start);
+ struct sk_buff *nskb;
+
+ nskb = alloc_skb(copy, GFP_ATOMIC);
+ if (!nskb)
+ return;
+
+ memcpy(nskb->cb, skb->cb, sizeof(skb->cb));
+ TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start;
+ __skb_queue_before(list, skb, nskb);
+ skb_set_owner_r(nskb, sk);
+
+ /* Copy data, releasing collapsed skbs. */
+ while (copy > 0) {
+ int offset = start - TCP_SKB_CB(skb)->seq;
+ int size = TCP_SKB_CB(skb)->end_seq - start;
+
+ BUG_ON(offset < 0);
+ if (size > 0) {
+ size = min(copy, size);
+ if (skb_copy_bits(skb, offset, skb_put(nskb, size), size))
+ BUG();
+ TCP_SKB_CB(nskb)->end_seq += size;
+ copy -= size;
+ start += size;
+ }
+ if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
+ skb = tcp_collapse_one(sk, skb, list);
+ if (!skb ||
+ skb == tail ||
+ (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)))
+ return;
+ }
+ }
+ }
+}
+
+/* Collapse ofo queue. Algorithm: select contiguous sequence of skbs
+ * and tcp_collapse() them until all the queue is collapsed.
+ */
+static void tcp_collapse_ofo_queue(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *skb = skb_peek(&tp->out_of_order_queue);
+ struct sk_buff *head;
+ u32 start, end;
+
+ if (!skb)
+ return;
+
+ start = TCP_SKB_CB(skb)->seq;
+ end = TCP_SKB_CB(skb)->end_seq;
+ head = skb;
+
+ for (;;) {
+ struct sk_buff *next = NULL;
+
+ if (!skb_queue_is_last(&tp->out_of_order_queue, skb))
+ next = skb_queue_next(&tp->out_of_order_queue, skb);
+ skb = next;
+
+ /* Segment is terminated when we see gap or when
+ * we are at the end of all the queue. */
+ if (!skb ||
+ after(TCP_SKB_CB(skb)->seq, end) ||
+ before(TCP_SKB_CB(skb)->end_seq, start)) {
+ tcp_collapse(sk, &tp->out_of_order_queue,
+ head, skb, start, end);
+ head = skb;
+ if (!skb)
+ break;
+ /* Start new segment */
+ start = TCP_SKB_CB(skb)->seq;
+ end = TCP_SKB_CB(skb)->end_seq;
+ } else {
+ if (before(TCP_SKB_CB(skb)->seq, start))
+ start = TCP_SKB_CB(skb)->seq;
+ if (after(TCP_SKB_CB(skb)->end_seq, end))
+ end = TCP_SKB_CB(skb)->end_seq;
+ }
+ }
+}
+
+/*
+ * Purge the out-of-order queue.
+ * Return true if queue was pruned.
+ */
+static bool tcp_prune_ofo_queue(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ bool res = false;
+
+ if (!skb_queue_empty(&tp->out_of_order_queue)) {
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_OFOPRUNED);
+ __skb_queue_purge(&tp->out_of_order_queue);
+
+ /* Reset SACK state. A conforming SACK implementation will
+ * do the same at a timeout based retransmit. When a connection
+ * is in a sad state like this, we care only about integrity
+ * of the connection not performance.
+ */
+ if (tp->rx_opt.sack_ok)
+ tcp_sack_reset(&tp->rx_opt);
+ sk_mem_reclaim(sk);
+ res = true;
+ }
+ return res;
+}
+
+/* Reduce allocated memory if we can, trying to get
+ * the socket within its memory limits again.
+ *
+ * Return less than zero if we should start dropping frames
+ * until the socket owning process reads some of the data
+ * to stabilize the situation.
+ */
+static int tcp_prune_queue(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ SOCK_DEBUG(sk, "prune_queue: c=%x\n", tp->copied_seq);
+
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PRUNECALLED);
+
+ if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
+ tcp_clamp_window(sk);
+ else if (sk_under_memory_pressure(sk))
+ tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss);
+
+ tcp_collapse_ofo_queue(sk);
+ if (!skb_queue_empty(&sk->sk_receive_queue))
+ tcp_collapse(sk, &sk->sk_receive_queue,
+ skb_peek(&sk->sk_receive_queue),
+ NULL,
+ tp->copied_seq, tp->rcv_nxt);
+ sk_mem_reclaim(sk);
+
+ if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
+ return 0;
+
+ /* Collapsing did not help, destructive actions follow.
+ * This must not ever occur. */
+
+ tcp_prune_ofo_queue(sk);
+
+ if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
+ return 0;
+
+ /* If we are really being abused, tell the caller to silently
+ * drop receive data on the floor. It will get retransmitted
+ * and hopefully then we'll have sufficient space.
+ */
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_RCVPRUNED);
+
+ /* Massive buffer overcommit. */
+ tp->pred_flags = 0;
+ return -1;
+}
+
+static bool tcp_should_expand_sndbuf(const struct sock *sk)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+
+ /* If the user specified a specific send buffer setting, do
+ * not modify it.
+ */
+ if (sk->sk_userlocks & SOCK_SNDBUF_LOCK)
+ return false;
+
+ /* If we are under global TCP memory pressure, do not expand. */
+ if (sk_under_memory_pressure(sk))
+ return false;
+
+ /* If we are under soft global TCP memory pressure, do not expand. */
+ if (sk_memory_allocated(sk) >= sk_prot_mem_limits(sk, 0))
+ return false;
+
+ /* If we filled the congestion window, do not expand. */
+ if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
+ return false;
+
+ return true;
+}
+
+/* When incoming ACK allowed to free some skb from write_queue,
+ * we remember this event in flag SOCK_QUEUE_SHRUNK and wake up socket
+ * on the exit from tcp input handler.
+ *
+ * PROBLEM: sndbuf expansion does not work well with largesend.
+ */
+static void tcp_new_space(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (tcp_should_expand_sndbuf(sk)) {
+ tcp_sndbuf_expand(sk);
+ tp->snd_cwnd_stamp = tcp_time_stamp;
+ }
+
+ sk->sk_write_space(sk);
+}
+
+static void tcp_check_space(struct sock *sk)
+{
+ if (sock_flag(sk, SOCK_QUEUE_SHRUNK)) {
+ sock_reset_flag(sk, SOCK_QUEUE_SHRUNK);
+ /* pairs with tcp_poll() */
+ smp_mb__after_atomic();
+ if (sk->sk_socket &&
+ test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))
+ tcp_new_space(sk);
+ }
+}
+
+static inline void tcp_data_snd_check(struct sock *sk)
+{
+ tcp_push_pending_frames(sk);
+ tcp_check_space(sk);
+}
+
+/*
+ * Check if sending an ack is needed.
+ */
+static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ /* More than one full frame received... */
+ if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss &&
+ /* ... and right edge of window advances far enough.
+ * (tcp_recvmsg() will send ACK otherwise). Or...
+ */
+ __tcp_select_window(sk) >= tp->rcv_wnd) ||
+ /* We ACK each frame or... */
+ tcp_in_quickack_mode(sk) ||
+ /* We have out of order data. */
+ (ofo_possible && skb_peek(&tp->out_of_order_queue))) {
+ /* Then ack it now */
+ tcp_send_ack(sk);
+ } else {
+ /* Else, send delayed ack. */
+ tcp_send_delayed_ack(sk);
+ }
+}
+
+static inline void tcp_ack_snd_check(struct sock *sk)
+{
+ if (!inet_csk_ack_scheduled(sk)) {
+ /* We sent a data segment already. */
+ return;
+ }
+ __tcp_ack_snd_check(sk, 1);
+}
+
+/*
+ * This routine is only called when we have urgent data
+ * signaled. Its the 'slow' part of tcp_urg. It could be
+ * moved inline now as tcp_urg is only called from one
+ * place. We handle URGent data wrong. We have to - as
+ * BSD still doesn't use the correction from RFC961.
+ * For 1003.1g we should support a new option TCP_STDURG to permit
+ * either form (or just set the sysctl tcp_stdurg).
+ */
+
+static void tcp_check_urg(struct sock *sk, const struct tcphdr *th)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ u32 ptr = ntohs(th->urg_ptr);
+
+ if (ptr && !sysctl_tcp_stdurg)
+ ptr--;
+ ptr += ntohl(th->seq);
+
+ /* Ignore urgent data that we've already seen and read. */
+ if (after(tp->copied_seq, ptr))
+ return;
+
+ /* Do not replay urg ptr.
+ *
+ * NOTE: interesting situation not covered by specs.
+ * Misbehaving sender may send urg ptr, pointing to segment,
+ * which we already have in ofo queue. We are not able to fetch
+ * such data and will stay in TCP_URG_NOTYET until will be eaten
+ * by recvmsg(). Seems, we are not obliged to handle such wicked
+ * situations. But it is worth to think about possibility of some
+ * DoSes using some hypothetical application level deadlock.
+ */
+ if (before(ptr, tp->rcv_nxt))
+ return;
+
+ /* Do we already have a newer (or duplicate) urgent pointer? */
+ if (tp->urg_data && !after(ptr, tp->urg_seq))
+ return;
+
+ /* Tell the world about our new urgent pointer. */
+ sk_send_sigurg(sk);
+
+ /* We may be adding urgent data when the last byte read was
+ * urgent. To do this requires some care. We cannot just ignore
+ * tp->copied_seq since we would read the last urgent byte again
+ * as data, nor can we alter copied_seq until this data arrives
+ * or we break the semantics of SIOCATMARK (and thus sockatmark())
+ *
+ * NOTE. Double Dutch. Rendering to plain English: author of comment
+ * above did something sort of send("A", MSG_OOB); send("B", MSG_OOB);
+ * and expect that both A and B disappear from stream. This is _wrong_.
+ * Though this happens in BSD with high probability, this is occasional.
+ * Any application relying on this is buggy. Note also, that fix "works"
+ * only in this artificial test. Insert some normal data between A and B and we will
+ * decline of BSD again. Verdict: it is better to remove to trap
+ * buggy users.
+ */
+ if (tp->urg_seq == tp->copied_seq && tp->urg_data &&
+ !sock_flag(sk, SOCK_URGINLINE) && tp->copied_seq != tp->rcv_nxt) {
+ struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
+ tp->copied_seq++;
+ if (skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq)) {
+ __skb_unlink(skb, &sk->sk_receive_queue);
+ __kfree_skb(skb);
+ }
+ }
+
+ tp->urg_data = TCP_URG_NOTYET;
+ tp->urg_seq = ptr;
+
+ /* Disable header prediction. */
+ tp->pred_flags = 0;
+}
+
+/* This is the 'fast' part of urgent handling. */
+static void tcp_urg(struct sock *sk, struct sk_buff *skb, const struct tcphdr *th)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ /* Check if we get a new urgent pointer - normally not. */
+ if (th->urg)
+ tcp_check_urg(sk, th);
+
+ /* Do we wait for any urgent data? - normally not... */
+ if (tp->urg_data == TCP_URG_NOTYET) {
+ u32 ptr = tp->urg_seq - ntohl(th->seq) + (th->doff * 4) -
+ th->syn;
+
+ /* Is the urgent pointer pointing into this packet? */
+ if (ptr < skb->len) {
+ u8 tmp;
+ if (skb_copy_bits(skb, ptr, &tmp, 1))
+ BUG();
+ tp->urg_data = TCP_URG_VALID | tmp;
+ if (!sock_flag(sk, SOCK_DEAD))
+ sk->sk_data_ready(sk);
+ }
+ }
+}
+
+static int tcp_copy_to_iovec(struct sock *sk, struct sk_buff *skb, int hlen)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ int chunk = skb->len - hlen;
+ int err;
+
+ local_bh_enable();
+ if (skb_csum_unnecessary(skb))
+ err = skb_copy_datagram_msg(skb, hlen, tp->ucopy.msg, chunk);
+ else
+ err = skb_copy_and_csum_datagram_msg(skb, hlen, tp->ucopy.msg);
+
+ if (!err) {
+ tp->ucopy.len -= chunk;
+ tp->copied_seq += chunk;
+ tcp_rcv_space_adjust(sk);
+ }
+
+ local_bh_disable();
+ return err;
+}
+
+static __sum16 __tcp_checksum_complete_user(struct sock *sk,
+ struct sk_buff *skb)
+{
+ __sum16 result;
+
+ if (sock_owned_by_user(sk)) {
+ local_bh_enable();
+ result = __tcp_checksum_complete(skb);
+ local_bh_disable();
+ } else {
+ result = __tcp_checksum_complete(skb);
+ }
+ return result;
+}
+
+static inline bool tcp_checksum_complete_user(struct sock *sk,
+ struct sk_buff *skb)
+{
+ return !skb_csum_unnecessary(skb) &&
+ __tcp_checksum_complete_user(sk, skb);
+}
+
+/* Does PAWS and seqno based validation of an incoming segment, flags will
+ * play significant role here.
+ */
+static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
+ const struct tcphdr *th, int syn_inerr)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ /* RFC1323: H1. Apply PAWS check first. */
+ if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp &&
+ tcp_paws_discard(sk, skb)) {
+ if (!th->rst) {
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
+ if (!tcp_oow_rate_limited(sock_net(sk), skb,
+ LINUX_MIB_TCPACKSKIPPEDPAWS,
+ &tp->last_oow_ack_time))
+ tcp_send_dupack(sk, skb);
+ goto discard;
+ }
+ /* Reset is accepted even if it did not pass PAWS. */
+ }
+
+ /* Step 1: check sequence number */
+ if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)) {
+ /* RFC793, page 37: "In all states except SYN-SENT, all reset
+ * (RST) segments are validated by checking their SEQ-fields."
+ * And page 69: "If an incoming segment is not acceptable,
+ * an acknowledgment should be sent in reply (unless the RST
+ * bit is set, if so drop the segment and return)".
+ */
+ if (!th->rst) {
+ if (th->syn)
+ goto syn_challenge;
+ if (!tcp_oow_rate_limited(sock_net(sk), skb,
+ LINUX_MIB_TCPACKSKIPPEDSEQ,
+ &tp->last_oow_ack_time))
+ tcp_send_dupack(sk, skb);
+ }
+ goto discard;
+ }
+
+ /* Step 2: check RST bit */
+ if (th->rst) {
+ /* RFC 5961 3.2 :
+ * If sequence number exactly matches RCV.NXT, then
+ * RESET the connection
+ * else
+ * Send a challenge ACK
+ */
+ if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt)
+ tcp_reset(sk);
+ else
+ tcp_send_challenge_ack(sk, skb);
+ goto discard;
+ }
+
+ /* step 3: check security and precedence [ignored] */
+
+ /* step 4: Check for a SYN
+ * RFC 5961 4.2 : Send a challenge ack
+ */
+ if (th->syn) {
+syn_challenge:
+ if (syn_inerr)
+ TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNCHALLENGE);
+ tcp_send_challenge_ack(sk, skb);
+ goto discard;
+ }
+
+ return true;
+
+discard:
+ __kfree_skb(skb);
+ return false;
+}
+
+/*
+ * TCP receive function for the ESTABLISHED state.
+ *
+ * It is split into a fast path and a slow path. The fast path is
+ * disabled when:
+ * - A zero window was announced from us - zero window probing
+ * is only handled properly in the slow path.
+ * - Out of order segments arrived.
+ * - Urgent data is expected.
+ * - There is no buffer space left
+ * - Unexpected TCP flags/window values/header lengths are received
+ * (detected by checking the TCP header against pred_flags)
+ * - Data is sent in both directions. Fast path only supports pure senders
+ * or pure receivers (this means either the sequence number or the ack
+ * value must stay constant)
+ * - Unexpected TCP option.
+ *
+ * When these conditions are not satisfied it drops into a standard
+ * receive procedure patterned after RFC793 to handle all cases.
+ * The first three cases are guaranteed by proper pred_flags setting,
+ * the rest is checked inline. Fast processing is turned on in
+ * tcp_data_queue when everything is OK.
+ */
+void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
+ const struct tcphdr *th, unsigned int len)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (unlikely(!sk->sk_rx_dst))
+ inet_csk(sk)->icsk_af_ops->sk_rx_dst_set(sk, skb);
+ /*
+ * Header prediction.
+ * The code loosely follows the one in the famous
+ * "30 instruction TCP receive" Van Jacobson mail.
+ *
+ * Van's trick is to deposit buffers into socket queue
+ * on a device interrupt, to call tcp_recv function
+ * on the receive process context and checksum and copy
+ * the buffer to user space. smart...
+ *
+ * Our current scheme is not silly either but we take the
+ * extra cost of the net_bh soft interrupt processing...
+ * We do checksum and copy also but from device to kernel.
+ */
+
+ tp->rx_opt.saw_tstamp = 0;
+
+ /* pred_flags is 0xS?10 << 16 + snd_wnd
+ * if header_prediction is to be made
+ * 'S' will always be tp->tcp_header_len >> 2
+ * '?' will be 0 for the fast path, otherwise pred_flags is 0 to
+ * turn it off (when there are holes in the receive
+ * space for instance)
+ * PSH flag is ignored.
+ */
+
+ if ((tcp_flag_word(th) & TCP_HP_BITS) == tp->pred_flags &&
+ TCP_SKB_CB(skb)->seq == tp->rcv_nxt &&
+ !after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt)) {
+ int tcp_header_len = tp->tcp_header_len;
+
+ /* Timestamp header prediction: tcp_header_len
+ * is automatically equal to th->doff*4 due to pred_flags
+ * match.
+ */
+
+ /* Check timestamp */
+ if (tcp_header_len == sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) {
+ /* No? Slow path! */
+ if (!tcp_parse_aligned_timestamp(tp, th))
+ goto slow_path;
+
+ /* If PAWS failed, check it more carefully in slow path */
+ if ((s32)(tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent) < 0)
+ goto slow_path;
+
+ /* DO NOT update ts_recent here, if checksum fails
+ * and timestamp was corrupted part, it will result
+ * in a hung connection since we will drop all
+ * future packets due to the PAWS test.
+ */
+ }
+
+ if (len <= tcp_header_len) {
+ /* Bulk data transfer: sender */
+ if (len == tcp_header_len) {
+ /* Predicted packet is in window by definition.
+ * seq == rcv_nxt and rcv_wup <= rcv_nxt.
+ * Hence, check seq<=rcv_wup reduces to:
+ */
+ if (tcp_header_len ==
+ (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) &&
+ tp->rcv_nxt == tp->rcv_wup)
+ tcp_store_ts_recent(tp);
+
+ /* We know that such packets are checksummed
+ * on entry.
+ */
+ tcp_ack(sk, skb, 0);
+ __kfree_skb(skb);
+ tcp_data_snd_check(sk);
+ return;
+ } else { /* Header too small */
+ TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
+ goto discard;
+ }
+ } else {
+ int eaten = 0;
+ bool fragstolen = false;
+
+#ifdef CONFIG_TCP_STEALTH
+ if (unlikely(tp->stealth.mode &
+ TCP_STEALTH_MODE_INTEGRITY_LEN) &&
+ __tcp_stealth_integrity_check(sk, skb)) {
+ tcp_reset(sk);
+ goto discard;
+ }
+#endif
+
+ if (tp->ucopy.task == current &&
+ tp->copied_seq == tp->rcv_nxt &&
+ len - tcp_header_len <= tp->ucopy.len &&
+ sock_owned_by_user(sk)) {
+ __set_current_state(TASK_RUNNING);
+
+ if (!tcp_copy_to_iovec(sk, skb, tcp_header_len)) {
+ /* Predicted packet is in window by definition.
+ * seq == rcv_nxt and rcv_wup <= rcv_nxt.
+ * Hence, check seq<=rcv_wup reduces to:
+ */
+ if (tcp_header_len ==
+ (sizeof(struct tcphdr) +
+ TCPOLEN_TSTAMP_ALIGNED) &&
+ tp->rcv_nxt == tp->rcv_wup)
+ tcp_store_ts_recent(tp);
+
+ tcp_rcv_rtt_measure_ts(sk, skb);
+
+ __skb_pull(skb, tcp_header_len);
+ tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq);
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITSTOUSER);
+ eaten = 1;
+ }
+ }
+ if (!eaten) {
+ if (tcp_checksum_complete_user(sk, skb))
+ goto csum_error;
+
+ if ((int)skb->truesize > sk->sk_forward_alloc)
+ goto step5;
+
+ /* Predicted packet is in window by definition.
+ * seq == rcv_nxt and rcv_wup <= rcv_nxt.
+ * Hence, check seq<=rcv_wup reduces to:
+ */
+ if (tcp_header_len ==
+ (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) &&
+ tp->rcv_nxt == tp->rcv_wup)
+ tcp_store_ts_recent(tp);
+
+ tcp_rcv_rtt_measure_ts(sk, skb);
+
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITS);
+
+ /* Bulk data transfer: receiver */
+ eaten = tcp_queue_rcv(sk, skb, tcp_header_len,
+ &fragstolen);
+ }
+
+ tcp_event_data_recv(sk, skb);
+
+ if (TCP_SKB_CB(skb)->ack_seq != tp->snd_una) {
+ /* Well, only one small jumplet in fast path... */
+ tcp_ack(sk, skb, FLAG_DATA);
+ tcp_data_snd_check(sk);
+ if (!inet_csk_ack_scheduled(sk))
+ goto no_ack;
+ }
+
+ __tcp_ack_snd_check(sk, 0);
+no_ack:
+ if (eaten)
+ kfree_skb_partial(skb, fragstolen);
+ sk->sk_data_ready(sk);
+ return;
+ }
+ }
+
+slow_path:
+ if (len < (th->doff << 2) || tcp_checksum_complete_user(sk, skb))
+ goto csum_error;
+
+ if (!th->ack && !th->rst && !th->syn)
+ goto discard;
+
+ /*
+ * Standard slow path.
+ */
+
+ if (!tcp_validate_incoming(sk, skb, th, 1))
+ return;
+
+step5:
+ if (tcp_ack(sk, skb, FLAG_SLOWPATH | FLAG_UPDATE_TS_RECENT) < 0)
+ goto discard;
+
+ tcp_rcv_rtt_measure_ts(sk, skb);
+
+ /* Process urgent data. */
+ tcp_urg(sk, skb, th);
+
+ /* step 7: process the segment text */
+ tcp_data_queue(sk, skb);
+
+ tcp_data_snd_check(sk);
+ tcp_ack_snd_check(sk);
+ return;
+
+csum_error:
+ TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS);
+ TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
+
+discard:
+ __kfree_skb(skb);
+}
+EXPORT_SYMBOL(tcp_rcv_established);
+
+void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
+ tcp_set_state(sk, TCP_ESTABLISHED);
+
+ if (skb) {
+ icsk->icsk_af_ops->sk_rx_dst_set(sk, skb);
+ security_inet_conn_established(sk, skb);
+ }
+
+ /* Make sure socket is routed, for correct metrics. */
+ icsk->icsk_af_ops->rebuild_header(sk);
+
+ tcp_init_metrics(sk);
+
+ tcp_init_congestion_control(sk);
+
+ /* Prevent spurious tcp_cwnd_restart() on first data
+ * packet.
+ */
+ tp->lsndtime = tcp_time_stamp;
+
+ tcp_init_buffer_space(sk);
+
+ if (sock_flag(sk, SOCK_KEEPOPEN))
+ inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp));
+
+ if (!tp->rx_opt.snd_wscale)
+ __tcp_fast_path_on(tp, tp->snd_wnd);
+ else
+ tp->pred_flags = 0;
+
+ if (!sock_flag(sk, SOCK_DEAD)) {
+ sk->sk_state_change(sk);
+ sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT);
+ }
+}
+
+static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
+ struct tcp_fastopen_cookie *cookie)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *data = tp->syn_data ? tcp_write_queue_head(sk) : NULL;
+ u16 mss = tp->rx_opt.mss_clamp, try_exp = 0;
+ bool syn_drop = false;
+
+ if (mss == tp->rx_opt.user_mss) {
+ struct tcp_options_received opt;
+
+ /* Get original SYNACK MSS value if user MSS sets mss_clamp */
+ tcp_clear_options(&opt);
+ opt.user_mss = opt.mss_clamp = 0;
+ tcp_parse_options(synack, &opt, 0, NULL);
+ mss = opt.mss_clamp;
+ }
+
+ if (!tp->syn_fastopen) {
+ /* Ignore an unsolicited cookie */
+ cookie->len = -1;
+ } else if (tp->total_retrans) {
+ /* SYN timed out and the SYN-ACK neither has a cookie nor
+ * acknowledges data. Presumably the remote received only
+ * the retransmitted (regular) SYNs: either the original
+ * SYN-data or the corresponding SYN-ACK was dropped.
+ */
+ syn_drop = (cookie->len < 0 && data);
+ } else if (cookie->len < 0 && !tp->syn_data) {
+ /* We requested a cookie but didn't get it. If we did not use
+ * the (old) exp opt format then try so next time (try_exp=1).
+ * Otherwise we go back to use the RFC7413 opt (try_exp=2).
+ */
+ try_exp = tp->syn_fastopen_exp ? 2 : 1;
+ }
+
+ tcp_fastopen_cache_set(sk, mss, cookie, syn_drop, try_exp);
+
+ if (data) { /* Retransmit unacked data in SYN */
+ tcp_for_write_queue_from(data, sk) {
+ if (data == tcp_send_head(sk) ||
+ __tcp_retransmit_skb(sk, data))
+ break;
+ }
+ tcp_rearm_rto(sk);
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVEFAIL);
+ return true;
+ }
+ tp->syn_data_acked = tp->syn_data;
+ if (tp->syn_data_acked)
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVE);
+ return false;
+}
+
+static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
+ const struct tcphdr *th, unsigned int len)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct tcp_fastopen_cookie foc = { .len = -1 };
+ int saved_clamp = tp->rx_opt.mss_clamp;
+
+ tcp_parse_options(skb, &tp->rx_opt, 0, &foc);
+ if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
+ tp->rx_opt.rcv_tsecr -= tp->tsoffset;
+
+ if (th->ack) {
+ /* rfc793:
+ * "If the state is SYN-SENT then
+ * first check the ACK bit
+ * If the ACK bit is set
+ * If SEG.ACK =< ISS, or SEG.ACK > SND.NXT, send
+ * a reset (unless the RST bit is set, if so drop
+ * the segment and return)"
+ */
+ if (!after(TCP_SKB_CB(skb)->ack_seq, tp->snd_una) ||
+ after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt))
+ goto reset_and_undo;
+
+ if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
+ !between(tp->rx_opt.rcv_tsecr, tp->retrans_stamp,
+ tcp_time_stamp)) {
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSACTIVEREJECTED);
+ goto reset_and_undo;
+ }
+
+ /* Now ACK is acceptable.
+ *
+ * "If the RST bit is set
+ * If the ACK was acceptable then signal the user "error:
+ * connection reset", drop the segment, enter CLOSED state,
+ * delete TCB, and return."
+ */
+
+ if (th->rst) {
+ tcp_reset(sk);
+ goto discard;
+ }
+
+ /* rfc793:
+ * "fifth, if neither of the SYN or RST bits is set then
+ * drop the segment and return."
+ *
+ * See note below!
+ * --ANK(990513)
+ */
+ if (!th->syn)
+ goto discard_and_undo;
+
+ /* rfc793:
+ * "If the SYN bit is on ...
+ * are acceptable then ...
+ * (our SYN has been ACKed), change the connection
+ * state to ESTABLISHED..."
+ */
+
+ tcp_ecn_rcv_synack(tp, th);
+
+ tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
+ tcp_ack(sk, skb, FLAG_SLOWPATH);
+
+ /* Ok.. it's good. Set up sequence numbers and
+ * move to established.
+ */
+ tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
+ tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1;
+
+ /* RFC1323: The window in SYN & SYN/ACK segments is
+ * never scaled.
+ */
+ tp->snd_wnd = ntohs(th->window);
+
+ if (!tp->rx_opt.wscale_ok) {
+ tp->rx_opt.snd_wscale = tp->rx_opt.rcv_wscale = 0;
+ tp->window_clamp = min(tp->window_clamp, 65535U);
+ }
+
+ if (tp->rx_opt.saw_tstamp) {
+ tp->rx_opt.tstamp_ok = 1;
+ tp->tcp_header_len =
+ sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
+ tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
+ tcp_store_ts_recent(tp);
+ } else {
+ tp->tcp_header_len = sizeof(struct tcphdr);
+ }
+
+ if (tcp_is_sack(tp) && sysctl_tcp_fack)
+ tcp_enable_fack(tp);
+
+ tcp_mtup_init(sk);
+ tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
+ tcp_initialize_rcv_mss(sk);
+
+ /* Remember, tcp_poll() does not lock socket!
+ * Change state from SYN-SENT only after copied_seq
+ * is initialized. */
+ tp->copied_seq = tp->rcv_nxt;
+
+ smp_mb();
+
+ tcp_finish_connect(sk, skb);
+
+ if ((tp->syn_fastopen || tp->syn_data) &&
+ tcp_rcv_fastopen_synack(sk, skb, &foc))
+ return -1;
+
+ if (sk->sk_write_pending ||
+ icsk->icsk_accept_queue.rskq_defer_accept ||
+ icsk->icsk_ack.pingpong) {
+ /* Save one ACK. Data will be ready after
+ * several ticks, if write_pending is set.
+ *
+ * It may be deleted, but with this feature tcpdumps
+ * look so _wonderfully_ clever, that I was not able
+ * to stand against the temptation 8) --ANK
+ */
+ inet_csk_schedule_ack(sk);
+ icsk->icsk_ack.lrcvtime = tcp_time_stamp;
+ tcp_enter_quickack_mode(sk);
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
+ TCP_DELACK_MAX, TCP_RTO_MAX);
+
+discard:
+ __kfree_skb(skb);
+ return 0;
+ } else {
+ tcp_send_ack(sk);
+ }
+ return -1;
+ }
+
+ /* No ACK in the segment */
+
+ if (th->rst) {
+ /* rfc793:
+ * "If the RST bit is set
+ *
+ * Otherwise (no ACK) drop the segment and return."
+ */
+
+ goto discard_and_undo;
+ }
+
+ /* PAWS check. */
+ if (tp->rx_opt.ts_recent_stamp && tp->rx_opt.saw_tstamp &&
+ tcp_paws_reject(&tp->rx_opt, 0))
+ goto discard_and_undo;
+
+ if (th->syn) {
+ /* We see SYN without ACK. It is attempt of
+ * simultaneous connect with crossed SYNs.
+ * Particularly, it can be connect to self.
+ */
+ tcp_set_state(sk, TCP_SYN_RECV);
+
+ if (tp->rx_opt.saw_tstamp) {
+ tp->rx_opt.tstamp_ok = 1;
+ tcp_store_ts_recent(tp);
+ tp->tcp_header_len =
+ sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
+ } else {
+ tp->tcp_header_len = sizeof(struct tcphdr);
+ }
+
+ tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
+ tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1;
+
+ /* RFC1323: The window in SYN & SYN/ACK segments is
+ * never scaled.
+ */
+ tp->snd_wnd = ntohs(th->window);
+ tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
+ tp->max_window = tp->snd_wnd;
+
+ tcp_ecn_rcv_syn(tp, th);
+
+ tcp_mtup_init(sk);
+ tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
+ tcp_initialize_rcv_mss(sk);
+
+ tcp_send_synack(sk);
+#if 0
+ /* Note, we could accept data and URG from this segment.
+ * There are no obstacles to make this (except that we must
+ * either change tcp_recvmsg() to prevent it from returning data
+ * before 3WHS completes per RFC793, or employ TCP Fast Open).
+ *
+ * However, if we ignore data in ACKless segments sometimes,
+ * we have no reasons to accept it sometimes.
+ * Also, seems the code doing it in step6 of tcp_rcv_state_process
+ * is not flawless. So, discard packet for sanity.
+ * Uncomment this return to process the data.
+ */
+ return -1;
+#else
+ goto discard;
+#endif
+ }
+ /* "fifth, if neither of the SYN or RST bits is set then
+ * drop the segment and return."
+ */
+
+discard_and_undo:
+ tcp_clear_options(&tp->rx_opt);
+ tp->rx_opt.mss_clamp = saved_clamp;
+ goto discard;
+
+reset_and_undo:
+ tcp_clear_options(&tp->rx_opt);
+ tp->rx_opt.mss_clamp = saved_clamp;
+ return 1;
+}
+
+/*
+ * This function implements the receiving procedure of RFC 793 for
+ * all states except ESTABLISHED and TIME_WAIT.
+ * It's called from both tcp_v4_rcv and tcp_v6_rcv and should be
+ * address independent.
+ */
+
+int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
+ const struct tcphdr *th, unsigned int len)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct request_sock *req;
+ int queued = 0;
+ bool acceptable;
+ u32 synack_stamp;
+
+ tp->rx_opt.saw_tstamp = 0;
+
+ switch (sk->sk_state) {
+ case TCP_CLOSE:
+ goto discard;
+
+ case TCP_LISTEN:
+ if (th->ack)
+ return 1;
+
+ if (th->rst)
+ goto discard;
+
+ if (th->syn) {
+ if (th->fin)
+ goto discard;
+ if (icsk->icsk_af_ops->conn_request(sk, skb) < 0)
+ return 1;
+
+ /* Now we have several options: In theory there is
+ * nothing else in the frame. KA9Q has an option to
+ * send data with the syn, BSD accepts data with the
+ * syn up to the [to be] advertised window and
+ * Solaris 2.1 gives you a protocol error. For now
+ * we just ignore it, that fits the spec precisely
+ * and avoids incompatibilities. It would be nice in
+ * future to drop through and process the data.
+ *
+ * Now that TTCP is starting to be used we ought to
+ * queue this data.
+ * But, this leaves one open to an easy denial of
+ * service attack, and SYN cookies can't defend
+ * against this problem. So, we drop the data
+ * in the interest of security over speed unless
+ * it's still in use.
+ */
+ kfree_skb(skb);
+ return 0;
+ }
+ goto discard;
+
+ case TCP_SYN_SENT:
+ queued = tcp_rcv_synsent_state_process(sk, skb, th, len);
+ if (queued >= 0)
+ return queued;
+
+ /* Do step6 onward by hand. */
+ tcp_urg(sk, skb, th);
+ __kfree_skb(skb);
+ tcp_data_snd_check(sk);
+ return 0;
+ }
+
+ req = tp->fastopen_rsk;
+ if (req) {
+ WARN_ON_ONCE(sk->sk_state != TCP_SYN_RECV &&
+ sk->sk_state != TCP_FIN_WAIT1);
+
+ if (!tcp_check_req(sk, skb, req, true))
+ goto discard;
+ }
+
+ if (!th->ack && !th->rst && !th->syn)
+ goto discard;
+
+ if (!tcp_validate_incoming(sk, skb, th, 0))
+ return 0;
+
+ /* step 5: check the ACK field */
+ acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH |
+ FLAG_UPDATE_TS_RECENT) > 0;
+
+ switch (sk->sk_state) {
+ case TCP_SYN_RECV:
+ if (!acceptable)
+ return 1;
+
+ /* Once we leave TCP_SYN_RECV, we no longer need req
+ * so release it.
+ */
+ if (req) {
+ synack_stamp = tcp_rsk(req)->snt_synack;
+ tp->total_retrans = req->num_retrans;
+ reqsk_fastopen_remove(sk, req, false);
+ } else {
+ synack_stamp = tp->lsndtime;
+ /* Make sure socket is routed, for correct metrics. */
+ icsk->icsk_af_ops->rebuild_header(sk);
+ tcp_init_congestion_control(sk);
+
+ tcp_mtup_init(sk);
+ tp->copied_seq = tp->rcv_nxt;
+ tcp_init_buffer_space(sk);
+ }
+ smp_mb();
+ tcp_set_state(sk, TCP_ESTABLISHED);
+ sk->sk_state_change(sk);
+
+ /* Note, that this wakeup is only for marginal crossed SYN case.
+ * Passively open sockets are not waked up, because
+ * sk->sk_sleep == NULL and sk->sk_socket == NULL.
+ */
+ if (sk->sk_socket)
+ sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT);
+
+ tp->snd_una = TCP_SKB_CB(skb)->ack_seq;
+ tp->snd_wnd = ntohs(th->window) << tp->rx_opt.snd_wscale;
+ tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
+ tcp_synack_rtt_meas(sk, synack_stamp);
+
+ if (tp->rx_opt.tstamp_ok)
+ tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
+
+ if (req) {
+ /* Re-arm the timer because data may have been sent out.
+ * This is similar to the regular data transmission case
+ * when new data has just been ack'ed.
+ *
+ * (TFO) - we could try to be more aggressive and
+ * retransmitting any data sooner based on when they
+ * are sent out.
+ */
+ tcp_rearm_rto(sk);
+ } else
+ tcp_init_metrics(sk);
+
+ tcp_update_pacing_rate(sk);
+
+ /* Prevent spurious tcp_cwnd_restart() on first data packet */
+ tp->lsndtime = tcp_time_stamp;
+
+ tcp_initialize_rcv_mss(sk);
+ tcp_fast_path_on(tp);
+ break;
+
+ case TCP_FIN_WAIT1: {
+ struct dst_entry *dst;
+ int tmo;
+
+ /* If we enter the TCP_FIN_WAIT1 state and we are a
+ * Fast Open socket and this is the first acceptable
+ * ACK we have received, this would have acknowledged
+ * our SYNACK so stop the SYNACK timer.
+ */
+ if (req) {
+ /* Return RST if ack_seq is invalid.
+ * Note that RFC793 only says to generate a
+ * DUPACK for it but for TCP Fast Open it seems
+ * better to treat this case like TCP_SYN_RECV
+ * above.
+ */
+ if (!acceptable)
+ return 1;
+ /* We no longer need the request sock. */
+ reqsk_fastopen_remove(sk, req, false);
+ tcp_rearm_rto(sk);
+ }
+ if (tp->snd_una != tp->write_seq)
+ break;
+
+ tcp_set_state(sk, TCP_FIN_WAIT2);
+ sk->sk_shutdown |= SEND_SHUTDOWN;
+
+ dst = __sk_dst_get(sk);
+ if (dst)
+ dst_confirm(dst);
+
+ if (!sock_flag(sk, SOCK_DEAD)) {
+ /* Wake up lingering close() */
+ sk->sk_state_change(sk);
+ break;
+ }
+
+ if (tp->linger2 < 0 ||
+ (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
+ after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt))) {
+ tcp_done(sk);
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
+ return 1;
+ }
+
+ tmo = tcp_fin_time(sk);
+ if (tmo > TCP_TIMEWAIT_LEN) {
+ inet_csk_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN);
+ } else if (th->fin || sock_owned_by_user(sk)) {
+ /* Bad case. We could lose such FIN otherwise.
+ * It is not a big problem, but it looks confusing
+ * and not so rare event. We still can lose it now,
+ * if it spins in bh_lock_sock(), but it is really
+ * marginal case.
+ */
+ inet_csk_reset_keepalive_timer(sk, tmo);
+ } else {
+ tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
+ goto discard;
+ }
+ break;
+ }
+
+ case TCP_CLOSING:
+ if (tp->snd_una == tp->write_seq) {
+ tcp_time_wait(sk, TCP_TIME_WAIT, 0);
+ goto discard;
+ }
+ break;
+
+ case TCP_LAST_ACK:
+ if (tp->snd_una == tp->write_seq) {
+ tcp_update_metrics(sk);
+ tcp_done(sk);
+ goto discard;
+ }
+ break;
+ }
+
+ /* step 6: check the URG bit */
+ tcp_urg(sk, skb, th);
+
+ /* step 7: process the segment text */
+ switch (sk->sk_state) {
+ case TCP_CLOSE_WAIT:
+ case TCP_CLOSING:
+ case TCP_LAST_ACK:
+ if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
+ break;
+ case TCP_FIN_WAIT1:
+ case TCP_FIN_WAIT2:
+ /* RFC 793 says to queue data in these states,
+ * RFC 1122 says we MUST send a reset.
+ * BSD 4.4 also does reset.
+ */
+ if (sk->sk_shutdown & RCV_SHUTDOWN) {
+ if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
+ after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) {
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
+ tcp_reset(sk);
+ return 1;
+ }
+ }
+ /* Fall through */
+ case TCP_ESTABLISHED:
+ tcp_data_queue(sk, skb);
+ queued = 1;
+ break;
+ }
+
+ /* tcp_data could move socket to TIME-WAIT */
+ if (sk->sk_state != TCP_CLOSE) {
+ tcp_data_snd_check(sk);
+ tcp_ack_snd_check(sk);
+ }
+
+ if (!queued) {
+discard:
+ __kfree_skb(skb);
+ }
+ return 0;
+}
+EXPORT_SYMBOL(tcp_rcv_state_process);
+
+static inline void pr_drop_req(struct request_sock *req, __u16 port, int family)
+{
+ struct inet_request_sock *ireq = inet_rsk(req);
+
+ if (family == AF_INET)
+ net_dbg_ratelimited("drop open request from %pI4/%u\n",
+ &ireq->ir_rmt_addr, port);
+#if IS_ENABLED(CONFIG_IPV6)
+ else if (family == AF_INET6)
+ net_dbg_ratelimited("drop open request from %pI6/%u\n",
+ &ireq->ir_v6_rmt_addr, port);
+#endif
+}
+
+/* RFC3168 : 6.1.1 SYN packets must not have ECT/ECN bits set
+ *
+ * If we receive a SYN packet with these bits set, it means a
+ * network is playing bad games with TOS bits. In order to
+ * avoid possible false congestion notifications, we disable
+ * TCP ECN negotiation.
+ *
+ * Exception: tcp_ca wants ECN. This is required for DCTCP
+ * congestion control: Linux DCTCP asserts ECT on all packets,
+ * including SYN, which is most optimal solution; however,
+ * others, such as FreeBSD do not.
+ */
+static void tcp_ecn_create_request(struct request_sock *req,
+ const struct sk_buff *skb,
+ const struct sock *listen_sk,
+ const struct dst_entry *dst)
+{
+ const struct tcphdr *th = tcp_hdr(skb);
+ const struct net *net = sock_net(listen_sk);
+ bool th_ecn = th->ece && th->cwr;
+ bool ect, ecn_ok;
+
+ if (!th_ecn)
+ return;
+
+ ect = !INET_ECN_is_not_ect(TCP_SKB_CB(skb)->ip_dsfield);
+ ecn_ok = net->ipv4.sysctl_tcp_ecn || dst_feature(dst, RTAX_FEATURE_ECN);
+
+ if ((!ect && ecn_ok) || tcp_ca_needs_ecn(listen_sk))
+ inet_rsk(req)->ecn_ok = 1;
+}
+
+static void tcp_openreq_init(struct request_sock *req,
+ const struct tcp_options_received *rx_opt,
+ struct sk_buff *skb, const struct sock *sk)
+{
+ struct inet_request_sock *ireq = inet_rsk(req);
+
+ req->rcv_wnd = 0; /* So that tcp_send_synack() knows! */
+ req->cookie_ts = 0;
+ tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq;
+ tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
+ tcp_rsk(req)->snt_synack = tcp_time_stamp;
+ tcp_rsk(req)->last_oow_ack_time = 0;
+ req->mss = rx_opt->mss_clamp;
+ req->ts_recent = rx_opt->saw_tstamp ? rx_opt->rcv_tsval : 0;
+ ireq->tstamp_ok = rx_opt->tstamp_ok;
+ ireq->sack_ok = rx_opt->sack_ok;
+ ireq->snd_wscale = rx_opt->snd_wscale;
+ ireq->wscale_ok = rx_opt->wscale_ok;
+ ireq->acked = 0;
+ ireq->ecn_ok = 0;
+ ireq->ir_rmt_port = tcp_hdr(skb)->source;
+ ireq->ir_num = ntohs(tcp_hdr(skb)->dest);
+ ireq->ir_mark = inet_request_mark(sk, skb);
+}
+
+struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops,
+ struct sock *sk_listener)
+{
+ struct request_sock *req = reqsk_alloc(ops, sk_listener);
+
+ if (req) {
+ struct inet_request_sock *ireq = inet_rsk(req);
+
+ kmemcheck_annotate_bitfield(ireq, flags);
+ ireq->opt = NULL;
+ atomic64_set(&ireq->ir_cookie, 0);
+ ireq->ireq_state = TCP_NEW_SYN_RECV;
+ write_pnet(&ireq->ireq_net, sock_net(sk_listener));
+ ireq->ireq_family = sk_listener->sk_family;
+ }
+
+ return req;
+}
+EXPORT_SYMBOL(inet_reqsk_alloc);
+
+/*
+ * Return true if a syncookie should be sent
+ */
+static bool tcp_syn_flood_action(struct sock *sk,
+ const struct sk_buff *skb,
+ const char *proto)
+{
+ const char *msg = "Dropping request";
+ bool want_cookie = false;
+ struct listen_sock *lopt;
+
+#ifdef CONFIG_SYN_COOKIES
+ if (sysctl_tcp_syncookies) {
+ msg = "Sending cookies";
+ want_cookie = true;
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES);
+ } else
+#endif
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);
+
+ lopt = inet_csk(sk)->icsk_accept_queue.listen_opt;
+ if (!lopt->synflood_warned && sysctl_tcp_syncookies != 2) {
+ lopt->synflood_warned = 1;
+ pr_info("%s: Possible SYN flooding on port %d. %s. Check SNMP counters.\n",
+ proto, ntohs(tcp_hdr(skb)->dest), msg);
+ }
+ return want_cookie;
+}
+
+int tcp_conn_request(struct request_sock_ops *rsk_ops,
+ const struct tcp_request_sock_ops *af_ops,
+ struct sock *sk, struct sk_buff *skb)
+{
+ struct tcp_options_received tmp_opt;
+ struct request_sock *req;
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct dst_entry *dst = NULL;
+ __u32 isn = TCP_SKB_CB(skb)->tcp_tw_isn;
+ bool want_cookie = false, fastopen;
+ struct flowi fl;
+ struct tcp_fastopen_cookie foc = { .len = -1 };
+ int err;
+
+
+ /* TW buckets are converted to open requests without
+ * limitations, they conserve resources and peer is
+ * evidently real one.
+ */
+ if ((sysctl_tcp_syncookies == 2 ||
+ inet_csk_reqsk_queue_is_full(sk)) && !isn) {
+ want_cookie = tcp_syn_flood_action(sk, skb, rsk_ops->slab_name);
+ if (!want_cookie)
+ goto drop;
+ }
+
+
+ /* Accept backlog is full. If we have already queued enough
+ * of warm entries in syn queue, drop request. It is better than
+ * clogging syn queue with openreqs with exponentially increasing
+ * timeout.
+ */
+ if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) {
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
+ goto drop;
+ }
+
+ req = inet_reqsk_alloc(rsk_ops, sk);
+ if (!req)
+ goto drop;
+
+ tcp_rsk(req)->af_specific = af_ops;
+
+ tcp_clear_options(&tmp_opt);
+ tmp_opt.mss_clamp = af_ops->mss_clamp;
+ tmp_opt.user_mss = tp->rx_opt.user_mss;
+ tcp_parse_options(skb, &tmp_opt, 0, want_cookie ? NULL : &foc);
+
+ if (want_cookie && !tmp_opt.saw_tstamp)
+ tcp_clear_options(&tmp_opt);
+
+ tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
+ tcp_openreq_init(req, &tmp_opt, skb, sk);
+
+ /* Note: tcp_v6_init_req() might override ir_iif for link locals */
+ inet_rsk(req)->ir_iif = sk->sk_bound_dev_if;
+
+ af_ops->init_req(req, sk, skb);
+
+ if (security_inet_conn_request(sk, skb, req))
+ goto drop_and_free;
+
+ if (!want_cookie && !isn) {
+ /* VJ's idea. We save last timestamp seen
+ * from the destination in peer table, when entering
+ * state TIME-WAIT, and check against it before
+ * accepting new connection request.
+ *
+ * If "isn" is not zero, this request hit alive
+ * timewait bucket, so that all the necessary checks
+ * are made in the function processing timewait state.
+ */
+ if (tcp_death_row.sysctl_tw_recycle) {
+ bool strict;
+
+ dst = af_ops->route_req(sk, &fl, req, &strict);
+
+ if (dst && strict &&
+ !tcp_peer_is_proven(req, dst, true,
+ tmp_opt.saw_tstamp)) {
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
+ goto drop_and_release;
+ }
+ }
+ /* Kill the following clause, if you dislike this way. */
+ else if (!sysctl_tcp_syncookies &&
+ (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
+ (sysctl_max_syn_backlog >> 2)) &&
+ !tcp_peer_is_proven(req, dst, false,
+ tmp_opt.saw_tstamp)) {
+ /* Without syncookies last quarter of
+ * backlog is filled with destinations,
+ * proven to be alive.
+ * It means that we continue to communicate
+ * to destinations, already remembered
+ * to the moment of synflood.
+ */
+ pr_drop_req(req, ntohs(tcp_hdr(skb)->source),
+ rsk_ops->family);
+ goto drop_and_release;
+ }
+
+ isn = af_ops->init_seq(skb);
+ }
+ if (!dst) {
+ dst = af_ops->route_req(sk, &fl, req, NULL);
+ if (!dst)
+ goto drop_and_free;
+ }
+
+ tcp_ecn_create_request(req, skb, sk, dst);
+
+ if (want_cookie) {
+ isn = cookie_init_sequence(af_ops, sk, skb, &req->mss);
+ req->cookie_ts = tmp_opt.tstamp_ok;
+ if (!tmp_opt.tstamp_ok)
+ inet_rsk(req)->ecn_ok = 0;
+ }
+
+ tcp_rsk(req)->snt_isn = isn;
+ tcp_openreq_init_rwin(req, sk, dst);
+ fastopen = !want_cookie &&
+ tcp_try_fastopen(sk, skb, req, &foc, dst);
+ err = af_ops->send_synack(sk, dst, &fl, req,
+ skb_get_queue_mapping(skb), &foc);
+ if (!fastopen) {
+ if (err || want_cookie)
+ goto drop_and_free;
+
+ tcp_rsk(req)->tfo_listener = false;
+ af_ops->queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
+ }
+
+ return 0;
+
+drop_and_release:
+ dst_release(dst);
+drop_and_free:
+ reqsk_free(req);
+drop:
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
+ return 0;
+}
+EXPORT_SYMBOL(tcp_conn_request);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
new file mode 100644
index 000000000..1e0ce4d7b
--- /dev/null
+++ b/net/ipv4/tcp_ipv4.c
@@ -0,0 +1,2468 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * Implementation of the Transmission Control Protocol(TCP).
+ *
+ * IPv4 specific functions
+ *
+ *
+ * code split from:
+ * linux/ipv4/tcp.c
+ * linux/ipv4/tcp_input.c
+ * linux/ipv4/tcp_output.c
+ *
+ * See tcp.c for author information
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+/*
+ * Changes:
+ * David S. Miller : New socket lookup architecture.
+ * This code is dedicated to John Dyson.
+ * David S. Miller : Change semantics of established hash,
+ * half is devoted to TIME_WAIT sockets
+ * and the rest go in the other half.
+ * Andi Kleen : Add support for syncookies and fixed
+ * some bugs: ip options weren't passed to
+ * the TCP layer, missed a check for an
+ * ACK bit.
+ * Andi Kleen : Implemented fast path mtu discovery.
+ * Fixed many serious bugs in the
+ * request_sock handling and moved
+ * most of it into the af independent code.
+ * Added tail drop and some other bugfixes.
+ * Added new listen semantics.
+ * Mike McLagan : Routing by source
+ * Juan Jose Ciarlante: ip_dynaddr bits
+ * Andi Kleen: various fixes.
+ * Vitaly E. Lavrov : Transparent proxy revived after year
+ * coma.
+ * Andi Kleen : Fix new listen.
+ * Andi Kleen : Fix accept error reporting.
+ * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
+ * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind
+ * a single port at the same time.
+ */
+
+#define pr_fmt(fmt) "TCP: " fmt
+
+#include <linux/bottom_half.h>
+#include <linux/types.h>
+#include <linux/fcntl.h>
+#include <linux/module.h>
+#include <linux/random.h>
+#include <linux/cache.h>
+#include <linux/jhash.h>
+#include <linux/init.h>
+#include <linux/times.h>
+#include <linux/slab.h>
+
+#include <net/net_namespace.h>
+#include <net/icmp.h>
+#include <net/inet_hashtables.h>
+#include <net/tcp.h>
+#include <net/transp_v6.h>
+#include <net/ipv6.h>
+#include <net/inet_common.h>
+#include <net/timewait_sock.h>
+#include <net/xfrm.h>
+#include <net/secure_seq.h>
+#include <net/tcp_memcontrol.h>
+#include <net/busy_poll.h>
+#include <net/secure_seq.h>
+
+#include <linux/inet.h>
+#include <linux/ipv6.h>
+#include <linux/stddef.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+
+#include <linux/crypto.h>
+#include <linux/scatterlist.h>
+
+int sysctl_tcp_tw_reuse __read_mostly;
+int sysctl_tcp_low_latency __read_mostly;
+EXPORT_SYMBOL(sysctl_tcp_low_latency);
+
+#ifdef CONFIG_TCP_MD5SIG
+static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
+ __be32 daddr, __be32 saddr, const struct tcphdr *th);
+#endif
+
+struct inet_hashinfo tcp_hashinfo;
+EXPORT_SYMBOL(tcp_hashinfo);
+
+static __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
+{
+ return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
+ ip_hdr(skb)->saddr,
+ tcp_hdr(skb)->dest,
+ tcp_hdr(skb)->source);
+}
+
+int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
+{
+ const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ /* With PAWS, it is safe from the viewpoint
+ of data integrity. Even without PAWS it is safe provided sequence
+ spaces do not overlap i.e. at data rates <= 80Mbit/sec.
+
+ Actually, the idea is close to VJ's one, only timestamp cache is
+ held not per host, but per port pair and TW bucket is used as state
+ holder.
+
+ If TW bucket has been already destroyed we fall back to VJ's scheme
+ and use initial timestamp retrieved from peer table.
+ */
+ if (tcptw->tw_ts_recent_stamp &&
+ (!twp || (sysctl_tcp_tw_reuse &&
+ get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
+ tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
+ if (tp->write_seq == 0)
+ tp->write_seq = 1;
+ tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
+ tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
+ sock_hold(sktw);
+ return 1;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(tcp_twsk_unique);
+
+/* This will initiate an outgoing connection. */
+int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+{
+ struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
+ struct inet_sock *inet = inet_sk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+ __be16 orig_sport, orig_dport;
+ __be32 daddr, nexthop;
+ struct flowi4 *fl4;
+ struct rtable *rt;
+ int err;
+ struct ip_options_rcu *inet_opt;
+
+ if (addr_len < sizeof(struct sockaddr_in))
+ return -EINVAL;
+
+ if (usin->sin_family != AF_INET)
+ return -EAFNOSUPPORT;
+
+ nexthop = daddr = usin->sin_addr.s_addr;
+ inet_opt = rcu_dereference_protected(inet->inet_opt,
+ sock_owned_by_user(sk));
+ if (inet_opt && inet_opt->opt.srr) {
+ if (!daddr)
+ return -EINVAL;
+ nexthop = inet_opt->opt.faddr;
+ }
+
+ orig_sport = inet->inet_sport;
+ orig_dport = usin->sin_port;
+ fl4 = &inet->cork.fl.u.ip4;
+ rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
+ RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
+ IPPROTO_TCP,
+ orig_sport, orig_dport, sk);
+ if (IS_ERR(rt)) {
+ err = PTR_ERR(rt);
+ if (err == -ENETUNREACH)
+ IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
+ return err;
+ }
+
+ if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
+ ip_rt_put(rt);
+ return -ENETUNREACH;
+ }
+
+ if (!inet_opt || !inet_opt->opt.srr)
+ daddr = fl4->daddr;
+
+ if (!inet->inet_saddr)
+ inet->inet_saddr = fl4->saddr;
+ sk_rcv_saddr_set(sk, inet->inet_saddr);
+
+ if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
+ /* Reset inherited state */
+ tp->rx_opt.ts_recent = 0;
+ tp->rx_opt.ts_recent_stamp = 0;
+ if (likely(!tp->repair))
+ tp->write_seq = 0;
+ }
+
+ if (tcp_death_row.sysctl_tw_recycle &&
+ !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
+ tcp_fetch_timewait_stamp(sk, &rt->dst);
+
+ inet->inet_dport = usin->sin_port;
+ sk_daddr_set(sk, daddr);
+
+ inet_csk(sk)->icsk_ext_hdr_len = 0;
+ if (inet_opt)
+ inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
+
+ tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
+
+ /* Socket identity is still unknown (sport may be zero).
+ * However we set state to SYN-SENT and not releasing socket
+ * lock select source port, enter ourselves into the hash tables and
+ * complete initialization after this.
+ */
+ tcp_set_state(sk, TCP_SYN_SENT);
+ err = inet_hash_connect(&tcp_death_row, sk);
+ if (err)
+ goto failure;
+
+ inet_set_txhash(sk);
+
+ rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
+ inet->inet_sport, inet->inet_dport, sk);
+ if (IS_ERR(rt)) {
+ err = PTR_ERR(rt);
+ rt = NULL;
+ goto failure;
+ }
+ /* OK, now commit destination to socket. */
+ sk->sk_gso_type = SKB_GSO_TCPV4;
+ sk_setup_caps(sk, &rt->dst);
+
+#ifdef CONFIG_TCP_STEALTH
+ /* If CONFIG_TCP_STEALTH is defined, we need to know the timestamp as
+ * early as possible and thus move taking the snapshot of tcp_time_stamp
+ * here.
+ */
+ skb_mstamp_get(&tp->stealth.mstamp);
+
+ if (!tp->write_seq && likely(!tp->repair) &&
+ unlikely(tp->stealth.mode & TCP_STEALTH_MODE_AUTH))
+ tp->write_seq = tcp_stealth_sequence_number(sk,
+ &inet->inet_daddr,
+ sizeof(inet->inet_daddr),
+ usin->sin_port);
+#endif
+
+ if (!tp->write_seq && likely(!tp->repair))
+ tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
+ inet->inet_daddr,
+ inet->inet_sport,
+ usin->sin_port);
+
+ inet->inet_id = tp->write_seq ^ jiffies;
+
+ err = tcp_connect(sk);
+
+ rt = NULL;
+ if (err)
+ goto failure;
+
+ return 0;
+
+failure:
+ /*
+ * This unhashes the socket and releases the local port,
+ * if necessary.
+ */
+ tcp_set_state(sk, TCP_CLOSE);
+ ip_rt_put(rt);
+ sk->sk_route_caps = 0;
+ inet->inet_dport = 0;
+ return err;
+}
+EXPORT_SYMBOL(tcp_v4_connect);
+
+/*
+ * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
+ * It can be called through tcp_release_cb() if socket was owned by user
+ * at the time tcp_v4_err() was called to handle ICMP message.
+ */
+void tcp_v4_mtu_reduced(struct sock *sk)
+{
+ struct dst_entry *dst;
+ struct inet_sock *inet = inet_sk(sk);
+ u32 mtu = tcp_sk(sk)->mtu_info;
+
+ dst = inet_csk_update_pmtu(sk, mtu);
+ if (!dst)
+ return;
+
+ /* Something is about to be wrong... Remember soft error
+ * for the case, if this connection will not able to recover.
+ */
+ if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
+ sk->sk_err_soft = EMSGSIZE;
+
+ mtu = dst_mtu(dst);
+
+ if (inet->pmtudisc != IP_PMTUDISC_DONT &&
+ ip_sk_accept_pmtu(sk) &&
+ inet_csk(sk)->icsk_pmtu_cookie > mtu) {
+ tcp_sync_mss(sk, mtu);
+
+ /* Resend the TCP packet because it's
+ * clear that the old packet has been
+ * dropped. This is the new "fast" path mtu
+ * discovery.
+ */
+ tcp_simple_retransmit(sk);
+ } /* else let the usual retransmit timer handle it */
+}
+EXPORT_SYMBOL(tcp_v4_mtu_reduced);
+
+static void do_redirect(struct sk_buff *skb, struct sock *sk)
+{
+ struct dst_entry *dst = __sk_dst_check(sk, 0);
+
+ if (dst)
+ dst->ops->redirect(dst, sk, skb);
+}
+
+
+/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
+void tcp_req_err(struct sock *sk, u32 seq)
+{
+ struct request_sock *req = inet_reqsk(sk);
+ struct net *net = sock_net(sk);
+
+ /* ICMPs are not backlogged, hence we cannot get
+ * an established socket here.
+ */
+ WARN_ON(req->sk);
+
+ if (seq != tcp_rsk(req)->snt_isn) {
+ NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
+ reqsk_put(req);
+ } else {
+ /*
+ * Still in SYN_RECV, just remove it silently.
+ * There is no good way to pass the error to the newly
+ * created socket, and POSIX does not want network
+ * errors returned from accept().
+ */
+ NET_INC_STATS_BH(net, LINUX_MIB_LISTENDROPS);
+ inet_csk_reqsk_queue_drop(req->rsk_listener, req);
+ }
+}
+EXPORT_SYMBOL(tcp_req_err);
+
+/*
+ * This routine is called by the ICMP module when it gets some
+ * sort of error condition. If err < 0 then the socket should
+ * be closed and the error returned to the user. If err > 0
+ * it's just the icmp type << 8 | icmp code. After adjustment
+ * header points to the first 8 bytes of the tcp header. We need
+ * to find the appropriate port.
+ *
+ * The locking strategy used here is very "optimistic". When
+ * someone else accesses the socket the ICMP is just dropped
+ * and for some paths there is no check at all.
+ * A more general error queue to queue errors for later handling
+ * is probably better.
+ *
+ */
+
+void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
+{
+ const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
+ struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
+ struct inet_connection_sock *icsk;
+ struct tcp_sock *tp;
+ struct inet_sock *inet;
+ const int type = icmp_hdr(icmp_skb)->type;
+ const int code = icmp_hdr(icmp_skb)->code;
+ struct sock *sk;
+ struct sk_buff *skb;
+ struct request_sock *fastopen;
+ __u32 seq, snd_una;
+ __u32 remaining;
+ int err;
+ struct net *net = dev_net(icmp_skb->dev);
+
+ sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
+ th->dest, iph->saddr, ntohs(th->source),
+ inet_iif(icmp_skb));
+ if (!sk) {
+ ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
+ return;
+ }
+ if (sk->sk_state == TCP_TIME_WAIT) {
+ inet_twsk_put(inet_twsk(sk));
+ return;
+ }
+ seq = ntohl(th->seq);
+ if (sk->sk_state == TCP_NEW_SYN_RECV)
+ return tcp_req_err(sk, seq);
+
+ bh_lock_sock(sk);
+ /* If too many ICMPs get dropped on busy
+ * servers this needs to be solved differently.
+ * We do take care of PMTU discovery (RFC1191) special case :
+ * we can receive locally generated ICMP messages while socket is held.
+ */
+ if (sock_owned_by_user(sk)) {
+ if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
+ NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
+ }
+ if (sk->sk_state == TCP_CLOSE)
+ goto out;
+
+ if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
+ NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
+ goto out;
+ }
+
+ icsk = inet_csk(sk);
+ tp = tcp_sk(sk);
+ /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
+ fastopen = tp->fastopen_rsk;
+ snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
+ if (sk->sk_state != TCP_LISTEN &&
+ !between(seq, snd_una, tp->snd_nxt)) {
+ NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
+ goto out;
+ }
+
+ switch (type) {
+ case ICMP_REDIRECT:
+ do_redirect(icmp_skb, sk);
+ goto out;
+ case ICMP_SOURCE_QUENCH:
+ /* Just silently ignore these. */
+ goto out;
+ case ICMP_PARAMETERPROB:
+ err = EPROTO;
+ break;
+ case ICMP_DEST_UNREACH:
+ if (code > NR_ICMP_UNREACH)
+ goto out;
+
+ if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
+ /* We are not interested in TCP_LISTEN and open_requests
+ * (SYN-ACKs send out by Linux are always <576bytes so
+ * they should go through unfragmented).
+ */
+ if (sk->sk_state == TCP_LISTEN)
+ goto out;
+
+ tp->mtu_info = info;
+ if (!sock_owned_by_user(sk)) {
+ tcp_v4_mtu_reduced(sk);
+ } else {
+ if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags))
+ sock_hold(sk);
+ }
+ goto out;
+ }
+
+ err = icmp_err_convert[code].errno;
+ /* check if icmp_skb allows revert of backoff
+ * (see draft-zimmermann-tcp-lcd) */
+ if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
+ break;
+ if (seq != tp->snd_una || !icsk->icsk_retransmits ||
+ !icsk->icsk_backoff || fastopen)
+ break;
+
+ if (sock_owned_by_user(sk))
+ break;
+
+ icsk->icsk_backoff--;
+ icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
+ TCP_TIMEOUT_INIT;
+ icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
+
+ skb = tcp_write_queue_head(sk);
+ BUG_ON(!skb);
+
+ remaining = icsk->icsk_rto -
+ min(icsk->icsk_rto,
+ tcp_time_stamp - tcp_skb_timestamp(skb));
+
+ if (remaining) {
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+ remaining, TCP_RTO_MAX);
+ } else {
+ /* RTO revert clocked out retransmission.
+ * Will retransmit now */
+ tcp_retransmit_timer(sk);
+ }
+
+ break;
+ case ICMP_TIME_EXCEEDED:
+ err = EHOSTUNREACH;
+ break;
+ default:
+ goto out;
+ }
+
+ switch (sk->sk_state) {
+ case TCP_SYN_SENT:
+ case TCP_SYN_RECV:
+ /* Only in fast or simultaneous open. If a fast open socket is
+ * is already accepted it is treated as a connected one below.
+ */
+ if (fastopen && !fastopen->sk)
+ break;
+
+ if (!sock_owned_by_user(sk)) {
+ sk->sk_err = err;
+
+ sk->sk_error_report(sk);
+
+ tcp_done(sk);
+ } else {
+ sk->sk_err_soft = err;
+ }
+ goto out;
+ }
+
+ /* If we've already connected we will keep trying
+ * until we time out, or the user gives up.
+ *
+ * rfc1122 4.2.3.9 allows to consider as hard errors
+ * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
+ * but it is obsoleted by pmtu discovery).
+ *
+ * Note, that in modern internet, where routing is unreliable
+ * and in each dark corner broken firewalls sit, sending random
+ * errors ordered by their masters even this two messages finally lose
+ * their original sense (even Linux sends invalid PORT_UNREACHs)
+ *
+ * Now we are in compliance with RFCs.
+ * --ANK (980905)
+ */
+
+ inet = inet_sk(sk);
+ if (!sock_owned_by_user(sk) && inet->recverr) {
+ sk->sk_err = err;
+ sk->sk_error_report(sk);
+ } else { /* Only an error on timeout */
+ sk->sk_err_soft = err;
+ }
+
+out:
+ bh_unlock_sock(sk);
+ sock_put(sk);
+}
+
+void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
+{
+ struct tcphdr *th = tcp_hdr(skb);
+
+ if (skb->ip_summed == CHECKSUM_PARTIAL) {
+ th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
+ skb->csum_start = skb_transport_header(skb) - skb->head;
+ skb->csum_offset = offsetof(struct tcphdr, check);
+ } else {
+ th->check = tcp_v4_check(skb->len, saddr, daddr,
+ csum_partial(th,
+ th->doff << 2,
+ skb->csum));
+ }
+}
+
+/* This routine computes an IPv4 TCP checksum. */
+void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
+{
+ const struct inet_sock *inet = inet_sk(sk);
+
+ __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
+}
+EXPORT_SYMBOL(tcp_v4_send_check);
+
+/*
+ * This routine will send an RST to the other tcp.
+ *
+ * Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
+ * for reset.
+ * Answer: if a packet caused RST, it is not for a socket
+ * existing in our system, if it is matched to a socket,
+ * it is just duplicate segment or bug in other side's TCP.
+ * So that we build reply only basing on parameters
+ * arrived with segment.
+ * Exception: precedence violation. We do not implement it in any case.
+ */
+
+static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
+{
+ const struct tcphdr *th = tcp_hdr(skb);
+ struct {
+ struct tcphdr th;
+#ifdef CONFIG_TCP_MD5SIG
+ __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
+#endif
+ } rep;
+ struct ip_reply_arg arg;
+#ifdef CONFIG_TCP_MD5SIG
+ struct tcp_md5sig_key *key;
+ const __u8 *hash_location = NULL;
+ unsigned char newhash[16];
+ int genhash;
+ struct sock *sk1 = NULL;
+#endif
+ struct net *net;
+
+ /* Never send a reset in response to a reset. */
+ if (th->rst)
+ return;
+
+ /* If sk not NULL, it means we did a successful lookup and incoming
+ * route had to be correct. prequeue might have dropped our dst.
+ */
+ if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
+ return;
+
+ /* Swap the send and the receive. */
+ memset(&rep, 0, sizeof(rep));
+ rep.th.dest = th->source;
+ rep.th.source = th->dest;
+ rep.th.doff = sizeof(struct tcphdr) / 4;
+ rep.th.rst = 1;
+
+ if (th->ack) {
+ rep.th.seq = th->ack_seq;
+ } else {
+ rep.th.ack = 1;
+ rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
+ skb->len - (th->doff << 2));
+ }
+
+ memset(&arg, 0, sizeof(arg));
+ arg.iov[0].iov_base = (unsigned char *)&rep;
+ arg.iov[0].iov_len = sizeof(rep.th);
+
+ net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
+#ifdef CONFIG_TCP_MD5SIG
+ hash_location = tcp_parse_md5sig_option(th);
+ if (!sk && hash_location) {
+ /*
+ * active side is lost. Try to find listening socket through
+ * source port, and then find md5 key through listening socket.
+ * we are not loose security here:
+ * Incoming packet is checked with md5 hash with finding key,
+ * no RST generated if md5 hash doesn't match.
+ */
+ sk1 = __inet_lookup_listener(net,
+ &tcp_hashinfo, ip_hdr(skb)->saddr,
+ th->source, ip_hdr(skb)->daddr,
+ ntohs(th->source), inet_iif(skb));
+ /* don't send rst if it can't find key */
+ if (!sk1)
+ return;
+ rcu_read_lock();
+ key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
+ &ip_hdr(skb)->saddr, AF_INET);
+ if (!key)
+ goto release_sk1;
+
+ genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
+ if (genhash || memcmp(hash_location, newhash, 16) != 0)
+ goto release_sk1;
+ } else {
+ key = sk ? tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
+ &ip_hdr(skb)->saddr,
+ AF_INET) : NULL;
+ }
+
+ if (key) {
+ rep.opt[0] = htonl((TCPOPT_NOP << 24) |
+ (TCPOPT_NOP << 16) |
+ (TCPOPT_MD5SIG << 8) |
+ TCPOLEN_MD5SIG);
+ /* Update length and the length the header thinks exists */
+ arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
+ rep.th.doff = arg.iov[0].iov_len / 4;
+
+ tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
+ key, ip_hdr(skb)->saddr,
+ ip_hdr(skb)->daddr, &rep.th);
+ }
+#endif
+ arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
+ ip_hdr(skb)->saddr, /* XXX */
+ arg.iov[0].iov_len, IPPROTO_TCP, 0);
+ arg.csumoffset = offsetof(struct tcphdr, check) / 2;
+ arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
+ /* When socket is gone, all binding information is lost.
+ * routing might fail in this case. No choice here, if we choose to force
+ * input interface, we will misroute in case of asymmetric route.
+ */
+ if (sk)
+ arg.bound_dev_if = sk->sk_bound_dev_if;
+
+ arg.tos = ip_hdr(skb)->tos;
+ ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
+ skb, &TCP_SKB_CB(skb)->header.h4.opt,
+ ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
+ &arg, arg.iov[0].iov_len);
+
+ TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
+ TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
+
+#ifdef CONFIG_TCP_MD5SIG
+release_sk1:
+ if (sk1) {
+ rcu_read_unlock();
+ sock_put(sk1);
+ }
+#endif
+}
+
+/* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
+ outside socket context is ugly, certainly. What can I do?
+ */
+
+static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
+ u32 win, u32 tsval, u32 tsecr, int oif,
+ struct tcp_md5sig_key *key,
+ int reply_flags, u8 tos)
+{
+ const struct tcphdr *th = tcp_hdr(skb);
+ struct {
+ struct tcphdr th;
+ __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
+#ifdef CONFIG_TCP_MD5SIG
+ + (TCPOLEN_MD5SIG_ALIGNED >> 2)
+#endif
+ ];
+ } rep;
+ struct ip_reply_arg arg;
+ struct net *net = dev_net(skb_dst(skb)->dev);
+
+ memset(&rep.th, 0, sizeof(struct tcphdr));
+ memset(&arg, 0, sizeof(arg));
+
+ arg.iov[0].iov_base = (unsigned char *)&rep;
+ arg.iov[0].iov_len = sizeof(rep.th);
+ if (tsecr) {
+ rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
+ (TCPOPT_TIMESTAMP << 8) |
+ TCPOLEN_TIMESTAMP);
+ rep.opt[1] = htonl(tsval);
+ rep.opt[2] = htonl(tsecr);
+ arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
+ }
+
+ /* Swap the send and the receive. */
+ rep.th.dest = th->source;
+ rep.th.source = th->dest;
+ rep.th.doff = arg.iov[0].iov_len / 4;
+ rep.th.seq = htonl(seq);
+ rep.th.ack_seq = htonl(ack);
+ rep.th.ack = 1;
+ rep.th.window = htons(win);
+
+#ifdef CONFIG_TCP_MD5SIG
+ if (key) {
+ int offset = (tsecr) ? 3 : 0;
+
+ rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
+ (TCPOPT_NOP << 16) |
+ (TCPOPT_MD5SIG << 8) |
+ TCPOLEN_MD5SIG);
+ arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
+ rep.th.doff = arg.iov[0].iov_len/4;
+
+ tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
+ key, ip_hdr(skb)->saddr,
+ ip_hdr(skb)->daddr, &rep.th);
+ }
+#endif
+ arg.flags = reply_flags;
+ arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
+ ip_hdr(skb)->saddr, /* XXX */
+ arg.iov[0].iov_len, IPPROTO_TCP, 0);
+ arg.csumoffset = offsetof(struct tcphdr, check) / 2;
+ if (oif)
+ arg.bound_dev_if = oif;
+ arg.tos = tos;
+ ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
+ skb, &TCP_SKB_CB(skb)->header.h4.opt,
+ ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
+ &arg, arg.iov[0].iov_len);
+
+ TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
+}
+
+static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
+{
+ struct inet_timewait_sock *tw = inet_twsk(sk);
+ struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
+
+ tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
+ tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
+ tcp_time_stamp + tcptw->tw_ts_offset,
+ tcptw->tw_ts_recent,
+ tw->tw_bound_dev_if,
+ tcp_twsk_md5_key(tcptw),
+ tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
+ tw->tw_tos
+ );
+
+ inet_twsk_put(tw);
+}
+
+static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
+ struct request_sock *req)
+{
+ /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
+ * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
+ */
+ tcp_v4_send_ack(skb, (sk->sk_state == TCP_LISTEN) ?
+ tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt,
+ tcp_rsk(req)->rcv_nxt, req->rcv_wnd,
+ tcp_time_stamp,
+ req->ts_recent,
+ 0,
+ tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
+ AF_INET),
+ inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
+ ip_hdr(skb)->tos);
+}
+
+/*
+ * Send a SYN-ACK after having received a SYN.
+ * This still operates on a request_sock only, not on a big
+ * socket.
+ */
+static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
+ struct flowi *fl,
+ struct request_sock *req,
+ u16 queue_mapping,
+ struct tcp_fastopen_cookie *foc)
+{
+ const struct inet_request_sock *ireq = inet_rsk(req);
+ struct flowi4 fl4;
+ int err = -1;
+ struct sk_buff *skb;
+
+ /* First, grab a route. */
+ if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
+ return -1;
+
+ skb = tcp_make_synack(sk, dst, req, foc);
+
+ if (skb) {
+ __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
+
+ skb_set_queue_mapping(skb, queue_mapping);
+ err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
+ ireq->ir_rmt_addr,
+ ireq->opt);
+ err = net_xmit_eval(err);
+ }
+
+ return err;
+}
+
+/*
+ * IPv4 request_sock destructor.
+ */
+static void tcp_v4_reqsk_destructor(struct request_sock *req)
+{
+ kfree(inet_rsk(req)->opt);
+}
+
+
+#ifdef CONFIG_TCP_MD5SIG
+/*
+ * RFC2385 MD5 checksumming requires a mapping of
+ * IP address->MD5 Key.
+ * We need to maintain these in the sk structure.
+ */
+
+/* Find the Key structure for an address. */
+struct tcp_md5sig_key *tcp_md5_do_lookup(struct sock *sk,
+ const union tcp_md5_addr *addr,
+ int family)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ struct tcp_md5sig_key *key;
+ unsigned int size = sizeof(struct in_addr);
+ const struct tcp_md5sig_info *md5sig;
+
+ /* caller either holds rcu_read_lock() or socket lock */
+ md5sig = rcu_dereference_check(tp->md5sig_info,
+ sock_owned_by_user(sk) ||
+ lockdep_is_held(&sk->sk_lock.slock));
+ if (!md5sig)
+ return NULL;
+#if IS_ENABLED(CONFIG_IPV6)
+ if (family == AF_INET6)
+ size = sizeof(struct in6_addr);
+#endif
+ hlist_for_each_entry_rcu(key, &md5sig->head, node) {
+ if (key->family != family)
+ continue;
+ if (!memcmp(&key->addr, addr, size))
+ return key;
+ }
+ return NULL;
+}
+EXPORT_SYMBOL(tcp_md5_do_lookup);
+
+struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
+ const struct sock *addr_sk)
+{
+ const union tcp_md5_addr *addr;
+
+ addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
+ return tcp_md5_do_lookup(sk, addr, AF_INET);
+}
+EXPORT_SYMBOL(tcp_v4_md5_lookup);
+
+/* This can be called on a newly created socket, from other files */
+int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
+ int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
+{
+ /* Add Key to the list */
+ struct tcp_md5sig_key *key;
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct tcp_md5sig_info *md5sig;
+
+ key = tcp_md5_do_lookup(sk, addr, family);
+ if (key) {
+ /* Pre-existing entry - just update that one. */
+ memcpy(key->key, newkey, newkeylen);
+ key->keylen = newkeylen;
+ return 0;
+ }
+
+ md5sig = rcu_dereference_protected(tp->md5sig_info,
+ sock_owned_by_user(sk));
+ if (!md5sig) {
+ md5sig = kmalloc(sizeof(*md5sig), gfp);
+ if (!md5sig)
+ return -ENOMEM;
+
+ sk_nocaps_add(sk, NETIF_F_GSO_MASK);
+ INIT_HLIST_HEAD(&md5sig->head);
+ rcu_assign_pointer(tp->md5sig_info, md5sig);
+ }
+
+ key = sock_kmalloc(sk, sizeof(*key), gfp);
+ if (!key)
+ return -ENOMEM;
+ if (!tcp_alloc_md5sig_pool()) {
+ sock_kfree_s(sk, key, sizeof(*key));
+ return -ENOMEM;
+ }
+
+ memcpy(key->key, newkey, newkeylen);
+ key->keylen = newkeylen;
+ key->family = family;
+ memcpy(&key->addr, addr,
+ (family == AF_INET6) ? sizeof(struct in6_addr) :
+ sizeof(struct in_addr));
+ hlist_add_head_rcu(&key->node, &md5sig->head);
+ return 0;
+}
+EXPORT_SYMBOL(tcp_md5_do_add);
+
+int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
+{
+ struct tcp_md5sig_key *key;
+
+ key = tcp_md5_do_lookup(sk, addr, family);
+ if (!key)
+ return -ENOENT;
+ hlist_del_rcu(&key->node);
+ atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
+ kfree_rcu(key, rcu);
+ return 0;
+}
+EXPORT_SYMBOL(tcp_md5_do_del);
+
+static void tcp_clear_md5_list(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct tcp_md5sig_key *key;
+ struct hlist_node *n;
+ struct tcp_md5sig_info *md5sig;
+
+ md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
+
+ hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
+ hlist_del_rcu(&key->node);
+ atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
+ kfree_rcu(key, rcu);
+ }
+}
+
+static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
+ int optlen)
+{
+ struct tcp_md5sig cmd;
+ struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
+
+ if (optlen < sizeof(cmd))
+ return -EINVAL;
+
+ if (copy_from_user(&cmd, optval, sizeof(cmd)))
+ return -EFAULT;
+
+ if (sin->sin_family != AF_INET)
+ return -EINVAL;
+
+ if (!cmd.tcpm_keylen)
+ return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
+ AF_INET);
+
+ if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
+ return -EINVAL;
+
+ return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
+ AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
+ GFP_KERNEL);
+}
+
+static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
+ __be32 daddr, __be32 saddr, int nbytes)
+{
+ struct tcp4_pseudohdr *bp;
+ struct scatterlist sg;
+
+ bp = &hp->md5_blk.ip4;
+
+ /*
+ * 1. the TCP pseudo-header (in the order: source IP address,
+ * destination IP address, zero-padded protocol number, and
+ * segment length)
+ */
+ bp->saddr = saddr;
+ bp->daddr = daddr;
+ bp->pad = 0;
+ bp->protocol = IPPROTO_TCP;
+ bp->len = cpu_to_be16(nbytes);
+
+ sg_init_one(&sg, bp, sizeof(*bp));
+ return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
+}
+
+static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
+ __be32 daddr, __be32 saddr, const struct tcphdr *th)
+{
+ struct tcp_md5sig_pool *hp;
+ struct hash_desc *desc;
+
+ hp = tcp_get_md5sig_pool();
+ if (!hp)
+ goto clear_hash_noput;
+ desc = &hp->md5_desc;
+
+ if (crypto_hash_init(desc))
+ goto clear_hash;
+ if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
+ goto clear_hash;
+ if (tcp_md5_hash_header(hp, th))
+ goto clear_hash;
+ if (tcp_md5_hash_key(hp, key))
+ goto clear_hash;
+ if (crypto_hash_final(desc, md5_hash))
+ goto clear_hash;
+
+ tcp_put_md5sig_pool();
+ return 0;
+
+clear_hash:
+ tcp_put_md5sig_pool();
+clear_hash_noput:
+ memset(md5_hash, 0, 16);
+ return 1;
+}
+
+int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
+ const struct sock *sk,
+ const struct sk_buff *skb)
+{
+ struct tcp_md5sig_pool *hp;
+ struct hash_desc *desc;
+ const struct tcphdr *th = tcp_hdr(skb);
+ __be32 saddr, daddr;
+
+ if (sk) { /* valid for establish/request sockets */
+ saddr = sk->sk_rcv_saddr;
+ daddr = sk->sk_daddr;
+ } else {
+ const struct iphdr *iph = ip_hdr(skb);
+ saddr = iph->saddr;
+ daddr = iph->daddr;
+ }
+
+ hp = tcp_get_md5sig_pool();
+ if (!hp)
+ goto clear_hash_noput;
+ desc = &hp->md5_desc;
+
+ if (crypto_hash_init(desc))
+ goto clear_hash;
+
+ if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
+ goto clear_hash;
+ if (tcp_md5_hash_header(hp, th))
+ goto clear_hash;
+ if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
+ goto clear_hash;
+ if (tcp_md5_hash_key(hp, key))
+ goto clear_hash;
+ if (crypto_hash_final(desc, md5_hash))
+ goto clear_hash;
+
+ tcp_put_md5sig_pool();
+ return 0;
+
+clear_hash:
+ tcp_put_md5sig_pool();
+clear_hash_noput:
+ memset(md5_hash, 0, 16);
+ return 1;
+}
+EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
+
+/* Called with rcu_read_lock() */
+static bool tcp_v4_inbound_md5_hash(struct sock *sk,
+ const struct sk_buff *skb)
+{
+ /*
+ * This gets called for each TCP segment that arrives
+ * so we want to be efficient.
+ * We have 3 drop cases:
+ * o No MD5 hash and one expected.
+ * o MD5 hash and we're not expecting one.
+ * o MD5 hash and its wrong.
+ */
+ const __u8 *hash_location = NULL;
+ struct tcp_md5sig_key *hash_expected;
+ const struct iphdr *iph = ip_hdr(skb);
+ const struct tcphdr *th = tcp_hdr(skb);
+ int genhash;
+ unsigned char newhash[16];
+
+ hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
+ AF_INET);
+ hash_location = tcp_parse_md5sig_option(th);
+
+ /* We've parsed the options - do we have a hash? */
+ if (!hash_expected && !hash_location)
+ return false;
+
+ if (hash_expected && !hash_location) {
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
+ return true;
+ }
+
+ if (!hash_expected && hash_location) {
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
+ return true;
+ }
+
+ /* Okay, so this is hash_expected and hash_location -
+ * so we need to calculate the checksum.
+ */
+ genhash = tcp_v4_md5_hash_skb(newhash,
+ hash_expected,
+ NULL, skb);
+
+ if (genhash || memcmp(hash_location, newhash, 16) != 0) {
+ net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
+ &iph->saddr, ntohs(th->source),
+ &iph->daddr, ntohs(th->dest),
+ genhash ? " tcp_v4_calc_md5_hash failed"
+ : "");
+ return true;
+ }
+ return false;
+}
+#endif
+
+static void tcp_v4_init_req(struct request_sock *req, struct sock *sk_listener,
+ struct sk_buff *skb)
+{
+ struct inet_request_sock *ireq = inet_rsk(req);
+
+ sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
+ sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
+ ireq->no_srccheck = inet_sk(sk_listener)->transparent;
+ ireq->opt = tcp_v4_save_options(skb);
+}
+
+static struct dst_entry *tcp_v4_route_req(struct sock *sk, struct flowi *fl,
+ const struct request_sock *req,
+ bool *strict)
+{
+ struct dst_entry *dst = inet_csk_route_req(sk, &fl->u.ip4, req);
+
+ if (strict) {
+ if (fl->u.ip4.daddr == inet_rsk(req)->ir_rmt_addr)
+ *strict = true;
+ else
+ *strict = false;
+ }
+
+ return dst;
+}
+
+struct request_sock_ops tcp_request_sock_ops __read_mostly = {
+ .family = PF_INET,
+ .obj_size = sizeof(struct tcp_request_sock),
+ .rtx_syn_ack = tcp_rtx_synack,
+ .send_ack = tcp_v4_reqsk_send_ack,
+ .destructor = tcp_v4_reqsk_destructor,
+ .send_reset = tcp_v4_send_reset,
+ .syn_ack_timeout = tcp_syn_ack_timeout,
+};
+
+static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
+ .mss_clamp = TCP_MSS_DEFAULT,
+#ifdef CONFIG_TCP_MD5SIG
+ .req_md5_lookup = tcp_v4_md5_lookup,
+ .calc_md5_hash = tcp_v4_md5_hash_skb,
+#endif
+ .init_req = tcp_v4_init_req,
+#ifdef CONFIG_SYN_COOKIES
+ .cookie_init_seq = cookie_v4_init_sequence,
+#endif
+ .route_req = tcp_v4_route_req,
+ .init_seq = tcp_v4_init_sequence,
+ .send_synack = tcp_v4_send_synack,
+ .queue_hash_add = inet_csk_reqsk_queue_hash_add,
+};
+
+int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
+{
+ /* Never answer to SYNs send to broadcast or multicast */
+ if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
+ goto drop;
+
+ return tcp_conn_request(&tcp_request_sock_ops,
+ &tcp_request_sock_ipv4_ops, sk, skb);
+
+drop:
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
+ return 0;
+}
+EXPORT_SYMBOL(tcp_v4_conn_request);
+
+
+/*
+ * The three way handshake has completed - we got a valid synack -
+ * now create the new socket.
+ */
+struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
+ struct request_sock *req,
+ struct dst_entry *dst)
+{
+ struct inet_request_sock *ireq;
+ struct inet_sock *newinet;
+ struct tcp_sock *newtp;
+ struct sock *newsk;
+#ifdef CONFIG_TCP_MD5SIG
+ struct tcp_md5sig_key *key;
+#endif
+ struct ip_options_rcu *inet_opt;
+
+ if (sk_acceptq_is_full(sk))
+ goto exit_overflow;
+
+ newsk = tcp_create_openreq_child(sk, req, skb);
+ if (!newsk)
+ goto exit_nonewsk;
+
+ newsk->sk_gso_type = SKB_GSO_TCPV4;
+ inet_sk_rx_dst_set(newsk, skb);
+
+ newtp = tcp_sk(newsk);
+ newinet = inet_sk(newsk);
+ ireq = inet_rsk(req);
+ sk_daddr_set(newsk, ireq->ir_rmt_addr);
+ sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
+ newinet->inet_saddr = ireq->ir_loc_addr;
+ inet_opt = ireq->opt;
+ rcu_assign_pointer(newinet->inet_opt, inet_opt);
+ ireq->opt = NULL;
+ newinet->mc_index = inet_iif(skb);
+ newinet->mc_ttl = ip_hdr(skb)->ttl;
+ newinet->rcv_tos = ip_hdr(skb)->tos;
+ inet_csk(newsk)->icsk_ext_hdr_len = 0;
+ inet_set_txhash(newsk);
+ if (inet_opt)
+ inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
+ newinet->inet_id = newtp->write_seq ^ jiffies;
+
+ if (!dst) {
+ dst = inet_csk_route_child_sock(sk, newsk, req);
+ if (!dst)
+ goto put_and_exit;
+ } else {
+ /* syncookie case : see end of cookie_v4_check() */
+ }
+ sk_setup_caps(newsk, dst);
+
+ tcp_ca_openreq_child(newsk, dst);
+
+ tcp_sync_mss(newsk, dst_mtu(dst));
+ newtp->advmss = dst_metric_advmss(dst);
+ if (tcp_sk(sk)->rx_opt.user_mss &&
+ tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
+ newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
+
+ tcp_initialize_rcv_mss(newsk);
+
+#ifdef CONFIG_TCP_MD5SIG
+ /* Copy over the MD5 key from the original socket */
+ key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
+ AF_INET);
+ if (key) {
+ /*
+ * We're using one, so create a matching key
+ * on the newsk structure. If we fail to get
+ * memory, then we end up not copying the key
+ * across. Shucks.
+ */
+ tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
+ AF_INET, key->key, key->keylen, GFP_ATOMIC);
+ sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
+ }
+#endif
+
+ if (__inet_inherit_port(sk, newsk) < 0)
+ goto put_and_exit;
+ __inet_hash_nolisten(newsk, NULL);
+
+ return newsk;
+
+exit_overflow:
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
+exit_nonewsk:
+ dst_release(dst);
+exit:
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
+ return NULL;
+put_and_exit:
+ inet_csk_prepare_forced_close(newsk);
+ tcp_done(newsk);
+ goto exit;
+}
+EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
+
+static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
+{
+ const struct tcphdr *th = tcp_hdr(skb);
+ const struct iphdr *iph = ip_hdr(skb);
+ struct request_sock *req;
+ struct sock *nsk;
+
+ req = inet_csk_search_req(sk, th->source, iph->saddr, iph->daddr);
+ if (req) {
+ nsk = tcp_check_req(sk, skb, req, false);
+ if (!nsk)
+ reqsk_put(req);
+ return nsk;
+ }
+
+ nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
+ th->source, iph->daddr, th->dest, inet_iif(skb));
+
+ if (nsk) {
+ if (nsk->sk_state != TCP_TIME_WAIT) {
+ bh_lock_sock(nsk);
+ return nsk;
+ }
+ inet_twsk_put(inet_twsk(nsk));
+ return NULL;
+ }
+
+#ifdef CONFIG_SYN_COOKIES
+ if (!th->syn)
+ sk = cookie_v4_check(sk, skb);
+#endif
+ return sk;
+}
+
+/* The socket must have it's spinlock held when we get
+ * here.
+ *
+ * We have a potential double-lock case here, so even when
+ * doing backlog processing we use the BH locking scheme.
+ * This is because we cannot sleep with the original spinlock
+ * held.
+ */
+int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct tcphdr *th = tcp_hdr(skb);
+ struct sock *rsk;
+
+ if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
+ struct dst_entry *dst = sk->sk_rx_dst;
+
+ sock_rps_save_rxhash(sk, skb);
+ sk_mark_napi_id(sk, skb);
+ if (dst) {
+ if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
+ !dst->ops->check(dst, 0)) {
+ dst_release(dst);
+ sk->sk_rx_dst = NULL;
+ }
+ }
+ tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len);
+ return 0;
+ }
+
+ if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
+ goto csum_err;
+
+#ifdef CONFIG_TCP_STEALTH
+ if (sk->sk_state == TCP_LISTEN && th->syn && !th->fin &&
+ unlikely(tp->stealth.mode & TCP_STEALTH_MODE_AUTH) &&
+ tcp_stealth_do_auth(sk, skb)) {
+ rsk = sk;
+ goto reset;
+ }
+#endif
+
+ if (sk->sk_state == TCP_LISTEN) {
+ struct sock *nsk = tcp_v4_hnd_req(sk, skb);
+ if (!nsk)
+ goto discard;
+
+ if (nsk != sk) {
+ sock_rps_save_rxhash(nsk, skb);
+ sk_mark_napi_id(sk, skb);
+ if (tcp_child_process(sk, nsk, skb)) {
+ rsk = nsk;
+ goto reset;
+ }
+ return 0;
+ }
+ } else
+ sock_rps_save_rxhash(sk, skb);
+
+ if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
+ rsk = sk;
+ goto reset;
+ }
+ return 0;
+
+reset:
+ tcp_v4_send_reset(rsk, skb);
+discard:
+ kfree_skb(skb);
+ /* Be careful here. If this function gets more complicated and
+ * gcc suffers from register pressure on the x86, sk (in %ebx)
+ * might be destroyed here. This current version compiles correctly,
+ * but you have been warned.
+ */
+ return 0;
+
+csum_err:
+ TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS);
+ TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
+ goto discard;
+}
+EXPORT_SYMBOL(tcp_v4_do_rcv);
+
+void tcp_v4_early_demux(struct sk_buff *skb)
+{
+ const struct iphdr *iph;
+ const struct tcphdr *th;
+ struct sock *sk;
+
+ if (skb->pkt_type != PACKET_HOST)
+ return;
+
+ if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
+ return;
+
+ iph = ip_hdr(skb);
+ th = tcp_hdr(skb);
+
+ if (th->doff < sizeof(struct tcphdr) / 4)
+ return;
+
+ sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
+ iph->saddr, th->source,
+ iph->daddr, ntohs(th->dest),
+ skb->skb_iif);
+ if (sk) {
+ skb->sk = sk;
+ skb->destructor = sock_edemux;
+ if (sk_fullsock(sk)) {
+ struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
+
+ if (dst)
+ dst = dst_check(dst, 0);
+ if (dst &&
+ inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
+ skb_dst_set_noref(skb, dst);
+ }
+ }
+}
+
+/* Packet is added to VJ-style prequeue for processing in process
+ * context, if a reader task is waiting. Apparently, this exciting
+ * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93)
+ * failed somewhere. Latency? Burstiness? Well, at least now we will
+ * see, why it failed. 8)8) --ANK
+ *
+ */
+bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (sysctl_tcp_low_latency || !tp->ucopy.task)
+ return false;
+
+ if (skb->len <= tcp_hdrlen(skb) &&
+ skb_queue_len(&tp->ucopy.prequeue) == 0)
+ return false;
+
+ /* Before escaping RCU protected region, we need to take care of skb
+ * dst. Prequeue is only enabled for established sockets.
+ * For such sockets, we might need the skb dst only to set sk->sk_rx_dst
+ * Instead of doing full sk_rx_dst validity here, let's perform
+ * an optimistic check.
+ */
+ if (likely(sk->sk_rx_dst))
+ skb_dst_drop(skb);
+ else
+ skb_dst_force(skb);
+
+ __skb_queue_tail(&tp->ucopy.prequeue, skb);
+ tp->ucopy.memory += skb->truesize;
+ if (tp->ucopy.memory > sk->sk_rcvbuf) {
+ struct sk_buff *skb1;
+
+ BUG_ON(sock_owned_by_user(sk));
+
+ while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) {
+ sk_backlog_rcv(sk, skb1);
+ NET_INC_STATS_BH(sock_net(sk),
+ LINUX_MIB_TCPPREQUEUEDROPPED);
+ }
+
+ tp->ucopy.memory = 0;
+ } else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
+ wake_up_interruptible_sync_poll(sk_sleep(sk),
+ POLLIN | POLLRDNORM | POLLRDBAND);
+ if (!inet_csk_ack_scheduled(sk))
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
+ (3 * tcp_rto_min(sk)) / 4,
+ TCP_RTO_MAX);
+ }
+ return true;
+}
+EXPORT_SYMBOL(tcp_prequeue);
+
+/*
+ * From tcp_input.c
+ */
+
+int tcp_v4_rcv(struct sk_buff *skb)
+{
+ const struct iphdr *iph;
+ const struct tcphdr *th;
+ struct sock *sk;
+ int ret;
+ struct net *net = dev_net(skb->dev);
+
+ if (skb->pkt_type != PACKET_HOST)
+ goto discard_it;
+
+ /* Count it even if it's bad */
+ TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
+
+ if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
+ goto discard_it;
+
+ th = tcp_hdr(skb);
+
+ if (th->doff < sizeof(struct tcphdr) / 4)
+ goto bad_packet;
+ if (!pskb_may_pull(skb, th->doff * 4))
+ goto discard_it;
+
+ /* An explanation is required here, I think.
+ * Packet length and doff are validated by header prediction,
+ * provided case of th->doff==0 is eliminated.
+ * So, we defer the checks. */
+
+ if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
+ goto csum_error;
+
+ th = tcp_hdr(skb);
+ iph = ip_hdr(skb);
+ /* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
+ * barrier() makes sure compiler wont play fool^Waliasing games.
+ */
+ memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
+ sizeof(struct inet_skb_parm));
+ barrier();
+
+ TCP_SKB_CB(skb)->seq = ntohl(th->seq);
+ TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
+ skb->len - th->doff * 4);
+ TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
+ TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
+ TCP_SKB_CB(skb)->tcp_tw_isn = 0;
+ TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
+ TCP_SKB_CB(skb)->sacked = 0;
+
+ sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
+ if (!sk)
+ goto no_tcp_socket;
+
+process:
+ if (sk->sk_state == TCP_TIME_WAIT)
+ goto do_time_wait;
+
+ if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
+ NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
+ goto discard_and_relse;
+ }
+
+ if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
+ goto discard_and_relse;
+
+#ifdef CONFIG_TCP_MD5SIG
+ /*
+ * We really want to reject the packet as early as possible
+ * if:
+ * o We're expecting an MD5'd packet and this is no MD5 tcp option
+ * o There is an MD5 option and we're not expecting one
+ */
+ if (tcp_v4_inbound_md5_hash(sk, skb))
+ goto discard_and_relse;
+#endif
+
+ nf_reset(skb);
+
+ if (sk_filter(sk, skb))
+ goto discard_and_relse;
+
+ sk_incoming_cpu_update(sk);
+ skb->dev = NULL;
+
+ bh_lock_sock_nested(sk);
+ ret = 0;
+ if (!sock_owned_by_user(sk)) {
+ if (!tcp_prequeue(sk, skb))
+ ret = tcp_v4_do_rcv(sk, skb);
+ } else if (unlikely(sk_add_backlog(sk, skb,
+ sk->sk_rcvbuf + sk->sk_sndbuf))) {
+ bh_unlock_sock(sk);
+ NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
+ goto discard_and_relse;
+ }
+ bh_unlock_sock(sk);
+
+ sock_put(sk);
+
+ return ret;
+
+no_tcp_socket:
+ if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
+ goto discard_it;
+
+ if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
+csum_error:
+ TCP_INC_STATS_BH(net, TCP_MIB_CSUMERRORS);
+bad_packet:
+ TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
+ } else {
+ tcp_v4_send_reset(NULL, skb);
+ }
+
+discard_it:
+ /* Discard frame. */
+ kfree_skb(skb);
+ return 0;
+
+discard_and_relse:
+ sock_put(sk);
+ goto discard_it;
+
+do_time_wait:
+ if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
+ inet_twsk_put(inet_twsk(sk));
+ goto discard_it;
+ }
+
+ if (skb->len < (th->doff << 2)) {
+ inet_twsk_put(inet_twsk(sk));
+ goto bad_packet;
+ }
+ if (tcp_checksum_complete(skb)) {
+ inet_twsk_put(inet_twsk(sk));
+ goto csum_error;
+ }
+ switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
+ case TCP_TW_SYN: {
+ struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
+ &tcp_hashinfo,
+ iph->saddr, th->source,
+ iph->daddr, th->dest,
+ inet_iif(skb));
+ if (sk2) {
+ inet_twsk_deschedule(inet_twsk(sk));
+ inet_twsk_put(inet_twsk(sk));
+ sk = sk2;
+ goto process;
+ }
+ /* Fall through to ACK */
+ }
+ case TCP_TW_ACK:
+ tcp_v4_timewait_ack(sk, skb);
+ break;
+ case TCP_TW_RST:
+ goto no_tcp_socket;
+ case TCP_TW_SUCCESS:;
+ }
+ goto discard_it;
+}
+
+static struct timewait_sock_ops tcp_timewait_sock_ops = {
+ .twsk_obj_size = sizeof(struct tcp_timewait_sock),
+ .twsk_unique = tcp_twsk_unique,
+ .twsk_destructor= tcp_twsk_destructor,
+};
+
+void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
+{
+ struct dst_entry *dst = skb_dst(skb);
+
+ if (dst) {
+ dst_hold(dst);
+ sk->sk_rx_dst = dst;
+ inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
+ }
+}
+EXPORT_SYMBOL(inet_sk_rx_dst_set);
+
+const struct inet_connection_sock_af_ops ipv4_specific = {
+ .queue_xmit = ip_queue_xmit,
+ .send_check = tcp_v4_send_check,
+ .rebuild_header = inet_sk_rebuild_header,
+ .sk_rx_dst_set = inet_sk_rx_dst_set,
+ .conn_request = tcp_v4_conn_request,
+ .syn_recv_sock = tcp_v4_syn_recv_sock,
+ .net_header_len = sizeof(struct iphdr),
+ .setsockopt = ip_setsockopt,
+ .getsockopt = ip_getsockopt,
+ .addr2sockaddr = inet_csk_addr2sockaddr,
+ .sockaddr_len = sizeof(struct sockaddr_in),
+ .bind_conflict = inet_csk_bind_conflict,
+#ifdef CONFIG_COMPAT
+ .compat_setsockopt = compat_ip_setsockopt,
+ .compat_getsockopt = compat_ip_getsockopt,
+#endif
+ .mtu_reduced = tcp_v4_mtu_reduced,
+};
+EXPORT_SYMBOL(ipv4_specific);
+
+#ifdef CONFIG_TCP_MD5SIG
+static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
+ .md5_lookup = tcp_v4_md5_lookup,
+ .calc_md5_hash = tcp_v4_md5_hash_skb,
+ .md5_parse = tcp_v4_parse_md5_keys,
+};
+#endif
+
+/* NOTE: A lot of things set to zero explicitly by call to
+ * sk_alloc() so need not be done here.
+ */
+static int tcp_v4_init_sock(struct sock *sk)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
+ tcp_init_sock(sk);
+
+ icsk->icsk_af_ops = &ipv4_specific;
+
+#ifdef CONFIG_TCP_MD5SIG
+ tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
+#endif
+
+ return 0;
+}
+
+void tcp_v4_destroy_sock(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ tcp_clear_xmit_timers(sk);
+
+ tcp_cleanup_congestion_control(sk);
+
+ /* Cleanup up the write buffer. */
+ tcp_write_queue_purge(sk);
+
+ /* Cleans up our, hopefully empty, out_of_order_queue. */
+ __skb_queue_purge(&tp->out_of_order_queue);
+
+#ifdef CONFIG_TCP_MD5SIG
+ /* Clean up the MD5 key list, if any */
+ if (tp->md5sig_info) {
+ tcp_clear_md5_list(sk);
+ kfree_rcu(tp->md5sig_info, rcu);
+ tp->md5sig_info = NULL;
+ }
+#endif
+
+ /* Clean prequeue, it must be empty really */
+ __skb_queue_purge(&tp->ucopy.prequeue);
+
+ /* Clean up a referenced TCP bind bucket. */
+ if (inet_csk(sk)->icsk_bind_hash)
+ inet_put_port(sk);
+
+ BUG_ON(tp->fastopen_rsk);
+
+ /* If socket is aborted during connect operation */
+ tcp_free_fastopen_req(tp);
+
+ sk_sockets_allocated_dec(sk);
+ sock_release_memcg(sk);
+}
+EXPORT_SYMBOL(tcp_v4_destroy_sock);
+
+#ifdef CONFIG_PROC_FS
+/* Proc filesystem TCP sock list dumping. */
+
+/*
+ * Get next listener socket follow cur. If cur is NULL, get first socket
+ * starting from bucket given in st->bucket; when st->bucket is zero the
+ * very first socket in the hash table is returned.
+ */
+static void *listening_get_next(struct seq_file *seq, void *cur)
+{
+ struct inet_connection_sock *icsk;
+ struct hlist_nulls_node *node;
+ struct sock *sk = cur;
+ struct inet_listen_hashbucket *ilb;
+ struct tcp_iter_state *st = seq->private;
+ struct net *net = seq_file_net(seq);
+
+ if (!sk) {
+ ilb = &tcp_hashinfo.listening_hash[st->bucket];
+ spin_lock_bh(&ilb->lock);
+ sk = sk_nulls_head(&ilb->head);
+ st->offset = 0;
+ goto get_sk;
+ }
+ ilb = &tcp_hashinfo.listening_hash[st->bucket];
+ ++st->num;
+ ++st->offset;
+
+ if (st->state == TCP_SEQ_STATE_OPENREQ) {
+ struct request_sock *req = cur;
+
+ icsk = inet_csk(st->syn_wait_sk);
+ req = req->dl_next;
+ while (1) {
+ while (req) {
+ if (req->rsk_ops->family == st->family) {
+ cur = req;
+ goto out;
+ }
+ req = req->dl_next;
+ }
+ if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
+ break;
+get_req:
+ req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
+ }
+ sk = sk_nulls_next(st->syn_wait_sk);
+ st->state = TCP_SEQ_STATE_LISTENING;
+ spin_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
+ } else {
+ icsk = inet_csk(sk);
+ spin_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
+ if (reqsk_queue_len(&icsk->icsk_accept_queue))
+ goto start_req;
+ spin_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
+ sk = sk_nulls_next(sk);
+ }
+get_sk:
+ sk_nulls_for_each_from(sk, node) {
+ if (!net_eq(sock_net(sk), net))
+ continue;
+ if (sk->sk_family == st->family) {
+ cur = sk;
+ goto out;
+ }
+ icsk = inet_csk(sk);
+ spin_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
+ if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
+start_req:
+ st->uid = sock_i_uid(sk);
+ st->syn_wait_sk = sk;
+ st->state = TCP_SEQ_STATE_OPENREQ;
+ st->sbucket = 0;
+ goto get_req;
+ }
+ spin_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
+ }
+ spin_unlock_bh(&ilb->lock);
+ st->offset = 0;
+ if (++st->bucket < INET_LHTABLE_SIZE) {
+ ilb = &tcp_hashinfo.listening_hash[st->bucket];
+ spin_lock_bh(&ilb->lock);
+ sk = sk_nulls_head(&ilb->head);
+ goto get_sk;
+ }
+ cur = NULL;
+out:
+ return cur;
+}
+
+static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
+{
+ struct tcp_iter_state *st = seq->private;
+ void *rc;
+
+ st->bucket = 0;
+ st->offset = 0;
+ rc = listening_get_next(seq, NULL);
+
+ while (rc && *pos) {
+ rc = listening_get_next(seq, rc);
+ --*pos;
+ }
+ return rc;
+}
+
+static inline bool empty_bucket(const struct tcp_iter_state *st)
+{
+ return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
+}
+
+/*
+ * Get first established socket starting from bucket given in st->bucket.
+ * If st->bucket is zero, the very first socket in the hash is returned.
+ */
+static void *established_get_first(struct seq_file *seq)
+{
+ struct tcp_iter_state *st = seq->private;
+ struct net *net = seq_file_net(seq);
+ void *rc = NULL;
+
+ st->offset = 0;
+ for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
+ struct sock *sk;
+ struct hlist_nulls_node *node;
+ spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
+
+ /* Lockless fast path for the common case of empty buckets */
+ if (empty_bucket(st))
+ continue;
+
+ spin_lock_bh(lock);
+ sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
+ if (sk->sk_family != st->family ||
+ !net_eq(sock_net(sk), net)) {
+ continue;
+ }
+ rc = sk;
+ goto out;
+ }
+ spin_unlock_bh(lock);
+ }
+out:
+ return rc;
+}
+
+static void *established_get_next(struct seq_file *seq, void *cur)
+{
+ struct sock *sk = cur;
+ struct hlist_nulls_node *node;
+ struct tcp_iter_state *st = seq->private;
+ struct net *net = seq_file_net(seq);
+
+ ++st->num;
+ ++st->offset;
+
+ sk = sk_nulls_next(sk);
+
+ sk_nulls_for_each_from(sk, node) {
+ if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
+ return sk;
+ }
+
+ spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
+ ++st->bucket;
+ return established_get_first(seq);
+}
+
+static void *established_get_idx(struct seq_file *seq, loff_t pos)
+{
+ struct tcp_iter_state *st = seq->private;
+ void *rc;
+
+ st->bucket = 0;
+ rc = established_get_first(seq);
+
+ while (rc && pos) {
+ rc = established_get_next(seq, rc);
+ --pos;
+ }
+ return rc;
+}
+
+static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
+{
+ void *rc;
+ struct tcp_iter_state *st = seq->private;
+
+ st->state = TCP_SEQ_STATE_LISTENING;
+ rc = listening_get_idx(seq, &pos);
+
+ if (!rc) {
+ st->state = TCP_SEQ_STATE_ESTABLISHED;
+ rc = established_get_idx(seq, pos);
+ }
+
+ return rc;
+}
+
+static void *tcp_seek_last_pos(struct seq_file *seq)
+{
+ struct tcp_iter_state *st = seq->private;
+ int offset = st->offset;
+ int orig_num = st->num;
+ void *rc = NULL;
+
+ switch (st->state) {
+ case TCP_SEQ_STATE_OPENREQ:
+ case TCP_SEQ_STATE_LISTENING:
+ if (st->bucket >= INET_LHTABLE_SIZE)
+ break;
+ st->state = TCP_SEQ_STATE_LISTENING;
+ rc = listening_get_next(seq, NULL);
+ while (offset-- && rc)
+ rc = listening_get_next(seq, rc);
+ if (rc)
+ break;
+ st->bucket = 0;
+ st->state = TCP_SEQ_STATE_ESTABLISHED;
+ /* Fallthrough */
+ case TCP_SEQ_STATE_ESTABLISHED:
+ if (st->bucket > tcp_hashinfo.ehash_mask)
+ break;
+ rc = established_get_first(seq);
+ while (offset-- && rc)
+ rc = established_get_next(seq, rc);
+ }
+
+ st->num = orig_num;
+
+ return rc;
+}
+
+static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ struct tcp_iter_state *st = seq->private;
+ void *rc;
+
+ if (*pos && *pos == st->last_pos) {
+ rc = tcp_seek_last_pos(seq);
+ if (rc)
+ goto out;
+ }
+
+ st->state = TCP_SEQ_STATE_LISTENING;
+ st->num = 0;
+ st->bucket = 0;
+ st->offset = 0;
+ rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
+
+out:
+ st->last_pos = *pos;
+ return rc;
+}
+
+static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct tcp_iter_state *st = seq->private;
+ void *rc = NULL;
+
+ if (v == SEQ_START_TOKEN) {
+ rc = tcp_get_idx(seq, 0);
+ goto out;
+ }
+
+ switch (st->state) {
+ case TCP_SEQ_STATE_OPENREQ:
+ case TCP_SEQ_STATE_LISTENING:
+ rc = listening_get_next(seq, v);
+ if (!rc) {
+ st->state = TCP_SEQ_STATE_ESTABLISHED;
+ st->bucket = 0;
+ st->offset = 0;
+ rc = established_get_first(seq);
+ }
+ break;
+ case TCP_SEQ_STATE_ESTABLISHED:
+ rc = established_get_next(seq, v);
+ break;
+ }
+out:
+ ++*pos;
+ st->last_pos = *pos;
+ return rc;
+}
+
+static void tcp_seq_stop(struct seq_file *seq, void *v)
+{
+ struct tcp_iter_state *st = seq->private;
+
+ switch (st->state) {
+ case TCP_SEQ_STATE_OPENREQ:
+ if (v) {
+ struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
+ spin_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
+ }
+ case TCP_SEQ_STATE_LISTENING:
+ if (v != SEQ_START_TOKEN)
+ spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
+ break;
+ case TCP_SEQ_STATE_ESTABLISHED:
+ if (v)
+ spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
+ break;
+ }
+}
+
+int tcp_seq_open(struct inode *inode, struct file *file)
+{
+ struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);
+ struct tcp_iter_state *s;
+ int err;
+
+ err = seq_open_net(inode, file, &afinfo->seq_ops,
+ sizeof(struct tcp_iter_state));
+ if (err < 0)
+ return err;
+
+ s = ((struct seq_file *)file->private_data)->private;
+ s->family = afinfo->family;
+ s->last_pos = 0;
+ return 0;
+}
+EXPORT_SYMBOL(tcp_seq_open);
+
+int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
+{
+ int rc = 0;
+ struct proc_dir_entry *p;
+
+ afinfo->seq_ops.start = tcp_seq_start;
+ afinfo->seq_ops.next = tcp_seq_next;
+ afinfo->seq_ops.stop = tcp_seq_stop;
+
+ p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
+ afinfo->seq_fops, afinfo);
+ if (!p)
+ rc = -ENOMEM;
+ return rc;
+}
+EXPORT_SYMBOL(tcp_proc_register);
+
+void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
+{
+ remove_proc_entry(afinfo->name, net->proc_net);
+}
+EXPORT_SYMBOL(tcp_proc_unregister);
+
+static void get_openreq4(const struct request_sock *req,
+ struct seq_file *f, int i, kuid_t uid)
+{
+ const struct inet_request_sock *ireq = inet_rsk(req);
+ long delta = req->rsk_timer.expires - jiffies;
+
+ seq_printf(f, "%4d: %08X:%04X %08X:%04X"
+ " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
+ i,
+ ireq->ir_loc_addr,
+ ireq->ir_num,
+ ireq->ir_rmt_addr,
+ ntohs(ireq->ir_rmt_port),
+ TCP_SYN_RECV,
+ 0, 0, /* could print option size, but that is af dependent. */
+ 1, /* timers active (only the expire timer) */
+ jiffies_delta_to_clock_t(delta),
+ req->num_timeout,
+ from_kuid_munged(seq_user_ns(f), uid),
+ 0, /* non standard timer */
+ 0, /* open_requests have no inode */
+ 0,
+ req);
+}
+
+static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
+{
+ int timer_active;
+ unsigned long timer_expires;
+ const struct tcp_sock *tp = tcp_sk(sk);
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+ const struct inet_sock *inet = inet_sk(sk);
+ struct fastopen_queue *fastopenq = icsk->icsk_accept_queue.fastopenq;
+ __be32 dest = inet->inet_daddr;
+ __be32 src = inet->inet_rcv_saddr;
+ __u16 destp = ntohs(inet->inet_dport);
+ __u16 srcp = ntohs(inet->inet_sport);
+ int rx_queue;
+
+ if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
+ icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
+ icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
+ timer_active = 1;
+ timer_expires = icsk->icsk_timeout;
+ } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
+ timer_active = 4;
+ timer_expires = icsk->icsk_timeout;
+ } else if (timer_pending(&sk->sk_timer)) {
+ timer_active = 2;
+ timer_expires = sk->sk_timer.expires;
+ } else {
+ timer_active = 0;
+ timer_expires = jiffies;
+ }
+
+ if (sk->sk_state == TCP_LISTEN)
+ rx_queue = sk->sk_ack_backlog;
+ else
+ /*
+ * because we dont lock socket, we might find a transient negative value
+ */
+ rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
+
+ seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
+ "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
+ i, src, srcp, dest, destp, sk->sk_state,
+ tp->write_seq - tp->snd_una,
+ rx_queue,
+ timer_active,
+ jiffies_delta_to_clock_t(timer_expires - jiffies),
+ icsk->icsk_retransmits,
+ from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
+ icsk->icsk_probes_out,
+ sock_i_ino(sk),
+ atomic_read(&sk->sk_refcnt), sk,
+ jiffies_to_clock_t(icsk->icsk_rto),
+ jiffies_to_clock_t(icsk->icsk_ack.ato),
+ (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
+ tp->snd_cwnd,
+ sk->sk_state == TCP_LISTEN ?
+ (fastopenq ? fastopenq->max_qlen : 0) :
+ (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
+}
+
+static void get_timewait4_sock(const struct inet_timewait_sock *tw,
+ struct seq_file *f, int i)
+{
+ long delta = tw->tw_timer.expires - jiffies;
+ __be32 dest, src;
+ __u16 destp, srcp;
+
+ dest = tw->tw_daddr;
+ src = tw->tw_rcv_saddr;
+ destp = ntohs(tw->tw_dport);
+ srcp = ntohs(tw->tw_sport);
+
+ seq_printf(f, "%4d: %08X:%04X %08X:%04X"
+ " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
+ i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
+ 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
+ atomic_read(&tw->tw_refcnt), tw);
+}
+
+#define TMPSZ 150
+
+static int tcp4_seq_show(struct seq_file *seq, void *v)
+{
+ struct tcp_iter_state *st;
+ struct sock *sk = v;
+
+ seq_setwidth(seq, TMPSZ - 1);
+ if (v == SEQ_START_TOKEN) {
+ seq_puts(seq, " sl local_address rem_address st tx_queue "
+ "rx_queue tr tm->when retrnsmt uid timeout "
+ "inode");
+ goto out;
+ }
+ st = seq->private;
+
+ switch (st->state) {
+ case TCP_SEQ_STATE_LISTENING:
+ case TCP_SEQ_STATE_ESTABLISHED:
+ if (sk->sk_state == TCP_TIME_WAIT)
+ get_timewait4_sock(v, seq, st->num);
+ else
+ get_tcp4_sock(v, seq, st->num);
+ break;
+ case TCP_SEQ_STATE_OPENREQ:
+ get_openreq4(v, seq, st->num, st->uid);
+ break;
+ }
+out:
+ seq_pad(seq, '\n');
+ return 0;
+}
+
+static const struct file_operations tcp_afinfo_seq_fops = {
+ .owner = THIS_MODULE,
+ .open = tcp_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_net
+};
+
+static struct tcp_seq_afinfo tcp4_seq_afinfo = {
+ .name = "tcp",
+ .family = AF_INET,
+ .seq_fops = &tcp_afinfo_seq_fops,
+ .seq_ops = {
+ .show = tcp4_seq_show,
+ },
+};
+
+static int __net_init tcp4_proc_init_net(struct net *net)
+{
+ return tcp_proc_register(net, &tcp4_seq_afinfo);
+}
+
+static void __net_exit tcp4_proc_exit_net(struct net *net)
+{
+ tcp_proc_unregister(net, &tcp4_seq_afinfo);
+}
+
+static struct pernet_operations tcp4_net_ops = {
+ .init = tcp4_proc_init_net,
+ .exit = tcp4_proc_exit_net,
+};
+
+int __init tcp4_proc_init(void)
+{
+ return register_pernet_subsys(&tcp4_net_ops);
+}
+
+void tcp4_proc_exit(void)
+{
+ unregister_pernet_subsys(&tcp4_net_ops);
+}
+#endif /* CONFIG_PROC_FS */
+
+struct proto tcp_prot = {
+ .name = "TCP",
+ .owner = THIS_MODULE,
+ .close = tcp_close,
+ .connect = tcp_v4_connect,
+ .disconnect = tcp_disconnect,
+ .accept = inet_csk_accept,
+ .ioctl = tcp_ioctl,
+ .init = tcp_v4_init_sock,
+ .destroy = tcp_v4_destroy_sock,
+ .shutdown = tcp_shutdown,
+ .setsockopt = tcp_setsockopt,
+ .getsockopt = tcp_getsockopt,
+ .recvmsg = tcp_recvmsg,
+ .sendmsg = tcp_sendmsg,
+ .sendpage = tcp_sendpage,
+ .backlog_rcv = tcp_v4_do_rcv,
+ .release_cb = tcp_release_cb,
+ .hash = inet_hash,
+ .unhash = inet_unhash,
+ .get_port = inet_csk_get_port,
+ .enter_memory_pressure = tcp_enter_memory_pressure,
+ .stream_memory_free = tcp_stream_memory_free,
+ .sockets_allocated = &tcp_sockets_allocated,
+ .orphan_count = &tcp_orphan_count,
+ .memory_allocated = &tcp_memory_allocated,
+ .memory_pressure = &tcp_memory_pressure,
+ .sysctl_mem = sysctl_tcp_mem,
+ .sysctl_wmem = sysctl_tcp_wmem,
+ .sysctl_rmem = sysctl_tcp_rmem,
+ .max_header = MAX_TCP_HEADER,
+ .obj_size = sizeof(struct tcp_sock),
+ .slab_flags = SLAB_DESTROY_BY_RCU,
+ .twsk_prot = &tcp_timewait_sock_ops,
+ .rsk_prot = &tcp_request_sock_ops,
+ .h.hashinfo = &tcp_hashinfo,
+ .no_autobind = true,
+#ifdef CONFIG_COMPAT
+ .compat_setsockopt = compat_tcp_setsockopt,
+ .compat_getsockopt = compat_tcp_getsockopt,
+#endif
+#ifdef CONFIG_MEMCG_KMEM
+ .init_cgroup = tcp_init_cgroup,
+ .destroy_cgroup = tcp_destroy_cgroup,
+ .proto_cgroup = tcp_proto_cgroup,
+#endif
+};
+EXPORT_SYMBOL(tcp_prot);
+
+static void __net_exit tcp_sk_exit(struct net *net)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
+ free_percpu(net->ipv4.tcp_sk);
+}
+
+static int __net_init tcp_sk_init(struct net *net)
+{
+ int res, cpu;
+
+ net->ipv4.tcp_sk = alloc_percpu(struct sock *);
+ if (!net->ipv4.tcp_sk)
+ return -ENOMEM;
+
+ for_each_possible_cpu(cpu) {
+ struct sock *sk;
+
+ res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
+ IPPROTO_TCP, net);
+ if (res)
+ goto fail;
+ *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
+ }
+ net->ipv4.sysctl_tcp_ecn = 2;
+ net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
+ net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
+ net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
+ return 0;
+
+fail:
+ tcp_sk_exit(net);
+
+ return res;
+}
+
+static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
+{
+ inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
+}
+
+static struct pernet_operations __net_initdata tcp_sk_ops = {
+ .init = tcp_sk_init,
+ .exit = tcp_sk_exit,
+ .exit_batch = tcp_sk_exit_batch,
+};
+
+void __init tcp_v4_init(void)
+{
+ inet_hashinfo_init(&tcp_hashinfo);
+ if (register_pernet_subsys(&tcp_sk_ops))
+ panic("Failed to create the TCP control socket.\n");
+}
diff --git a/net/ipv4/tcp_lp.c b/net/ipv4/tcp_lp.c
new file mode 100644
index 000000000..1e70fa8fa
--- /dev/null
+++ b/net/ipv4/tcp_lp.c
@@ -0,0 +1,342 @@
+/*
+ * TCP Low Priority (TCP-LP)
+ *
+ * TCP Low Priority is a distributed algorithm whose goal is to utilize only
+ * the excess network bandwidth as compared to the ``fair share`` of
+ * bandwidth as targeted by TCP.
+ *
+ * As of 2.6.13, Linux supports pluggable congestion control algorithms.
+ * Due to the limitation of the API, we take the following changes from
+ * the original TCP-LP implementation:
+ * o We use newReno in most core CA handling. Only add some checking
+ * within cong_avoid.
+ * o Error correcting in remote HZ, therefore remote HZ will be keeped
+ * on checking and updating.
+ * o Handling calculation of One-Way-Delay (OWD) within rtt_sample, since
+ * OWD have a similar meaning as RTT. Also correct the buggy formular.
+ * o Handle reaction for Early Congestion Indication (ECI) within
+ * pkts_acked, as mentioned within pseudo code.
+ * o OWD is handled in relative format, where local time stamp will in
+ * tcp_time_stamp format.
+ *
+ * Original Author:
+ * Aleksandar Kuzmanovic <akuzma@northwestern.edu>
+ * Available from:
+ * http://www.ece.rice.edu/~akuzma/Doc/akuzma/TCP-LP.pdf
+ * Original implementation for 2.4.19:
+ * http://www-ece.rice.edu/networks/TCP-LP/
+ *
+ * 2.6.x module Authors:
+ * Wong Hoi Sing, Edison <hswong3i@gmail.com>
+ * Hung Hing Lun, Mike <hlhung3i@gmail.com>
+ * SourceForge project page:
+ * http://tcp-lp-mod.sourceforge.net/
+ */
+
+#include <linux/module.h>
+#include <net/tcp.h>
+
+/* resolution of owd */
+#define LP_RESOL 1000
+
+/**
+ * enum tcp_lp_state
+ * @LP_VALID_RHZ: is remote HZ valid?
+ * @LP_VALID_OWD: is OWD valid?
+ * @LP_WITHIN_THR: are we within threshold?
+ * @LP_WITHIN_INF: are we within inference?
+ *
+ * TCP-LP's state flags.
+ * We create this set of state flag mainly for debugging.
+ */
+enum tcp_lp_state {
+ LP_VALID_RHZ = (1 << 0),
+ LP_VALID_OWD = (1 << 1),
+ LP_WITHIN_THR = (1 << 3),
+ LP_WITHIN_INF = (1 << 4),
+};
+
+/**
+ * struct lp
+ * @flag: TCP-LP state flag
+ * @sowd: smoothed OWD << 3
+ * @owd_min: min OWD
+ * @owd_max: max OWD
+ * @owd_max_rsv: resrved max owd
+ * @remote_hz: estimated remote HZ
+ * @remote_ref_time: remote reference time
+ * @local_ref_time: local reference time
+ * @last_drop: time for last active drop
+ * @inference: current inference
+ *
+ * TCP-LP's private struct.
+ * We get the idea from original TCP-LP implementation where only left those we
+ * found are really useful.
+ */
+struct lp {
+ u32 flag;
+ u32 sowd;
+ u32 owd_min;
+ u32 owd_max;
+ u32 owd_max_rsv;
+ u32 remote_hz;
+ u32 remote_ref_time;
+ u32 local_ref_time;
+ u32 last_drop;
+ u32 inference;
+};
+
+/**
+ * tcp_lp_init
+ *
+ * Init all required variables.
+ * Clone the handling from Vegas module implementation.
+ */
+static void tcp_lp_init(struct sock *sk)
+{
+ struct lp *lp = inet_csk_ca(sk);
+
+ lp->flag = 0;
+ lp->sowd = 0;
+ lp->owd_min = 0xffffffff;
+ lp->owd_max = 0;
+ lp->owd_max_rsv = 0;
+ lp->remote_hz = 0;
+ lp->remote_ref_time = 0;
+ lp->local_ref_time = 0;
+ lp->last_drop = 0;
+ lp->inference = 0;
+}
+
+/**
+ * tcp_lp_cong_avoid
+ *
+ * Implementation of cong_avoid.
+ * Will only call newReno CA when away from inference.
+ * From TCP-LP's paper, this will be handled in additive increasement.
+ */
+static void tcp_lp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
+{
+ struct lp *lp = inet_csk_ca(sk);
+
+ if (!(lp->flag & LP_WITHIN_INF))
+ tcp_reno_cong_avoid(sk, ack, acked);
+}
+
+/**
+ * tcp_lp_remote_hz_estimator
+ *
+ * Estimate remote HZ.
+ * We keep on updating the estimated value, where original TCP-LP
+ * implementation only guest it for once and use forever.
+ */
+static u32 tcp_lp_remote_hz_estimator(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct lp *lp = inet_csk_ca(sk);
+ s64 rhz = lp->remote_hz << 6; /* remote HZ << 6 */
+ s64 m = 0;
+
+ /* not yet record reference time
+ * go away!! record it before come back!! */
+ if (lp->remote_ref_time == 0 || lp->local_ref_time == 0)
+ goto out;
+
+ /* we can't calc remote HZ with no different!! */
+ if (tp->rx_opt.rcv_tsval == lp->remote_ref_time ||
+ tp->rx_opt.rcv_tsecr == lp->local_ref_time)
+ goto out;
+
+ m = HZ * (tp->rx_opt.rcv_tsval -
+ lp->remote_ref_time) / (tp->rx_opt.rcv_tsecr -
+ lp->local_ref_time);
+ if (m < 0)
+ m = -m;
+
+ if (rhz > 0) {
+ m -= rhz >> 6; /* m is now error in remote HZ est */
+ rhz += m; /* 63/64 old + 1/64 new */
+ } else
+ rhz = m << 6;
+
+ out:
+ /* record time for successful remote HZ calc */
+ if ((rhz >> 6) > 0)
+ lp->flag |= LP_VALID_RHZ;
+ else
+ lp->flag &= ~LP_VALID_RHZ;
+
+ /* record reference time stamp */
+ lp->remote_ref_time = tp->rx_opt.rcv_tsval;
+ lp->local_ref_time = tp->rx_opt.rcv_tsecr;
+
+ return rhz >> 6;
+}
+
+/**
+ * tcp_lp_owd_calculator
+ *
+ * Calculate one way delay (in relative format).
+ * Original implement OWD as minus of remote time difference to local time
+ * difference directly. As this time difference just simply equal to RTT, when
+ * the network status is stable, remote RTT will equal to local RTT, and result
+ * OWD into zero.
+ * It seems to be a bug and so we fixed it.
+ */
+static u32 tcp_lp_owd_calculator(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct lp *lp = inet_csk_ca(sk);
+ s64 owd = 0;
+
+ lp->remote_hz = tcp_lp_remote_hz_estimator(sk);
+
+ if (lp->flag & LP_VALID_RHZ) {
+ owd =
+ tp->rx_opt.rcv_tsval * (LP_RESOL / lp->remote_hz) -
+ tp->rx_opt.rcv_tsecr * (LP_RESOL / HZ);
+ if (owd < 0)
+ owd = -owd;
+ }
+
+ if (owd > 0)
+ lp->flag |= LP_VALID_OWD;
+ else
+ lp->flag &= ~LP_VALID_OWD;
+
+ return owd;
+}
+
+/**
+ * tcp_lp_rtt_sample
+ *
+ * Implementation or rtt_sample.
+ * Will take the following action,
+ * 1. calc OWD,
+ * 2. record the min/max OWD,
+ * 3. calc smoothed OWD (SOWD).
+ * Most ideas come from the original TCP-LP implementation.
+ */
+static void tcp_lp_rtt_sample(struct sock *sk, u32 rtt)
+{
+ struct lp *lp = inet_csk_ca(sk);
+ s64 mowd = tcp_lp_owd_calculator(sk);
+
+ /* sorry that we don't have valid data */
+ if (!(lp->flag & LP_VALID_RHZ) || !(lp->flag & LP_VALID_OWD))
+ return;
+
+ /* record the next min owd */
+ if (mowd < lp->owd_min)
+ lp->owd_min = mowd;
+
+ /* always forget the max of the max
+ * we just set owd_max as one below it */
+ if (mowd > lp->owd_max) {
+ if (mowd > lp->owd_max_rsv) {
+ if (lp->owd_max_rsv == 0)
+ lp->owd_max = mowd;
+ else
+ lp->owd_max = lp->owd_max_rsv;
+ lp->owd_max_rsv = mowd;
+ } else
+ lp->owd_max = mowd;
+ }
+
+ /* calc for smoothed owd */
+ if (lp->sowd != 0) {
+ mowd -= lp->sowd >> 3; /* m is now error in owd est */
+ lp->sowd += mowd; /* owd = 7/8 owd + 1/8 new */
+ } else
+ lp->sowd = mowd << 3; /* take the measured time be owd */
+}
+
+/**
+ * tcp_lp_pkts_acked
+ *
+ * Implementation of pkts_acked.
+ * Deal with active drop under Early Congestion Indication.
+ * Only drop to half and 1 will be handle, because we hope to use back
+ * newReno in increase case.
+ * We work it out by following the idea from TCP-LP's paper directly
+ */
+static void tcp_lp_pkts_acked(struct sock *sk, u32 num_acked, s32 rtt_us)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct lp *lp = inet_csk_ca(sk);
+
+ if (rtt_us > 0)
+ tcp_lp_rtt_sample(sk, rtt_us);
+
+ /* calc inference */
+ if (tcp_time_stamp > tp->rx_opt.rcv_tsecr)
+ lp->inference = 3 * (tcp_time_stamp - tp->rx_opt.rcv_tsecr);
+
+ /* test if within inference */
+ if (lp->last_drop && (tcp_time_stamp - lp->last_drop < lp->inference))
+ lp->flag |= LP_WITHIN_INF;
+ else
+ lp->flag &= ~LP_WITHIN_INF;
+
+ /* test if within threshold */
+ if (lp->sowd >> 3 <
+ lp->owd_min + 15 * (lp->owd_max - lp->owd_min) / 100)
+ lp->flag |= LP_WITHIN_THR;
+ else
+ lp->flag &= ~LP_WITHIN_THR;
+
+ pr_debug("TCP-LP: %05o|%5u|%5u|%15u|%15u|%15u\n", lp->flag,
+ tp->snd_cwnd, lp->remote_hz, lp->owd_min, lp->owd_max,
+ lp->sowd >> 3);
+
+ if (lp->flag & LP_WITHIN_THR)
+ return;
+
+ /* FIXME: try to reset owd_min and owd_max here
+ * so decrease the chance the min/max is no longer suitable
+ * and will usually within threshold when whithin inference */
+ lp->owd_min = lp->sowd >> 3;
+ lp->owd_max = lp->sowd >> 2;
+ lp->owd_max_rsv = lp->sowd >> 2;
+
+ /* happened within inference
+ * drop snd_cwnd into 1 */
+ if (lp->flag & LP_WITHIN_INF)
+ tp->snd_cwnd = 1U;
+
+ /* happened after inference
+ * cut snd_cwnd into half */
+ else
+ tp->snd_cwnd = max(tp->snd_cwnd >> 1U, 1U);
+
+ /* record this drop time */
+ lp->last_drop = tcp_time_stamp;
+}
+
+static struct tcp_congestion_ops tcp_lp __read_mostly = {
+ .init = tcp_lp_init,
+ .ssthresh = tcp_reno_ssthresh,
+ .cong_avoid = tcp_lp_cong_avoid,
+ .pkts_acked = tcp_lp_pkts_acked,
+
+ .owner = THIS_MODULE,
+ .name = "lp"
+};
+
+static int __init tcp_lp_register(void)
+{
+ BUILD_BUG_ON(sizeof(struct lp) > ICSK_CA_PRIV_SIZE);
+ return tcp_register_congestion_control(&tcp_lp);
+}
+
+static void __exit tcp_lp_unregister(void)
+{
+ tcp_unregister_congestion_control(&tcp_lp);
+}
+
+module_init(tcp_lp_register);
+module_exit(tcp_lp_unregister);
+
+MODULE_AUTHOR("Wong Hoi Sing Edison, Hung Hing Lun Mike");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("TCP Low Priority");
diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c
new file mode 100644
index 000000000..2379c1b4e
--- /dev/null
+++ b/net/ipv4/tcp_memcontrol.c
@@ -0,0 +1,233 @@
+#include <net/tcp.h>
+#include <net/tcp_memcontrol.h>
+#include <net/sock.h>
+#include <net/ip.h>
+#include <linux/nsproxy.h>
+#include <linux/memcontrol.h>
+#include <linux/module.h>
+
+int tcp_init_cgroup(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
+{
+ /*
+ * The root cgroup does not use page_counters, but rather,
+ * rely on the data already collected by the network
+ * subsystem
+ */
+ struct mem_cgroup *parent = parent_mem_cgroup(memcg);
+ struct page_counter *counter_parent = NULL;
+ struct cg_proto *cg_proto, *parent_cg;
+
+ cg_proto = tcp_prot.proto_cgroup(memcg);
+ if (!cg_proto)
+ return 0;
+
+ cg_proto->sysctl_mem[0] = sysctl_tcp_mem[0];
+ cg_proto->sysctl_mem[1] = sysctl_tcp_mem[1];
+ cg_proto->sysctl_mem[2] = sysctl_tcp_mem[2];
+ cg_proto->memory_pressure = 0;
+ cg_proto->memcg = memcg;
+
+ parent_cg = tcp_prot.proto_cgroup(parent);
+ if (parent_cg)
+ counter_parent = &parent_cg->memory_allocated;
+
+ page_counter_init(&cg_proto->memory_allocated, counter_parent);
+ percpu_counter_init(&cg_proto->sockets_allocated, 0, GFP_KERNEL);
+
+ return 0;
+}
+EXPORT_SYMBOL(tcp_init_cgroup);
+
+void tcp_destroy_cgroup(struct mem_cgroup *memcg)
+{
+ struct cg_proto *cg_proto;
+
+ cg_proto = tcp_prot.proto_cgroup(memcg);
+ if (!cg_proto)
+ return;
+
+ percpu_counter_destroy(&cg_proto->sockets_allocated);
+
+ if (test_bit(MEMCG_SOCK_ACTIVATED, &cg_proto->flags))
+ static_key_slow_dec(&memcg_socket_limit_enabled);
+
+}
+EXPORT_SYMBOL(tcp_destroy_cgroup);
+
+static int tcp_update_limit(struct mem_cgroup *memcg, unsigned long nr_pages)
+{
+ struct cg_proto *cg_proto;
+ int i;
+ int ret;
+
+ cg_proto = tcp_prot.proto_cgroup(memcg);
+ if (!cg_proto)
+ return -EINVAL;
+
+ ret = page_counter_limit(&cg_proto->memory_allocated, nr_pages);
+ if (ret)
+ return ret;
+
+ for (i = 0; i < 3; i++)
+ cg_proto->sysctl_mem[i] = min_t(long, nr_pages,
+ sysctl_tcp_mem[i]);
+
+ if (nr_pages == PAGE_COUNTER_MAX)
+ clear_bit(MEMCG_SOCK_ACTIVE, &cg_proto->flags);
+ else {
+ /*
+ * The active bit needs to be written after the static_key
+ * update. This is what guarantees that the socket activation
+ * function is the last one to run. See sock_update_memcg() for
+ * details, and note that we don't mark any socket as belonging
+ * to this memcg until that flag is up.
+ *
+ * We need to do this, because static_keys will span multiple
+ * sites, but we can't control their order. If we mark a socket
+ * as accounted, but the accounting functions are not patched in
+ * yet, we'll lose accounting.
+ *
+ * We never race with the readers in sock_update_memcg(),
+ * because when this value change, the code to process it is not
+ * patched in yet.
+ *
+ * The activated bit is used to guarantee that no two writers
+ * will do the update in the same memcg. Without that, we can't
+ * properly shutdown the static key.
+ */
+ if (!test_and_set_bit(MEMCG_SOCK_ACTIVATED, &cg_proto->flags))
+ static_key_slow_inc(&memcg_socket_limit_enabled);
+ set_bit(MEMCG_SOCK_ACTIVE, &cg_proto->flags);
+ }
+
+ return 0;
+}
+
+enum {
+ RES_USAGE,
+ RES_LIMIT,
+ RES_MAX_USAGE,
+ RES_FAILCNT,
+};
+
+static DEFINE_MUTEX(tcp_limit_mutex);
+
+static ssize_t tcp_cgroup_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+ unsigned long nr_pages;
+ int ret = 0;
+
+ buf = strstrip(buf);
+
+ switch (of_cft(of)->private) {
+ case RES_LIMIT:
+ /* see memcontrol.c */
+ ret = page_counter_memparse(buf, "-1", &nr_pages);
+ if (ret)
+ break;
+ mutex_lock(&tcp_limit_mutex);
+ ret = tcp_update_limit(memcg, nr_pages);
+ mutex_unlock(&tcp_limit_mutex);
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+ return ret ?: nbytes;
+}
+
+static u64 tcp_cgroup_read(struct cgroup_subsys_state *css, struct cftype *cft)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+ struct cg_proto *cg_proto = tcp_prot.proto_cgroup(memcg);
+ u64 val;
+
+ switch (cft->private) {
+ case RES_LIMIT:
+ if (!cg_proto)
+ return PAGE_COUNTER_MAX;
+ val = cg_proto->memory_allocated.limit;
+ val *= PAGE_SIZE;
+ break;
+ case RES_USAGE:
+ if (!cg_proto)
+ val = atomic_long_read(&tcp_memory_allocated);
+ else
+ val = page_counter_read(&cg_proto->memory_allocated);
+ val *= PAGE_SIZE;
+ break;
+ case RES_FAILCNT:
+ if (!cg_proto)
+ return 0;
+ val = cg_proto->memory_allocated.failcnt;
+ break;
+ case RES_MAX_USAGE:
+ if (!cg_proto)
+ return 0;
+ val = cg_proto->memory_allocated.watermark;
+ val *= PAGE_SIZE;
+ break;
+ default:
+ BUG();
+ }
+ return val;
+}
+
+static ssize_t tcp_cgroup_reset(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct mem_cgroup *memcg;
+ struct cg_proto *cg_proto;
+
+ memcg = mem_cgroup_from_css(of_css(of));
+ cg_proto = tcp_prot.proto_cgroup(memcg);
+ if (!cg_proto)
+ return nbytes;
+
+ switch (of_cft(of)->private) {
+ case RES_MAX_USAGE:
+ page_counter_reset_watermark(&cg_proto->memory_allocated);
+ break;
+ case RES_FAILCNT:
+ cg_proto->memory_allocated.failcnt = 0;
+ break;
+ }
+
+ return nbytes;
+}
+
+static struct cftype tcp_files[] = {
+ {
+ .name = "kmem.tcp.limit_in_bytes",
+ .write = tcp_cgroup_write,
+ .read_u64 = tcp_cgroup_read,
+ .private = RES_LIMIT,
+ },
+ {
+ .name = "kmem.tcp.usage_in_bytes",
+ .read_u64 = tcp_cgroup_read,
+ .private = RES_USAGE,
+ },
+ {
+ .name = "kmem.tcp.failcnt",
+ .private = RES_FAILCNT,
+ .write = tcp_cgroup_reset,
+ .read_u64 = tcp_cgroup_read,
+ },
+ {
+ .name = "kmem.tcp.max_usage_in_bytes",
+ .private = RES_MAX_USAGE,
+ .write = tcp_cgroup_reset,
+ .read_u64 = tcp_cgroup_read,
+ },
+ { } /* terminate */
+};
+
+static int __init tcp_memcontrol_init(void)
+{
+ WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, tcp_files));
+ return 0;
+}
+__initcall(tcp_memcontrol_init);
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
new file mode 100644
index 000000000..a51d63a43
--- /dev/null
+++ b/net/ipv4/tcp_metrics.c
@@ -0,0 +1,1198 @@
+#include <linux/rcupdate.h>
+#include <linux/spinlock.h>
+#include <linux/jiffies.h>
+#include <linux/module.h>
+#include <linux/cache.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/tcp.h>
+#include <linux/hash.h>
+#include <linux/tcp_metrics.h>
+#include <linux/vmalloc.h>
+
+#include <net/inet_connection_sock.h>
+#include <net/net_namespace.h>
+#include <net/request_sock.h>
+#include <net/inetpeer.h>
+#include <net/sock.h>
+#include <net/ipv6.h>
+#include <net/dst.h>
+#include <net/tcp.h>
+#include <net/genetlink.h>
+
+int sysctl_tcp_nometrics_save __read_mostly;
+
+static struct tcp_metrics_block *__tcp_get_metrics(const struct inetpeer_addr *saddr,
+ const struct inetpeer_addr *daddr,
+ struct net *net, unsigned int hash);
+
+struct tcp_fastopen_metrics {
+ u16 mss;
+ u16 syn_loss:10, /* Recurring Fast Open SYN losses */
+ try_exp:2; /* Request w/ exp. option (once) */
+ unsigned long last_syn_loss; /* Last Fast Open SYN loss */
+ struct tcp_fastopen_cookie cookie;
+};
+
+/* TCP_METRIC_MAX includes 2 extra fields for userspace compatibility
+ * Kernel only stores RTT and RTTVAR in usec resolution
+ */
+#define TCP_METRIC_MAX_KERNEL (TCP_METRIC_MAX - 2)
+
+struct tcp_metrics_block {
+ struct tcp_metrics_block __rcu *tcpm_next;
+ possible_net_t tcpm_net;
+ struct inetpeer_addr tcpm_saddr;
+ struct inetpeer_addr tcpm_daddr;
+ unsigned long tcpm_stamp;
+ u32 tcpm_ts;
+ u32 tcpm_ts_stamp;
+ u32 tcpm_lock;
+ u32 tcpm_vals[TCP_METRIC_MAX_KERNEL + 1];
+ struct tcp_fastopen_metrics tcpm_fastopen;
+
+ struct rcu_head rcu_head;
+};
+
+static inline struct net *tm_net(struct tcp_metrics_block *tm)
+{
+ return read_pnet(&tm->tcpm_net);
+}
+
+static bool tcp_metric_locked(struct tcp_metrics_block *tm,
+ enum tcp_metric_index idx)
+{
+ return tm->tcpm_lock & (1 << idx);
+}
+
+static u32 tcp_metric_get(struct tcp_metrics_block *tm,
+ enum tcp_metric_index idx)
+{
+ return tm->tcpm_vals[idx];
+}
+
+static void tcp_metric_set(struct tcp_metrics_block *tm,
+ enum tcp_metric_index idx,
+ u32 val)
+{
+ tm->tcpm_vals[idx] = val;
+}
+
+static bool addr_same(const struct inetpeer_addr *a,
+ const struct inetpeer_addr *b)
+{
+ if (a->family != b->family)
+ return false;
+ if (a->family == AF_INET)
+ return a->addr.a4 == b->addr.a4;
+ return ipv6_addr_equal(&a->addr.in6, &b->addr.in6);
+}
+
+struct tcpm_hash_bucket {
+ struct tcp_metrics_block __rcu *chain;
+};
+
+static struct tcpm_hash_bucket *tcp_metrics_hash __read_mostly;
+static unsigned int tcp_metrics_hash_log __read_mostly;
+
+static DEFINE_SPINLOCK(tcp_metrics_lock);
+
+static void tcpm_suck_dst(struct tcp_metrics_block *tm,
+ const struct dst_entry *dst,
+ bool fastopen_clear)
+{
+ u32 msval;
+ u32 val;
+
+ tm->tcpm_stamp = jiffies;
+
+ val = 0;
+ if (dst_metric_locked(dst, RTAX_RTT))
+ val |= 1 << TCP_METRIC_RTT;
+ if (dst_metric_locked(dst, RTAX_RTTVAR))
+ val |= 1 << TCP_METRIC_RTTVAR;
+ if (dst_metric_locked(dst, RTAX_SSTHRESH))
+ val |= 1 << TCP_METRIC_SSTHRESH;
+ if (dst_metric_locked(dst, RTAX_CWND))
+ val |= 1 << TCP_METRIC_CWND;
+ if (dst_metric_locked(dst, RTAX_REORDERING))
+ val |= 1 << TCP_METRIC_REORDERING;
+ tm->tcpm_lock = val;
+
+ msval = dst_metric_raw(dst, RTAX_RTT);
+ tm->tcpm_vals[TCP_METRIC_RTT] = msval * USEC_PER_MSEC;
+
+ msval = dst_metric_raw(dst, RTAX_RTTVAR);
+ tm->tcpm_vals[TCP_METRIC_RTTVAR] = msval * USEC_PER_MSEC;
+ tm->tcpm_vals[TCP_METRIC_SSTHRESH] = dst_metric_raw(dst, RTAX_SSTHRESH);
+ tm->tcpm_vals[TCP_METRIC_CWND] = dst_metric_raw(dst, RTAX_CWND);
+ tm->tcpm_vals[TCP_METRIC_REORDERING] = dst_metric_raw(dst, RTAX_REORDERING);
+ tm->tcpm_ts = 0;
+ tm->tcpm_ts_stamp = 0;
+ if (fastopen_clear) {
+ tm->tcpm_fastopen.mss = 0;
+ tm->tcpm_fastopen.syn_loss = 0;
+ tm->tcpm_fastopen.try_exp = 0;
+ tm->tcpm_fastopen.cookie.exp = false;
+ tm->tcpm_fastopen.cookie.len = 0;
+ }
+}
+
+#define TCP_METRICS_TIMEOUT (60 * 60 * HZ)
+
+static void tcpm_check_stamp(struct tcp_metrics_block *tm, struct dst_entry *dst)
+{
+ if (tm && unlikely(time_after(jiffies, tm->tcpm_stamp + TCP_METRICS_TIMEOUT)))
+ tcpm_suck_dst(tm, dst, false);
+}
+
+#define TCP_METRICS_RECLAIM_DEPTH 5
+#define TCP_METRICS_RECLAIM_PTR (struct tcp_metrics_block *) 0x1UL
+
+#define deref_locked(p) \
+ rcu_dereference_protected(p, lockdep_is_held(&tcp_metrics_lock))
+
+static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst,
+ struct inetpeer_addr *saddr,
+ struct inetpeer_addr *daddr,
+ unsigned int hash)
+{
+ struct tcp_metrics_block *tm;
+ struct net *net;
+ bool reclaim = false;
+
+ spin_lock_bh(&tcp_metrics_lock);
+ net = dev_net(dst->dev);
+
+ /* While waiting for the spin-lock the cache might have been populated
+ * with this entry and so we have to check again.
+ */
+ tm = __tcp_get_metrics(saddr, daddr, net, hash);
+ if (tm == TCP_METRICS_RECLAIM_PTR) {
+ reclaim = true;
+ tm = NULL;
+ }
+ if (tm) {
+ tcpm_check_stamp(tm, dst);
+ goto out_unlock;
+ }
+
+ if (unlikely(reclaim)) {
+ struct tcp_metrics_block *oldest;
+
+ oldest = deref_locked(tcp_metrics_hash[hash].chain);
+ for (tm = deref_locked(oldest->tcpm_next); tm;
+ tm = deref_locked(tm->tcpm_next)) {
+ if (time_before(tm->tcpm_stamp, oldest->tcpm_stamp))
+ oldest = tm;
+ }
+ tm = oldest;
+ } else {
+ tm = kmalloc(sizeof(*tm), GFP_ATOMIC);
+ if (!tm)
+ goto out_unlock;
+ }
+ write_pnet(&tm->tcpm_net, net);
+ tm->tcpm_saddr = *saddr;
+ tm->tcpm_daddr = *daddr;
+
+ tcpm_suck_dst(tm, dst, true);
+
+ if (likely(!reclaim)) {
+ tm->tcpm_next = tcp_metrics_hash[hash].chain;
+ rcu_assign_pointer(tcp_metrics_hash[hash].chain, tm);
+ }
+
+out_unlock:
+ spin_unlock_bh(&tcp_metrics_lock);
+ return tm;
+}
+
+static struct tcp_metrics_block *tcp_get_encode(struct tcp_metrics_block *tm, int depth)
+{
+ if (tm)
+ return tm;
+ if (depth > TCP_METRICS_RECLAIM_DEPTH)
+ return TCP_METRICS_RECLAIM_PTR;
+ return NULL;
+}
+
+static struct tcp_metrics_block *__tcp_get_metrics(const struct inetpeer_addr *saddr,
+ const struct inetpeer_addr *daddr,
+ struct net *net, unsigned int hash)
+{
+ struct tcp_metrics_block *tm;
+ int depth = 0;
+
+ for (tm = rcu_dereference(tcp_metrics_hash[hash].chain); tm;
+ tm = rcu_dereference(tm->tcpm_next)) {
+ if (addr_same(&tm->tcpm_saddr, saddr) &&
+ addr_same(&tm->tcpm_daddr, daddr) &&
+ net_eq(tm_net(tm), net))
+ break;
+ depth++;
+ }
+ return tcp_get_encode(tm, depth);
+}
+
+static struct tcp_metrics_block *__tcp_get_metrics_req(struct request_sock *req,
+ struct dst_entry *dst)
+{
+ struct tcp_metrics_block *tm;
+ struct inetpeer_addr saddr, daddr;
+ unsigned int hash;
+ struct net *net;
+
+ saddr.family = req->rsk_ops->family;
+ daddr.family = req->rsk_ops->family;
+ switch (daddr.family) {
+ case AF_INET:
+ saddr.addr.a4 = inet_rsk(req)->ir_loc_addr;
+ daddr.addr.a4 = inet_rsk(req)->ir_rmt_addr;
+ hash = (__force unsigned int) daddr.addr.a4;
+ break;
+#if IS_ENABLED(CONFIG_IPV6)
+ case AF_INET6:
+ saddr.addr.in6 = inet_rsk(req)->ir_v6_loc_addr;
+ daddr.addr.in6 = inet_rsk(req)->ir_v6_rmt_addr;
+ hash = ipv6_addr_hash(&inet_rsk(req)->ir_v6_rmt_addr);
+ break;
+#endif
+ default:
+ return NULL;
+ }
+
+ net = dev_net(dst->dev);
+ hash ^= net_hash_mix(net);
+ hash = hash_32(hash, tcp_metrics_hash_log);
+
+ for (tm = rcu_dereference(tcp_metrics_hash[hash].chain); tm;
+ tm = rcu_dereference(tm->tcpm_next)) {
+ if (addr_same(&tm->tcpm_saddr, &saddr) &&
+ addr_same(&tm->tcpm_daddr, &daddr) &&
+ net_eq(tm_net(tm), net))
+ break;
+ }
+ tcpm_check_stamp(tm, dst);
+ return tm;
+}
+
+static struct tcp_metrics_block *__tcp_get_metrics_tw(struct inet_timewait_sock *tw)
+{
+ struct tcp_metrics_block *tm;
+ struct inetpeer_addr saddr, daddr;
+ unsigned int hash;
+ struct net *net;
+
+ if (tw->tw_family == AF_INET) {
+ saddr.family = AF_INET;
+ saddr.addr.a4 = tw->tw_rcv_saddr;
+ daddr.family = AF_INET;
+ daddr.addr.a4 = tw->tw_daddr;
+ hash = (__force unsigned int) daddr.addr.a4;
+ }
+#if IS_ENABLED(CONFIG_IPV6)
+ else if (tw->tw_family == AF_INET6) {
+ if (ipv6_addr_v4mapped(&tw->tw_v6_daddr)) {
+ saddr.family = AF_INET;
+ saddr.addr.a4 = tw->tw_rcv_saddr;
+ daddr.family = AF_INET;
+ daddr.addr.a4 = tw->tw_daddr;
+ hash = (__force unsigned int) daddr.addr.a4;
+ } else {
+ saddr.family = AF_INET6;
+ saddr.addr.in6 = tw->tw_v6_rcv_saddr;
+ daddr.family = AF_INET6;
+ daddr.addr.in6 = tw->tw_v6_daddr;
+ hash = ipv6_addr_hash(&tw->tw_v6_daddr);
+ }
+ }
+#endif
+ else
+ return NULL;
+
+ net = twsk_net(tw);
+ hash ^= net_hash_mix(net);
+ hash = hash_32(hash, tcp_metrics_hash_log);
+
+ for (tm = rcu_dereference(tcp_metrics_hash[hash].chain); tm;
+ tm = rcu_dereference(tm->tcpm_next)) {
+ if (addr_same(&tm->tcpm_saddr, &saddr) &&
+ addr_same(&tm->tcpm_daddr, &daddr) &&
+ net_eq(tm_net(tm), net))
+ break;
+ }
+ return tm;
+}
+
+static struct tcp_metrics_block *tcp_get_metrics(struct sock *sk,
+ struct dst_entry *dst,
+ bool create)
+{
+ struct tcp_metrics_block *tm;
+ struct inetpeer_addr saddr, daddr;
+ unsigned int hash;
+ struct net *net;
+
+ if (sk->sk_family == AF_INET) {
+ saddr.family = AF_INET;
+ saddr.addr.a4 = inet_sk(sk)->inet_saddr;
+ daddr.family = AF_INET;
+ daddr.addr.a4 = inet_sk(sk)->inet_daddr;
+ hash = (__force unsigned int) daddr.addr.a4;
+ }
+#if IS_ENABLED(CONFIG_IPV6)
+ else if (sk->sk_family == AF_INET6) {
+ if (ipv6_addr_v4mapped(&sk->sk_v6_daddr)) {
+ saddr.family = AF_INET;
+ saddr.addr.a4 = inet_sk(sk)->inet_saddr;
+ daddr.family = AF_INET;
+ daddr.addr.a4 = inet_sk(sk)->inet_daddr;
+ hash = (__force unsigned int) daddr.addr.a4;
+ } else {
+ saddr.family = AF_INET6;
+ saddr.addr.in6 = sk->sk_v6_rcv_saddr;
+ daddr.family = AF_INET6;
+ daddr.addr.in6 = sk->sk_v6_daddr;
+ hash = ipv6_addr_hash(&sk->sk_v6_daddr);
+ }
+ }
+#endif
+ else
+ return NULL;
+
+ net = dev_net(dst->dev);
+ hash ^= net_hash_mix(net);
+ hash = hash_32(hash, tcp_metrics_hash_log);
+
+ tm = __tcp_get_metrics(&saddr, &daddr, net, hash);
+ if (tm == TCP_METRICS_RECLAIM_PTR)
+ tm = NULL;
+ if (!tm && create)
+ tm = tcpm_new(dst, &saddr, &daddr, hash);
+ else
+ tcpm_check_stamp(tm, dst);
+
+ return tm;
+}
+
+/* Save metrics learned by this TCP session. This function is called
+ * only, when TCP finishes successfully i.e. when it enters TIME-WAIT
+ * or goes from LAST-ACK to CLOSE.
+ */
+void tcp_update_metrics(struct sock *sk)
+{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+ struct dst_entry *dst = __sk_dst_get(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct tcp_metrics_block *tm;
+ unsigned long rtt;
+ u32 val;
+ int m;
+
+ if (sysctl_tcp_nometrics_save || !dst)
+ return;
+
+ if (dst->flags & DST_HOST)
+ dst_confirm(dst);
+
+ rcu_read_lock();
+ if (icsk->icsk_backoff || !tp->srtt_us) {
+ /* This session failed to estimate rtt. Why?
+ * Probably, no packets returned in time. Reset our
+ * results.
+ */
+ tm = tcp_get_metrics(sk, dst, false);
+ if (tm && !tcp_metric_locked(tm, TCP_METRIC_RTT))
+ tcp_metric_set(tm, TCP_METRIC_RTT, 0);
+ goto out_unlock;
+ } else
+ tm = tcp_get_metrics(sk, dst, true);
+
+ if (!tm)
+ goto out_unlock;
+
+ rtt = tcp_metric_get(tm, TCP_METRIC_RTT);
+ m = rtt - tp->srtt_us;
+
+ /* If newly calculated rtt larger than stored one, store new
+ * one. Otherwise, use EWMA. Remember, rtt overestimation is
+ * always better than underestimation.
+ */
+ if (!tcp_metric_locked(tm, TCP_METRIC_RTT)) {
+ if (m <= 0)
+ rtt = tp->srtt_us;
+ else
+ rtt -= (m >> 3);
+ tcp_metric_set(tm, TCP_METRIC_RTT, rtt);
+ }
+
+ if (!tcp_metric_locked(tm, TCP_METRIC_RTTVAR)) {
+ unsigned long var;
+
+ if (m < 0)
+ m = -m;
+
+ /* Scale deviation to rttvar fixed point */
+ m >>= 1;
+ if (m < tp->mdev_us)
+ m = tp->mdev_us;
+
+ var = tcp_metric_get(tm, TCP_METRIC_RTTVAR);
+ if (m >= var)
+ var = m;
+ else
+ var -= (var - m) >> 2;
+
+ tcp_metric_set(tm, TCP_METRIC_RTTVAR, var);
+ }
+
+ if (tcp_in_initial_slowstart(tp)) {
+ /* Slow start still did not finish. */
+ if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) {
+ val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH);
+ if (val && (tp->snd_cwnd >> 1) > val)
+ tcp_metric_set(tm, TCP_METRIC_SSTHRESH,
+ tp->snd_cwnd >> 1);
+ }
+ if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) {
+ val = tcp_metric_get(tm, TCP_METRIC_CWND);
+ if (tp->snd_cwnd > val)
+ tcp_metric_set(tm, TCP_METRIC_CWND,
+ tp->snd_cwnd);
+ }
+ } else if (tp->snd_cwnd > tp->snd_ssthresh &&
+ icsk->icsk_ca_state == TCP_CA_Open) {
+ /* Cong. avoidance phase, cwnd is reliable. */
+ if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH))
+ tcp_metric_set(tm, TCP_METRIC_SSTHRESH,
+ max(tp->snd_cwnd >> 1, tp->snd_ssthresh));
+ if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) {
+ val = tcp_metric_get(tm, TCP_METRIC_CWND);
+ tcp_metric_set(tm, TCP_METRIC_CWND, (val + tp->snd_cwnd) >> 1);
+ }
+ } else {
+ /* Else slow start did not finish, cwnd is non-sense,
+ * ssthresh may be also invalid.
+ */
+ if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) {
+ val = tcp_metric_get(tm, TCP_METRIC_CWND);
+ tcp_metric_set(tm, TCP_METRIC_CWND,
+ (val + tp->snd_ssthresh) >> 1);
+ }
+ if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) {
+ val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH);
+ if (val && tp->snd_ssthresh > val)
+ tcp_metric_set(tm, TCP_METRIC_SSTHRESH,
+ tp->snd_ssthresh);
+ }
+ if (!tcp_metric_locked(tm, TCP_METRIC_REORDERING)) {
+ val = tcp_metric_get(tm, TCP_METRIC_REORDERING);
+ if (val < tp->reordering &&
+ tp->reordering != sysctl_tcp_reordering)
+ tcp_metric_set(tm, TCP_METRIC_REORDERING,
+ tp->reordering);
+ }
+ }
+ tm->tcpm_stamp = jiffies;
+out_unlock:
+ rcu_read_unlock();
+}
+
+/* Initialize metrics on socket. */
+
+void tcp_init_metrics(struct sock *sk)
+{
+ struct dst_entry *dst = __sk_dst_get(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct tcp_metrics_block *tm;
+ u32 val, crtt = 0; /* cached RTT scaled by 8 */
+
+ if (!dst)
+ goto reset;
+
+ dst_confirm(dst);
+
+ rcu_read_lock();
+ tm = tcp_get_metrics(sk, dst, true);
+ if (!tm) {
+ rcu_read_unlock();
+ goto reset;
+ }
+
+ if (tcp_metric_locked(tm, TCP_METRIC_CWND))
+ tp->snd_cwnd_clamp = tcp_metric_get(tm, TCP_METRIC_CWND);
+
+ val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH);
+ if (val) {
+ tp->snd_ssthresh = val;
+ if (tp->snd_ssthresh > tp->snd_cwnd_clamp)
+ tp->snd_ssthresh = tp->snd_cwnd_clamp;
+ } else {
+ /* ssthresh may have been reduced unnecessarily during.
+ * 3WHS. Restore it back to its initial default.
+ */
+ tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
+ }
+ val = tcp_metric_get(tm, TCP_METRIC_REORDERING);
+ if (val && tp->reordering != val) {
+ tcp_disable_fack(tp);
+ tcp_disable_early_retrans(tp);
+ tp->reordering = val;
+ }
+
+ crtt = tcp_metric_get(tm, TCP_METRIC_RTT);
+ rcu_read_unlock();
+reset:
+ /* The initial RTT measurement from the SYN/SYN-ACK is not ideal
+ * to seed the RTO for later data packets because SYN packets are
+ * small. Use the per-dst cached values to seed the RTO but keep
+ * the RTT estimator variables intact (e.g., srtt, mdev, rttvar).
+ * Later the RTO will be updated immediately upon obtaining the first
+ * data RTT sample (tcp_rtt_estimator()). Hence the cached RTT only
+ * influences the first RTO but not later RTT estimation.
+ *
+ * But if RTT is not available from the SYN (due to retransmits or
+ * syn cookies) or the cache, force a conservative 3secs timeout.
+ *
+ * A bit of theory. RTT is time passed after "normal" sized packet
+ * is sent until it is ACKed. In normal circumstances sending small
+ * packets force peer to delay ACKs and calculation is correct too.
+ * The algorithm is adaptive and, provided we follow specs, it
+ * NEVER underestimate RTT. BUT! If peer tries to make some clever
+ * tricks sort of "quick acks" for time long enough to decrease RTT
+ * to low value, and then abruptly stops to do it and starts to delay
+ * ACKs, wait for troubles.
+ */
+ if (crtt > tp->srtt_us) {
+ /* Set RTO like tcp_rtt_estimator(), but from cached RTT. */
+ crtt /= 8 * USEC_PER_MSEC;
+ inet_csk(sk)->icsk_rto = crtt + max(2 * crtt, tcp_rto_min(sk));
+ } else if (tp->srtt_us == 0) {
+ /* RFC6298: 5.7 We've failed to get a valid RTT sample from
+ * 3WHS. This is most likely due to retransmission,
+ * including spurious one. Reset the RTO back to 3secs
+ * from the more aggressive 1sec to avoid more spurious
+ * retransmission.
+ */
+ tp->rttvar_us = jiffies_to_usecs(TCP_TIMEOUT_FALLBACK);
+ tp->mdev_us = tp->mdev_max_us = tp->rttvar_us;
+
+ inet_csk(sk)->icsk_rto = TCP_TIMEOUT_FALLBACK;
+ }
+ /* Cut cwnd down to 1 per RFC5681 if SYN or SYN-ACK has been
+ * retransmitted. In light of RFC6298 more aggressive 1sec
+ * initRTO, we only reset cwnd when more than 1 SYN/SYN-ACK
+ * retransmission has occurred.
+ */
+ if (tp->total_retrans > 1)
+ tp->snd_cwnd = 1;
+ else
+ tp->snd_cwnd = tcp_init_cwnd(tp, dst);
+ tp->snd_cwnd_stamp = tcp_time_stamp;
+}
+
+bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst,
+ bool paws_check, bool timestamps)
+{
+ struct tcp_metrics_block *tm;
+ bool ret;
+
+ if (!dst)
+ return false;
+
+ rcu_read_lock();
+ tm = __tcp_get_metrics_req(req, dst);
+ if (paws_check) {
+ if (tm &&
+ (u32)get_seconds() - tm->tcpm_ts_stamp < TCP_PAWS_MSL &&
+ ((s32)(tm->tcpm_ts - req->ts_recent) > TCP_PAWS_WINDOW ||
+ !timestamps))
+ ret = false;
+ else
+ ret = true;
+ } else {
+ if (tm && tcp_metric_get(tm, TCP_METRIC_RTT) && tm->tcpm_ts_stamp)
+ ret = true;
+ else
+ ret = false;
+ }
+ rcu_read_unlock();
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(tcp_peer_is_proven);
+
+void tcp_fetch_timewait_stamp(struct sock *sk, struct dst_entry *dst)
+{
+ struct tcp_metrics_block *tm;
+
+ rcu_read_lock();
+ tm = tcp_get_metrics(sk, dst, true);
+ if (tm) {
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if ((u32)get_seconds() - tm->tcpm_ts_stamp <= TCP_PAWS_MSL) {
+ tp->rx_opt.ts_recent_stamp = tm->tcpm_ts_stamp;
+ tp->rx_opt.ts_recent = tm->tcpm_ts;
+ }
+ }
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(tcp_fetch_timewait_stamp);
+
+/* VJ's idea. Save last timestamp seen from this destination and hold
+ * it at least for normal timewait interval to use for duplicate
+ * segment detection in subsequent connections, before they enter
+ * synchronized state.
+ */
+bool tcp_remember_stamp(struct sock *sk)
+{
+ struct dst_entry *dst = __sk_dst_get(sk);
+ bool ret = false;
+
+ if (dst) {
+ struct tcp_metrics_block *tm;
+
+ rcu_read_lock();
+ tm = tcp_get_metrics(sk, dst, true);
+ if (tm) {
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if ((s32)(tm->tcpm_ts - tp->rx_opt.ts_recent) <= 0 ||
+ ((u32)get_seconds() - tm->tcpm_ts_stamp > TCP_PAWS_MSL &&
+ tm->tcpm_ts_stamp <= (u32)tp->rx_opt.ts_recent_stamp)) {
+ tm->tcpm_ts_stamp = (u32)tp->rx_opt.ts_recent_stamp;
+ tm->tcpm_ts = tp->rx_opt.ts_recent;
+ }
+ ret = true;
+ }
+ rcu_read_unlock();
+ }
+ return ret;
+}
+
+bool tcp_tw_remember_stamp(struct inet_timewait_sock *tw)
+{
+ struct tcp_metrics_block *tm;
+ bool ret = false;
+
+ rcu_read_lock();
+ tm = __tcp_get_metrics_tw(tw);
+ if (tm) {
+ const struct tcp_timewait_sock *tcptw;
+ struct sock *sk = (struct sock *) tw;
+
+ tcptw = tcp_twsk(sk);
+ if ((s32)(tm->tcpm_ts - tcptw->tw_ts_recent) <= 0 ||
+ ((u32)get_seconds() - tm->tcpm_ts_stamp > TCP_PAWS_MSL &&
+ tm->tcpm_ts_stamp <= (u32)tcptw->tw_ts_recent_stamp)) {
+ tm->tcpm_ts_stamp = (u32)tcptw->tw_ts_recent_stamp;
+ tm->tcpm_ts = tcptw->tw_ts_recent;
+ }
+ ret = true;
+ }
+ rcu_read_unlock();
+
+ return ret;
+}
+
+static DEFINE_SEQLOCK(fastopen_seqlock);
+
+void tcp_fastopen_cache_get(struct sock *sk, u16 *mss,
+ struct tcp_fastopen_cookie *cookie,
+ int *syn_loss, unsigned long *last_syn_loss)
+{
+ struct tcp_metrics_block *tm;
+
+ rcu_read_lock();
+ tm = tcp_get_metrics(sk, __sk_dst_get(sk), false);
+ if (tm) {
+ struct tcp_fastopen_metrics *tfom = &tm->tcpm_fastopen;
+ unsigned int seq;
+
+ do {
+ seq = read_seqbegin(&fastopen_seqlock);
+ if (tfom->mss)
+ *mss = tfom->mss;
+ *cookie = tfom->cookie;
+ if (cookie->len <= 0 && tfom->try_exp == 1)
+ cookie->exp = true;
+ *syn_loss = tfom->syn_loss;
+ *last_syn_loss = *syn_loss ? tfom->last_syn_loss : 0;
+ } while (read_seqretry(&fastopen_seqlock, seq));
+ }
+ rcu_read_unlock();
+}
+
+void tcp_fastopen_cache_set(struct sock *sk, u16 mss,
+ struct tcp_fastopen_cookie *cookie, bool syn_lost,
+ u16 try_exp)
+{
+ struct dst_entry *dst = __sk_dst_get(sk);
+ struct tcp_metrics_block *tm;
+
+ if (!dst)
+ return;
+ rcu_read_lock();
+ tm = tcp_get_metrics(sk, dst, true);
+ if (tm) {
+ struct tcp_fastopen_metrics *tfom = &tm->tcpm_fastopen;
+
+ write_seqlock_bh(&fastopen_seqlock);
+ if (mss)
+ tfom->mss = mss;
+ if (cookie && cookie->len > 0)
+ tfom->cookie = *cookie;
+ else if (try_exp > tfom->try_exp &&
+ tfom->cookie.len <= 0 && !tfom->cookie.exp)
+ tfom->try_exp = try_exp;
+ if (syn_lost) {
+ ++tfom->syn_loss;
+ tfom->last_syn_loss = jiffies;
+ } else
+ tfom->syn_loss = 0;
+ write_sequnlock_bh(&fastopen_seqlock);
+ }
+ rcu_read_unlock();
+}
+
+static struct genl_family tcp_metrics_nl_family = {
+ .id = GENL_ID_GENERATE,
+ .hdrsize = 0,
+ .name = TCP_METRICS_GENL_NAME,
+ .version = TCP_METRICS_GENL_VERSION,
+ .maxattr = TCP_METRICS_ATTR_MAX,
+ .netnsok = true,
+};
+
+static struct nla_policy tcp_metrics_nl_policy[TCP_METRICS_ATTR_MAX + 1] = {
+ [TCP_METRICS_ATTR_ADDR_IPV4] = { .type = NLA_U32, },
+ [TCP_METRICS_ATTR_ADDR_IPV6] = { .type = NLA_BINARY,
+ .len = sizeof(struct in6_addr), },
+ /* Following attributes are not received for GET/DEL,
+ * we keep them for reference
+ */
+#if 0
+ [TCP_METRICS_ATTR_AGE] = { .type = NLA_MSECS, },
+ [TCP_METRICS_ATTR_TW_TSVAL] = { .type = NLA_U32, },
+ [TCP_METRICS_ATTR_TW_TS_STAMP] = { .type = NLA_S32, },
+ [TCP_METRICS_ATTR_VALS] = { .type = NLA_NESTED, },
+ [TCP_METRICS_ATTR_FOPEN_MSS] = { .type = NLA_U16, },
+ [TCP_METRICS_ATTR_FOPEN_SYN_DROPS] = { .type = NLA_U16, },
+ [TCP_METRICS_ATTR_FOPEN_SYN_DROP_TS] = { .type = NLA_MSECS, },
+ [TCP_METRICS_ATTR_FOPEN_COOKIE] = { .type = NLA_BINARY,
+ .len = TCP_FASTOPEN_COOKIE_MAX, },
+#endif
+};
+
+/* Add attributes, caller cancels its header on failure */
+static int tcp_metrics_fill_info(struct sk_buff *msg,
+ struct tcp_metrics_block *tm)
+{
+ struct nlattr *nest;
+ int i;
+
+ switch (tm->tcpm_daddr.family) {
+ case AF_INET:
+ if (nla_put_in_addr(msg, TCP_METRICS_ATTR_ADDR_IPV4,
+ tm->tcpm_daddr.addr.a4) < 0)
+ goto nla_put_failure;
+ if (nla_put_in_addr(msg, TCP_METRICS_ATTR_SADDR_IPV4,
+ tm->tcpm_saddr.addr.a4) < 0)
+ goto nla_put_failure;
+ break;
+ case AF_INET6:
+ if (nla_put_in6_addr(msg, TCP_METRICS_ATTR_ADDR_IPV6,
+ &tm->tcpm_daddr.addr.in6) < 0)
+ goto nla_put_failure;
+ if (nla_put_in6_addr(msg, TCP_METRICS_ATTR_SADDR_IPV6,
+ &tm->tcpm_saddr.addr.in6) < 0)
+ goto nla_put_failure;
+ break;
+ default:
+ return -EAFNOSUPPORT;
+ }
+
+ if (nla_put_msecs(msg, TCP_METRICS_ATTR_AGE,
+ jiffies - tm->tcpm_stamp) < 0)
+ goto nla_put_failure;
+ if (tm->tcpm_ts_stamp) {
+ if (nla_put_s32(msg, TCP_METRICS_ATTR_TW_TS_STAMP,
+ (s32) (get_seconds() - tm->tcpm_ts_stamp)) < 0)
+ goto nla_put_failure;
+ if (nla_put_u32(msg, TCP_METRICS_ATTR_TW_TSVAL,
+ tm->tcpm_ts) < 0)
+ goto nla_put_failure;
+ }
+
+ {
+ int n = 0;
+
+ nest = nla_nest_start(msg, TCP_METRICS_ATTR_VALS);
+ if (!nest)
+ goto nla_put_failure;
+ for (i = 0; i < TCP_METRIC_MAX_KERNEL + 1; i++) {
+ u32 val = tm->tcpm_vals[i];
+
+ if (!val)
+ continue;
+ if (i == TCP_METRIC_RTT) {
+ if (nla_put_u32(msg, TCP_METRIC_RTT_US + 1,
+ val) < 0)
+ goto nla_put_failure;
+ n++;
+ val = max(val / 1000, 1U);
+ }
+ if (i == TCP_METRIC_RTTVAR) {
+ if (nla_put_u32(msg, TCP_METRIC_RTTVAR_US + 1,
+ val) < 0)
+ goto nla_put_failure;
+ n++;
+ val = max(val / 1000, 1U);
+ }
+ if (nla_put_u32(msg, i + 1, val) < 0)
+ goto nla_put_failure;
+ n++;
+ }
+ if (n)
+ nla_nest_end(msg, nest);
+ else
+ nla_nest_cancel(msg, nest);
+ }
+
+ {
+ struct tcp_fastopen_metrics tfom_copy[1], *tfom;
+ unsigned int seq;
+
+ do {
+ seq = read_seqbegin(&fastopen_seqlock);
+ tfom_copy[0] = tm->tcpm_fastopen;
+ } while (read_seqretry(&fastopen_seqlock, seq));
+
+ tfom = tfom_copy;
+ if (tfom->mss &&
+ nla_put_u16(msg, TCP_METRICS_ATTR_FOPEN_MSS,
+ tfom->mss) < 0)
+ goto nla_put_failure;
+ if (tfom->syn_loss &&
+ (nla_put_u16(msg, TCP_METRICS_ATTR_FOPEN_SYN_DROPS,
+ tfom->syn_loss) < 0 ||
+ nla_put_msecs(msg, TCP_METRICS_ATTR_FOPEN_SYN_DROP_TS,
+ jiffies - tfom->last_syn_loss) < 0))
+ goto nla_put_failure;
+ if (tfom->cookie.len > 0 &&
+ nla_put(msg, TCP_METRICS_ATTR_FOPEN_COOKIE,
+ tfom->cookie.len, tfom->cookie.val) < 0)
+ goto nla_put_failure;
+ }
+
+ return 0;
+
+nla_put_failure:
+ return -EMSGSIZE;
+}
+
+static int tcp_metrics_dump_info(struct sk_buff *skb,
+ struct netlink_callback *cb,
+ struct tcp_metrics_block *tm)
+{
+ void *hdr;
+
+ hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
+ &tcp_metrics_nl_family, NLM_F_MULTI,
+ TCP_METRICS_CMD_GET);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ if (tcp_metrics_fill_info(skb, tm) < 0)
+ goto nla_put_failure;
+
+ genlmsg_end(skb, hdr);
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(skb, hdr);
+ return -EMSGSIZE;
+}
+
+static int tcp_metrics_nl_dump(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ struct net *net = sock_net(skb->sk);
+ unsigned int max_rows = 1U << tcp_metrics_hash_log;
+ unsigned int row, s_row = cb->args[0];
+ int s_col = cb->args[1], col = s_col;
+
+ for (row = s_row; row < max_rows; row++, s_col = 0) {
+ struct tcp_metrics_block *tm;
+ struct tcpm_hash_bucket *hb = tcp_metrics_hash + row;
+
+ rcu_read_lock();
+ for (col = 0, tm = rcu_dereference(hb->chain); tm;
+ tm = rcu_dereference(tm->tcpm_next), col++) {
+ if (!net_eq(tm_net(tm), net))
+ continue;
+ if (col < s_col)
+ continue;
+ if (tcp_metrics_dump_info(skb, cb, tm) < 0) {
+ rcu_read_unlock();
+ goto done;
+ }
+ }
+ rcu_read_unlock();
+ }
+
+done:
+ cb->args[0] = row;
+ cb->args[1] = col;
+ return skb->len;
+}
+
+static int __parse_nl_addr(struct genl_info *info, struct inetpeer_addr *addr,
+ unsigned int *hash, int optional, int v4, int v6)
+{
+ struct nlattr *a;
+
+ a = info->attrs[v4];
+ if (a) {
+ addr->family = AF_INET;
+ addr->addr.a4 = nla_get_in_addr(a);
+ if (hash)
+ *hash = (__force unsigned int) addr->addr.a4;
+ return 0;
+ }
+ a = info->attrs[v6];
+ if (a) {
+ if (nla_len(a) != sizeof(struct in6_addr))
+ return -EINVAL;
+ addr->family = AF_INET6;
+ addr->addr.in6 = nla_get_in6_addr(a);
+ if (hash)
+ *hash = ipv6_addr_hash(&addr->addr.in6);
+ return 0;
+ }
+ return optional ? 1 : -EAFNOSUPPORT;
+}
+
+static int parse_nl_addr(struct genl_info *info, struct inetpeer_addr *addr,
+ unsigned int *hash, int optional)
+{
+ return __parse_nl_addr(info, addr, hash, optional,
+ TCP_METRICS_ATTR_ADDR_IPV4,
+ TCP_METRICS_ATTR_ADDR_IPV6);
+}
+
+static int parse_nl_saddr(struct genl_info *info, struct inetpeer_addr *addr)
+{
+ return __parse_nl_addr(info, addr, NULL, 0,
+ TCP_METRICS_ATTR_SADDR_IPV4,
+ TCP_METRICS_ATTR_SADDR_IPV6);
+}
+
+static int tcp_metrics_nl_cmd_get(struct sk_buff *skb, struct genl_info *info)
+{
+ struct tcp_metrics_block *tm;
+ struct inetpeer_addr saddr, daddr;
+ unsigned int hash;
+ struct sk_buff *msg;
+ struct net *net = genl_info_net(info);
+ void *reply;
+ int ret;
+ bool src = true;
+
+ ret = parse_nl_addr(info, &daddr, &hash, 0);
+ if (ret < 0)
+ return ret;
+
+ ret = parse_nl_saddr(info, &saddr);
+ if (ret < 0)
+ src = false;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ reply = genlmsg_put_reply(msg, info, &tcp_metrics_nl_family, 0,
+ info->genlhdr->cmd);
+ if (!reply)
+ goto nla_put_failure;
+
+ hash ^= net_hash_mix(net);
+ hash = hash_32(hash, tcp_metrics_hash_log);
+ ret = -ESRCH;
+ rcu_read_lock();
+ for (tm = rcu_dereference(tcp_metrics_hash[hash].chain); tm;
+ tm = rcu_dereference(tm->tcpm_next)) {
+ if (addr_same(&tm->tcpm_daddr, &daddr) &&
+ (!src || addr_same(&tm->tcpm_saddr, &saddr)) &&
+ net_eq(tm_net(tm), net)) {
+ ret = tcp_metrics_fill_info(msg, tm);
+ break;
+ }
+ }
+ rcu_read_unlock();
+ if (ret < 0)
+ goto out_free;
+
+ genlmsg_end(msg, reply);
+ return genlmsg_reply(msg, info);
+
+nla_put_failure:
+ ret = -EMSGSIZE;
+
+out_free:
+ nlmsg_free(msg);
+ return ret;
+}
+
+static void tcp_metrics_flush_all(struct net *net)
+{
+ unsigned int max_rows = 1U << tcp_metrics_hash_log;
+ struct tcpm_hash_bucket *hb = tcp_metrics_hash;
+ struct tcp_metrics_block *tm;
+ unsigned int row;
+
+ for (row = 0; row < max_rows; row++, hb++) {
+ struct tcp_metrics_block __rcu **pp;
+ spin_lock_bh(&tcp_metrics_lock);
+ pp = &hb->chain;
+ for (tm = deref_locked(*pp); tm; tm = deref_locked(*pp)) {
+ if (net_eq(tm_net(tm), net)) {
+ *pp = tm->tcpm_next;
+ kfree_rcu(tm, rcu_head);
+ } else {
+ pp = &tm->tcpm_next;
+ }
+ }
+ spin_unlock_bh(&tcp_metrics_lock);
+ }
+}
+
+static int tcp_metrics_nl_cmd_del(struct sk_buff *skb, struct genl_info *info)
+{
+ struct tcpm_hash_bucket *hb;
+ struct tcp_metrics_block *tm;
+ struct tcp_metrics_block __rcu **pp;
+ struct inetpeer_addr saddr, daddr;
+ unsigned int hash;
+ struct net *net = genl_info_net(info);
+ int ret;
+ bool src = true, found = false;
+
+ ret = parse_nl_addr(info, &daddr, &hash, 1);
+ if (ret < 0)
+ return ret;
+ if (ret > 0) {
+ tcp_metrics_flush_all(net);
+ return 0;
+ }
+ ret = parse_nl_saddr(info, &saddr);
+ if (ret < 0)
+ src = false;
+
+ hash ^= net_hash_mix(net);
+ hash = hash_32(hash, tcp_metrics_hash_log);
+ hb = tcp_metrics_hash + hash;
+ pp = &hb->chain;
+ spin_lock_bh(&tcp_metrics_lock);
+ for (tm = deref_locked(*pp); tm; tm = deref_locked(*pp)) {
+ if (addr_same(&tm->tcpm_daddr, &daddr) &&
+ (!src || addr_same(&tm->tcpm_saddr, &saddr)) &&
+ net_eq(tm_net(tm), net)) {
+ *pp = tm->tcpm_next;
+ kfree_rcu(tm, rcu_head);
+ found = true;
+ } else {
+ pp = &tm->tcpm_next;
+ }
+ }
+ spin_unlock_bh(&tcp_metrics_lock);
+ if (!found)
+ return -ESRCH;
+ return 0;
+}
+
+static const struct genl_ops tcp_metrics_nl_ops[] = {
+ {
+ .cmd = TCP_METRICS_CMD_GET,
+ .doit = tcp_metrics_nl_cmd_get,
+ .dumpit = tcp_metrics_nl_dump,
+ .policy = tcp_metrics_nl_policy,
+ },
+ {
+ .cmd = TCP_METRICS_CMD_DEL,
+ .doit = tcp_metrics_nl_cmd_del,
+ .policy = tcp_metrics_nl_policy,
+ .flags = GENL_ADMIN_PERM,
+ },
+};
+
+static unsigned int tcpmhash_entries;
+static int __init set_tcpmhash_entries(char *str)
+{
+ ssize_t ret;
+
+ if (!str)
+ return 0;
+
+ ret = kstrtouint(str, 0, &tcpmhash_entries);
+ if (ret)
+ return 0;
+
+ return 1;
+}
+__setup("tcpmhash_entries=", set_tcpmhash_entries);
+
+static int __net_init tcp_net_metrics_init(struct net *net)
+{
+ size_t size;
+ unsigned int slots;
+
+ if (!net_eq(net, &init_net))
+ return 0;
+
+ slots = tcpmhash_entries;
+ if (!slots) {
+ if (totalram_pages >= 128 * 1024)
+ slots = 16 * 1024;
+ else
+ slots = 8 * 1024;
+ }
+
+ tcp_metrics_hash_log = order_base_2(slots);
+ size = sizeof(struct tcpm_hash_bucket) << tcp_metrics_hash_log;
+
+ tcp_metrics_hash = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
+ if (!tcp_metrics_hash)
+ tcp_metrics_hash = vzalloc(size);
+
+ if (!tcp_metrics_hash)
+ return -ENOMEM;
+
+ return 0;
+}
+
+static void __net_exit tcp_net_metrics_exit(struct net *net)
+{
+ tcp_metrics_flush_all(net);
+}
+
+static __net_initdata struct pernet_operations tcp_net_metrics_ops = {
+ .init = tcp_net_metrics_init,
+ .exit = tcp_net_metrics_exit,
+};
+
+void __init tcp_metrics_init(void)
+{
+ int ret;
+
+ ret = register_pernet_subsys(&tcp_net_metrics_ops);
+ if (ret < 0)
+ panic("Could not allocate the tcp_metrics hash table\n");
+
+ ret = genl_register_family_with_ops(&tcp_metrics_nl_family,
+ tcp_metrics_nl_ops);
+ if (ret < 0)
+ panic("Could not register tcp_metrics generic netlink\n");
+}
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
new file mode 100644
index 000000000..17e7339ee
--- /dev/null
+++ b/net/ipv4/tcp_minisocks.c
@@ -0,0 +1,830 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * Implementation of the Transmission Control Protocol(TCP).
+ *
+ * Authors: Ross Biro
+ * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ * Mark Evans, <evansmp@uhura.aston.ac.uk>
+ * Corey Minyard <wf-rch!minyard@relay.EU.net>
+ * Florian La Roche, <flla@stud.uni-sb.de>
+ * Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
+ * Linus Torvalds, <torvalds@cs.helsinki.fi>
+ * Alan Cox, <gw4pts@gw4pts.ampr.org>
+ * Matthew Dillon, <dillon@apollo.west.oic.com>
+ * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
+ * Jorge Cwik, <jorge@laser.satlink.net>
+ */
+
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/sysctl.h>
+#include <linux/workqueue.h>
+#include <net/tcp.h>
+#include <net/inet_common.h>
+#include <net/xfrm.h>
+
+int sysctl_tcp_syncookies __read_mostly = 1;
+EXPORT_SYMBOL(sysctl_tcp_syncookies);
+
+int sysctl_tcp_abort_on_overflow __read_mostly;
+
+struct inet_timewait_death_row tcp_death_row = {
+ .sysctl_max_tw_buckets = NR_FILE * 2,
+ .hashinfo = &tcp_hashinfo,
+};
+EXPORT_SYMBOL_GPL(tcp_death_row);
+
+static bool tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
+{
+ if (seq == s_win)
+ return true;
+ if (after(end_seq, s_win) && before(seq, e_win))
+ return true;
+ return seq == e_win && seq == end_seq;
+}
+
+static enum tcp_tw_status
+tcp_timewait_check_oow_rate_limit(struct inet_timewait_sock *tw,
+ const struct sk_buff *skb, int mib_idx)
+{
+ struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
+
+ if (!tcp_oow_rate_limited(twsk_net(tw), skb, mib_idx,
+ &tcptw->tw_last_oow_ack_time)) {
+ /* Send ACK. Note, we do not put the bucket,
+ * it will be released by caller.
+ */
+ return TCP_TW_ACK;
+ }
+
+ /* We are rate-limiting, so just release the tw sock and drop skb. */
+ inet_twsk_put(tw);
+ return TCP_TW_SUCCESS;
+}
+
+/*
+ * * Main purpose of TIME-WAIT state is to close connection gracefully,
+ * when one of ends sits in LAST-ACK or CLOSING retransmitting FIN
+ * (and, probably, tail of data) and one or more our ACKs are lost.
+ * * What is TIME-WAIT timeout? It is associated with maximal packet
+ * lifetime in the internet, which results in wrong conclusion, that
+ * it is set to catch "old duplicate segments" wandering out of their path.
+ * It is not quite correct. This timeout is calculated so that it exceeds
+ * maximal retransmission timeout enough to allow to lose one (or more)
+ * segments sent by peer and our ACKs. This time may be calculated from RTO.
+ * * When TIME-WAIT socket receives RST, it means that another end
+ * finally closed and we are allowed to kill TIME-WAIT too.
+ * * Second purpose of TIME-WAIT is catching old duplicate segments.
+ * Well, certainly it is pure paranoia, but if we load TIME-WAIT
+ * with this semantics, we MUST NOT kill TIME-WAIT state with RSTs.
+ * * If we invented some more clever way to catch duplicates
+ * (f.e. based on PAWS), we could truncate TIME-WAIT to several RTOs.
+ *
+ * The algorithm below is based on FORMAL INTERPRETATION of RFCs.
+ * When you compare it to RFCs, please, read section SEGMENT ARRIVES
+ * from the very beginning.
+ *
+ * NOTE. With recycling (and later with fin-wait-2) TW bucket
+ * is _not_ stateless. It means, that strictly speaking we must
+ * spinlock it. I do not want! Well, probability of misbehaviour
+ * is ridiculously low and, seems, we could use some mb() tricks
+ * to avoid misread sequence numbers, states etc. --ANK
+ *
+ * We don't need to initialize tmp_out.sack_ok as we don't use the results
+ */
+enum tcp_tw_status
+tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
+ const struct tcphdr *th)
+{
+ struct tcp_options_received tmp_opt;
+ struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
+ bool paws_reject = false;
+
+ tmp_opt.saw_tstamp = 0;
+ if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) {
+ tcp_parse_options(skb, &tmp_opt, 0, NULL);
+
+ if (tmp_opt.saw_tstamp) {
+ tmp_opt.rcv_tsecr -= tcptw->tw_ts_offset;
+ tmp_opt.ts_recent = tcptw->tw_ts_recent;
+ tmp_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
+ paws_reject = tcp_paws_reject(&tmp_opt, th->rst);
+ }
+ }
+
+ if (tw->tw_substate == TCP_FIN_WAIT2) {
+ /* Just repeat all the checks of tcp_rcv_state_process() */
+
+ /* Out of window, send ACK */
+ if (paws_reject ||
+ !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
+ tcptw->tw_rcv_nxt,
+ tcptw->tw_rcv_nxt + tcptw->tw_rcv_wnd))
+ return tcp_timewait_check_oow_rate_limit(
+ tw, skb, LINUX_MIB_TCPACKSKIPPEDFINWAIT2);
+
+ if (th->rst)
+ goto kill;
+
+ if (th->syn && !before(TCP_SKB_CB(skb)->seq, tcptw->tw_rcv_nxt))
+ goto kill_with_rst;
+
+ /* Dup ACK? */
+ if (!th->ack ||
+ !after(TCP_SKB_CB(skb)->end_seq, tcptw->tw_rcv_nxt) ||
+ TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) {
+ inet_twsk_put(tw);
+ return TCP_TW_SUCCESS;
+ }
+
+ /* New data or FIN. If new data arrive after half-duplex close,
+ * reset.
+ */
+ if (!th->fin ||
+ TCP_SKB_CB(skb)->end_seq != tcptw->tw_rcv_nxt + 1) {
+kill_with_rst:
+ inet_twsk_deschedule(tw);
+ inet_twsk_put(tw);
+ return TCP_TW_RST;
+ }
+
+ /* FIN arrived, enter true time-wait state. */
+ tw->tw_substate = TCP_TIME_WAIT;
+ tcptw->tw_rcv_nxt = TCP_SKB_CB(skb)->end_seq;
+ if (tmp_opt.saw_tstamp) {
+ tcptw->tw_ts_recent_stamp = get_seconds();
+ tcptw->tw_ts_recent = tmp_opt.rcv_tsval;
+ }
+
+ if (tcp_death_row.sysctl_tw_recycle &&
+ tcptw->tw_ts_recent_stamp &&
+ tcp_tw_remember_stamp(tw))
+ inet_twsk_schedule(tw, tw->tw_timeout);
+ else
+ inet_twsk_schedule(tw, TCP_TIMEWAIT_LEN);
+ return TCP_TW_ACK;
+ }
+
+ /*
+ * Now real TIME-WAIT state.
+ *
+ * RFC 1122:
+ * "When a connection is [...] on TIME-WAIT state [...]
+ * [a TCP] MAY accept a new SYN from the remote TCP to
+ * reopen the connection directly, if it:
+ *
+ * (1) assigns its initial sequence number for the new
+ * connection to be larger than the largest sequence
+ * number it used on the previous connection incarnation,
+ * and
+ *
+ * (2) returns to TIME-WAIT state if the SYN turns out
+ * to be an old duplicate".
+ */
+
+ if (!paws_reject &&
+ (TCP_SKB_CB(skb)->seq == tcptw->tw_rcv_nxt &&
+ (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq || th->rst))) {
+ /* In window segment, it may be only reset or bare ack. */
+
+ if (th->rst) {
+ /* This is TIME_WAIT assassination, in two flavors.
+ * Oh well... nobody has a sufficient solution to this
+ * protocol bug yet.
+ */
+ if (sysctl_tcp_rfc1337 == 0) {
+kill:
+ inet_twsk_deschedule(tw);
+ inet_twsk_put(tw);
+ return TCP_TW_SUCCESS;
+ }
+ }
+ inet_twsk_schedule(tw, TCP_TIMEWAIT_LEN);
+
+ if (tmp_opt.saw_tstamp) {
+ tcptw->tw_ts_recent = tmp_opt.rcv_tsval;
+ tcptw->tw_ts_recent_stamp = get_seconds();
+ }
+
+ inet_twsk_put(tw);
+ return TCP_TW_SUCCESS;
+ }
+
+ /* Out of window segment.
+
+ All the segments are ACKed immediately.
+
+ The only exception is new SYN. We accept it, if it is
+ not old duplicate and we are not in danger to be killed
+ by delayed old duplicates. RFC check is that it has
+ newer sequence number works at rates <40Mbit/sec.
+ However, if paws works, it is reliable AND even more,
+ we even may relax silly seq space cutoff.
+
+ RED-PEN: we violate main RFC requirement, if this SYN will appear
+ old duplicate (i.e. we receive RST in reply to SYN-ACK),
+ we must return socket to time-wait state. It is not good,
+ but not fatal yet.
+ */
+
+ if (th->syn && !th->rst && !th->ack && !paws_reject &&
+ (after(TCP_SKB_CB(skb)->seq, tcptw->tw_rcv_nxt) ||
+ (tmp_opt.saw_tstamp &&
+ (s32)(tcptw->tw_ts_recent - tmp_opt.rcv_tsval) < 0))) {
+ u32 isn = tcptw->tw_snd_nxt + 65535 + 2;
+ if (isn == 0)
+ isn++;
+ TCP_SKB_CB(skb)->tcp_tw_isn = isn;
+ return TCP_TW_SYN;
+ }
+
+ if (paws_reject)
+ NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_PAWSESTABREJECTED);
+
+ if (!th->rst) {
+ /* In this case we must reset the TIMEWAIT timer.
+ *
+ * If it is ACKless SYN it may be both old duplicate
+ * and new good SYN with random sequence number <rcv_nxt.
+ * Do not reschedule in the last case.
+ */
+ if (paws_reject || th->ack)
+ inet_twsk_schedule(tw, TCP_TIMEWAIT_LEN);
+
+ return tcp_timewait_check_oow_rate_limit(
+ tw, skb, LINUX_MIB_TCPACKSKIPPEDTIMEWAIT);
+ }
+ inet_twsk_put(tw);
+ return TCP_TW_SUCCESS;
+}
+EXPORT_SYMBOL(tcp_timewait_state_process);
+
+/*
+ * Move a socket to time-wait or dead fin-wait-2 state.
+ */
+void tcp_time_wait(struct sock *sk, int state, int timeo)
+{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+ const struct tcp_sock *tp = tcp_sk(sk);
+ struct inet_timewait_sock *tw;
+ bool recycle_ok = false;
+
+ if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp)
+ recycle_ok = tcp_remember_stamp(sk);
+
+ tw = inet_twsk_alloc(sk, &tcp_death_row, state);
+
+ if (tw) {
+ struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
+ const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1);
+ struct inet_sock *inet = inet_sk(sk);
+
+ tw->tw_transparent = inet->transparent;
+ tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale;
+ tcptw->tw_rcv_nxt = tp->rcv_nxt;
+ tcptw->tw_snd_nxt = tp->snd_nxt;
+ tcptw->tw_rcv_wnd = tcp_receive_window(tp);
+ tcptw->tw_ts_recent = tp->rx_opt.ts_recent;
+ tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp;
+ tcptw->tw_ts_offset = tp->tsoffset;
+ tcptw->tw_last_oow_ack_time = 0;
+
+#if IS_ENABLED(CONFIG_IPV6)
+ if (tw->tw_family == PF_INET6) {
+ struct ipv6_pinfo *np = inet6_sk(sk);
+
+ tw->tw_v6_daddr = sk->sk_v6_daddr;
+ tw->tw_v6_rcv_saddr = sk->sk_v6_rcv_saddr;
+ tw->tw_tclass = np->tclass;
+ tw->tw_flowlabel = be32_to_cpu(np->flow_label & IPV6_FLOWLABEL_MASK);
+ tw->tw_ipv6only = sk->sk_ipv6only;
+ }
+#endif
+
+#ifdef CONFIG_TCP_MD5SIG
+ /*
+ * The timewait bucket does not have the key DB from the
+ * sock structure. We just make a quick copy of the
+ * md5 key being used (if indeed we are using one)
+ * so the timewait ack generating code has the key.
+ */
+ do {
+ struct tcp_md5sig_key *key;
+ tcptw->tw_md5_key = NULL;
+ key = tp->af_specific->md5_lookup(sk, sk);
+ if (key) {
+ tcptw->tw_md5_key = kmemdup(key, sizeof(*key), GFP_ATOMIC);
+ if (tcptw->tw_md5_key && !tcp_alloc_md5sig_pool())
+ BUG();
+ }
+ } while (0);
+#endif
+
+ /* Linkage updates. */
+ __inet_twsk_hashdance(tw, sk, &tcp_hashinfo);
+
+ /* Get the TIME_WAIT timeout firing. */
+ if (timeo < rto)
+ timeo = rto;
+
+ if (recycle_ok) {
+ tw->tw_timeout = rto;
+ } else {
+ tw->tw_timeout = TCP_TIMEWAIT_LEN;
+ if (state == TCP_TIME_WAIT)
+ timeo = TCP_TIMEWAIT_LEN;
+ }
+
+ inet_twsk_schedule(tw, timeo);
+ inet_twsk_put(tw);
+ } else {
+ /* Sorry, if we're out of memory, just CLOSE this
+ * socket up. We've got bigger problems than
+ * non-graceful socket closings.
+ */
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPTIMEWAITOVERFLOW);
+ }
+
+ tcp_update_metrics(sk);
+ tcp_done(sk);
+}
+
+void tcp_twsk_destructor(struct sock *sk)
+{
+#ifdef CONFIG_TCP_MD5SIG
+ struct tcp_timewait_sock *twsk = tcp_twsk(sk);
+
+ if (twsk->tw_md5_key)
+ kfree_rcu(twsk->tw_md5_key, rcu);
+#endif
+}
+EXPORT_SYMBOL_GPL(tcp_twsk_destructor);
+
+void tcp_openreq_init_rwin(struct request_sock *req,
+ struct sock *sk, struct dst_entry *dst)
+{
+ struct inet_request_sock *ireq = inet_rsk(req);
+ struct tcp_sock *tp = tcp_sk(sk);
+ __u8 rcv_wscale;
+ int mss = dst_metric_advmss(dst);
+
+ if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss)
+ mss = tp->rx_opt.user_mss;
+
+ /* Set this up on the first call only */
+ req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW);
+
+ /* limit the window selection if the user enforce a smaller rx buffer */
+ if (sk->sk_userlocks & SOCK_RCVBUF_LOCK &&
+ (req->window_clamp > tcp_full_space(sk) || req->window_clamp == 0))
+ req->window_clamp = tcp_full_space(sk);
+
+ /* tcp_full_space because it is guaranteed to be the first packet */
+ tcp_select_initial_window(tcp_full_space(sk),
+ mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
+ &req->rcv_wnd,
+ &req->window_clamp,
+ ireq->wscale_ok,
+ &rcv_wscale,
+ dst_metric(dst, RTAX_INITRWND));
+ ireq->rcv_wscale = rcv_wscale;
+}
+EXPORT_SYMBOL(tcp_openreq_init_rwin);
+
+static void tcp_ecn_openreq_child(struct tcp_sock *tp,
+ const struct request_sock *req)
+{
+ tp->ecn_flags = inet_rsk(req)->ecn_ok ? TCP_ECN_OK : 0;
+}
+
+void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ u32 ca_key = dst_metric(dst, RTAX_CC_ALGO);
+ bool ca_got_dst = false;
+
+ if (ca_key != TCP_CA_UNSPEC) {
+ const struct tcp_congestion_ops *ca;
+
+ rcu_read_lock();
+ ca = tcp_ca_find_key(ca_key);
+ if (likely(ca && try_module_get(ca->owner))) {
+ icsk->icsk_ca_dst_locked = tcp_ca_dst_locked(dst);
+ icsk->icsk_ca_ops = ca;
+ ca_got_dst = true;
+ }
+ rcu_read_unlock();
+ }
+
+ /* If no valid choice made yet, assign current system default ca. */
+ if (!ca_got_dst &&
+ (!icsk->icsk_ca_setsockopt ||
+ !try_module_get(icsk->icsk_ca_ops->owner)))
+ tcp_assign_congestion_control(sk);
+
+ tcp_set_ca_state(sk, TCP_CA_Open);
+}
+EXPORT_SYMBOL_GPL(tcp_ca_openreq_child);
+
+/* This is not only more efficient than what we used to do, it eliminates
+ * a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM
+ *
+ * Actually, we could lots of memory writes here. tp of listening
+ * socket contains all necessary default parameters.
+ */
+struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, struct sk_buff *skb)
+{
+ struct sock *newsk = inet_csk_clone_lock(sk, req, GFP_ATOMIC);
+
+ if (newsk) {
+ const struct inet_request_sock *ireq = inet_rsk(req);
+ struct tcp_request_sock *treq = tcp_rsk(req);
+ struct inet_connection_sock *newicsk = inet_csk(newsk);
+ struct tcp_sock *newtp = tcp_sk(newsk);
+
+ /* Now setup tcp_sock */
+ newtp->pred_flags = 0;
+
+ newtp->rcv_wup = newtp->copied_seq =
+ newtp->rcv_nxt = treq->rcv_isn + 1;
+
+ newtp->snd_sml = newtp->snd_una =
+ newtp->snd_nxt = newtp->snd_up = treq->snt_isn + 1;
+
+ tcp_prequeue_init(newtp);
+ INIT_LIST_HEAD(&newtp->tsq_node);
+
+ tcp_init_wl(newtp, treq->rcv_isn);
+
+ newtp->srtt_us = 0;
+ newtp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
+ newicsk->icsk_rto = TCP_TIMEOUT_INIT;
+
+ newtp->packets_out = 0;
+ newtp->retrans_out = 0;
+ newtp->sacked_out = 0;
+ newtp->fackets_out = 0;
+ newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
+ tcp_enable_early_retrans(newtp);
+ newtp->tlp_high_seq = 0;
+ newtp->lsndtime = treq->snt_synack;
+ newtp->last_oow_ack_time = 0;
+ newtp->total_retrans = req->num_retrans;
+
+ /* So many TCP implementations out there (incorrectly) count the
+ * initial SYN frame in their delayed-ACK and congestion control
+ * algorithms that we must have the following bandaid to talk
+ * efficiently to them. -DaveM
+ */
+ newtp->snd_cwnd = TCP_INIT_CWND;
+ newtp->snd_cwnd_cnt = 0;
+
+ tcp_init_xmit_timers(newsk);
+ __skb_queue_head_init(&newtp->out_of_order_queue);
+ newtp->write_seq = newtp->pushed_seq = treq->snt_isn + 1;
+
+ newtp->rx_opt.saw_tstamp = 0;
+
+ newtp->rx_opt.dsack = 0;
+ newtp->rx_opt.num_sacks = 0;
+
+ newtp->urg_data = 0;
+
+ if (sock_flag(newsk, SOCK_KEEPOPEN))
+ inet_csk_reset_keepalive_timer(newsk,
+ keepalive_time_when(newtp));
+
+ newtp->rx_opt.tstamp_ok = ireq->tstamp_ok;
+ if ((newtp->rx_opt.sack_ok = ireq->sack_ok) != 0) {
+ if (sysctl_tcp_fack)
+ tcp_enable_fack(newtp);
+ }
+ newtp->window_clamp = req->window_clamp;
+ newtp->rcv_ssthresh = req->rcv_wnd;
+ newtp->rcv_wnd = req->rcv_wnd;
+ newtp->rx_opt.wscale_ok = ireq->wscale_ok;
+ if (newtp->rx_opt.wscale_ok) {
+ newtp->rx_opt.snd_wscale = ireq->snd_wscale;
+ newtp->rx_opt.rcv_wscale = ireq->rcv_wscale;
+ } else {
+ newtp->rx_opt.snd_wscale = newtp->rx_opt.rcv_wscale = 0;
+ newtp->window_clamp = min(newtp->window_clamp, 65535U);
+ }
+ newtp->snd_wnd = (ntohs(tcp_hdr(skb)->window) <<
+ newtp->rx_opt.snd_wscale);
+ newtp->max_window = newtp->snd_wnd;
+
+ if (newtp->rx_opt.tstamp_ok) {
+ newtp->rx_opt.ts_recent = req->ts_recent;
+ newtp->rx_opt.ts_recent_stamp = get_seconds();
+ newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
+ } else {
+ newtp->rx_opt.ts_recent_stamp = 0;
+ newtp->tcp_header_len = sizeof(struct tcphdr);
+ }
+ newtp->tsoffset = 0;
+#ifdef CONFIG_TCP_MD5SIG
+ newtp->md5sig_info = NULL; /*XXX*/
+ if (newtp->af_specific->md5_lookup(sk, newsk))
+ newtp->tcp_header_len += TCPOLEN_MD5SIG_ALIGNED;
+#endif
+ if (skb->len >= TCP_MSS_DEFAULT + newtp->tcp_header_len)
+ newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len;
+ newtp->rx_opt.mss_clamp = req->mss;
+ tcp_ecn_openreq_child(newtp, req);
+ newtp->fastopen_rsk = NULL;
+ newtp->syn_data_acked = 0;
+
+ TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_PASSIVEOPENS);
+ }
+ return newsk;
+}
+EXPORT_SYMBOL(tcp_create_openreq_child);
+
+/*
+ * Process an incoming packet for SYN_RECV sockets represented as a
+ * request_sock. Normally sk is the listener socket but for TFO it
+ * points to the child socket.
+ *
+ * XXX (TFO) - The current impl contains a special check for ack
+ * validation and inside tcp_v4_reqsk_send_ack(). Can we do better?
+ *
+ * We don't need to initialize tmp_opt.sack_ok as we don't use the results
+ */
+
+struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
+ struct request_sock *req,
+ bool fastopen)
+{
+ struct tcp_options_received tmp_opt;
+ struct sock *child;
+ const struct tcphdr *th = tcp_hdr(skb);
+ __be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK);
+ bool paws_reject = false;
+
+ BUG_ON(fastopen == (sk->sk_state == TCP_LISTEN));
+
+ tmp_opt.saw_tstamp = 0;
+ if (th->doff > (sizeof(struct tcphdr)>>2)) {
+ tcp_parse_options(skb, &tmp_opt, 0, NULL);
+
+ if (tmp_opt.saw_tstamp) {
+ tmp_opt.ts_recent = req->ts_recent;
+ /* We do not store true stamp, but it is not required,
+ * it can be estimated (approximately)
+ * from another data.
+ */
+ tmp_opt.ts_recent_stamp = get_seconds() - ((TCP_TIMEOUT_INIT/HZ)<<req->num_timeout);
+ paws_reject = tcp_paws_reject(&tmp_opt, th->rst);
+ }
+ }
+
+ /* Check for pure retransmitted SYN. */
+ if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn &&
+ flg == TCP_FLAG_SYN &&
+ !paws_reject) {
+ /*
+ * RFC793 draws (Incorrectly! It was fixed in RFC1122)
+ * this case on figure 6 and figure 8, but formal
+ * protocol description says NOTHING.
+ * To be more exact, it says that we should send ACK,
+ * because this segment (at least, if it has no data)
+ * is out of window.
+ *
+ * CONCLUSION: RFC793 (even with RFC1122) DOES NOT
+ * describe SYN-RECV state. All the description
+ * is wrong, we cannot believe to it and should
+ * rely only on common sense and implementation
+ * experience.
+ *
+ * Enforce "SYN-ACK" according to figure 8, figure 6
+ * of RFC793, fixed by RFC1122.
+ *
+ * Note that even if there is new data in the SYN packet
+ * they will be thrown away too.
+ *
+ * Reset timer after retransmitting SYNACK, similar to
+ * the idea of fast retransmit in recovery.
+ */
+ if (!tcp_oow_rate_limited(sock_net(sk), skb,
+ LINUX_MIB_TCPACKSKIPPEDSYNRECV,
+ &tcp_rsk(req)->last_oow_ack_time) &&
+
+ !inet_rtx_syn_ack(sk, req)) {
+ unsigned long expires = jiffies;
+
+ expires += min(TCP_TIMEOUT_INIT << req->num_timeout,
+ TCP_RTO_MAX);
+ if (!fastopen)
+ mod_timer_pending(&req->rsk_timer, expires);
+ else
+ req->rsk_timer.expires = expires;
+ }
+ return NULL;
+ }
+
+ /* Further reproduces section "SEGMENT ARRIVES"
+ for state SYN-RECEIVED of RFC793.
+ It is broken, however, it does not work only
+ when SYNs are crossed.
+
+ You would think that SYN crossing is impossible here, since
+ we should have a SYN_SENT socket (from connect()) on our end,
+ but this is not true if the crossed SYNs were sent to both
+ ends by a malicious third party. We must defend against this,
+ and to do that we first verify the ACK (as per RFC793, page
+ 36) and reset if it is invalid. Is this a true full defense?
+ To convince ourselves, let us consider a way in which the ACK
+ test can still pass in this 'malicious crossed SYNs' case.
+ Malicious sender sends identical SYNs (and thus identical sequence
+ numbers) to both A and B:
+
+ A: gets SYN, seq=7
+ B: gets SYN, seq=7
+
+ By our good fortune, both A and B select the same initial
+ send sequence number of seven :-)
+
+ A: sends SYN|ACK, seq=7, ack_seq=8
+ B: sends SYN|ACK, seq=7, ack_seq=8
+
+ So we are now A eating this SYN|ACK, ACK test passes. So
+ does sequence test, SYN is truncated, and thus we consider
+ it a bare ACK.
+
+ If icsk->icsk_accept_queue.rskq_defer_accept, we silently drop this
+ bare ACK. Otherwise, we create an established connection. Both
+ ends (listening sockets) accept the new incoming connection and try
+ to talk to each other. 8-)
+
+ Note: This case is both harmless, and rare. Possibility is about the
+ same as us discovering intelligent life on another plant tomorrow.
+
+ But generally, we should (RFC lies!) to accept ACK
+ from SYNACK both here and in tcp_rcv_state_process().
+ tcp_rcv_state_process() does not, hence, we do not too.
+
+ Note that the case is absolutely generic:
+ we cannot optimize anything here without
+ violating protocol. All the checks must be made
+ before attempt to create socket.
+ */
+
+ /* RFC793 page 36: "If the connection is in any non-synchronized state ...
+ * and the incoming segment acknowledges something not yet
+ * sent (the segment carries an unacceptable ACK) ...
+ * a reset is sent."
+ *
+ * Invalid ACK: reset will be sent by listening socket.
+ * Note that the ACK validity check for a Fast Open socket is done
+ * elsewhere and is checked directly against the child socket rather
+ * than req because user data may have been sent out.
+ */
+ if ((flg & TCP_FLAG_ACK) && !fastopen &&
+ (TCP_SKB_CB(skb)->ack_seq !=
+ tcp_rsk(req)->snt_isn + 1))
+ return sk;
+
+ /* Also, it would be not so bad idea to check rcv_tsecr, which
+ * is essentially ACK extension and too early or too late values
+ * should cause reset in unsynchronized states.
+ */
+
+ /* RFC793: "first check sequence number". */
+
+ if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
+ tcp_rsk(req)->rcv_nxt, tcp_rsk(req)->rcv_nxt + req->rcv_wnd)) {
+ /* Out of window: send ACK and drop. */
+ if (!(flg & TCP_FLAG_RST))
+ req->rsk_ops->send_ack(sk, skb, req);
+ if (paws_reject)
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
+ return NULL;
+ }
+
+ /* In sequence, PAWS is OK. */
+
+ if (tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_nxt))
+ req->ts_recent = tmp_opt.rcv_tsval;
+
+ if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn) {
+ /* Truncate SYN, it is out of window starting
+ at tcp_rsk(req)->rcv_isn + 1. */
+ flg &= ~TCP_FLAG_SYN;
+ }
+
+ /* RFC793: "second check the RST bit" and
+ * "fourth, check the SYN bit"
+ */
+ if (flg & (TCP_FLAG_RST|TCP_FLAG_SYN)) {
+ TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
+ goto embryonic_reset;
+ }
+
+ /* ACK sequence verified above, just make sure ACK is
+ * set. If ACK not set, just silently drop the packet.
+ *
+ * XXX (TFO) - if we ever allow "data after SYN", the
+ * following check needs to be removed.
+ */
+ if (!(flg & TCP_FLAG_ACK))
+ return NULL;
+
+ /* For Fast Open no more processing is needed (sk is the
+ * child socket).
+ */
+ if (fastopen)
+ return sk;
+
+ /* While TCP_DEFER_ACCEPT is active, drop bare ACK. */
+ if (req->num_timeout < inet_csk(sk)->icsk_accept_queue.rskq_defer_accept &&
+ TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) {
+ inet_rsk(req)->acked = 1;
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDEFERACCEPTDROP);
+ return NULL;
+ }
+
+ /* OK, ACK is valid, create big socket and
+ * feed this segment to it. It will repeat all
+ * the tests. THIS SEGMENT MUST MOVE SOCKET TO
+ * ESTABLISHED STATE. If it will be dropped after
+ * socket is created, wait for troubles.
+ */
+ child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL);
+ if (!child)
+ goto listen_overflow;
+
+ inet_csk_reqsk_queue_drop(sk, req);
+ inet_csk_reqsk_queue_add(sk, req, child);
+ /* Warning: caller must not call reqsk_put(req);
+ * child stole last reference on it.
+ */
+ return child;
+
+listen_overflow:
+ if (!sysctl_tcp_abort_on_overflow) {
+ inet_rsk(req)->acked = 1;
+ return NULL;
+ }
+
+embryonic_reset:
+ if (!(flg & TCP_FLAG_RST)) {
+ /* Received a bad SYN pkt - for TFO We try not to reset
+ * the local connection unless it's really necessary to
+ * avoid becoming vulnerable to outside attack aiming at
+ * resetting legit local connections.
+ */
+ req->rsk_ops->send_reset(sk, skb);
+ } else if (fastopen) { /* received a valid RST pkt */
+ reqsk_fastopen_remove(sk, req, true);
+ tcp_reset(sk);
+ }
+ if (!fastopen) {
+ inet_csk_reqsk_queue_drop(sk, req);
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_EMBRYONICRSTS);
+ }
+ return NULL;
+}
+EXPORT_SYMBOL(tcp_check_req);
+
+/*
+ * Queue segment on the new socket if the new socket is active,
+ * otherwise we just shortcircuit this and continue with
+ * the new socket.
+ *
+ * For the vast majority of cases child->sk_state will be TCP_SYN_RECV
+ * when entering. But other states are possible due to a race condition
+ * where after __inet_lookup_established() fails but before the listener
+ * locked is obtained, other packets cause the same connection to
+ * be created.
+ */
+
+int tcp_child_process(struct sock *parent, struct sock *child,
+ struct sk_buff *skb)
+{
+ int ret = 0;
+ int state = child->sk_state;
+
+ if (!sock_owned_by_user(child)) {
+ ret = tcp_rcv_state_process(child, skb, tcp_hdr(skb),
+ skb->len);
+ /* Wakeup parent, send SIGIO */
+ if (state == TCP_SYN_RECV && child->sk_state != state)
+ parent->sk_data_ready(parent);
+ } else {
+ /* Alas, it is possible again, because we do lookup
+ * in main socket hash table and lock on listening
+ * socket does not protect us more.
+ */
+ __sk_add_backlog(child, skb);
+ }
+
+ bh_unlock_sock(child);
+ sock_put(child);
+ return ret;
+}
+EXPORT_SYMBOL(tcp_child_process);
diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
new file mode 100644
index 000000000..3f7c2fca5
--- /dev/null
+++ b/net/ipv4/tcp_offload.c
@@ -0,0 +1,327 @@
+/*
+ * IPV4 GSO/GRO offload support
+ * Linux INET implementation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * TCPv4 GSO/GRO support
+ */
+
+#include <linux/skbuff.h>
+#include <net/tcp.h>
+#include <net/protocol.h>
+
+static void tcp_gso_tstamp(struct sk_buff *skb, unsigned int ts_seq,
+ unsigned int seq, unsigned int mss)
+{
+ while (skb) {
+ if (before(ts_seq, seq + mss)) {
+ skb_shinfo(skb)->tx_flags |= SKBTX_SW_TSTAMP;
+ skb_shinfo(skb)->tskey = ts_seq;
+ return;
+ }
+
+ skb = skb->next;
+ seq += mss;
+ }
+}
+
+static struct sk_buff *tcp4_gso_segment(struct sk_buff *skb,
+ netdev_features_t features)
+{
+ if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
+ return ERR_PTR(-EINVAL);
+
+ if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
+ const struct iphdr *iph = ip_hdr(skb);
+ struct tcphdr *th = tcp_hdr(skb);
+
+ /* Set up checksum pseudo header, usually expect stack to
+ * have done this already.
+ */
+
+ th->check = 0;
+ skb->ip_summed = CHECKSUM_PARTIAL;
+ __tcp_v4_send_check(skb, iph->saddr, iph->daddr);
+ }
+
+ return tcp_gso_segment(skb, features);
+}
+
+struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
+ netdev_features_t features)
+{
+ struct sk_buff *segs = ERR_PTR(-EINVAL);
+ unsigned int sum_truesize = 0;
+ struct tcphdr *th;
+ unsigned int thlen;
+ unsigned int seq;
+ __be32 delta;
+ unsigned int oldlen;
+ unsigned int mss;
+ struct sk_buff *gso_skb = skb;
+ __sum16 newcheck;
+ bool ooo_okay, copy_destructor;
+
+ th = tcp_hdr(skb);
+ thlen = th->doff * 4;
+ if (thlen < sizeof(*th))
+ goto out;
+
+ if (!pskb_may_pull(skb, thlen))
+ goto out;
+
+ oldlen = (u16)~skb->len;
+ __skb_pull(skb, thlen);
+
+ mss = tcp_skb_mss(skb);
+ if (unlikely(skb->len <= mss))
+ goto out;
+
+ if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) {
+ /* Packet is from an untrusted source, reset gso_segs. */
+ int type = skb_shinfo(skb)->gso_type;
+
+ if (unlikely(type &
+ ~(SKB_GSO_TCPV4 |
+ SKB_GSO_DODGY |
+ SKB_GSO_TCP_ECN |
+ SKB_GSO_TCPV6 |
+ SKB_GSO_GRE |
+ SKB_GSO_GRE_CSUM |
+ SKB_GSO_IPIP |
+ SKB_GSO_SIT |
+ SKB_GSO_UDP_TUNNEL |
+ SKB_GSO_UDP_TUNNEL_CSUM |
+ SKB_GSO_TUNNEL_REMCSUM |
+ 0) ||
+ !(type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))))
+ goto out;
+
+ skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss);
+
+ segs = NULL;
+ goto out;
+ }
+
+ copy_destructor = gso_skb->destructor == tcp_wfree;
+ ooo_okay = gso_skb->ooo_okay;
+ /* All segments but the first should have ooo_okay cleared */
+ skb->ooo_okay = 0;
+
+ segs = skb_segment(skb, features);
+ if (IS_ERR(segs))
+ goto out;
+
+ /* Only first segment might have ooo_okay set */
+ segs->ooo_okay = ooo_okay;
+
+ delta = htonl(oldlen + (thlen + mss));
+
+ skb = segs;
+ th = tcp_hdr(skb);
+ seq = ntohl(th->seq);
+
+ if (unlikely(skb_shinfo(gso_skb)->tx_flags & SKBTX_SW_TSTAMP))
+ tcp_gso_tstamp(segs, skb_shinfo(gso_skb)->tskey, seq, mss);
+
+ newcheck = ~csum_fold((__force __wsum)((__force u32)th->check +
+ (__force u32)delta));
+
+ do {
+ th->fin = th->psh = 0;
+ th->check = newcheck;
+
+ if (skb->ip_summed != CHECKSUM_PARTIAL)
+ th->check = gso_make_checksum(skb, ~th->check);
+
+ seq += mss;
+ if (copy_destructor) {
+ skb->destructor = gso_skb->destructor;
+ skb->sk = gso_skb->sk;
+ sum_truesize += skb->truesize;
+ }
+ skb = skb->next;
+ th = tcp_hdr(skb);
+
+ th->seq = htonl(seq);
+ th->cwr = 0;
+ } while (skb->next);
+
+ /* Following permits TCP Small Queues to work well with GSO :
+ * The callback to TCP stack will be called at the time last frag
+ * is freed at TX completion, and not right now when gso_skb
+ * is freed by GSO engine
+ */
+ if (copy_destructor) {
+ swap(gso_skb->sk, skb->sk);
+ swap(gso_skb->destructor, skb->destructor);
+ sum_truesize += skb->truesize;
+ atomic_add(sum_truesize - gso_skb->truesize,
+ &skb->sk->sk_wmem_alloc);
+ }
+
+ delta = htonl(oldlen + (skb_tail_pointer(skb) -
+ skb_transport_header(skb)) +
+ skb->data_len);
+ th->check = ~csum_fold((__force __wsum)((__force u32)th->check +
+ (__force u32)delta));
+ if (skb->ip_summed != CHECKSUM_PARTIAL)
+ th->check = gso_make_checksum(skb, ~th->check);
+out:
+ return segs;
+}
+
+struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb)
+{
+ struct sk_buff **pp = NULL;
+ struct sk_buff *p;
+ struct tcphdr *th;
+ struct tcphdr *th2;
+ unsigned int len;
+ unsigned int thlen;
+ __be32 flags;
+ unsigned int mss = 1;
+ unsigned int hlen;
+ unsigned int off;
+ int flush = 1;
+ int i;
+
+ off = skb_gro_offset(skb);
+ hlen = off + sizeof(*th);
+ th = skb_gro_header_fast(skb, off);
+ if (skb_gro_header_hard(skb, hlen)) {
+ th = skb_gro_header_slow(skb, hlen, off);
+ if (unlikely(!th))
+ goto out;
+ }
+
+ thlen = th->doff * 4;
+ if (thlen < sizeof(*th))
+ goto out;
+
+ hlen = off + thlen;
+ if (skb_gro_header_hard(skb, hlen)) {
+ th = skb_gro_header_slow(skb, hlen, off);
+ if (unlikely(!th))
+ goto out;
+ }
+
+ skb_gro_pull(skb, thlen);
+
+ len = skb_gro_len(skb);
+ flags = tcp_flag_word(th);
+
+ for (; (p = *head); head = &p->next) {
+ if (!NAPI_GRO_CB(p)->same_flow)
+ continue;
+
+ th2 = tcp_hdr(p);
+
+ if (*(u32 *)&th->source ^ *(u32 *)&th2->source) {
+ NAPI_GRO_CB(p)->same_flow = 0;
+ continue;
+ }
+
+ goto found;
+ }
+
+ goto out_check_final;
+
+found:
+ /* Include the IP ID check below from the inner most IP hdr */
+ flush = NAPI_GRO_CB(p)->flush | NAPI_GRO_CB(p)->flush_id;
+ flush |= (__force int)(flags & TCP_FLAG_CWR);
+ flush |= (__force int)((flags ^ tcp_flag_word(th2)) &
+ ~(TCP_FLAG_CWR | TCP_FLAG_FIN | TCP_FLAG_PSH));
+ flush |= (__force int)(th->ack_seq ^ th2->ack_seq);
+ for (i = sizeof(*th); i < thlen; i += 4)
+ flush |= *(u32 *)((u8 *)th + i) ^
+ *(u32 *)((u8 *)th2 + i);
+
+ mss = tcp_skb_mss(p);
+
+ flush |= (len - 1) >= mss;
+ flush |= (ntohl(th2->seq) + skb_gro_len(p)) ^ ntohl(th->seq);
+
+ if (flush || skb_gro_receive(head, skb)) {
+ mss = 1;
+ goto out_check_final;
+ }
+
+ p = *head;
+ th2 = tcp_hdr(p);
+ tcp_flag_word(th2) |= flags & (TCP_FLAG_FIN | TCP_FLAG_PSH);
+
+out_check_final:
+ flush = len < mss;
+ flush |= (__force int)(flags & (TCP_FLAG_URG | TCP_FLAG_PSH |
+ TCP_FLAG_RST | TCP_FLAG_SYN |
+ TCP_FLAG_FIN));
+
+ if (p && (!NAPI_GRO_CB(skb)->same_flow || flush))
+ pp = head;
+
+out:
+ NAPI_GRO_CB(skb)->flush |= (flush != 0);
+
+ return pp;
+}
+
+int tcp_gro_complete(struct sk_buff *skb)
+{
+ struct tcphdr *th = tcp_hdr(skb);
+
+ skb->csum_start = (unsigned char *)th - skb->head;
+ skb->csum_offset = offsetof(struct tcphdr, check);
+ skb->ip_summed = CHECKSUM_PARTIAL;
+
+ skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count;
+
+ if (th->cwr)
+ skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN;
+
+ return 0;
+}
+EXPORT_SYMBOL(tcp_gro_complete);
+
+static struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
+{
+ /* Don't bother verifying checksum if we're going to flush anyway. */
+ if (!NAPI_GRO_CB(skb)->flush &&
+ skb_gro_checksum_validate(skb, IPPROTO_TCP,
+ inet_gro_compute_pseudo)) {
+ NAPI_GRO_CB(skb)->flush = 1;
+ return NULL;
+ }
+
+ return tcp_gro_receive(head, skb);
+}
+
+static int tcp4_gro_complete(struct sk_buff *skb, int thoff)
+{
+ const struct iphdr *iph = ip_hdr(skb);
+ struct tcphdr *th = tcp_hdr(skb);
+
+ th->check = ~tcp_v4_check(skb->len - thoff, iph->saddr,
+ iph->daddr, 0);
+ skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV4;
+
+ return tcp_gro_complete(skb);
+}
+
+static const struct net_offload tcpv4_offload = {
+ .callbacks = {
+ .gso_segment = tcp4_gso_segment,
+ .gro_receive = tcp4_gro_receive,
+ .gro_complete = tcp4_gro_complete,
+ },
+};
+
+int __init tcpv4_offload_init(void)
+{
+ return inet_add_offload(&tcpv4_offload, IPPROTO_TCP);
+}
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
new file mode 100644
index 000000000..d80faa151
--- /dev/null
+++ b/net/ipv4/tcp_output.c
@@ -0,0 +1,3525 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * Implementation of the Transmission Control Protocol(TCP).
+ *
+ * Authors: Ross Biro
+ * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ * Mark Evans, <evansmp@uhura.aston.ac.uk>
+ * Corey Minyard <wf-rch!minyard@relay.EU.net>
+ * Florian La Roche, <flla@stud.uni-sb.de>
+ * Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
+ * Linus Torvalds, <torvalds@cs.helsinki.fi>
+ * Alan Cox, <gw4pts@gw4pts.ampr.org>
+ * Matthew Dillon, <dillon@apollo.west.oic.com>
+ * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
+ * Jorge Cwik, <jorge@laser.satlink.net>
+ */
+
+/*
+ * Changes: Pedro Roque : Retransmit queue handled by TCP.
+ * : Fragmentation on mtu decrease
+ * : Segment collapse on retransmit
+ * : AF independence
+ *
+ * Linus Torvalds : send_delayed_ack
+ * David S. Miller : Charge memory using the right skb
+ * during syn/ack processing.
+ * David S. Miller : Output engine completely rewritten.
+ * Andrea Arcangeli: SYNACK carry ts_recent in tsecr.
+ * Cacophonix Gaul : draft-minshall-nagle-01
+ * J Hadi Salim : ECN support
+ *
+ */
+
+#define pr_fmt(fmt) "TCP: " fmt
+
+#include <net/tcp.h>
+
+#include <linux/compiler.h>
+#include <linux/gfp.h>
+#include <linux/module.h>
+
+/* People can turn this off for buggy TCP's found in printers etc. */
+int sysctl_tcp_retrans_collapse __read_mostly = 1;
+
+/* People can turn this on to work with those rare, broken TCPs that
+ * interpret the window field as a signed quantity.
+ */
+int sysctl_tcp_workaround_signed_windows __read_mostly = 0;
+
+/* Default TSQ limit of two TSO segments */
+int sysctl_tcp_limit_output_bytes __read_mostly = 131072;
+
+/* This limits the percentage of the congestion window which we
+ * will allow a single TSO frame to consume. Building TSO frames
+ * which are too large can cause TCP streams to be bursty.
+ */
+int sysctl_tcp_tso_win_divisor __read_mostly = 3;
+
+/* By default, RFC2861 behavior. */
+int sysctl_tcp_slow_start_after_idle __read_mostly = 1;
+
+unsigned int sysctl_tcp_notsent_lowat __read_mostly = UINT_MAX;
+EXPORT_SYMBOL(sysctl_tcp_notsent_lowat);
+
+static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
+ int push_one, gfp_t gfp);
+
+/* Account for new data that has been sent to the network. */
+static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+ unsigned int prior_packets = tp->packets_out;
+
+ tcp_advance_send_head(sk, skb);
+ tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
+
+ tp->packets_out += tcp_skb_pcount(skb);
+ if (!prior_packets || icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
+ icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
+ tcp_rearm_rto(sk);
+ }
+
+ NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT,
+ tcp_skb_pcount(skb));
+}
+
+/* SND.NXT, if window was not shrunk.
+ * If window has been shrunk, what should we make? It is not clear at all.
+ * Using SND.UNA we will fail to open window, SND.NXT is out of window. :-(
+ * Anything in between SND.UNA...SND.UNA+SND.WND also can be already
+ * invalid. OK, let's make this for now:
+ */
+static inline __u32 tcp_acceptable_seq(const struct sock *sk)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+
+ if (!before(tcp_wnd_end(tp), tp->snd_nxt))
+ return tp->snd_nxt;
+ else
+ return tcp_wnd_end(tp);
+}
+
+/* Calculate mss to advertise in SYN segment.
+ * RFC1122, RFC1063, draft-ietf-tcpimpl-pmtud-01 state that:
+ *
+ * 1. It is independent of path mtu.
+ * 2. Ideally, it is maximal possible segment size i.e. 65535-40.
+ * 3. For IPv4 it is reasonable to calculate it from maximal MTU of
+ * attached devices, because some buggy hosts are confused by
+ * large MSS.
+ * 4. We do not make 3, we advertise MSS, calculated from first
+ * hop device mtu, but allow to raise it to ip_rt_min_advmss.
+ * This may be overridden via information stored in routing table.
+ * 5. Value 65535 for MSS is valid in IPv6 and means "as large as possible,
+ * probably even Jumbo".
+ */
+static __u16 tcp_advertise_mss(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ const struct dst_entry *dst = __sk_dst_get(sk);
+ int mss = tp->advmss;
+
+ if (dst) {
+ unsigned int metric = dst_metric_advmss(dst);
+
+ if (metric < mss) {
+ mss = metric;
+ tp->advmss = mss;
+ }
+ }
+
+ return (__u16)mss;
+}
+
+/* RFC2861. Reset CWND after idle period longer RTO to "restart window".
+ * This is the first part of cwnd validation mechanism. */
+static void tcp_cwnd_restart(struct sock *sk, const struct dst_entry *dst)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ s32 delta = tcp_time_stamp - tp->lsndtime;
+ u32 restart_cwnd = tcp_init_cwnd(tp, dst);
+ u32 cwnd = tp->snd_cwnd;
+
+ tcp_ca_event(sk, CA_EVENT_CWND_RESTART);
+
+ tp->snd_ssthresh = tcp_current_ssthresh(sk);
+ restart_cwnd = min(restart_cwnd, cwnd);
+
+ while ((delta -= inet_csk(sk)->icsk_rto) > 0 && cwnd > restart_cwnd)
+ cwnd >>= 1;
+ tp->snd_cwnd = max(cwnd, restart_cwnd);
+ tp->snd_cwnd_stamp = tcp_time_stamp;
+ tp->snd_cwnd_used = 0;
+}
+
+/* Congestion state accounting after a packet has been sent. */
+static void tcp_event_data_sent(struct tcp_sock *tp,
+ struct sock *sk)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ const u32 now = tcp_time_stamp;
+ const struct dst_entry *dst = __sk_dst_get(sk);
+
+ if (sysctl_tcp_slow_start_after_idle &&
+ (!tp->packets_out && (s32)(now - tp->lsndtime) > icsk->icsk_rto))
+ tcp_cwnd_restart(sk, __sk_dst_get(sk));
+
+ tp->lsndtime = now;
+
+ /* If it is a reply for ato after last received
+ * packet, enter pingpong mode.
+ */
+ if ((u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato &&
+ (!dst || !dst_metric(dst, RTAX_QUICKACK)))
+ icsk->icsk_ack.pingpong = 1;
+}
+
+/* Account for an ACK we sent. */
+static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts)
+{
+ tcp_dec_quickack_mode(sk, pkts);
+ inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
+}
+
+
+u32 tcp_default_init_rwnd(u32 mss)
+{
+ /* Initial receive window should be twice of TCP_INIT_CWND to
+ * enable proper sending of new unsent data during fast recovery
+ * (RFC 3517, Section 4, NextSeg() rule (2)). Further place a
+ * limit when mss is larger than 1460.
+ */
+ u32 init_rwnd = TCP_INIT_CWND * 2;
+
+ if (mss > 1460)
+ init_rwnd = max((1460 * init_rwnd) / mss, 2U);
+ return init_rwnd;
+}
+
+/* Determine a window scaling and initial window to offer.
+ * Based on the assumption that the given amount of space
+ * will be offered. Store the results in the tp structure.
+ * NOTE: for smooth operation initial space offering should
+ * be a multiple of mss if possible. We assume here that mss >= 1.
+ * This MUST be enforced by all callers.
+ */
+void tcp_select_initial_window(int __space, __u32 mss,
+ __u32 *rcv_wnd, __u32 *window_clamp,
+ int wscale_ok, __u8 *rcv_wscale,
+ __u32 init_rcv_wnd)
+{
+ unsigned int space = (__space < 0 ? 0 : __space);
+
+ /* If no clamp set the clamp to the max possible scaled window */
+ if (*window_clamp == 0)
+ (*window_clamp) = (65535 << 14);
+ space = min(*window_clamp, space);
+
+ /* Quantize space offering to a multiple of mss if possible. */
+ if (space > mss)
+ space = (space / mss) * mss;
+
+ /* NOTE: offering an initial window larger than 32767
+ * will break some buggy TCP stacks. If the admin tells us
+ * it is likely we could be speaking with such a buggy stack
+ * we will truncate our initial window offering to 32K-1
+ * unless the remote has sent us a window scaling option,
+ * which we interpret as a sign the remote TCP is not
+ * misinterpreting the window field as a signed quantity.
+ */
+ if (sysctl_tcp_workaround_signed_windows)
+ (*rcv_wnd) = min(space, MAX_TCP_WINDOW);
+ else
+ (*rcv_wnd) = space;
+
+ (*rcv_wscale) = 0;
+ if (wscale_ok) {
+ /* Set window scaling on max possible window
+ * See RFC1323 for an explanation of the limit to 14
+ */
+ space = max_t(u32, sysctl_tcp_rmem[2], sysctl_rmem_max);
+ space = min_t(u32, space, *window_clamp);
+ while (space > 65535 && (*rcv_wscale) < 14) {
+ space >>= 1;
+ (*rcv_wscale)++;
+ }
+ }
+
+ if (mss > (1 << *rcv_wscale)) {
+ if (!init_rcv_wnd) /* Use default unless specified otherwise */
+ init_rcv_wnd = tcp_default_init_rwnd(mss);
+ *rcv_wnd = min(*rcv_wnd, init_rcv_wnd * mss);
+ }
+
+ /* Set the clamp no higher than max representable value */
+ (*window_clamp) = min(65535U << (*rcv_wscale), *window_clamp);
+}
+EXPORT_SYMBOL(tcp_select_initial_window);
+
+/* Chose a new window to advertise, update state in tcp_sock for the
+ * socket, and return result with RFC1323 scaling applied. The return
+ * value can be stuffed directly into th->window for an outgoing
+ * frame.
+ */
+static u16 tcp_select_window(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ u32 old_win = tp->rcv_wnd;
+ u32 cur_win = tcp_receive_window(tp);
+ u32 new_win = __tcp_select_window(sk);
+
+ /* Never shrink the offered window */
+ if (new_win < cur_win) {
+ /* Danger Will Robinson!
+ * Don't update rcv_wup/rcv_wnd here or else
+ * we will not be able to advertise a zero
+ * window in time. --DaveM
+ *
+ * Relax Will Robinson.
+ */
+ if (new_win == 0)
+ NET_INC_STATS(sock_net(sk),
+ LINUX_MIB_TCPWANTZEROWINDOWADV);
+ new_win = ALIGN(cur_win, 1 << tp->rx_opt.rcv_wscale);
+ }
+ tp->rcv_wnd = new_win;
+ tp->rcv_wup = tp->rcv_nxt;
+
+ /* Make sure we do not exceed the maximum possible
+ * scaled window.
+ */
+ if (!tp->rx_opt.rcv_wscale && sysctl_tcp_workaround_signed_windows)
+ new_win = min(new_win, MAX_TCP_WINDOW);
+ else
+ new_win = min(new_win, (65535U << tp->rx_opt.rcv_wscale));
+
+ /* RFC1323 scaling applied */
+ new_win >>= tp->rx_opt.rcv_wscale;
+
+ /* If we advertise zero window, disable fast path. */
+ if (new_win == 0) {
+ tp->pred_flags = 0;
+ if (old_win)
+ NET_INC_STATS(sock_net(sk),
+ LINUX_MIB_TCPTOZEROWINDOWADV);
+ } else if (old_win == 0) {
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFROMZEROWINDOWADV);
+ }
+
+ return new_win;
+}
+
+/* Packet ECN state for a SYN-ACK */
+static void tcp_ecn_send_synack(struct sock *sk, struct sk_buff *skb)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+
+ TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_CWR;
+ if (!(tp->ecn_flags & TCP_ECN_OK))
+ TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ECE;
+ else if (tcp_ca_needs_ecn(sk))
+ INET_ECN_xmit(sk);
+}
+
+/* Packet ECN state for a SYN. */
+static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ bool use_ecn = sock_net(sk)->ipv4.sysctl_tcp_ecn == 1 ||
+ tcp_ca_needs_ecn(sk);
+
+ if (!use_ecn) {
+ const struct dst_entry *dst = __sk_dst_get(sk);
+
+ if (dst && dst_feature(dst, RTAX_FEATURE_ECN))
+ use_ecn = true;
+ }
+
+ tp->ecn_flags = 0;
+
+ if (use_ecn) {
+ TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR;
+ tp->ecn_flags = TCP_ECN_OK;
+ if (tcp_ca_needs_ecn(sk))
+ INET_ECN_xmit(sk);
+ }
+}
+
+static void
+tcp_ecn_make_synack(const struct request_sock *req, struct tcphdr *th,
+ struct sock *sk)
+{
+ if (inet_rsk(req)->ecn_ok) {
+ th->ece = 1;
+ if (tcp_ca_needs_ecn(sk))
+ INET_ECN_xmit(sk);
+ }
+}
+
+/* Set up ECN state for a packet on a ESTABLISHED socket that is about to
+ * be sent.
+ */
+static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb,
+ int tcp_header_len)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (tp->ecn_flags & TCP_ECN_OK) {
+ /* Not-retransmitted data segment: set ECT and inject CWR. */
+ if (skb->len != tcp_header_len &&
+ !before(TCP_SKB_CB(skb)->seq, tp->snd_nxt)) {
+ INET_ECN_xmit(sk);
+ if (tp->ecn_flags & TCP_ECN_QUEUE_CWR) {
+ tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR;
+ tcp_hdr(skb)->cwr = 1;
+ skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN;
+ }
+ } else if (!tcp_ca_needs_ecn(sk)) {
+ /* ACK or retransmitted segment: clear ECT|CE */
+ INET_ECN_dontxmit(sk);
+ }
+ if (tp->ecn_flags & TCP_ECN_DEMAND_CWR)
+ tcp_hdr(skb)->ece = 1;
+ }
+}
+
+/* Constructs common control bits of non-data skb. If SYN/FIN is present,
+ * auto increment end seqno.
+ */
+static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
+{
+ struct skb_shared_info *shinfo = skb_shinfo(skb);
+
+ skb->ip_summed = CHECKSUM_PARTIAL;
+ skb->csum = 0;
+
+ TCP_SKB_CB(skb)->tcp_flags = flags;
+ TCP_SKB_CB(skb)->sacked = 0;
+
+ tcp_skb_pcount_set(skb, 1);
+ shinfo->gso_size = 0;
+ shinfo->gso_type = 0;
+
+ TCP_SKB_CB(skb)->seq = seq;
+ if (flags & (TCPHDR_SYN | TCPHDR_FIN))
+ seq++;
+ TCP_SKB_CB(skb)->end_seq = seq;
+}
+
+static inline bool tcp_urg_mode(const struct tcp_sock *tp)
+{
+ return tp->snd_una != tp->snd_up;
+}
+
+#define OPTION_SACK_ADVERTISE (1 << 0)
+#define OPTION_TS (1 << 1)
+#define OPTION_MD5 (1 << 2)
+#define OPTION_WSCALE (1 << 3)
+#define OPTION_FAST_OPEN_COOKIE (1 << 8)
+
+struct tcp_out_options {
+ u16 options; /* bit field of OPTION_* */
+ u16 mss; /* 0 to disable */
+ u8 ws; /* window scale, 0 to disable */
+ u8 num_sack_blocks; /* number of SACK blocks to include */
+ u8 hash_size; /* bytes in hash_location */
+ __u8 *hash_location; /* temporary pointer, overloaded */
+ __u32 tsval, tsecr; /* need to include OPTION_TS */
+ struct tcp_fastopen_cookie *fastopen_cookie; /* Fast open cookie */
+};
+
+/* Write previously computed TCP options to the packet.
+ *
+ * Beware: Something in the Internet is very sensitive to the ordering of
+ * TCP options, we learned this through the hard way, so be careful here.
+ * Luckily we can at least blame others for their non-compliance but from
+ * inter-operability perspective it seems that we're somewhat stuck with
+ * the ordering which we have been using if we want to keep working with
+ * those broken things (not that it currently hurts anybody as there isn't
+ * particular reason why the ordering would need to be changed).
+ *
+ * At least SACK_PERM as the first option is known to lead to a disaster
+ * (but it may well be that other scenarios fail similarly).
+ */
+static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
+ struct tcp_out_options *opts)
+{
+ u16 options = opts->options; /* mungable copy */
+
+ if (unlikely(OPTION_MD5 & options)) {
+ *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
+ (TCPOPT_MD5SIG << 8) | TCPOLEN_MD5SIG);
+ /* overload cookie hash location */
+ opts->hash_location = (__u8 *)ptr;
+ ptr += 4;
+ }
+
+ if (unlikely(opts->mss)) {
+ *ptr++ = htonl((TCPOPT_MSS << 24) |
+ (TCPOLEN_MSS << 16) |
+ opts->mss);
+ }
+
+ if (likely(OPTION_TS & options)) {
+ if (unlikely(OPTION_SACK_ADVERTISE & options)) {
+ *ptr++ = htonl((TCPOPT_SACK_PERM << 24) |
+ (TCPOLEN_SACK_PERM << 16) |
+ (TCPOPT_TIMESTAMP << 8) |
+ TCPOLEN_TIMESTAMP);
+ options &= ~OPTION_SACK_ADVERTISE;
+ } else {
+ *ptr++ = htonl((TCPOPT_NOP << 24) |
+ (TCPOPT_NOP << 16) |
+ (TCPOPT_TIMESTAMP << 8) |
+ TCPOLEN_TIMESTAMP);
+ }
+ *ptr++ = htonl(opts->tsval);
+ *ptr++ = htonl(opts->tsecr);
+ }
+
+ if (unlikely(OPTION_SACK_ADVERTISE & options)) {
+ *ptr++ = htonl((TCPOPT_NOP << 24) |
+ (TCPOPT_NOP << 16) |
+ (TCPOPT_SACK_PERM << 8) |
+ TCPOLEN_SACK_PERM);
+ }
+
+ if (unlikely(OPTION_WSCALE & options)) {
+ *ptr++ = htonl((TCPOPT_NOP << 24) |
+ (TCPOPT_WINDOW << 16) |
+ (TCPOLEN_WINDOW << 8) |
+ opts->ws);
+ }
+
+ if (unlikely(opts->num_sack_blocks)) {
+ struct tcp_sack_block *sp = tp->rx_opt.dsack ?
+ tp->duplicate_sack : tp->selective_acks;
+ int this_sack;
+
+ *ptr++ = htonl((TCPOPT_NOP << 24) |
+ (TCPOPT_NOP << 16) |
+ (TCPOPT_SACK << 8) |
+ (TCPOLEN_SACK_BASE + (opts->num_sack_blocks *
+ TCPOLEN_SACK_PERBLOCK)));
+
+ for (this_sack = 0; this_sack < opts->num_sack_blocks;
+ ++this_sack) {
+ *ptr++ = htonl(sp[this_sack].start_seq);
+ *ptr++ = htonl(sp[this_sack].end_seq);
+ }
+
+ tp->rx_opt.dsack = 0;
+ }
+
+ if (unlikely(OPTION_FAST_OPEN_COOKIE & options)) {
+ struct tcp_fastopen_cookie *foc = opts->fastopen_cookie;
+ u8 *p = (u8 *)ptr;
+ u32 len; /* Fast Open option length */
+
+ if (foc->exp) {
+ len = TCPOLEN_EXP_FASTOPEN_BASE + foc->len;
+ *ptr = htonl((TCPOPT_EXP << 24) | (len << 16) |
+ TCPOPT_FASTOPEN_MAGIC);
+ p += TCPOLEN_EXP_FASTOPEN_BASE;
+ } else {
+ len = TCPOLEN_FASTOPEN_BASE + foc->len;
+ *p++ = TCPOPT_FASTOPEN;
+ *p++ = len;
+ }
+
+ memcpy(p, foc->val, foc->len);
+ if ((len & 3) == 2) {
+ p[foc->len] = TCPOPT_NOP;
+ p[foc->len + 1] = TCPOPT_NOP;
+ }
+ ptr += (len + 3) >> 2;
+ }
+}
+
+/* Compute TCP options for SYN packets. This is not the final
+ * network wire format yet.
+ */
+static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
+ struct tcp_out_options *opts,
+ struct tcp_md5sig_key **md5)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ unsigned int remaining = MAX_TCP_OPTION_SPACE;
+ struct tcp_fastopen_request *fastopen = tp->fastopen_req;
+
+#ifdef CONFIG_TCP_MD5SIG
+ *md5 = tp->af_specific->md5_lookup(sk, sk);
+ if (*md5) {
+ opts->options |= OPTION_MD5;
+ remaining -= TCPOLEN_MD5SIG_ALIGNED;
+ }
+#else
+ *md5 = NULL;
+#endif
+
+ /* We always get an MSS option. The option bytes which will be seen in
+ * normal data packets should timestamps be used, must be in the MSS
+ * advertised. But we subtract them from tp->mss_cache so that
+ * calculations in tcp_sendmsg are simpler etc. So account for this
+ * fact here if necessary. If we don't do this correctly, as a
+ * receiver we won't recognize data packets as being full sized when we
+ * should, and thus we won't abide by the delayed ACK rules correctly.
+ * SACKs don't matter, we never delay an ACK when we have any of those
+ * going out. */
+ opts->mss = tcp_advertise_mss(sk);
+ remaining -= TCPOLEN_MSS_ALIGNED;
+
+ if (likely(sysctl_tcp_timestamps && !*md5)) {
+ opts->options |= OPTION_TS;
+ opts->tsval = tcp_skb_timestamp(skb) + tp->tsoffset;
+ opts->tsecr = tp->rx_opt.ts_recent;
+ remaining -= TCPOLEN_TSTAMP_ALIGNED;
+ }
+ if (likely(sysctl_tcp_window_scaling)) {
+ opts->ws = tp->rx_opt.rcv_wscale;
+ opts->options |= OPTION_WSCALE;
+ remaining -= TCPOLEN_WSCALE_ALIGNED;
+ }
+ if (likely(sysctl_tcp_sack)) {
+ opts->options |= OPTION_SACK_ADVERTISE;
+ if (unlikely(!(OPTION_TS & opts->options)))
+ remaining -= TCPOLEN_SACKPERM_ALIGNED;
+ }
+
+ if (fastopen && fastopen->cookie.len >= 0) {
+ u32 need = fastopen->cookie.len;
+
+ need += fastopen->cookie.exp ? TCPOLEN_EXP_FASTOPEN_BASE :
+ TCPOLEN_FASTOPEN_BASE;
+ need = (need + 3) & ~3U; /* Align to 32 bits */
+ if (remaining >= need) {
+ opts->options |= OPTION_FAST_OPEN_COOKIE;
+ opts->fastopen_cookie = &fastopen->cookie;
+ remaining -= need;
+ tp->syn_fastopen = 1;
+ tp->syn_fastopen_exp = fastopen->cookie.exp ? 1 : 0;
+ }
+ }
+
+ return MAX_TCP_OPTION_SPACE - remaining;
+}
+
+/* Set up TCP options for SYN-ACKs. */
+static unsigned int tcp_synack_options(struct sock *sk,
+ struct request_sock *req,
+ unsigned int mss, struct sk_buff *skb,
+ struct tcp_out_options *opts,
+ const struct tcp_md5sig_key *md5,
+ struct tcp_fastopen_cookie *foc)
+{
+ struct inet_request_sock *ireq = inet_rsk(req);
+ unsigned int remaining = MAX_TCP_OPTION_SPACE;
+
+#ifdef CONFIG_TCP_MD5SIG
+ if (md5) {
+ opts->options |= OPTION_MD5;
+ remaining -= TCPOLEN_MD5SIG_ALIGNED;
+
+ /* We can't fit any SACK blocks in a packet with MD5 + TS
+ * options. There was discussion about disabling SACK
+ * rather than TS in order to fit in better with old,
+ * buggy kernels, but that was deemed to be unnecessary.
+ */
+ ireq->tstamp_ok &= !ireq->sack_ok;
+ }
+#endif
+
+ /* We always send an MSS option. */
+ opts->mss = mss;
+ remaining -= TCPOLEN_MSS_ALIGNED;
+
+ if (likely(ireq->wscale_ok)) {
+ opts->ws = ireq->rcv_wscale;
+ opts->options |= OPTION_WSCALE;
+ remaining -= TCPOLEN_WSCALE_ALIGNED;
+ }
+ if (likely(ireq->tstamp_ok)) {
+ opts->options |= OPTION_TS;
+ opts->tsval = tcp_skb_timestamp(skb);
+ opts->tsecr = req->ts_recent;
+ remaining -= TCPOLEN_TSTAMP_ALIGNED;
+ }
+ if (likely(ireq->sack_ok)) {
+ opts->options |= OPTION_SACK_ADVERTISE;
+ if (unlikely(!ireq->tstamp_ok))
+ remaining -= TCPOLEN_SACKPERM_ALIGNED;
+ }
+ if (foc != NULL && foc->len >= 0) {
+ u32 need = foc->len;
+
+ need += foc->exp ? TCPOLEN_EXP_FASTOPEN_BASE :
+ TCPOLEN_FASTOPEN_BASE;
+ need = (need + 3) & ~3U; /* Align to 32 bits */
+ if (remaining >= need) {
+ opts->options |= OPTION_FAST_OPEN_COOKIE;
+ opts->fastopen_cookie = foc;
+ remaining -= need;
+ }
+ }
+
+ return MAX_TCP_OPTION_SPACE - remaining;
+}
+
+/* Compute TCP options for ESTABLISHED sockets. This is not the
+ * final wire format yet.
+ */
+static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb,
+ struct tcp_out_options *opts,
+ struct tcp_md5sig_key **md5)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ unsigned int size = 0;
+ unsigned int eff_sacks;
+
+ opts->options = 0;
+
+#ifdef CONFIG_TCP_MD5SIG
+ *md5 = tp->af_specific->md5_lookup(sk, sk);
+ if (unlikely(*md5)) {
+ opts->options |= OPTION_MD5;
+ size += TCPOLEN_MD5SIG_ALIGNED;
+ }
+#else
+ *md5 = NULL;
+#endif
+
+ if (likely(tp->rx_opt.tstamp_ok)) {
+ opts->options |= OPTION_TS;
+ opts->tsval = skb ? tcp_skb_timestamp(skb) + tp->tsoffset : 0;
+ opts->tsecr = tp->rx_opt.ts_recent;
+ size += TCPOLEN_TSTAMP_ALIGNED;
+ }
+
+ eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack;
+ if (unlikely(eff_sacks)) {
+ const unsigned int remaining = MAX_TCP_OPTION_SPACE - size;
+ opts->num_sack_blocks =
+ min_t(unsigned int, eff_sacks,
+ (remaining - TCPOLEN_SACK_BASE_ALIGNED) /
+ TCPOLEN_SACK_PERBLOCK);
+ size += TCPOLEN_SACK_BASE_ALIGNED +
+ opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK;
+ }
+
+ return size;
+}
+
+
+/* TCP SMALL QUEUES (TSQ)
+ *
+ * TSQ goal is to keep small amount of skbs per tcp flow in tx queues (qdisc+dev)
+ * to reduce RTT and bufferbloat.
+ * We do this using a special skb destructor (tcp_wfree).
+ *
+ * Its important tcp_wfree() can be replaced by sock_wfree() in the event skb
+ * needs to be reallocated in a driver.
+ * The invariant being skb->truesize subtracted from sk->sk_wmem_alloc
+ *
+ * Since transmit from skb destructor is forbidden, we use a tasklet
+ * to process all sockets that eventually need to send more skbs.
+ * We use one tasklet per cpu, with its own queue of sockets.
+ */
+struct tsq_tasklet {
+ struct tasklet_struct tasklet;
+ struct list_head head; /* queue of tcp sockets */
+};
+static DEFINE_PER_CPU(struct tsq_tasklet, tsq_tasklet);
+
+static void tcp_tsq_handler(struct sock *sk)
+{
+ if ((1 << sk->sk_state) &
+ (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_CLOSING |
+ TCPF_CLOSE_WAIT | TCPF_LAST_ACK))
+ tcp_write_xmit(sk, tcp_current_mss(sk), tcp_sk(sk)->nonagle,
+ 0, GFP_ATOMIC);
+}
+/*
+ * One tasklet per cpu tries to send more skbs.
+ * We run in tasklet context but need to disable irqs when
+ * transferring tsq->head because tcp_wfree() might
+ * interrupt us (non NAPI drivers)
+ */
+static void tcp_tasklet_func(unsigned long data)
+{
+ struct tsq_tasklet *tsq = (struct tsq_tasklet *)data;
+ LIST_HEAD(list);
+ unsigned long flags;
+ struct list_head *q, *n;
+ struct tcp_sock *tp;
+ struct sock *sk;
+
+ local_irq_save(flags);
+ list_splice_init(&tsq->head, &list);
+ local_irq_restore(flags);
+
+ list_for_each_safe(q, n, &list) {
+ tp = list_entry(q, struct tcp_sock, tsq_node);
+ list_del(&tp->tsq_node);
+
+ sk = (struct sock *)tp;
+ bh_lock_sock(sk);
+
+ if (!sock_owned_by_user(sk)) {
+ tcp_tsq_handler(sk);
+ } else {
+ /* defer the work to tcp_release_cb() */
+ set_bit(TCP_TSQ_DEFERRED, &tp->tsq_flags);
+ }
+ bh_unlock_sock(sk);
+
+ clear_bit(TSQ_QUEUED, &tp->tsq_flags);
+ sk_free(sk);
+ }
+}
+
+#define TCP_DEFERRED_ALL ((1UL << TCP_TSQ_DEFERRED) | \
+ (1UL << TCP_WRITE_TIMER_DEFERRED) | \
+ (1UL << TCP_DELACK_TIMER_DEFERRED) | \
+ (1UL << TCP_MTU_REDUCED_DEFERRED))
+/**
+ * tcp_release_cb - tcp release_sock() callback
+ * @sk: socket
+ *
+ * called from release_sock() to perform protocol dependent
+ * actions before socket release.
+ */
+void tcp_release_cb(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ unsigned long flags, nflags;
+
+ /* perform an atomic operation only if at least one flag is set */
+ do {
+ flags = tp->tsq_flags;
+ if (!(flags & TCP_DEFERRED_ALL))
+ return;
+ nflags = flags & ~TCP_DEFERRED_ALL;
+ } while (cmpxchg(&tp->tsq_flags, flags, nflags) != flags);
+
+ if (flags & (1UL << TCP_TSQ_DEFERRED))
+ tcp_tsq_handler(sk);
+
+ /* Here begins the tricky part :
+ * We are called from release_sock() with :
+ * 1) BH disabled
+ * 2) sk_lock.slock spinlock held
+ * 3) socket owned by us (sk->sk_lock.owned == 1)
+ *
+ * But following code is meant to be called from BH handlers,
+ * so we should keep BH disabled, but early release socket ownership
+ */
+ sock_release_ownership(sk);
+
+ if (flags & (1UL << TCP_WRITE_TIMER_DEFERRED)) {
+ tcp_write_timer_handler(sk);
+ __sock_put(sk);
+ }
+ if (flags & (1UL << TCP_DELACK_TIMER_DEFERRED)) {
+ tcp_delack_timer_handler(sk);
+ __sock_put(sk);
+ }
+ if (flags & (1UL << TCP_MTU_REDUCED_DEFERRED)) {
+ inet_csk(sk)->icsk_af_ops->mtu_reduced(sk);
+ __sock_put(sk);
+ }
+}
+EXPORT_SYMBOL(tcp_release_cb);
+
+void __init tcp_tasklet_init(void)
+{
+ int i;
+
+ for_each_possible_cpu(i) {
+ struct tsq_tasklet *tsq = &per_cpu(tsq_tasklet, i);
+
+ INIT_LIST_HEAD(&tsq->head);
+ tasklet_init(&tsq->tasklet,
+ tcp_tasklet_func,
+ (unsigned long)tsq);
+ }
+}
+
+/*
+ * Write buffer destructor automatically called from kfree_skb.
+ * We can't xmit new skbs from this context, as we might already
+ * hold qdisc lock.
+ */
+void tcp_wfree(struct sk_buff *skb)
+{
+ struct sock *sk = skb->sk;
+ struct tcp_sock *tp = tcp_sk(sk);
+ int wmem;
+
+ /* Keep one reference on sk_wmem_alloc.
+ * Will be released by sk_free() from here or tcp_tasklet_func()
+ */
+ wmem = atomic_sub_return(skb->truesize - 1, &sk->sk_wmem_alloc);
+
+ /* If this softirq is serviced by ksoftirqd, we are likely under stress.
+ * Wait until our queues (qdisc + devices) are drained.
+ * This gives :
+ * - less callbacks to tcp_write_xmit(), reducing stress (batches)
+ * - chance for incoming ACK (processed by another cpu maybe)
+ * to migrate this flow (skb->ooo_okay will be eventually set)
+ */
+ if (wmem >= SKB_TRUESIZE(1) && this_cpu_ksoftirqd() == current)
+ goto out;
+
+ if (test_and_clear_bit(TSQ_THROTTLED, &tp->tsq_flags) &&
+ !test_and_set_bit(TSQ_QUEUED, &tp->tsq_flags)) {
+ unsigned long flags;
+ struct tsq_tasklet *tsq;
+
+ /* queue this socket to tasklet queue */
+ local_irq_save(flags);
+ tsq = this_cpu_ptr(&tsq_tasklet);
+ list_add(&tp->tsq_node, &tsq->head);
+ tasklet_schedule(&tsq->tasklet);
+ local_irq_restore(flags);
+ return;
+ }
+out:
+ sk_free(sk);
+}
+
+/* This routine actually transmits TCP packets queued in by
+ * tcp_do_sendmsg(). This is used by both the initial
+ * transmission and possible later retransmissions.
+ * All SKB's seen here are completely headerless. It is our
+ * job to build the TCP header, and pass the packet down to
+ * IP so it can do the same plus pass the packet off to the
+ * device.
+ *
+ * We are working here with either a clone of the original
+ * SKB, or a fresh unique copy made by the retransmit engine.
+ */
+static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
+ gfp_t gfp_mask)
+{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+ struct inet_sock *inet;
+ struct tcp_sock *tp;
+ struct tcp_skb_cb *tcb;
+ struct tcp_out_options opts;
+ unsigned int tcp_options_size, tcp_header_size;
+ struct tcp_md5sig_key *md5;
+ struct tcphdr *th;
+ int err;
+
+ BUG_ON(!skb || !tcp_skb_pcount(skb));
+
+ if (clone_it) {
+ skb_mstamp_get(&skb->skb_mstamp);
+
+ if (unlikely(skb_cloned(skb)))
+ skb = pskb_copy(skb, gfp_mask);
+ else
+ skb = skb_clone(skb, gfp_mask);
+ if (unlikely(!skb))
+ return -ENOBUFS;
+ }
+
+ inet = inet_sk(sk);
+ tp = tcp_sk(sk);
+ tcb = TCP_SKB_CB(skb);
+ memset(&opts, 0, sizeof(opts));
+
+#ifdef TCP_STEALTH
+ if (unlikely(tcb->tcp_flags & TCPHDR_SYN &&
+ tp->stealth.mode & TCP_STEALTH_MODE_AUTH)) {
+ skb->skb_mstamp = tp->stealth.mstamp;
+ }
+#endif
+
+ if (unlikely(tcb->tcp_flags & TCPHDR_SYN))
+ tcp_options_size = tcp_syn_options(sk, skb, &opts, &md5);
+ else
+ tcp_options_size = tcp_established_options(sk, skb, &opts,
+ &md5);
+ tcp_header_size = tcp_options_size + sizeof(struct tcphdr);
+
+ if (tcp_packets_in_flight(tp) == 0)
+ tcp_ca_event(sk, CA_EVENT_TX_START);
+
+ /* if no packet is in qdisc/device queue, then allow XPS to select
+ * another queue. We can be called from tcp_tsq_handler()
+ * which holds one reference to sk_wmem_alloc.
+ *
+ * TODO: Ideally, in-flight pure ACK packets should not matter here.
+ * One way to get this would be to set skb->truesize = 2 on them.
+ */
+ skb->ooo_okay = sk_wmem_alloc_get(sk) < SKB_TRUESIZE(1);
+
+ skb_push(skb, tcp_header_size);
+ skb_reset_transport_header(skb);
+
+ skb_orphan(skb);
+ skb->sk = sk;
+ skb->destructor = skb_is_tcp_pure_ack(skb) ? sock_wfree : tcp_wfree;
+ skb_set_hash_from_sk(skb, sk);
+ atomic_add(skb->truesize, &sk->sk_wmem_alloc);
+
+ /* Build TCP header and checksum it. */
+ th = tcp_hdr(skb);
+ th->source = inet->inet_sport;
+ th->dest = inet->inet_dport;
+ th->seq = htonl(tcb->seq);
+ th->ack_seq = htonl(tp->rcv_nxt);
+ *(((__be16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) |
+ tcb->tcp_flags);
+
+ if (unlikely(tcb->tcp_flags & TCPHDR_SYN)) {
+ /* RFC1323: The window in SYN & SYN/ACK segments
+ * is never scaled.
+ */
+ th->window = htons(min(tp->rcv_wnd, 65535U));
+ } else {
+ th->window = htons(tcp_select_window(sk));
+ }
+ th->check = 0;
+ th->urg_ptr = 0;
+
+ /* The urg_mode check is necessary during a below snd_una win probe */
+ if (unlikely(tcp_urg_mode(tp) && before(tcb->seq, tp->snd_up))) {
+ if (before(tp->snd_up, tcb->seq + 0x10000)) {
+ th->urg_ptr = htons(tp->snd_up - tcb->seq);
+ th->urg = 1;
+ } else if (after(tcb->seq + 0xFFFF, tp->snd_nxt)) {
+ th->urg_ptr = htons(0xFFFF);
+ th->urg = 1;
+ }
+ }
+
+ tcp_options_write((__be32 *)(th + 1), tp, &opts);
+ if (likely((tcb->tcp_flags & TCPHDR_SYN) == 0))
+ tcp_ecn_send(sk, skb, tcp_header_size);
+
+#ifdef CONFIG_TCP_MD5SIG
+ /* Calculate the MD5 hash, as we have all we need now */
+ if (md5) {
+ sk_nocaps_add(sk, NETIF_F_GSO_MASK);
+ tp->af_specific->calc_md5_hash(opts.hash_location,
+ md5, sk, skb);
+ }
+#endif
+
+ icsk->icsk_af_ops->send_check(sk, skb);
+
+ if (likely(tcb->tcp_flags & TCPHDR_ACK))
+ tcp_event_ack_sent(sk, tcp_skb_pcount(skb));
+
+ if (skb->len != tcp_header_size)
+ tcp_event_data_sent(tp, sk);
+
+ if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq)
+ TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS,
+ tcp_skb_pcount(skb));
+
+ /* OK, its time to fill skb_shinfo(skb)->gso_segs */
+ skb_shinfo(skb)->gso_segs = tcp_skb_pcount(skb);
+
+ /* Our usage of tstamp should remain private */
+ skb->tstamp.tv64 = 0;
+
+ /* Cleanup our debris for IP stacks */
+ memset(skb->cb, 0, max(sizeof(struct inet_skb_parm),
+ sizeof(struct inet6_skb_parm)));
+
+ err = icsk->icsk_af_ops->queue_xmit(sk, skb, &inet->cork.fl);
+
+ if (likely(err <= 0))
+ return err;
+
+ tcp_enter_cwr(sk);
+
+ return net_xmit_eval(err);
+}
+
+/* This routine just queues the buffer for sending.
+ *
+ * NOTE: probe0 timer is not checked, do not forget tcp_push_pending_frames,
+ * otherwise socket can stall.
+ */
+static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ /* Advance write_seq and place onto the write_queue. */
+ tp->write_seq = TCP_SKB_CB(skb)->end_seq;
+ __skb_header_release(skb);
+ tcp_add_write_queue_tail(sk, skb);
+ sk->sk_wmem_queued += skb->truesize;
+ sk_mem_charge(sk, skb->truesize);
+}
+
+/* Initialize TSO segments for a packet. */
+static void tcp_set_skb_tso_segs(const struct sock *sk, struct sk_buff *skb,
+ unsigned int mss_now)
+{
+ struct skb_shared_info *shinfo = skb_shinfo(skb);
+
+ /* Make sure we own this skb before messing gso_size/gso_segs */
+ WARN_ON_ONCE(skb_cloned(skb));
+
+ if (skb->len <= mss_now || skb->ip_summed == CHECKSUM_NONE) {
+ /* Avoid the costly divide in the normal
+ * non-TSO case.
+ */
+ tcp_skb_pcount_set(skb, 1);
+ shinfo->gso_size = 0;
+ shinfo->gso_type = 0;
+ } else {
+ tcp_skb_pcount_set(skb, DIV_ROUND_UP(skb->len, mss_now));
+ shinfo->gso_size = mss_now;
+ shinfo->gso_type = sk->sk_gso_type;
+ }
+}
+
+/* When a modification to fackets out becomes necessary, we need to check
+ * skb is counted to fackets_out or not.
+ */
+static void tcp_adjust_fackets_out(struct sock *sk, const struct sk_buff *skb,
+ int decr)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (!tp->sacked_out || tcp_is_reno(tp))
+ return;
+
+ if (after(tcp_highest_sack_seq(tp), TCP_SKB_CB(skb)->seq))
+ tp->fackets_out -= decr;
+}
+
+/* Pcount in the middle of the write queue got changed, we need to do various
+ * tweaks to fix counters
+ */
+static void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int decr)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ tp->packets_out -= decr;
+
+ if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
+ tp->sacked_out -= decr;
+ if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS)
+ tp->retrans_out -= decr;
+ if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST)
+ tp->lost_out -= decr;
+
+ /* Reno case is special. Sigh... */
+ if (tcp_is_reno(tp) && decr > 0)
+ tp->sacked_out -= min_t(u32, tp->sacked_out, decr);
+
+ tcp_adjust_fackets_out(sk, skb, decr);
+
+ if (tp->lost_skb_hint &&
+ before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(tp->lost_skb_hint)->seq) &&
+ (tcp_is_fack(tp) || (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)))
+ tp->lost_cnt_hint -= decr;
+
+ tcp_verify_left_out(tp);
+}
+
+static void tcp_fragment_tstamp(struct sk_buff *skb, struct sk_buff *skb2)
+{
+ struct skb_shared_info *shinfo = skb_shinfo(skb);
+
+ if (unlikely(shinfo->tx_flags & SKBTX_ANY_TSTAMP) &&
+ !before(shinfo->tskey, TCP_SKB_CB(skb2)->seq)) {
+ struct skb_shared_info *shinfo2 = skb_shinfo(skb2);
+ u8 tsflags = shinfo->tx_flags & SKBTX_ANY_TSTAMP;
+
+ shinfo->tx_flags &= ~tsflags;
+ shinfo2->tx_flags |= tsflags;
+ swap(shinfo->tskey, shinfo2->tskey);
+ }
+}
+
+/* Function to create two new TCP segments. Shrinks the given segment
+ * to the specified size and appends a new segment with the rest of the
+ * packet to the list. This won't be called frequently, I hope.
+ * Remember, these are still headerless SKBs at this point.
+ */
+int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
+ unsigned int mss_now, gfp_t gfp)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *buff;
+ int nsize, old_factor;
+ int nlen;
+ u8 flags;
+
+ if (WARN_ON(len > skb->len))
+ return -EINVAL;
+
+ nsize = skb_headlen(skb) - len;
+ if (nsize < 0)
+ nsize = 0;
+
+ if (skb_unclone(skb, gfp))
+ return -ENOMEM;
+
+ /* Get a new skb... force flag on. */
+ buff = sk_stream_alloc_skb(sk, nsize, gfp);
+ if (!buff)
+ return -ENOMEM; /* We'll just try again later. */
+
+ sk->sk_wmem_queued += buff->truesize;
+ sk_mem_charge(sk, buff->truesize);
+ nlen = skb->len - len - nsize;
+ buff->truesize += nlen;
+ skb->truesize -= nlen;
+
+ /* Correct the sequence numbers. */
+ TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
+ TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
+ TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
+
+ /* PSH and FIN should only be set in the second packet. */
+ flags = TCP_SKB_CB(skb)->tcp_flags;
+ TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH);
+ TCP_SKB_CB(buff)->tcp_flags = flags;
+ TCP_SKB_CB(buff)->sacked = TCP_SKB_CB(skb)->sacked;
+
+ if (!skb_shinfo(skb)->nr_frags && skb->ip_summed != CHECKSUM_PARTIAL) {
+ /* Copy and checksum data tail into the new buffer. */
+ buff->csum = csum_partial_copy_nocheck(skb->data + len,
+ skb_put(buff, nsize),
+ nsize, 0);
+
+ skb_trim(skb, len);
+
+ skb->csum = csum_block_sub(skb->csum, buff->csum, len);
+ } else {
+ skb->ip_summed = CHECKSUM_PARTIAL;
+ skb_split(skb, buff, len);
+ }
+
+ buff->ip_summed = skb->ip_summed;
+
+ buff->tstamp = skb->tstamp;
+ tcp_fragment_tstamp(skb, buff);
+
+ old_factor = tcp_skb_pcount(skb);
+
+ /* Fix up tso_factor for both original and new SKB. */
+ tcp_set_skb_tso_segs(sk, skb, mss_now);
+ tcp_set_skb_tso_segs(sk, buff, mss_now);
+
+ /* If this packet has been sent out already, we must
+ * adjust the various packet counters.
+ */
+ if (!before(tp->snd_nxt, TCP_SKB_CB(buff)->end_seq)) {
+ int diff = old_factor - tcp_skb_pcount(skb) -
+ tcp_skb_pcount(buff);
+
+ if (diff)
+ tcp_adjust_pcount(sk, skb, diff);
+ }
+
+ /* Link BUFF into the send queue. */
+ __skb_header_release(buff);
+ tcp_insert_write_queue_after(skb, buff, sk);
+
+ return 0;
+}
+
+/* This is similar to __pskb_pull_head() (it will go to core/skbuff.c
+ * eventually). The difference is that pulled data not copied, but
+ * immediately discarded.
+ */
+static void __pskb_trim_head(struct sk_buff *skb, int len)
+{
+ struct skb_shared_info *shinfo;
+ int i, k, eat;
+
+ eat = min_t(int, len, skb_headlen(skb));
+ if (eat) {
+ __skb_pull(skb, eat);
+ len -= eat;
+ if (!len)
+ return;
+ }
+ eat = len;
+ k = 0;
+ shinfo = skb_shinfo(skb);
+ for (i = 0; i < shinfo->nr_frags; i++) {
+ int size = skb_frag_size(&shinfo->frags[i]);
+
+ if (size <= eat) {
+ skb_frag_unref(skb, i);
+ eat -= size;
+ } else {
+ shinfo->frags[k] = shinfo->frags[i];
+ if (eat) {
+ shinfo->frags[k].page_offset += eat;
+ skb_frag_size_sub(&shinfo->frags[k], eat);
+ eat = 0;
+ }
+ k++;
+ }
+ }
+ shinfo->nr_frags = k;
+
+ skb_reset_tail_pointer(skb);
+ skb->data_len -= len;
+ skb->len = skb->data_len;
+}
+
+/* Remove acked data from a packet in the transmit queue. */
+int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
+{
+ if (skb_unclone(skb, GFP_ATOMIC))
+ return -ENOMEM;
+
+ __pskb_trim_head(skb, len);
+
+ TCP_SKB_CB(skb)->seq += len;
+ skb->ip_summed = CHECKSUM_PARTIAL;
+
+ skb->truesize -= len;
+ sk->sk_wmem_queued -= len;
+ sk_mem_uncharge(sk, len);
+ sock_set_flag(sk, SOCK_QUEUE_SHRUNK);
+
+ /* Any change of skb->len requires recalculation of tso factor. */
+ if (tcp_skb_pcount(skb) > 1)
+ tcp_set_skb_tso_segs(sk, skb, tcp_skb_mss(skb));
+
+ return 0;
+}
+
+/* Calculate MSS not accounting any TCP options. */
+static inline int __tcp_mtu_to_mss(struct sock *sk, int pmtu)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+ int mss_now;
+
+ /* Calculate base mss without TCP options:
+ It is MMS_S - sizeof(tcphdr) of rfc1122
+ */
+ mss_now = pmtu - icsk->icsk_af_ops->net_header_len - sizeof(struct tcphdr);
+
+ /* IPv6 adds a frag_hdr in case RTAX_FEATURE_ALLFRAG is set */
+ if (icsk->icsk_af_ops->net_frag_header_len) {
+ const struct dst_entry *dst = __sk_dst_get(sk);
+
+ if (dst && dst_allfrag(dst))
+ mss_now -= icsk->icsk_af_ops->net_frag_header_len;
+ }
+
+ /* Clamp it (mss_clamp does not include tcp options) */
+ if (mss_now > tp->rx_opt.mss_clamp)
+ mss_now = tp->rx_opt.mss_clamp;
+
+ /* Now subtract optional transport overhead */
+ mss_now -= icsk->icsk_ext_hdr_len;
+
+ /* Then reserve room for full set of TCP options and 8 bytes of data */
+ if (mss_now < 48)
+ mss_now = 48;
+ return mss_now;
+}
+
+/* Calculate MSS. Not accounting for SACKs here. */
+int tcp_mtu_to_mss(struct sock *sk, int pmtu)
+{
+ /* Subtract TCP options size, not including SACKs */
+ return __tcp_mtu_to_mss(sk, pmtu) -
+ (tcp_sk(sk)->tcp_header_len - sizeof(struct tcphdr));
+}
+
+/* Inverse of above */
+int tcp_mss_to_mtu(struct sock *sk, int mss)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+ int mtu;
+
+ mtu = mss +
+ tp->tcp_header_len +
+ icsk->icsk_ext_hdr_len +
+ icsk->icsk_af_ops->net_header_len;
+
+ /* IPv6 adds a frag_hdr in case RTAX_FEATURE_ALLFRAG is set */
+ if (icsk->icsk_af_ops->net_frag_header_len) {
+ const struct dst_entry *dst = __sk_dst_get(sk);
+
+ if (dst && dst_allfrag(dst))
+ mtu += icsk->icsk_af_ops->net_frag_header_len;
+ }
+ return mtu;
+}
+
+/* MTU probing init per socket */
+void tcp_mtup_init(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct net *net = sock_net(sk);
+
+ icsk->icsk_mtup.enabled = net->ipv4.sysctl_tcp_mtu_probing > 1;
+ icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp + sizeof(struct tcphdr) +
+ icsk->icsk_af_ops->net_header_len;
+ icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, net->ipv4.sysctl_tcp_base_mss);
+ icsk->icsk_mtup.probe_size = 0;
+ if (icsk->icsk_mtup.enabled)
+ icsk->icsk_mtup.probe_timestamp = tcp_time_stamp;
+}
+EXPORT_SYMBOL(tcp_mtup_init);
+
+/* This function synchronize snd mss to current pmtu/exthdr set.
+
+ tp->rx_opt.user_mss is mss set by user by TCP_MAXSEG. It does NOT counts
+ for TCP options, but includes only bare TCP header.
+
+ tp->rx_opt.mss_clamp is mss negotiated at connection setup.
+ It is minimum of user_mss and mss received with SYN.
+ It also does not include TCP options.
+
+ inet_csk(sk)->icsk_pmtu_cookie is last pmtu, seen by this function.
+
+ tp->mss_cache is current effective sending mss, including
+ all tcp options except for SACKs. It is evaluated,
+ taking into account current pmtu, but never exceeds
+ tp->rx_opt.mss_clamp.
+
+ NOTE1. rfc1122 clearly states that advertised MSS
+ DOES NOT include either tcp or ip options.
+
+ NOTE2. inet_csk(sk)->icsk_pmtu_cookie and tp->mss_cache
+ are READ ONLY outside this function. --ANK (980731)
+ */
+unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ int mss_now;
+
+ if (icsk->icsk_mtup.search_high > pmtu)
+ icsk->icsk_mtup.search_high = pmtu;
+
+ mss_now = tcp_mtu_to_mss(sk, pmtu);
+ mss_now = tcp_bound_to_half_wnd(tp, mss_now);
+
+ /* And store cached results */
+ icsk->icsk_pmtu_cookie = pmtu;
+ if (icsk->icsk_mtup.enabled)
+ mss_now = min(mss_now, tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low));
+ tp->mss_cache = mss_now;
+
+ return mss_now;
+}
+EXPORT_SYMBOL(tcp_sync_mss);
+
+/* Compute the current effective MSS, taking SACKs and IP options,
+ * and even PMTU discovery events into account.
+ */
+unsigned int tcp_current_mss(struct sock *sk)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ const struct dst_entry *dst = __sk_dst_get(sk);
+ u32 mss_now;
+ unsigned int header_len;
+ struct tcp_out_options opts;
+ struct tcp_md5sig_key *md5;
+
+ mss_now = tp->mss_cache;
+
+ if (dst) {
+ u32 mtu = dst_mtu(dst);
+ if (mtu != inet_csk(sk)->icsk_pmtu_cookie)
+ mss_now = tcp_sync_mss(sk, mtu);
+ }
+
+ header_len = tcp_established_options(sk, NULL, &opts, &md5) +
+ sizeof(struct tcphdr);
+ /* The mss_cache is sized based on tp->tcp_header_len, which assumes
+ * some common options. If this is an odd packet (because we have SACK
+ * blocks etc) then our calculated header_len will be different, and
+ * we have to adjust mss_now correspondingly */
+ if (header_len != tp->tcp_header_len) {
+ int delta = (int) header_len - tp->tcp_header_len;
+ mss_now -= delta;
+ }
+
+ return mss_now;
+}
+
+/* RFC2861, slow part. Adjust cwnd, after it was not full during one rto.
+ * As additional protections, we do not touch cwnd in retransmission phases,
+ * and if application hit its sndbuf limit recently.
+ */
+static void tcp_cwnd_application_limited(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (inet_csk(sk)->icsk_ca_state == TCP_CA_Open &&
+ sk->sk_socket && !test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
+ /* Limited by application or receiver window. */
+ u32 init_win = tcp_init_cwnd(tp, __sk_dst_get(sk));
+ u32 win_used = max(tp->snd_cwnd_used, init_win);
+ if (win_used < tp->snd_cwnd) {
+ tp->snd_ssthresh = tcp_current_ssthresh(sk);
+ tp->snd_cwnd = (tp->snd_cwnd + win_used) >> 1;
+ }
+ tp->snd_cwnd_used = 0;
+ }
+ tp->snd_cwnd_stamp = tcp_time_stamp;
+}
+
+static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ /* Track the maximum number of outstanding packets in each
+ * window, and remember whether we were cwnd-limited then.
+ */
+ if (!before(tp->snd_una, tp->max_packets_seq) ||
+ tp->packets_out > tp->max_packets_out) {
+ tp->max_packets_out = tp->packets_out;
+ tp->max_packets_seq = tp->snd_nxt;
+ tp->is_cwnd_limited = is_cwnd_limited;
+ }
+
+ if (tcp_is_cwnd_limited(sk)) {
+ /* Network is feed fully. */
+ tp->snd_cwnd_used = 0;
+ tp->snd_cwnd_stamp = tcp_time_stamp;
+ } else {
+ /* Network starves. */
+ if (tp->packets_out > tp->snd_cwnd_used)
+ tp->snd_cwnd_used = tp->packets_out;
+
+ if (sysctl_tcp_slow_start_after_idle &&
+ (s32)(tcp_time_stamp - tp->snd_cwnd_stamp) >= inet_csk(sk)->icsk_rto)
+ tcp_cwnd_application_limited(sk);
+ }
+}
+
+/* Minshall's variant of the Nagle send check. */
+static bool tcp_minshall_check(const struct tcp_sock *tp)
+{
+ return after(tp->snd_sml, tp->snd_una) &&
+ !after(tp->snd_sml, tp->snd_nxt);
+}
+
+/* Update snd_sml if this skb is under mss
+ * Note that a TSO packet might end with a sub-mss segment
+ * The test is really :
+ * if ((skb->len % mss) != 0)
+ * tp->snd_sml = TCP_SKB_CB(skb)->end_seq;
+ * But we can avoid doing the divide again given we already have
+ * skb_pcount = skb->len / mss_now
+ */
+static void tcp_minshall_update(struct tcp_sock *tp, unsigned int mss_now,
+ const struct sk_buff *skb)
+{
+ if (skb->len < tcp_skb_pcount(skb) * mss_now)
+ tp->snd_sml = TCP_SKB_CB(skb)->end_seq;
+}
+
+/* Return false, if packet can be sent now without violation Nagle's rules:
+ * 1. It is full sized. (provided by caller in %partial bool)
+ * 2. Or it contains FIN. (already checked by caller)
+ * 3. Or TCP_CORK is not set, and TCP_NODELAY is set.
+ * 4. Or TCP_CORK is not set, and all sent packets are ACKed.
+ * With Minshall's modification: all sent small packets are ACKed.
+ */
+static bool tcp_nagle_check(bool partial, const struct tcp_sock *tp,
+ int nonagle)
+{
+ return partial &&
+ ((nonagle & TCP_NAGLE_CORK) ||
+ (!nonagle && tp->packets_out && tcp_minshall_check(tp)));
+}
+
+/* Return how many segs we'd like on a TSO packet,
+ * to send one TSO packet per ms
+ */
+static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now)
+{
+ u32 bytes, segs;
+
+ bytes = min(sk->sk_pacing_rate >> 10,
+ sk->sk_gso_max_size - 1 - MAX_TCP_HEADER);
+
+ /* Goal is to send at least one packet per ms,
+ * not one big TSO packet every 100 ms.
+ * This preserves ACK clocking and is consistent
+ * with tcp_tso_should_defer() heuristic.
+ */
+ segs = max_t(u32, bytes / mss_now, sysctl_tcp_min_tso_segs);
+
+ return min_t(u32, segs, sk->sk_gso_max_segs);
+}
+
+/* Returns the portion of skb which can be sent right away */
+static unsigned int tcp_mss_split_point(const struct sock *sk,
+ const struct sk_buff *skb,
+ unsigned int mss_now,
+ unsigned int max_segs,
+ int nonagle)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ u32 partial, needed, window, max_len;
+
+ window = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
+ max_len = mss_now * max_segs;
+
+ if (likely(max_len <= window && skb != tcp_write_queue_tail(sk)))
+ return max_len;
+
+ needed = min(skb->len, window);
+
+ if (max_len <= needed)
+ return max_len;
+
+ partial = needed % mss_now;
+ /* If last segment is not a full MSS, check if Nagle rules allow us
+ * to include this last segment in this skb.
+ * Otherwise, we'll split the skb at last MSS boundary
+ */
+ if (tcp_nagle_check(partial != 0, tp, nonagle))
+ return needed - partial;
+
+ return needed;
+}
+
+/* Can at least one segment of SKB be sent right now, according to the
+ * congestion window rules? If so, return how many segments are allowed.
+ */
+static inline unsigned int tcp_cwnd_test(const struct tcp_sock *tp,
+ const struct sk_buff *skb)
+{
+ u32 in_flight, cwnd, halfcwnd;
+
+ /* Don't be strict about the congestion window for the final FIN. */
+ if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) &&
+ tcp_skb_pcount(skb) == 1)
+ return 1;
+
+ in_flight = tcp_packets_in_flight(tp);
+ cwnd = tp->snd_cwnd;
+ if (in_flight >= cwnd)
+ return 0;
+
+ /* For better scheduling, ensure we have at least
+ * 2 GSO packets in flight.
+ */
+ halfcwnd = max(cwnd >> 1, 1U);
+ return min(halfcwnd, cwnd - in_flight);
+}
+
+/* Initialize TSO state of a skb.
+ * This must be invoked the first time we consider transmitting
+ * SKB onto the wire.
+ */
+static int tcp_init_tso_segs(const struct sock *sk, struct sk_buff *skb,
+ unsigned int mss_now)
+{
+ int tso_segs = tcp_skb_pcount(skb);
+
+ if (!tso_segs || (tso_segs > 1 && tcp_skb_mss(skb) != mss_now)) {
+ tcp_set_skb_tso_segs(sk, skb, mss_now);
+ tso_segs = tcp_skb_pcount(skb);
+ }
+ return tso_segs;
+}
+
+
+/* Return true if the Nagle test allows this packet to be
+ * sent now.
+ */
+static inline bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buff *skb,
+ unsigned int cur_mss, int nonagle)
+{
+ /* Nagle rule does not apply to frames, which sit in the middle of the
+ * write_queue (they have no chances to get new data).
+ *
+ * This is implemented in the callers, where they modify the 'nonagle'
+ * argument based upon the location of SKB in the send queue.
+ */
+ if (nonagle & TCP_NAGLE_PUSH)
+ return true;
+
+ /* Don't use the nagle rule for urgent data (or for the final FIN). */
+ if (tcp_urg_mode(tp) || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN))
+ return true;
+
+ if (!tcp_nagle_check(skb->len < cur_mss, tp, nonagle))
+ return true;
+
+ return false;
+}
+
+/* Does at least the first segment of SKB fit into the send window? */
+static bool tcp_snd_wnd_test(const struct tcp_sock *tp,
+ const struct sk_buff *skb,
+ unsigned int cur_mss)
+{
+ u32 end_seq = TCP_SKB_CB(skb)->end_seq;
+
+ if (skb->len > cur_mss)
+ end_seq = TCP_SKB_CB(skb)->seq + cur_mss;
+
+ return !after(end_seq, tcp_wnd_end(tp));
+}
+
+/* This checks if the data bearing packet SKB (usually tcp_send_head(sk))
+ * should be put on the wire right now. If so, it returns the number of
+ * packets allowed by the congestion window.
+ */
+static unsigned int tcp_snd_test(const struct sock *sk, struct sk_buff *skb,
+ unsigned int cur_mss, int nonagle)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ unsigned int cwnd_quota;
+
+ tcp_init_tso_segs(sk, skb, cur_mss);
+
+ if (!tcp_nagle_test(tp, skb, cur_mss, nonagle))
+ return 0;
+
+ cwnd_quota = tcp_cwnd_test(tp, skb);
+ if (cwnd_quota && !tcp_snd_wnd_test(tp, skb, cur_mss))
+ cwnd_quota = 0;
+
+ return cwnd_quota;
+}
+
+/* Test if sending is allowed right now. */
+bool tcp_may_send_now(struct sock *sk)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *skb = tcp_send_head(sk);
+
+ return skb &&
+ tcp_snd_test(sk, skb, tcp_current_mss(sk),
+ (tcp_skb_is_last(sk, skb) ?
+ tp->nonagle : TCP_NAGLE_PUSH));
+}
+
+/* Trim TSO SKB to LEN bytes, put the remaining data into a new packet
+ * which is put after SKB on the list. It is very much like
+ * tcp_fragment() except that it may make several kinds of assumptions
+ * in order to speed up the splitting operation. In particular, we
+ * know that all the data is in scatter-gather pages, and that the
+ * packet has never been sent out before (and thus is not cloned).
+ */
+static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
+ unsigned int mss_now, gfp_t gfp)
+{
+ struct sk_buff *buff;
+ int nlen = skb->len - len;
+ u8 flags;
+
+ /* All of a TSO frame must be composed of paged data. */
+ if (skb->len != skb->data_len)
+ return tcp_fragment(sk, skb, len, mss_now, gfp);
+
+ buff = sk_stream_alloc_skb(sk, 0, gfp);
+ if (unlikely(!buff))
+ return -ENOMEM;
+
+ sk->sk_wmem_queued += buff->truesize;
+ sk_mem_charge(sk, buff->truesize);
+ buff->truesize += nlen;
+ skb->truesize -= nlen;
+
+ /* Correct the sequence numbers. */
+ TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
+ TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
+ TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
+
+ /* PSH and FIN should only be set in the second packet. */
+ flags = TCP_SKB_CB(skb)->tcp_flags;
+ TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH);
+ TCP_SKB_CB(buff)->tcp_flags = flags;
+
+ /* This packet was never sent out yet, so no SACK bits. */
+ TCP_SKB_CB(buff)->sacked = 0;
+
+ buff->ip_summed = skb->ip_summed = CHECKSUM_PARTIAL;
+ skb_split(skb, buff, len);
+ tcp_fragment_tstamp(skb, buff);
+
+ /* Fix up tso_factor for both original and new SKB. */
+ tcp_set_skb_tso_segs(sk, skb, mss_now);
+ tcp_set_skb_tso_segs(sk, buff, mss_now);
+
+ /* Link BUFF into the send queue. */
+ __skb_header_release(buff);
+ tcp_insert_write_queue_after(skb, buff, sk);
+
+ return 0;
+}
+
+/* Try to defer sending, if possible, in order to minimize the amount
+ * of TSO splitting we do. View it as a kind of TSO Nagle test.
+ *
+ * This algorithm is from John Heffner.
+ */
+static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
+ bool *is_cwnd_limited, u32 max_segs)
+{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+ u32 age, send_win, cong_win, limit, in_flight;
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct skb_mstamp now;
+ struct sk_buff *head;
+ int win_divisor;
+
+ if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
+ goto send_now;
+
+ if (!((1 << icsk->icsk_ca_state) & (TCPF_CA_Open | TCPF_CA_CWR)))
+ goto send_now;
+
+ /* Avoid bursty behavior by allowing defer
+ * only if the last write was recent.
+ */
+ if ((s32)(tcp_time_stamp - tp->lsndtime) > 0)
+ goto send_now;
+
+ in_flight = tcp_packets_in_flight(tp);
+
+ BUG_ON(tcp_skb_pcount(skb) <= 1 || (tp->snd_cwnd <= in_flight));
+
+ send_win = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
+
+ /* From in_flight test above, we know that cwnd > in_flight. */
+ cong_win = (tp->snd_cwnd - in_flight) * tp->mss_cache;
+
+ limit = min(send_win, cong_win);
+
+ /* If a full-sized TSO skb can be sent, do it. */
+ if (limit >= max_segs * tp->mss_cache)
+ goto send_now;
+
+ /* Middle in queue won't get any more data, full sendable already? */
+ if ((skb != tcp_write_queue_tail(sk)) && (limit >= skb->len))
+ goto send_now;
+
+ win_divisor = ACCESS_ONCE(sysctl_tcp_tso_win_divisor);
+ if (win_divisor) {
+ u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache);
+
+ /* If at least some fraction of a window is available,
+ * just use it.
+ */
+ chunk /= win_divisor;
+ if (limit >= chunk)
+ goto send_now;
+ } else {
+ /* Different approach, try not to defer past a single
+ * ACK. Receiver should ACK every other full sized
+ * frame, so if we have space for more than 3 frames
+ * then send now.
+ */
+ if (limit > tcp_max_tso_deferred_mss(tp) * tp->mss_cache)
+ goto send_now;
+ }
+
+ head = tcp_write_queue_head(sk);
+ skb_mstamp_get(&now);
+ age = skb_mstamp_us_delta(&now, &head->skb_mstamp);
+ /* If next ACK is likely to come too late (half srtt), do not defer */
+ if (age < (tp->srtt_us >> 4))
+ goto send_now;
+
+ /* Ok, it looks like it is advisable to defer. */
+
+ if (cong_win < send_win && cong_win < skb->len)
+ *is_cwnd_limited = true;
+
+ return true;
+
+send_now:
+ return false;
+}
+
+static inline void tcp_mtu_check_reprobe(struct sock *sk)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct net *net = sock_net(sk);
+ u32 interval;
+ s32 delta;
+
+ interval = net->ipv4.sysctl_tcp_probe_interval;
+ delta = tcp_time_stamp - icsk->icsk_mtup.probe_timestamp;
+ if (unlikely(delta >= interval * HZ)) {
+ int mss = tcp_current_mss(sk);
+
+ /* Update current search range */
+ icsk->icsk_mtup.probe_size = 0;
+ icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp +
+ sizeof(struct tcphdr) +
+ icsk->icsk_af_ops->net_header_len;
+ icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss);
+
+ /* Update probe time stamp */
+ icsk->icsk_mtup.probe_timestamp = tcp_time_stamp;
+ }
+}
+
+/* Create a new MTU probe if we are ready.
+ * MTU probe is regularly attempting to increase the path MTU by
+ * deliberately sending larger packets. This discovers routing
+ * changes resulting in larger path MTUs.
+ *
+ * Returns 0 if we should wait to probe (no cwnd available),
+ * 1 if a probe was sent,
+ * -1 otherwise
+ */
+static int tcp_mtu_probe(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct sk_buff *skb, *nskb, *next;
+ struct net *net = sock_net(sk);
+ int len;
+ int probe_size;
+ int size_needed;
+ int copy;
+ int mss_now;
+ int interval;
+
+ /* Not currently probing/verifying,
+ * not in recovery,
+ * have enough cwnd, and
+ * not SACKing (the variable headers throw things off) */
+ if (!icsk->icsk_mtup.enabled ||
+ icsk->icsk_mtup.probe_size ||
+ inet_csk(sk)->icsk_ca_state != TCP_CA_Open ||
+ tp->snd_cwnd < 11 ||
+ tp->rx_opt.num_sacks || tp->rx_opt.dsack)
+ return -1;
+
+ /* Use binary search for probe_size between tcp_mss_base,
+ * and current mss_clamp. if (search_high - search_low)
+ * smaller than a threshold, backoff from probing.
+ */
+ mss_now = tcp_current_mss(sk);
+ probe_size = tcp_mtu_to_mss(sk, (icsk->icsk_mtup.search_high +
+ icsk->icsk_mtup.search_low) >> 1);
+ size_needed = probe_size + (tp->reordering + 1) * tp->mss_cache;
+ interval = icsk->icsk_mtup.search_high - icsk->icsk_mtup.search_low;
+ /* When misfortune happens, we are reprobing actively,
+ * and then reprobe timer has expired. We stick with current
+ * probing process by not resetting search range to its orignal.
+ */
+ if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high) ||
+ interval < net->ipv4.sysctl_tcp_probe_threshold) {
+ /* Check whether enough time has elaplased for
+ * another round of probing.
+ */
+ tcp_mtu_check_reprobe(sk);
+ return -1;
+ }
+
+ /* Have enough data in the send queue to probe? */
+ if (tp->write_seq - tp->snd_nxt < size_needed)
+ return -1;
+
+ if (tp->snd_wnd < size_needed)
+ return -1;
+ if (after(tp->snd_nxt + size_needed, tcp_wnd_end(tp)))
+ return 0;
+
+ /* Do we need to wait to drain cwnd? With none in flight, don't stall */
+ if (tcp_packets_in_flight(tp) + 2 > tp->snd_cwnd) {
+ if (!tcp_packets_in_flight(tp))
+ return -1;
+ else
+ return 0;
+ }
+
+ /* We're allowed to probe. Build it now. */
+ nskb = sk_stream_alloc_skb(sk, probe_size, GFP_ATOMIC);
+ if (!nskb)
+ return -1;
+ sk->sk_wmem_queued += nskb->truesize;
+ sk_mem_charge(sk, nskb->truesize);
+
+ skb = tcp_send_head(sk);
+
+ TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(skb)->seq;
+ TCP_SKB_CB(nskb)->end_seq = TCP_SKB_CB(skb)->seq + probe_size;
+ TCP_SKB_CB(nskb)->tcp_flags = TCPHDR_ACK;
+ TCP_SKB_CB(nskb)->sacked = 0;
+ nskb->csum = 0;
+ nskb->ip_summed = skb->ip_summed;
+
+ tcp_insert_write_queue_before(nskb, skb, sk);
+
+ len = 0;
+ tcp_for_write_queue_from_safe(skb, next, sk) {
+ copy = min_t(int, skb->len, probe_size - len);
+ if (nskb->ip_summed)
+ skb_copy_bits(skb, 0, skb_put(nskb, copy), copy);
+ else
+ nskb->csum = skb_copy_and_csum_bits(skb, 0,
+ skb_put(nskb, copy),
+ copy, nskb->csum);
+
+ if (skb->len <= copy) {
+ /* We've eaten all the data from this skb.
+ * Throw it away. */
+ TCP_SKB_CB(nskb)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
+ tcp_unlink_write_queue(skb, sk);
+ sk_wmem_free_skb(sk, skb);
+ } else {
+ TCP_SKB_CB(nskb)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags &
+ ~(TCPHDR_FIN|TCPHDR_PSH);
+ if (!skb_shinfo(skb)->nr_frags) {
+ skb_pull(skb, copy);
+ if (skb->ip_summed != CHECKSUM_PARTIAL)
+ skb->csum = csum_partial(skb->data,
+ skb->len, 0);
+ } else {
+ __pskb_trim_head(skb, copy);
+ tcp_set_skb_tso_segs(sk, skb, mss_now);
+ }
+ TCP_SKB_CB(skb)->seq += copy;
+ }
+
+ len += copy;
+
+ if (len >= probe_size)
+ break;
+ }
+ tcp_init_tso_segs(sk, nskb, nskb->len);
+
+ /* We're ready to send. If this fails, the probe will
+ * be resegmented into mss-sized pieces by tcp_write_xmit().
+ */
+ if (!tcp_transmit_skb(sk, nskb, 1, GFP_ATOMIC)) {
+ /* Decrement cwnd here because we are sending
+ * effectively two packets. */
+ tp->snd_cwnd--;
+ tcp_event_new_data_sent(sk, nskb);
+
+ icsk->icsk_mtup.probe_size = tcp_mss_to_mtu(sk, nskb->len);
+ tp->mtu_probe.probe_seq_start = TCP_SKB_CB(nskb)->seq;
+ tp->mtu_probe.probe_seq_end = TCP_SKB_CB(nskb)->end_seq;
+
+ return 1;
+ }
+
+ return -1;
+}
+
+/* This routine writes packets to the network. It advances the
+ * send_head. This happens as incoming acks open up the remote
+ * window for us.
+ *
+ * LARGESEND note: !tcp_urg_mode is overkill, only frames between
+ * snd_up-64k-mss .. snd_up cannot be large. However, taking into
+ * account rare use of URG, this is not a big flaw.
+ *
+ * Send at most one packet when push_one > 0. Temporarily ignore
+ * cwnd limit to force at most one packet out when push_one == 2.
+
+ * Returns true, if no segments are in flight and we have queued segments,
+ * but cannot send anything now because of SWS or another problem.
+ */
+static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
+ int push_one, gfp_t gfp)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *skb;
+ unsigned int tso_segs, sent_pkts;
+ int cwnd_quota;
+ int result;
+ bool is_cwnd_limited = false;
+ u32 max_segs;
+
+ sent_pkts = 0;
+
+ if (!push_one) {
+ /* Do MTU probing. */
+ result = tcp_mtu_probe(sk);
+ if (!result) {
+ return false;
+ } else if (result > 0) {
+ sent_pkts = 1;
+ }
+ }
+
+ max_segs = tcp_tso_autosize(sk, mss_now);
+ while ((skb = tcp_send_head(sk))) {
+ unsigned int limit;
+
+ tso_segs = tcp_init_tso_segs(sk, skb, mss_now);
+ BUG_ON(!tso_segs);
+
+ if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) {
+ /* "skb_mstamp" is used as a start point for the retransmit timer */
+ skb_mstamp_get(&skb->skb_mstamp);
+ goto repair; /* Skip network transmission */
+ }
+
+ cwnd_quota = tcp_cwnd_test(tp, skb);
+ if (!cwnd_quota) {
+ is_cwnd_limited = true;
+ if (push_one == 2)
+ /* Force out a loss probe pkt. */
+ cwnd_quota = 1;
+ else
+ break;
+ }
+
+ if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now)))
+ break;
+
+ if (tso_segs == 1 || !max_segs) {
+ if (unlikely(!tcp_nagle_test(tp, skb, mss_now,
+ (tcp_skb_is_last(sk, skb) ?
+ nonagle : TCP_NAGLE_PUSH))))
+ break;
+ } else {
+ if (!push_one &&
+ tcp_tso_should_defer(sk, skb, &is_cwnd_limited,
+ max_segs))
+ break;
+ }
+
+ limit = mss_now;
+ if (tso_segs > 1 && max_segs && !tcp_urg_mode(tp))
+ limit = tcp_mss_split_point(sk, skb, mss_now,
+ min_t(unsigned int,
+ cwnd_quota,
+ max_segs),
+ nonagle);
+
+ if (skb->len > limit &&
+ unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
+ break;
+
+ /* TCP Small Queues :
+ * Control number of packets in qdisc/devices to two packets / or ~1 ms.
+ * This allows for :
+ * - better RTT estimation and ACK scheduling
+ * - faster recovery
+ * - high rates
+ * Alas, some drivers / subsystems require a fair amount
+ * of queued bytes to ensure line rate.
+ * One example is wifi aggregation (802.11 AMPDU)
+ */
+ limit = max(2 * skb->truesize, sk->sk_pacing_rate >> 10);
+ limit = min_t(u32, limit, sysctl_tcp_limit_output_bytes);
+
+ if (atomic_read(&sk->sk_wmem_alloc) > limit) {
+ set_bit(TSQ_THROTTLED, &tp->tsq_flags);
+ /* It is possible TX completion already happened
+ * before we set TSQ_THROTTLED, so we must
+ * test again the condition.
+ */
+ smp_mb__after_atomic();
+ if (atomic_read(&sk->sk_wmem_alloc) > limit)
+ break;
+ }
+
+ if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp)))
+ break;
+
+repair:
+ /* Advance the send_head. This one is sent out.
+ * This call will increment packets_out.
+ */
+ tcp_event_new_data_sent(sk, skb);
+
+ tcp_minshall_update(tp, mss_now, skb);
+ sent_pkts += tcp_skb_pcount(skb);
+
+ if (push_one)
+ break;
+ }
+
+ if (likely(sent_pkts)) {
+ if (tcp_in_cwnd_reduction(sk))
+ tp->prr_out += sent_pkts;
+
+ /* Send one loss probe per tail loss episode. */
+ if (push_one != 2)
+ tcp_schedule_loss_probe(sk);
+ tcp_cwnd_validate(sk, is_cwnd_limited);
+ return false;
+ }
+ return (push_one == 2) || (!tp->packets_out && tcp_send_head(sk));
+}
+
+bool tcp_schedule_loss_probe(struct sock *sk)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+ u32 timeout, tlp_time_stamp, rto_time_stamp;
+ u32 rtt = usecs_to_jiffies(tp->srtt_us >> 3);
+
+ if (WARN_ON(icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS))
+ return false;
+ /* No consecutive loss probes. */
+ if (WARN_ON(icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)) {
+ tcp_rearm_rto(sk);
+ return false;
+ }
+ /* Don't do any loss probe on a Fast Open connection before 3WHS
+ * finishes.
+ */
+ if (sk->sk_state == TCP_SYN_RECV)
+ return false;
+
+ /* TLP is only scheduled when next timer event is RTO. */
+ if (icsk->icsk_pending != ICSK_TIME_RETRANS)
+ return false;
+
+ /* Schedule a loss probe in 2*RTT for SACK capable connections
+ * in Open state, that are either limited by cwnd or application.
+ */
+ if (sysctl_tcp_early_retrans < 3 || !tp->srtt_us || !tp->packets_out ||
+ !tcp_is_sack(tp) || inet_csk(sk)->icsk_ca_state != TCP_CA_Open)
+ return false;
+
+ if ((tp->snd_cwnd > tcp_packets_in_flight(tp)) &&
+ tcp_send_head(sk))
+ return false;
+
+ /* Probe timeout is at least 1.5*rtt + TCP_DELACK_MAX to account
+ * for delayed ack when there's one outstanding packet.
+ */
+ timeout = rtt << 1;
+ if (tp->packets_out == 1)
+ timeout = max_t(u32, timeout,
+ (rtt + (rtt >> 1) + TCP_DELACK_MAX));
+ timeout = max_t(u32, timeout, msecs_to_jiffies(10));
+
+ /* If RTO is shorter, just schedule TLP in its place. */
+ tlp_time_stamp = tcp_time_stamp + timeout;
+ rto_time_stamp = (u32)inet_csk(sk)->icsk_timeout;
+ if ((s32)(tlp_time_stamp - rto_time_stamp) > 0) {
+ s32 delta = rto_time_stamp - tcp_time_stamp;
+ if (delta > 0)
+ timeout = delta;
+ }
+
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_LOSS_PROBE, timeout,
+ TCP_RTO_MAX);
+ return true;
+}
+
+/* Thanks to skb fast clones, we can detect if a prior transmit of
+ * a packet is still in a qdisc or driver queue.
+ * In this case, there is very little point doing a retransmit !
+ * Note: This is called from BH context only.
+ */
+static bool skb_still_in_host_queue(const struct sock *sk,
+ const struct sk_buff *skb)
+{
+ if (unlikely(skb_fclone_busy(sk, skb))) {
+ NET_INC_STATS_BH(sock_net(sk),
+ LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES);
+ return true;
+ }
+ return false;
+}
+
+/* When probe timeout (PTO) fires, send a new segment if one exists, else
+ * retransmit the last segment.
+ */
+void tcp_send_loss_probe(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *skb;
+ int pcount;
+ int mss = tcp_current_mss(sk);
+ int err = -1;
+
+ if (tcp_send_head(sk)) {
+ err = tcp_write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC);
+ goto rearm_timer;
+ }
+
+ /* At most one outstanding TLP retransmission. */
+ if (tp->tlp_high_seq)
+ goto rearm_timer;
+
+ /* Retransmit last segment. */
+ skb = tcp_write_queue_tail(sk);
+ if (WARN_ON(!skb))
+ goto rearm_timer;
+
+ if (skb_still_in_host_queue(sk, skb))
+ goto rearm_timer;
+
+ pcount = tcp_skb_pcount(skb);
+ if (WARN_ON(!pcount))
+ goto rearm_timer;
+
+ if ((pcount > 1) && (skb->len > (pcount - 1) * mss)) {
+ if (unlikely(tcp_fragment(sk, skb, (pcount - 1) * mss, mss,
+ GFP_ATOMIC)))
+ goto rearm_timer;
+ skb = tcp_write_queue_tail(sk);
+ }
+
+ if (WARN_ON(!skb || !tcp_skb_pcount(skb)))
+ goto rearm_timer;
+
+ err = __tcp_retransmit_skb(sk, skb);
+
+ /* Record snd_nxt for loss detection. */
+ if (likely(!err))
+ tp->tlp_high_seq = tp->snd_nxt;
+
+rearm_timer:
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+ inet_csk(sk)->icsk_rto,
+ TCP_RTO_MAX);
+
+ if (likely(!err))
+ NET_INC_STATS_BH(sock_net(sk),
+ LINUX_MIB_TCPLOSSPROBES);
+}
+
+/* Push out any pending frames which were held back due to
+ * TCP_CORK or attempt at coalescing tiny packets.
+ * The socket must be locked by the caller.
+ */
+void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss,
+ int nonagle)
+{
+ /* If we are closed, the bytes will have to remain here.
+ * In time closedown will finish, we empty the write queue and
+ * all will be happy.
+ */
+ if (unlikely(sk->sk_state == TCP_CLOSE))
+ return;
+
+ if (tcp_write_xmit(sk, cur_mss, nonagle, 0,
+ sk_gfp_atomic(sk, GFP_ATOMIC)))
+ tcp_check_probe_timer(sk);
+}
+
+/* Send _single_ skb sitting at the send head. This function requires
+ * true push pending frames to setup probe timer etc.
+ */
+void tcp_push_one(struct sock *sk, unsigned int mss_now)
+{
+ struct sk_buff *skb = tcp_send_head(sk);
+
+ BUG_ON(!skb || skb->len < mss_now);
+
+ tcp_write_xmit(sk, mss_now, TCP_NAGLE_PUSH, 1, sk->sk_allocation);
+}
+
+/* This function returns the amount that we can raise the
+ * usable window based on the following constraints
+ *
+ * 1. The window can never be shrunk once it is offered (RFC 793)
+ * 2. We limit memory per socket
+ *
+ * RFC 1122:
+ * "the suggested [SWS] avoidance algorithm for the receiver is to keep
+ * RECV.NEXT + RCV.WIN fixed until:
+ * RCV.BUFF - RCV.USER - RCV.WINDOW >= min(1/2 RCV.BUFF, MSS)"
+ *
+ * i.e. don't raise the right edge of the window until you can raise
+ * it at least MSS bytes.
+ *
+ * Unfortunately, the recommended algorithm breaks header prediction,
+ * since header prediction assumes th->window stays fixed.
+ *
+ * Strictly speaking, keeping th->window fixed violates the receiver
+ * side SWS prevention criteria. The problem is that under this rule
+ * a stream of single byte packets will cause the right side of the
+ * window to always advance by a single byte.
+ *
+ * Of course, if the sender implements sender side SWS prevention
+ * then this will not be a problem.
+ *
+ * BSD seems to make the following compromise:
+ *
+ * If the free space is less than the 1/4 of the maximum
+ * space available and the free space is less than 1/2 mss,
+ * then set the window to 0.
+ * [ Actually, bsd uses MSS and 1/4 of maximal _window_ ]
+ * Otherwise, just prevent the window from shrinking
+ * and from being larger than the largest representable value.
+ *
+ * This prevents incremental opening of the window in the regime
+ * where TCP is limited by the speed of the reader side taking
+ * data out of the TCP receive queue. It does nothing about
+ * those cases where the window is constrained on the sender side
+ * because the pipeline is full.
+ *
+ * BSD also seems to "accidentally" limit itself to windows that are a
+ * multiple of MSS, at least until the free space gets quite small.
+ * This would appear to be a side effect of the mbuf implementation.
+ * Combining these two algorithms results in the observed behavior
+ * of having a fixed window size at almost all times.
+ *
+ * Below we obtain similar behavior by forcing the offered window to
+ * a multiple of the mss when it is feasible to do so.
+ *
+ * Note, we don't "adjust" for TIMESTAMP or SACK option bytes.
+ * Regular options like TIMESTAMP are taken into account.
+ */
+u32 __tcp_select_window(struct sock *sk)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+ /* MSS for the peer's data. Previous versions used mss_clamp
+ * here. I don't know if the value based on our guesses
+ * of peer's MSS is better for the performance. It's more correct
+ * but may be worse for the performance because of rcv_mss
+ * fluctuations. --SAW 1998/11/1
+ */
+ int mss = icsk->icsk_ack.rcv_mss;
+ int free_space = tcp_space(sk);
+ int allowed_space = tcp_full_space(sk);
+ int full_space = min_t(int, tp->window_clamp, allowed_space);
+ int window;
+
+ if (mss > full_space)
+ mss = full_space;
+
+ if (free_space < (full_space >> 1)) {
+ icsk->icsk_ack.quick = 0;
+
+ if (sk_under_memory_pressure(sk))
+ tp->rcv_ssthresh = min(tp->rcv_ssthresh,
+ 4U * tp->advmss);
+
+ /* free_space might become our new window, make sure we don't
+ * increase it due to wscale.
+ */
+ free_space = round_down(free_space, 1 << tp->rx_opt.rcv_wscale);
+
+ /* if free space is less than mss estimate, or is below 1/16th
+ * of the maximum allowed, try to move to zero-window, else
+ * tcp_clamp_window() will grow rcv buf up to tcp_rmem[2], and
+ * new incoming data is dropped due to memory limits.
+ * With large window, mss test triggers way too late in order
+ * to announce zero window in time before rmem limit kicks in.
+ */
+ if (free_space < (allowed_space >> 4) || free_space < mss)
+ return 0;
+ }
+
+ if (free_space > tp->rcv_ssthresh)
+ free_space = tp->rcv_ssthresh;
+
+ /* Don't do rounding if we are using window scaling, since the
+ * scaled window will not line up with the MSS boundary anyway.
+ */
+ window = tp->rcv_wnd;
+ if (tp->rx_opt.rcv_wscale) {
+ window = free_space;
+
+ /* Advertise enough space so that it won't get scaled away.
+ * Import case: prevent zero window announcement if
+ * 1<<rcv_wscale > mss.
+ */
+ if (((window >> tp->rx_opt.rcv_wscale) << tp->rx_opt.rcv_wscale) != window)
+ window = (((window >> tp->rx_opt.rcv_wscale) + 1)
+ << tp->rx_opt.rcv_wscale);
+ } else {
+ /* Get the largest window that is a nice multiple of mss.
+ * Window clamp already applied above.
+ * If our current window offering is within 1 mss of the
+ * free space we just keep it. This prevents the divide
+ * and multiply from happening most of the time.
+ * We also don't do any window rounding when the free space
+ * is too small.
+ */
+ if (window <= free_space - mss || window > free_space)
+ window = (free_space / mss) * mss;
+ else if (mss == full_space &&
+ free_space > window + (full_space >> 1))
+ window = free_space;
+ }
+
+ return window;
+}
+
+/* Collapses two adjacent SKB's during retransmission. */
+static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *next_skb = tcp_write_queue_next(sk, skb);
+ int skb_size, next_skb_size;
+
+ skb_size = skb->len;
+ next_skb_size = next_skb->len;
+
+ BUG_ON(tcp_skb_pcount(skb) != 1 || tcp_skb_pcount(next_skb) != 1);
+
+ tcp_highest_sack_combine(sk, next_skb, skb);
+
+ tcp_unlink_write_queue(next_skb, sk);
+
+ skb_copy_from_linear_data(next_skb, skb_put(skb, next_skb_size),
+ next_skb_size);
+
+ if (next_skb->ip_summed == CHECKSUM_PARTIAL)
+ skb->ip_summed = CHECKSUM_PARTIAL;
+
+ if (skb->ip_summed != CHECKSUM_PARTIAL)
+ skb->csum = csum_block_add(skb->csum, next_skb->csum, skb_size);
+
+ /* Update sequence range on original skb. */
+ TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq;
+
+ /* Merge over control information. This moves PSH/FIN etc. over */
+ TCP_SKB_CB(skb)->tcp_flags |= TCP_SKB_CB(next_skb)->tcp_flags;
+
+ /* All done, get rid of second SKB and account for it so
+ * packet counting does not break.
+ */
+ TCP_SKB_CB(skb)->sacked |= TCP_SKB_CB(next_skb)->sacked & TCPCB_EVER_RETRANS;
+
+ /* changed transmit queue under us so clear hints */
+ tcp_clear_retrans_hints_partial(tp);
+ if (next_skb == tp->retransmit_skb_hint)
+ tp->retransmit_skb_hint = skb;
+
+ tcp_adjust_pcount(sk, next_skb, tcp_skb_pcount(next_skb));
+
+ sk_wmem_free_skb(sk, next_skb);
+}
+
+/* Check if coalescing SKBs is legal. */
+static bool tcp_can_collapse(const struct sock *sk, const struct sk_buff *skb)
+{
+ if (tcp_skb_pcount(skb) > 1)
+ return false;
+ /* TODO: SACK collapsing could be used to remove this condition */
+ if (skb_shinfo(skb)->nr_frags != 0)
+ return false;
+ if (skb_cloned(skb))
+ return false;
+ if (skb == tcp_send_head(sk))
+ return false;
+ /* Some heurestics for collapsing over SACK'd could be invented */
+ if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
+ return false;
+
+ return true;
+}
+
+/* Collapse packets in the retransmit queue to make to create
+ * less packets on the wire. This is only done on retransmission.
+ */
+static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,
+ int space)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *skb = to, *tmp;
+ bool first = true;
+
+ if (!sysctl_tcp_retrans_collapse)
+ return;
+ if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
+ return;
+
+ tcp_for_write_queue_from_safe(skb, tmp, sk) {
+ if (!tcp_can_collapse(sk, skb))
+ break;
+
+ space -= skb->len;
+
+ if (first) {
+ first = false;
+ continue;
+ }
+
+ if (space < 0)
+ break;
+ /* Punt if not enough space exists in the first SKB for
+ * the data in the second
+ */
+ if (skb->len > skb_availroom(to))
+ break;
+
+ if (after(TCP_SKB_CB(skb)->end_seq, tcp_wnd_end(tp)))
+ break;
+
+ tcp_collapse_retrans(sk, to);
+ }
+}
+
+/* This retransmits one SKB. Policy decisions and retransmit queue
+ * state updates are done by the caller. Returns non-zero if an
+ * error occurred which prevented the send.
+ */
+int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ unsigned int cur_mss;
+ int err;
+
+ /* Inconslusive MTU probe */
+ if (icsk->icsk_mtup.probe_size) {
+ icsk->icsk_mtup.probe_size = 0;
+ }
+
+ /* Do not sent more than we queued. 1/4 is reserved for possible
+ * copying overhead: fragmentation, tunneling, mangling etc.
+ */
+ if (atomic_read(&sk->sk_wmem_alloc) >
+ min(sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2), sk->sk_sndbuf))
+ return -EAGAIN;
+
+ if (skb_still_in_host_queue(sk, skb))
+ return -EBUSY;
+
+ if (before(TCP_SKB_CB(skb)->seq, tp->snd_una)) {
+ if (before(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
+ BUG();
+ if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq))
+ return -ENOMEM;
+ }
+
+ if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk))
+ return -EHOSTUNREACH; /* Routing failure or similar. */
+
+ cur_mss = tcp_current_mss(sk);
+
+ /* If receiver has shrunk his window, and skb is out of
+ * new window, do not retransmit it. The exception is the
+ * case, when window is shrunk to zero. In this case
+ * our retransmit serves as a zero window probe.
+ */
+ if (!before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp)) &&
+ TCP_SKB_CB(skb)->seq != tp->snd_una)
+ return -EAGAIN;
+
+ if (skb->len > cur_mss) {
+ if (tcp_fragment(sk, skb, cur_mss, cur_mss, GFP_ATOMIC))
+ return -ENOMEM; /* We'll try again later. */
+ } else {
+ int oldpcount = tcp_skb_pcount(skb);
+
+ if (unlikely(oldpcount > 1)) {
+ if (skb_unclone(skb, GFP_ATOMIC))
+ return -ENOMEM;
+ tcp_init_tso_segs(sk, skb, cur_mss);
+ tcp_adjust_pcount(sk, skb, oldpcount - tcp_skb_pcount(skb));
+ }
+ }
+
+ tcp_retrans_try_collapse(sk, skb, cur_mss);
+
+ /* Make a copy, if the first transmission SKB clone we made
+ * is still in somebody's hands, else make a clone.
+ */
+
+ /* make sure skb->data is aligned on arches that require it
+ * and check if ack-trimming & collapsing extended the headroom
+ * beyond what csum_start can cover.
+ */
+ if (unlikely((NET_IP_ALIGN && ((unsigned long)skb->data & 3)) ||
+ skb_headroom(skb) >= 0xFFFF)) {
+ struct sk_buff *nskb = __pskb_copy(skb, MAX_TCP_HEADER,
+ GFP_ATOMIC);
+ err = nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) :
+ -ENOBUFS;
+ } else {
+ err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
+ }
+
+ if (likely(!err)) {
+ TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS;
+ /* Update global TCP statistics. */
+ TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS);
+ if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
+ tp->total_retrans++;
+ }
+ return err;
+}
+
+int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ int err = __tcp_retransmit_skb(sk, skb);
+
+ if (err == 0) {
+#if FASTRETRANS_DEBUG > 0
+ if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
+ net_dbg_ratelimited("retrans_out leaked\n");
+ }
+#endif
+ if (!tp->retrans_out)
+ tp->lost_retrans_low = tp->snd_nxt;
+ TCP_SKB_CB(skb)->sacked |= TCPCB_RETRANS;
+ tp->retrans_out += tcp_skb_pcount(skb);
+
+ /* Save stamp of the first retransmit. */
+ if (!tp->retrans_stamp)
+ tp->retrans_stamp = tcp_skb_timestamp(skb);
+
+ /* snd_nxt is stored to detect loss of retransmitted segment,
+ * see tcp_input.c tcp_sacktag_write_queue().
+ */
+ TCP_SKB_CB(skb)->ack_seq = tp->snd_nxt;
+ } else if (err != -EBUSY) {
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL);
+ }
+
+ if (tp->undo_retrans < 0)
+ tp->undo_retrans = 0;
+ tp->undo_retrans += tcp_skb_pcount(skb);
+ return err;
+}
+
+/* Check if we forward retransmits are possible in the current
+ * window/congestion state.
+ */
+static bool tcp_can_forward_retransmit(struct sock *sk)
+{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+ const struct tcp_sock *tp = tcp_sk(sk);
+
+ /* Forward retransmissions are possible only during Recovery. */
+ if (icsk->icsk_ca_state != TCP_CA_Recovery)
+ return false;
+
+ /* No forward retransmissions in Reno are possible. */
+ if (tcp_is_reno(tp))
+ return false;
+
+ /* Yeah, we have to make difficult choice between forward transmission
+ * and retransmission... Both ways have their merits...
+ *
+ * For now we do not retransmit anything, while we have some new
+ * segments to send. In the other cases, follow rule 3 for
+ * NextSeg() specified in RFC3517.
+ */
+
+ if (tcp_may_send_now(sk))
+ return false;
+
+ return true;
+}
+
+/* This gets called after a retransmit timeout, and the initially
+ * retransmitted data is acknowledged. It tries to continue
+ * resending the rest of the retransmit queue, until either
+ * we've sent it all or the congestion window limit is reached.
+ * If doing SACK, the first ACK which comes back for a timeout
+ * based retransmit packet might feed us FACK information again.
+ * If so, we use it to avoid unnecessarily retransmissions.
+ */
+void tcp_xmit_retransmit_queue(struct sock *sk)
+{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *skb;
+ struct sk_buff *hole = NULL;
+ u32 last_lost;
+ int mib_idx;
+ int fwd_rexmitting = 0;
+
+ if (!tp->packets_out)
+ return;
+
+ if (!tp->lost_out)
+ tp->retransmit_high = tp->snd_una;
+
+ if (tp->retransmit_skb_hint) {
+ skb = tp->retransmit_skb_hint;
+ last_lost = TCP_SKB_CB(skb)->end_seq;
+ if (after(last_lost, tp->retransmit_high))
+ last_lost = tp->retransmit_high;
+ } else {
+ skb = tcp_write_queue_head(sk);
+ last_lost = tp->snd_una;
+ }
+
+ tcp_for_write_queue_from(skb, sk) {
+ __u8 sacked = TCP_SKB_CB(skb)->sacked;
+
+ if (skb == tcp_send_head(sk))
+ break;
+ /* we could do better than to assign each time */
+ if (!hole)
+ tp->retransmit_skb_hint = skb;
+
+ /* Assume this retransmit will generate
+ * only one packet for congestion window
+ * calculation purposes. This works because
+ * tcp_retransmit_skb() will chop up the
+ * packet to be MSS sized and all the
+ * packet counting works out.
+ */
+ if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
+ return;
+
+ if (fwd_rexmitting) {
+begin_fwd:
+ if (!before(TCP_SKB_CB(skb)->seq, tcp_highest_sack_seq(tp)))
+ break;
+ mib_idx = LINUX_MIB_TCPFORWARDRETRANS;
+
+ } else if (!before(TCP_SKB_CB(skb)->seq, tp->retransmit_high)) {
+ tp->retransmit_high = last_lost;
+ if (!tcp_can_forward_retransmit(sk))
+ break;
+ /* Backtrack if necessary to non-L'ed skb */
+ if (hole) {
+ skb = hole;
+ hole = NULL;
+ }
+ fwd_rexmitting = 1;
+ goto begin_fwd;
+
+ } else if (!(sacked & TCPCB_LOST)) {
+ if (!hole && !(sacked & (TCPCB_SACKED_RETRANS|TCPCB_SACKED_ACKED)))
+ hole = skb;
+ continue;
+
+ } else {
+ last_lost = TCP_SKB_CB(skb)->end_seq;
+ if (icsk->icsk_ca_state != TCP_CA_Loss)
+ mib_idx = LINUX_MIB_TCPFASTRETRANS;
+ else
+ mib_idx = LINUX_MIB_TCPSLOWSTARTRETRANS;
+ }
+
+ if (sacked & (TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))
+ continue;
+
+ if (tcp_retransmit_skb(sk, skb))
+ return;
+
+ NET_INC_STATS_BH(sock_net(sk), mib_idx);
+
+ if (tcp_in_cwnd_reduction(sk))
+ tp->prr_out += tcp_skb_pcount(skb);
+
+ if (skb == tcp_write_queue_head(sk))
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+ inet_csk(sk)->icsk_rto,
+ TCP_RTO_MAX);
+ }
+}
+
+/* We allow to exceed memory limits for FIN packets to expedite
+ * connection tear down and (memory) recovery.
+ * Otherwise tcp_send_fin() could be tempted to either delay FIN
+ * or even be forced to close flow without any FIN.
+ */
+static void sk_forced_wmem_schedule(struct sock *sk, int size)
+{
+ int amt, status;
+
+ if (size <= sk->sk_forward_alloc)
+ return;
+ amt = sk_mem_pages(size);
+ sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
+ sk_memory_allocated_add(sk, amt, &status);
+}
+
+/* Send a FIN. The caller locks the socket for us.
+ * We should try to send a FIN packet really hard, but eventually give up.
+ */
+void tcp_send_fin(struct sock *sk)
+{
+ struct sk_buff *skb, *tskb = tcp_write_queue_tail(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ /* Optimization, tack on the FIN if we have one skb in write queue and
+ * this skb was not yet sent, or we are under memory pressure.
+ * Note: in the latter case, FIN packet will be sent after a timeout,
+ * as TCP stack thinks it has already been transmitted.
+ */
+ if (tskb && (tcp_send_head(sk) || sk_under_memory_pressure(sk))) {
+coalesce:
+ TCP_SKB_CB(tskb)->tcp_flags |= TCPHDR_FIN;
+ TCP_SKB_CB(tskb)->end_seq++;
+ tp->write_seq++;
+ if (!tcp_send_head(sk)) {
+ /* This means tskb was already sent.
+ * Pretend we included the FIN on previous transmit.
+ * We need to set tp->snd_nxt to the value it would have
+ * if FIN had been sent. This is because retransmit path
+ * does not change tp->snd_nxt.
+ */
+ tp->snd_nxt++;
+ return;
+ }
+ } else {
+ skb = alloc_skb_fclone(MAX_TCP_HEADER, sk->sk_allocation);
+ if (unlikely(!skb)) {
+ if (tskb)
+ goto coalesce;
+ return;
+ }
+ skb_reserve(skb, MAX_TCP_HEADER);
+ sk_forced_wmem_schedule(sk, skb->truesize);
+ /* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */
+ tcp_init_nondata_skb(skb, tp->write_seq,
+ TCPHDR_ACK | TCPHDR_FIN);
+ tcp_queue_skb(sk, skb);
+ }
+ __tcp_push_pending_frames(sk, tcp_current_mss(sk), TCP_NAGLE_OFF);
+}
+
+/* We get here when a process closes a file descriptor (either due to
+ * an explicit close() or as a byproduct of exit()'ing) and there
+ * was unread data in the receive queue. This behavior is recommended
+ * by RFC 2525, section 2.17. -DaveM
+ */
+void tcp_send_active_reset(struct sock *sk, gfp_t priority)
+{
+ struct sk_buff *skb;
+
+ /* NOTE: No TCP options attached and we never retransmit this. */
+ skb = alloc_skb(MAX_TCP_HEADER, priority);
+ if (!skb) {
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED);
+ return;
+ }
+
+ /* Reserve space for headers and prepare control bits. */
+ skb_reserve(skb, MAX_TCP_HEADER);
+ tcp_init_nondata_skb(skb, tcp_acceptable_seq(sk),
+ TCPHDR_ACK | TCPHDR_RST);
+ /* Send it off. */
+ if (tcp_transmit_skb(sk, skb, 0, priority))
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED);
+
+ TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTRSTS);
+}
+
+/* Send a crossed SYN-ACK during socket establishment.
+ * WARNING: This routine must only be called when we have already sent
+ * a SYN packet that crossed the incoming SYN that caused this routine
+ * to get called. If this assumption fails then the initial rcv_wnd
+ * and rcv_wscale values will not be correct.
+ */
+int tcp_send_synack(struct sock *sk)
+{
+ struct sk_buff *skb;
+
+ skb = tcp_write_queue_head(sk);
+ if (!skb || !(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
+ pr_debug("%s: wrong queue state\n", __func__);
+ return -EFAULT;
+ }
+ if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK)) {
+ if (skb_cloned(skb)) {
+ struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC);
+ if (!nskb)
+ return -ENOMEM;
+ tcp_unlink_write_queue(skb, sk);
+ __skb_header_release(nskb);
+ __tcp_add_write_queue_head(sk, nskb);
+ sk_wmem_free_skb(sk, skb);
+ sk->sk_wmem_queued += nskb->truesize;
+ sk_mem_charge(sk, nskb->truesize);
+ skb = nskb;
+ }
+
+ TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ACK;
+ tcp_ecn_send_synack(sk, skb);
+ }
+ return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
+}
+
+/**
+ * tcp_make_synack - Prepare a SYN-ACK.
+ * sk: listener socket
+ * dst: dst entry attached to the SYNACK
+ * req: request_sock pointer
+ *
+ * Allocate one skb and build a SYNACK packet.
+ * @dst is consumed : Caller should not use it again.
+ */
+struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
+ struct request_sock *req,
+ struct tcp_fastopen_cookie *foc)
+{
+ struct tcp_out_options opts;
+ struct inet_request_sock *ireq = inet_rsk(req);
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct tcphdr *th;
+ struct sk_buff *skb;
+ struct tcp_md5sig_key *md5 = NULL;
+ int tcp_header_size;
+ int mss;
+
+ skb = sock_wmalloc(sk, MAX_TCP_HEADER, 1, GFP_ATOMIC);
+ if (unlikely(!skb)) {
+ dst_release(dst);
+ return NULL;
+ }
+ /* Reserve space for headers. */
+ skb_reserve(skb, MAX_TCP_HEADER);
+
+ skb_dst_set(skb, dst);
+
+ mss = dst_metric_advmss(dst);
+ if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss)
+ mss = tp->rx_opt.user_mss;
+
+ memset(&opts, 0, sizeof(opts));
+#ifdef CONFIG_SYN_COOKIES
+ if (unlikely(req->cookie_ts))
+ skb->skb_mstamp.stamp_jiffies = cookie_init_timestamp(req);
+ else
+#endif
+ skb_mstamp_get(&skb->skb_mstamp);
+
+#ifdef CONFIG_TCP_MD5SIG
+ rcu_read_lock();
+ md5 = tcp_rsk(req)->af_specific->req_md5_lookup(sk, req_to_sk(req));
+#endif
+ tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts, md5,
+ foc) + sizeof(*th);
+
+ skb_push(skb, tcp_header_size);
+ skb_reset_transport_header(skb);
+
+ th = tcp_hdr(skb);
+ memset(th, 0, sizeof(struct tcphdr));
+ th->syn = 1;
+ th->ack = 1;
+ tcp_ecn_make_synack(req, th, sk);
+ th->source = htons(ireq->ir_num);
+ th->dest = ireq->ir_rmt_port;
+ /* Setting of flags are superfluous here for callers (and ECE is
+ * not even correctly set)
+ */
+ tcp_init_nondata_skb(skb, tcp_rsk(req)->snt_isn,
+ TCPHDR_SYN | TCPHDR_ACK);
+
+ th->seq = htonl(TCP_SKB_CB(skb)->seq);
+ /* XXX data is queued and acked as is. No buffer/window check */
+ th->ack_seq = htonl(tcp_rsk(req)->rcv_nxt);
+
+ /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
+ th->window = htons(min(req->rcv_wnd, 65535U));
+ tcp_options_write((__be32 *)(th + 1), tp, &opts);
+ th->doff = (tcp_header_size >> 2);
+ TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_OUTSEGS);
+
+#ifdef CONFIG_TCP_MD5SIG
+ /* Okay, we have all we need - do the md5 hash if needed */
+ if (md5)
+ tcp_rsk(req)->af_specific->calc_md5_hash(opts.hash_location,
+ md5, req_to_sk(req), skb);
+ rcu_read_unlock();
+#endif
+
+ /* Do not fool tcpdump (if any), clean our debris */
+ skb->tstamp.tv64 = 0;
+ return skb;
+}
+EXPORT_SYMBOL(tcp_make_synack);
+
+static void tcp_ca_dst_init(struct sock *sk, const struct dst_entry *dst)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ const struct tcp_congestion_ops *ca;
+ u32 ca_key = dst_metric(dst, RTAX_CC_ALGO);
+
+ if (ca_key == TCP_CA_UNSPEC)
+ return;
+
+ rcu_read_lock();
+ ca = tcp_ca_find_key(ca_key);
+ if (likely(ca && try_module_get(ca->owner))) {
+ module_put(icsk->icsk_ca_ops->owner);
+ icsk->icsk_ca_dst_locked = tcp_ca_dst_locked(dst);
+ icsk->icsk_ca_ops = ca;
+ }
+ rcu_read_unlock();
+}
+
+/* Do all connect socket setups that can be done AF independent. */
+static void tcp_connect_init(struct sock *sk)
+{
+ const struct dst_entry *dst = __sk_dst_get(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+ __u8 rcv_wscale;
+
+ /* We'll fix this up when we get a response from the other end.
+ * See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT.
+ */
+ tp->tcp_header_len = sizeof(struct tcphdr) +
+ (sysctl_tcp_timestamps ? TCPOLEN_TSTAMP_ALIGNED : 0);
+
+#ifdef CONFIG_TCP_MD5SIG
+ if (tp->af_specific->md5_lookup(sk, sk))
+ tp->tcp_header_len += TCPOLEN_MD5SIG_ALIGNED;
+#endif
+
+ /* If user gave his TCP_MAXSEG, record it to clamp */
+ if (tp->rx_opt.user_mss)
+ tp->rx_opt.mss_clamp = tp->rx_opt.user_mss;
+ tp->max_window = 0;
+ tcp_mtup_init(sk);
+ tcp_sync_mss(sk, dst_mtu(dst));
+
+ tcp_ca_dst_init(sk, dst);
+
+ if (!tp->window_clamp)
+ tp->window_clamp = dst_metric(dst, RTAX_WINDOW);
+ tp->advmss = dst_metric_advmss(dst);
+ if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < tp->advmss)
+ tp->advmss = tp->rx_opt.user_mss;
+
+ tcp_initialize_rcv_mss(sk);
+
+ /* limit the window selection if the user enforce a smaller rx buffer */
+ if (sk->sk_userlocks & SOCK_RCVBUF_LOCK &&
+ (tp->window_clamp > tcp_full_space(sk) || tp->window_clamp == 0))
+ tp->window_clamp = tcp_full_space(sk);
+
+ tcp_select_initial_window(tcp_full_space(sk),
+ tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
+ &tp->rcv_wnd,
+ &tp->window_clamp,
+ sysctl_tcp_window_scaling,
+ &rcv_wscale,
+ dst_metric(dst, RTAX_INITRWND));
+
+ tp->rx_opt.rcv_wscale = rcv_wscale;
+ tp->rcv_ssthresh = tp->rcv_wnd;
+
+ sk->sk_err = 0;
+ sock_reset_flag(sk, SOCK_DONE);
+ tp->snd_wnd = 0;
+ tcp_init_wl(tp, 0);
+ tp->snd_una = tp->write_seq;
+ tp->snd_sml = tp->write_seq;
+ tp->snd_up = tp->write_seq;
+ tp->snd_nxt = tp->write_seq;
+
+ if (likely(!tp->repair))
+ tp->rcv_nxt = 0;
+ else
+ tp->rcv_tstamp = tcp_time_stamp;
+ tp->rcv_wup = tp->rcv_nxt;
+ tp->copied_seq = tp->rcv_nxt;
+
+ inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT;
+ inet_csk(sk)->icsk_retransmits = 0;
+ tcp_clear_retrans(tp);
+}
+
+static void tcp_connect_queue_skb(struct sock *sk, struct sk_buff *skb)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
+
+ tcb->end_seq += skb->len;
+ __skb_header_release(skb);
+ __tcp_add_write_queue_tail(sk, skb);
+ sk->sk_wmem_queued += skb->truesize;
+ sk_mem_charge(sk, skb->truesize);
+ tp->write_seq = tcb->end_seq;
+ tp->packets_out += tcp_skb_pcount(skb);
+}
+
+/* Build and send a SYN with data and (cached) Fast Open cookie. However,
+ * queue a data-only packet after the regular SYN, such that regular SYNs
+ * are retransmitted on timeouts. Also if the remote SYN-ACK acknowledges
+ * only the SYN sequence, the data are retransmitted in the first ACK.
+ * If cookie is not cached or other error occurs, falls back to send a
+ * regular SYN with Fast Open cookie request option.
+ */
+static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct tcp_fastopen_request *fo = tp->fastopen_req;
+ int syn_loss = 0, space, err = 0, copied;
+ unsigned long last_syn_loss = 0;
+ struct sk_buff *syn_data;
+
+ tp->rx_opt.mss_clamp = tp->advmss; /* If MSS is not cached */
+ tcp_fastopen_cache_get(sk, &tp->rx_opt.mss_clamp, &fo->cookie,
+ &syn_loss, &last_syn_loss);
+ /* Recurring FO SYN losses: revert to regular handshake temporarily */
+ if (syn_loss > 1 &&
+ time_before(jiffies, last_syn_loss + (60*HZ << syn_loss))) {
+ fo->cookie.len = -1;
+ goto fallback;
+ }
+
+ if (sysctl_tcp_fastopen & TFO_CLIENT_NO_COOKIE)
+ fo->cookie.len = -1;
+ else if (fo->cookie.len <= 0)
+ goto fallback;
+
+ /* MSS for SYN-data is based on cached MSS and bounded by PMTU and
+ * user-MSS. Reserve maximum option space for middleboxes that add
+ * private TCP options. The cost is reduced data space in SYN :(
+ */
+ if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < tp->rx_opt.mss_clamp)
+ tp->rx_opt.mss_clamp = tp->rx_opt.user_mss;
+ space = __tcp_mtu_to_mss(sk, inet_csk(sk)->icsk_pmtu_cookie) -
+ MAX_TCP_OPTION_SPACE;
+
+ space = min_t(size_t, space, fo->size);
+
+ /* limit to order-0 allocations */
+ space = min_t(size_t, space, SKB_MAX_HEAD(MAX_TCP_HEADER));
+
+ syn_data = sk_stream_alloc_skb(sk, space, sk->sk_allocation);
+ if (!syn_data)
+ goto fallback;
+ syn_data->ip_summed = CHECKSUM_PARTIAL;
+ memcpy(syn_data->cb, syn->cb, sizeof(syn->cb));
+ copied = copy_from_iter(skb_put(syn_data, space), space,
+ &fo->data->msg_iter);
+ if (unlikely(!copied)) {
+ kfree_skb(syn_data);
+ goto fallback;
+ }
+ if (copied != space) {
+ skb_trim(syn_data, copied);
+ space = copied;
+ }
+
+ /* No more data pending in inet_wait_for_connect() */
+ if (space == fo->size)
+ fo->data = NULL;
+ fo->copied = space;
+
+ tcp_connect_queue_skb(sk, syn_data);
+
+ err = tcp_transmit_skb(sk, syn_data, 1, sk->sk_allocation);
+
+ syn->skb_mstamp = syn_data->skb_mstamp;
+
+ /* Now full SYN+DATA was cloned and sent (or not),
+ * remove the SYN from the original skb (syn_data)
+ * we keep in write queue in case of a retransmit, as we
+ * also have the SYN packet (with no data) in the same queue.
+ */
+ TCP_SKB_CB(syn_data)->seq++;
+ TCP_SKB_CB(syn_data)->tcp_flags = TCPHDR_ACK | TCPHDR_PSH;
+ if (!err) {
+ tp->syn_data = (fo->copied > 0);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT);
+ goto done;
+ }
+
+fallback:
+ /* Send a regular SYN with Fast Open cookie request option */
+ if (fo->cookie.len > 0)
+ fo->cookie.len = 0;
+ err = tcp_transmit_skb(sk, syn, 1, sk->sk_allocation);
+ if (err)
+ tp->syn_fastopen = 0;
+done:
+ fo->cookie.len = -1; /* Exclude Fast Open option for SYN retries */
+ return err;
+}
+
+/* Build a SYN and send it off. */
+int tcp_connect(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *buff;
+ int err;
+
+ tcp_connect_init(sk);
+
+ if (unlikely(tp->repair)) {
+ tcp_finish_connect(sk, NULL);
+ return 0;
+ }
+
+ buff = sk_stream_alloc_skb(sk, 0, sk->sk_allocation);
+ if (unlikely(!buff))
+ return -ENOBUFS;
+
+ tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN);
+#ifdef CONFIG_TCP_STEALTH
+ /* The timetamp was already made at the time the ISN was generated
+ * as we need to know its value in the stealth_tcp_sequence_number()
+ * function.
+ */
+ tp->retrans_stamp = tp->stealth.mstamp.stamp_jiffies;
+#else
+ tp->retrans_stamp = tcp_time_stamp;
+#endif
+ tcp_connect_queue_skb(sk, buff);
+ tcp_ecn_send_syn(sk, buff);
+
+ /* Send off SYN; include data in Fast Open. */
+ err = tp->fastopen_req ? tcp_send_syn_data(sk, buff) :
+ tcp_transmit_skb(sk, buff, 1, sk->sk_allocation);
+ if (err == -ECONNREFUSED)
+ return err;
+
+ /* We change tp->snd_nxt after the tcp_transmit_skb() call
+ * in order to make this packet get counted in tcpOutSegs.
+ */
+ tp->snd_nxt = tp->write_seq;
+ tp->pushed_seq = tp->write_seq;
+ TCP_INC_STATS(sock_net(sk), TCP_MIB_ACTIVEOPENS);
+
+ /* Timer for repeating the SYN until an answer. */
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+ inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
+ return 0;
+}
+EXPORT_SYMBOL(tcp_connect);
+
+/* Send out a delayed ack, the caller does the policy checking
+ * to see if we should even be here. See tcp_input.c:tcp_ack_snd_check()
+ * for details.
+ */
+void tcp_send_delayed_ack(struct sock *sk)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ int ato = icsk->icsk_ack.ato;
+ unsigned long timeout;
+
+ tcp_ca_event(sk, CA_EVENT_DELAYED_ACK);
+
+ if (ato > TCP_DELACK_MIN) {
+ const struct tcp_sock *tp = tcp_sk(sk);
+ int max_ato = HZ / 2;
+
+ if (icsk->icsk_ack.pingpong ||
+ (icsk->icsk_ack.pending & ICSK_ACK_PUSHED))
+ max_ato = TCP_DELACK_MAX;
+
+ /* Slow path, intersegment interval is "high". */
+
+ /* If some rtt estimate is known, use it to bound delayed ack.
+ * Do not use inet_csk(sk)->icsk_rto here, use results of rtt measurements
+ * directly.
+ */
+ if (tp->srtt_us) {
+ int rtt = max_t(int, usecs_to_jiffies(tp->srtt_us >> 3),
+ TCP_DELACK_MIN);
+
+ if (rtt < max_ato)
+ max_ato = rtt;
+ }
+
+ ato = min(ato, max_ato);
+ }
+
+ /* Stay within the limit we were given */
+ timeout = jiffies + ato;
+
+ /* Use new timeout only if there wasn't a older one earlier. */
+ if (icsk->icsk_ack.pending & ICSK_ACK_TIMER) {
+ /* If delack timer was blocked or is about to expire,
+ * send ACK now.
+ */
+ if (icsk->icsk_ack.blocked ||
+ time_before_eq(icsk->icsk_ack.timeout, jiffies + (ato >> 2))) {
+ tcp_send_ack(sk);
+ return;
+ }
+
+ if (!time_before(timeout, icsk->icsk_ack.timeout))
+ timeout = icsk->icsk_ack.timeout;
+ }
+ icsk->icsk_ack.pending |= ICSK_ACK_SCHED | ICSK_ACK_TIMER;
+ icsk->icsk_ack.timeout = timeout;
+ sk_reset_timer(sk, &icsk->icsk_delack_timer, timeout);
+}
+
+/* This routine sends an ack and also updates the window. */
+void tcp_send_ack(struct sock *sk)
+{
+ struct sk_buff *buff;
+
+ /* If we have been reset, we may not send again. */
+ if (sk->sk_state == TCP_CLOSE)
+ return;
+
+ tcp_ca_event(sk, CA_EVENT_NON_DELAYED_ACK);
+
+ /* We are not putting this on the write queue, so
+ * tcp_transmit_skb() will set the ownership to this
+ * sock.
+ */
+ buff = alloc_skb(MAX_TCP_HEADER, sk_gfp_atomic(sk, GFP_ATOMIC));
+ if (!buff) {
+ inet_csk_schedule_ack(sk);
+ inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN;
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
+ TCP_DELACK_MAX, TCP_RTO_MAX);
+ return;
+ }
+
+ /* Reserve space for headers and prepare control bits. */
+ skb_reserve(buff, MAX_TCP_HEADER);
+ tcp_init_nondata_skb(buff, tcp_acceptable_seq(sk), TCPHDR_ACK);
+
+ /* We do not want pure acks influencing TCP Small Queues or fq/pacing
+ * too much.
+ * SKB_TRUESIZE(max(1 .. 66, MAX_TCP_HEADER)) is unfortunately ~784
+ * We also avoid tcp_wfree() overhead (cache line miss accessing
+ * tp->tsq_flags) by using regular sock_wfree()
+ */
+ skb_set_tcp_pure_ack(buff);
+
+ /* Send it off, this clears delayed acks for us. */
+ skb_mstamp_get(&buff->skb_mstamp);
+ tcp_transmit_skb(sk, buff, 0, sk_gfp_atomic(sk, GFP_ATOMIC));
+}
+EXPORT_SYMBOL_GPL(tcp_send_ack);
+
+/* This routine sends a packet with an out of date sequence
+ * number. It assumes the other end will try to ack it.
+ *
+ * Question: what should we make while urgent mode?
+ * 4.4BSD forces sending single byte of data. We cannot send
+ * out of window data, because we have SND.NXT==SND.MAX...
+ *
+ * Current solution: to send TWO zero-length segments in urgent mode:
+ * one is with SEG.SEQ=SND.UNA to deliver urgent pointer, another is
+ * out-of-date with SND.UNA-1 to probe window.
+ */
+static int tcp_xmit_probe_skb(struct sock *sk, int urgent)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *skb;
+
+ /* We don't queue it, tcp_transmit_skb() sets ownership. */
+ skb = alloc_skb(MAX_TCP_HEADER, sk_gfp_atomic(sk, GFP_ATOMIC));
+ if (!skb)
+ return -1;
+
+ /* Reserve space for headers and set control bits. */
+ skb_reserve(skb, MAX_TCP_HEADER);
+ /* Use a previous sequence. This should cause the other
+ * end to send an ack. Don't queue or clone SKB, just
+ * send it.
+ */
+ tcp_init_nondata_skb(skb, tp->snd_una - !urgent, TCPHDR_ACK);
+ skb_mstamp_get(&skb->skb_mstamp);
+ return tcp_transmit_skb(sk, skb, 0, GFP_ATOMIC);
+}
+
+void tcp_send_window_probe(struct sock *sk)
+{
+ if (sk->sk_state == TCP_ESTABLISHED) {
+ tcp_sk(sk)->snd_wl1 = tcp_sk(sk)->rcv_nxt - 1;
+ tcp_xmit_probe_skb(sk, 0);
+ }
+}
+
+/* Initiate keepalive or window probe from timer. */
+int tcp_write_wakeup(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *skb;
+
+ if (sk->sk_state == TCP_CLOSE)
+ return -1;
+
+ skb = tcp_send_head(sk);
+ if (skb && before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp))) {
+ int err;
+ unsigned int mss = tcp_current_mss(sk);
+ unsigned int seg_size = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
+
+ if (before(tp->pushed_seq, TCP_SKB_CB(skb)->end_seq))
+ tp->pushed_seq = TCP_SKB_CB(skb)->end_seq;
+
+ /* We are probing the opening of a window
+ * but the window size is != 0
+ * must have been a result SWS avoidance ( sender )
+ */
+ if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq ||
+ skb->len > mss) {
+ seg_size = min(seg_size, mss);
+ TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
+ if (tcp_fragment(sk, skb, seg_size, mss, GFP_ATOMIC))
+ return -1;
+ } else if (!tcp_skb_pcount(skb))
+ tcp_set_skb_tso_segs(sk, skb, mss);
+
+ TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
+ err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
+ if (!err)
+ tcp_event_new_data_sent(sk, skb);
+ return err;
+ } else {
+ if (between(tp->snd_up, tp->snd_una + 1, tp->snd_una + 0xFFFF))
+ tcp_xmit_probe_skb(sk, 1);
+ return tcp_xmit_probe_skb(sk, 0);
+ }
+}
+
+/* A window probe timeout has occurred. If window is not closed send
+ * a partial packet else a zero probe.
+ */
+void tcp_send_probe0(struct sock *sk)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+ unsigned long probe_max;
+ int err;
+
+ err = tcp_write_wakeup(sk);
+
+ if (tp->packets_out || !tcp_send_head(sk)) {
+ /* Cancel probe timer, if it is not required. */
+ icsk->icsk_probes_out = 0;
+ icsk->icsk_backoff = 0;
+ return;
+ }
+
+ if (err <= 0) {
+ if (icsk->icsk_backoff < sysctl_tcp_retries2)
+ icsk->icsk_backoff++;
+ icsk->icsk_probes_out++;
+ probe_max = TCP_RTO_MAX;
+ } else {
+ /* If packet was not sent due to local congestion,
+ * do not backoff and do not remember icsk_probes_out.
+ * Let local senders to fight for local resources.
+ *
+ * Use accumulated backoff yet.
+ */
+ if (!icsk->icsk_probes_out)
+ icsk->icsk_probes_out = 1;
+ probe_max = TCP_RESOURCE_PROBE_INTERVAL;
+ }
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
+ inet_csk_rto_backoff(icsk, probe_max),
+ TCP_RTO_MAX);
+}
+
+int tcp_rtx_synack(struct sock *sk, struct request_sock *req)
+{
+ const struct tcp_request_sock_ops *af_ops = tcp_rsk(req)->af_specific;
+ struct flowi fl;
+ int res;
+
+ res = af_ops->send_synack(sk, NULL, &fl, req, 0, NULL);
+ if (!res) {
+ TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
+ }
+ return res;
+}
+EXPORT_SYMBOL(tcp_rtx_synack);
diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c
new file mode 100644
index 000000000..ebf5ff575
--- /dev/null
+++ b/net/ipv4/tcp_probe.c
@@ -0,0 +1,300 @@
+/*
+ * tcpprobe - Observe the TCP flow with kprobes.
+ *
+ * The idea for this came from Werner Almesberger's umlsim
+ * Copyright (C) 2004, Stephen Hemminger <shemminger@osdl.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/kprobes.h>
+#include <linux/socket.h>
+#include <linux/tcp.h>
+#include <linux/slab.h>
+#include <linux/proc_fs.h>
+#include <linux/module.h>
+#include <linux/ktime.h>
+#include <linux/time.h>
+#include <net/net_namespace.h>
+
+#include <net/tcp.h>
+
+MODULE_AUTHOR("Stephen Hemminger <shemminger@linux-foundation.org>");
+MODULE_DESCRIPTION("TCP cwnd snooper");
+MODULE_LICENSE("GPL");
+MODULE_VERSION("1.1");
+
+static int port __read_mostly;
+MODULE_PARM_DESC(port, "Port to match (0=all)");
+module_param(port, int, 0);
+
+static unsigned int bufsize __read_mostly = 4096;
+MODULE_PARM_DESC(bufsize, "Log buffer size in packets (4096)");
+module_param(bufsize, uint, 0);
+
+static unsigned int fwmark __read_mostly;
+MODULE_PARM_DESC(fwmark, "skb mark to match (0=no mark)");
+module_param(fwmark, uint, 0);
+
+static int full __read_mostly;
+MODULE_PARM_DESC(full, "Full log (1=every ack packet received, 0=only cwnd changes)");
+module_param(full, int, 0);
+
+static const char procname[] = "tcpprobe";
+
+struct tcp_log {
+ ktime_t tstamp;
+ union {
+ struct sockaddr raw;
+ struct sockaddr_in v4;
+ struct sockaddr_in6 v6;
+ } src, dst;
+ u16 length;
+ u32 snd_nxt;
+ u32 snd_una;
+ u32 snd_wnd;
+ u32 rcv_wnd;
+ u32 snd_cwnd;
+ u32 ssthresh;
+ u32 srtt;
+};
+
+static struct {
+ spinlock_t lock;
+ wait_queue_head_t wait;
+ ktime_t start;
+ u32 lastcwnd;
+
+ unsigned long head, tail;
+ struct tcp_log *log;
+} tcp_probe;
+
+static inline int tcp_probe_used(void)
+{
+ return (tcp_probe.head - tcp_probe.tail) & (bufsize - 1);
+}
+
+static inline int tcp_probe_avail(void)
+{
+ return bufsize - tcp_probe_used() - 1;
+}
+
+#define tcp_probe_copy_fl_to_si4(inet, si4, mem) \
+ do { \
+ si4.sin_family = AF_INET; \
+ si4.sin_port = inet->inet_##mem##port; \
+ si4.sin_addr.s_addr = inet->inet_##mem##addr; \
+ } while (0) \
+
+/*
+ * Hook inserted to be called before each receive packet.
+ * Note: arguments must match tcp_rcv_established()!
+ */
+static void jtcp_rcv_established(struct sock *sk, struct sk_buff *skb,
+ const struct tcphdr *th, unsigned int len)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ const struct inet_sock *inet = inet_sk(sk);
+
+ /* Only update if port or skb mark matches */
+ if (((port == 0 && fwmark == 0) ||
+ ntohs(inet->inet_dport) == port ||
+ ntohs(inet->inet_sport) == port ||
+ (fwmark > 0 && skb->mark == fwmark)) &&
+ (full || tp->snd_cwnd != tcp_probe.lastcwnd)) {
+
+ spin_lock(&tcp_probe.lock);
+ /* If log fills, just silently drop */
+ if (tcp_probe_avail() > 1) {
+ struct tcp_log *p = tcp_probe.log + tcp_probe.head;
+
+ p->tstamp = ktime_get();
+ switch (sk->sk_family) {
+ case AF_INET:
+ tcp_probe_copy_fl_to_si4(inet, p->src.v4, s);
+ tcp_probe_copy_fl_to_si4(inet, p->dst.v4, d);
+ break;
+ case AF_INET6:
+ memset(&p->src.v6, 0, sizeof(p->src.v6));
+ memset(&p->dst.v6, 0, sizeof(p->dst.v6));
+#if IS_ENABLED(CONFIG_IPV6)
+ p->src.v6.sin6_family = AF_INET6;
+ p->src.v6.sin6_port = inet->inet_sport;
+ p->src.v6.sin6_addr = inet6_sk(sk)->saddr;
+
+ p->dst.v6.sin6_family = AF_INET6;
+ p->dst.v6.sin6_port = inet->inet_dport;
+ p->dst.v6.sin6_addr = sk->sk_v6_daddr;
+#endif
+ break;
+ default:
+ BUG();
+ }
+
+ p->length = skb->len;
+ p->snd_nxt = tp->snd_nxt;
+ p->snd_una = tp->snd_una;
+ p->snd_cwnd = tp->snd_cwnd;
+ p->snd_wnd = tp->snd_wnd;
+ p->rcv_wnd = tp->rcv_wnd;
+ p->ssthresh = tcp_current_ssthresh(sk);
+ p->srtt = tp->srtt_us >> 3;
+
+ tcp_probe.head = (tcp_probe.head + 1) & (bufsize - 1);
+ }
+ tcp_probe.lastcwnd = tp->snd_cwnd;
+ spin_unlock(&tcp_probe.lock);
+
+ wake_up(&tcp_probe.wait);
+ }
+
+ jprobe_return();
+}
+
+static struct jprobe tcp_jprobe = {
+ .kp = {
+ .symbol_name = "tcp_rcv_established",
+ },
+ .entry = jtcp_rcv_established,
+};
+
+static int tcpprobe_open(struct inode *inode, struct file *file)
+{
+ /* Reset (empty) log */
+ spin_lock_bh(&tcp_probe.lock);
+ tcp_probe.head = tcp_probe.tail = 0;
+ tcp_probe.start = ktime_get();
+ spin_unlock_bh(&tcp_probe.lock);
+
+ return 0;
+}
+
+static int tcpprobe_sprint(char *tbuf, int n)
+{
+ const struct tcp_log *p
+ = tcp_probe.log + tcp_probe.tail;
+ struct timespec tv
+ = ktime_to_timespec(ktime_sub(p->tstamp, tcp_probe.start));
+
+ return scnprintf(tbuf, n,
+ "%lu.%09lu %pISpc %pISpc %d %#x %#x %u %u %u %u %u\n",
+ (unsigned long)tv.tv_sec,
+ (unsigned long)tv.tv_nsec,
+ &p->src, &p->dst, p->length, p->snd_nxt, p->snd_una,
+ p->snd_cwnd, p->ssthresh, p->snd_wnd, p->srtt, p->rcv_wnd);
+}
+
+static ssize_t tcpprobe_read(struct file *file, char __user *buf,
+ size_t len, loff_t *ppos)
+{
+ int error = 0;
+ size_t cnt = 0;
+
+ if (!buf)
+ return -EINVAL;
+
+ while (cnt < len) {
+ char tbuf[256];
+ int width;
+
+ /* Wait for data in buffer */
+ error = wait_event_interruptible(tcp_probe.wait,
+ tcp_probe_used() > 0);
+ if (error)
+ break;
+
+ spin_lock_bh(&tcp_probe.lock);
+ if (tcp_probe.head == tcp_probe.tail) {
+ /* multiple readers race? */
+ spin_unlock_bh(&tcp_probe.lock);
+ continue;
+ }
+
+ width = tcpprobe_sprint(tbuf, sizeof(tbuf));
+
+ if (cnt + width < len)
+ tcp_probe.tail = (tcp_probe.tail + 1) & (bufsize - 1);
+
+ spin_unlock_bh(&tcp_probe.lock);
+
+ /* if record greater than space available
+ return partial buffer (so far) */
+ if (cnt + width >= len)
+ break;
+
+ if (copy_to_user(buf + cnt, tbuf, width))
+ return -EFAULT;
+ cnt += width;
+ }
+
+ return cnt == 0 ? error : cnt;
+}
+
+static const struct file_operations tcpprobe_fops = {
+ .owner = THIS_MODULE,
+ .open = tcpprobe_open,
+ .read = tcpprobe_read,
+ .llseek = noop_llseek,
+};
+
+static __init int tcpprobe_init(void)
+{
+ int ret = -ENOMEM;
+
+ /* Warning: if the function signature of tcp_rcv_established,
+ * has been changed, you also have to change the signature of
+ * jtcp_rcv_established, otherwise you end up right here!
+ */
+ BUILD_BUG_ON(__same_type(tcp_rcv_established,
+ jtcp_rcv_established) == 0);
+
+ init_waitqueue_head(&tcp_probe.wait);
+ spin_lock_init(&tcp_probe.lock);
+
+ if (bufsize == 0)
+ return -EINVAL;
+
+ bufsize = roundup_pow_of_two(bufsize);
+ tcp_probe.log = kcalloc(bufsize, sizeof(struct tcp_log), GFP_KERNEL);
+ if (!tcp_probe.log)
+ goto err0;
+
+ if (!proc_create(procname, S_IRUSR, init_net.proc_net, &tcpprobe_fops))
+ goto err0;
+
+ ret = register_jprobe(&tcp_jprobe);
+ if (ret)
+ goto err1;
+
+ pr_info("probe registered (port=%d/fwmark=%u) bufsize=%u\n",
+ port, fwmark, bufsize);
+ return 0;
+ err1:
+ remove_proc_entry(procname, init_net.proc_net);
+ err0:
+ kfree(tcp_probe.log);
+ return ret;
+}
+module_init(tcpprobe_init);
+
+static __exit void tcpprobe_exit(void)
+{
+ remove_proc_entry(procname, init_net.proc_net);
+ unregister_jprobe(&tcp_jprobe);
+ kfree(tcp_probe.log);
+}
+module_exit(tcpprobe_exit);
diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c
new file mode 100644
index 000000000..333bcb241
--- /dev/null
+++ b/net/ipv4/tcp_scalable.c
@@ -0,0 +1,62 @@
+/* Tom Kelly's Scalable TCP
+ *
+ * See http://www.deneholme.net/tom/scalable/
+ *
+ * John Heffner <jheffner@sc.edu>
+ */
+
+#include <linux/module.h>
+#include <net/tcp.h>
+
+/* These factors derived from the recommended values in the aer:
+ * .01 and and 7/8. We use 50 instead of 100 to account for
+ * delayed ack.
+ */
+#define TCP_SCALABLE_AI_CNT 50U
+#define TCP_SCALABLE_MD_SCALE 3
+
+static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 acked)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (!tcp_is_cwnd_limited(sk))
+ return;
+
+ if (tp->snd_cwnd <= tp->snd_ssthresh)
+ tcp_slow_start(tp, acked);
+ else
+ tcp_cong_avoid_ai(tp, min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT),
+ 1);
+}
+
+static u32 tcp_scalable_ssthresh(struct sock *sk)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+
+ return max(tp->snd_cwnd - (tp->snd_cwnd>>TCP_SCALABLE_MD_SCALE), 2U);
+}
+
+static struct tcp_congestion_ops tcp_scalable __read_mostly = {
+ .ssthresh = tcp_scalable_ssthresh,
+ .cong_avoid = tcp_scalable_cong_avoid,
+
+ .owner = THIS_MODULE,
+ .name = "scalable",
+};
+
+static int __init tcp_scalable_register(void)
+{
+ return tcp_register_congestion_control(&tcp_scalable);
+}
+
+static void __exit tcp_scalable_unregister(void)
+{
+ tcp_unregister_congestion_control(&tcp_scalable);
+}
+
+module_init(tcp_scalable_register);
+module_exit(tcp_scalable_unregister);
+
+MODULE_AUTHOR("John Heffner");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Scalable TCP");
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
new file mode 100644
index 000000000..8c65dc147
--- /dev/null
+++ b/net/ipv4/tcp_timer.c
@@ -0,0 +1,652 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * Implementation of the Transmission Control Protocol(TCP).
+ *
+ * Authors: Ross Biro
+ * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ * Mark Evans, <evansmp@uhura.aston.ac.uk>
+ * Corey Minyard <wf-rch!minyard@relay.EU.net>
+ * Florian La Roche, <flla@stud.uni-sb.de>
+ * Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
+ * Linus Torvalds, <torvalds@cs.helsinki.fi>
+ * Alan Cox, <gw4pts@gw4pts.ampr.org>
+ * Matthew Dillon, <dillon@apollo.west.oic.com>
+ * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
+ * Jorge Cwik, <jorge@laser.satlink.net>
+ */
+
+#include <linux/module.h>
+#include <linux/gfp.h>
+#include <net/tcp.h>
+
+int sysctl_tcp_syn_retries __read_mostly = TCP_SYN_RETRIES;
+int sysctl_tcp_synack_retries __read_mostly = TCP_SYNACK_RETRIES;
+int sysctl_tcp_keepalive_time __read_mostly = TCP_KEEPALIVE_TIME;
+int sysctl_tcp_keepalive_probes __read_mostly = TCP_KEEPALIVE_PROBES;
+int sysctl_tcp_keepalive_intvl __read_mostly = TCP_KEEPALIVE_INTVL;
+int sysctl_tcp_retries1 __read_mostly = TCP_RETR1;
+int sysctl_tcp_retries2 __read_mostly = TCP_RETR2;
+int sysctl_tcp_orphan_retries __read_mostly;
+int sysctl_tcp_thin_linear_timeouts __read_mostly;
+
+static void tcp_write_err(struct sock *sk)
+{
+ sk->sk_err = sk->sk_err_soft ? : ETIMEDOUT;
+ sk->sk_error_report(sk);
+
+ tcp_done(sk);
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONTIMEOUT);
+}
+
+/* Do not allow orphaned sockets to eat all our resources.
+ * This is direct violation of TCP specs, but it is required
+ * to prevent DoS attacks. It is called when a retransmission timeout
+ * or zero probe timeout occurs on orphaned socket.
+ *
+ * Criteria is still not confirmed experimentally and may change.
+ * We kill the socket, if:
+ * 1. If number of orphaned sockets exceeds an administratively configured
+ * limit.
+ * 2. If we have strong memory pressure.
+ */
+static int tcp_out_of_resources(struct sock *sk, bool do_reset)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ int shift = 0;
+
+ /* If peer does not open window for long time, or did not transmit
+ * anything for long time, penalize it. */
+ if ((s32)(tcp_time_stamp - tp->lsndtime) > 2*TCP_RTO_MAX || !do_reset)
+ shift++;
+
+ /* If some dubious ICMP arrived, penalize even more. */
+ if (sk->sk_err_soft)
+ shift++;
+
+ if (tcp_check_oom(sk, shift)) {
+ /* Catch exceptional cases, when connection requires reset.
+ * 1. Last segment was sent recently. */
+ if ((s32)(tcp_time_stamp - tp->lsndtime) <= TCP_TIMEWAIT_LEN ||
+ /* 2. Window is closed. */
+ (!tp->snd_wnd && !tp->packets_out))
+ do_reset = true;
+ if (do_reset)
+ tcp_send_active_reset(sk, GFP_ATOMIC);
+ tcp_done(sk);
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONMEMORY);
+ return 1;
+ }
+ return 0;
+}
+
+/* Calculate maximal number or retries on an orphaned socket. */
+static int tcp_orphan_retries(struct sock *sk, int alive)
+{
+ int retries = sysctl_tcp_orphan_retries; /* May be zero. */
+
+ /* We know from an ICMP that something is wrong. */
+ if (sk->sk_err_soft && !alive)
+ retries = 0;
+
+ /* However, if socket sent something recently, select some safe
+ * number of retries. 8 corresponds to >100 seconds with minimal
+ * RTO of 200msec. */
+ if (retries == 0 && alive)
+ retries = 8;
+ return retries;
+}
+
+static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk)
+{
+ struct net *net = sock_net(sk);
+
+ /* Black hole detection */
+ if (net->ipv4.sysctl_tcp_mtu_probing) {
+ if (!icsk->icsk_mtup.enabled) {
+ icsk->icsk_mtup.enabled = 1;
+ icsk->icsk_mtup.probe_timestamp = tcp_time_stamp;
+ tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
+ } else {
+ struct net *net = sock_net(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+ int mss;
+
+ mss = tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low) >> 1;
+ mss = min(net->ipv4.sysctl_tcp_base_mss, mss);
+ mss = max(mss, 68 - tp->tcp_header_len);
+ icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss);
+ tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
+ }
+ }
+}
+
+/* This function calculates a "timeout" which is equivalent to the timeout of a
+ * TCP connection after "boundary" unsuccessful, exponentially backed-off
+ * retransmissions with an initial RTO of TCP_RTO_MIN or TCP_TIMEOUT_INIT if
+ * syn_set flag is set.
+ */
+static bool retransmits_timed_out(struct sock *sk,
+ unsigned int boundary,
+ unsigned int timeout,
+ bool syn_set)
+{
+ unsigned int linear_backoff_thresh, start_ts;
+ unsigned int rto_base = syn_set ? TCP_TIMEOUT_INIT : TCP_RTO_MIN;
+
+ if (!inet_csk(sk)->icsk_retransmits)
+ return false;
+
+ start_ts = tcp_sk(sk)->retrans_stamp;
+ if (unlikely(!start_ts))
+ start_ts = tcp_skb_timestamp(tcp_write_queue_head(sk));
+
+ if (likely(timeout == 0)) {
+ linear_backoff_thresh = ilog2(TCP_RTO_MAX/rto_base);
+
+ if (boundary <= linear_backoff_thresh)
+ timeout = ((2 << boundary) - 1) * rto_base;
+ else
+ timeout = ((2 << linear_backoff_thresh) - 1) * rto_base +
+ (boundary - linear_backoff_thresh) * TCP_RTO_MAX;
+ }
+ return (tcp_time_stamp - start_ts) >= timeout;
+}
+
+/* A write timeout has occurred. Process the after effects. */
+static int tcp_write_timeout(struct sock *sk)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+ int retry_until;
+ bool do_reset, syn_set = false;
+
+ if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
+ if (icsk->icsk_retransmits) {
+ dst_negative_advice(sk);
+ if (tp->syn_fastopen || tp->syn_data)
+ tcp_fastopen_cache_set(sk, 0, NULL, true, 0);
+ if (tp->syn_data)
+ NET_INC_STATS_BH(sock_net(sk),
+ LINUX_MIB_TCPFASTOPENACTIVEFAIL);
+ }
+ retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
+ syn_set = true;
+ } else {
+ if (retransmits_timed_out(sk, sysctl_tcp_retries1, 0, 0)) {
+ /* Black hole detection */
+ tcp_mtu_probing(icsk, sk);
+
+ dst_negative_advice(sk);
+ }
+
+ retry_until = sysctl_tcp_retries2;
+ if (sock_flag(sk, SOCK_DEAD)) {
+ const int alive = icsk->icsk_rto < TCP_RTO_MAX;
+
+ retry_until = tcp_orphan_retries(sk, alive);
+ do_reset = alive ||
+ !retransmits_timed_out(sk, retry_until, 0, 0);
+
+ if (tcp_out_of_resources(sk, do_reset))
+ return 1;
+ }
+ }
+
+ if (retransmits_timed_out(sk, retry_until,
+ syn_set ? 0 : icsk->icsk_user_timeout, syn_set)) {
+ /* Has it gone just too far? */
+ tcp_write_err(sk);
+ return 1;
+ }
+ return 0;
+}
+
+void tcp_delack_timer_handler(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
+ sk_mem_reclaim_partial(sk);
+
+ if (sk->sk_state == TCP_CLOSE || !(icsk->icsk_ack.pending & ICSK_ACK_TIMER))
+ goto out;
+
+ if (time_after(icsk->icsk_ack.timeout, jiffies)) {
+ sk_reset_timer(sk, &icsk->icsk_delack_timer, icsk->icsk_ack.timeout);
+ goto out;
+ }
+ icsk->icsk_ack.pending &= ~ICSK_ACK_TIMER;
+
+ if (!skb_queue_empty(&tp->ucopy.prequeue)) {
+ struct sk_buff *skb;
+
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSCHEDULERFAILED);
+
+ while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
+ sk_backlog_rcv(sk, skb);
+
+ tp->ucopy.memory = 0;
+ }
+
+ if (inet_csk_ack_scheduled(sk)) {
+ if (!icsk->icsk_ack.pingpong) {
+ /* Delayed ACK missed: inflate ATO. */
+ icsk->icsk_ack.ato = min(icsk->icsk_ack.ato << 1, icsk->icsk_rto);
+ } else {
+ /* Delayed ACK missed: leave pingpong mode and
+ * deflate ATO.
+ */
+ icsk->icsk_ack.pingpong = 0;
+ icsk->icsk_ack.ato = TCP_ATO_MIN;
+ }
+ tcp_send_ack(sk);
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKS);
+ }
+
+out:
+ if (sk_under_memory_pressure(sk))
+ sk_mem_reclaim(sk);
+}
+
+static void tcp_delack_timer(unsigned long data)
+{
+ struct sock *sk = (struct sock *)data;
+
+ bh_lock_sock(sk);
+ if (!sock_owned_by_user(sk)) {
+ tcp_delack_timer_handler(sk);
+ } else {
+ inet_csk(sk)->icsk_ack.blocked = 1;
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED);
+ /* deleguate our work to tcp_release_cb() */
+ if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags))
+ sock_hold(sk);
+ }
+ bh_unlock_sock(sk);
+ sock_put(sk);
+}
+
+static void tcp_probe_timer(struct sock *sk)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+ int max_probes;
+ u32 start_ts;
+
+ if (tp->packets_out || !tcp_send_head(sk)) {
+ icsk->icsk_probes_out = 0;
+ return;
+ }
+
+ /* RFC 1122 4.2.2.17 requires the sender to stay open indefinitely as
+ * long as the receiver continues to respond probes. We support this by
+ * default and reset icsk_probes_out with incoming ACKs. But if the
+ * socket is orphaned or the user specifies TCP_USER_TIMEOUT, we
+ * kill the socket when the retry count and the time exceeds the
+ * corresponding system limit. We also implement similar policy when
+ * we use RTO to probe window in tcp_retransmit_timer().
+ */
+ start_ts = tcp_skb_timestamp(tcp_send_head(sk));
+ if (!start_ts)
+ skb_mstamp_get(&tcp_send_head(sk)->skb_mstamp);
+ else if (icsk->icsk_user_timeout &&
+ (s32)(tcp_time_stamp - start_ts) > icsk->icsk_user_timeout)
+ goto abort;
+
+ max_probes = sysctl_tcp_retries2;
+ if (sock_flag(sk, SOCK_DEAD)) {
+ const int alive = inet_csk_rto_backoff(icsk, TCP_RTO_MAX) < TCP_RTO_MAX;
+
+ max_probes = tcp_orphan_retries(sk, alive);
+ if (!alive && icsk->icsk_backoff >= max_probes)
+ goto abort;
+ if (tcp_out_of_resources(sk, true))
+ return;
+ }
+
+ if (icsk->icsk_probes_out > max_probes) {
+abort: tcp_write_err(sk);
+ } else {
+ /* Only send another probe if we didn't close things up. */
+ tcp_send_probe0(sk);
+ }
+}
+
+/*
+ * Timer for Fast Open socket to retransmit SYNACK. Note that the
+ * sk here is the child socket, not the parent (listener) socket.
+ */
+static void tcp_fastopen_synack_timer(struct sock *sk)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ int max_retries = icsk->icsk_syn_retries ? :
+ sysctl_tcp_synack_retries + 1; /* add one more retry for fastopen */
+ struct request_sock *req;
+
+ req = tcp_sk(sk)->fastopen_rsk;
+ req->rsk_ops->syn_ack_timeout(req);
+
+ if (req->num_timeout >= max_retries) {
+ tcp_write_err(sk);
+ return;
+ }
+ /* XXX (TFO) - Unlike regular SYN-ACK retransmit, we ignore error
+ * returned from rtx_syn_ack() to make it more persistent like
+ * regular retransmit because if the child socket has been accepted
+ * it's not good to give up too easily.
+ */
+ inet_rtx_syn_ack(sk, req);
+ req->num_timeout++;
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+ TCP_TIMEOUT_INIT << req->num_timeout, TCP_RTO_MAX);
+}
+
+/*
+ * The TCP retransmit timer.
+ */
+
+void tcp_retransmit_timer(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
+ if (tp->fastopen_rsk) {
+ WARN_ON_ONCE(sk->sk_state != TCP_SYN_RECV &&
+ sk->sk_state != TCP_FIN_WAIT1);
+ tcp_fastopen_synack_timer(sk);
+ /* Before we receive ACK to our SYN-ACK don't retransmit
+ * anything else (e.g., data or FIN segments).
+ */
+ return;
+ }
+ if (!tp->packets_out)
+ goto out;
+
+ WARN_ON(tcp_write_queue_empty(sk));
+
+ tp->tlp_high_seq = 0;
+
+ if (!tp->snd_wnd && !sock_flag(sk, SOCK_DEAD) &&
+ !((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))) {
+ /* Receiver dastardly shrinks window. Our retransmits
+ * become zero probes, but we should not timeout this
+ * connection. If the socket is an orphan, time it out,
+ * we cannot allow such beasts to hang infinitely.
+ */
+ struct inet_sock *inet = inet_sk(sk);
+ if (sk->sk_family == AF_INET) {
+ net_dbg_ratelimited("Peer %pI4:%u/%u unexpectedly shrunk window %u:%u (repaired)\n",
+ &inet->inet_daddr,
+ ntohs(inet->inet_dport),
+ inet->inet_num,
+ tp->snd_una, tp->snd_nxt);
+ }
+#if IS_ENABLED(CONFIG_IPV6)
+ else if (sk->sk_family == AF_INET6) {
+ net_dbg_ratelimited("Peer %pI6:%u/%u unexpectedly shrunk window %u:%u (repaired)\n",
+ &sk->sk_v6_daddr,
+ ntohs(inet->inet_dport),
+ inet->inet_num,
+ tp->snd_una, tp->snd_nxt);
+ }
+#endif
+ if (tcp_time_stamp - tp->rcv_tstamp > TCP_RTO_MAX) {
+ tcp_write_err(sk);
+ goto out;
+ }
+ tcp_enter_loss(sk);
+ tcp_retransmit_skb(sk, tcp_write_queue_head(sk));
+ __sk_dst_reset(sk);
+ goto out_reset_timer;
+ }
+
+ if (tcp_write_timeout(sk))
+ goto out;
+
+ if (icsk->icsk_retransmits == 0) {
+ int mib_idx;
+
+ if (icsk->icsk_ca_state == TCP_CA_Recovery) {
+ if (tcp_is_sack(tp))
+ mib_idx = LINUX_MIB_TCPSACKRECOVERYFAIL;
+ else
+ mib_idx = LINUX_MIB_TCPRENORECOVERYFAIL;
+ } else if (icsk->icsk_ca_state == TCP_CA_Loss) {
+ mib_idx = LINUX_MIB_TCPLOSSFAILURES;
+ } else if ((icsk->icsk_ca_state == TCP_CA_Disorder) ||
+ tp->sacked_out) {
+ if (tcp_is_sack(tp))
+ mib_idx = LINUX_MIB_TCPSACKFAILURES;
+ else
+ mib_idx = LINUX_MIB_TCPRENOFAILURES;
+ } else {
+ mib_idx = LINUX_MIB_TCPTIMEOUTS;
+ }
+ NET_INC_STATS_BH(sock_net(sk), mib_idx);
+ }
+
+ tcp_enter_loss(sk);
+
+ if (tcp_retransmit_skb(sk, tcp_write_queue_head(sk)) > 0) {
+ /* Retransmission failed because of local congestion,
+ * do not backoff.
+ */
+ if (!icsk->icsk_retransmits)
+ icsk->icsk_retransmits = 1;
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+ min(icsk->icsk_rto, TCP_RESOURCE_PROBE_INTERVAL),
+ TCP_RTO_MAX);
+ goto out;
+ }
+
+ /* Increase the timeout each time we retransmit. Note that
+ * we do not increase the rtt estimate. rto is initialized
+ * from rtt, but increases here. Jacobson (SIGCOMM 88) suggests
+ * that doubling rto each time is the least we can get away with.
+ * In KA9Q, Karn uses this for the first few times, and then
+ * goes to quadratic. netBSD doubles, but only goes up to *64,
+ * and clamps at 1 to 64 sec afterwards. Note that 120 sec is
+ * defined in the protocol as the maximum possible RTT. I guess
+ * we'll have to use something other than TCP to talk to the
+ * University of Mars.
+ *
+ * PAWS allows us longer timeouts and large windows, so once
+ * implemented ftp to mars will work nicely. We will have to fix
+ * the 120 second clamps though!
+ */
+ icsk->icsk_backoff++;
+ icsk->icsk_retransmits++;
+
+out_reset_timer:
+ /* If stream is thin, use linear timeouts. Since 'icsk_backoff' is
+ * used to reset timer, set to 0. Recalculate 'icsk_rto' as this
+ * might be increased if the stream oscillates between thin and thick,
+ * thus the old value might already be too high compared to the value
+ * set by 'tcp_set_rto' in tcp_input.c which resets the rto without
+ * backoff. Limit to TCP_THIN_LINEAR_RETRIES before initiating
+ * exponential backoff behaviour to avoid continue hammering
+ * linear-timeout retransmissions into a black hole
+ */
+ if (sk->sk_state == TCP_ESTABLISHED &&
+ (tp->thin_lto || sysctl_tcp_thin_linear_timeouts) &&
+ tcp_stream_is_thin(tp) &&
+ icsk->icsk_retransmits <= TCP_THIN_LINEAR_RETRIES) {
+ icsk->icsk_backoff = 0;
+ icsk->icsk_rto = min(__tcp_set_rto(tp), TCP_RTO_MAX);
+ } else {
+ /* Use normal (exponential) backoff */
+ icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX);
+ }
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX);
+ if (retransmits_timed_out(sk, sysctl_tcp_retries1 + 1, 0, 0))
+ __sk_dst_reset(sk);
+
+out:;
+}
+
+void tcp_write_timer_handler(struct sock *sk)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ int event;
+
+ if (sk->sk_state == TCP_CLOSE || !icsk->icsk_pending)
+ goto out;
+
+ if (time_after(icsk->icsk_timeout, jiffies)) {
+ sk_reset_timer(sk, &icsk->icsk_retransmit_timer, icsk->icsk_timeout);
+ goto out;
+ }
+
+ event = icsk->icsk_pending;
+
+ switch (event) {
+ case ICSK_TIME_EARLY_RETRANS:
+ tcp_resume_early_retransmit(sk);
+ break;
+ case ICSK_TIME_LOSS_PROBE:
+ tcp_send_loss_probe(sk);
+ break;
+ case ICSK_TIME_RETRANS:
+ icsk->icsk_pending = 0;
+ tcp_retransmit_timer(sk);
+ break;
+ case ICSK_TIME_PROBE0:
+ icsk->icsk_pending = 0;
+ tcp_probe_timer(sk);
+ break;
+ }
+
+out:
+ sk_mem_reclaim(sk);
+}
+
+static void tcp_write_timer(unsigned long data)
+{
+ struct sock *sk = (struct sock *)data;
+
+ bh_lock_sock(sk);
+ if (!sock_owned_by_user(sk)) {
+ tcp_write_timer_handler(sk);
+ } else {
+ /* deleguate our work to tcp_release_cb() */
+ if (!test_and_set_bit(TCP_WRITE_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags))
+ sock_hold(sk);
+ }
+ bh_unlock_sock(sk);
+ sock_put(sk);
+}
+
+void tcp_syn_ack_timeout(const struct request_sock *req)
+{
+ struct net *net = read_pnet(&inet_rsk(req)->ireq_net);
+
+ NET_INC_STATS_BH(net, LINUX_MIB_TCPTIMEOUTS);
+}
+EXPORT_SYMBOL(tcp_syn_ack_timeout);
+
+void tcp_set_keepalive(struct sock *sk, int val)
+{
+ if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))
+ return;
+
+ if (val && !sock_flag(sk, SOCK_KEEPOPEN))
+ inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tcp_sk(sk)));
+ else if (!val)
+ inet_csk_delete_keepalive_timer(sk);
+}
+
+
+static void tcp_keepalive_timer (unsigned long data)
+{
+ struct sock *sk = (struct sock *) data;
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+ u32 elapsed;
+
+ /* Only process if socket is not in use. */
+ bh_lock_sock(sk);
+ if (sock_owned_by_user(sk)) {
+ /* Try again later. */
+ inet_csk_reset_keepalive_timer (sk, HZ/20);
+ goto out;
+ }
+
+ if (sk->sk_state == TCP_LISTEN) {
+ pr_err("Hmm... keepalive on a LISTEN ???\n");
+ goto out;
+ }
+
+ if (sk->sk_state == TCP_FIN_WAIT2 && sock_flag(sk, SOCK_DEAD)) {
+ if (tp->linger2 >= 0) {
+ const int tmo = tcp_fin_time(sk) - TCP_TIMEWAIT_LEN;
+
+ if (tmo > 0) {
+ tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
+ goto out;
+ }
+ }
+ tcp_send_active_reset(sk, GFP_ATOMIC);
+ goto death;
+ }
+
+ if (!sock_flag(sk, SOCK_KEEPOPEN) || sk->sk_state == TCP_CLOSE)
+ goto out;
+
+ elapsed = keepalive_time_when(tp);
+
+ /* It is alive without keepalive 8) */
+ if (tp->packets_out || tcp_send_head(sk))
+ goto resched;
+
+ elapsed = keepalive_time_elapsed(tp);
+
+ if (elapsed >= keepalive_time_when(tp)) {
+ /* If the TCP_USER_TIMEOUT option is enabled, use that
+ * to determine when to timeout instead.
+ */
+ if ((icsk->icsk_user_timeout != 0 &&
+ elapsed >= icsk->icsk_user_timeout &&
+ icsk->icsk_probes_out > 0) ||
+ (icsk->icsk_user_timeout == 0 &&
+ icsk->icsk_probes_out >= keepalive_probes(tp))) {
+ tcp_send_active_reset(sk, GFP_ATOMIC);
+ tcp_write_err(sk);
+ goto out;
+ }
+ if (tcp_write_wakeup(sk) <= 0) {
+ icsk->icsk_probes_out++;
+ elapsed = keepalive_intvl_when(tp);
+ } else {
+ /* If keepalive was lost due to local congestion,
+ * try harder.
+ */
+ elapsed = TCP_RESOURCE_PROBE_INTERVAL;
+ }
+ } else {
+ /* It is tp->rcv_tstamp + keepalive_time_when(tp) */
+ elapsed = keepalive_time_when(tp) - elapsed;
+ }
+
+ sk_mem_reclaim(sk);
+
+resched:
+ inet_csk_reset_keepalive_timer (sk, elapsed);
+ goto out;
+
+death:
+ tcp_done(sk);
+
+out:
+ bh_unlock_sock(sk);
+ sock_put(sk);
+}
+
+void tcp_init_xmit_timers(struct sock *sk)
+{
+ inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer,
+ &tcp_keepalive_timer);
+}
+EXPORT_SYMBOL(tcp_init_xmit_timers);
diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c
new file mode 100644
index 000000000..a6cea1d5e
--- /dev/null
+++ b/net/ipv4/tcp_vegas.c
@@ -0,0 +1,337 @@
+/*
+ * TCP Vegas congestion control
+ *
+ * This is based on the congestion detection/avoidance scheme described in
+ * Lawrence S. Brakmo and Larry L. Peterson.
+ * "TCP Vegas: End to end congestion avoidance on a global internet."
+ * IEEE Journal on Selected Areas in Communication, 13(8):1465--1480,
+ * October 1995. Available from:
+ * ftp://ftp.cs.arizona.edu/xkernel/Papers/jsac.ps
+ *
+ * See http://www.cs.arizona.edu/xkernel/ for their implementation.
+ * The main aspects that distinguish this implementation from the
+ * Arizona Vegas implementation are:
+ * o We do not change the loss detection or recovery mechanisms of
+ * Linux in any way. Linux already recovers from losses quite well,
+ * using fine-grained timers, NewReno, and FACK.
+ * o To avoid the performance penalty imposed by increasing cwnd
+ * only every-other RTT during slow start, we increase during
+ * every RTT during slow start, just like Reno.
+ * o Largely to allow continuous cwnd growth during slow start,
+ * we use the rate at which ACKs come back as the "actual"
+ * rate, rather than the rate at which data is sent.
+ * o To speed convergence to the right rate, we set the cwnd
+ * to achieve the right ("actual") rate when we exit slow start.
+ * o To filter out the noise caused by delayed ACKs, we use the
+ * minimum RTT sample observed during the last RTT to calculate
+ * the actual rate.
+ * o When the sender re-starts from idle, it waits until it has
+ * received ACKs for an entire flight of new data before making
+ * a cwnd adjustment decision. The original Vegas implementation
+ * assumed senders never went idle.
+ */
+
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/inet_diag.h>
+
+#include <net/tcp.h>
+
+#include "tcp_vegas.h"
+
+static int alpha = 2;
+static int beta = 4;
+static int gamma = 1;
+
+module_param(alpha, int, 0644);
+MODULE_PARM_DESC(alpha, "lower bound of packets in network");
+module_param(beta, int, 0644);
+MODULE_PARM_DESC(beta, "upper bound of packets in network");
+module_param(gamma, int, 0644);
+MODULE_PARM_DESC(gamma, "limit on increase (scale by 2)");
+
+/* There are several situations when we must "re-start" Vegas:
+ *
+ * o when a connection is established
+ * o after an RTO
+ * o after fast recovery
+ * o when we send a packet and there is no outstanding
+ * unacknowledged data (restarting an idle connection)
+ *
+ * In these circumstances we cannot do a Vegas calculation at the
+ * end of the first RTT, because any calculation we do is using
+ * stale info -- both the saved cwnd and congestion feedback are
+ * stale.
+ *
+ * Instead we must wait until the completion of an RTT during
+ * which we actually receive ACKs.
+ */
+static void vegas_enable(struct sock *sk)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ struct vegas *vegas = inet_csk_ca(sk);
+
+ /* Begin taking Vegas samples next time we send something. */
+ vegas->doing_vegas_now = 1;
+
+ /* Set the beginning of the next send window. */
+ vegas->beg_snd_nxt = tp->snd_nxt;
+
+ vegas->cntRTT = 0;
+ vegas->minRTT = 0x7fffffff;
+}
+
+/* Stop taking Vegas samples for now. */
+static inline void vegas_disable(struct sock *sk)
+{
+ struct vegas *vegas = inet_csk_ca(sk);
+
+ vegas->doing_vegas_now = 0;
+}
+
+void tcp_vegas_init(struct sock *sk)
+{
+ struct vegas *vegas = inet_csk_ca(sk);
+
+ vegas->baseRTT = 0x7fffffff;
+ vegas_enable(sk);
+}
+EXPORT_SYMBOL_GPL(tcp_vegas_init);
+
+/* Do RTT sampling needed for Vegas.
+ * Basically we:
+ * o min-filter RTT samples from within an RTT to get the current
+ * propagation delay + queuing delay (we are min-filtering to try to
+ * avoid the effects of delayed ACKs)
+ * o min-filter RTT samples from a much longer window (forever for now)
+ * to find the propagation delay (baseRTT)
+ */
+void tcp_vegas_pkts_acked(struct sock *sk, u32 cnt, s32 rtt_us)
+{
+ struct vegas *vegas = inet_csk_ca(sk);
+ u32 vrtt;
+
+ if (rtt_us < 0)
+ return;
+
+ /* Never allow zero rtt or baseRTT */
+ vrtt = rtt_us + 1;
+
+ /* Filter to find propagation delay: */
+ if (vrtt < vegas->baseRTT)
+ vegas->baseRTT = vrtt;
+
+ /* Find the min RTT during the last RTT to find
+ * the current prop. delay + queuing delay:
+ */
+ vegas->minRTT = min(vegas->minRTT, vrtt);
+ vegas->cntRTT++;
+}
+EXPORT_SYMBOL_GPL(tcp_vegas_pkts_acked);
+
+void tcp_vegas_state(struct sock *sk, u8 ca_state)
+{
+ if (ca_state == TCP_CA_Open)
+ vegas_enable(sk);
+ else
+ vegas_disable(sk);
+}
+EXPORT_SYMBOL_GPL(tcp_vegas_state);
+
+/*
+ * If the connection is idle and we are restarting,
+ * then we don't want to do any Vegas calculations
+ * until we get fresh RTT samples. So when we
+ * restart, we reset our Vegas state to a clean
+ * slate. After we get acks for this flight of
+ * packets, _then_ we can make Vegas calculations
+ * again.
+ */
+void tcp_vegas_cwnd_event(struct sock *sk, enum tcp_ca_event event)
+{
+ if (event == CA_EVENT_CWND_RESTART ||
+ event == CA_EVENT_TX_START)
+ tcp_vegas_init(sk);
+}
+EXPORT_SYMBOL_GPL(tcp_vegas_cwnd_event);
+
+static inline u32 tcp_vegas_ssthresh(struct tcp_sock *tp)
+{
+ return min(tp->snd_ssthresh, tp->snd_cwnd-1);
+}
+
+static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct vegas *vegas = inet_csk_ca(sk);
+
+ if (!vegas->doing_vegas_now) {
+ tcp_reno_cong_avoid(sk, ack, acked);
+ return;
+ }
+
+ if (after(ack, vegas->beg_snd_nxt)) {
+ /* Do the Vegas once-per-RTT cwnd adjustment. */
+
+ /* Save the extent of the current window so we can use this
+ * at the end of the next RTT.
+ */
+ vegas->beg_snd_nxt = tp->snd_nxt;
+
+ /* We do the Vegas calculations only if we got enough RTT
+ * samples that we can be reasonably sure that we got
+ * at least one RTT sample that wasn't from a delayed ACK.
+ * If we only had 2 samples total,
+ * then that means we're getting only 1 ACK per RTT, which
+ * means they're almost certainly delayed ACKs.
+ * If we have 3 samples, we should be OK.
+ */
+
+ if (vegas->cntRTT <= 2) {
+ /* We don't have enough RTT samples to do the Vegas
+ * calculation, so we'll behave like Reno.
+ */
+ tcp_reno_cong_avoid(sk, ack, acked);
+ } else {
+ u32 rtt, diff;
+ u64 target_cwnd;
+
+ /* We have enough RTT samples, so, using the Vegas
+ * algorithm, we determine if we should increase or
+ * decrease cwnd, and by how much.
+ */
+
+ /* Pluck out the RTT we are using for the Vegas
+ * calculations. This is the min RTT seen during the
+ * last RTT. Taking the min filters out the effects
+ * of delayed ACKs, at the cost of noticing congestion
+ * a bit later.
+ */
+ rtt = vegas->minRTT;
+
+ /* Calculate the cwnd we should have, if we weren't
+ * going too fast.
+ *
+ * This is:
+ * (actual rate in segments) * baseRTT
+ */
+ target_cwnd = (u64)tp->snd_cwnd * vegas->baseRTT;
+ do_div(target_cwnd, rtt);
+
+ /* Calculate the difference between the window we had,
+ * and the window we would like to have. This quantity
+ * is the "Diff" from the Arizona Vegas papers.
+ */
+ diff = tp->snd_cwnd * (rtt-vegas->baseRTT) / vegas->baseRTT;
+
+ if (diff > gamma && tp->snd_cwnd <= tp->snd_ssthresh) {
+ /* Going too fast. Time to slow down
+ * and switch to congestion avoidance.
+ */
+
+ /* Set cwnd to match the actual rate
+ * exactly:
+ * cwnd = (actual rate) * baseRTT
+ * Then we add 1 because the integer
+ * truncation robs us of full link
+ * utilization.
+ */
+ tp->snd_cwnd = min(tp->snd_cwnd, (u32)target_cwnd+1);
+ tp->snd_ssthresh = tcp_vegas_ssthresh(tp);
+
+ } else if (tp->snd_cwnd <= tp->snd_ssthresh) {
+ /* Slow start. */
+ tcp_slow_start(tp, acked);
+ } else {
+ /* Congestion avoidance. */
+
+ /* Figure out where we would like cwnd
+ * to be.
+ */
+ if (diff > beta) {
+ /* The old window was too fast, so
+ * we slow down.
+ */
+ tp->snd_cwnd--;
+ tp->snd_ssthresh
+ = tcp_vegas_ssthresh(tp);
+ } else if (diff < alpha) {
+ /* We don't have enough extra packets
+ * in the network, so speed up.
+ */
+ tp->snd_cwnd++;
+ } else {
+ /* Sending just as fast as we
+ * should be.
+ */
+ }
+ }
+
+ if (tp->snd_cwnd < 2)
+ tp->snd_cwnd = 2;
+ else if (tp->snd_cwnd > tp->snd_cwnd_clamp)
+ tp->snd_cwnd = tp->snd_cwnd_clamp;
+
+ tp->snd_ssthresh = tcp_current_ssthresh(sk);
+ }
+
+ /* Wipe the slate clean for the next RTT. */
+ vegas->cntRTT = 0;
+ vegas->minRTT = 0x7fffffff;
+ }
+ /* Use normal slow start */
+ else if (tp->snd_cwnd <= tp->snd_ssthresh)
+ tcp_slow_start(tp, acked);
+}
+
+/* Extract info for Tcp socket info provided via netlink. */
+size_t tcp_vegas_get_info(struct sock *sk, u32 ext, int *attr,
+ union tcp_cc_info *info)
+{
+ const struct vegas *ca = inet_csk_ca(sk);
+
+ if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) {
+ info->vegas.tcpv_enabled = ca->doing_vegas_now,
+ info->vegas.tcpv_rttcnt = ca->cntRTT,
+ info->vegas.tcpv_rtt = ca->baseRTT,
+ info->vegas.tcpv_minrtt = ca->minRTT,
+
+ *attr = INET_DIAG_VEGASINFO;
+ return sizeof(struct tcpvegas_info);
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(tcp_vegas_get_info);
+
+static struct tcp_congestion_ops tcp_vegas __read_mostly = {
+ .init = tcp_vegas_init,
+ .ssthresh = tcp_reno_ssthresh,
+ .cong_avoid = tcp_vegas_cong_avoid,
+ .pkts_acked = tcp_vegas_pkts_acked,
+ .set_state = tcp_vegas_state,
+ .cwnd_event = tcp_vegas_cwnd_event,
+ .get_info = tcp_vegas_get_info,
+
+ .owner = THIS_MODULE,
+ .name = "vegas",
+};
+
+static int __init tcp_vegas_register(void)
+{
+ BUILD_BUG_ON(sizeof(struct vegas) > ICSK_CA_PRIV_SIZE);
+ tcp_register_congestion_control(&tcp_vegas);
+ return 0;
+}
+
+static void __exit tcp_vegas_unregister(void)
+{
+ tcp_unregister_congestion_control(&tcp_vegas);
+}
+
+module_init(tcp_vegas_register);
+module_exit(tcp_vegas_unregister);
+
+MODULE_AUTHOR("Stephen Hemminger");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("TCP Vegas");
diff --git a/net/ipv4/tcp_vegas.h b/net/ipv4/tcp_vegas.h
new file mode 100644
index 000000000..ef9da5306
--- /dev/null
+++ b/net/ipv4/tcp_vegas.h
@@ -0,0 +1,25 @@
+/*
+ * TCP Vegas congestion control interface
+ */
+#ifndef __TCP_VEGAS_H
+#define __TCP_VEGAS_H 1
+
+/* Vegas variables */
+struct vegas {
+ u32 beg_snd_nxt; /* right edge during last RTT */
+ u32 beg_snd_una; /* left edge during last RTT */
+ u32 beg_snd_cwnd; /* saves the size of the cwnd */
+ u8 doing_vegas_now;/* if true, do vegas for this RTT */
+ u16 cntRTT; /* # of RTTs measured within last RTT */
+ u32 minRTT; /* min of RTTs measured within last RTT (in usec) */
+ u32 baseRTT; /* the min of all Vegas RTT measurements seen (in usec) */
+};
+
+void tcp_vegas_init(struct sock *sk);
+void tcp_vegas_state(struct sock *sk, u8 ca_state);
+void tcp_vegas_pkts_acked(struct sock *sk, u32 cnt, s32 rtt_us);
+void tcp_vegas_cwnd_event(struct sock *sk, enum tcp_ca_event event);
+size_t tcp_vegas_get_info(struct sock *sk, u32 ext, int *attr,
+ union tcp_cc_info *info);
+
+#endif /* __TCP_VEGAS_H */
diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c
new file mode 100644
index 000000000..112151eee
--- /dev/null
+++ b/net/ipv4/tcp_veno.c
@@ -0,0 +1,232 @@
+/*
+ * TCP Veno congestion control
+ *
+ * This is based on the congestion detection/avoidance scheme described in
+ * C. P. Fu, S. C. Liew.
+ * "TCP Veno: TCP Enhancement for Transmission over Wireless Access Networks."
+ * IEEE Journal on Selected Areas in Communication,
+ * Feb. 2003.
+ * See http://www.ie.cuhk.edu.hk/fileadmin/staff_upload/soung/Journal/J3.pdf
+ */
+
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/inet_diag.h>
+
+#include <net/tcp.h>
+
+/* Default values of the Veno variables, in fixed-point representation
+ * with V_PARAM_SHIFT bits to the right of the binary point.
+ */
+#define V_PARAM_SHIFT 1
+static const int beta = 3 << V_PARAM_SHIFT;
+
+/* Veno variables */
+struct veno {
+ u8 doing_veno_now; /* if true, do veno for this rtt */
+ u16 cntrtt; /* # of rtts measured within last rtt */
+ u32 minrtt; /* min of rtts measured within last rtt (in usec) */
+ u32 basertt; /* the min of all Veno rtt measurements seen (in usec) */
+ u32 inc; /* decide whether to increase cwnd */
+ u32 diff; /* calculate the diff rate */
+};
+
+/* There are several situations when we must "re-start" Veno:
+ *
+ * o when a connection is established
+ * o after an RTO
+ * o after fast recovery
+ * o when we send a packet and there is no outstanding
+ * unacknowledged data (restarting an idle connection)
+ *
+ */
+static inline void veno_enable(struct sock *sk)
+{
+ struct veno *veno = inet_csk_ca(sk);
+
+ /* turn on Veno */
+ veno->doing_veno_now = 1;
+
+ veno->minrtt = 0x7fffffff;
+}
+
+static inline void veno_disable(struct sock *sk)
+{
+ struct veno *veno = inet_csk_ca(sk);
+
+ /* turn off Veno */
+ veno->doing_veno_now = 0;
+}
+
+static void tcp_veno_init(struct sock *sk)
+{
+ struct veno *veno = inet_csk_ca(sk);
+
+ veno->basertt = 0x7fffffff;
+ veno->inc = 1;
+ veno_enable(sk);
+}
+
+/* Do rtt sampling needed for Veno. */
+static void tcp_veno_pkts_acked(struct sock *sk, u32 cnt, s32 rtt_us)
+{
+ struct veno *veno = inet_csk_ca(sk);
+ u32 vrtt;
+
+ if (rtt_us < 0)
+ return;
+
+ /* Never allow zero rtt or baseRTT */
+ vrtt = rtt_us + 1;
+
+ /* Filter to find propagation delay: */
+ if (vrtt < veno->basertt)
+ veno->basertt = vrtt;
+
+ /* Find the min rtt during the last rtt to find
+ * the current prop. delay + queuing delay:
+ */
+ veno->minrtt = min(veno->minrtt, vrtt);
+ veno->cntrtt++;
+}
+
+static void tcp_veno_state(struct sock *sk, u8 ca_state)
+{
+ if (ca_state == TCP_CA_Open)
+ veno_enable(sk);
+ else
+ veno_disable(sk);
+}
+
+/*
+ * If the connection is idle and we are restarting,
+ * then we don't want to do any Veno calculations
+ * until we get fresh rtt samples. So when we
+ * restart, we reset our Veno state to a clean
+ * state. After we get acks for this flight of
+ * packets, _then_ we can make Veno calculations
+ * again.
+ */
+static void tcp_veno_cwnd_event(struct sock *sk, enum tcp_ca_event event)
+{
+ if (event == CA_EVENT_CWND_RESTART || event == CA_EVENT_TX_START)
+ tcp_veno_init(sk);
+}
+
+static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 acked)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct veno *veno = inet_csk_ca(sk);
+
+ if (!veno->doing_veno_now) {
+ tcp_reno_cong_avoid(sk, ack, acked);
+ return;
+ }
+
+ /* limited by applications */
+ if (!tcp_is_cwnd_limited(sk))
+ return;
+
+ /* We do the Veno calculations only if we got enough rtt samples */
+ if (veno->cntrtt <= 2) {
+ /* We don't have enough rtt samples to do the Veno
+ * calculation, so we'll behave like Reno.
+ */
+ tcp_reno_cong_avoid(sk, ack, acked);
+ } else {
+ u64 target_cwnd;
+ u32 rtt;
+
+ /* We have enough rtt samples, so, using the Veno
+ * algorithm, we determine the state of the network.
+ */
+
+ rtt = veno->minrtt;
+
+ target_cwnd = (u64)tp->snd_cwnd * veno->basertt;
+ target_cwnd <<= V_PARAM_SHIFT;
+ do_div(target_cwnd, rtt);
+
+ veno->diff = (tp->snd_cwnd << V_PARAM_SHIFT) - target_cwnd;
+
+ if (tp->snd_cwnd <= tp->snd_ssthresh) {
+ /* Slow start. */
+ tcp_slow_start(tp, acked);
+ } else {
+ /* Congestion avoidance. */
+ if (veno->diff < beta) {
+ /* In the "non-congestive state", increase cwnd
+ * every rtt.
+ */
+ tcp_cong_avoid_ai(tp, tp->snd_cwnd, 1);
+ } else {
+ /* In the "congestive state", increase cwnd
+ * every other rtt.
+ */
+ if (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
+ if (veno->inc &&
+ tp->snd_cwnd < tp->snd_cwnd_clamp) {
+ tp->snd_cwnd++;
+ veno->inc = 0;
+ } else
+ veno->inc = 1;
+ tp->snd_cwnd_cnt = 0;
+ } else
+ tp->snd_cwnd_cnt++;
+ }
+ }
+ if (tp->snd_cwnd < 2)
+ tp->snd_cwnd = 2;
+ else if (tp->snd_cwnd > tp->snd_cwnd_clamp)
+ tp->snd_cwnd = tp->snd_cwnd_clamp;
+ }
+ /* Wipe the slate clean for the next rtt. */
+ /* veno->cntrtt = 0; */
+ veno->minrtt = 0x7fffffff;
+}
+
+/* Veno MD phase */
+static u32 tcp_veno_ssthresh(struct sock *sk)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ struct veno *veno = inet_csk_ca(sk);
+
+ if (veno->diff < beta)
+ /* in "non-congestive state", cut cwnd by 1/5 */
+ return max(tp->snd_cwnd * 4 / 5, 2U);
+ else
+ /* in "congestive state", cut cwnd by 1/2 */
+ return max(tp->snd_cwnd >> 1U, 2U);
+}
+
+static struct tcp_congestion_ops tcp_veno __read_mostly = {
+ .init = tcp_veno_init,
+ .ssthresh = tcp_veno_ssthresh,
+ .cong_avoid = tcp_veno_cong_avoid,
+ .pkts_acked = tcp_veno_pkts_acked,
+ .set_state = tcp_veno_state,
+ .cwnd_event = tcp_veno_cwnd_event,
+
+ .owner = THIS_MODULE,
+ .name = "veno",
+};
+
+static int __init tcp_veno_register(void)
+{
+ BUILD_BUG_ON(sizeof(struct veno) > ICSK_CA_PRIV_SIZE);
+ tcp_register_congestion_control(&tcp_veno);
+ return 0;
+}
+
+static void __exit tcp_veno_unregister(void)
+{
+ tcp_unregister_congestion_control(&tcp_veno);
+}
+
+module_init(tcp_veno_register);
+module_exit(tcp_veno_unregister);
+
+MODULE_AUTHOR("Bin Zhou, Cheng Peng Fu");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("TCP Veno");
diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c
new file mode 100644
index 000000000..c10732e39
--- /dev/null
+++ b/net/ipv4/tcp_westwood.c
@@ -0,0 +1,305 @@
+/*
+ * TCP Westwood+: end-to-end bandwidth estimation for TCP
+ *
+ * Angelo Dell'Aera: author of the first version of TCP Westwood+ in Linux 2.4
+ *
+ * Support at http://c3lab.poliba.it/index.php/Westwood
+ * Main references in literature:
+ *
+ * - Mascolo S, Casetti, M. Gerla et al.
+ * "TCP Westwood: bandwidth estimation for TCP" Proc. ACM Mobicom 2001
+ *
+ * - A. Grieco, s. Mascolo
+ * "Performance evaluation of New Reno, Vegas, Westwood+ TCP" ACM Computer
+ * Comm. Review, 2004
+ *
+ * - A. Dell'Aera, L. Grieco, S. Mascolo.
+ * "Linux 2.4 Implementation of Westwood+ TCP with Rate-Halving :
+ * A Performance Evaluation Over the Internet" (ICC 2004), Paris, June 2004
+ *
+ * Westwood+ employs end-to-end bandwidth measurement to set cwnd and
+ * ssthresh after packet loss. The probing phase is as the original Reno.
+ */
+
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/inet_diag.h>
+#include <net/tcp.h>
+
+/* TCP Westwood structure */
+struct westwood {
+ u32 bw_ns_est; /* first bandwidth estimation..not too smoothed 8) */
+ u32 bw_est; /* bandwidth estimate */
+ u32 rtt_win_sx; /* here starts a new evaluation... */
+ u32 bk;
+ u32 snd_una; /* used for evaluating the number of acked bytes */
+ u32 cumul_ack;
+ u32 accounted;
+ u32 rtt;
+ u32 rtt_min; /* minimum observed RTT */
+ u8 first_ack; /* flag which infers that this is the first ack */
+ u8 reset_rtt_min; /* Reset RTT min to next RTT sample*/
+};
+
+/* TCP Westwood functions and constants */
+#define TCP_WESTWOOD_RTT_MIN (HZ/20) /* 50ms */
+#define TCP_WESTWOOD_INIT_RTT (20*HZ) /* maybe too conservative?! */
+
+/*
+ * @tcp_westwood_create
+ * This function initializes fields used in TCP Westwood+,
+ * it is called after the initial SYN, so the sequence numbers
+ * are correct but new passive connections we have no
+ * information about RTTmin at this time so we simply set it to
+ * TCP_WESTWOOD_INIT_RTT. This value was chosen to be too conservative
+ * since in this way we're sure it will be updated in a consistent
+ * way as soon as possible. It will reasonably happen within the first
+ * RTT period of the connection lifetime.
+ */
+static void tcp_westwood_init(struct sock *sk)
+{
+ struct westwood *w = inet_csk_ca(sk);
+
+ w->bk = 0;
+ w->bw_ns_est = 0;
+ w->bw_est = 0;
+ w->accounted = 0;
+ w->cumul_ack = 0;
+ w->reset_rtt_min = 1;
+ w->rtt_min = w->rtt = TCP_WESTWOOD_INIT_RTT;
+ w->rtt_win_sx = tcp_time_stamp;
+ w->snd_una = tcp_sk(sk)->snd_una;
+ w->first_ack = 1;
+}
+
+/*
+ * @westwood_do_filter
+ * Low-pass filter. Implemented using constant coefficients.
+ */
+static inline u32 westwood_do_filter(u32 a, u32 b)
+{
+ return ((7 * a) + b) >> 3;
+}
+
+static void westwood_filter(struct westwood *w, u32 delta)
+{
+ /* If the filter is empty fill it with the first sample of bandwidth */
+ if (w->bw_ns_est == 0 && w->bw_est == 0) {
+ w->bw_ns_est = w->bk / delta;
+ w->bw_est = w->bw_ns_est;
+ } else {
+ w->bw_ns_est = westwood_do_filter(w->bw_ns_est, w->bk / delta);
+ w->bw_est = westwood_do_filter(w->bw_est, w->bw_ns_est);
+ }
+}
+
+/*
+ * @westwood_pkts_acked
+ * Called after processing group of packets.
+ * but all westwood needs is the last sample of srtt.
+ */
+static void tcp_westwood_pkts_acked(struct sock *sk, u32 cnt, s32 rtt)
+{
+ struct westwood *w = inet_csk_ca(sk);
+
+ if (rtt > 0)
+ w->rtt = usecs_to_jiffies(rtt);
+}
+
+/*
+ * @westwood_update_window
+ * It updates RTT evaluation window if it is the right moment to do
+ * it. If so it calls filter for evaluating bandwidth.
+ */
+static void westwood_update_window(struct sock *sk)
+{
+ struct westwood *w = inet_csk_ca(sk);
+ s32 delta = tcp_time_stamp - w->rtt_win_sx;
+
+ /* Initialize w->snd_una with the first acked sequence number in order
+ * to fix mismatch between tp->snd_una and w->snd_una for the first
+ * bandwidth sample
+ */
+ if (w->first_ack) {
+ w->snd_una = tcp_sk(sk)->snd_una;
+ w->first_ack = 0;
+ }
+
+ /*
+ * See if a RTT-window has passed.
+ * Be careful since if RTT is less than
+ * 50ms we don't filter but we continue 'building the sample'.
+ * This minimum limit was chosen since an estimation on small
+ * time intervals is better to avoid...
+ * Obviously on a LAN we reasonably will always have
+ * right_bound = left_bound + WESTWOOD_RTT_MIN
+ */
+ if (w->rtt && delta > max_t(u32, w->rtt, TCP_WESTWOOD_RTT_MIN)) {
+ westwood_filter(w, delta);
+
+ w->bk = 0;
+ w->rtt_win_sx = tcp_time_stamp;
+ }
+}
+
+static inline void update_rtt_min(struct westwood *w)
+{
+ if (w->reset_rtt_min) {
+ w->rtt_min = w->rtt;
+ w->reset_rtt_min = 0;
+ } else
+ w->rtt_min = min(w->rtt, w->rtt_min);
+}
+
+/*
+ * @westwood_fast_bw
+ * It is called when we are in fast path. In particular it is called when
+ * header prediction is successful. In such case in fact update is
+ * straight forward and doesn't need any particular care.
+ */
+static inline void westwood_fast_bw(struct sock *sk)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ struct westwood *w = inet_csk_ca(sk);
+
+ westwood_update_window(sk);
+
+ w->bk += tp->snd_una - w->snd_una;
+ w->snd_una = tp->snd_una;
+ update_rtt_min(w);
+}
+
+/*
+ * @westwood_acked_count
+ * This function evaluates cumul_ack for evaluating bk in case of
+ * delayed or partial acks.
+ */
+static inline u32 westwood_acked_count(struct sock *sk)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ struct westwood *w = inet_csk_ca(sk);
+
+ w->cumul_ack = tp->snd_una - w->snd_una;
+
+ /* If cumul_ack is 0 this is a dupack since it's not moving
+ * tp->snd_una.
+ */
+ if (!w->cumul_ack) {
+ w->accounted += tp->mss_cache;
+ w->cumul_ack = tp->mss_cache;
+ }
+
+ if (w->cumul_ack > tp->mss_cache) {
+ /* Partial or delayed ack */
+ if (w->accounted >= w->cumul_ack) {
+ w->accounted -= w->cumul_ack;
+ w->cumul_ack = tp->mss_cache;
+ } else {
+ w->cumul_ack -= w->accounted;
+ w->accounted = 0;
+ }
+ }
+
+ w->snd_una = tp->snd_una;
+
+ return w->cumul_ack;
+}
+
+/*
+ * TCP Westwood
+ * Here limit is evaluated as Bw estimation*RTTmin (for obtaining it
+ * in packets we use mss_cache). Rttmin is guaranteed to be >= 2
+ * so avoids ever returning 0.
+ */
+static u32 tcp_westwood_bw_rttmin(const struct sock *sk)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ const struct westwood *w = inet_csk_ca(sk);
+
+ return max_t(u32, (w->bw_est * w->rtt_min) / tp->mss_cache, 2);
+}
+
+static void tcp_westwood_ack(struct sock *sk, u32 ack_flags)
+{
+ if (ack_flags & CA_ACK_SLOWPATH) {
+ struct westwood *w = inet_csk_ca(sk);
+
+ westwood_update_window(sk);
+ w->bk += westwood_acked_count(sk);
+
+ update_rtt_min(w);
+ return;
+ }
+
+ westwood_fast_bw(sk);
+}
+
+static void tcp_westwood_event(struct sock *sk, enum tcp_ca_event event)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct westwood *w = inet_csk_ca(sk);
+
+ switch (event) {
+ case CA_EVENT_COMPLETE_CWR:
+ tp->snd_cwnd = tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk);
+ break;
+ case CA_EVENT_LOSS:
+ tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk);
+ /* Update RTT_min when next ack arrives */
+ w->reset_rtt_min = 1;
+ break;
+ default:
+ /* don't care */
+ break;
+ }
+}
+
+/* Extract info for Tcp socket info provided via netlink. */
+static size_t tcp_westwood_info(struct sock *sk, u32 ext, int *attr,
+ union tcp_cc_info *info)
+{
+ const struct westwood *ca = inet_csk_ca(sk);
+
+ if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) {
+ info->vegas.tcpv_enabled = 1;
+ info->vegas.tcpv_rttcnt = 0;
+ info->vegas.tcpv_rtt = jiffies_to_usecs(ca->rtt),
+ info->vegas.tcpv_minrtt = jiffies_to_usecs(ca->rtt_min),
+
+ *attr = INET_DIAG_VEGASINFO;
+ return sizeof(struct tcpvegas_info);
+ }
+ return 0;
+}
+
+static struct tcp_congestion_ops tcp_westwood __read_mostly = {
+ .init = tcp_westwood_init,
+ .ssthresh = tcp_reno_ssthresh,
+ .cong_avoid = tcp_reno_cong_avoid,
+ .cwnd_event = tcp_westwood_event,
+ .in_ack_event = tcp_westwood_ack,
+ .get_info = tcp_westwood_info,
+ .pkts_acked = tcp_westwood_pkts_acked,
+
+ .owner = THIS_MODULE,
+ .name = "westwood"
+};
+
+static int __init tcp_westwood_register(void)
+{
+ BUILD_BUG_ON(sizeof(struct westwood) > ICSK_CA_PRIV_SIZE);
+ return tcp_register_congestion_control(&tcp_westwood);
+}
+
+static void __exit tcp_westwood_unregister(void)
+{
+ tcp_unregister_congestion_control(&tcp_westwood);
+}
+
+module_init(tcp_westwood_register);
+module_exit(tcp_westwood_unregister);
+
+MODULE_AUTHOR("Stephen Hemminger, Angelo Dell'Aera");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("TCP Westwood+");
diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c
new file mode 100644
index 000000000..17d356629
--- /dev/null
+++ b/net/ipv4/tcp_yeah.c
@@ -0,0 +1,255 @@
+/*
+ *
+ * YeAH TCP
+ *
+ * For further details look at:
+ * https://web.archive.org/web/20080316215752/http://wil.cs.caltech.edu/pfldnet2007/paper/YeAH_TCP.pdf
+ *
+ */
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/inet_diag.h>
+
+#include <net/tcp.h>
+
+#include "tcp_vegas.h"
+
+#define TCP_YEAH_ALPHA 80 /* number of packets queued at the bottleneck */
+#define TCP_YEAH_GAMMA 1 /* fraction of queue to be removed per rtt */
+#define TCP_YEAH_DELTA 3 /* log minimum fraction of cwnd to be removed on loss */
+#define TCP_YEAH_EPSILON 1 /* log maximum fraction to be removed on early decongestion */
+#define TCP_YEAH_PHY 8 /* maximum delta from base */
+#define TCP_YEAH_RHO 16 /* minimum number of consecutive rtt to consider competition on loss */
+#define TCP_YEAH_ZETA 50 /* minimum number of state switches to reset reno_count */
+
+#define TCP_SCALABLE_AI_CNT 100U
+
+/* YeAH variables */
+struct yeah {
+ struct vegas vegas; /* must be first */
+
+ /* YeAH */
+ u32 lastQ;
+ u32 doing_reno_now;
+
+ u32 reno_count;
+ u32 fast_count;
+
+ u32 pkts_acked;
+};
+
+static void tcp_yeah_init(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct yeah *yeah = inet_csk_ca(sk);
+
+ tcp_vegas_init(sk);
+
+ yeah->doing_reno_now = 0;
+ yeah->lastQ = 0;
+
+ yeah->reno_count = 2;
+
+ /* Ensure the MD arithmetic works. This is somewhat pedantic,
+ * since I don't think we will see a cwnd this large. :) */
+ tp->snd_cwnd_clamp = min_t(u32, tp->snd_cwnd_clamp, 0xffffffff/128);
+}
+
+static void tcp_yeah_pkts_acked(struct sock *sk, u32 pkts_acked, s32 rtt_us)
+{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+ struct yeah *yeah = inet_csk_ca(sk);
+
+ if (icsk->icsk_ca_state == TCP_CA_Open)
+ yeah->pkts_acked = pkts_acked;
+
+ tcp_vegas_pkts_acked(sk, pkts_acked, rtt_us);
+}
+
+static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 acked)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct yeah *yeah = inet_csk_ca(sk);
+
+ if (!tcp_is_cwnd_limited(sk))
+ return;
+
+ if (tp->snd_cwnd <= tp->snd_ssthresh)
+ tcp_slow_start(tp, acked);
+
+ else if (!yeah->doing_reno_now) {
+ /* Scalable */
+
+ tp->snd_cwnd_cnt += yeah->pkts_acked;
+ if (tp->snd_cwnd_cnt > min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT)) {
+ if (tp->snd_cwnd < tp->snd_cwnd_clamp)
+ tp->snd_cwnd++;
+ tp->snd_cwnd_cnt = 0;
+ }
+
+ yeah->pkts_acked = 1;
+
+ } else {
+ /* Reno */
+ tcp_cong_avoid_ai(tp, tp->snd_cwnd, 1);
+ }
+
+ /* The key players are v_vegas.beg_snd_una and v_beg_snd_nxt.
+ *
+ * These are so named because they represent the approximate values
+ * of snd_una and snd_nxt at the beginning of the current RTT. More
+ * precisely, they represent the amount of data sent during the RTT.
+ * At the end of the RTT, when we receive an ACK for v_beg_snd_nxt,
+ * we will calculate that (v_beg_snd_nxt - v_vegas.beg_snd_una) outstanding
+ * bytes of data have been ACKed during the course of the RTT, giving
+ * an "actual" rate of:
+ *
+ * (v_beg_snd_nxt - v_vegas.beg_snd_una) / (rtt duration)
+ *
+ * Unfortunately, v_vegas.beg_snd_una is not exactly equal to snd_una,
+ * because delayed ACKs can cover more than one segment, so they
+ * don't line up yeahly with the boundaries of RTTs.
+ *
+ * Another unfortunate fact of life is that delayed ACKs delay the
+ * advance of the left edge of our send window, so that the number
+ * of bytes we send in an RTT is often less than our cwnd will allow.
+ * So we keep track of our cwnd separately, in v_beg_snd_cwnd.
+ */
+
+ if (after(ack, yeah->vegas.beg_snd_nxt)) {
+ /* We do the Vegas calculations only if we got enough RTT
+ * samples that we can be reasonably sure that we got
+ * at least one RTT sample that wasn't from a delayed ACK.
+ * If we only had 2 samples total,
+ * then that means we're getting only 1 ACK per RTT, which
+ * means they're almost certainly delayed ACKs.
+ * If we have 3 samples, we should be OK.
+ */
+
+ if (yeah->vegas.cntRTT > 2) {
+ u32 rtt, queue;
+ u64 bw;
+
+ /* We have enough RTT samples, so, using the Vegas
+ * algorithm, we determine if we should increase or
+ * decrease cwnd, and by how much.
+ */
+
+ /* Pluck out the RTT we are using for the Vegas
+ * calculations. This is the min RTT seen during the
+ * last RTT. Taking the min filters out the effects
+ * of delayed ACKs, at the cost of noticing congestion
+ * a bit later.
+ */
+ rtt = yeah->vegas.minRTT;
+
+ /* Compute excess number of packets above bandwidth
+ * Avoid doing full 64 bit divide.
+ */
+ bw = tp->snd_cwnd;
+ bw *= rtt - yeah->vegas.baseRTT;
+ do_div(bw, rtt);
+ queue = bw;
+
+ if (queue > TCP_YEAH_ALPHA ||
+ rtt - yeah->vegas.baseRTT > (yeah->vegas.baseRTT / TCP_YEAH_PHY)) {
+ if (queue > TCP_YEAH_ALPHA &&
+ tp->snd_cwnd > yeah->reno_count) {
+ u32 reduction = min(queue / TCP_YEAH_GAMMA ,
+ tp->snd_cwnd >> TCP_YEAH_EPSILON);
+
+ tp->snd_cwnd -= reduction;
+
+ tp->snd_cwnd = max(tp->snd_cwnd,
+ yeah->reno_count);
+
+ tp->snd_ssthresh = tp->snd_cwnd;
+ }
+
+ if (yeah->reno_count <= 2)
+ yeah->reno_count = max(tp->snd_cwnd>>1, 2U);
+ else
+ yeah->reno_count++;
+
+ yeah->doing_reno_now = min(yeah->doing_reno_now + 1,
+ 0xffffffU);
+ } else {
+ yeah->fast_count++;
+
+ if (yeah->fast_count > TCP_YEAH_ZETA) {
+ yeah->reno_count = 2;
+ yeah->fast_count = 0;
+ }
+
+ yeah->doing_reno_now = 0;
+ }
+
+ yeah->lastQ = queue;
+ }
+
+ /* Save the extent of the current window so we can use this
+ * at the end of the next RTT.
+ */
+ yeah->vegas.beg_snd_una = yeah->vegas.beg_snd_nxt;
+ yeah->vegas.beg_snd_nxt = tp->snd_nxt;
+ yeah->vegas.beg_snd_cwnd = tp->snd_cwnd;
+
+ /* Wipe the slate clean for the next RTT. */
+ yeah->vegas.cntRTT = 0;
+ yeah->vegas.minRTT = 0x7fffffff;
+ }
+}
+
+static u32 tcp_yeah_ssthresh(struct sock *sk)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ struct yeah *yeah = inet_csk_ca(sk);
+ u32 reduction;
+
+ if (yeah->doing_reno_now < TCP_YEAH_RHO) {
+ reduction = yeah->lastQ;
+
+ reduction = min(reduction, max(tp->snd_cwnd>>1, 2U));
+
+ reduction = max(reduction, tp->snd_cwnd >> TCP_YEAH_DELTA);
+ } else
+ reduction = max(tp->snd_cwnd>>1, 2U);
+
+ yeah->fast_count = 0;
+ yeah->reno_count = max(yeah->reno_count>>1, 2U);
+
+ return tp->snd_cwnd - reduction;
+}
+
+static struct tcp_congestion_ops tcp_yeah __read_mostly = {
+ .init = tcp_yeah_init,
+ .ssthresh = tcp_yeah_ssthresh,
+ .cong_avoid = tcp_yeah_cong_avoid,
+ .set_state = tcp_vegas_state,
+ .cwnd_event = tcp_vegas_cwnd_event,
+ .get_info = tcp_vegas_get_info,
+ .pkts_acked = tcp_yeah_pkts_acked,
+
+ .owner = THIS_MODULE,
+ .name = "yeah",
+};
+
+static int __init tcp_yeah_register(void)
+{
+ BUG_ON(sizeof(struct yeah) > ICSK_CA_PRIV_SIZE);
+ tcp_register_congestion_control(&tcp_yeah);
+ return 0;
+}
+
+static void __exit tcp_yeah_unregister(void)
+{
+ tcp_unregister_congestion_control(&tcp_yeah);
+}
+
+module_init(tcp_yeah_register);
+module_exit(tcp_yeah_unregister);
+
+MODULE_AUTHOR("Angelo P. Castellani");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("YeAH TCP");
diff --git a/net/ipv4/tunnel4.c b/net/ipv4/tunnel4.c
new file mode 100644
index 000000000..0d0171830
--- /dev/null
+++ b/net/ipv4/tunnel4.c
@@ -0,0 +1,192 @@
+/* tunnel4.c: Generic IP tunnel transformer.
+ *
+ * Copyright (C) 2003 David S. Miller (davem@redhat.com)
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <net/icmp.h>
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <net/xfrm.h>
+
+static struct xfrm_tunnel __rcu *tunnel4_handlers __read_mostly;
+static struct xfrm_tunnel __rcu *tunnel64_handlers __read_mostly;
+static DEFINE_MUTEX(tunnel4_mutex);
+
+static inline struct xfrm_tunnel __rcu **fam_handlers(unsigned short family)
+{
+ return (family == AF_INET) ? &tunnel4_handlers : &tunnel64_handlers;
+}
+
+int xfrm4_tunnel_register(struct xfrm_tunnel *handler, unsigned short family)
+{
+ struct xfrm_tunnel __rcu **pprev;
+ struct xfrm_tunnel *t;
+
+ int ret = -EEXIST;
+ int priority = handler->priority;
+
+ mutex_lock(&tunnel4_mutex);
+
+ for (pprev = fam_handlers(family);
+ (t = rcu_dereference_protected(*pprev,
+ lockdep_is_held(&tunnel4_mutex))) != NULL;
+ pprev = &t->next) {
+ if (t->priority > priority)
+ break;
+ if (t->priority == priority)
+ goto err;
+ }
+
+ handler->next = *pprev;
+ rcu_assign_pointer(*pprev, handler);
+
+ ret = 0;
+
+err:
+ mutex_unlock(&tunnel4_mutex);
+
+ return ret;
+}
+EXPORT_SYMBOL(xfrm4_tunnel_register);
+
+int xfrm4_tunnel_deregister(struct xfrm_tunnel *handler, unsigned short family)
+{
+ struct xfrm_tunnel __rcu **pprev;
+ struct xfrm_tunnel *t;
+ int ret = -ENOENT;
+
+ mutex_lock(&tunnel4_mutex);
+
+ for (pprev = fam_handlers(family);
+ (t = rcu_dereference_protected(*pprev,
+ lockdep_is_held(&tunnel4_mutex))) != NULL;
+ pprev = &t->next) {
+ if (t == handler) {
+ *pprev = handler->next;
+ ret = 0;
+ break;
+ }
+ }
+
+ mutex_unlock(&tunnel4_mutex);
+
+ synchronize_net();
+
+ return ret;
+}
+EXPORT_SYMBOL(xfrm4_tunnel_deregister);
+
+#define for_each_tunnel_rcu(head, handler) \
+ for (handler = rcu_dereference(head); \
+ handler != NULL; \
+ handler = rcu_dereference(handler->next)) \
+
+static int tunnel4_rcv(struct sk_buff *skb)
+{
+ struct xfrm_tunnel *handler;
+
+ if (!pskb_may_pull(skb, sizeof(struct iphdr)))
+ goto drop;
+
+ for_each_tunnel_rcu(tunnel4_handlers, handler)
+ if (!handler->handler(skb))
+ return 0;
+
+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
+
+drop:
+ kfree_skb(skb);
+ return 0;
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+static int tunnel64_rcv(struct sk_buff *skb)
+{
+ struct xfrm_tunnel *handler;
+
+ if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
+ goto drop;
+
+ for_each_tunnel_rcu(tunnel64_handlers, handler)
+ if (!handler->handler(skb))
+ return 0;
+
+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
+
+drop:
+ kfree_skb(skb);
+ return 0;
+}
+#endif
+
+static void tunnel4_err(struct sk_buff *skb, u32 info)
+{
+ struct xfrm_tunnel *handler;
+
+ for_each_tunnel_rcu(tunnel4_handlers, handler)
+ if (!handler->err_handler(skb, info))
+ break;
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+static void tunnel64_err(struct sk_buff *skb, u32 info)
+{
+ struct xfrm_tunnel *handler;
+
+ for_each_tunnel_rcu(tunnel64_handlers, handler)
+ if (!handler->err_handler(skb, info))
+ break;
+}
+#endif
+
+static const struct net_protocol tunnel4_protocol = {
+ .handler = tunnel4_rcv,
+ .err_handler = tunnel4_err,
+ .no_policy = 1,
+ .netns_ok = 1,
+};
+
+#if IS_ENABLED(CONFIG_IPV6)
+static const struct net_protocol tunnel64_protocol = {
+ .handler = tunnel64_rcv,
+ .err_handler = tunnel64_err,
+ .no_policy = 1,
+ .netns_ok = 1,
+};
+#endif
+
+static int __init tunnel4_init(void)
+{
+ if (inet_add_protocol(&tunnel4_protocol, IPPROTO_IPIP)) {
+ pr_err("%s: can't add protocol\n", __func__);
+ return -EAGAIN;
+ }
+#if IS_ENABLED(CONFIG_IPV6)
+ if (inet_add_protocol(&tunnel64_protocol, IPPROTO_IPV6)) {
+ pr_err("tunnel64 init: can't add protocol\n");
+ inet_del_protocol(&tunnel4_protocol, IPPROTO_IPIP);
+ return -EAGAIN;
+ }
+#endif
+ return 0;
+}
+
+static void __exit tunnel4_fini(void)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+ if (inet_del_protocol(&tunnel64_protocol, IPPROTO_IPV6))
+ pr_err("tunnel64 close: can't remove protocol\n");
+#endif
+ if (inet_del_protocol(&tunnel4_protocol, IPPROTO_IPIP))
+ pr_err("tunnel4 close: can't remove protocol\n");
+}
+
+module_init(tunnel4_init);
+module_exit(tunnel4_fini);
+MODULE_LICENSE("GPL");
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
new file mode 100644
index 000000000..83aa604f9
--- /dev/null
+++ b/net/ipv4/udp.c
@@ -0,0 +1,2555 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * The User Datagram Protocol (UDP).
+ *
+ * Authors: Ross Biro
+ * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
+ * Alan Cox, <alan@lxorguk.ukuu.org.uk>
+ * Hirokazu Takahashi, <taka@valinux.co.jp>
+ *
+ * Fixes:
+ * Alan Cox : verify_area() calls
+ * Alan Cox : stopped close while in use off icmp
+ * messages. Not a fix but a botch that
+ * for udp at least is 'valid'.
+ * Alan Cox : Fixed icmp handling properly
+ * Alan Cox : Correct error for oversized datagrams
+ * Alan Cox : Tidied select() semantics.
+ * Alan Cox : udp_err() fixed properly, also now
+ * select and read wake correctly on errors
+ * Alan Cox : udp_send verify_area moved to avoid mem leak
+ * Alan Cox : UDP can count its memory
+ * Alan Cox : send to an unknown connection causes
+ * an ECONNREFUSED off the icmp, but
+ * does NOT close.
+ * Alan Cox : Switched to new sk_buff handlers. No more backlog!
+ * Alan Cox : Using generic datagram code. Even smaller and the PEEK
+ * bug no longer crashes it.
+ * Fred Van Kempen : Net2e support for sk->broadcast.
+ * Alan Cox : Uses skb_free_datagram
+ * Alan Cox : Added get/set sockopt support.
+ * Alan Cox : Broadcasting without option set returns EACCES.
+ * Alan Cox : No wakeup calls. Instead we now use the callbacks.
+ * Alan Cox : Use ip_tos and ip_ttl
+ * Alan Cox : SNMP Mibs
+ * Alan Cox : MSG_DONTROUTE, and 0.0.0.0 support.
+ * Matt Dillon : UDP length checks.
+ * Alan Cox : Smarter af_inet used properly.
+ * Alan Cox : Use new kernel side addressing.
+ * Alan Cox : Incorrect return on truncated datagram receive.
+ * Arnt Gulbrandsen : New udp_send and stuff
+ * Alan Cox : Cache last socket
+ * Alan Cox : Route cache
+ * Jon Peatfield : Minor efficiency fix to sendto().
+ * Mike Shaver : RFC1122 checks.
+ * Alan Cox : Nonblocking error fix.
+ * Willy Konynenberg : Transparent proxying support.
+ * Mike McLagan : Routing by source
+ * David S. Miller : New socket lookup architecture.
+ * Last socket cache retained as it
+ * does have a high hit rate.
+ * Olaf Kirch : Don't linearise iovec on sendmsg.
+ * Andi Kleen : Some cleanups, cache destination entry
+ * for connect.
+ * Vitaly E. Lavrov : Transparent proxy revived after year coma.
+ * Melvin Smith : Check msg_name not msg_namelen in sendto(),
+ * return ENOTCONN for unconnected sockets (POSIX)
+ * Janos Farkas : don't deliver multi/broadcasts to a different
+ * bound-to-device socket
+ * Hirokazu Takahashi : HW checksumming for outgoing UDP
+ * datagrams.
+ * Hirokazu Takahashi : sendfile() on UDP works now.
+ * Arnaldo C. Melo : convert /proc/net/udp to seq_file
+ * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
+ * Alexey Kuznetsov: allow both IPv4 and IPv6 sockets to bind
+ * a single port at the same time.
+ * Derek Atkins <derek@ihtfp.com>: Add Encapulation Support
+ * James Chapman : Add L2TP encapsulation type.
+ *
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define pr_fmt(fmt) "UDP: " fmt
+
+#include <asm/uaccess.h>
+#include <asm/ioctls.h>
+#include <linux/bootmem.h>
+#include <linux/highmem.h>
+#include <linux/swap.h>
+#include <linux/types.h>
+#include <linux/fcntl.h>
+#include <linux/module.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/igmp.h>
+#include <linux/inetdevice.h>
+#include <linux/in.h>
+#include <linux/errno.h>
+#include <linux/timer.h>
+#include <linux/mm.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/slab.h>
+#include <net/tcp_states.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <net/net_namespace.h>
+#include <net/icmp.h>
+#include <net/inet_hashtables.h>
+#include <net/route.h>
+#include <net/checksum.h>
+#include <net/xfrm.h>
+#include <trace/events/udp.h>
+#include <linux/static_key.h>
+#include <trace/events/skb.h>
+#include <net/busy_poll.h>
+#include "udp_impl.h"
+
+struct udp_table udp_table __read_mostly;
+EXPORT_SYMBOL(udp_table);
+
+long sysctl_udp_mem[3] __read_mostly;
+EXPORT_SYMBOL(sysctl_udp_mem);
+
+int sysctl_udp_rmem_min __read_mostly;
+EXPORT_SYMBOL(sysctl_udp_rmem_min);
+
+int sysctl_udp_wmem_min __read_mostly;
+EXPORT_SYMBOL(sysctl_udp_wmem_min);
+
+atomic_long_t udp_memory_allocated;
+EXPORT_SYMBOL(udp_memory_allocated);
+
+#define MAX_UDP_PORTS 65536
+#define PORTS_PER_CHAIN (MAX_UDP_PORTS / UDP_HTABLE_SIZE_MIN)
+
+static int udp_lib_lport_inuse(struct net *net, __u16 num,
+ const struct udp_hslot *hslot,
+ unsigned long *bitmap,
+ struct sock *sk,
+ int (*saddr_comp)(const struct sock *sk1,
+ const struct sock *sk2),
+ unsigned int log)
+{
+ struct sock *sk2;
+ struct hlist_nulls_node *node;
+ kuid_t uid = sock_i_uid(sk);
+
+ sk_nulls_for_each(sk2, node, &hslot->head) {
+ if (net_eq(sock_net(sk2), net) &&
+ sk2 != sk &&
+ (bitmap || udp_sk(sk2)->udp_port_hash == num) &&
+ (!sk2->sk_reuse || !sk->sk_reuse) &&
+ (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if ||
+ sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
+ (!sk2->sk_reuseport || !sk->sk_reuseport ||
+ !uid_eq(uid, sock_i_uid(sk2))) &&
+ saddr_comp(sk, sk2)) {
+ if (!bitmap)
+ return 1;
+ __set_bit(udp_sk(sk2)->udp_port_hash >> log, bitmap);
+ }
+ }
+ return 0;
+}
+
+/*
+ * Note: we still hold spinlock of primary hash chain, so no other writer
+ * can insert/delete a socket with local_port == num
+ */
+static int udp_lib_lport_inuse2(struct net *net, __u16 num,
+ struct udp_hslot *hslot2,
+ struct sock *sk,
+ int (*saddr_comp)(const struct sock *sk1,
+ const struct sock *sk2))
+{
+ struct sock *sk2;
+ struct hlist_nulls_node *node;
+ kuid_t uid = sock_i_uid(sk);
+ int res = 0;
+
+ spin_lock(&hslot2->lock);
+ udp_portaddr_for_each_entry(sk2, node, &hslot2->head) {
+ if (net_eq(sock_net(sk2), net) &&
+ sk2 != sk &&
+ (udp_sk(sk2)->udp_port_hash == num) &&
+ (!sk2->sk_reuse || !sk->sk_reuse) &&
+ (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if ||
+ sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
+ (!sk2->sk_reuseport || !sk->sk_reuseport ||
+ !uid_eq(uid, sock_i_uid(sk2))) &&
+ saddr_comp(sk, sk2)) {
+ res = 1;
+ break;
+ }
+ }
+ spin_unlock(&hslot2->lock);
+ return res;
+}
+
+/**
+ * udp_lib_get_port - UDP/-Lite port lookup for IPv4 and IPv6
+ *
+ * @sk: socket struct in question
+ * @snum: port number to look up
+ * @saddr_comp: AF-dependent comparison of bound local IP addresses
+ * @hash2_nulladdr: AF-dependent hash value in secondary hash chains,
+ * with NULL address
+ */
+int udp_lib_get_port(struct sock *sk, unsigned short snum,
+ int (*saddr_comp)(const struct sock *sk1,
+ const struct sock *sk2),
+ unsigned int hash2_nulladdr)
+{
+ struct udp_hslot *hslot, *hslot2;
+ struct udp_table *udptable = sk->sk_prot->h.udp_table;
+ int error = 1;
+ struct net *net = sock_net(sk);
+
+ if (!snum) {
+ int low, high, remaining;
+ unsigned int rand;
+ unsigned short first, last;
+ DECLARE_BITMAP(bitmap, PORTS_PER_CHAIN);
+
+ inet_get_local_port_range(net, &low, &high);
+ remaining = (high - low) + 1;
+
+ rand = prandom_u32();
+ first = reciprocal_scale(rand, remaining) + low;
+ /*
+ * force rand to be an odd multiple of UDP_HTABLE_SIZE
+ */
+ rand = (rand | 1) * (udptable->mask + 1);
+ last = first + udptable->mask + 1;
+ do {
+ hslot = udp_hashslot(udptable, net, first);
+ bitmap_zero(bitmap, PORTS_PER_CHAIN);
+ spin_lock_bh(&hslot->lock);
+ udp_lib_lport_inuse(net, snum, hslot, bitmap, sk,
+ saddr_comp, udptable->log);
+
+ snum = first;
+ /*
+ * Iterate on all possible values of snum for this hash.
+ * Using steps of an odd multiple of UDP_HTABLE_SIZE
+ * give us randomization and full range coverage.
+ */
+ do {
+ if (low <= snum && snum <= high &&
+ !test_bit(snum >> udptable->log, bitmap) &&
+ !inet_is_local_reserved_port(net, snum))
+ goto found;
+ snum += rand;
+ } while (snum != first);
+ spin_unlock_bh(&hslot->lock);
+ } while (++first != last);
+ goto fail;
+ } else {
+ hslot = udp_hashslot(udptable, net, snum);
+ spin_lock_bh(&hslot->lock);
+ if (hslot->count > 10) {
+ int exist;
+ unsigned int slot2 = udp_sk(sk)->udp_portaddr_hash ^ snum;
+
+ slot2 &= udptable->mask;
+ hash2_nulladdr &= udptable->mask;
+
+ hslot2 = udp_hashslot2(udptable, slot2);
+ if (hslot->count < hslot2->count)
+ goto scan_primary_hash;
+
+ exist = udp_lib_lport_inuse2(net, snum, hslot2,
+ sk, saddr_comp);
+ if (!exist && (hash2_nulladdr != slot2)) {
+ hslot2 = udp_hashslot2(udptable, hash2_nulladdr);
+ exist = udp_lib_lport_inuse2(net, snum, hslot2,
+ sk, saddr_comp);
+ }
+ if (exist)
+ goto fail_unlock;
+ else
+ goto found;
+ }
+scan_primary_hash:
+ if (udp_lib_lport_inuse(net, snum, hslot, NULL, sk,
+ saddr_comp, 0))
+ goto fail_unlock;
+ }
+found:
+ inet_sk(sk)->inet_num = snum;
+ udp_sk(sk)->udp_port_hash = snum;
+ udp_sk(sk)->udp_portaddr_hash ^= snum;
+ if (sk_unhashed(sk)) {
+ sk_nulls_add_node_rcu(sk, &hslot->head);
+ hslot->count++;
+ sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
+
+ hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash);
+ spin_lock(&hslot2->lock);
+ hlist_nulls_add_head_rcu(&udp_sk(sk)->udp_portaddr_node,
+ &hslot2->head);
+ hslot2->count++;
+ spin_unlock(&hslot2->lock);
+ }
+ error = 0;
+fail_unlock:
+ spin_unlock_bh(&hslot->lock);
+fail:
+ return error;
+}
+EXPORT_SYMBOL(udp_lib_get_port);
+
+static int ipv4_rcv_saddr_equal(const struct sock *sk1, const struct sock *sk2)
+{
+ struct inet_sock *inet1 = inet_sk(sk1), *inet2 = inet_sk(sk2);
+
+ return (!ipv6_only_sock(sk2) &&
+ (!inet1->inet_rcv_saddr || !inet2->inet_rcv_saddr ||
+ inet1->inet_rcv_saddr == inet2->inet_rcv_saddr));
+}
+
+static u32 udp4_portaddr_hash(const struct net *net, __be32 saddr,
+ unsigned int port)
+{
+ return jhash_1word((__force u32)saddr, net_hash_mix(net)) ^ port;
+}
+
+int udp_v4_get_port(struct sock *sk, unsigned short snum)
+{
+ unsigned int hash2_nulladdr =
+ udp4_portaddr_hash(sock_net(sk), htonl(INADDR_ANY), snum);
+ unsigned int hash2_partial =
+ udp4_portaddr_hash(sock_net(sk), inet_sk(sk)->inet_rcv_saddr, 0);
+
+ /* precompute partial secondary hash */
+ udp_sk(sk)->udp_portaddr_hash = hash2_partial;
+ return udp_lib_get_port(sk, snum, ipv4_rcv_saddr_equal, hash2_nulladdr);
+}
+
+static inline int compute_score(struct sock *sk, struct net *net,
+ __be32 saddr, unsigned short hnum, __be16 sport,
+ __be32 daddr, __be16 dport, int dif)
+{
+ int score;
+ struct inet_sock *inet;
+
+ if (!net_eq(sock_net(sk), net) ||
+ udp_sk(sk)->udp_port_hash != hnum ||
+ ipv6_only_sock(sk))
+ return -1;
+
+ score = (sk->sk_family == PF_INET) ? 2 : 1;
+ inet = inet_sk(sk);
+
+ if (inet->inet_rcv_saddr) {
+ if (inet->inet_rcv_saddr != daddr)
+ return -1;
+ score += 4;
+ }
+
+ if (inet->inet_daddr) {
+ if (inet->inet_daddr != saddr)
+ return -1;
+ score += 4;
+ }
+
+ if (inet->inet_dport) {
+ if (inet->inet_dport != sport)
+ return -1;
+ score += 4;
+ }
+
+ if (sk->sk_bound_dev_if) {
+ if (sk->sk_bound_dev_if != dif)
+ return -1;
+ score += 4;
+ }
+
+ return score;
+}
+
+/*
+ * In this second variant, we check (daddr, dport) matches (inet_rcv_sadd, inet_num)
+ */
+static inline int compute_score2(struct sock *sk, struct net *net,
+ __be32 saddr, __be16 sport,
+ __be32 daddr, unsigned int hnum, int dif)
+{
+ int score;
+ struct inet_sock *inet;
+
+ if (!net_eq(sock_net(sk), net) ||
+ ipv6_only_sock(sk))
+ return -1;
+
+ inet = inet_sk(sk);
+
+ if (inet->inet_rcv_saddr != daddr ||
+ inet->inet_num != hnum)
+ return -1;
+
+ score = (sk->sk_family == PF_INET) ? 2 : 1;
+
+ if (inet->inet_daddr) {
+ if (inet->inet_daddr != saddr)
+ return -1;
+ score += 4;
+ }
+
+ if (inet->inet_dport) {
+ if (inet->inet_dport != sport)
+ return -1;
+ score += 4;
+ }
+
+ if (sk->sk_bound_dev_if) {
+ if (sk->sk_bound_dev_if != dif)
+ return -1;
+ score += 4;
+ }
+
+ return score;
+}
+
+static u32 udp_ehashfn(const struct net *net, const __be32 laddr,
+ const __u16 lport, const __be32 faddr,
+ const __be16 fport)
+{
+ static u32 udp_ehash_secret __read_mostly;
+
+ net_get_random_once(&udp_ehash_secret, sizeof(udp_ehash_secret));
+
+ return __inet_ehashfn(laddr, lport, faddr, fport,
+ udp_ehash_secret + net_hash_mix(net));
+}
+
+/* called with read_rcu_lock() */
+static struct sock *udp4_lib_lookup2(struct net *net,
+ __be32 saddr, __be16 sport,
+ __be32 daddr, unsigned int hnum, int dif,
+ struct udp_hslot *hslot2, unsigned int slot2)
+{
+ struct sock *sk, *result;
+ struct hlist_nulls_node *node;
+ int score, badness, matches = 0, reuseport = 0;
+ u32 hash = 0;
+
+begin:
+ result = NULL;
+ badness = 0;
+ udp_portaddr_for_each_entry_rcu(sk, node, &hslot2->head) {
+ score = compute_score2(sk, net, saddr, sport,
+ daddr, hnum, dif);
+ if (score > badness) {
+ result = sk;
+ badness = score;
+ reuseport = sk->sk_reuseport;
+ if (reuseport) {
+ hash = udp_ehashfn(net, daddr, hnum,
+ saddr, sport);
+ matches = 1;
+ }
+ } else if (score == badness && reuseport) {
+ matches++;
+ if (reciprocal_scale(hash, matches) == 0)
+ result = sk;
+ hash = next_pseudo_random32(hash);
+ }
+ }
+ /*
+ * if the nulls value we got at the end of this lookup is
+ * not the expected one, we must restart lookup.
+ * We probably met an item that was moved to another chain.
+ */
+ if (get_nulls_value(node) != slot2)
+ goto begin;
+ if (result) {
+ if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2)))
+ result = NULL;
+ else if (unlikely(compute_score2(result, net, saddr, sport,
+ daddr, hnum, dif) < badness)) {
+ sock_put(result);
+ goto begin;
+ }
+ }
+ return result;
+}
+
+/* UDP is nearly always wildcards out the wazoo, it makes no sense to try
+ * harder than this. -DaveM
+ */
+struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
+ __be16 sport, __be32 daddr, __be16 dport,
+ int dif, struct udp_table *udptable)
+{
+ struct sock *sk, *result;
+ struct hlist_nulls_node *node;
+ unsigned short hnum = ntohs(dport);
+ unsigned int hash2, slot2, slot = udp_hashfn(net, hnum, udptable->mask);
+ struct udp_hslot *hslot2, *hslot = &udptable->hash[slot];
+ int score, badness, matches = 0, reuseport = 0;
+ u32 hash = 0;
+
+ rcu_read_lock();
+ if (hslot->count > 10) {
+ hash2 = udp4_portaddr_hash(net, daddr, hnum);
+ slot2 = hash2 & udptable->mask;
+ hslot2 = &udptable->hash2[slot2];
+ if (hslot->count < hslot2->count)
+ goto begin;
+
+ result = udp4_lib_lookup2(net, saddr, sport,
+ daddr, hnum, dif,
+ hslot2, slot2);
+ if (!result) {
+ hash2 = udp4_portaddr_hash(net, htonl(INADDR_ANY), hnum);
+ slot2 = hash2 & udptable->mask;
+ hslot2 = &udptable->hash2[slot2];
+ if (hslot->count < hslot2->count)
+ goto begin;
+
+ result = udp4_lib_lookup2(net, saddr, sport,
+ htonl(INADDR_ANY), hnum, dif,
+ hslot2, slot2);
+ }
+ rcu_read_unlock();
+ return result;
+ }
+begin:
+ result = NULL;
+ badness = 0;
+ sk_nulls_for_each_rcu(sk, node, &hslot->head) {
+ score = compute_score(sk, net, saddr, hnum, sport,
+ daddr, dport, dif);
+ if (score > badness) {
+ result = sk;
+ badness = score;
+ reuseport = sk->sk_reuseport;
+ if (reuseport) {
+ hash = udp_ehashfn(net, daddr, hnum,
+ saddr, sport);
+ matches = 1;
+ }
+ } else if (score == badness && reuseport) {
+ matches++;
+ if (reciprocal_scale(hash, matches) == 0)
+ result = sk;
+ hash = next_pseudo_random32(hash);
+ }
+ }
+ /*
+ * if the nulls value we got at the end of this lookup is
+ * not the expected one, we must restart lookup.
+ * We probably met an item that was moved to another chain.
+ */
+ if (get_nulls_value(node) != slot)
+ goto begin;
+
+ if (result) {
+ if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2)))
+ result = NULL;
+ else if (unlikely(compute_score(result, net, saddr, hnum, sport,
+ daddr, dport, dif) < badness)) {
+ sock_put(result);
+ goto begin;
+ }
+ }
+ rcu_read_unlock();
+ return result;
+}
+EXPORT_SYMBOL_GPL(__udp4_lib_lookup);
+
+static inline struct sock *__udp4_lib_lookup_skb(struct sk_buff *skb,
+ __be16 sport, __be16 dport,
+ struct udp_table *udptable)
+{
+ const struct iphdr *iph = ip_hdr(skb);
+
+ return __udp4_lib_lookup(dev_net(skb_dst(skb)->dev), iph->saddr, sport,
+ iph->daddr, dport, inet_iif(skb),
+ udptable);
+}
+
+struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport,
+ __be32 daddr, __be16 dport, int dif)
+{
+ return __udp4_lib_lookup(net, saddr, sport, daddr, dport, dif, &udp_table);
+}
+EXPORT_SYMBOL_GPL(udp4_lib_lookup);
+
+static inline bool __udp_is_mcast_sock(struct net *net, struct sock *sk,
+ __be16 loc_port, __be32 loc_addr,
+ __be16 rmt_port, __be32 rmt_addr,
+ int dif, unsigned short hnum)
+{
+ struct inet_sock *inet = inet_sk(sk);
+
+ if (!net_eq(sock_net(sk), net) ||
+ udp_sk(sk)->udp_port_hash != hnum ||
+ (inet->inet_daddr && inet->inet_daddr != rmt_addr) ||
+ (inet->inet_dport != rmt_port && inet->inet_dport) ||
+ (inet->inet_rcv_saddr && inet->inet_rcv_saddr != loc_addr) ||
+ ipv6_only_sock(sk) ||
+ (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif))
+ return false;
+ if (!ip_mc_sf_allow(sk, loc_addr, rmt_addr, dif))
+ return false;
+ return true;
+}
+
+/*
+ * This routine is called by the ICMP module when it gets some
+ * sort of error condition. If err < 0 then the socket should
+ * be closed and the error returned to the user. If err > 0
+ * it's just the icmp type << 8 | icmp code.
+ * Header points to the ip header of the error packet. We move
+ * on past this. Then (as it used to claim before adjustment)
+ * header points to the first 8 bytes of the udp header. We need
+ * to find the appropriate port.
+ */
+
+void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
+{
+ struct inet_sock *inet;
+ const struct iphdr *iph = (const struct iphdr *)skb->data;
+ struct udphdr *uh = (struct udphdr *)(skb->data+(iph->ihl<<2));
+ const int type = icmp_hdr(skb)->type;
+ const int code = icmp_hdr(skb)->code;
+ struct sock *sk;
+ int harderr;
+ int err;
+ struct net *net = dev_net(skb->dev);
+
+ sk = __udp4_lib_lookup(net, iph->daddr, uh->dest,
+ iph->saddr, uh->source, skb->dev->ifindex, udptable);
+ if (!sk) {
+ ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
+ return; /* No socket for error */
+ }
+
+ err = 0;
+ harderr = 0;
+ inet = inet_sk(sk);
+
+ switch (type) {
+ default:
+ case ICMP_TIME_EXCEEDED:
+ err = EHOSTUNREACH;
+ break;
+ case ICMP_SOURCE_QUENCH:
+ goto out;
+ case ICMP_PARAMETERPROB:
+ err = EPROTO;
+ harderr = 1;
+ break;
+ case ICMP_DEST_UNREACH:
+ if (code == ICMP_FRAG_NEEDED) { /* Path MTU discovery */
+ ipv4_sk_update_pmtu(skb, sk, info);
+ if (inet->pmtudisc != IP_PMTUDISC_DONT) {
+ err = EMSGSIZE;
+ harderr = 1;
+ break;
+ }
+ goto out;
+ }
+ err = EHOSTUNREACH;
+ if (code <= NR_ICMP_UNREACH) {
+ harderr = icmp_err_convert[code].fatal;
+ err = icmp_err_convert[code].errno;
+ }
+ break;
+ case ICMP_REDIRECT:
+ ipv4_sk_redirect(skb, sk);
+ goto out;
+ }
+
+ /*
+ * RFC1122: OK. Passes ICMP errors back to application, as per
+ * 4.1.3.3.
+ */
+ if (!inet->recverr) {
+ if (!harderr || sk->sk_state != TCP_ESTABLISHED)
+ goto out;
+ } else
+ ip_icmp_error(sk, skb, err, uh->dest, info, (u8 *)(uh+1));
+
+ sk->sk_err = err;
+ sk->sk_error_report(sk);
+out:
+ sock_put(sk);
+}
+
+void udp_err(struct sk_buff *skb, u32 info)
+{
+ __udp4_lib_err(skb, info, &udp_table);
+}
+
+/*
+ * Throw away all pending data and cancel the corking. Socket is locked.
+ */
+void udp_flush_pending_frames(struct sock *sk)
+{
+ struct udp_sock *up = udp_sk(sk);
+
+ if (up->pending) {
+ up->len = 0;
+ up->pending = 0;
+ ip_flush_pending_frames(sk);
+ }
+}
+EXPORT_SYMBOL(udp_flush_pending_frames);
+
+/**
+ * udp4_hwcsum - handle outgoing HW checksumming
+ * @skb: sk_buff containing the filled-in UDP header
+ * (checksum field must be zeroed out)
+ * @src: source IP address
+ * @dst: destination IP address
+ */
+void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst)
+{
+ struct udphdr *uh = udp_hdr(skb);
+ int offset = skb_transport_offset(skb);
+ int len = skb->len - offset;
+ int hlen = len;
+ __wsum csum = 0;
+
+ if (!skb_has_frag_list(skb)) {
+ /*
+ * Only one fragment on the socket.
+ */
+ skb->csum_start = skb_transport_header(skb) - skb->head;
+ skb->csum_offset = offsetof(struct udphdr, check);
+ uh->check = ~csum_tcpudp_magic(src, dst, len,
+ IPPROTO_UDP, 0);
+ } else {
+ struct sk_buff *frags;
+
+ /*
+ * HW-checksum won't work as there are two or more
+ * fragments on the socket so that all csums of sk_buffs
+ * should be together
+ */
+ skb_walk_frags(skb, frags) {
+ csum = csum_add(csum, frags->csum);
+ hlen -= frags->len;
+ }
+
+ csum = skb_checksum(skb, offset, hlen, csum);
+ skb->ip_summed = CHECKSUM_NONE;
+
+ uh->check = csum_tcpudp_magic(src, dst, len, IPPROTO_UDP, csum);
+ if (uh->check == 0)
+ uh->check = CSUM_MANGLED_0;
+ }
+}
+EXPORT_SYMBOL_GPL(udp4_hwcsum);
+
+/* Function to set UDP checksum for an IPv4 UDP packet. This is intended
+ * for the simple case like when setting the checksum for a UDP tunnel.
+ */
+void udp_set_csum(bool nocheck, struct sk_buff *skb,
+ __be32 saddr, __be32 daddr, int len)
+{
+ struct udphdr *uh = udp_hdr(skb);
+
+ if (nocheck)
+ uh->check = 0;
+ else if (skb_is_gso(skb))
+ uh->check = ~udp_v4_check(len, saddr, daddr, 0);
+ else if (skb_dst(skb) && skb_dst(skb)->dev &&
+ (skb_dst(skb)->dev->features & NETIF_F_V4_CSUM)) {
+
+ BUG_ON(skb->ip_summed == CHECKSUM_PARTIAL);
+
+ skb->ip_summed = CHECKSUM_PARTIAL;
+ skb->csum_start = skb_transport_header(skb) - skb->head;
+ skb->csum_offset = offsetof(struct udphdr, check);
+ uh->check = ~udp_v4_check(len, saddr, daddr, 0);
+ } else {
+ __wsum csum;
+
+ BUG_ON(skb->ip_summed == CHECKSUM_PARTIAL);
+
+ uh->check = 0;
+ csum = skb_checksum(skb, 0, len, 0);
+ uh->check = udp_v4_check(len, saddr, daddr, csum);
+ if (uh->check == 0)
+ uh->check = CSUM_MANGLED_0;
+
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ }
+}
+EXPORT_SYMBOL(udp_set_csum);
+
+static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4)
+{
+ struct sock *sk = skb->sk;
+ struct inet_sock *inet = inet_sk(sk);
+ struct udphdr *uh;
+ int err = 0;
+ int is_udplite = IS_UDPLITE(sk);
+ int offset = skb_transport_offset(skb);
+ int len = skb->len - offset;
+ __wsum csum = 0;
+
+ /*
+ * Create a UDP header
+ */
+ uh = udp_hdr(skb);
+ uh->source = inet->inet_sport;
+ uh->dest = fl4->fl4_dport;
+ uh->len = htons(len);
+ uh->check = 0;
+
+ if (is_udplite) /* UDP-Lite */
+ csum = udplite_csum(skb);
+
+ else if (sk->sk_no_check_tx) { /* UDP csum disabled */
+
+ skb->ip_summed = CHECKSUM_NONE;
+ goto send;
+
+ } else if (skb->ip_summed == CHECKSUM_PARTIAL) { /* UDP hardware csum */
+
+ udp4_hwcsum(skb, fl4->saddr, fl4->daddr);
+ goto send;
+
+ } else
+ csum = udp_csum(skb);
+
+ /* add protocol-dependent pseudo-header */
+ uh->check = csum_tcpudp_magic(fl4->saddr, fl4->daddr, len,
+ sk->sk_protocol, csum);
+ if (uh->check == 0)
+ uh->check = CSUM_MANGLED_0;
+
+send:
+ err = ip_send_skb(sock_net(sk), skb);
+ if (err) {
+ if (err == -ENOBUFS && !inet->recverr) {
+ UDP_INC_STATS_USER(sock_net(sk),
+ UDP_MIB_SNDBUFERRORS, is_udplite);
+ err = 0;
+ }
+ } else
+ UDP_INC_STATS_USER(sock_net(sk),
+ UDP_MIB_OUTDATAGRAMS, is_udplite);
+ return err;
+}
+
+/*
+ * Push out all pending data as one UDP datagram. Socket is locked.
+ */
+int udp_push_pending_frames(struct sock *sk)
+{
+ struct udp_sock *up = udp_sk(sk);
+ struct inet_sock *inet = inet_sk(sk);
+ struct flowi4 *fl4 = &inet->cork.fl.u.ip4;
+ struct sk_buff *skb;
+ int err = 0;
+
+ skb = ip_finish_skb(sk, fl4);
+ if (!skb)
+ goto out;
+
+ err = udp_send_skb(skb, fl4);
+
+out:
+ up->len = 0;
+ up->pending = 0;
+ return err;
+}
+EXPORT_SYMBOL(udp_push_pending_frames);
+
+int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ struct udp_sock *up = udp_sk(sk);
+ struct flowi4 fl4_stack;
+ struct flowi4 *fl4;
+ int ulen = len;
+ struct ipcm_cookie ipc;
+ struct rtable *rt = NULL;
+ int free = 0;
+ int connected = 0;
+ __be32 daddr, faddr, saddr;
+ __be16 dport;
+ u8 tos;
+ int err, is_udplite = IS_UDPLITE(sk);
+ int corkreq = up->corkflag || msg->msg_flags&MSG_MORE;
+ int (*getfrag)(void *, char *, int, int, int, struct sk_buff *);
+ struct sk_buff *skb;
+ struct ip_options_data opt_copy;
+
+ if (len > 0xFFFF)
+ return -EMSGSIZE;
+
+ /*
+ * Check the flags.
+ */
+
+ if (msg->msg_flags & MSG_OOB) /* Mirror BSD error message compatibility */
+ return -EOPNOTSUPP;
+
+ ipc.opt = NULL;
+ ipc.tx_flags = 0;
+ ipc.ttl = 0;
+ ipc.tos = -1;
+
+ getfrag = is_udplite ? udplite_getfrag : ip_generic_getfrag;
+
+ fl4 = &inet->cork.fl.u.ip4;
+ if (up->pending) {
+ /*
+ * There are pending frames.
+ * The socket lock must be held while it's corked.
+ */
+ lock_sock(sk);
+ if (likely(up->pending)) {
+ if (unlikely(up->pending != AF_INET)) {
+ release_sock(sk);
+ return -EINVAL;
+ }
+ goto do_append_data;
+ }
+ release_sock(sk);
+ }
+ ulen += sizeof(struct udphdr);
+
+ /*
+ * Get and verify the address.
+ */
+ if (msg->msg_name) {
+ DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name);
+ if (msg->msg_namelen < sizeof(*usin))
+ return -EINVAL;
+ if (usin->sin_family != AF_INET) {
+ if (usin->sin_family != AF_UNSPEC)
+ return -EAFNOSUPPORT;
+ }
+
+ daddr = usin->sin_addr.s_addr;
+ dport = usin->sin_port;
+ if (dport == 0)
+ return -EINVAL;
+ } else {
+ if (sk->sk_state != TCP_ESTABLISHED)
+ return -EDESTADDRREQ;
+ daddr = inet->inet_daddr;
+ dport = inet->inet_dport;
+ /* Open fast path for connected socket.
+ Route will not be used, if at least one option is set.
+ */
+ connected = 1;
+ }
+ ipc.addr = inet->inet_saddr;
+
+ ipc.oif = sk->sk_bound_dev_if;
+
+ sock_tx_timestamp(sk, &ipc.tx_flags);
+
+ if (msg->msg_controllen) {
+ err = ip_cmsg_send(sock_net(sk), msg, &ipc,
+ sk->sk_family == AF_INET6);
+ if (err)
+ return err;
+ if (ipc.opt)
+ free = 1;
+ connected = 0;
+ }
+ if (!ipc.opt) {
+ struct ip_options_rcu *inet_opt;
+
+ rcu_read_lock();
+ inet_opt = rcu_dereference(inet->inet_opt);
+ if (inet_opt) {
+ memcpy(&opt_copy, inet_opt,
+ sizeof(*inet_opt) + inet_opt->opt.optlen);
+ ipc.opt = &opt_copy.opt;
+ }
+ rcu_read_unlock();
+ }
+
+ saddr = ipc.addr;
+ ipc.addr = faddr = daddr;
+
+ if (ipc.opt && ipc.opt->opt.srr) {
+ if (!daddr)
+ return -EINVAL;
+ faddr = ipc.opt->opt.faddr;
+ connected = 0;
+ }
+ tos = get_rttos(&ipc, inet);
+ if (sock_flag(sk, SOCK_LOCALROUTE) ||
+ (msg->msg_flags & MSG_DONTROUTE) ||
+ (ipc.opt && ipc.opt->opt.is_strictroute)) {
+ tos |= RTO_ONLINK;
+ connected = 0;
+ }
+
+ if (ipv4_is_multicast(daddr)) {
+ if (!ipc.oif)
+ ipc.oif = inet->mc_index;
+ if (!saddr)
+ saddr = inet->mc_addr;
+ connected = 0;
+ } else if (!ipc.oif)
+ ipc.oif = inet->uc_index;
+
+ if (connected)
+ rt = (struct rtable *)sk_dst_check(sk, 0);
+
+ if (!rt) {
+ struct net *net = sock_net(sk);
+
+ fl4 = &fl4_stack;
+ flowi4_init_output(fl4, ipc.oif, sk->sk_mark, tos,
+ RT_SCOPE_UNIVERSE, sk->sk_protocol,
+ inet_sk_flowi_flags(sk),
+ faddr, saddr, dport, inet->inet_sport);
+
+ security_sk_classify_flow(sk, flowi4_to_flowi(fl4));
+ rt = ip_route_output_flow(net, fl4, sk);
+ if (IS_ERR(rt)) {
+ err = PTR_ERR(rt);
+ rt = NULL;
+ if (err == -ENETUNREACH)
+ IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
+ goto out;
+ }
+
+ err = -EACCES;
+ if ((rt->rt_flags & RTCF_BROADCAST) &&
+ !sock_flag(sk, SOCK_BROADCAST))
+ goto out;
+ if (connected)
+ sk_dst_set(sk, dst_clone(&rt->dst));
+ }
+
+ if (msg->msg_flags&MSG_CONFIRM)
+ goto do_confirm;
+back_from_confirm:
+
+ saddr = fl4->saddr;
+ if (!ipc.addr)
+ daddr = ipc.addr = fl4->daddr;
+
+ /* Lockless fast path for the non-corking case. */
+ if (!corkreq) {
+ skb = ip_make_skb(sk, fl4, getfrag, msg, ulen,
+ sizeof(struct udphdr), &ipc, &rt,
+ msg->msg_flags);
+ err = PTR_ERR(skb);
+ if (!IS_ERR_OR_NULL(skb))
+ err = udp_send_skb(skb, fl4);
+ goto out;
+ }
+
+ lock_sock(sk);
+ if (unlikely(up->pending)) {
+ /* The socket is already corked while preparing it. */
+ /* ... which is an evident application bug. --ANK */
+ release_sock(sk);
+
+ net_dbg_ratelimited("cork app bug 2\n");
+ err = -EINVAL;
+ goto out;
+ }
+ /*
+ * Now cork the socket to pend data.
+ */
+ fl4 = &inet->cork.fl.u.ip4;
+ fl4->daddr = daddr;
+ fl4->saddr = saddr;
+ fl4->fl4_dport = dport;
+ fl4->fl4_sport = inet->inet_sport;
+ up->pending = AF_INET;
+
+do_append_data:
+ up->len += ulen;
+ err = ip_append_data(sk, fl4, getfrag, msg, ulen,
+ sizeof(struct udphdr), &ipc, &rt,
+ corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags);
+ if (err)
+ udp_flush_pending_frames(sk);
+ else if (!corkreq)
+ err = udp_push_pending_frames(sk);
+ else if (unlikely(skb_queue_empty(&sk->sk_write_queue)))
+ up->pending = 0;
+ release_sock(sk);
+
+out:
+ ip_rt_put(rt);
+ if (free)
+ kfree(ipc.opt);
+ if (!err)
+ return len;
+ /*
+ * ENOBUFS = no kernel mem, SOCK_NOSPACE = no sndbuf space. Reporting
+ * ENOBUFS might not be good (it's not tunable per se), but otherwise
+ * we don't have a good statistic (IpOutDiscards but it can be too many
+ * things). We could add another new stat but at least for now that
+ * seems like overkill.
+ */
+ if (err == -ENOBUFS || test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
+ UDP_INC_STATS_USER(sock_net(sk),
+ UDP_MIB_SNDBUFERRORS, is_udplite);
+ }
+ return err;
+
+do_confirm:
+ dst_confirm(&rt->dst);
+ if (!(msg->msg_flags&MSG_PROBE) || len)
+ goto back_from_confirm;
+ err = 0;
+ goto out;
+}
+EXPORT_SYMBOL(udp_sendmsg);
+
+int udp_sendpage(struct sock *sk, struct page *page, int offset,
+ size_t size, int flags)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ struct udp_sock *up = udp_sk(sk);
+ int ret;
+
+ if (flags & MSG_SENDPAGE_NOTLAST)
+ flags |= MSG_MORE;
+
+ if (!up->pending) {
+ struct msghdr msg = { .msg_flags = flags|MSG_MORE };
+
+ /* Call udp_sendmsg to specify destination address which
+ * sendpage interface can't pass.
+ * This will succeed only when the socket is connected.
+ */
+ ret = udp_sendmsg(sk, &msg, 0);
+ if (ret < 0)
+ return ret;
+ }
+
+ lock_sock(sk);
+
+ if (unlikely(!up->pending)) {
+ release_sock(sk);
+
+ net_dbg_ratelimited("udp cork app bug 3\n");
+ return -EINVAL;
+ }
+
+ ret = ip_append_page(sk, &inet->cork.fl.u.ip4,
+ page, offset, size, flags);
+ if (ret == -EOPNOTSUPP) {
+ release_sock(sk);
+ return sock_no_sendpage(sk->sk_socket, page, offset,
+ size, flags);
+ }
+ if (ret < 0) {
+ udp_flush_pending_frames(sk);
+ goto out;
+ }
+
+ up->len += size;
+ if (!(up->corkflag || (flags&MSG_MORE)))
+ ret = udp_push_pending_frames(sk);
+ if (!ret)
+ ret = size;
+out:
+ release_sock(sk);
+ return ret;
+}
+
+/**
+ * first_packet_length - return length of first packet in receive queue
+ * @sk: socket
+ *
+ * Drops all bad checksum frames, until a valid one is found.
+ * Returns the length of found skb, or 0 if none is found.
+ */
+static unsigned int first_packet_length(struct sock *sk)
+{
+ struct sk_buff_head list_kill, *rcvq = &sk->sk_receive_queue;
+ struct sk_buff *skb;
+ unsigned int res;
+
+ __skb_queue_head_init(&list_kill);
+
+ spin_lock_bh(&rcvq->lock);
+ while ((skb = skb_peek(rcvq)) != NULL &&
+ udp_lib_checksum_complete(skb)) {
+ UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_CSUMERRORS,
+ IS_UDPLITE(sk));
+ UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS,
+ IS_UDPLITE(sk));
+ atomic_inc(&sk->sk_drops);
+ __skb_unlink(skb, rcvq);
+ __skb_queue_tail(&list_kill, skb);
+ }
+ res = skb ? skb->len : 0;
+ spin_unlock_bh(&rcvq->lock);
+
+ if (!skb_queue_empty(&list_kill)) {
+ bool slow = lock_sock_fast(sk);
+
+ __skb_queue_purge(&list_kill);
+ sk_mem_reclaim_partial(sk);
+ unlock_sock_fast(sk, slow);
+ }
+ return res;
+}
+
+/*
+ * IOCTL requests applicable to the UDP protocol
+ */
+
+int udp_ioctl(struct sock *sk, int cmd, unsigned long arg)
+{
+ switch (cmd) {
+ case SIOCOUTQ:
+ {
+ int amount = sk_wmem_alloc_get(sk);
+
+ return put_user(amount, (int __user *)arg);
+ }
+
+ case SIOCINQ:
+ {
+ unsigned int amount = first_packet_length(sk);
+
+ if (amount)
+ /*
+ * We will only return the amount
+ * of this packet since that is all
+ * that will be read.
+ */
+ amount -= sizeof(struct udphdr);
+
+ return put_user(amount, (int __user *)arg);
+ }
+
+ default:
+ return -ENOIOCTLCMD;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(udp_ioctl);
+
+/*
+ * This should be easy, if there is something there we
+ * return it, otherwise we block.
+ */
+
+int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock,
+ int flags, int *addr_len)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name);
+ struct sk_buff *skb;
+ unsigned int ulen, copied;
+ int peeked, off = 0;
+ int err;
+ int is_udplite = IS_UDPLITE(sk);
+ bool slow;
+
+ if (flags & MSG_ERRQUEUE)
+ return ip_recv_error(sk, msg, len, addr_len);
+
+try_again:
+ skb = __skb_recv_datagram(sk, flags | (noblock ? MSG_DONTWAIT : 0),
+ &peeked, &off, &err);
+ if (!skb)
+ goto out;
+
+ ulen = skb->len - sizeof(struct udphdr);
+ copied = len;
+ if (copied > ulen)
+ copied = ulen;
+ else if (copied < ulen)
+ msg->msg_flags |= MSG_TRUNC;
+
+ /*
+ * If checksum is needed at all, try to do it while copying the
+ * data. If the data is truncated, or if we only want a partial
+ * coverage checksum (UDP-Lite), do it before the copy.
+ */
+
+ if (copied < ulen || UDP_SKB_CB(skb)->partial_cov) {
+ if (udp_lib_checksum_complete(skb))
+ goto csum_copy_err;
+ }
+
+ if (skb_csum_unnecessary(skb))
+ err = skb_copy_datagram_msg(skb, sizeof(struct udphdr),
+ msg, copied);
+ else {
+ err = skb_copy_and_csum_datagram_msg(skb, sizeof(struct udphdr),
+ msg);
+
+ if (err == -EINVAL)
+ goto csum_copy_err;
+ }
+
+ if (unlikely(err)) {
+ trace_kfree_skb(skb, udp_recvmsg);
+ if (!peeked) {
+ atomic_inc(&sk->sk_drops);
+ UDP_INC_STATS_USER(sock_net(sk),
+ UDP_MIB_INERRORS, is_udplite);
+ }
+ goto out_free;
+ }
+
+ if (!peeked)
+ UDP_INC_STATS_USER(sock_net(sk),
+ UDP_MIB_INDATAGRAMS, is_udplite);
+
+ sock_recv_ts_and_drops(msg, sk, skb);
+
+ /* Copy the address. */
+ if (sin) {
+ sin->sin_family = AF_INET;
+ sin->sin_port = udp_hdr(skb)->source;
+ sin->sin_addr.s_addr = ip_hdr(skb)->saddr;
+ memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
+ *addr_len = sizeof(*sin);
+ }
+ if (inet->cmsg_flags)
+ ip_cmsg_recv_offset(msg, skb, sizeof(struct udphdr));
+
+ err = copied;
+ if (flags & MSG_TRUNC)
+ err = ulen;
+
+out_free:
+ skb_free_datagram_locked(sk, skb);
+out:
+ return err;
+
+csum_copy_err:
+ slow = lock_sock_fast(sk);
+ if (!skb_kill_datagram(sk, skb, flags)) {
+ UDP_INC_STATS_USER(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite);
+ UDP_INC_STATS_USER(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
+ }
+ unlock_sock_fast(sk, slow);
+
+ /* starting over for a new packet, but check if we need to yield */
+ cond_resched();
+ msg->msg_flags &= ~MSG_TRUNC;
+ goto try_again;
+}
+
+int udp_disconnect(struct sock *sk, int flags)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ /*
+ * 1003.1g - break association.
+ */
+
+ sk->sk_state = TCP_CLOSE;
+ inet->inet_daddr = 0;
+ inet->inet_dport = 0;
+ sock_rps_reset_rxhash(sk);
+ sk->sk_bound_dev_if = 0;
+ if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
+ inet_reset_saddr(sk);
+
+ if (!(sk->sk_userlocks & SOCK_BINDPORT_LOCK)) {
+ sk->sk_prot->unhash(sk);
+ inet->inet_sport = 0;
+ }
+ sk_dst_reset(sk);
+ return 0;
+}
+EXPORT_SYMBOL(udp_disconnect);
+
+void udp_lib_unhash(struct sock *sk)
+{
+ if (sk_hashed(sk)) {
+ struct udp_table *udptable = sk->sk_prot->h.udp_table;
+ struct udp_hslot *hslot, *hslot2;
+
+ hslot = udp_hashslot(udptable, sock_net(sk),
+ udp_sk(sk)->udp_port_hash);
+ hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash);
+
+ spin_lock_bh(&hslot->lock);
+ if (sk_nulls_del_node_init_rcu(sk)) {
+ hslot->count--;
+ inet_sk(sk)->inet_num = 0;
+ sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
+
+ spin_lock(&hslot2->lock);
+ hlist_nulls_del_init_rcu(&udp_sk(sk)->udp_portaddr_node);
+ hslot2->count--;
+ spin_unlock(&hslot2->lock);
+ }
+ spin_unlock_bh(&hslot->lock);
+ }
+}
+EXPORT_SYMBOL(udp_lib_unhash);
+
+/*
+ * inet_rcv_saddr was changed, we must rehash secondary hash
+ */
+void udp_lib_rehash(struct sock *sk, u16 newhash)
+{
+ if (sk_hashed(sk)) {
+ struct udp_table *udptable = sk->sk_prot->h.udp_table;
+ struct udp_hslot *hslot, *hslot2, *nhslot2;
+
+ hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash);
+ nhslot2 = udp_hashslot2(udptable, newhash);
+ udp_sk(sk)->udp_portaddr_hash = newhash;
+ if (hslot2 != nhslot2) {
+ hslot = udp_hashslot(udptable, sock_net(sk),
+ udp_sk(sk)->udp_port_hash);
+ /* we must lock primary chain too */
+ spin_lock_bh(&hslot->lock);
+
+ spin_lock(&hslot2->lock);
+ hlist_nulls_del_init_rcu(&udp_sk(sk)->udp_portaddr_node);
+ hslot2->count--;
+ spin_unlock(&hslot2->lock);
+
+ spin_lock(&nhslot2->lock);
+ hlist_nulls_add_head_rcu(&udp_sk(sk)->udp_portaddr_node,
+ &nhslot2->head);
+ nhslot2->count++;
+ spin_unlock(&nhslot2->lock);
+
+ spin_unlock_bh(&hslot->lock);
+ }
+ }
+}
+EXPORT_SYMBOL(udp_lib_rehash);
+
+static void udp_v4_rehash(struct sock *sk)
+{
+ u16 new_hash = udp4_portaddr_hash(sock_net(sk),
+ inet_sk(sk)->inet_rcv_saddr,
+ inet_sk(sk)->inet_num);
+ udp_lib_rehash(sk, new_hash);
+}
+
+static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+{
+ int rc;
+
+ if (inet_sk(sk)->inet_daddr) {
+ sock_rps_save_rxhash(sk, skb);
+ sk_mark_napi_id(sk, skb);
+ sk_incoming_cpu_update(sk);
+ }
+
+ rc = sock_queue_rcv_skb(sk, skb);
+ if (rc < 0) {
+ int is_udplite = IS_UDPLITE(sk);
+
+ /* Note that an ENOMEM error is charged twice */
+ if (rc == -ENOMEM)
+ UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_RCVBUFERRORS,
+ is_udplite);
+ UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
+ kfree_skb(skb);
+ trace_udp_fail_queue_rcv_skb(rc, sk);
+ return -1;
+ }
+
+ return 0;
+
+}
+
+static struct static_key udp_encap_needed __read_mostly;
+void udp_encap_enable(void)
+{
+ if (!static_key_enabled(&udp_encap_needed))
+ static_key_slow_inc(&udp_encap_needed);
+}
+EXPORT_SYMBOL(udp_encap_enable);
+
+/* returns:
+ * -1: error
+ * 0: success
+ * >0: "udp encap" protocol resubmission
+ *
+ * Note that in the success and error cases, the skb is assumed to
+ * have either been requeued or freed.
+ */
+int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+{
+ struct udp_sock *up = udp_sk(sk);
+ int rc;
+ int is_udplite = IS_UDPLITE(sk);
+
+ /*
+ * Charge it to the socket, dropping if the queue is full.
+ */
+ if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
+ goto drop;
+ nf_reset(skb);
+
+ if (static_key_false(&udp_encap_needed) && up->encap_type) {
+ int (*encap_rcv)(struct sock *sk, struct sk_buff *skb);
+
+ /*
+ * This is an encapsulation socket so pass the skb to
+ * the socket's udp_encap_rcv() hook. Otherwise, just
+ * fall through and pass this up the UDP socket.
+ * up->encap_rcv() returns the following value:
+ * =0 if skb was successfully passed to the encap
+ * handler or was discarded by it.
+ * >0 if skb should be passed on to UDP.
+ * <0 if skb should be resubmitted as proto -N
+ */
+
+ /* if we're overly short, let UDP handle it */
+ encap_rcv = ACCESS_ONCE(up->encap_rcv);
+ if (skb->len > sizeof(struct udphdr) && encap_rcv) {
+ int ret;
+
+ /* Verify checksum before giving to encap */
+ if (udp_lib_checksum_complete(skb))
+ goto csum_error;
+
+ ret = encap_rcv(sk, skb);
+ if (ret <= 0) {
+ UDP_INC_STATS_BH(sock_net(sk),
+ UDP_MIB_INDATAGRAMS,
+ is_udplite);
+ return -ret;
+ }
+ }
+
+ /* FALLTHROUGH -- it's a UDP Packet */
+ }
+
+ /*
+ * UDP-Lite specific tests, ignored on UDP sockets
+ */
+ if ((is_udplite & UDPLITE_RECV_CC) && UDP_SKB_CB(skb)->partial_cov) {
+
+ /*
+ * MIB statistics other than incrementing the error count are
+ * disabled for the following two types of errors: these depend
+ * on the application settings, not on the functioning of the
+ * protocol stack as such.
+ *
+ * RFC 3828 here recommends (sec 3.3): "There should also be a
+ * way ... to ... at least let the receiving application block
+ * delivery of packets with coverage values less than a value
+ * provided by the application."
+ */
+ if (up->pcrlen == 0) { /* full coverage was set */
+ net_dbg_ratelimited("UDPLite: partial coverage %d while full coverage %d requested\n",
+ UDP_SKB_CB(skb)->cscov, skb->len);
+ goto drop;
+ }
+ /* The next case involves violating the min. coverage requested
+ * by the receiver. This is subtle: if receiver wants x and x is
+ * greater than the buffersize/MTU then receiver will complain
+ * that it wants x while sender emits packets of smaller size y.
+ * Therefore the above ...()->partial_cov statement is essential.
+ */
+ if (UDP_SKB_CB(skb)->cscov < up->pcrlen) {
+ net_dbg_ratelimited("UDPLite: coverage %d too small, need min %d\n",
+ UDP_SKB_CB(skb)->cscov, up->pcrlen);
+ goto drop;
+ }
+ }
+
+ if (rcu_access_pointer(sk->sk_filter) &&
+ udp_lib_checksum_complete(skb))
+ goto csum_error;
+
+ if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
+ UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_RCVBUFERRORS,
+ is_udplite);
+ goto drop;
+ }
+
+ rc = 0;
+
+ ipv4_pktinfo_prepare(sk, skb);
+ bh_lock_sock(sk);
+ if (!sock_owned_by_user(sk))
+ rc = __udp_queue_rcv_skb(sk, skb);
+ else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
+ bh_unlock_sock(sk);
+ goto drop;
+ }
+ bh_unlock_sock(sk);
+
+ return rc;
+
+csum_error:
+ UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite);
+drop:
+ UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
+ atomic_inc(&sk->sk_drops);
+ kfree_skb(skb);
+ return -1;
+}
+
+static void flush_stack(struct sock **stack, unsigned int count,
+ struct sk_buff *skb, unsigned int final)
+{
+ unsigned int i;
+ struct sk_buff *skb1 = NULL;
+ struct sock *sk;
+
+ for (i = 0; i < count; i++) {
+ sk = stack[i];
+ if (likely(!skb1))
+ skb1 = (i == final) ? skb : skb_clone(skb, GFP_ATOMIC);
+
+ if (!skb1) {
+ atomic_inc(&sk->sk_drops);
+ UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_RCVBUFERRORS,
+ IS_UDPLITE(sk));
+ UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS,
+ IS_UDPLITE(sk));
+ }
+
+ if (skb1 && udp_queue_rcv_skb(sk, skb1) <= 0)
+ skb1 = NULL;
+
+ sock_put(sk);
+ }
+ if (unlikely(skb1))
+ kfree_skb(skb1);
+}
+
+/* For TCP sockets, sk_rx_dst is protected by socket lock
+ * For UDP, we use xchg() to guard against concurrent changes.
+ */
+static void udp_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst)
+{
+ struct dst_entry *old;
+
+ dst_hold(dst);
+ old = xchg(&sk->sk_rx_dst, dst);
+ dst_release(old);
+}
+
+/*
+ * Multicasts and broadcasts go to each listener.
+ *
+ * Note: called only from the BH handler context.
+ */
+static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
+ struct udphdr *uh,
+ __be32 saddr, __be32 daddr,
+ struct udp_table *udptable,
+ int proto)
+{
+ struct sock *sk, *stack[256 / sizeof(struct sock *)];
+ struct hlist_nulls_node *node;
+ unsigned short hnum = ntohs(uh->dest);
+ struct udp_hslot *hslot = udp_hashslot(udptable, net, hnum);
+ int dif = skb->dev->ifindex;
+ unsigned int count = 0, offset = offsetof(typeof(*sk), sk_nulls_node);
+ unsigned int hash2 = 0, hash2_any = 0, use_hash2 = (hslot->count > 10);
+ bool inner_flushed = false;
+
+ if (use_hash2) {
+ hash2_any = udp4_portaddr_hash(net, htonl(INADDR_ANY), hnum) &
+ udp_table.mask;
+ hash2 = udp4_portaddr_hash(net, daddr, hnum) & udp_table.mask;
+start_lookup:
+ hslot = &udp_table.hash2[hash2];
+ offset = offsetof(typeof(*sk), __sk_common.skc_portaddr_node);
+ }
+
+ spin_lock(&hslot->lock);
+ sk_nulls_for_each_entry_offset(sk, node, &hslot->head, offset) {
+ if (__udp_is_mcast_sock(net, sk,
+ uh->dest, daddr,
+ uh->source, saddr,
+ dif, hnum)) {
+ if (unlikely(count == ARRAY_SIZE(stack))) {
+ flush_stack(stack, count, skb, ~0);
+ inner_flushed = true;
+ count = 0;
+ }
+ stack[count++] = sk;
+ sock_hold(sk);
+ }
+ }
+
+ spin_unlock(&hslot->lock);
+
+ /* Also lookup *:port if we are using hash2 and haven't done so yet. */
+ if (use_hash2 && hash2 != hash2_any) {
+ hash2 = hash2_any;
+ goto start_lookup;
+ }
+
+ /*
+ * do the slow work with no lock held
+ */
+ if (count) {
+ flush_stack(stack, count, skb, count - 1);
+ } else {
+ if (!inner_flushed)
+ UDP_INC_STATS_BH(net, UDP_MIB_IGNOREDMULTI,
+ proto == IPPROTO_UDPLITE);
+ consume_skb(skb);
+ }
+ return 0;
+}
+
+/* Initialize UDP checksum. If exited with zero value (success),
+ * CHECKSUM_UNNECESSARY means, that no more checks are required.
+ * Otherwise, csum completion requires chacksumming packet body,
+ * including udp header and folding it to skb->csum.
+ */
+static inline int udp4_csum_init(struct sk_buff *skb, struct udphdr *uh,
+ int proto)
+{
+ int err;
+
+ UDP_SKB_CB(skb)->partial_cov = 0;
+ UDP_SKB_CB(skb)->cscov = skb->len;
+
+ if (proto == IPPROTO_UDPLITE) {
+ err = udplite_checksum_init(skb, uh);
+ if (err)
+ return err;
+ }
+
+ return skb_checksum_init_zero_check(skb, proto, uh->check,
+ inet_compute_pseudo);
+}
+
+/*
+ * All we need to do is get the socket, and then do a checksum.
+ */
+
+int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
+ int proto)
+{
+ struct sock *sk;
+ struct udphdr *uh;
+ unsigned short ulen;
+ struct rtable *rt = skb_rtable(skb);
+ __be32 saddr, daddr;
+ struct net *net = dev_net(skb->dev);
+
+ /*
+ * Validate the packet.
+ */
+ if (!pskb_may_pull(skb, sizeof(struct udphdr)))
+ goto drop; /* No space for header. */
+
+ uh = udp_hdr(skb);
+ ulen = ntohs(uh->len);
+ saddr = ip_hdr(skb)->saddr;
+ daddr = ip_hdr(skb)->daddr;
+
+ if (ulen > skb->len)
+ goto short_packet;
+
+ if (proto == IPPROTO_UDP) {
+ /* UDP validates ulen. */
+ if (ulen < sizeof(*uh) || pskb_trim_rcsum(skb, ulen))
+ goto short_packet;
+ uh = udp_hdr(skb);
+ }
+
+ if (udp4_csum_init(skb, uh, proto))
+ goto csum_error;
+
+ sk = skb_steal_sock(skb);
+ if (sk) {
+ struct dst_entry *dst = skb_dst(skb);
+ int ret;
+
+ if (unlikely(sk->sk_rx_dst != dst))
+ udp_sk_rx_dst_set(sk, dst);
+
+ ret = udp_queue_rcv_skb(sk, skb);
+ sock_put(sk);
+ /* a return value > 0 means to resubmit the input, but
+ * it wants the return to be -protocol, or 0
+ */
+ if (ret > 0)
+ return -ret;
+ return 0;
+ }
+
+ if (rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST))
+ return __udp4_lib_mcast_deliver(net, skb, uh,
+ saddr, daddr, udptable, proto);
+
+ sk = __udp4_lib_lookup_skb(skb, uh->source, uh->dest, udptable);
+ if (sk) {
+ int ret;
+
+ if (inet_get_convert_csum(sk) && uh->check && !IS_UDPLITE(sk))
+ skb_checksum_try_convert(skb, IPPROTO_UDP, uh->check,
+ inet_compute_pseudo);
+
+ ret = udp_queue_rcv_skb(sk, skb);
+ sock_put(sk);
+
+ /* a return value > 0 means to resubmit the input, but
+ * it wants the return to be -protocol, or 0
+ */
+ if (ret > 0)
+ return -ret;
+ return 0;
+ }
+
+ if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
+ goto drop;
+ nf_reset(skb);
+
+ /* No socket. Drop packet silently, if checksum is wrong */
+ if (udp_lib_checksum_complete(skb))
+ goto csum_error;
+
+ UDP_INC_STATS_BH(net, UDP_MIB_NOPORTS, proto == IPPROTO_UDPLITE);
+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
+
+ /*
+ * Hmm. We got an UDP packet to a port to which we
+ * don't wanna listen. Ignore it.
+ */
+ kfree_skb(skb);
+ return 0;
+
+short_packet:
+ net_dbg_ratelimited("UDP%s: short packet: From %pI4:%u %d/%d to %pI4:%u\n",
+ proto == IPPROTO_UDPLITE ? "Lite" : "",
+ &saddr, ntohs(uh->source),
+ ulen, skb->len,
+ &daddr, ntohs(uh->dest));
+ goto drop;
+
+csum_error:
+ /*
+ * RFC1122: OK. Discards the bad packet silently (as far as
+ * the network is concerned, anyway) as per 4.1.3.4 (MUST).
+ */
+ net_dbg_ratelimited("UDP%s: bad checksum. From %pI4:%u to %pI4:%u ulen %d\n",
+ proto == IPPROTO_UDPLITE ? "Lite" : "",
+ &saddr, ntohs(uh->source), &daddr, ntohs(uh->dest),
+ ulen);
+ UDP_INC_STATS_BH(net, UDP_MIB_CSUMERRORS, proto == IPPROTO_UDPLITE);
+drop:
+ UDP_INC_STATS_BH(net, UDP_MIB_INERRORS, proto == IPPROTO_UDPLITE);
+ kfree_skb(skb);
+ return 0;
+}
+
+/* We can only early demux multicast if there is a single matching socket.
+ * If more than one socket found returns NULL
+ */
+static struct sock *__udp4_lib_mcast_demux_lookup(struct net *net,
+ __be16 loc_port, __be32 loc_addr,
+ __be16 rmt_port, __be32 rmt_addr,
+ int dif)
+{
+ struct sock *sk, *result;
+ struct hlist_nulls_node *node;
+ unsigned short hnum = ntohs(loc_port);
+ unsigned int count, slot = udp_hashfn(net, hnum, udp_table.mask);
+ struct udp_hslot *hslot = &udp_table.hash[slot];
+
+ /* Do not bother scanning a too big list */
+ if (hslot->count > 10)
+ return NULL;
+
+ rcu_read_lock();
+begin:
+ count = 0;
+ result = NULL;
+ sk_nulls_for_each_rcu(sk, node, &hslot->head) {
+ if (__udp_is_mcast_sock(net, sk,
+ loc_port, loc_addr,
+ rmt_port, rmt_addr,
+ dif, hnum)) {
+ result = sk;
+ ++count;
+ }
+ }
+ /*
+ * if the nulls value we got at the end of this lookup is
+ * not the expected one, we must restart lookup.
+ * We probably met an item that was moved to another chain.
+ */
+ if (get_nulls_value(node) != slot)
+ goto begin;
+
+ if (result) {
+ if (count != 1 ||
+ unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2)))
+ result = NULL;
+ else if (unlikely(!__udp_is_mcast_sock(net, result,
+ loc_port, loc_addr,
+ rmt_port, rmt_addr,
+ dif, hnum))) {
+ sock_put(result);
+ result = NULL;
+ }
+ }
+ rcu_read_unlock();
+ return result;
+}
+
+/* For unicast we should only early demux connected sockets or we can
+ * break forwarding setups. The chains here can be long so only check
+ * if the first socket is an exact match and if not move on.
+ */
+static struct sock *__udp4_lib_demux_lookup(struct net *net,
+ __be16 loc_port, __be32 loc_addr,
+ __be16 rmt_port, __be32 rmt_addr,
+ int dif)
+{
+ struct sock *sk, *result;
+ struct hlist_nulls_node *node;
+ unsigned short hnum = ntohs(loc_port);
+ unsigned int hash2 = udp4_portaddr_hash(net, loc_addr, hnum);
+ unsigned int slot2 = hash2 & udp_table.mask;
+ struct udp_hslot *hslot2 = &udp_table.hash2[slot2];
+ INET_ADDR_COOKIE(acookie, rmt_addr, loc_addr);
+ const __portpair ports = INET_COMBINED_PORTS(rmt_port, hnum);
+
+ rcu_read_lock();
+ result = NULL;
+ udp_portaddr_for_each_entry_rcu(sk, node, &hslot2->head) {
+ if (INET_MATCH(sk, net, acookie,
+ rmt_addr, loc_addr, ports, dif))
+ result = sk;
+ /* Only check first socket in chain */
+ break;
+ }
+
+ if (result) {
+ if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2)))
+ result = NULL;
+ else if (unlikely(!INET_MATCH(sk, net, acookie,
+ rmt_addr, loc_addr,
+ ports, dif))) {
+ sock_put(result);
+ result = NULL;
+ }
+ }
+ rcu_read_unlock();
+ return result;
+}
+
+void udp_v4_early_demux(struct sk_buff *skb)
+{
+ struct net *net = dev_net(skb->dev);
+ const struct iphdr *iph;
+ const struct udphdr *uh;
+ struct sock *sk;
+ struct dst_entry *dst;
+ int dif = skb->dev->ifindex;
+ int ours;
+
+ /* validate the packet */
+ if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct udphdr)))
+ return;
+
+ iph = ip_hdr(skb);
+ uh = udp_hdr(skb);
+
+ if (skb->pkt_type == PACKET_BROADCAST ||
+ skb->pkt_type == PACKET_MULTICAST) {
+ struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
+
+ if (!in_dev)
+ return;
+
+ ours = ip_check_mc_rcu(in_dev, iph->daddr, iph->saddr,
+ iph->protocol);
+ if (!ours)
+ return;
+ sk = __udp4_lib_mcast_demux_lookup(net, uh->dest, iph->daddr,
+ uh->source, iph->saddr, dif);
+ } else if (skb->pkt_type == PACKET_HOST) {
+ sk = __udp4_lib_demux_lookup(net, uh->dest, iph->daddr,
+ uh->source, iph->saddr, dif);
+ } else {
+ return;
+ }
+
+ if (!sk)
+ return;
+
+ skb->sk = sk;
+ skb->destructor = sock_efree;
+ dst = sk->sk_rx_dst;
+
+ if (dst)
+ dst = dst_check(dst, 0);
+ if (dst)
+ skb_dst_set_noref(skb, dst);
+}
+
+int udp_rcv(struct sk_buff *skb)
+{
+ return __udp4_lib_rcv(skb, &udp_table, IPPROTO_UDP);
+}
+
+void udp_destroy_sock(struct sock *sk)
+{
+ struct udp_sock *up = udp_sk(sk);
+ bool slow = lock_sock_fast(sk);
+ udp_flush_pending_frames(sk);
+ unlock_sock_fast(sk, slow);
+ if (static_key_false(&udp_encap_needed) && up->encap_type) {
+ void (*encap_destroy)(struct sock *sk);
+ encap_destroy = ACCESS_ONCE(up->encap_destroy);
+ if (encap_destroy)
+ encap_destroy(sk);
+ }
+}
+
+/*
+ * Socket option code for UDP
+ */
+int udp_lib_setsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, unsigned int optlen,
+ int (*push_pending_frames)(struct sock *))
+{
+ struct udp_sock *up = udp_sk(sk);
+ int val, valbool;
+ int err = 0;
+ int is_udplite = IS_UDPLITE(sk);
+
+ if (optlen < sizeof(int))
+ return -EINVAL;
+
+ if (get_user(val, (int __user *)optval))
+ return -EFAULT;
+
+ valbool = val ? 1 : 0;
+
+ switch (optname) {
+ case UDP_CORK:
+ if (val != 0) {
+ up->corkflag = 1;
+ } else {
+ up->corkflag = 0;
+ lock_sock(sk);
+ push_pending_frames(sk);
+ release_sock(sk);
+ }
+ break;
+
+ case UDP_ENCAP:
+ switch (val) {
+ case 0:
+ case UDP_ENCAP_ESPINUDP:
+ case UDP_ENCAP_ESPINUDP_NON_IKE:
+ up->encap_rcv = xfrm4_udp_encap_rcv;
+ /* FALLTHROUGH */
+ case UDP_ENCAP_L2TPINUDP:
+ up->encap_type = val;
+ udp_encap_enable();
+ break;
+ default:
+ err = -ENOPROTOOPT;
+ break;
+ }
+ break;
+
+ case UDP_NO_CHECK6_TX:
+ up->no_check6_tx = valbool;
+ break;
+
+ case UDP_NO_CHECK6_RX:
+ up->no_check6_rx = valbool;
+ break;
+
+ /*
+ * UDP-Lite's partial checksum coverage (RFC 3828).
+ */
+ /* The sender sets actual checksum coverage length via this option.
+ * The case coverage > packet length is handled by send module. */
+ case UDPLITE_SEND_CSCOV:
+ if (!is_udplite) /* Disable the option on UDP sockets */
+ return -ENOPROTOOPT;
+ if (val != 0 && val < 8) /* Illegal coverage: use default (8) */
+ val = 8;
+ else if (val > USHRT_MAX)
+ val = USHRT_MAX;
+ up->pcslen = val;
+ up->pcflag |= UDPLITE_SEND_CC;
+ break;
+
+ /* The receiver specifies a minimum checksum coverage value. To make
+ * sense, this should be set to at least 8 (as done below). If zero is
+ * used, this again means full checksum coverage. */
+ case UDPLITE_RECV_CSCOV:
+ if (!is_udplite) /* Disable the option on UDP sockets */
+ return -ENOPROTOOPT;
+ if (val != 0 && val < 8) /* Avoid silly minimal values. */
+ val = 8;
+ else if (val > USHRT_MAX)
+ val = USHRT_MAX;
+ up->pcrlen = val;
+ up->pcflag |= UDPLITE_RECV_CC;
+ break;
+
+ default:
+ err = -ENOPROTOOPT;
+ break;
+ }
+
+ return err;
+}
+EXPORT_SYMBOL(udp_lib_setsockopt);
+
+int udp_setsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, unsigned int optlen)
+{
+ if (level == SOL_UDP || level == SOL_UDPLITE)
+ return udp_lib_setsockopt(sk, level, optname, optval, optlen,
+ udp_push_pending_frames);
+ return ip_setsockopt(sk, level, optname, optval, optlen);
+}
+
+#ifdef CONFIG_COMPAT
+int compat_udp_setsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, unsigned int optlen)
+{
+ if (level == SOL_UDP || level == SOL_UDPLITE)
+ return udp_lib_setsockopt(sk, level, optname, optval, optlen,
+ udp_push_pending_frames);
+ return compat_ip_setsockopt(sk, level, optname, optval, optlen);
+}
+#endif
+
+int udp_lib_getsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, int __user *optlen)
+{
+ struct udp_sock *up = udp_sk(sk);
+ int val, len;
+
+ if (get_user(len, optlen))
+ return -EFAULT;
+
+ len = min_t(unsigned int, len, sizeof(int));
+
+ if (len < 0)
+ return -EINVAL;
+
+ switch (optname) {
+ case UDP_CORK:
+ val = up->corkflag;
+ break;
+
+ case UDP_ENCAP:
+ val = up->encap_type;
+ break;
+
+ case UDP_NO_CHECK6_TX:
+ val = up->no_check6_tx;
+ break;
+
+ case UDP_NO_CHECK6_RX:
+ val = up->no_check6_rx;
+ break;
+
+ /* The following two cannot be changed on UDP sockets, the return is
+ * always 0 (which corresponds to the full checksum coverage of UDP). */
+ case UDPLITE_SEND_CSCOV:
+ val = up->pcslen;
+ break;
+
+ case UDPLITE_RECV_CSCOV:
+ val = up->pcrlen;
+ break;
+
+ default:
+ return -ENOPROTOOPT;
+ }
+
+ if (put_user(len, optlen))
+ return -EFAULT;
+ if (copy_to_user(optval, &val, len))
+ return -EFAULT;
+ return 0;
+}
+EXPORT_SYMBOL(udp_lib_getsockopt);
+
+int udp_getsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, int __user *optlen)
+{
+ if (level == SOL_UDP || level == SOL_UDPLITE)
+ return udp_lib_getsockopt(sk, level, optname, optval, optlen);
+ return ip_getsockopt(sk, level, optname, optval, optlen);
+}
+
+#ifdef CONFIG_COMPAT
+int compat_udp_getsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, int __user *optlen)
+{
+ if (level == SOL_UDP || level == SOL_UDPLITE)
+ return udp_lib_getsockopt(sk, level, optname, optval, optlen);
+ return compat_ip_getsockopt(sk, level, optname, optval, optlen);
+}
+#endif
+/**
+ * udp_poll - wait for a UDP event.
+ * @file - file struct
+ * @sock - socket
+ * @wait - poll table
+ *
+ * This is same as datagram poll, except for the special case of
+ * blocking sockets. If application is using a blocking fd
+ * and a packet with checksum error is in the queue;
+ * then it could get return from select indicating data available
+ * but then block when reading it. Add special case code
+ * to work around these arguably broken applications.
+ */
+unsigned int udp_poll(struct file *file, struct socket *sock, poll_table *wait)
+{
+ unsigned int mask = datagram_poll(file, sock, wait);
+ struct sock *sk = sock->sk;
+
+ sock_rps_record_flow(sk);
+
+ /* Check for false positives due to checksum errors */
+ if ((mask & POLLRDNORM) && !(file->f_flags & O_NONBLOCK) &&
+ !(sk->sk_shutdown & RCV_SHUTDOWN) && !first_packet_length(sk))
+ mask &= ~(POLLIN | POLLRDNORM);
+
+ return mask;
+
+}
+EXPORT_SYMBOL(udp_poll);
+
+struct proto udp_prot = {
+ .name = "UDP",
+ .owner = THIS_MODULE,
+ .close = udp_lib_close,
+ .connect = ip4_datagram_connect,
+ .disconnect = udp_disconnect,
+ .ioctl = udp_ioctl,
+ .destroy = udp_destroy_sock,
+ .setsockopt = udp_setsockopt,
+ .getsockopt = udp_getsockopt,
+ .sendmsg = udp_sendmsg,
+ .recvmsg = udp_recvmsg,
+ .sendpage = udp_sendpage,
+ .backlog_rcv = __udp_queue_rcv_skb,
+ .release_cb = ip4_datagram_release_cb,
+ .hash = udp_lib_hash,
+ .unhash = udp_lib_unhash,
+ .rehash = udp_v4_rehash,
+ .get_port = udp_v4_get_port,
+ .memory_allocated = &udp_memory_allocated,
+ .sysctl_mem = sysctl_udp_mem,
+ .sysctl_wmem = &sysctl_udp_wmem_min,
+ .sysctl_rmem = &sysctl_udp_rmem_min,
+ .obj_size = sizeof(struct udp_sock),
+ .slab_flags = SLAB_DESTROY_BY_RCU,
+ .h.udp_table = &udp_table,
+#ifdef CONFIG_COMPAT
+ .compat_setsockopt = compat_udp_setsockopt,
+ .compat_getsockopt = compat_udp_getsockopt,
+#endif
+ .clear_sk = sk_prot_clear_portaddr_nulls,
+};
+EXPORT_SYMBOL(udp_prot);
+
+/* ------------------------------------------------------------------------ */
+#ifdef CONFIG_PROC_FS
+
+static struct sock *udp_get_first(struct seq_file *seq, int start)
+{
+ struct sock *sk;
+ struct udp_iter_state *state = seq->private;
+ struct net *net = seq_file_net(seq);
+
+ for (state->bucket = start; state->bucket <= state->udp_table->mask;
+ ++state->bucket) {
+ struct hlist_nulls_node *node;
+ struct udp_hslot *hslot = &state->udp_table->hash[state->bucket];
+
+ if (hlist_nulls_empty(&hslot->head))
+ continue;
+
+ spin_lock_bh(&hslot->lock);
+ sk_nulls_for_each(sk, node, &hslot->head) {
+ if (!net_eq(sock_net(sk), net))
+ continue;
+ if (sk->sk_family == state->family)
+ goto found;
+ }
+ spin_unlock_bh(&hslot->lock);
+ }
+ sk = NULL;
+found:
+ return sk;
+}
+
+static struct sock *udp_get_next(struct seq_file *seq, struct sock *sk)
+{
+ struct udp_iter_state *state = seq->private;
+ struct net *net = seq_file_net(seq);
+
+ do {
+ sk = sk_nulls_next(sk);
+ } while (sk && (!net_eq(sock_net(sk), net) || sk->sk_family != state->family));
+
+ if (!sk) {
+ if (state->bucket <= state->udp_table->mask)
+ spin_unlock_bh(&state->udp_table->hash[state->bucket].lock);
+ return udp_get_first(seq, state->bucket + 1);
+ }
+ return sk;
+}
+
+static struct sock *udp_get_idx(struct seq_file *seq, loff_t pos)
+{
+ struct sock *sk = udp_get_first(seq, 0);
+
+ if (sk)
+ while (pos && (sk = udp_get_next(seq, sk)) != NULL)
+ --pos;
+ return pos ? NULL : sk;
+}
+
+static void *udp_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ struct udp_iter_state *state = seq->private;
+ state->bucket = MAX_UDP_PORTS;
+
+ return *pos ? udp_get_idx(seq, *pos-1) : SEQ_START_TOKEN;
+}
+
+static void *udp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct sock *sk;
+
+ if (v == SEQ_START_TOKEN)
+ sk = udp_get_idx(seq, 0);
+ else
+ sk = udp_get_next(seq, v);
+
+ ++*pos;
+ return sk;
+}
+
+static void udp_seq_stop(struct seq_file *seq, void *v)
+{
+ struct udp_iter_state *state = seq->private;
+
+ if (state->bucket <= state->udp_table->mask)
+ spin_unlock_bh(&state->udp_table->hash[state->bucket].lock);
+}
+
+int udp_seq_open(struct inode *inode, struct file *file)
+{
+ struct udp_seq_afinfo *afinfo = PDE_DATA(inode);
+ struct udp_iter_state *s;
+ int err;
+
+ err = seq_open_net(inode, file, &afinfo->seq_ops,
+ sizeof(struct udp_iter_state));
+ if (err < 0)
+ return err;
+
+ s = ((struct seq_file *)file->private_data)->private;
+ s->family = afinfo->family;
+ s->udp_table = afinfo->udp_table;
+ return err;
+}
+EXPORT_SYMBOL(udp_seq_open);
+
+/* ------------------------------------------------------------------------ */
+int udp_proc_register(struct net *net, struct udp_seq_afinfo *afinfo)
+{
+ struct proc_dir_entry *p;
+ int rc = 0;
+
+ afinfo->seq_ops.start = udp_seq_start;
+ afinfo->seq_ops.next = udp_seq_next;
+ afinfo->seq_ops.stop = udp_seq_stop;
+
+ p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
+ afinfo->seq_fops, afinfo);
+ if (!p)
+ rc = -ENOMEM;
+ return rc;
+}
+EXPORT_SYMBOL(udp_proc_register);
+
+void udp_proc_unregister(struct net *net, struct udp_seq_afinfo *afinfo)
+{
+ remove_proc_entry(afinfo->name, net->proc_net);
+}
+EXPORT_SYMBOL(udp_proc_unregister);
+
+/* ------------------------------------------------------------------------ */
+static void udp4_format_sock(struct sock *sp, struct seq_file *f,
+ int bucket)
+{
+ struct inet_sock *inet = inet_sk(sp);
+ __be32 dest = inet->inet_daddr;
+ __be32 src = inet->inet_rcv_saddr;
+ __u16 destp = ntohs(inet->inet_dport);
+ __u16 srcp = ntohs(inet->inet_sport);
+
+ seq_printf(f, "%5d: %08X:%04X %08X:%04X"
+ " %02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %d",
+ bucket, src, srcp, dest, destp, sp->sk_state,
+ sk_wmem_alloc_get(sp),
+ sk_rmem_alloc_get(sp),
+ 0, 0L, 0,
+ from_kuid_munged(seq_user_ns(f), sock_i_uid(sp)),
+ 0, sock_i_ino(sp),
+ atomic_read(&sp->sk_refcnt), sp,
+ atomic_read(&sp->sk_drops));
+}
+
+int udp4_seq_show(struct seq_file *seq, void *v)
+{
+ seq_setwidth(seq, 127);
+ if (v == SEQ_START_TOKEN)
+ seq_puts(seq, " sl local_address rem_address st tx_queue "
+ "rx_queue tr tm->when retrnsmt uid timeout "
+ "inode ref pointer drops");
+ else {
+ struct udp_iter_state *state = seq->private;
+
+ udp4_format_sock(v, seq, state->bucket);
+ }
+ seq_pad(seq, '\n');
+ return 0;
+}
+
+static const struct file_operations udp_afinfo_seq_fops = {
+ .owner = THIS_MODULE,
+ .open = udp_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_net
+};
+
+/* ------------------------------------------------------------------------ */
+static struct udp_seq_afinfo udp4_seq_afinfo = {
+ .name = "udp",
+ .family = AF_INET,
+ .udp_table = &udp_table,
+ .seq_fops = &udp_afinfo_seq_fops,
+ .seq_ops = {
+ .show = udp4_seq_show,
+ },
+};
+
+static int __net_init udp4_proc_init_net(struct net *net)
+{
+ return udp_proc_register(net, &udp4_seq_afinfo);
+}
+
+static void __net_exit udp4_proc_exit_net(struct net *net)
+{
+ udp_proc_unregister(net, &udp4_seq_afinfo);
+}
+
+static struct pernet_operations udp4_net_ops = {
+ .init = udp4_proc_init_net,
+ .exit = udp4_proc_exit_net,
+};
+
+int __init udp4_proc_init(void)
+{
+ return register_pernet_subsys(&udp4_net_ops);
+}
+
+void udp4_proc_exit(void)
+{
+ unregister_pernet_subsys(&udp4_net_ops);
+}
+#endif /* CONFIG_PROC_FS */
+
+static __initdata unsigned long uhash_entries;
+static int __init set_uhash_entries(char *str)
+{
+ ssize_t ret;
+
+ if (!str)
+ return 0;
+
+ ret = kstrtoul(str, 0, &uhash_entries);
+ if (ret)
+ return 0;
+
+ if (uhash_entries && uhash_entries < UDP_HTABLE_SIZE_MIN)
+ uhash_entries = UDP_HTABLE_SIZE_MIN;
+ return 1;
+}
+__setup("uhash_entries=", set_uhash_entries);
+
+void __init udp_table_init(struct udp_table *table, const char *name)
+{
+ unsigned int i;
+
+ table->hash = alloc_large_system_hash(name,
+ 2 * sizeof(struct udp_hslot),
+ uhash_entries,
+ 21, /* one slot per 2 MB */
+ 0,
+ &table->log,
+ &table->mask,
+ UDP_HTABLE_SIZE_MIN,
+ 64 * 1024);
+
+ table->hash2 = table->hash + (table->mask + 1);
+ for (i = 0; i <= table->mask; i++) {
+ INIT_HLIST_NULLS_HEAD(&table->hash[i].head, i);
+ table->hash[i].count = 0;
+ spin_lock_init(&table->hash[i].lock);
+ }
+ for (i = 0; i <= table->mask; i++) {
+ INIT_HLIST_NULLS_HEAD(&table->hash2[i].head, i);
+ table->hash2[i].count = 0;
+ spin_lock_init(&table->hash2[i].lock);
+ }
+}
+
+u32 udp_flow_hashrnd(void)
+{
+ static u32 hashrnd __read_mostly;
+
+ net_get_random_once(&hashrnd, sizeof(hashrnd));
+
+ return hashrnd;
+}
+EXPORT_SYMBOL(udp_flow_hashrnd);
+
+void __init udp_init(void)
+{
+ unsigned long limit;
+
+ udp_table_init(&udp_table, "UDP");
+ limit = nr_free_buffer_pages() / 8;
+ limit = max(limit, 128UL);
+ sysctl_udp_mem[0] = limit / 4 * 3;
+ sysctl_udp_mem[1] = limit;
+ sysctl_udp_mem[2] = sysctl_udp_mem[0] * 2;
+
+ sysctl_udp_rmem_min = SK_MEM_QUANTUM;
+ sysctl_udp_wmem_min = SK_MEM_QUANTUM;
+}
diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c
new file mode 100644
index 000000000..b763c39ae
--- /dev/null
+++ b/net/ipv4/udp_diag.c
@@ -0,0 +1,222 @@
+/*
+ * udp_diag.c Module for monitoring UDP transport protocols sockets.
+ *
+ * Authors: Pavel Emelyanov, <xemul@parallels.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+
+#include <linux/module.h>
+#include <linux/inet_diag.h>
+#include <linux/udp.h>
+#include <net/udp.h>
+#include <net/udplite.h>
+#include <linux/sock_diag.h>
+
+static int sk_diag_dump(struct sock *sk, struct sk_buff *skb,
+ struct netlink_callback *cb,
+ const struct inet_diag_req_v2 *req,
+ struct nlattr *bc)
+{
+ if (!inet_diag_bc_sk(bc, sk))
+ return 0;
+
+ return inet_sk_diag_fill(sk, NULL, skb, req,
+ sk_user_ns(NETLINK_CB(cb->skb).sk),
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh);
+}
+
+static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb,
+ const struct nlmsghdr *nlh,
+ const struct inet_diag_req_v2 *req)
+{
+ int err = -EINVAL;
+ struct sock *sk;
+ struct sk_buff *rep;
+ struct net *net = sock_net(in_skb->sk);
+
+ if (req->sdiag_family == AF_INET)
+ sk = __udp4_lib_lookup(net,
+ req->id.idiag_src[0], req->id.idiag_sport,
+ req->id.idiag_dst[0], req->id.idiag_dport,
+ req->id.idiag_if, tbl);
+#if IS_ENABLED(CONFIG_IPV6)
+ else if (req->sdiag_family == AF_INET6)
+ sk = __udp6_lib_lookup(net,
+ (struct in6_addr *)req->id.idiag_src,
+ req->id.idiag_sport,
+ (struct in6_addr *)req->id.idiag_dst,
+ req->id.idiag_dport,
+ req->id.idiag_if, tbl);
+#endif
+ else
+ goto out_nosk;
+
+ err = -ENOENT;
+ if (!sk)
+ goto out_nosk;
+
+ err = sock_diag_check_cookie(sk, req->id.idiag_cookie);
+ if (err)
+ goto out;
+
+ err = -ENOMEM;
+ rep = nlmsg_new(sizeof(struct inet_diag_msg) +
+ sizeof(struct inet_diag_meminfo) + 64,
+ GFP_KERNEL);
+ if (!rep)
+ goto out;
+
+ err = inet_sk_diag_fill(sk, NULL, rep, req,
+ sk_user_ns(NETLINK_CB(in_skb).sk),
+ NETLINK_CB(in_skb).portid,
+ nlh->nlmsg_seq, 0, nlh);
+ if (err < 0) {
+ WARN_ON(err == -EMSGSIZE);
+ kfree_skb(rep);
+ goto out;
+ }
+ err = netlink_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid,
+ MSG_DONTWAIT);
+ if (err > 0)
+ err = 0;
+out:
+ if (sk)
+ sock_put(sk);
+out_nosk:
+ return err;
+}
+
+static void udp_dump(struct udp_table *table, struct sk_buff *skb,
+ struct netlink_callback *cb,
+ const struct inet_diag_req_v2 *r, struct nlattr *bc)
+{
+ int num, s_num, slot, s_slot;
+ struct net *net = sock_net(skb->sk);
+
+ s_slot = cb->args[0];
+ num = s_num = cb->args[1];
+
+ for (slot = s_slot; slot <= table->mask; s_num = 0, slot++) {
+ struct sock *sk;
+ struct hlist_nulls_node *node;
+ struct udp_hslot *hslot = &table->hash[slot];
+
+ num = 0;
+
+ if (hlist_nulls_empty(&hslot->head))
+ continue;
+
+ spin_lock_bh(&hslot->lock);
+ sk_nulls_for_each(sk, node, &hslot->head) {
+ struct inet_sock *inet = inet_sk(sk);
+
+ if (!net_eq(sock_net(sk), net))
+ continue;
+ if (num < s_num)
+ goto next;
+ if (!(r->idiag_states & (1 << sk->sk_state)))
+ goto next;
+ if (r->sdiag_family != AF_UNSPEC &&
+ sk->sk_family != r->sdiag_family)
+ goto next;
+ if (r->id.idiag_sport != inet->inet_sport &&
+ r->id.idiag_sport)
+ goto next;
+ if (r->id.idiag_dport != inet->inet_dport &&
+ r->id.idiag_dport)
+ goto next;
+
+ if (sk_diag_dump(sk, skb, cb, r, bc) < 0) {
+ spin_unlock_bh(&hslot->lock);
+ goto done;
+ }
+next:
+ num++;
+ }
+ spin_unlock_bh(&hslot->lock);
+ }
+done:
+ cb->args[0] = slot;
+ cb->args[1] = num;
+}
+
+static void udp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
+ const struct inet_diag_req_v2 *r, struct nlattr *bc)
+{
+ udp_dump(&udp_table, skb, cb, r, bc);
+}
+
+static int udp_diag_dump_one(struct sk_buff *in_skb, const struct nlmsghdr *nlh,
+ const struct inet_diag_req_v2 *req)
+{
+ return udp_dump_one(&udp_table, in_skb, nlh, req);
+}
+
+static void udp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
+ void *info)
+{
+ r->idiag_rqueue = sk_rmem_alloc_get(sk);
+ r->idiag_wqueue = sk_wmem_alloc_get(sk);
+}
+
+static const struct inet_diag_handler udp_diag_handler = {
+ .dump = udp_diag_dump,
+ .dump_one = udp_diag_dump_one,
+ .idiag_get_info = udp_diag_get_info,
+ .idiag_type = IPPROTO_UDP,
+};
+
+static void udplite_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
+ const struct inet_diag_req_v2 *r,
+ struct nlattr *bc)
+{
+ udp_dump(&udplite_table, skb, cb, r, bc);
+}
+
+static int udplite_diag_dump_one(struct sk_buff *in_skb, const struct nlmsghdr *nlh,
+ const struct inet_diag_req_v2 *req)
+{
+ return udp_dump_one(&udplite_table, in_skb, nlh, req);
+}
+
+static const struct inet_diag_handler udplite_diag_handler = {
+ .dump = udplite_diag_dump,
+ .dump_one = udplite_diag_dump_one,
+ .idiag_get_info = udp_diag_get_info,
+ .idiag_type = IPPROTO_UDPLITE,
+};
+
+static int __init udp_diag_init(void)
+{
+ int err;
+
+ err = inet_diag_register(&udp_diag_handler);
+ if (err)
+ goto out;
+ err = inet_diag_register(&udplite_diag_handler);
+ if (err)
+ goto out_lite;
+out:
+ return err;
+out_lite:
+ inet_diag_unregister(&udp_diag_handler);
+ goto out;
+}
+
+static void __exit udp_diag_exit(void)
+{
+ inet_diag_unregister(&udplite_diag_handler);
+ inet_diag_unregister(&udp_diag_handler);
+}
+
+module_init(udp_diag_init);
+module_exit(udp_diag_exit);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-17 /* AF_INET - IPPROTO_UDP */);
+MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-136 /* AF_INET - IPPROTO_UDPLITE */);
diff --git a/net/ipv4/udp_impl.h b/net/ipv4/udp_impl.h
new file mode 100644
index 000000000..7e0fe4bdd
--- /dev/null
+++ b/net/ipv4/udp_impl.h
@@ -0,0 +1,34 @@
+#ifndef _UDP4_IMPL_H
+#define _UDP4_IMPL_H
+#include <net/udp.h>
+#include <net/udplite.h>
+#include <net/protocol.h>
+#include <net/inet_common.h>
+
+int __udp4_lib_rcv(struct sk_buff *, struct udp_table *, int);
+void __udp4_lib_err(struct sk_buff *, u32, struct udp_table *);
+
+int udp_v4_get_port(struct sock *sk, unsigned short snum);
+
+int udp_setsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, unsigned int optlen);
+int udp_getsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, int __user *optlen);
+
+#ifdef CONFIG_COMPAT
+int compat_udp_setsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, unsigned int optlen);
+int compat_udp_getsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, int __user *optlen);
+#endif
+int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock,
+ int flags, int *addr_len);
+int udp_sendpage(struct sock *sk, struct page *page, int offset, size_t size,
+ int flags);
+int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
+void udp_destroy_sock(struct sock *sk);
+
+#ifdef CONFIG_PROC_FS
+int udp4_seq_show(struct seq_file *seq, void *v);
+#endif
+#endif /* _UDP4_IMPL_H */
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
new file mode 100644
index 000000000..f9386160c
--- /dev/null
+++ b/net/ipv4/udp_offload.c
@@ -0,0 +1,442 @@
+/*
+ * IPV4 GSO/GRO offload support
+ * Linux INET implementation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * UDPv4 GSO support
+ */
+
+#include <linux/skbuff.h>
+#include <net/udp.h>
+#include <net/protocol.h>
+
+static DEFINE_SPINLOCK(udp_offload_lock);
+static struct udp_offload_priv __rcu *udp_offload_base __read_mostly;
+
+#define udp_deref_protected(X) rcu_dereference_protected(X, lockdep_is_held(&udp_offload_lock))
+
+struct udp_offload_priv {
+ struct udp_offload *offload;
+ struct rcu_head rcu;
+ struct udp_offload_priv __rcu *next;
+};
+
+static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
+ netdev_features_t features,
+ struct sk_buff *(*gso_inner_segment)(struct sk_buff *skb,
+ netdev_features_t features),
+ __be16 new_protocol, bool is_ipv6)
+{
+ struct sk_buff *segs = ERR_PTR(-EINVAL);
+ u16 mac_offset = skb->mac_header;
+ int mac_len = skb->mac_len;
+ int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb);
+ __be16 protocol = skb->protocol;
+ netdev_features_t enc_features;
+ int udp_offset, outer_hlen;
+ unsigned int oldlen;
+ bool need_csum = !!(skb_shinfo(skb)->gso_type &
+ SKB_GSO_UDP_TUNNEL_CSUM);
+ bool remcsum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_TUNNEL_REMCSUM);
+ bool offload_csum = false, dont_encap = (need_csum || remcsum);
+
+ oldlen = (u16)~skb->len;
+
+ if (unlikely(!pskb_may_pull(skb, tnl_hlen)))
+ goto out;
+
+ skb->encapsulation = 0;
+ __skb_pull(skb, tnl_hlen);
+ skb_reset_mac_header(skb);
+ skb_set_network_header(skb, skb_inner_network_offset(skb));
+ skb->mac_len = skb_inner_network_offset(skb);
+ skb->protocol = new_protocol;
+ skb->encap_hdr_csum = need_csum;
+ skb->remcsum_offload = remcsum;
+
+ /* Try to offload checksum if possible */
+ offload_csum = !!(need_csum &&
+ (skb->dev->features &
+ (is_ipv6 ? NETIF_F_V6_CSUM : NETIF_F_V4_CSUM)));
+
+ /* segment inner packet. */
+ enc_features = skb->dev->hw_enc_features & features;
+ segs = gso_inner_segment(skb, enc_features);
+ if (IS_ERR_OR_NULL(segs)) {
+ skb_gso_error_unwind(skb, protocol, tnl_hlen, mac_offset,
+ mac_len);
+ goto out;
+ }
+
+ outer_hlen = skb_tnl_header_len(skb);
+ udp_offset = outer_hlen - tnl_hlen;
+ skb = segs;
+ do {
+ struct udphdr *uh;
+ int len;
+ __be32 delta;
+
+ if (dont_encap) {
+ skb->encapsulation = 0;
+ skb->ip_summed = CHECKSUM_NONE;
+ } else {
+ /* Only set up inner headers if we might be offloading
+ * inner checksum.
+ */
+ skb_reset_inner_headers(skb);
+ skb->encapsulation = 1;
+ }
+
+ skb->mac_len = mac_len;
+ skb->protocol = protocol;
+
+ skb_push(skb, outer_hlen);
+ skb_reset_mac_header(skb);
+ skb_set_network_header(skb, mac_len);
+ skb_set_transport_header(skb, udp_offset);
+ len = skb->len - udp_offset;
+ uh = udp_hdr(skb);
+ uh->len = htons(len);
+
+ if (!need_csum)
+ continue;
+
+ delta = htonl(oldlen + len);
+
+ uh->check = ~csum_fold((__force __wsum)
+ ((__force u32)uh->check +
+ (__force u32)delta));
+ if (offload_csum) {
+ skb->ip_summed = CHECKSUM_PARTIAL;
+ skb->csum_start = skb_transport_header(skb) - skb->head;
+ skb->csum_offset = offsetof(struct udphdr, check);
+ } else if (remcsum) {
+ /* Need to calculate checksum from scratch,
+ * inner checksums are never when doing
+ * remote_checksum_offload.
+ */
+
+ skb->csum = skb_checksum(skb, udp_offset,
+ skb->len - udp_offset,
+ 0);
+ uh->check = csum_fold(skb->csum);
+ if (uh->check == 0)
+ uh->check = CSUM_MANGLED_0;
+ } else {
+ uh->check = gso_make_checksum(skb, ~uh->check);
+
+ if (uh->check == 0)
+ uh->check = CSUM_MANGLED_0;
+ }
+ } while ((skb = skb->next));
+out:
+ return segs;
+}
+
+struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb,
+ netdev_features_t features,
+ bool is_ipv6)
+{
+ __be16 protocol = skb->protocol;
+ const struct net_offload **offloads;
+ const struct net_offload *ops;
+ struct sk_buff *segs = ERR_PTR(-EINVAL);
+ struct sk_buff *(*gso_inner_segment)(struct sk_buff *skb,
+ netdev_features_t features);
+
+ rcu_read_lock();
+
+ switch (skb->inner_protocol_type) {
+ case ENCAP_TYPE_ETHER:
+ protocol = skb->inner_protocol;
+ gso_inner_segment = skb_mac_gso_segment;
+ break;
+ case ENCAP_TYPE_IPPROTO:
+ offloads = is_ipv6 ? inet6_offloads : inet_offloads;
+ ops = rcu_dereference(offloads[skb->inner_ipproto]);
+ if (!ops || !ops->callbacks.gso_segment)
+ goto out_unlock;
+ gso_inner_segment = ops->callbacks.gso_segment;
+ break;
+ default:
+ goto out_unlock;
+ }
+
+ segs = __skb_udp_tunnel_segment(skb, features, gso_inner_segment,
+ protocol, is_ipv6);
+
+out_unlock:
+ rcu_read_unlock();
+
+ return segs;
+}
+
+static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb,
+ netdev_features_t features)
+{
+ struct sk_buff *segs = ERR_PTR(-EINVAL);
+ unsigned int mss;
+ __wsum csum;
+ struct udphdr *uh;
+ struct iphdr *iph;
+
+ if (skb->encapsulation &&
+ (skb_shinfo(skb)->gso_type &
+ (SKB_GSO_UDP_TUNNEL|SKB_GSO_UDP_TUNNEL_CSUM))) {
+ segs = skb_udp_tunnel_segment(skb, features, false);
+ goto out;
+ }
+
+ if (!pskb_may_pull(skb, sizeof(struct udphdr)))
+ goto out;
+
+ mss = skb_shinfo(skb)->gso_size;
+ if (unlikely(skb->len <= mss))
+ goto out;
+
+ if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) {
+ /* Packet is from an untrusted source, reset gso_segs. */
+ int type = skb_shinfo(skb)->gso_type;
+
+ if (unlikely(type & ~(SKB_GSO_UDP | SKB_GSO_DODGY |
+ SKB_GSO_UDP_TUNNEL |
+ SKB_GSO_UDP_TUNNEL_CSUM |
+ SKB_GSO_TUNNEL_REMCSUM |
+ SKB_GSO_IPIP |
+ SKB_GSO_GRE | SKB_GSO_GRE_CSUM) ||
+ !(type & (SKB_GSO_UDP))))
+ goto out;
+
+ skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss);
+
+ segs = NULL;
+ goto out;
+ }
+
+ /* Do software UFO. Complete and fill in the UDP checksum as
+ * HW cannot do checksum of UDP packets sent as multiple
+ * IP fragments.
+ */
+
+ uh = udp_hdr(skb);
+ iph = ip_hdr(skb);
+
+ uh->check = 0;
+ csum = skb_checksum(skb, 0, skb->len, 0);
+ uh->check = udp_v4_check(skb->len, iph->saddr, iph->daddr, csum);
+ if (uh->check == 0)
+ uh->check = CSUM_MANGLED_0;
+
+ skb->ip_summed = CHECKSUM_NONE;
+
+ /* Fragment the skb. IP headers of the fragments are updated in
+ * inet_gso_segment()
+ */
+ segs = skb_segment(skb, features);
+out:
+ return segs;
+}
+
+int udp_add_offload(struct udp_offload *uo)
+{
+ struct udp_offload_priv *new_offload = kzalloc(sizeof(*new_offload), GFP_ATOMIC);
+
+ if (!new_offload)
+ return -ENOMEM;
+
+ new_offload->offload = uo;
+
+ spin_lock(&udp_offload_lock);
+ new_offload->next = udp_offload_base;
+ rcu_assign_pointer(udp_offload_base, new_offload);
+ spin_unlock(&udp_offload_lock);
+
+ return 0;
+}
+EXPORT_SYMBOL(udp_add_offload);
+
+static void udp_offload_free_routine(struct rcu_head *head)
+{
+ struct udp_offload_priv *ou_priv = container_of(head, struct udp_offload_priv, rcu);
+ kfree(ou_priv);
+}
+
+void udp_del_offload(struct udp_offload *uo)
+{
+ struct udp_offload_priv __rcu **head = &udp_offload_base;
+ struct udp_offload_priv *uo_priv;
+
+ spin_lock(&udp_offload_lock);
+
+ uo_priv = udp_deref_protected(*head);
+ for (; uo_priv != NULL;
+ uo_priv = udp_deref_protected(*head)) {
+ if (uo_priv->offload == uo) {
+ rcu_assign_pointer(*head,
+ udp_deref_protected(uo_priv->next));
+ goto unlock;
+ }
+ head = &uo_priv->next;
+ }
+ pr_warn("udp_del_offload: didn't find offload for port %d\n", ntohs(uo->port));
+unlock:
+ spin_unlock(&udp_offload_lock);
+ if (uo_priv)
+ call_rcu(&uo_priv->rcu, udp_offload_free_routine);
+}
+EXPORT_SYMBOL(udp_del_offload);
+
+struct sk_buff **udp_gro_receive(struct sk_buff **head, struct sk_buff *skb,
+ struct udphdr *uh)
+{
+ struct udp_offload_priv *uo_priv;
+ struct sk_buff *p, **pp = NULL;
+ struct udphdr *uh2;
+ unsigned int off = skb_gro_offset(skb);
+ int flush = 1;
+
+ if (NAPI_GRO_CB(skb)->udp_mark ||
+ (skb->ip_summed != CHECKSUM_PARTIAL &&
+ NAPI_GRO_CB(skb)->csum_cnt == 0 &&
+ !NAPI_GRO_CB(skb)->csum_valid))
+ goto out;
+
+ /* mark that this skb passed once through the udp gro layer */
+ NAPI_GRO_CB(skb)->udp_mark = 1;
+
+ rcu_read_lock();
+ uo_priv = rcu_dereference(udp_offload_base);
+ for (; uo_priv != NULL; uo_priv = rcu_dereference(uo_priv->next)) {
+ if (uo_priv->offload->port == uh->dest &&
+ uo_priv->offload->callbacks.gro_receive)
+ goto unflush;
+ }
+ goto out_unlock;
+
+unflush:
+ flush = 0;
+
+ for (p = *head; p; p = p->next) {
+ if (!NAPI_GRO_CB(p)->same_flow)
+ continue;
+
+ uh2 = (struct udphdr *)(p->data + off);
+
+ /* Match ports and either checksums are either both zero
+ * or nonzero.
+ */
+ if ((*(u32 *)&uh->source != *(u32 *)&uh2->source) ||
+ (!uh->check ^ !uh2->check)) {
+ NAPI_GRO_CB(p)->same_flow = 0;
+ continue;
+ }
+ }
+
+ skb_gro_pull(skb, sizeof(struct udphdr)); /* pull encapsulating udp header */
+ skb_gro_postpull_rcsum(skb, uh, sizeof(struct udphdr));
+ NAPI_GRO_CB(skb)->proto = uo_priv->offload->ipproto;
+ pp = uo_priv->offload->callbacks.gro_receive(head, skb,
+ uo_priv->offload);
+
+out_unlock:
+ rcu_read_unlock();
+out:
+ NAPI_GRO_CB(skb)->flush |= flush;
+ return pp;
+}
+
+static struct sk_buff **udp4_gro_receive(struct sk_buff **head,
+ struct sk_buff *skb)
+{
+ struct udphdr *uh = udp_gro_udphdr(skb);
+
+ if (unlikely(!uh))
+ goto flush;
+
+ /* Don't bother verifying checksum if we're going to flush anyway. */
+ if (NAPI_GRO_CB(skb)->flush)
+ goto skip;
+
+ if (skb_gro_checksum_validate_zero_check(skb, IPPROTO_UDP, uh->check,
+ inet_gro_compute_pseudo))
+ goto flush;
+ else if (uh->check)
+ skb_gro_checksum_try_convert(skb, IPPROTO_UDP, uh->check,
+ inet_gro_compute_pseudo);
+skip:
+ NAPI_GRO_CB(skb)->is_ipv6 = 0;
+ return udp_gro_receive(head, skb, uh);
+
+flush:
+ NAPI_GRO_CB(skb)->flush = 1;
+ return NULL;
+}
+
+int udp_gro_complete(struct sk_buff *skb, int nhoff)
+{
+ struct udp_offload_priv *uo_priv;
+ __be16 newlen = htons(skb->len - nhoff);
+ struct udphdr *uh = (struct udphdr *)(skb->data + nhoff);
+ int err = -ENOSYS;
+
+ uh->len = newlen;
+
+ rcu_read_lock();
+
+ uo_priv = rcu_dereference(udp_offload_base);
+ for (; uo_priv != NULL; uo_priv = rcu_dereference(uo_priv->next)) {
+ if (uo_priv->offload->port == uh->dest &&
+ uo_priv->offload->callbacks.gro_complete)
+ break;
+ }
+
+ if (uo_priv) {
+ NAPI_GRO_CB(skb)->proto = uo_priv->offload->ipproto;
+ err = uo_priv->offload->callbacks.gro_complete(skb,
+ nhoff + sizeof(struct udphdr),
+ uo_priv->offload);
+ }
+
+ rcu_read_unlock();
+
+ if (skb->remcsum_offload)
+ skb_shinfo(skb)->gso_type |= SKB_GSO_TUNNEL_REMCSUM;
+
+ skb->encapsulation = 1;
+ skb_set_inner_mac_header(skb, nhoff + sizeof(struct udphdr));
+
+ return err;
+}
+
+static int udp4_gro_complete(struct sk_buff *skb, int nhoff)
+{
+ const struct iphdr *iph = ip_hdr(skb);
+ struct udphdr *uh = (struct udphdr *)(skb->data + nhoff);
+
+ if (uh->check) {
+ skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL_CSUM;
+ uh->check = ~udp_v4_check(skb->len - nhoff, iph->saddr,
+ iph->daddr, 0);
+ } else {
+ skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL;
+ }
+
+ return udp_gro_complete(skb, nhoff);
+}
+
+static const struct net_offload udpv4_offload = {
+ .callbacks = {
+ .gso_segment = udp4_ufo_fragment,
+ .gro_receive = udp4_gro_receive,
+ .gro_complete = udp4_gro_complete,
+ },
+};
+
+int __init udpv4_offload_init(void)
+{
+ return inet_add_offload(&udpv4_offload, IPPROTO_UDP);
+}
diff --git a/net/ipv4/udp_tunnel.c b/net/ipv4/udp_tunnel.c
new file mode 100644
index 000000000..6bb98cc19
--- /dev/null
+++ b/net/ipv4/udp_tunnel.c
@@ -0,0 +1,108 @@
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/socket.h>
+#include <linux/udp.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <net/udp.h>
+#include <net/udp_tunnel.h>
+#include <net/net_namespace.h>
+
+int udp_sock_create4(struct net *net, struct udp_port_cfg *cfg,
+ struct socket **sockp)
+{
+ int err;
+ struct socket *sock = NULL;
+ struct sockaddr_in udp_addr;
+
+ err = sock_create_kern(AF_INET, SOCK_DGRAM, 0, &sock);
+ if (err < 0)
+ goto error;
+
+ sk_change_net(sock->sk, net);
+
+ udp_addr.sin_family = AF_INET;
+ udp_addr.sin_addr = cfg->local_ip;
+ udp_addr.sin_port = cfg->local_udp_port;
+ err = kernel_bind(sock, (struct sockaddr *)&udp_addr,
+ sizeof(udp_addr));
+ if (err < 0)
+ goto error;
+
+ if (cfg->peer_udp_port) {
+ udp_addr.sin_family = AF_INET;
+ udp_addr.sin_addr = cfg->peer_ip;
+ udp_addr.sin_port = cfg->peer_udp_port;
+ err = kernel_connect(sock, (struct sockaddr *)&udp_addr,
+ sizeof(udp_addr), 0);
+ if (err < 0)
+ goto error;
+ }
+
+ sock->sk->sk_no_check_tx = !cfg->use_udp_checksums;
+
+ *sockp = sock;
+ return 0;
+
+error:
+ if (sock) {
+ kernel_sock_shutdown(sock, SHUT_RDWR);
+ sk_release_kernel(sock->sk);
+ }
+ *sockp = NULL;
+ return err;
+}
+EXPORT_SYMBOL(udp_sock_create4);
+
+void setup_udp_tunnel_sock(struct net *net, struct socket *sock,
+ struct udp_tunnel_sock_cfg *cfg)
+{
+ struct sock *sk = sock->sk;
+
+ /* Disable multicast loopback */
+ inet_sk(sk)->mc_loop = 0;
+
+ /* Enable CHECKSUM_UNNECESSARY to CHECKSUM_COMPLETE conversion */
+ inet_inc_convert_csum(sk);
+
+ rcu_assign_sk_user_data(sk, cfg->sk_user_data);
+
+ udp_sk(sk)->encap_type = cfg->encap_type;
+ udp_sk(sk)->encap_rcv = cfg->encap_rcv;
+ udp_sk(sk)->encap_destroy = cfg->encap_destroy;
+
+ udp_tunnel_encap_enable(sock);
+}
+EXPORT_SYMBOL_GPL(setup_udp_tunnel_sock);
+
+int udp_tunnel_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb,
+ __be32 src, __be32 dst, __u8 tos, __u8 ttl,
+ __be16 df, __be16 src_port, __be16 dst_port,
+ bool xnet, bool nocheck)
+{
+ struct udphdr *uh;
+
+ __skb_push(skb, sizeof(*uh));
+ skb_reset_transport_header(skb);
+ uh = udp_hdr(skb);
+
+ uh->dest = dst_port;
+ uh->source = src_port;
+ uh->len = htons(skb->len);
+
+ udp_set_csum(nocheck, skb, src, dst, skb->len);
+
+ return iptunnel_xmit(sk, rt, skb, src, dst, IPPROTO_UDP,
+ tos, ttl, df, xnet);
+}
+EXPORT_SYMBOL_GPL(udp_tunnel_xmit_skb);
+
+void udp_tunnel_sock_release(struct socket *sock)
+{
+ rcu_assign_sk_user_data(sock->sk, NULL);
+ kernel_sock_shutdown(sock, SHUT_RDWR);
+ sk_release_kernel(sock->sk);
+}
+EXPORT_SYMBOL_GPL(udp_tunnel_sock_release);
+
+MODULE_LICENSE("GPL");
diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c
new file mode 100644
index 000000000..3b3efbda4
--- /dev/null
+++ b/net/ipv4/udplite.c
@@ -0,0 +1,141 @@
+/*
+ * UDPLITE An implementation of the UDP-Lite protocol (RFC 3828).
+ *
+ * Authors: Gerrit Renker <gerrit@erg.abdn.ac.uk>
+ *
+ * Changes:
+ * Fixes:
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define pr_fmt(fmt) "UDPLite: " fmt
+
+#include <linux/export.h>
+#include "udp_impl.h"
+
+struct udp_table udplite_table __read_mostly;
+EXPORT_SYMBOL(udplite_table);
+
+static int udplite_rcv(struct sk_buff *skb)
+{
+ return __udp4_lib_rcv(skb, &udplite_table, IPPROTO_UDPLITE);
+}
+
+static void udplite_err(struct sk_buff *skb, u32 info)
+{
+ __udp4_lib_err(skb, info, &udplite_table);
+}
+
+static const struct net_protocol udplite_protocol = {
+ .handler = udplite_rcv,
+ .err_handler = udplite_err,
+ .no_policy = 1,
+ .netns_ok = 1,
+};
+
+struct proto udplite_prot = {
+ .name = "UDP-Lite",
+ .owner = THIS_MODULE,
+ .close = udp_lib_close,
+ .connect = ip4_datagram_connect,
+ .disconnect = udp_disconnect,
+ .ioctl = udp_ioctl,
+ .init = udplite_sk_init,
+ .destroy = udp_destroy_sock,
+ .setsockopt = udp_setsockopt,
+ .getsockopt = udp_getsockopt,
+ .sendmsg = udp_sendmsg,
+ .recvmsg = udp_recvmsg,
+ .sendpage = udp_sendpage,
+ .backlog_rcv = udp_queue_rcv_skb,
+ .hash = udp_lib_hash,
+ .unhash = udp_lib_unhash,
+ .get_port = udp_v4_get_port,
+ .obj_size = sizeof(struct udp_sock),
+ .slab_flags = SLAB_DESTROY_BY_RCU,
+ .h.udp_table = &udplite_table,
+#ifdef CONFIG_COMPAT
+ .compat_setsockopt = compat_udp_setsockopt,
+ .compat_getsockopt = compat_udp_getsockopt,
+#endif
+ .clear_sk = sk_prot_clear_portaddr_nulls,
+};
+EXPORT_SYMBOL(udplite_prot);
+
+static struct inet_protosw udplite4_protosw = {
+ .type = SOCK_DGRAM,
+ .protocol = IPPROTO_UDPLITE,
+ .prot = &udplite_prot,
+ .ops = &inet_dgram_ops,
+ .flags = INET_PROTOSW_PERMANENT,
+};
+
+#ifdef CONFIG_PROC_FS
+
+static const struct file_operations udplite_afinfo_seq_fops = {
+ .owner = THIS_MODULE,
+ .open = udp_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_net
+};
+
+static struct udp_seq_afinfo udplite4_seq_afinfo = {
+ .name = "udplite",
+ .family = AF_INET,
+ .udp_table = &udplite_table,
+ .seq_fops = &udplite_afinfo_seq_fops,
+ .seq_ops = {
+ .show = udp4_seq_show,
+ },
+};
+
+static int __net_init udplite4_proc_init_net(struct net *net)
+{
+ return udp_proc_register(net, &udplite4_seq_afinfo);
+}
+
+static void __net_exit udplite4_proc_exit_net(struct net *net)
+{
+ udp_proc_unregister(net, &udplite4_seq_afinfo);
+}
+
+static struct pernet_operations udplite4_net_ops = {
+ .init = udplite4_proc_init_net,
+ .exit = udplite4_proc_exit_net,
+};
+
+static __init int udplite4_proc_init(void)
+{
+ return register_pernet_subsys(&udplite4_net_ops);
+}
+#else
+static inline int udplite4_proc_init(void)
+{
+ return 0;
+}
+#endif
+
+void __init udplite4_register(void)
+{
+ udp_table_init(&udplite_table, "UDP-Lite");
+ if (proto_register(&udplite_prot, 1))
+ goto out_register_err;
+
+ if (inet_add_protocol(&udplite_protocol, IPPROTO_UDPLITE) < 0)
+ goto out_unregister_proto;
+
+ inet_register_protosw(&udplite4_protosw);
+
+ if (udplite4_proc_init())
+ pr_err("%s: Cannot register /proc!\n", __func__);
+ return;
+
+out_unregister_proto:
+ proto_unregister(&udplite_prot);
+out_register_err:
+ pr_crit("%s: Cannot add UDP-Lite protocol\n", __func__);
+}
diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c
new file mode 100644
index 000000000..60b032f58
--- /dev/null
+++ b/net/ipv4/xfrm4_input.c
@@ -0,0 +1,158 @@
+/*
+ * xfrm4_input.c
+ *
+ * Changes:
+ * YOSHIFUJI Hideaki @USAGI
+ * Split up af-specific portion
+ * Derek Atkins <derek@ihtfp.com>
+ * Add Encapsulation support
+ *
+ */
+
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <net/ip.h>
+#include <net/xfrm.h>
+
+int xfrm4_extract_input(struct xfrm_state *x, struct sk_buff *skb)
+{
+ return xfrm4_extract_header(skb);
+}
+
+static inline int xfrm4_rcv_encap_finish(struct sock *sk, struct sk_buff *skb)
+{
+ if (!skb_dst(skb)) {
+ const struct iphdr *iph = ip_hdr(skb);
+
+ if (ip_route_input_noref(skb, iph->daddr, iph->saddr,
+ iph->tos, skb->dev))
+ goto drop;
+ }
+ return dst_input(skb);
+drop:
+ kfree_skb(skb);
+ return NET_RX_DROP;
+}
+
+int xfrm4_transport_finish(struct sk_buff *skb, int async)
+{
+ struct iphdr *iph = ip_hdr(skb);
+
+ iph->protocol = XFRM_MODE_SKB_CB(skb)->protocol;
+
+#ifndef CONFIG_NETFILTER
+ if (!async)
+ return -iph->protocol;
+#endif
+
+ __skb_push(skb, skb->data - skb_network_header(skb));
+ iph->tot_len = htons(skb->len);
+ ip_send_check(iph);
+
+ NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, NULL, skb,
+ skb->dev, NULL,
+ xfrm4_rcv_encap_finish);
+ return 0;
+}
+
+/* If it's a keepalive packet, then just eat it.
+ * If it's an encapsulated packet, then pass it to the
+ * IPsec xfrm input.
+ * Returns 0 if skb passed to xfrm or was dropped.
+ * Returns >0 if skb should be passed to UDP.
+ * Returns <0 if skb should be resubmitted (-ret is protocol)
+ */
+int xfrm4_udp_encap_rcv(struct sock *sk, struct sk_buff *skb)
+{
+ struct udp_sock *up = udp_sk(sk);
+ struct udphdr *uh;
+ struct iphdr *iph;
+ int iphlen, len;
+
+ __u8 *udpdata;
+ __be32 *udpdata32;
+ __u16 encap_type = up->encap_type;
+
+ /* if this is not encapsulated socket, then just return now */
+ if (!encap_type)
+ return 1;
+
+ /* If this is a paged skb, make sure we pull up
+ * whatever data we need to look at. */
+ len = skb->len - sizeof(struct udphdr);
+ if (!pskb_may_pull(skb, sizeof(struct udphdr) + min(len, 8)))
+ return 1;
+
+ /* Now we can get the pointers */
+ uh = udp_hdr(skb);
+ udpdata = (__u8 *)uh + sizeof(struct udphdr);
+ udpdata32 = (__be32 *)udpdata;
+
+ switch (encap_type) {
+ default:
+ case UDP_ENCAP_ESPINUDP:
+ /* Check if this is a keepalive packet. If so, eat it. */
+ if (len == 1 && udpdata[0] == 0xff) {
+ goto drop;
+ } else if (len > sizeof(struct ip_esp_hdr) && udpdata32[0] != 0) {
+ /* ESP Packet without Non-ESP header */
+ len = sizeof(struct udphdr);
+ } else
+ /* Must be an IKE packet.. pass it through */
+ return 1;
+ break;
+ case UDP_ENCAP_ESPINUDP_NON_IKE:
+ /* Check if this is a keepalive packet. If so, eat it. */
+ if (len == 1 && udpdata[0] == 0xff) {
+ goto drop;
+ } else if (len > 2 * sizeof(u32) + sizeof(struct ip_esp_hdr) &&
+ udpdata32[0] == 0 && udpdata32[1] == 0) {
+
+ /* ESP Packet with Non-IKE marker */
+ len = sizeof(struct udphdr) + 2 * sizeof(u32);
+ } else
+ /* Must be an IKE packet.. pass it through */
+ return 1;
+ break;
+ }
+
+ /* At this point we are sure that this is an ESPinUDP packet,
+ * so we need to remove 'len' bytes from the packet (the UDP
+ * header and optional ESP marker bytes) and then modify the
+ * protocol to ESP, and then call into the transform receiver.
+ */
+ if (skb_unclone(skb, GFP_ATOMIC))
+ goto drop;
+
+ /* Now we can update and verify the packet length... */
+ iph = ip_hdr(skb);
+ iphlen = iph->ihl << 2;
+ iph->tot_len = htons(ntohs(iph->tot_len) - len);
+ if (skb->len < iphlen + len) {
+ /* packet is too small!?! */
+ goto drop;
+ }
+
+ /* pull the data buffer up to the ESP header and set the
+ * transport header to point to ESP. Keep UDP on the stack
+ * for later.
+ */
+ __skb_pull(skb, len);
+ skb_reset_transport_header(skb);
+
+ /* process ESP */
+ return xfrm4_rcv_encap(skb, IPPROTO_ESP, 0, encap_type);
+
+drop:
+ kfree_skb(skb);
+ return 0;
+}
+
+int xfrm4_rcv(struct sk_buff *skb)
+{
+ return xfrm4_rcv_spi(skb, ip_hdr(skb)->protocol, 0);
+}
+EXPORT_SYMBOL(xfrm4_rcv);
diff --git a/net/ipv4/xfrm4_mode_beet.c b/net/ipv4/xfrm4_mode_beet.c
new file mode 100644
index 000000000..71acd0014
--- /dev/null
+++ b/net/ipv4/xfrm4_mode_beet.c
@@ -0,0 +1,156 @@
+/*
+ * xfrm4_mode_beet.c - BEET mode encapsulation for IPv4.
+ *
+ * Copyright (c) 2006 Diego Beltrami <diego.beltrami@gmail.com>
+ * Miika Komu <miika@iki.fi>
+ * Herbert Xu <herbert@gondor.apana.org.au>
+ * Abhinav Pathak <abhinav.pathak@hiit.fi>
+ * Jeff Ahrenholz <ahrenholz@gmail.com>
+ */
+
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/stringify.h>
+#include <net/dst.h>
+#include <net/ip.h>
+#include <net/xfrm.h>
+
+static void xfrm4_beet_make_header(struct sk_buff *skb)
+{
+ struct iphdr *iph = ip_hdr(skb);
+
+ iph->ihl = 5;
+ iph->version = 4;
+
+ iph->protocol = XFRM_MODE_SKB_CB(skb)->protocol;
+ iph->tos = XFRM_MODE_SKB_CB(skb)->tos;
+
+ iph->id = XFRM_MODE_SKB_CB(skb)->id;
+ iph->frag_off = XFRM_MODE_SKB_CB(skb)->frag_off;
+ iph->ttl = XFRM_MODE_SKB_CB(skb)->ttl;
+}
+
+/* Add encapsulation header.
+ *
+ * The top IP header will be constructed per draft-nikander-esp-beet-mode-06.txt.
+ */
+static int xfrm4_beet_output(struct xfrm_state *x, struct sk_buff *skb)
+{
+ struct ip_beet_phdr *ph;
+ struct iphdr *top_iph;
+ int hdrlen, optlen;
+
+ hdrlen = 0;
+ optlen = XFRM_MODE_SKB_CB(skb)->optlen;
+ if (unlikely(optlen))
+ hdrlen += IPV4_BEET_PHMAXLEN - (optlen & 4);
+
+ skb_set_network_header(skb, -x->props.header_len -
+ hdrlen + (XFRM_MODE_SKB_CB(skb)->ihl - sizeof(*top_iph)));
+ if (x->sel.family != AF_INET6)
+ skb->network_header += IPV4_BEET_PHMAXLEN;
+ skb->mac_header = skb->network_header +
+ offsetof(struct iphdr, protocol);
+ skb->transport_header = skb->network_header + sizeof(*top_iph);
+
+ xfrm4_beet_make_header(skb);
+
+ ph = (struct ip_beet_phdr *)
+ __skb_pull(skb, XFRM_MODE_SKB_CB(skb)->ihl - hdrlen);
+
+ top_iph = ip_hdr(skb);
+
+ if (unlikely(optlen)) {
+ BUG_ON(optlen < 0);
+
+ ph->padlen = 4 - (optlen & 4);
+ ph->hdrlen = optlen / 8;
+ ph->nexthdr = top_iph->protocol;
+ if (ph->padlen)
+ memset(ph + 1, IPOPT_NOP, ph->padlen);
+
+ top_iph->protocol = IPPROTO_BEETPH;
+ top_iph->ihl = sizeof(struct iphdr) / 4;
+ }
+
+ top_iph->saddr = x->props.saddr.a4;
+ top_iph->daddr = x->id.daddr.a4;
+
+ return 0;
+}
+
+static int xfrm4_beet_input(struct xfrm_state *x, struct sk_buff *skb)
+{
+ struct iphdr *iph;
+ int optlen = 0;
+ int err = -EINVAL;
+
+ if (unlikely(XFRM_MODE_SKB_CB(skb)->protocol == IPPROTO_BEETPH)) {
+ struct ip_beet_phdr *ph;
+ int phlen;
+
+ if (!pskb_may_pull(skb, sizeof(*ph)))
+ goto out;
+
+ ph = (struct ip_beet_phdr *)skb->data;
+
+ phlen = sizeof(*ph) + ph->padlen;
+ optlen = ph->hdrlen * 8 + (IPV4_BEET_PHMAXLEN - phlen);
+ if (optlen < 0 || optlen & 3 || optlen > 250)
+ goto out;
+
+ XFRM_MODE_SKB_CB(skb)->protocol = ph->nexthdr;
+
+ if (!pskb_may_pull(skb, phlen))
+ goto out;
+ __skb_pull(skb, phlen);
+ }
+
+ skb_push(skb, sizeof(*iph));
+ skb_reset_network_header(skb);
+ skb_mac_header_rebuild(skb);
+
+ xfrm4_beet_make_header(skb);
+
+ iph = ip_hdr(skb);
+
+ iph->ihl += optlen / 4;
+ iph->tot_len = htons(skb->len);
+ iph->daddr = x->sel.daddr.a4;
+ iph->saddr = x->sel.saddr.a4;
+ iph->check = 0;
+ iph->check = ip_fast_csum(skb_network_header(skb), iph->ihl);
+ err = 0;
+out:
+ return err;
+}
+
+static struct xfrm_mode xfrm4_beet_mode = {
+ .input2 = xfrm4_beet_input,
+ .input = xfrm_prepare_input,
+ .output2 = xfrm4_beet_output,
+ .output = xfrm4_prepare_output,
+ .owner = THIS_MODULE,
+ .encap = XFRM_MODE_BEET,
+ .flags = XFRM_MODE_FLAG_TUNNEL,
+};
+
+static int __init xfrm4_beet_init(void)
+{
+ return xfrm_register_mode(&xfrm4_beet_mode, AF_INET);
+}
+
+static void __exit xfrm4_beet_exit(void)
+{
+ int err;
+
+ err = xfrm_unregister_mode(&xfrm4_beet_mode, AF_INET);
+ BUG_ON(err);
+}
+
+module_init(xfrm4_beet_init);
+module_exit(xfrm4_beet_exit);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_XFRM_MODE(AF_INET, XFRM_MODE_BEET);
diff --git a/net/ipv4/xfrm4_mode_transport.c b/net/ipv4/xfrm4_mode_transport.c
new file mode 100644
index 000000000..fd840c7d7
--- /dev/null
+++ b/net/ipv4/xfrm4_mode_transport.c
@@ -0,0 +1,80 @@
+/*
+ * xfrm4_mode_transport.c - Transport mode encapsulation for IPv4.
+ *
+ * Copyright (c) 2004-2006 Herbert Xu <herbert@gondor.apana.org.au>
+ */
+
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/stringify.h>
+#include <net/dst.h>
+#include <net/ip.h>
+#include <net/xfrm.h>
+
+/* Add encapsulation header.
+ *
+ * The IP header will be moved forward to make space for the encapsulation
+ * header.
+ */
+static int xfrm4_transport_output(struct xfrm_state *x, struct sk_buff *skb)
+{
+ struct iphdr *iph = ip_hdr(skb);
+ int ihl = iph->ihl * 4;
+
+ skb_set_network_header(skb, -x->props.header_len);
+ skb->mac_header = skb->network_header +
+ offsetof(struct iphdr, protocol);
+ skb->transport_header = skb->network_header + ihl;
+ __skb_pull(skb, ihl);
+ memmove(skb_network_header(skb), iph, ihl);
+ return 0;
+}
+
+/* Remove encapsulation header.
+ *
+ * The IP header will be moved over the top of the encapsulation header.
+ *
+ * On entry, skb->h shall point to where the IP header should be and skb->nh
+ * shall be set to where the IP header currently is. skb->data shall point
+ * to the start of the payload.
+ */
+static int xfrm4_transport_input(struct xfrm_state *x, struct sk_buff *skb)
+{
+ int ihl = skb->data - skb_transport_header(skb);
+
+ if (skb->transport_header != skb->network_header) {
+ memmove(skb_transport_header(skb),
+ skb_network_header(skb), ihl);
+ skb->network_header = skb->transport_header;
+ }
+ ip_hdr(skb)->tot_len = htons(skb->len + ihl);
+ skb_reset_transport_header(skb);
+ return 0;
+}
+
+static struct xfrm_mode xfrm4_transport_mode = {
+ .input = xfrm4_transport_input,
+ .output = xfrm4_transport_output,
+ .owner = THIS_MODULE,
+ .encap = XFRM_MODE_TRANSPORT,
+};
+
+static int __init xfrm4_transport_init(void)
+{
+ return xfrm_register_mode(&xfrm4_transport_mode, AF_INET);
+}
+
+static void __exit xfrm4_transport_exit(void)
+{
+ int err;
+
+ err = xfrm_unregister_mode(&xfrm4_transport_mode, AF_INET);
+ BUG_ON(err);
+}
+
+module_init(xfrm4_transport_init);
+module_exit(xfrm4_transport_exit);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_XFRM_MODE(AF_INET, XFRM_MODE_TRANSPORT);
diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c
new file mode 100644
index 000000000..35feda676
--- /dev/null
+++ b/net/ipv4/xfrm4_mode_tunnel.c
@@ -0,0 +1,125 @@
+/*
+ * xfrm4_mode_tunnel.c - Tunnel mode encapsulation for IPv4.
+ *
+ * Copyright (c) 2004-2006 Herbert Xu <herbert@gondor.apana.org.au>
+ */
+
+#include <linux/gfp.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/stringify.h>
+#include <net/dst.h>
+#include <net/inet_ecn.h>
+#include <net/ip.h>
+#include <net/xfrm.h>
+
+static inline void ipip_ecn_decapsulate(struct sk_buff *skb)
+{
+ struct iphdr *inner_iph = ipip_hdr(skb);
+
+ if (INET_ECN_is_ce(XFRM_MODE_SKB_CB(skb)->tos))
+ IP_ECN_set_ce(inner_iph);
+}
+
+/* Add encapsulation header.
+ *
+ * The top IP header will be constructed per RFC 2401.
+ */
+static int xfrm4_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb)
+{
+ struct dst_entry *dst = skb_dst(skb);
+ struct iphdr *top_iph;
+ int flags;
+
+ skb_set_network_header(skb, -x->props.header_len);
+ skb->mac_header = skb->network_header +
+ offsetof(struct iphdr, protocol);
+ skb->transport_header = skb->network_header + sizeof(*top_iph);
+ top_iph = ip_hdr(skb);
+
+ top_iph->ihl = 5;
+ top_iph->version = 4;
+
+ top_iph->protocol = xfrm_af2proto(skb_dst(skb)->ops->family);
+
+ /* DS disclosing depends on XFRM_SA_XFLAG_DONT_ENCAP_DSCP */
+ if (x->props.extra_flags & XFRM_SA_XFLAG_DONT_ENCAP_DSCP)
+ top_iph->tos = 0;
+ else
+ top_iph->tos = XFRM_MODE_SKB_CB(skb)->tos;
+ top_iph->tos = INET_ECN_encapsulate(top_iph->tos,
+ XFRM_MODE_SKB_CB(skb)->tos);
+
+ flags = x->props.flags;
+ if (flags & XFRM_STATE_NOECN)
+ IP_ECN_clear(top_iph);
+
+ top_iph->frag_off = (flags & XFRM_STATE_NOPMTUDISC) ?
+ 0 : (XFRM_MODE_SKB_CB(skb)->frag_off & htons(IP_DF));
+
+ top_iph->ttl = ip4_dst_hoplimit(dst->child);
+
+ top_iph->saddr = x->props.saddr.a4;
+ top_iph->daddr = x->id.daddr.a4;
+ ip_select_ident(dev_net(dst->dev), skb, NULL);
+
+ return 0;
+}
+
+static int xfrm4_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb)
+{
+ int err = -EINVAL;
+
+ if (XFRM_MODE_SKB_CB(skb)->protocol != IPPROTO_IPIP)
+ goto out;
+
+ if (!pskb_may_pull(skb, sizeof(struct iphdr)))
+ goto out;
+
+ err = skb_unclone(skb, GFP_ATOMIC);
+ if (err)
+ goto out;
+
+ if (x->props.flags & XFRM_STATE_DECAP_DSCP)
+ ipv4_copy_dscp(XFRM_MODE_SKB_CB(skb)->tos, ipip_hdr(skb));
+ if (!(x->props.flags & XFRM_STATE_NOECN))
+ ipip_ecn_decapsulate(skb);
+
+ skb_reset_network_header(skb);
+ skb_mac_header_rebuild(skb);
+
+ err = 0;
+
+out:
+ return err;
+}
+
+static struct xfrm_mode xfrm4_tunnel_mode = {
+ .input2 = xfrm4_mode_tunnel_input,
+ .input = xfrm_prepare_input,
+ .output2 = xfrm4_mode_tunnel_output,
+ .output = xfrm4_prepare_output,
+ .owner = THIS_MODULE,
+ .encap = XFRM_MODE_TUNNEL,
+ .flags = XFRM_MODE_FLAG_TUNNEL,
+};
+
+static int __init xfrm4_mode_tunnel_init(void)
+{
+ return xfrm_register_mode(&xfrm4_tunnel_mode, AF_INET);
+}
+
+static void __exit xfrm4_mode_tunnel_exit(void)
+{
+ int err;
+
+ err = xfrm_unregister_mode(&xfrm4_tunnel_mode, AF_INET);
+ BUG_ON(err);
+}
+
+module_init(xfrm4_mode_tunnel_init);
+module_exit(xfrm4_mode_tunnel_exit);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_XFRM_MODE(AF_INET, XFRM_MODE_TUNNEL);
diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c
new file mode 100644
index 000000000..2878dbfff
--- /dev/null
+++ b/net/ipv4/xfrm4_output.c
@@ -0,0 +1,111 @@
+/*
+ * xfrm4_output.c - Common IPsec encapsulation code for IPv4.
+ * Copyright (c) 2004 Herbert Xu <herbert@gondor.apana.org.au>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/if_ether.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/netfilter_ipv4.h>
+#include <net/dst.h>
+#include <net/ip.h>
+#include <net/xfrm.h>
+#include <net/icmp.h>
+
+static int xfrm4_tunnel_check_size(struct sk_buff *skb)
+{
+ int mtu, ret = 0;
+
+ if (IPCB(skb)->flags & IPSKB_XFRM_TUNNEL_SIZE)
+ goto out;
+
+ if (!(ip_hdr(skb)->frag_off & htons(IP_DF)) || skb->ignore_df)
+ goto out;
+
+ mtu = dst_mtu(skb_dst(skb));
+ if (skb->len > mtu) {
+ if (skb->sk)
+ xfrm_local_error(skb, mtu);
+ else
+ icmp_send(skb, ICMP_DEST_UNREACH,
+ ICMP_FRAG_NEEDED, htonl(mtu));
+ ret = -EMSGSIZE;
+ }
+out:
+ return ret;
+}
+
+int xfrm4_extract_output(struct xfrm_state *x, struct sk_buff *skb)
+{
+ int err;
+
+ err = xfrm4_tunnel_check_size(skb);
+ if (err)
+ return err;
+
+ XFRM_MODE_SKB_CB(skb)->protocol = ip_hdr(skb)->protocol;
+
+ return xfrm4_extract_header(skb);
+}
+
+int xfrm4_prepare_output(struct xfrm_state *x, struct sk_buff *skb)
+{
+ int err;
+
+ err = xfrm_inner_extract_output(x, skb);
+ if (err)
+ return err;
+
+ IPCB(skb)->flags |= IPSKB_XFRM_TUNNEL_SIZE;
+ skb->protocol = htons(ETH_P_IP);
+
+ return x->outer_mode->output2(x, skb);
+}
+EXPORT_SYMBOL(xfrm4_prepare_output);
+
+int xfrm4_output_finish(struct sock *sk, struct sk_buff *skb)
+{
+ memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
+
+#ifdef CONFIG_NETFILTER
+ IPCB(skb)->flags |= IPSKB_XFRM_TRANSFORMED;
+#endif
+
+ return xfrm_output(sk, skb);
+}
+
+static int __xfrm4_output(struct sock *sk, struct sk_buff *skb)
+{
+ struct xfrm_state *x = skb_dst(skb)->xfrm;
+
+#ifdef CONFIG_NETFILTER
+ if (!x) {
+ IPCB(skb)->flags |= IPSKB_REROUTED;
+ return dst_output_sk(sk, skb);
+ }
+#endif
+
+ return x->outer_mode->afinfo->output_finish(sk, skb);
+}
+
+int xfrm4_output(struct sock *sk, struct sk_buff *skb)
+{
+ return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, sk, skb,
+ NULL, skb_dst(skb)->dev, __xfrm4_output,
+ !(IPCB(skb)->flags & IPSKB_REROUTED));
+}
+
+void xfrm4_local_error(struct sk_buff *skb, u32 mtu)
+{
+ struct iphdr *hdr;
+
+ hdr = skb->encapsulation ? inner_ip_hdr(skb) : ip_hdr(skb);
+ ip_local_error(skb->sk, EMSGSIZE, hdr->daddr,
+ inet_sk(skb->sk)->inet_dport, mtu);
+}
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
new file mode 100644
index 000000000..bff69746e
--- /dev/null
+++ b/net/ipv4/xfrm4_policy.c
@@ -0,0 +1,332 @@
+/*
+ * xfrm4_policy.c
+ *
+ * Changes:
+ * Kazunori MIYAZAWA @USAGI
+ * YOSHIFUJI Hideaki @USAGI
+ * Split up af-specific portion
+ *
+ */
+
+#include <linux/err.h>
+#include <linux/kernel.h>
+#include <linux/inetdevice.h>
+#include <linux/if_tunnel.h>
+#include <net/dst.h>
+#include <net/xfrm.h>
+#include <net/ip.h>
+
+static struct xfrm_policy_afinfo xfrm4_policy_afinfo;
+
+static struct dst_entry *__xfrm4_dst_lookup(struct net *net, struct flowi4 *fl4,
+ int tos,
+ const xfrm_address_t *saddr,
+ const xfrm_address_t *daddr)
+{
+ struct rtable *rt;
+
+ memset(fl4, 0, sizeof(*fl4));
+ fl4->daddr = daddr->a4;
+ fl4->flowi4_tos = tos;
+ if (saddr)
+ fl4->saddr = saddr->a4;
+
+ rt = __ip_route_output_key(net, fl4);
+ if (!IS_ERR(rt))
+ return &rt->dst;
+
+ return ERR_CAST(rt);
+}
+
+static struct dst_entry *xfrm4_dst_lookup(struct net *net, int tos,
+ const xfrm_address_t *saddr,
+ const xfrm_address_t *daddr)
+{
+ struct flowi4 fl4;
+
+ return __xfrm4_dst_lookup(net, &fl4, tos, saddr, daddr);
+}
+
+static int xfrm4_get_saddr(struct net *net,
+ xfrm_address_t *saddr, xfrm_address_t *daddr)
+{
+ struct dst_entry *dst;
+ struct flowi4 fl4;
+
+ dst = __xfrm4_dst_lookup(net, &fl4, 0, NULL, daddr);
+ if (IS_ERR(dst))
+ return -EHOSTUNREACH;
+
+ saddr->a4 = fl4.saddr;
+ dst_release(dst);
+ return 0;
+}
+
+static int xfrm4_get_tos(const struct flowi *fl)
+{
+ return IPTOS_RT_MASK & fl->u.ip4.flowi4_tos; /* Strip ECN bits */
+}
+
+static int xfrm4_init_path(struct xfrm_dst *path, struct dst_entry *dst,
+ int nfheader_len)
+{
+ return 0;
+}
+
+static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
+ const struct flowi *fl)
+{
+ struct rtable *rt = (struct rtable *)xdst->route;
+ const struct flowi4 *fl4 = &fl->u.ip4;
+
+ xdst->u.rt.rt_iif = fl4->flowi4_iif;
+
+ xdst->u.dst.dev = dev;
+ dev_hold(dev);
+
+ /* Sheit... I remember I did this right. Apparently,
+ * it was magically lost, so this code needs audit */
+ xdst->u.rt.rt_is_input = rt->rt_is_input;
+ xdst->u.rt.rt_flags = rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST |
+ RTCF_LOCAL);
+ xdst->u.rt.rt_type = rt->rt_type;
+ xdst->u.rt.rt_gateway = rt->rt_gateway;
+ xdst->u.rt.rt_uses_gateway = rt->rt_uses_gateway;
+ xdst->u.rt.rt_pmtu = rt->rt_pmtu;
+ INIT_LIST_HEAD(&xdst->u.rt.rt_uncached);
+
+ return 0;
+}
+
+static void
+_decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse)
+{
+ const struct iphdr *iph = ip_hdr(skb);
+ u8 *xprth = skb_network_header(skb) + iph->ihl * 4;
+ struct flowi4 *fl4 = &fl->u.ip4;
+ int oif = 0;
+
+ if (skb_dst(skb))
+ oif = skb_dst(skb)->dev->ifindex;
+
+ memset(fl4, 0, sizeof(struct flowi4));
+ fl4->flowi4_mark = skb->mark;
+ fl4->flowi4_oif = reverse ? skb->skb_iif : oif;
+
+ if (!ip_is_fragment(iph)) {
+ switch (iph->protocol) {
+ case IPPROTO_UDP:
+ case IPPROTO_UDPLITE:
+ case IPPROTO_TCP:
+ case IPPROTO_SCTP:
+ case IPPROTO_DCCP:
+ if (xprth + 4 < skb->data ||
+ pskb_may_pull(skb, xprth + 4 - skb->data)) {
+ __be16 *ports = (__be16 *)xprth;
+
+ fl4->fl4_sport = ports[!!reverse];
+ fl4->fl4_dport = ports[!reverse];
+ }
+ break;
+
+ case IPPROTO_ICMP:
+ if (pskb_may_pull(skb, xprth + 2 - skb->data)) {
+ u8 *icmp = xprth;
+
+ fl4->fl4_icmp_type = icmp[0];
+ fl4->fl4_icmp_code = icmp[1];
+ }
+ break;
+
+ case IPPROTO_ESP:
+ if (pskb_may_pull(skb, xprth + 4 - skb->data)) {
+ __be32 *ehdr = (__be32 *)xprth;
+
+ fl4->fl4_ipsec_spi = ehdr[0];
+ }
+ break;
+
+ case IPPROTO_AH:
+ if (pskb_may_pull(skb, xprth + 8 - skb->data)) {
+ __be32 *ah_hdr = (__be32 *)xprth;
+
+ fl4->fl4_ipsec_spi = ah_hdr[1];
+ }
+ break;
+
+ case IPPROTO_COMP:
+ if (pskb_may_pull(skb, xprth + 4 - skb->data)) {
+ __be16 *ipcomp_hdr = (__be16 *)xprth;
+
+ fl4->fl4_ipsec_spi = htonl(ntohs(ipcomp_hdr[1]));
+ }
+ break;
+
+ case IPPROTO_GRE:
+ if (pskb_may_pull(skb, xprth + 12 - skb->data)) {
+ __be16 *greflags = (__be16 *)xprth;
+ __be32 *gre_hdr = (__be32 *)xprth;
+
+ if (greflags[0] & GRE_KEY) {
+ if (greflags[0] & GRE_CSUM)
+ gre_hdr++;
+ fl4->fl4_gre_key = gre_hdr[1];
+ }
+ }
+ break;
+
+ default:
+ fl4->fl4_ipsec_spi = 0;
+ break;
+ }
+ }
+ fl4->flowi4_proto = iph->protocol;
+ fl4->daddr = reverse ? iph->saddr : iph->daddr;
+ fl4->saddr = reverse ? iph->daddr : iph->saddr;
+ fl4->flowi4_tos = iph->tos;
+}
+
+static inline int xfrm4_garbage_collect(struct dst_ops *ops)
+{
+ struct net *net = container_of(ops, struct net, xfrm.xfrm4_dst_ops);
+
+ xfrm4_policy_afinfo.garbage_collect(net);
+ return (dst_entries_get_slow(ops) > ops->gc_thresh * 2);
+}
+
+static void xfrm4_update_pmtu(struct dst_entry *dst, struct sock *sk,
+ struct sk_buff *skb, u32 mtu)
+{
+ struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
+ struct dst_entry *path = xdst->route;
+
+ path->ops->update_pmtu(path, sk, skb, mtu);
+}
+
+static void xfrm4_redirect(struct dst_entry *dst, struct sock *sk,
+ struct sk_buff *skb)
+{
+ struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
+ struct dst_entry *path = xdst->route;
+
+ path->ops->redirect(path, sk, skb);
+}
+
+static void xfrm4_dst_destroy(struct dst_entry *dst)
+{
+ struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
+
+ dst_destroy_metrics_generic(dst);
+
+ xfrm_dst_destroy(xdst);
+}
+
+static void xfrm4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
+ int unregister)
+{
+ if (!unregister)
+ return;
+
+ xfrm_dst_ifdown(dst, dev);
+}
+
+static struct dst_ops xfrm4_dst_ops = {
+ .family = AF_INET,
+ .gc = xfrm4_garbage_collect,
+ .update_pmtu = xfrm4_update_pmtu,
+ .redirect = xfrm4_redirect,
+ .cow_metrics = dst_cow_metrics_generic,
+ .destroy = xfrm4_dst_destroy,
+ .ifdown = xfrm4_dst_ifdown,
+ .local_out = __ip_local_out,
+ .gc_thresh = 32768,
+};
+
+static struct xfrm_policy_afinfo xfrm4_policy_afinfo = {
+ .family = AF_INET,
+ .dst_ops = &xfrm4_dst_ops,
+ .dst_lookup = xfrm4_dst_lookup,
+ .get_saddr = xfrm4_get_saddr,
+ .decode_session = _decode_session4,
+ .get_tos = xfrm4_get_tos,
+ .init_path = xfrm4_init_path,
+ .fill_dst = xfrm4_fill_dst,
+ .blackhole_route = ipv4_blackhole_route,
+};
+
+#ifdef CONFIG_SYSCTL
+static struct ctl_table xfrm4_policy_table[] = {
+ {
+ .procname = "xfrm4_gc_thresh",
+ .data = &init_net.xfrm.xfrm4_dst_ops.gc_thresh,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ { }
+};
+
+static int __net_init xfrm4_net_init(struct net *net)
+{
+ struct ctl_table *table;
+ struct ctl_table_header *hdr;
+
+ table = xfrm4_policy_table;
+ if (!net_eq(net, &init_net)) {
+ table = kmemdup(table, sizeof(xfrm4_policy_table), GFP_KERNEL);
+ if (!table)
+ goto err_alloc;
+
+ table[0].data = &net->xfrm.xfrm4_dst_ops.gc_thresh;
+ }
+
+ hdr = register_net_sysctl(net, "net/ipv4", table);
+ if (!hdr)
+ goto err_reg;
+
+ net->ipv4.xfrm4_hdr = hdr;
+ return 0;
+
+err_reg:
+ if (!net_eq(net, &init_net))
+ kfree(table);
+err_alloc:
+ return -ENOMEM;
+}
+
+static void __net_exit xfrm4_net_exit(struct net *net)
+{
+ struct ctl_table *table;
+
+ if (!net->ipv4.xfrm4_hdr)
+ return;
+
+ table = net->ipv4.xfrm4_hdr->ctl_table_arg;
+ unregister_net_sysctl_table(net->ipv4.xfrm4_hdr);
+ if (!net_eq(net, &init_net))
+ kfree(table);
+}
+
+static struct pernet_operations __net_initdata xfrm4_net_ops = {
+ .init = xfrm4_net_init,
+ .exit = xfrm4_net_exit,
+};
+#endif
+
+static void __init xfrm4_policy_init(void)
+{
+ xfrm_policy_register_afinfo(&xfrm4_policy_afinfo);
+}
+
+void __init xfrm4_init(void)
+{
+ dst_entries_init(&xfrm4_dst_ops);
+
+ xfrm4_state_init();
+ xfrm4_policy_init();
+ xfrm4_protocol_init();
+#ifdef CONFIG_SYSCTL
+ register_pernet_subsys(&xfrm4_net_ops);
+#endif
+}
+
diff --git a/net/ipv4/xfrm4_protocol.c b/net/ipv4/xfrm4_protocol.c
new file mode 100644
index 000000000..dccefa9d8
--- /dev/null
+++ b/net/ipv4/xfrm4_protocol.c
@@ -0,0 +1,301 @@
+/* xfrm4_protocol.c - Generic xfrm protocol multiplexer.
+ *
+ * Copyright (C) 2013 secunet Security Networks AG
+ *
+ * Author:
+ * Steffen Klassert <steffen.klassert@secunet.com>
+ *
+ * Based on:
+ * net/ipv4/tunnel4.c
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/init.h>
+#include <linux/mutex.h>
+#include <linux/skbuff.h>
+#include <net/icmp.h>
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <net/xfrm.h>
+
+static struct xfrm4_protocol __rcu *esp4_handlers __read_mostly;
+static struct xfrm4_protocol __rcu *ah4_handlers __read_mostly;
+static struct xfrm4_protocol __rcu *ipcomp4_handlers __read_mostly;
+static DEFINE_MUTEX(xfrm4_protocol_mutex);
+
+static inline struct xfrm4_protocol __rcu **proto_handlers(u8 protocol)
+{
+ switch (protocol) {
+ case IPPROTO_ESP:
+ return &esp4_handlers;
+ case IPPROTO_AH:
+ return &ah4_handlers;
+ case IPPROTO_COMP:
+ return &ipcomp4_handlers;
+ }
+
+ return NULL;
+}
+
+#define for_each_protocol_rcu(head, handler) \
+ for (handler = rcu_dereference(head); \
+ handler != NULL; \
+ handler = rcu_dereference(handler->next)) \
+
+int xfrm4_rcv_cb(struct sk_buff *skb, u8 protocol, int err)
+{
+ int ret;
+ struct xfrm4_protocol *handler;
+ struct xfrm4_protocol __rcu **head = proto_handlers(protocol);
+
+ if (!head)
+ return 0;
+
+ for_each_protocol_rcu(*head, handler)
+ if ((ret = handler->cb_handler(skb, err)) <= 0)
+ return ret;
+
+ return 0;
+}
+EXPORT_SYMBOL(xfrm4_rcv_cb);
+
+int xfrm4_rcv_encap(struct sk_buff *skb, int nexthdr, __be32 spi,
+ int encap_type)
+{
+ int ret;
+ struct xfrm4_protocol *handler;
+ struct xfrm4_protocol __rcu **head = proto_handlers(nexthdr);
+
+ XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = NULL;
+ XFRM_SPI_SKB_CB(skb)->family = AF_INET;
+ XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr);
+
+ if (!head)
+ goto out;
+
+ for_each_protocol_rcu(*head, handler)
+ if ((ret = handler->input_handler(skb, nexthdr, spi, encap_type)) != -EINVAL)
+ return ret;
+
+out:
+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
+
+ kfree_skb(skb);
+ return 0;
+}
+EXPORT_SYMBOL(xfrm4_rcv_encap);
+
+static int xfrm4_esp_rcv(struct sk_buff *skb)
+{
+ int ret;
+ struct xfrm4_protocol *handler;
+
+ XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = NULL;
+
+ for_each_protocol_rcu(esp4_handlers, handler)
+ if ((ret = handler->handler(skb)) != -EINVAL)
+ return ret;
+
+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
+
+ kfree_skb(skb);
+ return 0;
+}
+
+static void xfrm4_esp_err(struct sk_buff *skb, u32 info)
+{
+ struct xfrm4_protocol *handler;
+
+ for_each_protocol_rcu(esp4_handlers, handler)
+ if (!handler->err_handler(skb, info))
+ break;
+}
+
+static int xfrm4_ah_rcv(struct sk_buff *skb)
+{
+ int ret;
+ struct xfrm4_protocol *handler;
+
+ XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = NULL;
+
+ for_each_protocol_rcu(ah4_handlers, handler)
+ if ((ret = handler->handler(skb)) != -EINVAL)
+ return ret;
+
+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
+
+ kfree_skb(skb);
+ return 0;
+}
+
+static void xfrm4_ah_err(struct sk_buff *skb, u32 info)
+{
+ struct xfrm4_protocol *handler;
+
+ for_each_protocol_rcu(ah4_handlers, handler)
+ if (!handler->err_handler(skb, info))
+ break;
+}
+
+static int xfrm4_ipcomp_rcv(struct sk_buff *skb)
+{
+ int ret;
+ struct xfrm4_protocol *handler;
+
+ XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = NULL;
+
+ for_each_protocol_rcu(ipcomp4_handlers, handler)
+ if ((ret = handler->handler(skb)) != -EINVAL)
+ return ret;
+
+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
+
+ kfree_skb(skb);
+ return 0;
+}
+
+static void xfrm4_ipcomp_err(struct sk_buff *skb, u32 info)
+{
+ struct xfrm4_protocol *handler;
+
+ for_each_protocol_rcu(ipcomp4_handlers, handler)
+ if (!handler->err_handler(skb, info))
+ break;
+}
+
+static const struct net_protocol esp4_protocol = {
+ .handler = xfrm4_esp_rcv,
+ .err_handler = xfrm4_esp_err,
+ .no_policy = 1,
+ .netns_ok = 1,
+};
+
+static const struct net_protocol ah4_protocol = {
+ .handler = xfrm4_ah_rcv,
+ .err_handler = xfrm4_ah_err,
+ .no_policy = 1,
+ .netns_ok = 1,
+};
+
+static const struct net_protocol ipcomp4_protocol = {
+ .handler = xfrm4_ipcomp_rcv,
+ .err_handler = xfrm4_ipcomp_err,
+ .no_policy = 1,
+ .netns_ok = 1,
+};
+
+static struct xfrm_input_afinfo xfrm4_input_afinfo = {
+ .family = AF_INET,
+ .owner = THIS_MODULE,
+ .callback = xfrm4_rcv_cb,
+};
+
+static inline const struct net_protocol *netproto(unsigned char protocol)
+{
+ switch (protocol) {
+ case IPPROTO_ESP:
+ return &esp4_protocol;
+ case IPPROTO_AH:
+ return &ah4_protocol;
+ case IPPROTO_COMP:
+ return &ipcomp4_protocol;
+ }
+
+ return NULL;
+}
+
+int xfrm4_protocol_register(struct xfrm4_protocol *handler,
+ unsigned char protocol)
+{
+ struct xfrm4_protocol __rcu **pprev;
+ struct xfrm4_protocol *t;
+ bool add_netproto = false;
+ int ret = -EEXIST;
+ int priority = handler->priority;
+
+ if (!proto_handlers(protocol) || !netproto(protocol))
+ return -EINVAL;
+
+ mutex_lock(&xfrm4_protocol_mutex);
+
+ if (!rcu_dereference_protected(*proto_handlers(protocol),
+ lockdep_is_held(&xfrm4_protocol_mutex)))
+ add_netproto = true;
+
+ for (pprev = proto_handlers(protocol);
+ (t = rcu_dereference_protected(*pprev,
+ lockdep_is_held(&xfrm4_protocol_mutex))) != NULL;
+ pprev = &t->next) {
+ if (t->priority < priority)
+ break;
+ if (t->priority == priority)
+ goto err;
+ }
+
+ handler->next = *pprev;
+ rcu_assign_pointer(*pprev, handler);
+
+ ret = 0;
+
+err:
+ mutex_unlock(&xfrm4_protocol_mutex);
+
+ if (add_netproto) {
+ if (inet_add_protocol(netproto(protocol), protocol)) {
+ pr_err("%s: can't add protocol\n", __func__);
+ ret = -EAGAIN;
+ }
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL(xfrm4_protocol_register);
+
+int xfrm4_protocol_deregister(struct xfrm4_protocol *handler,
+ unsigned char protocol)
+{
+ struct xfrm4_protocol __rcu **pprev;
+ struct xfrm4_protocol *t;
+ int ret = -ENOENT;
+
+ if (!proto_handlers(protocol) || !netproto(protocol))
+ return -EINVAL;
+
+ mutex_lock(&xfrm4_protocol_mutex);
+
+ for (pprev = proto_handlers(protocol);
+ (t = rcu_dereference_protected(*pprev,
+ lockdep_is_held(&xfrm4_protocol_mutex))) != NULL;
+ pprev = &t->next) {
+ if (t == handler) {
+ *pprev = handler->next;
+ ret = 0;
+ break;
+ }
+ }
+
+ if (!rcu_dereference_protected(*proto_handlers(protocol),
+ lockdep_is_held(&xfrm4_protocol_mutex))) {
+ if (inet_del_protocol(netproto(protocol), protocol) < 0) {
+ pr_err("%s: can't remove protocol\n", __func__);
+ ret = -EAGAIN;
+ }
+ }
+
+ mutex_unlock(&xfrm4_protocol_mutex);
+
+ synchronize_net();
+
+ return ret;
+}
+EXPORT_SYMBOL(xfrm4_protocol_deregister);
+
+void __init xfrm4_protocol_init(void)
+{
+ xfrm_input_register_afinfo(&xfrm4_input_afinfo);
+}
+EXPORT_SYMBOL(xfrm4_protocol_init);
diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c
new file mode 100644
index 000000000..542074c00
--- /dev/null
+++ b/net/ipv4/xfrm4_state.c
@@ -0,0 +1,100 @@
+/*
+ * xfrm4_state.c
+ *
+ * Changes:
+ * YOSHIFUJI Hideaki @USAGI
+ * Split up af-specific portion
+ *
+ */
+
+#include <net/ip.h>
+#include <net/xfrm.h>
+#include <linux/pfkeyv2.h>
+#include <linux/ipsec.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/export.h>
+
+static int xfrm4_init_flags(struct xfrm_state *x)
+{
+ if (xs_net(x)->ipv4.sysctl_ip_no_pmtu_disc)
+ x->props.flags |= XFRM_STATE_NOPMTUDISC;
+ return 0;
+}
+
+static void
+__xfrm4_init_tempsel(struct xfrm_selector *sel, const struct flowi *fl)
+{
+ const struct flowi4 *fl4 = &fl->u.ip4;
+
+ sel->daddr.a4 = fl4->daddr;
+ sel->saddr.a4 = fl4->saddr;
+ sel->dport = xfrm_flowi_dport(fl, &fl4->uli);
+ sel->dport_mask = htons(0xffff);
+ sel->sport = xfrm_flowi_sport(fl, &fl4->uli);
+ sel->sport_mask = htons(0xffff);
+ sel->family = AF_INET;
+ sel->prefixlen_d = 32;
+ sel->prefixlen_s = 32;
+ sel->proto = fl4->flowi4_proto;
+ sel->ifindex = fl4->flowi4_oif;
+}
+
+static void
+xfrm4_init_temprop(struct xfrm_state *x, const struct xfrm_tmpl *tmpl,
+ const xfrm_address_t *daddr, const xfrm_address_t *saddr)
+{
+ x->id = tmpl->id;
+ if (x->id.daddr.a4 == 0)
+ x->id.daddr.a4 = daddr->a4;
+ x->props.saddr = tmpl->saddr;
+ if (x->props.saddr.a4 == 0)
+ x->props.saddr.a4 = saddr->a4;
+ x->props.mode = tmpl->mode;
+ x->props.reqid = tmpl->reqid;
+ x->props.family = AF_INET;
+}
+
+int xfrm4_extract_header(struct sk_buff *skb)
+{
+ const struct iphdr *iph = ip_hdr(skb);
+
+ XFRM_MODE_SKB_CB(skb)->ihl = sizeof(*iph);
+ XFRM_MODE_SKB_CB(skb)->id = iph->id;
+ XFRM_MODE_SKB_CB(skb)->frag_off = iph->frag_off;
+ XFRM_MODE_SKB_CB(skb)->tos = iph->tos;
+ XFRM_MODE_SKB_CB(skb)->ttl = iph->ttl;
+ XFRM_MODE_SKB_CB(skb)->optlen = iph->ihl * 4 - sizeof(*iph);
+ memset(XFRM_MODE_SKB_CB(skb)->flow_lbl, 0,
+ sizeof(XFRM_MODE_SKB_CB(skb)->flow_lbl));
+
+ return 0;
+}
+
+static struct xfrm_state_afinfo xfrm4_state_afinfo = {
+ .family = AF_INET,
+ .proto = IPPROTO_IPIP,
+ .eth_proto = htons(ETH_P_IP),
+ .owner = THIS_MODULE,
+ .init_flags = xfrm4_init_flags,
+ .init_tempsel = __xfrm4_init_tempsel,
+ .init_temprop = xfrm4_init_temprop,
+ .output = xfrm4_output,
+ .output_finish = xfrm4_output_finish,
+ .extract_input = xfrm4_extract_input,
+ .extract_output = xfrm4_extract_output,
+ .transport_finish = xfrm4_transport_finish,
+ .local_error = xfrm4_local_error,
+};
+
+void __init xfrm4_state_init(void)
+{
+ xfrm_state_register_afinfo(&xfrm4_state_afinfo);
+}
+
+#if 0
+void __exit xfrm4_state_fini(void)
+{
+ xfrm_state_unregister_afinfo(&xfrm4_state_afinfo);
+}
+#endif /* 0 */
+
diff --git a/net/ipv4/xfrm4_tunnel.c b/net/ipv4/xfrm4_tunnel.c
new file mode 100644
index 000000000..06347dbd3
--- /dev/null
+++ b/net/ipv4/xfrm4_tunnel.c
@@ -0,0 +1,117 @@
+/* xfrm4_tunnel.c: Generic IP tunnel transformer.
+ *
+ * Copyright (C) 2003 David S. Miller (davem@redhat.com)
+ */
+
+#define pr_fmt(fmt) "IPsec: " fmt
+
+#include <linux/skbuff.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <net/xfrm.h>
+#include <net/ip.h>
+#include <net/protocol.h>
+
+static int ipip_output(struct xfrm_state *x, struct sk_buff *skb)
+{
+ skb_push(skb, -skb_network_offset(skb));
+ return 0;
+}
+
+static int ipip_xfrm_rcv(struct xfrm_state *x, struct sk_buff *skb)
+{
+ return ip_hdr(skb)->protocol;
+}
+
+static int ipip_init_state(struct xfrm_state *x)
+{
+ if (x->props.mode != XFRM_MODE_TUNNEL)
+ return -EINVAL;
+
+ if (x->encap)
+ return -EINVAL;
+
+ x->props.header_len = sizeof(struct iphdr);
+
+ return 0;
+}
+
+static void ipip_destroy(struct xfrm_state *x)
+{
+}
+
+static const struct xfrm_type ipip_type = {
+ .description = "IPIP",
+ .owner = THIS_MODULE,
+ .proto = IPPROTO_IPIP,
+ .init_state = ipip_init_state,
+ .destructor = ipip_destroy,
+ .input = ipip_xfrm_rcv,
+ .output = ipip_output
+};
+
+static int xfrm_tunnel_rcv(struct sk_buff *skb)
+{
+ return xfrm4_rcv_spi(skb, IPPROTO_IPIP, ip_hdr(skb)->saddr);
+}
+
+static int xfrm_tunnel_err(struct sk_buff *skb, u32 info)
+{
+ return -ENOENT;
+}
+
+static struct xfrm_tunnel xfrm_tunnel_handler __read_mostly = {
+ .handler = xfrm_tunnel_rcv,
+ .err_handler = xfrm_tunnel_err,
+ .priority = 3,
+};
+
+#if IS_ENABLED(CONFIG_IPV6)
+static struct xfrm_tunnel xfrm64_tunnel_handler __read_mostly = {
+ .handler = xfrm_tunnel_rcv,
+ .err_handler = xfrm_tunnel_err,
+ .priority = 2,
+};
+#endif
+
+static int __init ipip_init(void)
+{
+ if (xfrm_register_type(&ipip_type, AF_INET) < 0) {
+ pr_info("%s: can't add xfrm type\n", __func__);
+ return -EAGAIN;
+ }
+
+ if (xfrm4_tunnel_register(&xfrm_tunnel_handler, AF_INET)) {
+ pr_info("%s: can't add xfrm handler for AF_INET\n", __func__);
+ xfrm_unregister_type(&ipip_type, AF_INET);
+ return -EAGAIN;
+ }
+#if IS_ENABLED(CONFIG_IPV6)
+ if (xfrm4_tunnel_register(&xfrm64_tunnel_handler, AF_INET6)) {
+ pr_info("%s: can't add xfrm handler for AF_INET6\n", __func__);
+ xfrm4_tunnel_deregister(&xfrm_tunnel_handler, AF_INET);
+ xfrm_unregister_type(&ipip_type, AF_INET);
+ return -EAGAIN;
+ }
+#endif
+ return 0;
+}
+
+static void __exit ipip_fini(void)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+ if (xfrm4_tunnel_deregister(&xfrm64_tunnel_handler, AF_INET6))
+ pr_info("%s: can't remove xfrm handler for AF_INET6\n",
+ __func__);
+#endif
+ if (xfrm4_tunnel_deregister(&xfrm_tunnel_handler, AF_INET))
+ pr_info("%s: can't remove xfrm handler for AF_INET\n",
+ __func__);
+ if (xfrm_unregister_type(&ipip_type, AF_INET) < 0)
+ pr_info("%s: can't remove xfrm type\n", __func__);
+}
+
+module_init(ipip_init);
+module_exit(ipip_fini);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_XFRM_TYPE(AF_INET, XFRM_PROTO_IPIP);