summaryrefslogtreecommitdiff
path: root/net
diff options
context:
space:
mode:
Diffstat (limited to 'net')
-rw-r--r--net/6lowpan/6lowpan_i.h9
-rw-r--r--net/6lowpan/core.c8
-rw-r--r--net/6lowpan/debugfs.c22
-rw-r--r--net/6lowpan/iphc.c122
-rw-r--r--net/6lowpan/nhc_udp.c2
-rw-r--r--net/8021q/vlan.c5
-rw-r--r--net/8021q/vlan.h2
-rw-r--r--net/8021q/vlan_dev.c30
-rw-r--r--net/8021q/vlan_netlink.c7
-rw-r--r--net/9p/client.c8
-rw-r--r--net/Kconfig22
-rw-r--r--net/Makefile1
-rw-r--r--net/atm/lec.c4
-rw-r--r--net/atm/signaling.c2
-rw-r--r--net/atm/svc.c4
-rw-r--r--net/batman-adv/bat_iv_ogm.c178
-rw-r--r--net/batman-adv/bat_v.c75
-rw-r--r--net/batman-adv/bat_v_elp.c31
-rw-r--r--net/batman-adv/bat_v_elp.h2
-rw-r--r--net/batman-adv/bat_v_ogm.c221
-rw-r--r--net/batman-adv/bitarray.c16
-rw-r--r--net/batman-adv/bitarray.h15
-rw-r--r--net/batman-adv/bridge_loop_avoidance.c449
-rw-r--r--net/batman-adv/bridge_loop_avoidance.h43
-rw-r--r--net/batman-adv/debugfs.c21
-rw-r--r--net/batman-adv/distributed-arp-table.c24
-rw-r--r--net/batman-adv/fragmentation.c12
-rw-r--r--net/batman-adv/gateway_client.c12
-rw-r--r--net/batman-adv/hard-interface.c34
-rw-r--r--net/batman-adv/hard-interface.h3
-rw-r--r--net/batman-adv/hash.h6
-rw-r--r--net/batman-adv/icmp_socket.c24
-rw-r--r--net/batman-adv/main.c26
-rw-r--r--net/batman-adv/main.h11
-rw-r--r--net/batman-adv/multicast.c11
-rw-r--r--net/batman-adv/network-coding.c45
-rw-r--r--net/batman-adv/originator.c64
-rw-r--r--net/batman-adv/originator.h2
-rw-r--r--net/batman-adv/packet.h3
-rw-r--r--net/batman-adv/routing.c107
-rw-r--r--net/batman-adv/routing.h6
-rw-r--r--net/batman-adv/send.c14
-rw-r--r--net/batman-adv/soft-interface.c61
-rw-r--r--net/batman-adv/soft-interface.h10
-rw-r--r--net/batman-adv/sysfs.c9
-rw-r--r--net/batman-adv/translation-table.c96
-rw-r--r--net/batman-adv/types.h16
-rw-r--r--net/bluetooth/6lowpan.c82
-rw-r--r--net/bluetooth/af_bluetooth.c2
-rw-r--r--net/bluetooth/bnep/netdev.c2
-rw-r--r--net/bluetooth/hci_core.c4
-rw-r--r--net/bluetooth/hci_event.c13
-rw-r--r--net/bluetooth/hci_request.c6
-rw-r--r--net/bluetooth/l2cap_sock.c2
-rw-r--r--net/bridge/br_input.c23
-rw-r--r--net/bridge/br_ioctl.c40
-rw-r--r--net/bridge/br_netfilter_hooks.c8
-rw-r--r--net/bridge/br_netfilter_ipv6.c10
-rw-r--r--net/bridge/br_netlink.c140
-rw-r--r--net/bridge/br_private.h18
-rw-r--r--net/bridge/br_sysfs_br.c101
-rw-r--r--net/bridge/br_sysfs_if.c5
-rw-r--r--net/bridge/br_vlan.c135
-rw-r--r--net/bridge/netfilter/nf_tables_bridge.c47
-rw-r--r--net/can/raw.c2
-rw-r--r--net/ceph/ceph_common.c2
-rw-r--r--net/ceph/ceph_strings.c16
-rw-r--r--net/ceph/debugfs.c147
-rw-r--r--net/ceph/mon_client.c393
-rw-r--r--net/ceph/osd_client.c4031
-rw-r--r--net/ceph/osdmap.c807
-rw-r--r--net/compat.c20
-rw-r--r--net/core/datagram.c9
-rw-r--r--net/core/dev.c112
-rw-r--r--net/core/devlink.c1050
-rw-r--r--net/core/ethtool.c29
-rw-r--r--net/core/fib_rules.c4
-rw-r--r--net/core/filter.c211
-rw-r--r--net/core/gen_stats.c37
-rw-r--r--net/core/neighbour.c22
-rw-r--r--net/core/net-procfs.c3
-rw-r--r--net/core/net-sysfs.c1
-rw-r--r--net/core/pktgen.c9
-rw-r--r--net/core/rtnetlink.c264
-rw-r--r--net/core/skbuff.c272
-rw-r--r--net/core/sock.c161
-rw-r--r--net/core/sock_diag.c3
-rw-r--r--net/core/sysctl_net_core.c9
-rw-r--r--net/dccp/dccp.h6
-rw-r--r--net/dccp/input.c2
-rw-r--r--net/dccp/ipv4.c39
-rw-r--r--net/dccp/ipv6.c35
-rw-r--r--net/dccp/minisocks.c2
-rw-r--r--net/dccp/options.c2
-rw-r--r--net/dccp/timer.c8
-rw-r--r--net/decnet/dn_fib.c21
-rw-r--r--net/dns_resolver/dns_key.c2
-rw-r--r--net/dsa/dsa.c62
-rw-r--r--net/dsa/dsa_priv.h5
-rw-r--r--net/dsa/slave.c168
-rw-r--r--net/hsr/Kconfig5
-rw-r--r--net/hsr/hsr_device.c91
-rw-r--r--net/hsr/hsr_device.h2
-rw-r--r--net/hsr/hsr_forward.c43
-rw-r--r--net/hsr/hsr_framereg.c30
-rw-r--r--net/hsr/hsr_main.h13
-rw-r--r--net/hsr/hsr_netlink.c17
-rw-r--r--net/hsr/hsr_slave.c4
-rw-r--r--net/ieee802154/6lowpan/6lowpan_i.h14
-rw-r--r--net/ieee802154/6lowpan/core.c6
-rw-r--r--net/ieee802154/6lowpan/tx.c14
-rw-r--r--net/ieee802154/nl-mac.c17
-rw-r--r--net/ieee802154/nl802154.c30
-rw-r--r--net/ipv4/af_inet.c101
-rw-r--r--net/ipv4/arp.c2
-rw-r--r--net/ipv4/cipso_ipv4.c3
-rw-r--r--net/ipv4/fib_frontend.c1
-rw-r--r--net/ipv4/fib_semantics.c40
-rw-r--r--net/ipv4/fou.c196
-rw-r--r--net/ipv4/gre_demux.c61
-rw-r--r--net/ipv4/gre_offload.c49
-rw-r--r--net/ipv4/icmp.c18
-rw-r--r--net/ipv4/inet_connection_sock.c9
-rw-r--r--net/ipv4/inet_diag.c98
-rw-r--r--net/ipv4/inet_hashtables.c86
-rw-r--r--net/ipv4/inet_timewait_sock.c10
-rw-r--r--net/ipv4/ip_forward.c6
-rw-r--r--net/ipv4/ip_fragment.c14
-rw-r--r--net/ipv4/ip_gre.c281
-rw-r--r--net/ipv4/ip_input.c42
-rw-r--r--net/ipv4/ip_output.c4
-rw-r--r--net/ipv4/ip_sockglue.c34
-rw-r--r--net/ipv4/ip_tunnel.c45
-rw-r--r--net/ipv4/ip_tunnel_core.c50
-rw-r--r--net/ipv4/ipconfig.c4
-rw-r--r--net/ipv4/ipip.c7
-rw-r--r--net/ipv4/ipmr.c4
-rw-r--r--net/ipv4/netfilter/arp_tables.c233
-rw-r--r--net/ipv4/netfilter/ip_tables.c259
-rw-r--r--net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c2
-rw-r--r--net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c47
-rw-r--r--net/ipv4/ping.c7
-rw-r--r--net/ipv4/raw.c13
-rw-r--r--net/ipv4/route.c10
-rw-r--r--net/ipv4/syncookies.c4
-rw-r--r--net/ipv4/sysctl_net_ipv4.c11
-rw-r--r--net/ipv4/tcp.c65
-rw-r--r--net/ipv4/tcp_bic.c6
-rw-r--r--net/ipv4/tcp_cdg.c34
-rw-r--r--net/ipv4/tcp_cubic.c26
-rw-r--r--net/ipv4/tcp_fastopen.c14
-rw-r--r--net/ipv4/tcp_htcp.c10
-rw-r--r--net/ipv4/tcp_illinois.c21
-rw-r--r--net/ipv4/tcp_input.c300
-rw-r--r--net/ipv4/tcp_ipv4.c144
-rw-r--r--net/ipv4/tcp_lp.c6
-rw-r--r--net/ipv4/tcp_metrics.c6
-rw-r--r--net/ipv4/tcp_minisocks.c19
-rw-r--r--net/ipv4/tcp_offload.c43
-rw-r--r--net/ipv4/tcp_output.c175
-rw-r--r--net/ipv4/tcp_recovery.c4
-rw-r--r--net/ipv4/tcp_timer.c28
-rw-r--r--net/ipv4/tcp_vegas.c6
-rw-r--r--net/ipv4/tcp_vegas.h2
-rw-r--r--net/ipv4/tcp_veno.c7
-rw-r--r--net/ipv4/tcp_westwood.c7
-rw-r--r--net/ipv4/tcp_yeah.c7
-rw-r--r--net/ipv4/udp.c504
-rw-r--r--net/ipv4/udp_diag.c18
-rw-r--r--net/ipv4/udp_offload.c150
-rw-r--r--net/ipv4/udp_tunnel.c2
-rw-r--r--net/ipv6/Kconfig10
-rw-r--r--net/ipv6/Makefile6
-rw-r--r--net/ipv6/addrconf.c671
-rw-r--r--net/ipv6/af_inet6.c9
-rw-r--r--net/ipv6/datagram.c29
-rw-r--r--net/ipv6/exthdrs.c66
-rw-r--r--net/ipv6/fou6.c140
-rw-r--r--net/ipv6/icmp.c44
-rw-r--r--net/ipv6/ila/ila.h79
-rw-r--r--net/ipv6/ila/ila_common.c81
-rw-r--r--net/ipv6/ila/ila_lwt.c54
-rw-r--r--net/ipv6/ila/ila_xlat.c167
-rw-r--r--net/ipv6/inet6_hashtables.c64
-rw-r--r--net/ipv6/ip6_checksum.c7
-rw-r--r--net/ipv6/ip6_fib.c1
-rw-r--r--net/ipv6/ip6_flowlabel.c7
-rw-r--r--net/ipv6/ip6_gre.c531
-rw-r--r--net/ipv6/ip6_input.c72
-rw-r--r--net/ipv6/ip6_offload.c94
-rw-r--r--net/ipv6/ip6_offload.h3
-rw-r--r--net/ipv6/ip6_output.c79
-rw-r--r--net/ipv6/ip6_tunnel.c452
-rw-r--r--net/ipv6/ip6mr.c12
-rw-r--r--net/ipv6/ipv6_sockglue.c13
-rw-r--r--net/ipv6/netfilter/ip6_tables.c244
-rw-r--r--net/ipv6/netfilter/ip6t_SYNPROXY.c58
-rw-r--r--net/ipv6/netfilter/nf_dup_ipv6.c1
-rw-r--r--net/ipv6/netfilter/nf_reject_ipv6.c2
-rw-r--r--net/ipv6/ping.c13
-rw-r--r--net/ipv6/raw.c34
-rw-r--r--net/ipv6/reassembly.c32
-rw-r--r--net/ipv6/route.c42
-rw-r--r--net/ipv6/sit.c14
-rw-r--r--net/ipv6/syncookies.c4
-rw-r--r--net/ipv6/tcp_ipv6.c82
-rw-r--r--net/ipv6/udp.c437
-rw-r--r--net/ipv6/udp_offload.c24
-rw-r--r--net/irda/af_irda.c7
-rw-r--r--net/irda/ircomm/ircomm_tty.c31
-rw-r--r--net/irda/ircomm/ircomm_tty_attach.c2
-rw-r--r--net/irda/ircomm/ircomm_tty_ioctl.c18
-rw-r--r--net/irda/irlan/irlan_eth.c2
-rw-r--r--net/kcm/kcmsock.c2
-rw-r--r--net/l2tp/l2tp_ip6.c43
-rw-r--r--net/l2tp/l2tp_netlink.c83
-rw-r--r--net/l3mdev/l3mdev.c63
-rw-r--r--net/lapb/lapb_in.c5
-rw-r--r--net/lapb/lapb_out.c4
-rw-r--r--net/lapb/lapb_subr.c14
-rw-r--r--net/llc/llc_proc.c2
-rw-r--r--net/mac80211/agg-tx.c5
-rw-r--r--net/mac80211/cfg.c32
-rw-r--r--net/mac80211/debugfs.c3
-rw-r--r--net/mac80211/debugfs_netdev.c12
-rw-r--r--net/mac80211/debugfs_sta.c134
-rw-r--r--net/mac80211/driver-ops.h15
-rw-r--r--net/mac80211/ibss.c27
-rw-r--r--net/mac80211/ieee80211_i.h45
-rw-r--r--net/mac80211/iface.c4
-rw-r--r--net/mac80211/key.c1
-rw-r--r--net/mac80211/main.c10
-rw-r--r--net/mac80211/mesh.c48
-rw-r--r--net/mac80211/mesh.h71
-rw-r--r--net/mac80211/mesh_hwmp.c4
-rw-r--r--net/mac80211/mesh_pathtbl.c965
-rw-r--r--net/mac80211/mesh_plink.c23
-rw-r--r--net/mac80211/mlme.c35
-rw-r--r--net/mac80211/ocb.c2
-rw-r--r--net/mac80211/rate.c2
-rw-r--r--net/mac80211/rate.h4
-rw-r--r--net/mac80211/rc80211_minstrel.c6
-rw-r--r--net/mac80211/rc80211_minstrel_ht.c81
-rw-r--r--net/mac80211/rx.c555
-rw-r--r--net/mac80211/scan.c20
-rw-r--r--net/mac80211/spectmgmt.c4
-rw-r--r--net/mac80211/sta_info.c289
-rw-r--r--net/mac80211/sta_info.h130
-rw-r--r--net/mac80211/status.c4
-rw-r--r--net/mac80211/tdls.c18
-rw-r--r--net/mac80211/trace.h18
-rw-r--r--net/mac80211/tx.c211
-rw-r--r--net/mac80211/util.c29
-rw-r--r--net/mac80211/vht.c4
-rw-r--r--net/mac80211/wpa.c42
-rw-r--r--net/mpls/mpls_gso.c10
-rw-r--r--net/netfilter/ipvs/ip_vs_conn.c56
-rw-r--r--net/netfilter/ipvs/ip_vs_core.c165
-rw-r--r--net/netfilter/ipvs/ip_vs_ctl.c82
-rw-r--r--net/netfilter/ipvs/ip_vs_nfct.c4
-rw-r--r--net/netfilter/ipvs/ip_vs_pe_sip.c15
-rw-r--r--net/netfilter/ipvs/ip_vs_sync.c6
-rw-r--r--net/netfilter/ipvs/ip_vs_xmit.c23
-rw-r--r--net/netfilter/nf_conntrack_core.c442
-rw-r--r--net/netfilter/nf_conntrack_ecache.c84
-rw-r--r--net/netfilter/nf_conntrack_expect.c83
-rw-r--r--net/netfilter/nf_conntrack_ftp.c1
-rw-r--r--net/netfilter/nf_conntrack_helper.c21
-rw-r--r--net/netfilter/nf_conntrack_irc.c1
-rw-r--r--net/netfilter/nf_conntrack_labels.c44
-rw-r--r--net/netfilter/nf_conntrack_netlink.c166
-rw-r--r--net/netfilter/nf_conntrack_proto_dccp.c4
-rw-r--r--net/netfilter/nf_conntrack_proto_sctp.c8
-rw-r--r--net/netfilter/nf_conntrack_proto_tcp.c8
-rw-r--r--net/netfilter/nf_conntrack_proto_udp.c2
-rw-r--r--net/netfilter/nf_conntrack_proto_udplite.c2
-rw-r--r--net/netfilter/nf_conntrack_sane.c1
-rw-r--r--net/netfilter/nf_conntrack_sip.c1
-rw-r--r--net/netfilter/nf_conntrack_standalone.c15
-rw-r--r--net/netfilter/nf_conntrack_tftp.c1
-rw-r--r--net/netfilter/nf_nat_core.c39
-rw-r--r--net/netfilter/nf_queue.c17
-rw-r--r--net/netfilter/nf_tables_api.c132
-rw-r--r--net/netfilter/nf_tables_core.c2
-rw-r--r--net/netfilter/nf_tables_trace.c5
-rw-r--r--net/netfilter/nfnetlink_acct.c9
-rw-r--r--net/netfilter/nfnetlink_cttimeout.c6
-rw-r--r--net/netfilter/nfnetlink_queue.c125
-rw-r--r--net/netfilter/nft_counter.c6
-rw-r--r--net/netfilter/nft_ct.c38
-rw-r--r--net/netfilter/nft_dynset.c3
-rw-r--r--net/netfilter/nft_hash.c7
-rw-r--r--net/netfilter/nft_limit.c6
-rw-r--r--net/netfilter/nft_meta.c2
-rw-r--r--net/netfilter/nft_rbtree.c52
-rw-r--r--net/netfilter/xt_connlabel.c14
-rw-r--r--net/netfilter/xt_socket.c6
-rw-r--r--net/netlabel/netlabel_kapi.c2
-rw-r--r--net/netlink/af_netlink.c3
-rw-r--r--net/nfc/nci/core.c117
-rw-r--r--net/nfc/nci/ntf.c2
-rw-r--r--net/nfc/nci/rsp.c23
-rw-r--r--net/openvswitch/actions.c20
-rw-r--r--net/openvswitch/conntrack.c24
-rw-r--r--net/openvswitch/datapath.c30
-rw-r--r--net/openvswitch/flow_netlink.c5
-rw-r--r--net/openvswitch/vport-internal_dev.c3
-rw-r--r--net/packet/af_packet.c67
-rw-r--r--net/qrtr/Kconfig24
-rw-r--r--net/qrtr/Makefile4
-rw-r--r--net/qrtr/qrtr.c1007
-rw-r--r--net/qrtr/qrtr.h31
-rw-r--r--net/qrtr/smd.c120
-rw-r--r--net/rds/ib_cm.c2
-rw-r--r--net/rds/ib_frmr.c2
-rw-r--r--net/rds/loop.c5
-rw-r--r--net/rds/rds.h2
-rw-r--r--net/rds/recv.c2
-rw-r--r--net/rds/send.c1
-rw-r--r--net/rds/sysctl.c3
-rw-r--r--net/rds/tcp.c83
-rw-r--r--net/rds/tcp.h3
-rw-r--r--net/rds/tcp_connect.c30
-rw-r--r--net/rds/tcp_listen.c31
-rw-r--r--net/rds/tcp_recv.c20
-rw-r--r--net/rds/tcp_send.c18
-rw-r--r--net/rds/threads.c10
-rw-r--r--net/rds/transport.c3
-rw-r--r--net/rfkill/core.c36
-rw-r--r--net/rose/rose_in.c3
-rw-r--r--net/rxrpc/Kconfig2
-rw-r--r--net/rxrpc/Makefile7
-rw-r--r--net/rxrpc/af_rxrpc.c9
-rw-r--r--net/rxrpc/ar-accept.c4
-rw-r--r--net/rxrpc/ar-ack.c81
-rw-r--r--net/rxrpc/ar-call.c11
-rw-r--r--net/rxrpc/ar-connection.c14
-rw-r--r--net/rxrpc/ar-connevent.c27
-rw-r--r--net/rxrpc/ar-input.c22
-rw-r--r--net/rxrpc/ar-internal.h70
-rw-r--r--net/rxrpc/ar-key.c4
-rw-r--r--net/rxrpc/ar-output.c6
-rw-r--r--net/rxrpc/ar-proc.c2
-rw-r--r--net/rxrpc/ar-recvmsg.c18
-rw-r--r--net/rxrpc/ar-security.c166
-rw-r--r--net/rxrpc/insecure.c83
-rw-r--r--net/rxrpc/misc.c89
-rw-r--r--net/rxrpc/rxkad.c59
-rw-r--r--net/sched/act_api.c9
-rw-r--r--net/sched/act_bpf.c5
-rw-r--r--net/sched/act_connmark.c3
-rw-r--r--net/sched/act_csum.c2
-rw-r--r--net/sched/act_gact.c17
-rw-r--r--net/sched/act_ife.c55
-rw-r--r--net/sched/act_ipt.c2
-rw-r--r--net/sched/act_mirred.c7
-rw-r--r--net/sched/act_nat.c2
-rw-r--r--net/sched/act_pedit.c2
-rw-r--r--net/sched/act_police.c40
-rw-r--r--net/sched/act_simple.c2
-rw-r--r--net/sched/act_skbedit.c2
-rw-r--r--net/sched/act_vlan.c2
-rw-r--r--net/sched/cls_bpf.c2
-rw-r--r--net/sched/cls_flower.c25
-rw-r--r--net/sched/cls_u32.c104
-rw-r--r--net/sched/em_meta.c8
-rw-r--r--net/sched/sch_api.c10
-rw-r--r--net/sched/sch_codel.c22
-rw-r--r--net/sched/sch_drr.c4
-rw-r--r--net/sched/sch_fq_codel.c123
-rw-r--r--net/sched/sch_generic.c46
-rw-r--r--net/sched/sch_hfsc.c12
-rw-r--r--net/sched/sch_htb.c27
-rw-r--r--net/sched/sch_ingress.c12
-rw-r--r--net/sched/sch_netem.c3
-rw-r--r--net/sched/sch_prio.c71
-rw-r--r--net/sched/sch_qfq.c6
-rw-r--r--net/sched/sch_red.c4
-rw-r--r--net/sched/sch_tbf.c10
-rw-r--r--net/sctp/Kconfig4
-rw-r--r--net/sctp/Makefile1
-rw-r--r--net/sctp/chunk.c2
-rw-r--r--net/sctp/input.c23
-rw-r--r--net/sctp/inqueue.c3
-rw-r--r--net/sctp/ipv6.c2
-rw-r--r--net/sctp/proc.c104
-rw-r--r--net/sctp/sctp_diag.c497
-rw-r--r--net/sctp/sm_sideeffect.c6
-rw-r--r--net/sctp/socket.c225
-rw-r--r--net/sctp/ulpqueue.c26
-rw-r--r--net/socket.c46
-rw-r--r--net/sunrpc/auth.c9
-rw-r--r--net/sunrpc/auth_generic.c13
-rw-r--r--net/sunrpc/auth_gss/auth_gss.c6
-rw-r--r--net/sunrpc/auth_gss/svcauth_gss.c5
-rw-r--r--net/sunrpc/auth_unix.c6
-rw-r--r--net/sunrpc/clnt.c17
-rw-r--r--net/sunrpc/socklib.c2
-rw-r--r--net/sunrpc/svc_xprt.c23
-rw-r--r--net/sunrpc/svcsock.c8
-rw-r--r--net/sunrpc/xdr.c2
-rw-r--r--net/sunrpc/xprtrdma/backchannel.c16
-rw-r--r--net/sunrpc/xprtrdma/fmr_ops.c134
-rw-r--r--net/sunrpc/xprtrdma/frwr_ops.c214
-rw-r--r--net/sunrpc/xprtrdma/physical_ops.c39
-rw-r--r--net/sunrpc/xprtrdma/rpc_rdma.c517
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_marshal.c32
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_recvfrom.c36
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_sendto.c28
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_transport.c17
-rw-r--r--net/sunrpc/xprtrdma/transport.c16
-rw-r--r--net/sunrpc/xprtrdma/verbs.c78
-rw-r--r--net/sunrpc/xprtrdma/xprt_rdma.h47
-rw-r--r--net/sunrpc/xprtsock.c21
-rw-r--r--net/switchdev/Kconfig2
-rw-r--r--net/tipc/bearer.c116
-rw-r--r--net/tipc/bearer.h16
-rw-r--r--net/tipc/core.c8
-rw-r--r--net/tipc/discover.c7
-rw-r--r--net/tipc/discover.h2
-rw-r--r--net/tipc/link.c60
-rw-r--r--net/tipc/link.h2
-rw-r--r--net/tipc/msg.c6
-rw-r--r--net/tipc/msg.h40
-rw-r--r--net/tipc/netlink_compat.c114
-rw-r--r--net/tipc/node.c48
-rw-r--r--net/tipc/node.h6
-rw-r--r--net/tipc/server.c27
-rw-r--r--net/tipc/server.h4
-rw-r--r--net/tipc/socket.c200
-rw-r--r--net/tipc/socket.h17
-rw-r--r--net/tipc/subscr.c7
-rw-r--r--net/unix/af_unix.c2
-rw-r--r--net/vmw_vsock/af_vsock.c12
-rw-r--r--net/vmw_vsock/vmci_transport.c2
-rw-r--r--net/wireless/chan.c4
-rw-r--r--net/wireless/core.c32
-rw-r--r--net/wireless/core.h6
-rw-r--r--net/wireless/debugfs.c4
-rw-r--r--net/wireless/ibss.c6
-rw-r--r--net/wireless/mesh.c4
-rw-r--r--net/wireless/mlme.c2
-rw-r--r--net/wireless/nl80211.c348
-rw-r--r--net/wireless/rdev-ops.h2
-rw-r--r--net/wireless/reg.c32
-rw-r--r--net/wireless/reg.h2
-rw-r--r--net/wireless/scan.c24
-rw-r--r--net/wireless/sme.c54
-rw-r--r--net/wireless/sysfs.c2
-rw-r--r--net/wireless/trace.h26
-rw-r--r--net/wireless/util.c50
-rw-r--r--net/wireless/wext-compat.c45
-rw-r--r--net/wireless/wext-core.c5
-rw-r--r--net/xfrm/xfrm_user.c10
454 files changed, 18944 insertions, 10347 deletions
diff --git a/net/6lowpan/6lowpan_i.h b/net/6lowpan/6lowpan_i.h
index d16bb4b14..97ecc27ae 100644
--- a/net/6lowpan/6lowpan_i.h
+++ b/net/6lowpan/6lowpan_i.h
@@ -3,6 +3,15 @@
#include <linux/netdevice.h>
+#include <net/6lowpan.h>
+
+/* caller need to be sure it's dev->type is ARPHRD_6LOWPAN */
+static inline bool lowpan_is_ll(const struct net_device *dev,
+ enum lowpan_lltypes lltype)
+{
+ return lowpan_dev(dev)->lltype == lltype;
+}
+
#ifdef CONFIG_6LOWPAN_DEBUGFS
int lowpan_dev_debugfs_init(struct net_device *dev);
void lowpan_dev_debugfs_exit(struct net_device *dev);
diff --git a/net/6lowpan/core.c b/net/6lowpan/core.c
index 34e44c0c0..7a240b3ea 100644
--- a/net/6lowpan/core.c
+++ b/net/6lowpan/core.c
@@ -27,11 +27,11 @@ int lowpan_register_netdevice(struct net_device *dev,
dev->mtu = IPV6_MIN_MTU;
dev->priv_flags |= IFF_NO_QUEUE;
- lowpan_priv(dev)->lltype = lltype;
+ lowpan_dev(dev)->lltype = lltype;
- spin_lock_init(&lowpan_priv(dev)->ctx.lock);
+ spin_lock_init(&lowpan_dev(dev)->ctx.lock);
for (i = 0; i < LOWPAN_IPHC_CTX_TABLE_SIZE; i++)
- lowpan_priv(dev)->ctx.table[i].id = i;
+ lowpan_dev(dev)->ctx.table[i].id = i;
ret = register_netdevice(dev);
if (ret < 0)
@@ -85,7 +85,7 @@ static int lowpan_event(struct notifier_block *unused,
case NETDEV_DOWN:
for (i = 0; i < LOWPAN_IPHC_CTX_TABLE_SIZE; i++)
clear_bit(LOWPAN_IPHC_CTX_FLAG_ACTIVE,
- &lowpan_priv(dev)->ctx.table[i].flags);
+ &lowpan_dev(dev)->ctx.table[i].flags);
break;
default:
return NOTIFY_DONE;
diff --git a/net/6lowpan/debugfs.c b/net/6lowpan/debugfs.c
index 0793a8157..acbaa3db4 100644
--- a/net/6lowpan/debugfs.c
+++ b/net/6lowpan/debugfs.c
@@ -172,7 +172,7 @@ static const struct file_operations lowpan_ctx_pfx_fops = {
static int lowpan_dev_debugfs_ctx_init(struct net_device *dev,
struct dentry *ctx, u8 id)
{
- struct lowpan_priv *lpriv = lowpan_priv(dev);
+ struct lowpan_dev *ldev = lowpan_dev(dev);
struct dentry *dentry, *root;
char buf[32];
@@ -185,25 +185,25 @@ static int lowpan_dev_debugfs_ctx_init(struct net_device *dev,
return -EINVAL;
dentry = debugfs_create_file("active", 0644, root,
- &lpriv->ctx.table[id],
+ &ldev->ctx.table[id],
&lowpan_ctx_flag_active_fops);
if (!dentry)
return -EINVAL;
dentry = debugfs_create_file("compression", 0644, root,
- &lpriv->ctx.table[id],
+ &ldev->ctx.table[id],
&lowpan_ctx_flag_c_fops);
if (!dentry)
return -EINVAL;
dentry = debugfs_create_file("prefix", 0644, root,
- &lpriv->ctx.table[id],
+ &ldev->ctx.table[id],
&lowpan_ctx_pfx_fops);
if (!dentry)
return -EINVAL;
dentry = debugfs_create_file("prefix_len", 0644, root,
- &lpriv->ctx.table[id],
+ &ldev->ctx.table[id],
&lowpan_ctx_plen_fops);
if (!dentry)
return -EINVAL;
@@ -247,21 +247,21 @@ static const struct file_operations lowpan_context_fops = {
int lowpan_dev_debugfs_init(struct net_device *dev)
{
- struct lowpan_priv *lpriv = lowpan_priv(dev);
+ struct lowpan_dev *ldev = lowpan_dev(dev);
struct dentry *contexts, *dentry;
int ret, i;
/* creating the root */
- lpriv->iface_debugfs = debugfs_create_dir(dev->name, lowpan_debugfs);
- if (!lpriv->iface_debugfs)
+ ldev->iface_debugfs = debugfs_create_dir(dev->name, lowpan_debugfs);
+ if (!ldev->iface_debugfs)
goto fail;
- contexts = debugfs_create_dir("contexts", lpriv->iface_debugfs);
+ contexts = debugfs_create_dir("contexts", ldev->iface_debugfs);
if (!contexts)
goto remove_root;
dentry = debugfs_create_file("show", 0644, contexts,
- &lowpan_priv(dev)->ctx,
+ &lowpan_dev(dev)->ctx,
&lowpan_context_fops);
if (!dentry)
goto remove_root;
@@ -282,7 +282,7 @@ fail:
void lowpan_dev_debugfs_exit(struct net_device *dev)
{
- debugfs_remove_recursive(lowpan_priv(dev)->iface_debugfs);
+ debugfs_remove_recursive(lowpan_dev(dev)->iface_debugfs);
}
int __init lowpan_debugfs_init(void)
diff --git a/net/6lowpan/iphc.c b/net/6lowpan/iphc.c
index 99bb22aea..8501dd532 100644
--- a/net/6lowpan/iphc.c
+++ b/net/6lowpan/iphc.c
@@ -53,9 +53,6 @@
#include <net/6lowpan.h>
#include <net/ipv6.h>
-/* special link-layer handling */
-#include <net/mac802154.h>
-
#include "6lowpan_i.h"
#include "nhc.h"
@@ -148,35 +145,25 @@
(((a)->s6_addr16[6]) == 0) && \
(((a)->s6_addr[14]) == 0))
+#define lowpan_is_linklocal_zero_padded(a) \
+ (!(hdr->saddr.s6_addr[1] & 0x3f) && \
+ !hdr->saddr.s6_addr16[1] && \
+ !hdr->saddr.s6_addr32[1])
+
#define LOWPAN_IPHC_CID_DCI(cid) (cid & 0x0f)
#define LOWPAN_IPHC_CID_SCI(cid) ((cid & 0xf0) >> 4)
-static inline void iphc_uncompress_eui64_lladdr(struct in6_addr *ipaddr,
- const void *lladdr)
-{
- /* fe:80::XXXX:XXXX:XXXX:XXXX
- * \_________________/
- * hwaddr
- */
- ipaddr->s6_addr[0] = 0xFE;
- ipaddr->s6_addr[1] = 0x80;
- memcpy(&ipaddr->s6_addr[8], lladdr, EUI64_ADDR_LEN);
- /* second bit-flip (Universe/Local)
- * is done according RFC2464
- */
- ipaddr->s6_addr[8] ^= 0x02;
-}
-
-static inline void iphc_uncompress_802154_lladdr(struct in6_addr *ipaddr,
- const void *lladdr)
+static inline void
+lowpan_iphc_uncompress_802154_lladdr(struct in6_addr *ipaddr,
+ const void *lladdr)
{
const struct ieee802154_addr *addr = lladdr;
- u8 eui64[EUI64_ADDR_LEN] = { };
+ u8 eui64[EUI64_ADDR_LEN];
switch (addr->mode) {
case IEEE802154_ADDR_LONG:
ieee802154_le64_to_be64(eui64, &addr->extended_addr);
- iphc_uncompress_eui64_lladdr(ipaddr, eui64);
+ lowpan_iphc_uncompress_eui64_lladdr(ipaddr, eui64);
break;
case IEEE802154_ADDR_SHORT:
/* fe:80::ff:fe00:XXXX
@@ -202,7 +189,7 @@ static inline void iphc_uncompress_802154_lladdr(struct in6_addr *ipaddr,
static struct lowpan_iphc_ctx *
lowpan_iphc_ctx_get_by_id(const struct net_device *dev, u8 id)
{
- struct lowpan_iphc_ctx *ret = &lowpan_priv(dev)->ctx.table[id];
+ struct lowpan_iphc_ctx *ret = &lowpan_dev(dev)->ctx.table[id];
if (!lowpan_iphc_ctx_is_active(ret))
return NULL;
@@ -214,7 +201,7 @@ static struct lowpan_iphc_ctx *
lowpan_iphc_ctx_get_by_addr(const struct net_device *dev,
const struct in6_addr *addr)
{
- struct lowpan_iphc_ctx *table = lowpan_priv(dev)->ctx.table;
+ struct lowpan_iphc_ctx *table = lowpan_dev(dev)->ctx.table;
struct lowpan_iphc_ctx *ret = NULL;
struct in6_addr addr_pfx;
u8 addr_plen;
@@ -258,7 +245,7 @@ static struct lowpan_iphc_ctx *
lowpan_iphc_ctx_get_by_mcast_addr(const struct net_device *dev,
const struct in6_addr *addr)
{
- struct lowpan_iphc_ctx *table = lowpan_priv(dev)->ctx.table;
+ struct lowpan_iphc_ctx *table = lowpan_dev(dev)->ctx.table;
struct lowpan_iphc_ctx *ret = NULL;
struct in6_addr addr_mcast, network_pfx = {};
int i;
@@ -296,9 +283,10 @@ lowpan_iphc_ctx_get_by_mcast_addr(const struct net_device *dev,
*
* address_mode is the masked value for sam or dam value
*/
-static int uncompress_addr(struct sk_buff *skb, const struct net_device *dev,
- struct in6_addr *ipaddr, u8 address_mode,
- const void *lladdr)
+static int lowpan_iphc_uncompress_addr(struct sk_buff *skb,
+ const struct net_device *dev,
+ struct in6_addr *ipaddr,
+ u8 address_mode, const void *lladdr)
{
bool fail;
@@ -327,12 +315,12 @@ static int uncompress_addr(struct sk_buff *skb, const struct net_device *dev,
case LOWPAN_IPHC_SAM_11:
case LOWPAN_IPHC_DAM_11:
fail = false;
- switch (lowpan_priv(dev)->lltype) {
+ switch (lowpan_dev(dev)->lltype) {
case LOWPAN_LLTYPE_IEEE802154:
- iphc_uncompress_802154_lladdr(ipaddr, lladdr);
+ lowpan_iphc_uncompress_802154_lladdr(ipaddr, lladdr);
break;
default:
- iphc_uncompress_eui64_lladdr(ipaddr, lladdr);
+ lowpan_iphc_uncompress_eui64_lladdr(ipaddr, lladdr);
break;
}
break;
@@ -355,11 +343,11 @@ static int uncompress_addr(struct sk_buff *skb, const struct net_device *dev,
/* Uncompress address function for source context
* based address(non-multicast).
*/
-static int uncompress_ctx_addr(struct sk_buff *skb,
- const struct net_device *dev,
- const struct lowpan_iphc_ctx *ctx,
- struct in6_addr *ipaddr, u8 address_mode,
- const void *lladdr)
+static int lowpan_iphc_uncompress_ctx_addr(struct sk_buff *skb,
+ const struct net_device *dev,
+ const struct lowpan_iphc_ctx *ctx,
+ struct in6_addr *ipaddr,
+ u8 address_mode, const void *lladdr)
{
bool fail;
@@ -388,12 +376,12 @@ static int uncompress_ctx_addr(struct sk_buff *skb,
case LOWPAN_IPHC_SAM_11:
case LOWPAN_IPHC_DAM_11:
fail = false;
- switch (lowpan_priv(dev)->lltype) {
+ switch (lowpan_dev(dev)->lltype) {
case LOWPAN_LLTYPE_IEEE802154:
- iphc_uncompress_802154_lladdr(ipaddr, lladdr);
+ lowpan_iphc_uncompress_802154_lladdr(ipaddr, lladdr);
break;
default:
- iphc_uncompress_eui64_lladdr(ipaddr, lladdr);
+ lowpan_iphc_uncompress_eui64_lladdr(ipaddr, lladdr);
break;
}
ipv6_addr_prefix_copy(ipaddr, &ctx->pfx, ctx->plen);
@@ -652,22 +640,24 @@ int lowpan_header_decompress(struct sk_buff *skb, const struct net_device *dev,
}
if (iphc1 & LOWPAN_IPHC_SAC) {
- spin_lock_bh(&lowpan_priv(dev)->ctx.lock);
+ spin_lock_bh(&lowpan_dev(dev)->ctx.lock);
ci = lowpan_iphc_ctx_get_by_id(dev, LOWPAN_IPHC_CID_SCI(cid));
if (!ci) {
- spin_unlock_bh(&lowpan_priv(dev)->ctx.lock);
+ spin_unlock_bh(&lowpan_dev(dev)->ctx.lock);
return -EINVAL;
}
pr_debug("SAC bit is set. Handle context based source address.\n");
- err = uncompress_ctx_addr(skb, dev, ci, &hdr.saddr,
- iphc1 & LOWPAN_IPHC_SAM_MASK, saddr);
- spin_unlock_bh(&lowpan_priv(dev)->ctx.lock);
+ err = lowpan_iphc_uncompress_ctx_addr(skb, dev, ci, &hdr.saddr,
+ iphc1 & LOWPAN_IPHC_SAM_MASK,
+ saddr);
+ spin_unlock_bh(&lowpan_dev(dev)->ctx.lock);
} else {
/* Source address uncompression */
pr_debug("source address stateless compression\n");
- err = uncompress_addr(skb, dev, &hdr.saddr,
- iphc1 & LOWPAN_IPHC_SAM_MASK, saddr);
+ err = lowpan_iphc_uncompress_addr(skb, dev, &hdr.saddr,
+ iphc1 & LOWPAN_IPHC_SAM_MASK,
+ saddr);
}
/* Check on error of previous branch */
@@ -676,10 +666,10 @@ int lowpan_header_decompress(struct sk_buff *skb, const struct net_device *dev,
switch (iphc1 & (LOWPAN_IPHC_M | LOWPAN_IPHC_DAC)) {
case LOWPAN_IPHC_M | LOWPAN_IPHC_DAC:
- spin_lock_bh(&lowpan_priv(dev)->ctx.lock);
+ spin_lock_bh(&lowpan_dev(dev)->ctx.lock);
ci = lowpan_iphc_ctx_get_by_id(dev, LOWPAN_IPHC_CID_DCI(cid));
if (!ci) {
- spin_unlock_bh(&lowpan_priv(dev)->ctx.lock);
+ spin_unlock_bh(&lowpan_dev(dev)->ctx.lock);
return -EINVAL;
}
@@ -688,7 +678,7 @@ int lowpan_header_decompress(struct sk_buff *skb, const struct net_device *dev,
err = lowpan_uncompress_multicast_ctx_daddr(skb, ci,
&hdr.daddr,
iphc1 & LOWPAN_IPHC_DAM_MASK);
- spin_unlock_bh(&lowpan_priv(dev)->ctx.lock);
+ spin_unlock_bh(&lowpan_dev(dev)->ctx.lock);
break;
case LOWPAN_IPHC_M:
/* multicast */
@@ -696,22 +686,24 @@ int lowpan_header_decompress(struct sk_buff *skb, const struct net_device *dev,
iphc1 & LOWPAN_IPHC_DAM_MASK);
break;
case LOWPAN_IPHC_DAC:
- spin_lock_bh(&lowpan_priv(dev)->ctx.lock);
+ spin_lock_bh(&lowpan_dev(dev)->ctx.lock);
ci = lowpan_iphc_ctx_get_by_id(dev, LOWPAN_IPHC_CID_DCI(cid));
if (!ci) {
- spin_unlock_bh(&lowpan_priv(dev)->ctx.lock);
+ spin_unlock_bh(&lowpan_dev(dev)->ctx.lock);
return -EINVAL;
}
/* Destination address context based uncompression */
pr_debug("DAC bit is set. Handle context based destination address.\n");
- err = uncompress_ctx_addr(skb, dev, ci, &hdr.daddr,
- iphc1 & LOWPAN_IPHC_DAM_MASK, daddr);
- spin_unlock_bh(&lowpan_priv(dev)->ctx.lock);
+ err = lowpan_iphc_uncompress_ctx_addr(skb, dev, ci, &hdr.daddr,
+ iphc1 & LOWPAN_IPHC_DAM_MASK,
+ daddr);
+ spin_unlock_bh(&lowpan_dev(dev)->ctx.lock);
break;
default:
- err = uncompress_addr(skb, dev, &hdr.daddr,
- iphc1 & LOWPAN_IPHC_DAM_MASK, daddr);
+ err = lowpan_iphc_uncompress_addr(skb, dev, &hdr.daddr,
+ iphc1 & LOWPAN_IPHC_DAM_MASK,
+ daddr);
pr_debug("dest: stateless compression mode %d dest %pI6c\n",
iphc1 & LOWPAN_IPHC_DAM_MASK, &hdr.daddr);
break;
@@ -731,7 +723,7 @@ int lowpan_header_decompress(struct sk_buff *skb, const struct net_device *dev,
return err;
}
- switch (lowpan_priv(dev)->lltype) {
+ switch (lowpan_dev(dev)->lltype) {
case LOWPAN_LLTYPE_IEEE802154:
if (lowpan_802154_cb(skb)->d_size)
hdr.payload_len = htons(lowpan_802154_cb(skb)->d_size -
@@ -1028,7 +1020,7 @@ int lowpan_header_compress(struct sk_buff *skb, const struct net_device *dev,
skb->data, skb->len);
ipv6_daddr_type = ipv6_addr_type(&hdr->daddr);
- spin_lock_bh(&lowpan_priv(dev)->ctx.lock);
+ spin_lock_bh(&lowpan_dev(dev)->ctx.lock);
if (ipv6_daddr_type & IPV6_ADDR_MULTICAST)
dci = lowpan_iphc_ctx_get_by_mcast_addr(dev, &hdr->daddr);
else
@@ -1037,15 +1029,15 @@ int lowpan_header_compress(struct sk_buff *skb, const struct net_device *dev,
memcpy(&dci_entry, dci, sizeof(*dci));
cid |= dci->id;
}
- spin_unlock_bh(&lowpan_priv(dev)->ctx.lock);
+ spin_unlock_bh(&lowpan_dev(dev)->ctx.lock);
- spin_lock_bh(&lowpan_priv(dev)->ctx.lock);
+ spin_lock_bh(&lowpan_dev(dev)->ctx.lock);
sci = lowpan_iphc_ctx_get_by_addr(dev, &hdr->saddr);
if (sci) {
memcpy(&sci_entry, sci, sizeof(*sci));
cid |= (sci->id << 4);
}
- spin_unlock_bh(&lowpan_priv(dev)->ctx.lock);
+ spin_unlock_bh(&lowpan_dev(dev)->ctx.lock);
/* if cid is zero it will be compressed */
if (cid) {
@@ -1101,7 +1093,8 @@ int lowpan_header_compress(struct sk_buff *skb, const struct net_device *dev,
true);
iphc1 |= LOWPAN_IPHC_SAC;
} else {
- if (ipv6_saddr_type & IPV6_ADDR_LINKLOCAL) {
+ if (ipv6_saddr_type & IPV6_ADDR_LINKLOCAL &&
+ lowpan_is_linklocal_zero_padded(hdr->saddr)) {
iphc1 |= lowpan_compress_addr_64(&hc_ptr,
&hdr->saddr,
saddr, true);
@@ -1135,7 +1128,8 @@ int lowpan_header_compress(struct sk_buff *skb, const struct net_device *dev,
false);
iphc1 |= LOWPAN_IPHC_DAC;
} else {
- if (ipv6_daddr_type & IPV6_ADDR_LINKLOCAL) {
+ if (ipv6_daddr_type & IPV6_ADDR_LINKLOCAL &&
+ lowpan_is_linklocal_zero_padded(hdr->daddr)) {
iphc1 |= lowpan_compress_addr_64(&hc_ptr,
&hdr->daddr,
daddr, false);
diff --git a/net/6lowpan/nhc_udp.c b/net/6lowpan/nhc_udp.c
index 69537a2ea..225d91906 100644
--- a/net/6lowpan/nhc_udp.c
+++ b/net/6lowpan/nhc_udp.c
@@ -91,7 +91,7 @@ static int udp_uncompress(struct sk_buff *skb, size_t needed)
* here, we obtain the hint from the remaining size of the
* frame
*/
- switch (lowpan_priv(skb->dev)->lltype) {
+ switch (lowpan_dev(skb->dev)->lltype) {
case LOWPAN_LLTYPE_IEEE802154:
if (lowpan_802154_cb(skb)->d_size)
uh.len = htons(lowpan_802154_cb(skb)->d_size -
diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c
index a1e273af6..82a116ba5 100644
--- a/net/8021q/vlan.c
+++ b/net/8021q/vlan.c
@@ -290,6 +290,10 @@ static void vlan_sync_address(struct net_device *dev,
if (ether_addr_equal(vlan->real_dev_addr, dev->dev_addr))
return;
+ /* vlan continues to inherit address of lower device */
+ if (vlan_dev_inherit_address(vlandev, dev))
+ goto out;
+
/* vlan address was different from the old address and is equal to
* the new address */
if (!ether_addr_equal(vlandev->dev_addr, vlan->real_dev_addr) &&
@@ -302,6 +306,7 @@ static void vlan_sync_address(struct net_device *dev,
!ether_addr_equal(vlandev->dev_addr, dev->dev_addr))
dev_uc_add(dev, vlandev->dev_addr);
+out:
ether_addr_copy(vlan->real_dev_addr, dev->dev_addr);
}
diff --git a/net/8021q/vlan.h b/net/8021q/vlan.h
index 9d010a09a..cc1557978 100644
--- a/net/8021q/vlan.h
+++ b/net/8021q/vlan.h
@@ -109,6 +109,8 @@ int vlan_check_real_dev(struct net_device *real_dev,
void vlan_setup(struct net_device *dev);
int register_vlan_dev(struct net_device *dev);
void unregister_vlan_dev(struct net_device *dev, struct list_head *head);
+bool vlan_dev_inherit_address(struct net_device *dev,
+ struct net_device *real_dev);
static inline u32 vlan_get_ingress_priority(struct net_device *dev,
u16 vlan_tci)
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index e7e62570b..516b0e732 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -146,10 +146,12 @@ static netdev_tx_t vlan_dev_hard_start_xmit(struct sk_buff *skb,
static int vlan_dev_change_mtu(struct net_device *dev, int new_mtu)
{
- /* TODO: gotta make sure the underlying layer can handle it,
- * maybe an IFF_VLAN_CAPABLE flag for devices?
- */
- if (vlan_dev_priv(dev)->real_dev->mtu < new_mtu)
+ struct net_device *real_dev = vlan_dev_priv(dev)->real_dev;
+ unsigned int max_mtu = real_dev->mtu;
+
+ if (netif_reduces_vlan_mtu(real_dev))
+ max_mtu -= VLAN_HLEN;
+ if (max_mtu < new_mtu)
return -ERANGE;
dev->mtu = new_mtu;
@@ -245,6 +247,17 @@ void vlan_dev_get_realdev_name(const struct net_device *dev, char *result)
strncpy(result, vlan_dev_priv(dev)->real_dev->name, 23);
}
+bool vlan_dev_inherit_address(struct net_device *dev,
+ struct net_device *real_dev)
+{
+ if (dev->addr_assign_type != NET_ADDR_STOLEN)
+ return false;
+
+ ether_addr_copy(dev->dev_addr, real_dev->dev_addr);
+ call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
+ return true;
+}
+
static int vlan_dev_open(struct net_device *dev)
{
struct vlan_dev_priv *vlan = vlan_dev_priv(dev);
@@ -255,7 +268,8 @@ static int vlan_dev_open(struct net_device *dev)
!(vlan->flags & VLAN_FLAG_LOOSE_BINDING))
return -ENETDOWN;
- if (!ether_addr_equal(dev->dev_addr, real_dev->dev_addr)) {
+ if (!ether_addr_equal(dev->dev_addr, real_dev->dev_addr) &&
+ !vlan_dev_inherit_address(dev, real_dev)) {
err = dev_uc_add(real_dev, dev->dev_addr);
if (err < 0)
goto out;
@@ -560,8 +574,10 @@ static int vlan_dev_init(struct net_device *dev)
/* ipv6 shared card related stuff */
dev->dev_id = real_dev->dev_id;
- if (is_zero_ether_addr(dev->dev_addr))
- eth_hw_addr_inherit(dev, real_dev);
+ if (is_zero_ether_addr(dev->dev_addr)) {
+ ether_addr_copy(dev->dev_addr, real_dev->dev_addr);
+ dev->addr_assign_type = NET_ADDR_STOLEN;
+ }
if (is_zero_ether_addr(dev->broadcast))
memcpy(dev->broadcast, real_dev->broadcast, dev->addr_len);
diff --git a/net/8021q/vlan_netlink.c b/net/8021q/vlan_netlink.c
index c92b52f37..1270207f3 100644
--- a/net/8021q/vlan_netlink.c
+++ b/net/8021q/vlan_netlink.c
@@ -118,6 +118,7 @@ static int vlan_newlink(struct net *src_net, struct net_device *dev,
{
struct vlan_dev_priv *vlan = vlan_dev_priv(dev);
struct net_device *real_dev;
+ unsigned int max_mtu;
__be16 proto;
int err;
@@ -144,9 +145,11 @@ static int vlan_newlink(struct net *src_net, struct net_device *dev,
if (err < 0)
return err;
+ max_mtu = netif_reduces_vlan_mtu(real_dev) ? real_dev->mtu - VLAN_HLEN :
+ real_dev->mtu;
if (!tb[IFLA_MTU])
- dev->mtu = real_dev->mtu;
- else if (dev->mtu > real_dev->mtu)
+ dev->mtu = max_mtu;
+ else if (dev->mtu > max_mtu)
return -EINVAL;
err = vlan_changelink(dev, tb, data);
diff --git a/net/9p/client.c b/net/9p/client.c
index ea79ee9a7..3fc94a49c 100644
--- a/net/9p/client.c
+++ b/net/9p/client.c
@@ -518,10 +518,10 @@ static int p9_check_errors(struct p9_client *c, struct p9_req_t *req)
if (err)
goto out_err;
- if (p9_is_proto_dotu(c))
+ if (p9_is_proto_dotu(c) && ecode < 512)
err = -ecode;
- if (!err || !IS_ERR_VALUE(err)) {
+ if (!err) {
err = p9_errstr2errno(ename, strlen(ename));
p9_debug(P9_DEBUG_9P, "<<< RERROR (%d) %s\n",
@@ -605,10 +605,10 @@ static int p9_check_zc_errors(struct p9_client *c, struct p9_req_t *req,
if (err)
goto out_err;
- if (p9_is_proto_dotu(c))
+ if (p9_is_proto_dotu(c) && ecode < 512)
err = -ecode;
- if (!err || !IS_ERR_VALUE(err)) {
+ if (!err) {
err = p9_errstr2errno(ename, strlen(ename));
p9_debug(P9_DEBUG_9P, "<<< RERROR (%d) %s\n",
diff --git a/net/Kconfig b/net/Kconfig
index a8934d8c8..ff40562a7 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -236,6 +236,7 @@ source "net/mpls/Kconfig"
source "net/hsr/Kconfig"
source "net/switchdev/Kconfig"
source "net/l3mdev/Kconfig"
+source "net/qrtr/Kconfig"
config RPS
bool
@@ -288,14 +289,17 @@ config BQL
config BPF_JIT
bool "enable BPF Just In Time compiler"
- depends on HAVE_BPF_JIT
+ depends on HAVE_CBPF_JIT || HAVE_EBPF_JIT
depends on MODULES
---help---
Berkeley Packet Filter filtering capabilities are normally handled
by an interpreter. This option allows kernel to generate a native
code when filter is loaded in memory. This should speedup
- packet sniffing (libpcap/tcpdump). Note : Admin should enable
- this feature changing /proc/sys/net/core/bpf_jit_enable
+ packet sniffing (libpcap/tcpdump).
+
+ Note, admin should enable this feature changing:
+ /proc/sys/net/core/bpf_jit_enable
+ /proc/sys/net/core/bpf_jit_harden (optional)
config NET_FLOW_LIMIT
bool
@@ -418,6 +422,14 @@ config MAY_USE_DEVLINK
endif # if NET
-# Used by archs to tell that they support BPF_JIT
-config HAVE_BPF_JIT
+# Used by archs to tell that they support BPF JIT compiler plus which flavour.
+# Only one of the two can be selected for a specific arch since eBPF JIT supersedes
+# the cBPF JIT.
+
+# Classic BPF JIT (cBPF)
+config HAVE_CBPF_JIT
+ bool
+
+# Extended BPF JIT (eBPF)
+config HAVE_EBPF_JIT
bool
diff --git a/net/Makefile b/net/Makefile
index 81d14119e..bdd14553a 100644
--- a/net/Makefile
+++ b/net/Makefile
@@ -78,3 +78,4 @@ endif
ifneq ($(CONFIG_NET_L3_MASTER_DEV),)
obj-y += l3mdev/
endif
+obj-$(CONFIG_QRTR) += qrtr/
diff --git a/net/atm/lec.c b/net/atm/lec.c
index cd3b37989..e574a7e9d 100644
--- a/net/atm/lec.c
+++ b/net/atm/lec.c
@@ -194,7 +194,7 @@ lec_send(struct atm_vcc *vcc, struct sk_buff *skb)
static void lec_tx_timeout(struct net_device *dev)
{
pr_info("%s\n", dev->name);
- dev->trans_start = jiffies;
+ netif_trans_update(dev);
netif_wake_queue(dev);
}
@@ -324,7 +324,7 @@ static netdev_tx_t lec_start_xmit(struct sk_buff *skb,
out:
if (entry)
lec_arp_put(entry);
- dev->trans_start = jiffies;
+ netif_trans_update(dev);
return NETDEV_TX_OK;
}
diff --git a/net/atm/signaling.c b/net/atm/signaling.c
index 4fd6af473..adb6e3d21 100644
--- a/net/atm/signaling.c
+++ b/net/atm/signaling.c
@@ -124,7 +124,7 @@ as_indicate_complete:
break;
case as_addparty:
case as_dropparty:
- sk->sk_err_soft = msg->reply;
+ sk->sk_err_soft = -msg->reply;
/* < 0 failure, otherwise ep_ref */
clear_bit(ATM_VF_WAITING, &vcc->flags);
break;
diff --git a/net/atm/svc.c b/net/atm/svc.c
index 3fa0a9ee9..878563a83 100644
--- a/net/atm/svc.c
+++ b/net/atm/svc.c
@@ -546,7 +546,7 @@ static int svc_addparty(struct socket *sock, struct sockaddr *sockaddr,
schedule();
}
finish_wait(sk_sleep(sk), &wait);
- error = xchg(&sk->sk_err_soft, 0);
+ error = -xchg(&sk->sk_err_soft, 0);
out:
release_sock(sk);
return error;
@@ -573,7 +573,7 @@ static int svc_dropparty(struct socket *sock, int ep_ref)
error = -EUNATCH;
goto out;
}
- error = xchg(&sk->sk_err_soft, 0);
+ error = -xchg(&sk->sk_err_soft, 0);
out:
release_sock(sk);
return error;
diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c
index cb2d1b9b0..ce2f20304 100644
--- a/net/batman-adv/bat_iv_ogm.c
+++ b/net/batman-adv/bat_iv_ogm.c
@@ -32,6 +32,7 @@
#include <linux/jiffies.h>
#include <linux/list.h>
#include <linux/kref.h>
+#include <linux/lockdep.h>
#include <linux/netdevice.h>
#include <linux/pkt_sched.h>
#include <linux/printk.h>
@@ -156,10 +157,8 @@ static int batadv_iv_ogm_orig_add_if(struct batadv_orig_node *orig_node,
orig_node->bat_iv.bcast_own = data_ptr;
data_ptr = kmalloc_array(max_if_num, sizeof(u8), GFP_ATOMIC);
- if (!data_ptr) {
- kfree(orig_node->bat_iv.bcast_own);
+ if (!data_ptr)
goto unlock;
- }
memcpy(data_ptr, orig_node->bat_iv.bcast_own_sum,
(max_if_num - 1) * sizeof(u8));
@@ -175,71 +174,107 @@ unlock:
}
/**
- * batadv_iv_ogm_orig_del_if - change the private structures of the orig_node to
- * exclude the removed interface
+ * batadv_iv_ogm_drop_bcast_own_entry - drop section of bcast_own
* @orig_node: the orig_node that has to be changed
* @max_if_num: the current amount of interfaces
* @del_if_num: the index of the interface being removed
- *
- * Return: 0 on success, a negative error code otherwise.
*/
-static int batadv_iv_ogm_orig_del_if(struct batadv_orig_node *orig_node,
- int max_if_num, int del_if_num)
+static void
+batadv_iv_ogm_drop_bcast_own_entry(struct batadv_orig_node *orig_node,
+ int max_if_num, int del_if_num)
{
- int ret = -ENOMEM;
- size_t chunk_size, if_offset;
- void *data_ptr = NULL;
-
- spin_lock_bh(&orig_node->bat_iv.ogm_cnt_lock);
+ size_t chunk_size;
+ size_t if_offset;
+ void *data_ptr;
- /* last interface was removed */
- if (max_if_num == 0)
- goto free_bcast_own;
+ lockdep_assert_held(&orig_node->bat_iv.ogm_cnt_lock);
chunk_size = sizeof(unsigned long) * BATADV_NUM_WORDS;
data_ptr = kmalloc_array(max_if_num, chunk_size, GFP_ATOMIC);
if (!data_ptr)
- goto unlock;
+ /* use old buffer when new one could not be allocated */
+ data_ptr = orig_node->bat_iv.bcast_own;
/* copy first part */
- memcpy(data_ptr, orig_node->bat_iv.bcast_own, del_if_num * chunk_size);
+ memmove(data_ptr, orig_node->bat_iv.bcast_own, del_if_num * chunk_size);
/* copy second part */
if_offset = (del_if_num + 1) * chunk_size;
- memcpy((char *)data_ptr + del_if_num * chunk_size,
- (uint8_t *)orig_node->bat_iv.bcast_own + if_offset,
- (max_if_num - del_if_num) * chunk_size);
+ memmove((char *)data_ptr + del_if_num * chunk_size,
+ (uint8_t *)orig_node->bat_iv.bcast_own + if_offset,
+ (max_if_num - del_if_num) * chunk_size);
-free_bcast_own:
- kfree(orig_node->bat_iv.bcast_own);
- orig_node->bat_iv.bcast_own = data_ptr;
+ /* bcast_own was shrunk down in new buffer; free old one */
+ if (orig_node->bat_iv.bcast_own != data_ptr) {
+ kfree(orig_node->bat_iv.bcast_own);
+ orig_node->bat_iv.bcast_own = data_ptr;
+ }
+}
+
+/**
+ * batadv_iv_ogm_drop_bcast_own_sum_entry - drop section of bcast_own_sum
+ * @orig_node: the orig_node that has to be changed
+ * @max_if_num: the current amount of interfaces
+ * @del_if_num: the index of the interface being removed
+ */
+static void
+batadv_iv_ogm_drop_bcast_own_sum_entry(struct batadv_orig_node *orig_node,
+ int max_if_num, int del_if_num)
+{
+ size_t if_offset;
+ void *data_ptr;
- if (max_if_num == 0)
- goto free_own_sum;
+ lockdep_assert_held(&orig_node->bat_iv.ogm_cnt_lock);
data_ptr = kmalloc_array(max_if_num, sizeof(u8), GFP_ATOMIC);
- if (!data_ptr) {
- kfree(orig_node->bat_iv.bcast_own);
- goto unlock;
- }
+ if (!data_ptr)
+ /* use old buffer when new one could not be allocated */
+ data_ptr = orig_node->bat_iv.bcast_own_sum;
- memcpy(data_ptr, orig_node->bat_iv.bcast_own_sum,
- del_if_num * sizeof(u8));
+ memmove(data_ptr, orig_node->bat_iv.bcast_own_sum,
+ del_if_num * sizeof(u8));
if_offset = (del_if_num + 1) * sizeof(u8);
- memcpy((char *)data_ptr + del_if_num * sizeof(u8),
- orig_node->bat_iv.bcast_own_sum + if_offset,
- (max_if_num - del_if_num) * sizeof(u8));
+ memmove((char *)data_ptr + del_if_num * sizeof(u8),
+ orig_node->bat_iv.bcast_own_sum + if_offset,
+ (max_if_num - del_if_num) * sizeof(u8));
+
+ /* bcast_own_sum was shrunk down in new buffer; free old one */
+ if (orig_node->bat_iv.bcast_own_sum != data_ptr) {
+ kfree(orig_node->bat_iv.bcast_own_sum);
+ orig_node->bat_iv.bcast_own_sum = data_ptr;
+ }
+}
-free_own_sum:
- kfree(orig_node->bat_iv.bcast_own_sum);
- orig_node->bat_iv.bcast_own_sum = data_ptr;
+/**
+ * batadv_iv_ogm_orig_del_if - change the private structures of the orig_node to
+ * exclude the removed interface
+ * @orig_node: the orig_node that has to be changed
+ * @max_if_num: the current amount of interfaces
+ * @del_if_num: the index of the interface being removed
+ *
+ * Return: 0 on success, a negative error code otherwise.
+ */
+static int batadv_iv_ogm_orig_del_if(struct batadv_orig_node *orig_node,
+ int max_if_num, int del_if_num)
+{
+ spin_lock_bh(&orig_node->bat_iv.ogm_cnt_lock);
+
+ if (max_if_num == 0) {
+ kfree(orig_node->bat_iv.bcast_own);
+ kfree(orig_node->bat_iv.bcast_own_sum);
+ orig_node->bat_iv.bcast_own = NULL;
+ orig_node->bat_iv.bcast_own_sum = NULL;
+ } else {
+ batadv_iv_ogm_drop_bcast_own_entry(orig_node, max_if_num,
+ del_if_num);
+ batadv_iv_ogm_drop_bcast_own_sum_entry(orig_node, max_if_num,
+ del_if_num);
+ }
- ret = 0;
-unlock:
spin_unlock_bh(&orig_node->bat_iv.ogm_cnt_lock);
- return ret;
+ return 0;
}
/**
@@ -644,18 +679,12 @@ static void batadv_iv_ogm_aggregate_new(const unsigned char *packet_buff,
unsigned char *skb_buff;
unsigned int skb_size;
- if (!kref_get_unless_zero(&if_incoming->refcount))
- return;
-
- if (!kref_get_unless_zero(&if_outgoing->refcount))
- goto out_free_incoming;
-
/* own packet should always be scheduled */
if (!own_packet) {
if (!batadv_atomic_dec_not_zero(&bat_priv->batman_queue_left)) {
batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
"batman packet queue full\n");
- goto out_free_outgoing;
+ return;
}
}
@@ -681,6 +710,8 @@ static void batadv_iv_ogm_aggregate_new(const unsigned char *packet_buff,
forw_packet_aggr->packet_len = packet_len;
memcpy(skb_buff, packet_buff, packet_len);
+ kref_get(&if_incoming->refcount);
+ kref_get(&if_outgoing->refcount);
forw_packet_aggr->own = own_packet;
forw_packet_aggr->if_incoming = if_incoming;
forw_packet_aggr->if_outgoing = if_outgoing;
@@ -710,10 +741,6 @@ out_free_forw_packet:
out_nomem:
if (!own_packet)
atomic_inc(&bat_priv->batman_queue_left);
-out_free_outgoing:
- batadv_hardif_put(if_outgoing);
-out_free_incoming:
- batadv_hardif_put(if_incoming);
}
/* aggregate a new packet into the existing ogm packet */
@@ -950,9 +977,15 @@ static void batadv_iv_ogm_schedule(struct batadv_hard_iface *hard_iface)
list_for_each_entry_rcu(tmp_hard_iface, &batadv_hardif_list, list) {
if (tmp_hard_iface->soft_iface != hard_iface->soft_iface)
continue;
+
+ if (!kref_get_unless_zero(&tmp_hard_iface->refcount))
+ continue;
+
batadv_iv_ogm_queue_add(bat_priv, *ogm_buff,
*ogm_buff_len, hard_iface,
tmp_hard_iface, 1, send_time);
+
+ batadv_hardif_put(tmp_hard_iface);
}
rcu_read_unlock();
@@ -1133,13 +1166,13 @@ out:
* @if_incoming: interface where the packet was received
* @if_outgoing: interface for which the retransmission should be considered
*
- * Return: 1 if the link can be considered bidirectional, 0 otherwise
+ * Return: true if the link can be considered bidirectional, false otherwise
*/
-static int batadv_iv_ogm_calc_tq(struct batadv_orig_node *orig_node,
- struct batadv_orig_node *orig_neigh_node,
- struct batadv_ogm_packet *batadv_ogm_packet,
- struct batadv_hard_iface *if_incoming,
- struct batadv_hard_iface *if_outgoing)
+static bool batadv_iv_ogm_calc_tq(struct batadv_orig_node *orig_node,
+ struct batadv_orig_node *orig_neigh_node,
+ struct batadv_ogm_packet *batadv_ogm_packet,
+ struct batadv_hard_iface *if_incoming,
+ struct batadv_hard_iface *if_outgoing)
{
struct batadv_priv *bat_priv = netdev_priv(if_incoming->soft_iface);
struct batadv_neigh_node *neigh_node = NULL, *tmp_neigh_node;
@@ -1147,9 +1180,11 @@ static int batadv_iv_ogm_calc_tq(struct batadv_orig_node *orig_node,
u8 total_count;
u8 orig_eq_count, neigh_rq_count, neigh_rq_inv, tq_own;
unsigned int neigh_rq_inv_cube, neigh_rq_max_cube;
- int tq_asym_penalty, inv_asym_penalty, if_num, ret = 0;
+ int if_num;
+ unsigned int tq_asym_penalty, inv_asym_penalty;
unsigned int combined_tq;
- int tq_iface_penalty;
+ unsigned int tq_iface_penalty;
+ bool ret = false;
/* find corresponding one hop neighbor */
rcu_read_lock();
@@ -1261,7 +1296,7 @@ static int batadv_iv_ogm_calc_tq(struct batadv_orig_node *orig_node,
* consider it bidirectional
*/
if (batadv_ogm_packet->tq >= BATADV_TQ_TOTAL_BIDRECT_LIMIT)
- ret = 1;
+ ret = true;
out:
if (neigh_node)
@@ -1290,9 +1325,9 @@ batadv_iv_ogm_update_seqnos(const struct ethhdr *ethhdr,
struct batadv_orig_ifinfo *orig_ifinfo = NULL;
struct batadv_neigh_node *neigh_node;
struct batadv_neigh_ifinfo *neigh_ifinfo;
- int is_dup;
+ bool is_dup;
s32 seq_diff;
- int need_update = 0;
+ bool need_update = false;
int set_mark;
enum batadv_dup_status ret = BATADV_NO_DUP;
u32 seqno = ntohl(batadv_ogm_packet->seqno);
@@ -1402,7 +1437,7 @@ batadv_iv_ogm_process_per_outif(const struct sk_buff *skb, int ogm_offset,
struct sk_buff *skb_priv;
struct ethhdr *ethhdr;
u8 *prev_sender;
- int is_bidirect;
+ bool is_bidirect;
/* create a private copy of the skb, as some functions change tq value
* and/or flags.
@@ -1730,8 +1765,13 @@ static void batadv_iv_ogm_process(const struct sk_buff *skb, int ogm_offset,
if (hard_iface->soft_iface != bat_priv->soft_iface)
continue;
+ if (!kref_get_unless_zero(&hard_iface->refcount))
+ continue;
+
batadv_iv_ogm_process_per_outif(skb, ogm_offset, orig_node,
if_incoming, hard_iface);
+
+ batadv_hardif_put(hard_iface);
}
rcu_read_unlock();
@@ -1829,9 +1869,8 @@ static void batadv_iv_ogm_orig_print(struct batadv_priv *bat_priv,
int batman_count = 0;
u32 i;
- seq_printf(seq, " %-15s %s (%s/%i) %17s [%10s]: %20s ...\n",
- "Originator", "last-seen", "#", BATADV_TQ_MAX_VALUE,
- "Nexthop", "outgoingIF", "Potential nexthops");
+ seq_puts(seq,
+ " Originator last-seen (#/255) Nexthop [outgoingIF]: Potential nexthops ...\n");
for (i = 0; i < hash->size; i++) {
head = &hash->table[i];
@@ -1911,8 +1950,7 @@ static void batadv_iv_neigh_print(struct batadv_priv *bat_priv,
struct batadv_hard_iface *hard_iface;
int batman_count = 0;
- seq_printf(seq, " %10s %-13s %s\n",
- "IF", "Neighbor", "last-seen");
+ seq_puts(seq, " IF Neighbor last-seen\n");
rcu_read_lock();
list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) {
diff --git a/net/batman-adv/bat_v.c b/net/batman-adv/bat_v.c
index 4026f198a..0a12e5cdd 100644
--- a/net/batman-adv/bat_v.c
+++ b/net/batman-adv/bat_v.c
@@ -27,6 +27,7 @@
#include <linux/rculist.h>
#include <linux/rcupdate.h>
#include <linux/seq_file.h>
+#include <linux/stddef.h>
#include <linux/types.h>
#include <linux/workqueue.h>
@@ -39,6 +40,16 @@
static void batadv_v_iface_activate(struct batadv_hard_iface *hard_iface)
{
+ struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface);
+ struct batadv_hard_iface *primary_if;
+
+ primary_if = batadv_primary_if_get_selected(bat_priv);
+
+ if (primary_if) {
+ batadv_v_elp_iface_activate(primary_if, hard_iface);
+ batadv_hardif_put(primary_if);
+ }
+
/* B.A.T.M.A.N. V does not use any queuing mechanism, therefore it can
* set the interface as ACTIVE right away, without any risk of race
* condition
@@ -72,16 +83,34 @@ static void batadv_v_iface_disable(struct batadv_hard_iface *hard_iface)
batadv_v_elp_iface_disable(hard_iface);
}
-static void batadv_v_iface_update_mac(struct batadv_hard_iface *hard_iface)
-{
-}
-
static void batadv_v_primary_iface_set(struct batadv_hard_iface *hard_iface)
{
batadv_v_elp_primary_iface_set(hard_iface);
batadv_v_ogm_primary_iface_set(hard_iface);
}
+/**
+ * batadv_v_iface_update_mac - react to hard-interface MAC address change
+ * @hard_iface: the modified interface
+ *
+ * If the modified interface is the primary one, update the originator
+ * address in the ELP and OGM messages to reflect the new MAC address.
+ */
+static void batadv_v_iface_update_mac(struct batadv_hard_iface *hard_iface)
+{
+ struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface);
+ struct batadv_hard_iface *primary_if;
+
+ primary_if = batadv_primary_if_get_selected(bat_priv);
+ if (primary_if != hard_iface)
+ goto out;
+
+ batadv_v_primary_iface_set(hard_iface);
+out:
+ if (primary_if)
+ batadv_hardif_put(primary_if);
+}
+
static void
batadv_v_hardif_neigh_init(struct batadv_hardif_neigh_node *hardif_neigh)
{
@@ -162,8 +191,8 @@ static void batadv_v_neigh_print(struct batadv_priv *bat_priv,
struct batadv_hard_iface *hard_iface;
int batman_count = 0;
- seq_printf(seq, " %-15s %s (%11s) [%10s]\n", "Neighbor",
- "last-seen", "throughput", "IF");
+ seq_puts(seq,
+ " Neighbor last-seen ( throughput) [ IF]\n");
rcu_read_lock();
list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) {
@@ -202,9 +231,8 @@ static void batadv_v_orig_print(struct batadv_priv *bat_priv,
int batman_count = 0;
u32 i;
- seq_printf(seq, " %-15s %s (%11s) %17s [%10s]: %20s ...\n",
- "Originator", "last-seen", "throughput", "Nexthop",
- "outgoingIF", "Potential nexthops");
+ seq_puts(seq,
+ " Originator last-seen ( throughput) Nexthop [outgoingIF]: Potential nexthops ...\n");
for (i = 0; i < hash->size; i++) {
head = &hash->table[i];
@@ -256,14 +284,23 @@ static int batadv_v_neigh_cmp(struct batadv_neigh_node *neigh1,
struct batadv_hard_iface *if_outgoing2)
{
struct batadv_neigh_ifinfo *ifinfo1, *ifinfo2;
+ int ret = 0;
ifinfo1 = batadv_neigh_ifinfo_get(neigh1, if_outgoing1);
+ if (WARN_ON(!ifinfo1))
+ goto err_ifinfo1;
+
ifinfo2 = batadv_neigh_ifinfo_get(neigh2, if_outgoing2);
+ if (WARN_ON(!ifinfo2))
+ goto err_ifinfo2;
- if (WARN_ON(!ifinfo1 || !ifinfo2))
- return 0;
+ ret = ifinfo1->bat_v.throughput - ifinfo2->bat_v.throughput;
- return ifinfo1->bat_v.throughput - ifinfo2->bat_v.throughput;
+ batadv_neigh_ifinfo_put(ifinfo2);
+err_ifinfo2:
+ batadv_neigh_ifinfo_put(ifinfo1);
+err_ifinfo1:
+ return ret;
}
static bool batadv_v_neigh_is_sob(struct batadv_neigh_node *neigh1,
@@ -273,14 +310,26 @@ static bool batadv_v_neigh_is_sob(struct batadv_neigh_node *neigh1,
{
struct batadv_neigh_ifinfo *ifinfo1, *ifinfo2;
u32 threshold;
+ bool ret = false;
ifinfo1 = batadv_neigh_ifinfo_get(neigh1, if_outgoing1);
+ if (WARN_ON(!ifinfo1))
+ goto err_ifinfo1;
+
ifinfo2 = batadv_neigh_ifinfo_get(neigh2, if_outgoing2);
+ if (WARN_ON(!ifinfo2))
+ goto err_ifinfo2;
threshold = ifinfo1->bat_v.throughput / 4;
threshold = ifinfo1->bat_v.throughput - threshold;
- return ifinfo2->bat_v.throughput > threshold;
+ ret = ifinfo2->bat_v.throughput > threshold;
+
+ batadv_neigh_ifinfo_put(ifinfo2);
+err_ifinfo2:
+ batadv_neigh_ifinfo_put(ifinfo1);
+err_ifinfo1:
+ return ret;
}
static struct batadv_algo_ops batadv_batman_v __read_mostly = {
diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c
index 3844e7efd..df42eb136 100644
--- a/net/batman-adv/bat_v_elp.c
+++ b/net/batman-adv/bat_v_elp.c
@@ -377,6 +377,27 @@ void batadv_v_elp_iface_disable(struct batadv_hard_iface *hard_iface)
}
/**
+ * batadv_v_elp_iface_activate - update the ELP buffer belonging to the given
+ * hard-interface
+ * @primary_iface: the new primary interface
+ * @hard_iface: interface holding the to-be-updated buffer
+ */
+void batadv_v_elp_iface_activate(struct batadv_hard_iface *primary_iface,
+ struct batadv_hard_iface *hard_iface)
+{
+ struct batadv_elp_packet *elp_packet;
+ struct sk_buff *skb;
+
+ if (!hard_iface->bat_v.elp_skb)
+ return;
+
+ skb = hard_iface->bat_v.elp_skb;
+ elp_packet = (struct batadv_elp_packet *)skb->data;
+ ether_addr_copy(elp_packet->orig,
+ primary_iface->net_dev->dev_addr);
+}
+
+/**
* batadv_v_elp_primary_iface_set - change internal data to reflect the new
* primary interface
* @primary_iface: the new primary interface
@@ -384,8 +405,6 @@ void batadv_v_elp_iface_disable(struct batadv_hard_iface *hard_iface)
void batadv_v_elp_primary_iface_set(struct batadv_hard_iface *primary_iface)
{
struct batadv_hard_iface *hard_iface;
- struct batadv_elp_packet *elp_packet;
- struct sk_buff *skb;
/* update orig field of every elp iface belonging to this mesh */
rcu_read_lock();
@@ -393,13 +412,7 @@ void batadv_v_elp_primary_iface_set(struct batadv_hard_iface *primary_iface)
if (primary_iface->soft_iface != hard_iface->soft_iface)
continue;
- if (!hard_iface->bat_v.elp_skb)
- continue;
-
- skb = hard_iface->bat_v.elp_skb;
- elp_packet = (struct batadv_elp_packet *)skb->data;
- ether_addr_copy(elp_packet->orig,
- primary_iface->net_dev->dev_addr);
+ batadv_v_elp_iface_activate(primary_iface, hard_iface);
}
rcu_read_unlock();
}
diff --git a/net/batman-adv/bat_v_elp.h b/net/batman-adv/bat_v_elp.h
index e95f1bca0..cc130b2d0 100644
--- a/net/batman-adv/bat_v_elp.h
+++ b/net/batman-adv/bat_v_elp.h
@@ -25,6 +25,8 @@ struct work_struct;
int batadv_v_elp_iface_enable(struct batadv_hard_iface *hard_iface);
void batadv_v_elp_iface_disable(struct batadv_hard_iface *hard_iface);
+void batadv_v_elp_iface_activate(struct batadv_hard_iface *primary_iface,
+ struct batadv_hard_iface *hard_iface);
void batadv_v_elp_primary_iface_set(struct batadv_hard_iface *primary_iface);
int batadv_v_elp_packet_recv(struct sk_buff *skb,
struct batadv_hard_iface *if_incoming);
diff --git a/net/batman-adv/bat_v_ogm.c b/net/batman-adv/bat_v_ogm.c
index 91df28a10..473ebb9a0 100644
--- a/net/batman-adv/bat_v_ogm.c
+++ b/net/batman-adv/bat_v_ogm.c
@@ -26,6 +26,7 @@
#include <linux/if_ether.h>
#include <linux/jiffies.h>
#include <linux/kernel.h>
+#include <linux/kref.h>
#include <linux/list.h>
#include <linux/netdevice.h>
#include <linux/random.h>
@@ -176,6 +177,9 @@ static void batadv_v_ogm_send(struct work_struct *work)
if (hard_iface->soft_iface != bat_priv->soft_iface)
continue;
+ if (!kref_get_unless_zero(&hard_iface->refcount))
+ continue;
+
batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
"Sending own OGM2 packet (originator %pM, seqno %u, throughput %u, TTL %d) on interface %s [%pM]\n",
ogm_packet->orig, ntohl(ogm_packet->seqno),
@@ -185,10 +189,13 @@ static void batadv_v_ogm_send(struct work_struct *work)
/* this skb gets consumed by batadv_v_ogm_send_to_if() */
skb_tmp = skb_clone(skb, GFP_ATOMIC);
- if (!skb_tmp)
+ if (!skb_tmp) {
+ batadv_hardif_put(hard_iface);
break;
+ }
batadv_v_ogm_send_to_if(skb_tmp, hard_iface);
+ batadv_hardif_put(hard_iface);
}
rcu_read_unlock();
@@ -234,73 +241,6 @@ void batadv_v_ogm_primary_iface_set(struct batadv_hard_iface *primary_iface)
}
/**
- * batadv_v_ogm_orig_update - update the originator status based on the received
- * OGM
- * @bat_priv: the bat priv with all the soft interface information
- * @orig_node: the originator to update
- * @neigh_node: the neighbour the OGM has been received from (to update)
- * @ogm2: the received OGM
- * @if_outgoing: the interface where this OGM is going to be forwarded through
- */
-static void
-batadv_v_ogm_orig_update(struct batadv_priv *bat_priv,
- struct batadv_orig_node *orig_node,
- struct batadv_neigh_node *neigh_node,
- const struct batadv_ogm2_packet *ogm2,
- struct batadv_hard_iface *if_outgoing)
-{
- struct batadv_neigh_ifinfo *router_ifinfo = NULL, *neigh_ifinfo = NULL;
- struct batadv_neigh_node *router = NULL;
- s32 neigh_seq_diff;
- u32 neigh_last_seqno;
- u32 router_last_seqno;
- u32 router_throughput, neigh_throughput;
-
- batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
- "Searching and updating originator entry of received packet\n");
-
- /* if this neighbor already is our next hop there is nothing
- * to change
- */
- router = batadv_orig_router_get(orig_node, if_outgoing);
- if (router == neigh_node)
- goto out;
-
- /* don't consider neighbours with worse throughput.
- * also switch route if this seqno is BATADV_V_MAX_ORIGDIFF newer than
- * the last received seqno from our best next hop.
- */
- if (router) {
- router_ifinfo = batadv_neigh_ifinfo_get(router, if_outgoing);
- neigh_ifinfo = batadv_neigh_ifinfo_get(neigh_node, if_outgoing);
-
- /* if these are not allocated, something is wrong. */
- if (!router_ifinfo || !neigh_ifinfo)
- goto out;
-
- neigh_last_seqno = neigh_ifinfo->bat_v.last_seqno;
- router_last_seqno = router_ifinfo->bat_v.last_seqno;
- neigh_seq_diff = neigh_last_seqno - router_last_seqno;
- router_throughput = router_ifinfo->bat_v.throughput;
- neigh_throughput = neigh_ifinfo->bat_v.throughput;
-
- if ((neigh_seq_diff < BATADV_OGM_MAX_ORIGDIFF) &&
- (router_throughput >= neigh_throughput))
- goto out;
- }
-
- batadv_update_route(bat_priv, orig_node, if_outgoing, neigh_node);
-
-out:
- if (router_ifinfo)
- batadv_neigh_ifinfo_put(router_ifinfo);
- if (neigh_ifinfo)
- batadv_neigh_ifinfo_put(neigh_ifinfo);
- if (router)
- batadv_neigh_node_put(router);
-}
-
-/**
* batadv_v_forward_penalty - apply a penalty to the throughput metric forwarded
* with B.A.T.M.A.N. V OGMs
* @bat_priv: the bat priv with all the soft interface information
@@ -347,10 +287,12 @@ static u32 batadv_v_forward_penalty(struct batadv_priv *bat_priv,
}
/**
- * batadv_v_ogm_forward - forward an OGM to the given outgoing interface
+ * batadv_v_ogm_forward - check conditions and forward an OGM to the given
+ * outgoing interface
* @bat_priv: the bat priv with all the soft interface information
* @ogm_received: previously received OGM to be forwarded
- * @throughput: throughput to announce, may vary per outgoing interface
+ * @orig_node: the originator which has been updated
+ * @neigh_node: the neigh_node through with the OGM has been received
* @if_incoming: the interface on which this OGM was received on
* @if_outgoing: the interface to which the OGM has to be forwarded to
*
@@ -359,28 +301,57 @@ static u32 batadv_v_forward_penalty(struct batadv_priv *bat_priv,
*/
static void batadv_v_ogm_forward(struct batadv_priv *bat_priv,
const struct batadv_ogm2_packet *ogm_received,
- u32 throughput,
+ struct batadv_orig_node *orig_node,
+ struct batadv_neigh_node *neigh_node,
struct batadv_hard_iface *if_incoming,
struct batadv_hard_iface *if_outgoing)
{
+ struct batadv_neigh_ifinfo *neigh_ifinfo = NULL;
+ struct batadv_orig_ifinfo *orig_ifinfo = NULL;
+ struct batadv_neigh_node *router = NULL;
struct batadv_ogm2_packet *ogm_forward;
unsigned char *skb_buff;
struct sk_buff *skb;
size_t packet_len;
u16 tvlv_len;
+ /* only forward for specific interfaces, not for the default one. */
+ if (if_outgoing == BATADV_IF_DEFAULT)
+ goto out;
+
+ orig_ifinfo = batadv_orig_ifinfo_new(orig_node, if_outgoing);
+ if (!orig_ifinfo)
+ goto out;
+
+ /* acquire possibly updated router */
+ router = batadv_orig_router_get(orig_node, if_outgoing);
+
+ /* strict rule: forward packets coming from the best next hop only */
+ if (neigh_node != router)
+ goto out;
+
+ /* don't forward the same seqno twice on one interface */
+ if (orig_ifinfo->last_seqno_forwarded == ntohl(ogm_received->seqno))
+ goto out;
+
+ orig_ifinfo->last_seqno_forwarded = ntohl(ogm_received->seqno);
+
if (ogm_received->ttl <= 1) {
batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "ttl exceeded\n");
- return;
+ goto out;
}
+ neigh_ifinfo = batadv_neigh_ifinfo_get(neigh_node, if_outgoing);
+ if (!neigh_ifinfo)
+ goto out;
+
tvlv_len = ntohs(ogm_received->tvlv_len);
packet_len = BATADV_OGM2_HLEN + tvlv_len;
skb = netdev_alloc_skb_ip_align(if_outgoing->net_dev,
ETH_HLEN + packet_len);
if (!skb)
- return;
+ goto out;
skb_reserve(skb, ETH_HLEN);
skb_buff = skb_put(skb, packet_len);
@@ -388,15 +359,23 @@ static void batadv_v_ogm_forward(struct batadv_priv *bat_priv,
/* apply forward penalty */
ogm_forward = (struct batadv_ogm2_packet *)skb_buff;
- ogm_forward->throughput = htonl(throughput);
+ ogm_forward->throughput = htonl(neigh_ifinfo->bat_v.throughput);
ogm_forward->ttl--;
batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
"Forwarding OGM2 packet on %s: throughput %u, ttl %u, received via %s\n",
- if_outgoing->net_dev->name, throughput, ogm_forward->ttl,
- if_incoming->net_dev->name);
+ if_outgoing->net_dev->name, ntohl(ogm_forward->throughput),
+ ogm_forward->ttl, if_incoming->net_dev->name);
batadv_v_ogm_send_to_if(skb, if_outgoing);
+
+out:
+ if (orig_ifinfo)
+ batadv_orig_ifinfo_put(orig_ifinfo);
+ if (router)
+ batadv_neigh_node_put(router);
+ if (neigh_ifinfo)
+ batadv_neigh_ifinfo_put(neigh_ifinfo);
}
/**
@@ -493,8 +472,10 @@ out:
* @neigh_node: the neigh_node through with the OGM has been received
* @if_incoming: the interface where this packet was received
* @if_outgoing: the interface for which the packet should be considered
+ *
+ * Return: true if the packet should be forwarded, false otherwise
*/
-static void batadv_v_ogm_route_update(struct batadv_priv *bat_priv,
+static bool batadv_v_ogm_route_update(struct batadv_priv *bat_priv,
const struct ethhdr *ethhdr,
const struct batadv_ogm2_packet *ogm2,
struct batadv_orig_node *orig_node,
@@ -503,14 +484,14 @@ static void batadv_v_ogm_route_update(struct batadv_priv *bat_priv,
struct batadv_hard_iface *if_outgoing)
{
struct batadv_neigh_node *router = NULL;
- struct batadv_neigh_ifinfo *neigh_ifinfo = NULL;
struct batadv_orig_node *orig_neigh_node = NULL;
- struct batadv_orig_ifinfo *orig_ifinfo = NULL;
struct batadv_neigh_node *orig_neigh_router = NULL;
-
- neigh_ifinfo = batadv_neigh_ifinfo_get(neigh_node, if_outgoing);
- if (!neigh_ifinfo)
- goto out;
+ struct batadv_neigh_ifinfo *router_ifinfo = NULL, *neigh_ifinfo = NULL;
+ u32 router_throughput, neigh_throughput;
+ u32 router_last_seqno;
+ u32 neigh_last_seqno;
+ s32 neigh_seq_diff;
+ bool forward = false;
orig_neigh_node = batadv_v_ogm_orig_get(bat_priv, ethhdr->h_source);
if (!orig_neigh_node)
@@ -529,49 +510,57 @@ static void batadv_v_ogm_route_update(struct batadv_priv *bat_priv,
goto out;
}
- if (router) {
- batadv_neigh_node_put(router);
- router = NULL;
- }
+ /* Mark the OGM to be considered for forwarding, and update routes
+ * if needed.
+ */
+ forward = true;
- /* Update routes, and check if the OGM is from the best next hop */
- batadv_v_ogm_orig_update(bat_priv, orig_node, neigh_node, ogm2,
- if_outgoing);
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "Searching and updating originator entry of received packet\n");
- orig_ifinfo = batadv_orig_ifinfo_new(orig_node, if_outgoing);
- if (!orig_ifinfo)
+ /* if this neighbor already is our next hop there is nothing
+ * to change
+ */
+ if (router == neigh_node)
goto out;
- /* don't forward the same seqno twice on one interface */
- if (orig_ifinfo->last_seqno_forwarded == ntohl(ogm2->seqno))
- goto out;
+ /* don't consider neighbours with worse throughput.
+ * also switch route if this seqno is BATADV_V_MAX_ORIGDIFF newer than
+ * the last received seqno from our best next hop.
+ */
+ if (router) {
+ router_ifinfo = batadv_neigh_ifinfo_get(router, if_outgoing);
+ neigh_ifinfo = batadv_neigh_ifinfo_get(neigh_node, if_outgoing);
- /* acquire possibly updated router */
- router = batadv_orig_router_get(orig_node, if_outgoing);
+ /* if these are not allocated, something is wrong. */
+ if (!router_ifinfo || !neigh_ifinfo)
+ goto out;
- /* strict rule: forward packets coming from the best next hop only */
- if (neigh_node != router)
- goto out;
+ neigh_last_seqno = neigh_ifinfo->bat_v.last_seqno;
+ router_last_seqno = router_ifinfo->bat_v.last_seqno;
+ neigh_seq_diff = neigh_last_seqno - router_last_seqno;
+ router_throughput = router_ifinfo->bat_v.throughput;
+ neigh_throughput = neigh_ifinfo->bat_v.throughput;
- /* only forward for specific interface, not for the default one. */
- if (if_outgoing != BATADV_IF_DEFAULT) {
- orig_ifinfo->last_seqno_forwarded = ntohl(ogm2->seqno);
- batadv_v_ogm_forward(bat_priv, ogm2,
- neigh_ifinfo->bat_v.throughput,
- if_incoming, if_outgoing);
+ if ((neigh_seq_diff < BATADV_OGM_MAX_ORIGDIFF) &&
+ (router_throughput >= neigh_throughput))
+ goto out;
}
+ batadv_update_route(bat_priv, orig_node, if_outgoing, neigh_node);
out:
- if (orig_ifinfo)
- batadv_orig_ifinfo_put(orig_ifinfo);
if (router)
batadv_neigh_node_put(router);
if (orig_neigh_router)
batadv_neigh_node_put(orig_neigh_router);
if (orig_neigh_node)
batadv_orig_node_put(orig_neigh_node);
+ if (router_ifinfo)
+ batadv_neigh_ifinfo_put(router_ifinfo);
if (neigh_ifinfo)
batadv_neigh_ifinfo_put(neigh_ifinfo);
+
+ return forward;
}
/**
@@ -594,6 +583,7 @@ batadv_v_ogm_process_per_outif(struct batadv_priv *bat_priv,
struct batadv_hard_iface *if_outgoing)
{
int seqno_age;
+ bool forward;
/* first, update the metric with according sanity checks */
seqno_age = batadv_v_ogm_metric_update(bat_priv, ogm2, orig_node,
@@ -612,8 +602,14 @@ batadv_v_ogm_process_per_outif(struct batadv_priv *bat_priv,
ntohs(ogm2->tvlv_len));
/* if the metric update went through, update routes if needed */
- batadv_v_ogm_route_update(bat_priv, ethhdr, ogm2, orig_node,
- neigh_node, if_incoming, if_outgoing);
+ forward = batadv_v_ogm_route_update(bat_priv, ethhdr, ogm2, orig_node,
+ neigh_node, if_incoming,
+ if_outgoing);
+
+ /* if the routes have been processed correctly, check and forward */
+ if (forward)
+ batadv_v_ogm_forward(bat_priv, ogm2, orig_node, neigh_node,
+ if_incoming, if_outgoing);
}
/**
@@ -715,9 +711,14 @@ static void batadv_v_ogm_process(const struct sk_buff *skb, int ogm_offset,
if (hard_iface->soft_iface != bat_priv->soft_iface)
continue;
+ if (!kref_get_unless_zero(&hard_iface->refcount))
+ continue;
+
batadv_v_ogm_process_per_outif(bat_priv, ethhdr, ogm_packet,
orig_node, neigh_node,
if_incoming, hard_iface);
+
+ batadv_hardif_put(hard_iface);
}
rcu_read_unlock();
out:
diff --git a/net/batman-adv/bitarray.c b/net/batman-adv/bitarray.c
index b56bb000a..a0c791383 100644
--- a/net/batman-adv/bitarray.c
+++ b/net/batman-adv/bitarray.c
@@ -38,11 +38,11 @@ static void batadv_bitmap_shift_left(unsigned long *seq_bits, s32 n)
* the last sequence number
* @set_mark: whether this packet should be marked in seq_bits
*
- * Return: 1 if the window was moved (either new or very old),
- * 0 if the window was not moved/shifted.
+ * Return: true if the window was moved (either new or very old),
+ * false if the window was not moved/shifted.
*/
-int batadv_bit_get_packet(void *priv, unsigned long *seq_bits, s32 seq_num_diff,
- int set_mark)
+bool batadv_bit_get_packet(void *priv, unsigned long *seq_bits,
+ s32 seq_num_diff, int set_mark)
{
struct batadv_priv *bat_priv = priv;
@@ -52,7 +52,7 @@ int batadv_bit_get_packet(void *priv, unsigned long *seq_bits, s32 seq_num_diff,
if (seq_num_diff <= 0 && seq_num_diff > -BATADV_TQ_LOCAL_WINDOW_SIZE) {
if (set_mark)
batadv_set_bit(seq_bits, -seq_num_diff);
- return 0;
+ return false;
}
/* sequence number is slightly newer, so we shift the window and
@@ -63,7 +63,7 @@ int batadv_bit_get_packet(void *priv, unsigned long *seq_bits, s32 seq_num_diff,
if (set_mark)
batadv_set_bit(seq_bits, 0);
- return 1;
+ return true;
}
/* sequence number is much newer, probably missed a lot of packets */
@@ -75,7 +75,7 @@ int batadv_bit_get_packet(void *priv, unsigned long *seq_bits, s32 seq_num_diff,
bitmap_zero(seq_bits, BATADV_TQ_LOCAL_WINDOW_SIZE);
if (set_mark)
batadv_set_bit(seq_bits, 0);
- return 1;
+ return true;
}
/* received a much older packet. The other host either restarted
@@ -94,5 +94,5 @@ int batadv_bit_get_packet(void *priv, unsigned long *seq_bits, s32 seq_num_diff,
if (set_mark)
batadv_set_bit(seq_bits, 0);
- return 1;
+ return true;
}
diff --git a/net/batman-adv/bitarray.h b/net/batman-adv/bitarray.h
index 3e41bb80e..0e6e9d090 100644
--- a/net/batman-adv/bitarray.h
+++ b/net/batman-adv/bitarray.h
@@ -22,6 +22,7 @@
#include <linux/bitops.h>
#include <linux/compiler.h>
+#include <linux/stddef.h>
#include <linux/types.h>
/**
@@ -31,17 +32,17 @@
* @last_seqno: latest sequence number in seq_bits
* @curr_seqno: sequence number to test for
*
- * Return: 1 if the corresponding bit in the given seq_bits indicates true
- * and curr_seqno is within range of last_seqno. Otherwise returns 0.
+ * Return: true if the corresponding bit in the given seq_bits indicates true
+ * and curr_seqno is within range of last_seqno. Otherwise returns false.
*/
-static inline int batadv_test_bit(const unsigned long *seq_bits,
- u32 last_seqno, u32 curr_seqno)
+static inline bool batadv_test_bit(const unsigned long *seq_bits,
+ u32 last_seqno, u32 curr_seqno)
{
s32 diff;
diff = last_seqno - curr_seqno;
if (diff < 0 || diff >= BATADV_TQ_LOCAL_WINDOW_SIZE)
- return 0;
+ return false;
return test_bit(diff, seq_bits) != 0;
}
@@ -55,7 +56,7 @@ static inline void batadv_set_bit(unsigned long *seq_bits, s32 n)
set_bit(n, seq_bits); /* turn the position on */
}
-int batadv_bit_get_packet(void *priv, unsigned long *seq_bits, s32 seq_num_diff,
- int set_mark);
+bool batadv_bit_get_packet(void *priv, unsigned long *seq_bits,
+ s32 seq_num_diff, int set_mark);
#endif /* _NET_BATMAN_ADV_BITARRAY_H_ */
diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c
index 0a6c8b824..825a5cdf4 100644
--- a/net/batman-adv/bridge_loop_avoidance.c
+++ b/net/batman-adv/bridge_loop_avoidance.c
@@ -50,6 +50,7 @@
#include "hash.h"
#include "originator.h"
#include "packet.h"
+#include "sysfs.h"
#include "translation-table.h"
static const u8 batadv_announce_mac[4] = {0x43, 0x05, 0x43, 0x05};
@@ -100,10 +101,10 @@ static inline u32 batadv_choose_backbone_gw(const void *data, u32 size)
* @node: list node of the first entry to compare
* @data2: pointer to the second backbone gateway
*
- * Return: 1 if the backbones have the same data, 0 otherwise
+ * Return: true if the backbones have the same data, false otherwise
*/
-static int batadv_compare_backbone_gw(const struct hlist_node *node,
- const void *data2)
+static bool batadv_compare_backbone_gw(const struct hlist_node *node,
+ const void *data2)
{
const void *data1 = container_of(node, struct batadv_bla_backbone_gw,
hash_entry);
@@ -111,23 +112,23 @@ static int batadv_compare_backbone_gw(const struct hlist_node *node,
const struct batadv_bla_backbone_gw *gw2 = data2;
if (!batadv_compare_eth(gw1->orig, gw2->orig))
- return 0;
+ return false;
if (gw1->vid != gw2->vid)
- return 0;
+ return false;
- return 1;
+ return true;
}
/**
- * batadv_compare_backbone_gw - compare address and vid of two claims
+ * batadv_compare_claim - compare address and vid of two claims
* @node: list node of the first entry to compare
* @data2: pointer to the second claims
*
- * Return: 1 if the claim have the same data, 0 otherwise
+ * Return: true if the claim have the same data, 0 otherwise
*/
-static int batadv_compare_claim(const struct hlist_node *node,
- const void *data2)
+static bool batadv_compare_claim(const struct hlist_node *node,
+ const void *data2)
{
const void *data1 = container_of(node, struct batadv_bla_claim,
hash_entry);
@@ -135,12 +136,12 @@ static int batadv_compare_claim(const struct hlist_node *node,
const struct batadv_bla_claim *cl2 = data2;
if (!batadv_compare_eth(cl1->addr, cl2->addr))
- return 0;
+ return false;
if (cl1->vid != cl2->vid)
- return 0;
+ return false;
- return 1;
+ return true;
}
/**
@@ -176,10 +177,21 @@ static void batadv_backbone_gw_put(struct batadv_bla_backbone_gw *backbone_gw)
static void batadv_claim_release(struct kref *ref)
{
struct batadv_bla_claim *claim;
+ struct batadv_bla_backbone_gw *old_backbone_gw;
claim = container_of(ref, struct batadv_bla_claim, refcount);
- batadv_backbone_gw_put(claim->backbone_gw);
+ spin_lock_bh(&claim->backbone_lock);
+ old_backbone_gw = claim->backbone_gw;
+ claim->backbone_gw = NULL;
+ spin_unlock_bh(&claim->backbone_lock);
+
+ spin_lock_bh(&old_backbone_gw->crc_lock);
+ old_backbone_gw->crc ^= crc16(0, claim->addr, ETH_ALEN);
+ spin_unlock_bh(&old_backbone_gw->crc_lock);
+
+ batadv_backbone_gw_put(old_backbone_gw);
+
kfree_rcu(claim, rcu);
}
@@ -200,9 +212,9 @@ static void batadv_claim_put(struct batadv_bla_claim *claim)
*
* Return: claim if found or NULL otherwise.
*/
-static struct batadv_bla_claim
-*batadv_claim_hash_find(struct batadv_priv *bat_priv,
- struct batadv_bla_claim *data)
+static struct batadv_bla_claim *
+batadv_claim_hash_find(struct batadv_priv *bat_priv,
+ struct batadv_bla_claim *data)
{
struct batadv_hashtable *hash = bat_priv->bla.claim_hash;
struct hlist_head *head;
@@ -407,11 +419,22 @@ static void batadv_bla_send_claim(struct batadv_priv *bat_priv, u8 *mac,
ethhdr->h_source, ethhdr->h_dest,
BATADV_PRINT_VID(vid));
break;
+ case BATADV_CLAIM_TYPE_LOOPDETECT:
+ ether_addr_copy(ethhdr->h_source, mac);
+ batadv_dbg(BATADV_DBG_BLA, bat_priv,
+ "bla_send_claim(): LOOPDETECT of %pM to %pM on vid %d\n",
+ ethhdr->h_source, ethhdr->h_dest,
+ BATADV_PRINT_VID(vid));
+
+ break;
}
- if (vid & BATADV_VLAN_HAS_TAG)
+ if (vid & BATADV_VLAN_HAS_TAG) {
skb = vlan_insert_tag(skb, htons(ETH_P_8021Q),
vid & VLAN_VID_MASK);
+ if (!skb)
+ goto out;
+ }
skb_reset_mac_header(skb);
skb->protocol = eth_type_trans(skb, soft_iface);
@@ -427,6 +450,36 @@ out:
}
/**
+ * batadv_bla_loopdetect_report - worker for reporting the loop
+ * @work: work queue item
+ *
+ * Throws an uevent, as the loopdetect check function can't do that itself
+ * since the kernel may sleep while throwing uevents.
+ */
+static void batadv_bla_loopdetect_report(struct work_struct *work)
+{
+ struct batadv_bla_backbone_gw *backbone_gw;
+ struct batadv_priv *bat_priv;
+ char vid_str[6] = { '\0' };
+
+ backbone_gw = container_of(work, struct batadv_bla_backbone_gw,
+ report_work);
+ bat_priv = backbone_gw->bat_priv;
+
+ batadv_info(bat_priv->soft_iface,
+ "Possible loop on VLAN %d detected which can't be handled by BLA - please check your network setup!\n",
+ BATADV_PRINT_VID(backbone_gw->vid));
+ snprintf(vid_str, sizeof(vid_str), "%d",
+ BATADV_PRINT_VID(backbone_gw->vid));
+ vid_str[sizeof(vid_str) - 1] = 0;
+
+ batadv_throw_uevent(bat_priv, BATADV_UEV_BLA, BATADV_UEV_LOOPDETECT,
+ vid_str);
+
+ batadv_backbone_gw_put(backbone_gw);
+}
+
+/**
* batadv_bla_get_backbone_gw - finds or creates a backbone gateway
* @bat_priv: the bat priv with all the soft interface information
* @orig: the mac address of the originator
@@ -464,6 +517,7 @@ batadv_bla_get_backbone_gw(struct batadv_priv *bat_priv, u8 *orig,
atomic_set(&entry->request_sent, 0);
atomic_set(&entry->wait_periods, 0);
ether_addr_copy(entry->orig, orig);
+ INIT_WORK(&entry->report_work, batadv_bla_loopdetect_report);
/* one for the hash, one for returning */
kref_init(&entry->refcount);
@@ -634,8 +688,10 @@ static void batadv_bla_add_claim(struct batadv_priv *bat_priv,
const u8 *mac, const unsigned short vid,
struct batadv_bla_backbone_gw *backbone_gw)
{
+ struct batadv_bla_backbone_gw *old_backbone_gw;
struct batadv_bla_claim *claim;
struct batadv_bla_claim search_claim;
+ bool remove_crc = false;
int hash_added;
ether_addr_copy(search_claim.addr, mac);
@@ -649,8 +705,10 @@ static void batadv_bla_add_claim(struct batadv_priv *bat_priv,
return;
ether_addr_copy(claim->addr, mac);
+ spin_lock_init(&claim->backbone_lock);
claim->vid = vid;
claim->lasttime = jiffies;
+ kref_get(&backbone_gw->refcount);
claim->backbone_gw = backbone_gw;
kref_init(&claim->refcount);
@@ -678,15 +736,26 @@ static void batadv_bla_add_claim(struct batadv_priv *bat_priv,
"bla_add_claim(): changing ownership for %pM, vid %d\n",
mac, BATADV_PRINT_VID(vid));
- spin_lock_bh(&claim->backbone_gw->crc_lock);
- claim->backbone_gw->crc ^= crc16(0, claim->addr, ETH_ALEN);
- spin_unlock_bh(&claim->backbone_gw->crc_lock);
- batadv_backbone_gw_put(claim->backbone_gw);
+ remove_crc = true;
}
- /* set (new) backbone gw */
+
+ /* replace backbone_gw atomically and adjust reference counters */
+ spin_lock_bh(&claim->backbone_lock);
+ old_backbone_gw = claim->backbone_gw;
kref_get(&backbone_gw->refcount);
claim->backbone_gw = backbone_gw;
+ spin_unlock_bh(&claim->backbone_lock);
+ if (remove_crc) {
+ /* remove claim address from old backbone_gw */
+ spin_lock_bh(&old_backbone_gw->crc_lock);
+ old_backbone_gw->crc ^= crc16(0, claim->addr, ETH_ALEN);
+ spin_unlock_bh(&old_backbone_gw->crc_lock);
+ }
+
+ batadv_backbone_gw_put(old_backbone_gw);
+
+ /* add claim address to new backbone_gw */
spin_lock_bh(&backbone_gw->crc_lock);
backbone_gw->crc ^= crc16(0, claim->addr, ETH_ALEN);
spin_unlock_bh(&backbone_gw->crc_lock);
@@ -697,6 +766,26 @@ claim_free_ref:
}
/**
+ * batadv_bla_claim_get_backbone_gw - Get valid reference for backbone_gw of
+ * claim
+ * @claim: claim whose backbone_gw should be returned
+ *
+ * Return: valid reference to claim::backbone_gw
+ */
+static struct batadv_bla_backbone_gw *
+batadv_bla_claim_get_backbone_gw(struct batadv_bla_claim *claim)
+{
+ struct batadv_bla_backbone_gw *backbone_gw;
+
+ spin_lock_bh(&claim->backbone_lock);
+ backbone_gw = claim->backbone_gw;
+ kref_get(&backbone_gw->refcount);
+ spin_unlock_bh(&claim->backbone_lock);
+
+ return backbone_gw;
+}
+
+/**
* batadv_bla_del_claim - delete a claim from the claim hash
* @bat_priv: the bat priv with all the soft interface information
* @mac: mac address of the claim to be removed
@@ -720,10 +809,6 @@ static void batadv_bla_del_claim(struct batadv_priv *bat_priv,
batadv_choose_claim, claim);
batadv_claim_put(claim); /* reference from the hash is gone */
- spin_lock_bh(&claim->backbone_gw->crc_lock);
- claim->backbone_gw->crc ^= crc16(0, claim->addr, ETH_ALEN);
- spin_unlock_bh(&claim->backbone_gw->crc_lock);
-
/* don't need the reference from hash_find() anymore */
batadv_claim_put(claim);
}
@@ -735,22 +820,22 @@ static void batadv_bla_del_claim(struct batadv_priv *bat_priv,
* @backbone_addr: originator address of the sender (Ethernet source MAC)
* @vid: the VLAN ID of the frame
*
- * Return: 1 if handled
+ * Return: true if handled
*/
-static int batadv_handle_announce(struct batadv_priv *bat_priv, u8 *an_addr,
- u8 *backbone_addr, unsigned short vid)
+static bool batadv_handle_announce(struct batadv_priv *bat_priv, u8 *an_addr,
+ u8 *backbone_addr, unsigned short vid)
{
struct batadv_bla_backbone_gw *backbone_gw;
u16 backbone_crc, crc;
if (memcmp(an_addr, batadv_announce_mac, 4) != 0)
- return 0;
+ return false;
backbone_gw = batadv_bla_get_backbone_gw(bat_priv, backbone_addr, vid,
false);
if (unlikely(!backbone_gw))
- return 1;
+ return true;
/* handle as ANNOUNCE frame */
backbone_gw->lasttime = jiffies;
@@ -783,7 +868,7 @@ static int batadv_handle_announce(struct batadv_priv *bat_priv, u8 *an_addr,
}
batadv_backbone_gw_put(backbone_gw);
- return 1;
+ return true;
}
/**
@@ -794,29 +879,29 @@ static int batadv_handle_announce(struct batadv_priv *bat_priv, u8 *an_addr,
* @ethhdr: ethernet header of a packet
* @vid: the VLAN ID of the frame
*
- * Return: 1 if handled
+ * Return: true if handled
*/
-static int batadv_handle_request(struct batadv_priv *bat_priv,
- struct batadv_hard_iface *primary_if,
- u8 *backbone_addr, struct ethhdr *ethhdr,
- unsigned short vid)
+static bool batadv_handle_request(struct batadv_priv *bat_priv,
+ struct batadv_hard_iface *primary_if,
+ u8 *backbone_addr, struct ethhdr *ethhdr,
+ unsigned short vid)
{
/* check for REQUEST frame */
if (!batadv_compare_eth(backbone_addr, ethhdr->h_dest))
- return 0;
+ return false;
/* sanity check, this should not happen on a normal switch,
* we ignore it in this case.
*/
if (!batadv_compare_eth(ethhdr->h_dest, primary_if->net_dev->dev_addr))
- return 1;
+ return true;
batadv_dbg(BATADV_DBG_BLA, bat_priv,
"handle_request(): REQUEST vid %d (sent by %pM)...\n",
BATADV_PRINT_VID(vid), ethhdr->h_source);
batadv_bla_answer_request(bat_priv, primary_if, vid);
- return 1;
+ return true;
}
/**
@@ -827,12 +912,12 @@ static int batadv_handle_request(struct batadv_priv *bat_priv,
* @claim_addr: Client to be unclaimed (ARP sender HW MAC)
* @vid: the VLAN ID of the frame
*
- * Return: 1 if handled
+ * Return: true if handled
*/
-static int batadv_handle_unclaim(struct batadv_priv *bat_priv,
- struct batadv_hard_iface *primary_if,
- u8 *backbone_addr, u8 *claim_addr,
- unsigned short vid)
+static bool batadv_handle_unclaim(struct batadv_priv *bat_priv,
+ struct batadv_hard_iface *primary_if,
+ u8 *backbone_addr, u8 *claim_addr,
+ unsigned short vid)
{
struct batadv_bla_backbone_gw *backbone_gw;
@@ -845,7 +930,7 @@ static int batadv_handle_unclaim(struct batadv_priv *bat_priv,
backbone_gw = batadv_backbone_hash_find(bat_priv, backbone_addr, vid);
if (!backbone_gw)
- return 1;
+ return true;
/* this must be an UNCLAIM frame */
batadv_dbg(BATADV_DBG_BLA, bat_priv,
@@ -854,7 +939,7 @@ static int batadv_handle_unclaim(struct batadv_priv *bat_priv,
batadv_bla_del_claim(bat_priv, claim_addr, vid);
batadv_backbone_gw_put(backbone_gw);
- return 1;
+ return true;
}
/**
@@ -865,12 +950,12 @@ static int batadv_handle_unclaim(struct batadv_priv *bat_priv,
* @claim_addr: client mac address to be claimed (ARP sender HW MAC)
* @vid: the VLAN ID of the frame
*
- * Return: 1 if handled
+ * Return: true if handled
*/
-static int batadv_handle_claim(struct batadv_priv *bat_priv,
- struct batadv_hard_iface *primary_if,
- u8 *backbone_addr, u8 *claim_addr,
- unsigned short vid)
+static bool batadv_handle_claim(struct batadv_priv *bat_priv,
+ struct batadv_hard_iface *primary_if,
+ u8 *backbone_addr, u8 *claim_addr,
+ unsigned short vid)
{
struct batadv_bla_backbone_gw *backbone_gw;
@@ -880,7 +965,7 @@ static int batadv_handle_claim(struct batadv_priv *bat_priv,
false);
if (unlikely(!backbone_gw))
- return 1;
+ return true;
/* this must be a CLAIM frame */
batadv_bla_add_claim(bat_priv, claim_addr, vid, backbone_gw);
@@ -891,7 +976,7 @@ static int batadv_handle_claim(struct batadv_priv *bat_priv,
/* TODO: we could call something like tt_local_del() here. */
batadv_backbone_gw_put(backbone_gw);
- return 1;
+ return true;
}
/**
@@ -975,12 +1060,12 @@ static int batadv_check_claim_group(struct batadv_priv *bat_priv,
* @primary_if: the primary hard interface of this batman soft interface
* @skb: the frame to be checked
*
- * Return: 1 if it was a claim frame, otherwise return 0 to
+ * Return: true if it was a claim frame, otherwise return false to
* tell the callee that it can use the frame on its own.
*/
-static int batadv_bla_process_claim(struct batadv_priv *bat_priv,
- struct batadv_hard_iface *primary_if,
- struct sk_buff *skb)
+static bool batadv_bla_process_claim(struct batadv_priv *bat_priv,
+ struct batadv_hard_iface *primary_if,
+ struct sk_buff *skb)
{
struct batadv_bla_claim_dst *bla_dst, *bla_dst_own;
u8 *hw_src, *hw_dst;
@@ -1011,7 +1096,7 @@ static int batadv_bla_process_claim(struct batadv_priv *bat_priv,
vhdr = skb_header_pointer(skb, headlen, VLAN_HLEN,
&vhdr_buf);
if (!vhdr)
- return 0;
+ return false;
proto = vhdr->h_vlan_encapsulated_proto;
headlen += VLAN_HLEN;
@@ -1020,12 +1105,12 @@ static int batadv_bla_process_claim(struct batadv_priv *bat_priv,
}
if (proto != htons(ETH_P_ARP))
- return 0; /* not a claim frame */
+ return false; /* not a claim frame */
/* this must be a ARP frame. check if it is a claim. */
if (unlikely(!pskb_may_pull(skb, headlen + arp_hdr_len(skb->dev))))
- return 0;
+ return false;
/* pskb_may_pull() may have modified the pointers, get ethhdr again */
ethhdr = eth_hdr(skb);
@@ -1035,13 +1120,13 @@ static int batadv_bla_process_claim(struct batadv_priv *bat_priv,
* IP information
*/
if (arphdr->ar_hrd != htons(ARPHRD_ETHER))
- return 0;
+ return false;
if (arphdr->ar_pro != htons(ETH_P_IP))
- return 0;
+ return false;
if (arphdr->ar_hln != ETH_ALEN)
- return 0;
+ return false;
if (arphdr->ar_pln != 4)
- return 0;
+ return false;
hw_src = (u8 *)arphdr + sizeof(struct arphdr);
hw_dst = hw_src + ETH_ALEN + 4;
@@ -1051,14 +1136,18 @@ static int batadv_bla_process_claim(struct batadv_priv *bat_priv,
/* check if it is a claim frame in general */
if (memcmp(bla_dst->magic, bla_dst_own->magic,
sizeof(bla_dst->magic)) != 0)
- return 0;
+ return false;
/* check if there is a claim frame encapsulated deeper in (QinQ) and
* drop that, as this is not supported by BLA but should also not be
* sent via the mesh.
*/
if (vlan_depth > 1)
- return 1;
+ return true;
+
+ /* Let the loopdetect frames on the mesh in any case. */
+ if (bla_dst->type == BATADV_CLAIM_TYPE_LOOPDETECT)
+ return 0;
/* check if it is a claim frame. */
ret = batadv_check_claim_group(bat_priv, primary_if, hw_src, hw_dst,
@@ -1070,7 +1159,7 @@ static int batadv_bla_process_claim(struct batadv_priv *bat_priv,
hw_dst);
if (ret < 2)
- return ret;
+ return !!ret;
/* become a backbone gw ourselves on this vlan if not happened yet */
batadv_bla_update_own_backbone_gw(bat_priv, primary_if, vid);
@@ -1080,30 +1169,30 @@ static int batadv_bla_process_claim(struct batadv_priv *bat_priv,
case BATADV_CLAIM_TYPE_CLAIM:
if (batadv_handle_claim(bat_priv, primary_if, hw_src,
ethhdr->h_source, vid))
- return 1;
+ return true;
break;
case BATADV_CLAIM_TYPE_UNCLAIM:
if (batadv_handle_unclaim(bat_priv, primary_if,
ethhdr->h_source, hw_src, vid))
- return 1;
+ return true;
break;
case BATADV_CLAIM_TYPE_ANNOUNCE:
if (batadv_handle_announce(bat_priv, hw_src, ethhdr->h_source,
vid))
- return 1;
+ return true;
break;
case BATADV_CLAIM_TYPE_REQUEST:
if (batadv_handle_request(bat_priv, primary_if, hw_src, ethhdr,
vid))
- return 1;
+ return true;
break;
}
batadv_dbg(BATADV_DBG_BLA, bat_priv,
"bla_process_claim(): ERROR - this looks like a claim frame, but is useless. eth src %pM on vid %d ...(hw_src %pM, hw_dst %pM)\n",
ethhdr->h_source, BATADV_PRINT_VID(vid), hw_src, hw_dst);
- return 1;
+ return true;
}
/**
@@ -1172,6 +1261,7 @@ static void batadv_bla_purge_claims(struct batadv_priv *bat_priv,
struct batadv_hard_iface *primary_if,
int now)
{
+ struct batadv_bla_backbone_gw *backbone_gw;
struct batadv_bla_claim *claim;
struct hlist_head *head;
struct batadv_hashtable *hash;
@@ -1186,14 +1276,17 @@ static void batadv_bla_purge_claims(struct batadv_priv *bat_priv,
rcu_read_lock();
hlist_for_each_entry_rcu(claim, head, hash_entry) {
+ backbone_gw = batadv_bla_claim_get_backbone_gw(claim);
if (now)
goto purge_now;
- if (!batadv_compare_eth(claim->backbone_gw->orig,
+
+ if (!batadv_compare_eth(backbone_gw->orig,
primary_if->net_dev->dev_addr))
- continue;
+ goto skip;
+
if (!batadv_has_timed_out(claim->lasttime,
BATADV_BLA_CLAIM_TIMEOUT))
- continue;
+ goto skip;
batadv_dbg(BATADV_DBG_BLA, bat_priv,
"bla_purge_claims(): %pM, vid %d, time out\n",
@@ -1201,8 +1294,10 @@ static void batadv_bla_purge_claims(struct batadv_priv *bat_priv,
purge_now:
batadv_handle_unclaim(bat_priv, primary_if,
- claim->backbone_gw->orig,
+ backbone_gw->orig,
claim->addr, claim->vid);
+skip:
+ batadv_backbone_gw_put(backbone_gw);
}
rcu_read_unlock();
}
@@ -1265,6 +1360,26 @@ void batadv_bla_update_orig_address(struct batadv_priv *bat_priv,
}
/**
+ * batadv_bla_send_loopdetect - send a loopdetect frame
+ * @bat_priv: the bat priv with all the soft interface information
+ * @backbone_gw: the backbone gateway for which a loop should be detected
+ *
+ * To detect loops that the bridge loop avoidance can't handle, send a loop
+ * detection packet on the backbone. Unlike other BLA frames, this frame will
+ * be allowed on the mesh by other nodes. If it is received on the mesh, this
+ * indicates that there is a loop.
+ */
+static void
+batadv_bla_send_loopdetect(struct batadv_priv *bat_priv,
+ struct batadv_bla_backbone_gw *backbone_gw)
+{
+ batadv_dbg(BATADV_DBG_BLA, bat_priv, "Send loopdetect frame for vid %d\n",
+ backbone_gw->vid);
+ batadv_bla_send_claim(bat_priv, bat_priv->bla.loopdetect_addr,
+ backbone_gw->vid, BATADV_CLAIM_TYPE_LOOPDETECT);
+}
+
+/**
* batadv_bla_status_update - purge bla interfaces if necessary
* @net_dev: the soft interface net device
*/
@@ -1301,9 +1416,10 @@ static void batadv_bla_periodic_work(struct work_struct *work)
struct batadv_bla_backbone_gw *backbone_gw;
struct batadv_hashtable *hash;
struct batadv_hard_iface *primary_if;
+ bool send_loopdetect = false;
int i;
- delayed_work = container_of(work, struct delayed_work, work);
+ delayed_work = to_delayed_work(work);
priv_bla = container_of(delayed_work, struct batadv_priv_bla, work);
bat_priv = container_of(priv_bla, struct batadv_priv, bla);
primary_if = batadv_primary_if_get_selected(bat_priv);
@@ -1316,6 +1432,22 @@ static void batadv_bla_periodic_work(struct work_struct *work)
if (!atomic_read(&bat_priv->bridge_loop_avoidance))
goto out;
+ if (atomic_dec_and_test(&bat_priv->bla.loopdetect_next)) {
+ /* set a new random mac address for the next bridge loop
+ * detection frames. Set the locally administered bit to avoid
+ * collisions with users mac addresses.
+ */
+ random_ether_addr(bat_priv->bla.loopdetect_addr);
+ bat_priv->bla.loopdetect_addr[0] = 0xba;
+ bat_priv->bla.loopdetect_addr[1] = 0xbe;
+ bat_priv->bla.loopdetect_lasttime = jiffies;
+ atomic_set(&bat_priv->bla.loopdetect_next,
+ BATADV_BLA_LOOPDETECT_PERIODS);
+
+ /* mark for sending loop detect on all VLANs */
+ send_loopdetect = true;
+ }
+
hash = bat_priv->bla.backbone_hash;
if (!hash)
goto out;
@@ -1332,6 +1464,9 @@ static void batadv_bla_periodic_work(struct work_struct *work)
backbone_gw->lasttime = jiffies;
batadv_bla_send_announce(bat_priv, backbone_gw);
+ if (send_loopdetect)
+ batadv_bla_send_loopdetect(bat_priv,
+ backbone_gw);
/* request_sent is only set after creation to avoid
* problems when we are not yet known as backbone gw
@@ -1405,6 +1540,9 @@ int batadv_bla_init(struct batadv_priv *bat_priv)
bat_priv->bla.bcast_duplist[i].entrytime = entrytime;
bat_priv->bla.bcast_duplist_curr = 0;
+ atomic_set(&bat_priv->bla.loopdetect_next,
+ BATADV_BLA_LOOPDETECT_PERIODS);
+
if (bat_priv->bla.claim_hash)
return 0;
@@ -1442,15 +1580,16 @@ int batadv_bla_init(struct batadv_priv *bat_priv)
* sent by another host, drop it. We allow equal packets from
* the same host however as this might be intended.
*
- * Return: 1 if a packet is in the duplicate list, 0 otherwise.
+ * Return: true if a packet is in the duplicate list, false otherwise.
*/
-int batadv_bla_check_bcast_duplist(struct batadv_priv *bat_priv,
- struct sk_buff *skb)
+bool batadv_bla_check_bcast_duplist(struct batadv_priv *bat_priv,
+ struct sk_buff *skb)
{
- int i, curr, ret = 0;
+ int i, curr;
__be32 crc;
struct batadv_bcast_packet *bcast_packet;
struct batadv_bcast_duplist_entry *entry;
+ bool ret = false;
bcast_packet = (struct batadv_bcast_packet *)skb->data;
@@ -1478,9 +1617,9 @@ int batadv_bla_check_bcast_duplist(struct batadv_priv *bat_priv,
continue;
/* this entry seems to match: same crc, not too old,
- * and from another gw. therefore return 1 to forbid it.
+ * and from another gw. therefore return true to forbid it.
*/
- ret = 1;
+ ret = true;
goto out;
}
/* not found, add a new entry (overwrite the oldest entry)
@@ -1546,21 +1685,21 @@ bool batadv_bla_is_backbone_gw_orig(struct batadv_priv *bat_priv, u8 *orig,
* @orig_node: the orig_node of the frame
* @hdr_size: maximum length of the frame
*
- * Return: 1 if the orig_node is also a gateway on the soft interface, otherwise
- * it returns 0.
+ * Return: true if the orig_node is also a gateway on the soft interface,
+ * otherwise it returns false.
*/
-int batadv_bla_is_backbone_gw(struct sk_buff *skb,
- struct batadv_orig_node *orig_node, int hdr_size)
+bool batadv_bla_is_backbone_gw(struct sk_buff *skb,
+ struct batadv_orig_node *orig_node, int hdr_size)
{
struct batadv_bla_backbone_gw *backbone_gw;
unsigned short vid;
if (!atomic_read(&orig_node->bat_priv->bridge_loop_avoidance))
- return 0;
+ return false;
/* first, find out the vid. */
if (!pskb_may_pull(skb, hdr_size + ETH_HLEN))
- return 0;
+ return false;
vid = batadv_get_vid(skb, hdr_size);
@@ -1568,14 +1707,14 @@ int batadv_bla_is_backbone_gw(struct sk_buff *skb,
backbone_gw = batadv_backbone_hash_find(orig_node->bat_priv,
orig_node->orig, vid);
if (!backbone_gw)
- return 0;
+ return false;
batadv_backbone_gw_put(backbone_gw);
- return 1;
+ return true;
}
/**
- * batadv_bla_init - free all bla structures
+ * batadv_bla_free - free all bla structures
* @bat_priv: the bat priv with all the soft interface information
*
* for softinterface free or module unload
@@ -1602,6 +1741,55 @@ void batadv_bla_free(struct batadv_priv *bat_priv)
}
/**
+ * batadv_bla_loopdetect_check - check and handle a detected loop
+ * @bat_priv: the bat priv with all the soft interface information
+ * @skb: the packet to check
+ * @primary_if: interface where the request came on
+ * @vid: the VLAN ID of the frame
+ *
+ * Checks if this packet is a loop detect frame which has been sent by us,
+ * throw an uevent and log the event if that is the case.
+ *
+ * Return: true if it is a loop detect frame which is to be dropped, false
+ * otherwise.
+ */
+static bool
+batadv_bla_loopdetect_check(struct batadv_priv *bat_priv, struct sk_buff *skb,
+ struct batadv_hard_iface *primary_if,
+ unsigned short vid)
+{
+ struct batadv_bla_backbone_gw *backbone_gw;
+ struct ethhdr *ethhdr;
+
+ ethhdr = eth_hdr(skb);
+
+ /* Only check for the MAC address and skip more checks here for
+ * performance reasons - this function is on the hotpath, after all.
+ */
+ if (!batadv_compare_eth(ethhdr->h_source,
+ bat_priv->bla.loopdetect_addr))
+ return false;
+
+ /* If the packet came too late, don't forward it on the mesh
+ * but don't consider that as loop. It might be a coincidence.
+ */
+ if (batadv_has_timed_out(bat_priv->bla.loopdetect_lasttime,
+ BATADV_BLA_LOOPDETECT_TIMEOUT))
+ return true;
+
+ backbone_gw = batadv_bla_get_backbone_gw(bat_priv,
+ primary_if->net_dev->dev_addr,
+ vid, true);
+ if (unlikely(!backbone_gw))
+ return true;
+
+ queue_work(batadv_event_workqueue, &backbone_gw->report_work);
+ /* backbone_gw is unreferenced in the report work function function */
+
+ return true;
+}
+
+/**
* batadv_bla_rx - check packets coming from the mesh.
* @bat_priv: the bat priv with all the soft interface information
* @skb: the frame to be checked
@@ -1614,16 +1802,18 @@ void batadv_bla_free(struct batadv_priv *bat_priv)
*
* in these cases, the skb is further handled by this function
*
- * Return: 1 if handled, otherwise it returns 0 and the caller shall further
- * process the skb.
+ * Return: true if handled, otherwise it returns false and the caller shall
+ * further process the skb.
*/
-int batadv_bla_rx(struct batadv_priv *bat_priv, struct sk_buff *skb,
- unsigned short vid, bool is_bcast)
+bool batadv_bla_rx(struct batadv_priv *bat_priv, struct sk_buff *skb,
+ unsigned short vid, bool is_bcast)
{
+ struct batadv_bla_backbone_gw *backbone_gw;
struct ethhdr *ethhdr;
struct batadv_bla_claim search_claim, *claim = NULL;
struct batadv_hard_iface *primary_if;
- int ret;
+ bool own_claim;
+ bool ret;
ethhdr = eth_hdr(skb);
@@ -1634,6 +1824,9 @@ int batadv_bla_rx(struct batadv_priv *bat_priv, struct sk_buff *skb,
if (!atomic_read(&bat_priv->bridge_loop_avoidance))
goto allow;
+ if (batadv_bla_loopdetect_check(bat_priv, skb, primary_if, vid))
+ goto handled;
+
if (unlikely(atomic_read(&bat_priv->bla.num_requests)))
/* don't allow broadcasts while requests are in flight */
if (is_multicast_ether_addr(ethhdr->h_dest) && is_bcast)
@@ -1654,8 +1847,12 @@ int batadv_bla_rx(struct batadv_priv *bat_priv, struct sk_buff *skb,
}
/* if it is our own claim ... */
- if (batadv_compare_eth(claim->backbone_gw->orig,
- primary_if->net_dev->dev_addr)) {
+ backbone_gw = batadv_bla_claim_get_backbone_gw(claim);
+ own_claim = batadv_compare_eth(backbone_gw->orig,
+ primary_if->net_dev->dev_addr);
+ batadv_backbone_gw_put(backbone_gw);
+
+ if (own_claim) {
/* ... allow it in any case */
claim->lasttime = jiffies;
goto allow;
@@ -1682,12 +1879,12 @@ int batadv_bla_rx(struct batadv_priv *bat_priv, struct sk_buff *skb,
}
allow:
batadv_bla_update_own_backbone_gw(bat_priv, primary_if, vid);
- ret = 0;
+ ret = false;
goto out;
handled:
kfree_skb(skb);
- ret = 1;
+ ret = true;
out:
if (primary_if)
@@ -1711,16 +1908,18 @@ out:
*
* This call might reallocate skb data.
*
- * Return: 1 if handled, otherwise it returns 0 and the caller shall further
- * process the skb.
+ * Return: true if handled, otherwise it returns false and the caller shall
+ * further process the skb.
*/
-int batadv_bla_tx(struct batadv_priv *bat_priv, struct sk_buff *skb,
- unsigned short vid)
+bool batadv_bla_tx(struct batadv_priv *bat_priv, struct sk_buff *skb,
+ unsigned short vid)
{
struct ethhdr *ethhdr;
struct batadv_bla_claim search_claim, *claim = NULL;
+ struct batadv_bla_backbone_gw *backbone_gw;
struct batadv_hard_iface *primary_if;
- int ret = 0;
+ bool client_roamed;
+ bool ret = false;
primary_if = batadv_primary_if_get_selected(bat_priv);
if (!primary_if)
@@ -1749,8 +1948,12 @@ int batadv_bla_tx(struct batadv_priv *bat_priv, struct sk_buff *skb,
goto allow;
/* check if we are responsible. */
- if (batadv_compare_eth(claim->backbone_gw->orig,
- primary_if->net_dev->dev_addr)) {
+ backbone_gw = batadv_bla_claim_get_backbone_gw(claim);
+ client_roamed = batadv_compare_eth(backbone_gw->orig,
+ primary_if->net_dev->dev_addr);
+ batadv_backbone_gw_put(backbone_gw);
+
+ if (client_roamed) {
/* if yes, the client has roamed and we have
* to unclaim it.
*/
@@ -1774,10 +1977,10 @@ int batadv_bla_tx(struct batadv_priv *bat_priv, struct sk_buff *skb,
}
allow:
batadv_bla_update_own_backbone_gw(bat_priv, primary_if, vid);
- ret = 0;
+ ret = false;
goto out;
handled:
- ret = 1;
+ ret = true;
out:
if (primary_if)
batadv_hardif_put(primary_if);
@@ -1798,6 +2001,7 @@ int batadv_bla_claim_table_seq_print_text(struct seq_file *seq, void *offset)
struct net_device *net_dev = (struct net_device *)seq->private;
struct batadv_priv *bat_priv = netdev_priv(net_dev);
struct batadv_hashtable *hash = bat_priv->bla.claim_hash;
+ struct batadv_bla_backbone_gw *backbone_gw;
struct batadv_bla_claim *claim;
struct batadv_hard_iface *primary_if;
struct hlist_head *head;
@@ -1815,24 +2019,28 @@ int batadv_bla_claim_table_seq_print_text(struct seq_file *seq, void *offset)
"Claims announced for the mesh %s (orig %pM, group id %#.4x)\n",
net_dev->name, primary_addr,
ntohs(bat_priv->bla.claim_dest.group));
- seq_printf(seq, " %-17s %-5s %-17s [o] (%-6s)\n",
- "Client", "VID", "Originator", "CRC");
+ seq_puts(seq,
+ " Client VID Originator [o] (CRC )\n");
for (i = 0; i < hash->size; i++) {
head = &hash->table[i];
rcu_read_lock();
hlist_for_each_entry_rcu(claim, head, hash_entry) {
- is_own = batadv_compare_eth(claim->backbone_gw->orig,
+ backbone_gw = batadv_bla_claim_get_backbone_gw(claim);
+
+ is_own = batadv_compare_eth(backbone_gw->orig,
primary_addr);
- spin_lock_bh(&claim->backbone_gw->crc_lock);
- backbone_crc = claim->backbone_gw->crc;
- spin_unlock_bh(&claim->backbone_gw->crc_lock);
+ spin_lock_bh(&backbone_gw->crc_lock);
+ backbone_crc = backbone_gw->crc;
+ spin_unlock_bh(&backbone_gw->crc_lock);
seq_printf(seq, " * %pM on %5d by %pM [%c] (%#.4x)\n",
claim->addr, BATADV_PRINT_VID(claim->vid),
- claim->backbone_gw->orig,
+ backbone_gw->orig,
(is_own ? 'x' : ' '),
backbone_crc);
+
+ batadv_backbone_gw_put(backbone_gw);
}
rcu_read_unlock();
}
@@ -1873,8 +2081,7 @@ int batadv_bla_backbone_table_seq_print_text(struct seq_file *seq, void *offset)
"Backbones announced for the mesh %s (orig %pM, group id %#.4x)\n",
net_dev->name, primary_addr,
ntohs(bat_priv->bla.claim_dest.group));
- seq_printf(seq, " %-17s %-5s %-9s (%-6s)\n",
- "Originator", "VID", "last seen", "CRC");
+ seq_puts(seq, " Originator VID last seen (CRC )\n");
for (i = 0; i < hash->size; i++) {
head = &hash->table[i];
diff --git a/net/batman-adv/bridge_loop_avoidance.h b/net/batman-adv/bridge_loop_avoidance.h
index 579f0fa6f..0f01daeb3 100644
--- a/net/batman-adv/bridge_loop_avoidance.h
+++ b/net/batman-adv/bridge_loop_avoidance.h
@@ -27,19 +27,20 @@ struct seq_file;
struct sk_buff;
#ifdef CONFIG_BATMAN_ADV_BLA
-int batadv_bla_rx(struct batadv_priv *bat_priv, struct sk_buff *skb,
- unsigned short vid, bool is_bcast);
-int batadv_bla_tx(struct batadv_priv *bat_priv, struct sk_buff *skb,
- unsigned short vid);
-int batadv_bla_is_backbone_gw(struct sk_buff *skb,
- struct batadv_orig_node *orig_node, int hdr_size);
+bool batadv_bla_rx(struct batadv_priv *bat_priv, struct sk_buff *skb,
+ unsigned short vid, bool is_bcast);
+bool batadv_bla_tx(struct batadv_priv *bat_priv, struct sk_buff *skb,
+ unsigned short vid);
+bool batadv_bla_is_backbone_gw(struct sk_buff *skb,
+ struct batadv_orig_node *orig_node,
+ int hdr_size);
int batadv_bla_claim_table_seq_print_text(struct seq_file *seq, void *offset);
int batadv_bla_backbone_table_seq_print_text(struct seq_file *seq,
void *offset);
bool batadv_bla_is_backbone_gw_orig(struct batadv_priv *bat_priv, u8 *orig,
unsigned short vid);
-int batadv_bla_check_bcast_duplist(struct batadv_priv *bat_priv,
- struct sk_buff *skb);
+bool batadv_bla_check_bcast_duplist(struct batadv_priv *bat_priv,
+ struct sk_buff *skb);
void batadv_bla_update_orig_address(struct batadv_priv *bat_priv,
struct batadv_hard_iface *primary_if,
struct batadv_hard_iface *oldif);
@@ -50,24 +51,24 @@ void batadv_bla_free(struct batadv_priv *bat_priv);
#define BATADV_BLA_CRC_INIT 0
#else /* ifdef CONFIG_BATMAN_ADV_BLA */
-static inline int batadv_bla_rx(struct batadv_priv *bat_priv,
- struct sk_buff *skb, unsigned short vid,
- bool is_bcast)
+static inline bool batadv_bla_rx(struct batadv_priv *bat_priv,
+ struct sk_buff *skb, unsigned short vid,
+ bool is_bcast)
{
- return 0;
+ return false;
}
-static inline int batadv_bla_tx(struct batadv_priv *bat_priv,
- struct sk_buff *skb, unsigned short vid)
+static inline bool batadv_bla_tx(struct batadv_priv *bat_priv,
+ struct sk_buff *skb, unsigned short vid)
{
- return 0;
+ return false;
}
-static inline int batadv_bla_is_backbone_gw(struct sk_buff *skb,
- struct batadv_orig_node *orig_node,
- int hdr_size)
+static inline bool batadv_bla_is_backbone_gw(struct sk_buff *skb,
+ struct batadv_orig_node *orig_node,
+ int hdr_size)
{
- return 0;
+ return false;
}
static inline int batadv_bla_claim_table_seq_print_text(struct seq_file *seq,
@@ -88,11 +89,11 @@ static inline bool batadv_bla_is_backbone_gw_orig(struct batadv_priv *bat_priv,
return false;
}
-static inline int
+static inline bool
batadv_bla_check_bcast_duplist(struct batadv_priv *bat_priv,
struct sk_buff *skb)
{
- return 0;
+ return false;
}
static inline void
diff --git a/net/batman-adv/debugfs.c b/net/batman-adv/debugfs.c
index 48253cf83..952900466 100644
--- a/net/batman-adv/debugfs.c
+++ b/net/batman-adv/debugfs.c
@@ -134,7 +134,7 @@ static int batadv_log_release(struct inode *inode, struct file *file)
return 0;
}
-static int batadv_log_empty(struct batadv_priv_debug_log *debug_log)
+static bool batadv_log_empty(struct batadv_priv_debug_log *debug_log)
{
return !(debug_log->log_start - debug_log->log_end);
}
@@ -365,14 +365,17 @@ static int batadv_nc_nodes_open(struct inode *inode, struct file *file)
#define BATADV_DEBUGINFO(_name, _mode, _open) \
struct batadv_debuginfo batadv_debuginfo_##_name = { \
- .attr = { .name = __stringify(_name), \
- .mode = _mode, }, \
- .fops = { .owner = THIS_MODULE, \
- .open = _open, \
- .read = seq_read, \
- .llseek = seq_lseek, \
- .release = single_release, \
- } \
+ .attr = { \
+ .name = __stringify(_name), \
+ .mode = _mode, \
+ }, \
+ .fops = { \
+ .owner = THIS_MODULE, \
+ .open = _open, \
+ .read = seq_read, \
+ .llseek = seq_lseek, \
+ .release = single_release, \
+ }, \
}
/* the following attributes are general and therefore they will be directly
diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c
index 3e6b2624f..aee3b3991 100644
--- a/net/batman-adv/distributed-arp-table.c
+++ b/net/batman-adv/distributed-arp-table.c
@@ -152,7 +152,7 @@ static void batadv_dat_purge(struct work_struct *work)
struct batadv_priv_dat *priv_dat;
struct batadv_priv *bat_priv;
- delayed_work = container_of(work, struct delayed_work, work);
+ delayed_work = to_delayed_work(work);
priv_dat = container_of(delayed_work, struct batadv_priv_dat, work);
bat_priv = container_of(priv_dat, struct batadv_priv, dat);
@@ -165,14 +165,14 @@ static void batadv_dat_purge(struct work_struct *work)
* @node: node in the local table
* @data2: second object to compare the node to
*
- * Return: 1 if the two entries are the same, 0 otherwise.
+ * Return: true if the two entries are the same, false otherwise.
*/
-static int batadv_compare_dat(const struct hlist_node *node, const void *data2)
+static bool batadv_compare_dat(const struct hlist_node *node, const void *data2)
{
const void *data1 = container_of(node, struct batadv_dat_entry,
hash_entry);
- return memcmp(data1, data2, sizeof(__be32)) == 0 ? 1 : 0;
+ return memcmp(data1, data2, sizeof(__be32)) == 0;
}
/**
@@ -720,7 +720,7 @@ void batadv_dat_status_update(struct net_device *net_dev)
}
/**
- * batadv_gw_tvlv_ogm_handler_v1 - process incoming dat tvlv container
+ * batadv_dat_tvlv_ogm_handler_v1 - process incoming dat tvlv container
* @bat_priv: the bat priv with all the soft interface information
* @orig: the orig_node of the ogm
* @flags: flags indicating the tvlv state (see batadv_tvlv_handler_flags)
@@ -817,8 +817,8 @@ int batadv_dat_cache_seq_print_text(struct seq_file *seq, void *offset)
goto out;
seq_printf(seq, "Distributed ARP Table (%s):\n", net_dev->name);
- seq_printf(seq, " %-7s %-9s %4s %11s\n", "IPv4",
- "MAC", "VID", "last-seen");
+ seq_puts(seq,
+ " IPv4 MAC VID last-seen\n");
for (i = 0; i < hash->size; i++) {
head = &hash->table[i];
@@ -1009,9 +1009,12 @@ bool batadv_dat_snoop_outgoing_arp_request(struct batadv_priv *bat_priv,
if (!skb_new)
goto out;
- if (vid & BATADV_VLAN_HAS_TAG)
+ if (vid & BATADV_VLAN_HAS_TAG) {
skb_new = vlan_insert_tag(skb_new, htons(ETH_P_8021Q),
vid & VLAN_VID_MASK);
+ if (!skb_new)
+ goto out;
+ }
skb_reset_mac_header(skb_new);
skb_new->protocol = eth_type_trans(skb_new,
@@ -1089,9 +1092,12 @@ bool batadv_dat_snoop_incoming_arp_request(struct batadv_priv *bat_priv,
*/
skb_reset_mac_header(skb_new);
- if (vid & BATADV_VLAN_HAS_TAG)
+ if (vid & BATADV_VLAN_HAS_TAG) {
skb_new = vlan_insert_tag(skb_new, htons(ETH_P_8021Q),
vid & VLAN_VID_MASK);
+ if (!skb_new)
+ goto out;
+ }
/* To preserve backwards compatibility, the node has choose the outgoing
* format based on the incoming request packet type. The assumption is
diff --git a/net/batman-adv/fragmentation.c b/net/batman-adv/fragmentation.c
index e6956d074..65536db1b 100644
--- a/net/batman-adv/fragmentation.c
+++ b/net/batman-adv/fragmentation.c
@@ -407,8 +407,8 @@ static struct sk_buff *batadv_frag_create(struct sk_buff *skb,
unsigned int mtu)
{
struct sk_buff *skb_fragment;
- unsigned header_size = sizeof(*frag_head);
- unsigned fragment_size = mtu - header_size;
+ unsigned int header_size = sizeof(*frag_head);
+ unsigned int fragment_size = mtu - header_size;
skb_fragment = netdev_alloc_skb(NULL, mtu + ETH_HLEN);
if (!skb_fragment)
@@ -444,15 +444,15 @@ bool batadv_frag_send_packet(struct sk_buff *skb,
struct batadv_hard_iface *primary_if = NULL;
struct batadv_frag_packet frag_header;
struct sk_buff *skb_fragment;
- unsigned mtu = neigh_node->if_incoming->net_dev->mtu;
- unsigned header_size = sizeof(frag_header);
- unsigned max_fragment_size, max_packet_size;
+ unsigned int mtu = neigh_node->if_incoming->net_dev->mtu;
+ unsigned int header_size = sizeof(frag_header);
+ unsigned int max_fragment_size, max_packet_size;
bool ret = false;
/* To avoid merge and refragmentation at next-hops we never send
* fragments larger than BATADV_FRAG_MAX_FRAG_SIZE
*/
- mtu = min_t(unsigned, mtu, BATADV_FRAG_MAX_FRAG_SIZE);
+ mtu = min_t(unsigned int, mtu, BATADV_FRAG_MAX_FRAG_SIZE);
max_fragment_size = mtu - header_size;
max_packet_size = max_fragment_size * BATADV_FRAG_MAX_FRAGMENTS;
diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c
index c59aff5cc..5839c569f 100644
--- a/net/batman-adv/gateway_client.c
+++ b/net/batman-adv/gateway_client.c
@@ -135,8 +135,8 @@ static void batadv_gw_select(struct batadv_priv *bat_priv,
spin_lock_bh(&bat_priv->gw.list_lock);
- if (new_gw_node && !kref_get_unless_zero(&new_gw_node->refcount))
- new_gw_node = NULL;
+ if (new_gw_node)
+ kref_get(&new_gw_node->refcount);
curr_gw_node = rcu_dereference_protected(bat_priv->gw.curr_gw, 1);
rcu_assign_pointer(bat_priv->gw.curr_gw, new_gw_node);
@@ -440,15 +440,11 @@ static void batadv_gw_node_add(struct batadv_priv *bat_priv,
if (gateway->bandwidth_down == 0)
return;
- if (!kref_get_unless_zero(&orig_node->refcount))
- return;
-
gw_node = kzalloc(sizeof(*gw_node), GFP_ATOMIC);
- if (!gw_node) {
- batadv_orig_node_put(orig_node);
+ if (!gw_node)
return;
- }
+ kref_get(&orig_node->refcount);
INIT_HLIST_NODE(&gw_node->list);
gw_node->orig_node = orig_node;
gw_node->bandwidth_down = ntohl(gateway->bandwidth_down);
diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c
index 0a7deaf26..8c2f39962 100644
--- a/net/batman-adv/hard-interface.c
+++ b/net/batman-adv/hard-interface.c
@@ -36,7 +36,6 @@
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/workqueue.h>
-#include <net/net_namespace.h>
#include "bridge_loop_avoidance.h"
#include "debugfs.h"
@@ -121,6 +120,7 @@ static bool batadv_mutual_parents(const struct net_device *dev1,
static bool batadv_is_on_batman_iface(const struct net_device *net_dev)
{
struct net_device *parent_dev;
+ struct net *net = dev_net(net_dev);
bool ret;
/* check if this is a batman-adv mesh interface */
@@ -133,7 +133,7 @@ static bool batadv_is_on_batman_iface(const struct net_device *net_dev)
return false;
/* recurse over the parent device */
- parent_dev = __dev_get_by_index(&init_net, dev_get_iflink(net_dev));
+ parent_dev = __dev_get_by_index(net, dev_get_iflink(net_dev));
/* if we got a NULL parent_dev there is something broken.. */
if (WARN(!parent_dev, "Cannot find parent device"))
return false;
@@ -146,22 +146,22 @@ static bool batadv_is_on_batman_iface(const struct net_device *net_dev)
return ret;
}
-static int batadv_is_valid_iface(const struct net_device *net_dev)
+static bool batadv_is_valid_iface(const struct net_device *net_dev)
{
if (net_dev->flags & IFF_LOOPBACK)
- return 0;
+ return false;
if (net_dev->type != ARPHRD_ETHER)
- return 0;
+ return false;
if (net_dev->addr_len != ETH_ALEN)
- return 0;
+ return false;
/* no batman over batman */
if (batadv_is_on_batman_iface(net_dev))
- return 0;
+ return false;
- return 1;
+ return true;
}
/**
@@ -236,8 +236,8 @@ static void batadv_primary_if_select(struct batadv_priv *bat_priv,
ASSERT_RTNL();
- if (new_hard_iface && !kref_get_unless_zero(&new_hard_iface->refcount))
- new_hard_iface = NULL;
+ if (new_hard_iface)
+ kref_get(&new_hard_iface->refcount);
curr_hard_iface = rcu_dereference_protected(bat_priv->primary_if, 1);
rcu_assign_pointer(bat_priv->primary_if, new_hard_iface);
@@ -456,7 +456,7 @@ static int batadv_master_del_slave(struct batadv_hard_iface *slave,
}
int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface,
- const char *iface_name)
+ struct net *net, const char *iface_name)
{
struct batadv_priv *bat_priv;
struct net_device *soft_iface, *master;
@@ -467,13 +467,12 @@ int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface,
if (hard_iface->if_status != BATADV_IF_NOT_IN_USE)
goto out;
- if (!kref_get_unless_zero(&hard_iface->refcount))
- goto out;
+ kref_get(&hard_iface->refcount);
- soft_iface = dev_get_by_name(&init_net, iface_name);
+ soft_iface = dev_get_by_name(net, iface_name);
if (!soft_iface) {
- soft_iface = batadv_softif_create(iface_name);
+ soft_iface = batadv_softif_create(net, iface_name);
if (!soft_iface) {
ret = -ENOMEM;
@@ -522,6 +521,7 @@ int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface,
goto err_upper;
}
+ kref_get(&hard_iface->refcount);
hard_iface->batman_adv_ptype.type = ethertype;
hard_iface->batman_adv_ptype.func = batadv_batman_skb_recv;
hard_iface->batman_adv_ptype.dev = hard_iface->net_dev;
@@ -583,6 +583,7 @@ void batadv_hardif_disable_interface(struct batadv_hard_iface *hard_iface,
batadv_info(hard_iface->soft_iface, "Removing interface: %s\n",
hard_iface->net_dev->name);
dev_remove_pack(&hard_iface->batman_adv_ptype);
+ batadv_hardif_put(hard_iface);
bat_priv->num_ifaces--;
batadv_orig_hash_del_if(hard_iface, bat_priv->num_ifaces);
@@ -652,8 +653,7 @@ batadv_hardif_add_interface(struct net_device *net_dev)
ASSERT_RTNL();
- ret = batadv_is_valid_iface(net_dev);
- if (ret != 1)
+ if (!batadv_is_valid_iface(net_dev))
goto out;
dev_hold(net_dev);
diff --git a/net/batman-adv/hard-interface.h b/net/batman-adv/hard-interface.h
index d74f1983f..a76724d36 100644
--- a/net/batman-adv/hard-interface.h
+++ b/net/batman-adv/hard-interface.h
@@ -28,6 +28,7 @@
#include <linux/types.h>
struct net_device;
+struct net;
enum batadv_hard_if_state {
BATADV_IF_NOT_IN_USE,
@@ -55,7 +56,7 @@ bool batadv_is_wifi_iface(int ifindex);
struct batadv_hard_iface*
batadv_hardif_get_by_netdev(const struct net_device *net_dev);
int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface,
- const char *iface_name);
+ struct net *net, const char *iface_name);
void batadv_hardif_disable_interface(struct batadv_hard_iface *hard_iface,
enum batadv_hard_if_cleanup autodel);
void batadv_hardif_remove_interfaces(void);
diff --git a/net/batman-adv/hash.h b/net/batman-adv/hash.h
index 9bb57b874..cbbf87075 100644
--- a/net/batman-adv/hash.h
+++ b/net/batman-adv/hash.h
@@ -32,10 +32,10 @@ struct lock_class_key;
/* callback to a compare function. should compare 2 element datas for their
* keys
*
- * Return: 0 if same and not 0 if not same
+ * Return: true if same and false if not same
*/
-typedef int (*batadv_hashdata_compare_cb)(const struct hlist_node *,
- const void *);
+typedef bool (*batadv_hashdata_compare_cb)(const struct hlist_node *,
+ const void *);
/* the hashfunction
*
diff --git a/net/batman-adv/icmp_socket.c b/net/batman-adv/icmp_socket.c
index 14d0013b3..777aea10c 100644
--- a/net/batman-adv/icmp_socket.c
+++ b/net/batman-adv/icmp_socket.c
@@ -104,25 +104,21 @@ static int batadv_socket_open(struct inode *inode, struct file *file)
static int batadv_socket_release(struct inode *inode, struct file *file)
{
- struct batadv_socket_client *socket_client = file->private_data;
- struct batadv_socket_packet *socket_packet;
- struct list_head *list_pos, *list_pos_tmp;
+ struct batadv_socket_client *client = file->private_data;
+ struct batadv_socket_packet *packet, *tmp;
- spin_lock_bh(&socket_client->lock);
+ spin_lock_bh(&client->lock);
/* for all packets in the queue ... */
- list_for_each_safe(list_pos, list_pos_tmp, &socket_client->queue_list) {
- socket_packet = list_entry(list_pos,
- struct batadv_socket_packet, list);
-
- list_del(list_pos);
- kfree(socket_packet);
+ list_for_each_entry_safe(packet, tmp, &client->queue_list, list) {
+ list_del(&packet->list);
+ kfree(packet);
}
- batadv_socket_client_hash[socket_client->index] = NULL;
- spin_unlock_bh(&socket_client->lock);
+ batadv_socket_client_hash[client->index] = NULL;
+ spin_unlock_bh(&client->lock);
- kfree(socket_client);
+ kfree(client);
module_put(THIS_MODULE);
return 0;
@@ -337,7 +333,7 @@ err:
}
/**
- * batadv_socket_receive_packet - schedule an icmp packet to be sent to
+ * batadv_socket_add_packet - schedule an icmp packet to be sent to
* userspace on an icmp socket.
* @socket_client: the socket this packet belongs to
* @icmph: pointer to the header of the icmp packet
diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c
index d64ddb961..5f2974bd1 100644
--- a/net/batman-adv/main.c
+++ b/net/batman-adv/main.c
@@ -401,11 +401,19 @@ int batadv_batman_skb_recv(struct sk_buff *skb, struct net_device *dev,
hard_iface = container_of(ptype, struct batadv_hard_iface,
batman_adv_ptype);
+
+ /* Prevent processing a packet received on an interface which is getting
+ * shut down otherwise the packet may trigger de-reference errors
+ * further down in the receive path.
+ */
+ if (!kref_get_unless_zero(&hard_iface->refcount))
+ goto err_out;
+
skb = skb_share_check(skb, GFP_ATOMIC);
/* skb was released by skb_share_check() */
if (!skb)
- goto err_out;
+ goto err_put;
/* packet should hold at least type and version */
if (unlikely(!pskb_may_pull(skb, 2)))
@@ -448,6 +456,8 @@ int batadv_batman_skb_recv(struct sk_buff *skb, struct net_device *dev,
if (ret == NET_RX_DROP)
kfree_skb(skb);
+ batadv_hardif_put(hard_iface);
+
/* return NET_RX_SUCCESS in any case as we
* most probably dropped the packet for
* routing-logical reasons.
@@ -456,6 +466,8 @@ int batadv_batman_skb_recv(struct sk_buff *skb, struct net_device *dev,
err_free:
kfree_skb(skb);
+err_put:
+ batadv_hardif_put(hard_iface);
err_out:
return NET_RX_DROP;
}
@@ -663,8 +675,8 @@ static void batadv_tvlv_handler_put(struct batadv_tvlv_handler *tvlv_handler)
*
* Return: tvlv handler if found or NULL otherwise.
*/
-static struct batadv_tvlv_handler
-*batadv_tvlv_handler_get(struct batadv_priv *bat_priv, u8 type, u8 version)
+static struct batadv_tvlv_handler *
+batadv_tvlv_handler_get(struct batadv_priv *bat_priv, u8 type, u8 version)
{
struct batadv_tvlv_handler *tvlv_handler_tmp, *tvlv_handler = NULL;
@@ -722,8 +734,8 @@ static void batadv_tvlv_container_put(struct batadv_tvlv_container *tvlv)
*
* Return: tvlv container if found or NULL otherwise.
*/
-static struct batadv_tvlv_container
-*batadv_tvlv_container_get(struct batadv_priv *bat_priv, u8 type, u8 version)
+static struct batadv_tvlv_container *
+batadv_tvlv_container_get(struct batadv_priv *bat_priv, u8 type, u8 version)
{
struct batadv_tvlv_container *tvlv_tmp, *tvlv = NULL;
@@ -736,9 +748,7 @@ static struct batadv_tvlv_container
if (tvlv_tmp->tvlv_hdr.version != version)
continue;
- if (!kref_get_unless_zero(&tvlv_tmp->refcount))
- continue;
-
+ kref_get(&tvlv_tmp->refcount);
tvlv = tvlv_tmp;
break;
}
diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h
index db4533631..76925266d 100644
--- a/net/batman-adv/main.h
+++ b/net/batman-adv/main.h
@@ -24,7 +24,7 @@
#define BATADV_DRIVER_DEVICE "batman-adv"
#ifndef BATADV_SOURCE_VERSION
-#define BATADV_SOURCE_VERSION "2016.1"
+#define BATADV_SOURCE_VERSION "2016.2"
#endif
/* B.A.T.M.A.N. parameters */
@@ -120,6 +120,8 @@
#define BATADV_BLA_BACKBONE_TIMEOUT (BATADV_BLA_PERIOD_LENGTH * 6)
#define BATADV_BLA_CLAIM_TIMEOUT (BATADV_BLA_PERIOD_LENGTH * 10)
#define BATADV_BLA_WAIT_PERIODS 3
+#define BATADV_BLA_LOOPDETECT_PERIODS 6
+#define BATADV_BLA_LOOPDETECT_TIMEOUT 3000 /* 3 seconds */
#define BATADV_DUPLIST_SIZE 16
#define BATADV_DUPLIST_TIMEOUT 500 /* 500 ms */
@@ -142,10 +144,12 @@ enum batadv_uev_action {
BATADV_UEV_ADD = 0,
BATADV_UEV_DEL,
BATADV_UEV_CHANGE,
+ BATADV_UEV_LOOPDETECT,
};
enum batadv_uev_type {
BATADV_UEV_GW = 0,
+ BATADV_UEV_BLA,
};
#define BATADV_GW_THRESHOLD 50
@@ -288,7 +292,7 @@ static inline void _batadv_dbg(int type __always_unused,
*
* note: can't use ether_addr_equal() as it requires aligned memory
*
- * Return: 1 if they are the same ethernet addr
+ * Return: true if they are the same ethernet addr
*/
static inline bool batadv_compare_eth(const void *data1, const void *data2)
{
@@ -296,7 +300,8 @@ static inline bool batadv_compare_eth(const void *data1, const void *data2)
}
/**
- * has_timed_out - compares current time (jiffies) and timestamp + timeout
+ * batadv_has_timed_out - compares current time (jiffies) and timestamp +
+ * timeout
* @timestamp: base value to compare with (in jiffies)
* @timeout: added to base value before comparing (in milliseconds)
*
diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c
index 8caa2c72e..c32f24faf 100644
--- a/net/batman-adv/multicast.c
+++ b/net/batman-adv/multicast.c
@@ -394,7 +394,8 @@ static int batadv_mcast_forw_mode_check(struct batadv_priv *bat_priv,
}
/**
- * batadv_mcast_want_all_ip_count - count nodes with unspecific mcast interest
+ * batadv_mcast_forw_want_all_ip_count - count nodes with unspecific mcast
+ * interest
* @bat_priv: the bat priv with all the soft interface information
* @ethhdr: ethernet header of a packet
*
@@ -433,7 +434,7 @@ batadv_mcast_forw_tt_node_get(struct batadv_priv *bat_priv,
}
/**
- * batadv_mcast_want_forw_ipv4_node_get - get a node with an ipv4 flag
+ * batadv_mcast_forw_ipv4_node_get - get a node with an ipv4 flag
* @bat_priv: the bat priv with all the soft interface information
*
* Return: an orig_node which has the BATADV_MCAST_WANT_ALL_IPV4 flag set and
@@ -460,7 +461,7 @@ batadv_mcast_forw_ipv4_node_get(struct batadv_priv *bat_priv)
}
/**
- * batadv_mcast_want_forw_ipv6_node_get - get a node with an ipv6 flag
+ * batadv_mcast_forw_ipv6_node_get - get a node with an ipv6 flag
* @bat_priv: the bat priv with all the soft interface information
*
* Return: an orig_node which has the BATADV_MCAST_WANT_ALL_IPV6 flag set
@@ -487,7 +488,7 @@ batadv_mcast_forw_ipv6_node_get(struct batadv_priv *bat_priv)
}
/**
- * batadv_mcast_want_forw_ip_node_get - get a node with an ipv4/ipv6 flag
+ * batadv_mcast_forw_ip_node_get - get a node with an ipv4/ipv6 flag
* @bat_priv: the bat priv with all the soft interface information
* @ethhdr: an ethernet header to determine the protocol family from
*
@@ -511,7 +512,7 @@ batadv_mcast_forw_ip_node_get(struct batadv_priv *bat_priv,
}
/**
- * batadv_mcast_want_forw_unsnoop_node_get - get a node with an unsnoopable flag
+ * batadv_mcast_forw_unsnoop_node_get - get a node with an unsnoopable flag
* @bat_priv: the bat priv with all the soft interface information
*
* Return: an orig_node which has the BATADV_MCAST_WANT_ALL_UNSNOOPABLES flag
diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c
index b41719b64..678f06865 100644
--- a/net/batman-adv/network-coding.c
+++ b/net/batman-adv/network-coding.c
@@ -510,10 +510,10 @@ static u32 batadv_nc_hash_choose(const void *data, u32 size)
* @node: node in the local table
* @data2: second object to compare the node to
*
- * Return: 1 if the two entry are the same, 0 otherwise
+ * Return: true if the two entry are the same, false otherwise
*/
-static int batadv_nc_hash_compare(const struct hlist_node *node,
- const void *data2)
+static bool batadv_nc_hash_compare(const struct hlist_node *node,
+ const void *data2)
{
const struct batadv_nc_path *nc_path1, *nc_path2;
@@ -521,15 +521,13 @@ static int batadv_nc_hash_compare(const struct hlist_node *node,
nc_path2 = data2;
/* Return 1 if the two keys are identical */
- if (memcmp(nc_path1->prev_hop, nc_path2->prev_hop,
- sizeof(nc_path1->prev_hop)) != 0)
- return 0;
+ if (!batadv_compare_eth(nc_path1->prev_hop, nc_path2->prev_hop))
+ return false;
- if (memcmp(nc_path1->next_hop, nc_path2->next_hop,
- sizeof(nc_path1->next_hop)) != 0)
- return 0;
+ if (!batadv_compare_eth(nc_path1->next_hop, nc_path2->next_hop))
+ return false;
- return 1;
+ return true;
}
/**
@@ -714,7 +712,7 @@ static void batadv_nc_worker(struct work_struct *work)
struct batadv_priv *bat_priv;
unsigned long timeout;
- delayed_work = container_of(work, struct delayed_work, work);
+ delayed_work = to_delayed_work(work);
priv_nc = container_of(delayed_work, struct batadv_priv_nc, work);
bat_priv = container_of(priv_nc, struct batadv_priv, nc);
@@ -793,10 +791,10 @@ static bool batadv_can_nc_with_orig(struct batadv_priv *bat_priv,
*
* Return: the nc_node if found, NULL otherwise.
*/
-static struct batadv_nc_node
-*batadv_nc_find_nc_node(struct batadv_orig_node *orig_node,
- struct batadv_orig_node *orig_neigh_node,
- bool in_coding)
+static struct batadv_nc_node *
+batadv_nc_find_nc_node(struct batadv_orig_node *orig_node,
+ struct batadv_orig_node *orig_neigh_node,
+ bool in_coding)
{
struct batadv_nc_node *nc_node, *nc_node_out = NULL;
struct list_head *list;
@@ -835,11 +833,11 @@ static struct batadv_nc_node
*
* Return: the nc_node if found or created, NULL in case of an error.
*/
-static struct batadv_nc_node
-*batadv_nc_get_nc_node(struct batadv_priv *bat_priv,
- struct batadv_orig_node *orig_node,
- struct batadv_orig_node *orig_neigh_node,
- bool in_coding)
+static struct batadv_nc_node *
+batadv_nc_get_nc_node(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *orig_node,
+ struct batadv_orig_node *orig_neigh_node,
+ bool in_coding)
{
struct batadv_nc_node *nc_node;
spinlock_t *lock; /* Used to lock list selected by "int in_coding" */
@@ -856,8 +854,7 @@ static struct batadv_nc_node
if (!nc_node)
return NULL;
- if (!kref_get_unless_zero(&orig_neigh_node->refcount))
- goto free;
+ kref_get(&orig_neigh_node->refcount);
/* Initialize nc_node */
INIT_LIST_HEAD(&nc_node->list);
@@ -884,10 +881,6 @@ static struct batadv_nc_node
spin_unlock_bh(lock);
return nc_node;
-
-free:
- kfree(nc_node);
- return NULL;
}
/**
diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c
index c355a8247..ab8c4f973 100644
--- a/net/batman-adv/originator.c
+++ b/net/batman-adv/originator.c
@@ -54,9 +54,9 @@ static void batadv_purge_orig(struct work_struct *work);
* @node: node in the local table
* @data2: second object to compare the node to
*
- * Return: 1 if they are the same originator
+ * Return: true if they are the same originator
*/
-int batadv_compare_orig(const struct hlist_node *node, const void *data2)
+bool batadv_compare_orig(const struct hlist_node *node, const void *data2)
{
const void *data1 = container_of(node, struct batadv_orig_node,
hash_entry);
@@ -282,7 +282,7 @@ void batadv_neigh_node_put(struct batadv_neigh_node *neigh_node)
}
/**
- * batadv_orig_node_get_router - router to the originator depending on iface
+ * batadv_orig_router_get - router to the originator depending on iface
* @orig_node: the orig node for the router
* @if_outgoing: the interface where the payload packet has been received or
* the OGM should be sent to
@@ -374,12 +374,8 @@ batadv_orig_ifinfo_new(struct batadv_orig_node *orig_node,
if (!orig_ifinfo)
goto out;
- if (if_outgoing != BATADV_IF_DEFAULT &&
- !kref_get_unless_zero(&if_outgoing->refcount)) {
- kfree(orig_ifinfo);
- orig_ifinfo = NULL;
- goto out;
- }
+ if (if_outgoing != BATADV_IF_DEFAULT)
+ kref_get(&if_outgoing->refcount);
reset_time = jiffies - 1;
reset_time -= msecs_to_jiffies(BATADV_RESET_PROTECTION_MS);
@@ -455,11 +451,8 @@ batadv_neigh_ifinfo_new(struct batadv_neigh_node *neigh,
if (!neigh_ifinfo)
goto out;
- if (if_outgoing && !kref_get_unless_zero(&if_outgoing->refcount)) {
- kfree(neigh_ifinfo);
- neigh_ifinfo = NULL;
- goto out;
- }
+ if (if_outgoing)
+ kref_get(&if_outgoing->refcount);
INIT_HLIST_NODE(&neigh_ifinfo->list);
kref_init(&neigh_ifinfo->refcount);
@@ -532,15 +525,11 @@ batadv_hardif_neigh_create(struct batadv_hard_iface *hard_iface,
if (hardif_neigh)
goto out;
- if (!kref_get_unless_zero(&hard_iface->refcount))
- goto out;
-
hardif_neigh = kzalloc(sizeof(*hardif_neigh), GFP_ATOMIC);
- if (!hardif_neigh) {
- batadv_hardif_put(hard_iface);
+ if (!hardif_neigh)
goto out;
- }
+ kref_get(&hard_iface->refcount);
INIT_HLIST_NODE(&hardif_neigh->list);
ether_addr_copy(hardif_neigh->addr, neigh_addr);
hardif_neigh->if_incoming = hard_iface;
@@ -630,6 +619,8 @@ batadv_neigh_node_new(struct batadv_orig_node *orig_node,
struct batadv_neigh_node *neigh_node;
struct batadv_hardif_neigh_node *hardif_neigh = NULL;
+ spin_lock_bh(&orig_node->neigh_list_lock);
+
neigh_node = batadv_neigh_node_get(orig_node, hard_iface, neigh_addr);
if (neigh_node)
goto out;
@@ -643,16 +634,11 @@ batadv_neigh_node_new(struct batadv_orig_node *orig_node,
if (!neigh_node)
goto out;
- if (!kref_get_unless_zero(&hard_iface->refcount)) {
- kfree(neigh_node);
- neigh_node = NULL;
- goto out;
- }
-
INIT_HLIST_NODE(&neigh_node->list);
INIT_HLIST_HEAD(&neigh_node->ifinfo_list);
spin_lock_init(&neigh_node->ifinfo_lock);
+ kref_get(&hard_iface->refcount);
ether_addr_copy(neigh_node->addr, neigh_addr);
neigh_node->if_incoming = hard_iface;
neigh_node->orig_node = orig_node;
@@ -666,15 +652,15 @@ batadv_neigh_node_new(struct batadv_orig_node *orig_node,
kref_init(&neigh_node->refcount);
kref_get(&neigh_node->refcount);
- spin_lock_bh(&orig_node->neigh_list_lock);
hlist_add_head_rcu(&neigh_node->list, &orig_node->neigh_list);
- spin_unlock_bh(&orig_node->neigh_list_lock);
batadv_dbg(BATADV_DBG_BATMAN, orig_node->bat_priv,
"Creating new neighbor %pM for orig_node %pM on interface %s\n",
neigh_addr, orig_node->orig, hard_iface->net_dev->name);
out:
+ spin_unlock_bh(&orig_node->neigh_list_lock);
+
if (hardif_neigh)
batadv_hardif_neigh_put(hardif_neigh);
return neigh_node;
@@ -779,6 +765,8 @@ static void batadv_orig_node_release(struct kref *ref)
struct batadv_neigh_node *neigh_node;
struct batadv_orig_node *orig_node;
struct batadv_orig_ifinfo *orig_ifinfo;
+ struct batadv_orig_node_vlan *vlan;
+ struct batadv_orig_ifinfo *last_candidate;
orig_node = container_of(ref, struct batadv_orig_node, refcount);
@@ -796,8 +784,21 @@ static void batadv_orig_node_release(struct kref *ref)
hlist_del_rcu(&orig_ifinfo->list);
batadv_orig_ifinfo_put(orig_ifinfo);
}
+
+ last_candidate = orig_node->last_bonding_candidate;
+ orig_node->last_bonding_candidate = NULL;
spin_unlock_bh(&orig_node->neigh_list_lock);
+ if (last_candidate)
+ batadv_orig_ifinfo_put(last_candidate);
+
+ spin_lock_bh(&orig_node->vlan_list_lock);
+ hlist_for_each_entry_safe(vlan, node_tmp, &orig_node->vlan_list, list) {
+ hlist_del_rcu(&vlan->list);
+ batadv_orig_node_vlan_put(vlan);
+ }
+ spin_unlock_bh(&orig_node->vlan_list_lock);
+
/* Free nc_nodes */
batadv_nc_purge_orig(orig_node->bat_priv, orig_node, NULL);
@@ -1160,6 +1161,9 @@ static bool batadv_purge_orig_node(struct batadv_priv *bat_priv,
if (hard_iface->soft_iface != bat_priv->soft_iface)
continue;
+ if (!kref_get_unless_zero(&hard_iface->refcount))
+ continue;
+
best_neigh_node = batadv_find_best_neighbor(bat_priv,
orig_node,
hard_iface);
@@ -1167,6 +1171,8 @@ static bool batadv_purge_orig_node(struct batadv_priv *bat_priv,
best_neigh_node);
if (best_neigh_node)
batadv_neigh_node_put(best_neigh_node);
+
+ batadv_hardif_put(hard_iface);
}
rcu_read_unlock();
@@ -1217,7 +1223,7 @@ static void batadv_purge_orig(struct work_struct *work)
struct delayed_work *delayed_work;
struct batadv_priv *bat_priv;
- delayed_work = container_of(work, struct delayed_work, work);
+ delayed_work = to_delayed_work(work);
bat_priv = container_of(delayed_work, struct batadv_priv, orig_work);
_batadv_purge_orig(bat_priv);
queue_delayed_work(batadv_event_workqueue,
diff --git a/net/batman-adv/originator.h b/net/batman-adv/originator.h
index 4e8b67f11..64a8951e5 100644
--- a/net/batman-adv/originator.h
+++ b/net/batman-adv/originator.h
@@ -33,7 +33,7 @@
struct seq_file;
-int batadv_compare_orig(const struct hlist_node *node, const void *data2);
+bool batadv_compare_orig(const struct hlist_node *node, const void *data2);
int batadv_originator_init(struct batadv_priv *bat_priv);
void batadv_originator_free(struct batadv_priv *bat_priv);
void batadv_purge_orig_ref(struct batadv_priv *bat_priv);
diff --git a/net/batman-adv/packet.h b/net/batman-adv/packet.h
index 8a8d7ca1a..372128ddb 100644
--- a/net/batman-adv/packet.h
+++ b/net/batman-adv/packet.h
@@ -175,6 +175,7 @@ enum batadv_bla_claimframe {
BATADV_CLAIM_TYPE_UNCLAIM = 0x01,
BATADV_CLAIM_TYPE_ANNOUNCE = 0x02,
BATADV_CLAIM_TYPE_REQUEST = 0x03,
+ BATADV_CLAIM_TYPE_LOOPDETECT = 0x04,
};
/**
@@ -501,7 +502,7 @@ struct batadv_coded_packet {
#pragma pack()
/**
- * struct batadv_unicast_tvlv - generic unicast packet with tvlv payload
+ * struct batadv_unicast_tvlv_packet - generic unicast packet with tvlv payload
* @packet_type: batman-adv packet type, part of the general header
* @version: batman-adv protocol version, part of the genereal header
* @ttl: time to live for this packet, part of the genereal header
diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c
index b781bf753..bfac086b4 100644
--- a/net/batman-adv/routing.c
+++ b/net/batman-adv/routing.c
@@ -100,10 +100,6 @@ static void _batadv_update_route(struct batadv_priv *bat_priv,
if (curr_router)
batadv_neigh_node_put(curr_router);
- /* increase refcount of new best neighbor */
- if (neigh_node && !kref_get_unless_zero(&neigh_node->refcount))
- neigh_node = NULL;
-
spin_lock_bh(&orig_node->neigh_list_lock);
/* curr_router used earlier may not be the current orig_ifinfo->router
* anymore because it was dereferenced outside of the neigh_list_lock
@@ -114,6 +110,10 @@ static void _batadv_update_route(struct batadv_priv *bat_priv,
*/
curr_router = rcu_dereference_protected(orig_ifinfo->router, true);
+ /* increase refcount of new best neighbor */
+ if (neigh_node)
+ kref_get(&neigh_node->refcount);
+
rcu_assign_pointer(orig_ifinfo->router, neigh_node);
spin_unlock_bh(&orig_node->neigh_list_lock);
batadv_orig_ifinfo_put(orig_ifinfo);
@@ -163,18 +163,18 @@ out:
* doesn't change otherwise.
*
* Return:
- * 0 if the packet is to be accepted.
- * 1 if the packet is to be ignored.
+ * false if the packet is to be accepted.
+ * true if the packet is to be ignored.
*/
-int batadv_window_protected(struct batadv_priv *bat_priv, s32 seq_num_diff,
- s32 seq_old_max_diff, unsigned long *last_reset,
- bool *protection_started)
+bool batadv_window_protected(struct batadv_priv *bat_priv, s32 seq_num_diff,
+ s32 seq_old_max_diff, unsigned long *last_reset,
+ bool *protection_started)
{
if (seq_num_diff <= -seq_old_max_diff ||
seq_num_diff >= BATADV_EXPECTED_SEQNO_RANGE) {
if (!batadv_has_timed_out(*last_reset,
BATADV_RESET_PROTECTION_MS))
- return 1;
+ return true;
*last_reset = jiffies;
if (protection_started)
@@ -183,7 +183,7 @@ int batadv_window_protected(struct batadv_priv *bat_priv, s32 seq_num_diff,
"old packet received, start protection\n");
}
- return 0;
+ return false;
}
bool batadv_check_management_packet(struct sk_buff *skb,
@@ -374,6 +374,7 @@ int batadv_recv_icmp_packet(struct sk_buff *skb,
if (skb_cow(skb, ETH_HLEN) < 0)
goto out;
+ ethhdr = eth_hdr(skb);
icmph = (struct batadv_icmp_header *)skb->data;
icmp_packet_rr = (struct batadv_icmp_packet_rr *)icmph;
if (icmp_packet_rr->rr_cur >= BATADV_RR_LEN)
@@ -455,6 +456,29 @@ static int batadv_check_unicast_packet(struct batadv_priv *bat_priv,
}
/**
+ * batadv_last_bonding_replace - Replace last_bonding_candidate of orig_node
+ * @orig_node: originator node whose bonding candidates should be replaced
+ * @new_candidate: new bonding candidate or NULL
+ */
+static void
+batadv_last_bonding_replace(struct batadv_orig_node *orig_node,
+ struct batadv_orig_ifinfo *new_candidate)
+{
+ struct batadv_orig_ifinfo *old_candidate;
+
+ spin_lock_bh(&orig_node->neigh_list_lock);
+ old_candidate = orig_node->last_bonding_candidate;
+
+ if (new_candidate)
+ kref_get(&new_candidate->refcount);
+ orig_node->last_bonding_candidate = new_candidate;
+ spin_unlock_bh(&orig_node->neigh_list_lock);
+
+ if (old_candidate)
+ batadv_orig_ifinfo_put(old_candidate);
+}
+
+/**
* batadv_find_router - find a suitable router for this originator
* @bat_priv: the bat priv with all the soft interface information
* @orig_node: the destination node
@@ -561,10 +585,6 @@ next:
}
rcu_read_unlock();
- /* last_bonding_candidate is reset below, remove the old reference. */
- if (orig_node->last_bonding_candidate)
- batadv_orig_ifinfo_put(orig_node->last_bonding_candidate);
-
/* After finding candidates, handle the three cases:
* 1) there is a next candidate, use that
* 2) there is no next candidate, use the first of the list
@@ -573,21 +593,28 @@ next:
if (next_candidate) {
batadv_neigh_node_put(router);
- /* remove references to first candidate, we don't need it. */
- if (first_candidate) {
- batadv_neigh_node_put(first_candidate_router);
- batadv_orig_ifinfo_put(first_candidate);
- }
+ kref_get(&next_candidate_router->refcount);
router = next_candidate_router;
- orig_node->last_bonding_candidate = next_candidate;
+ batadv_last_bonding_replace(orig_node, next_candidate);
} else if (first_candidate) {
batadv_neigh_node_put(router);
- /* refcounting has already been done in the loop above. */
+ kref_get(&first_candidate_router->refcount);
router = first_candidate_router;
- orig_node->last_bonding_candidate = first_candidate;
+ batadv_last_bonding_replace(orig_node, first_candidate);
} else {
- orig_node->last_bonding_candidate = NULL;
+ batadv_last_bonding_replace(orig_node, NULL);
+ }
+
+ /* cleanup of candidates */
+ if (first_candidate) {
+ batadv_neigh_node_put(first_candidate_router);
+ batadv_orig_ifinfo_put(first_candidate);
+ }
+
+ if (next_candidate) {
+ batadv_neigh_node_put(next_candidate_router);
+ batadv_orig_ifinfo_put(next_candidate);
}
return router;
@@ -601,6 +628,7 @@ static int batadv_route_unicast_packet(struct sk_buff *skb,
struct batadv_unicast_packet *unicast_packet;
struct ethhdr *ethhdr = eth_hdr(skb);
int res, hdr_len, ret = NET_RX_DROP;
+ unsigned int len;
unicast_packet = (struct batadv_unicast_packet *)skb->data;
@@ -641,6 +669,7 @@ static int batadv_route_unicast_packet(struct sk_buff *skb,
if (hdr_len > 0)
batadv_skb_set_priority(skb, hdr_len);
+ len = skb->len;
res = batadv_send_skb_to_orig(skb, orig_node, recv_if);
/* translate transmit result into receive result */
@@ -648,7 +677,7 @@ static int batadv_route_unicast_packet(struct sk_buff *skb,
/* skb was transmitted and consumed */
batadv_inc_counter(bat_priv, BATADV_CNT_FORWARD);
batadv_add_counter(bat_priv, BATADV_CNT_FORWARD_BYTES,
- skb->len + ETH_HLEN);
+ len + ETH_HLEN);
ret = NET_RX_SUCCESS;
} else if (res == NET_XMIT_POLICED) {
@@ -718,8 +747,9 @@ out:
return ret;
}
-static int batadv_check_unicast_ttvn(struct batadv_priv *bat_priv,
- struct sk_buff *skb, int hdr_len) {
+static bool batadv_check_unicast_ttvn(struct batadv_priv *bat_priv,
+ struct sk_buff *skb, int hdr_len)
+{
struct batadv_unicast_packet *unicast_packet;
struct batadv_hard_iface *primary_if;
struct batadv_orig_node *orig_node;
@@ -730,11 +760,11 @@ static int batadv_check_unicast_ttvn(struct batadv_priv *bat_priv,
/* check if there is enough data before accessing it */
if (!pskb_may_pull(skb, hdr_len + ETH_HLEN))
- return 0;
+ return false;
/* create a copy of the skb (in case of for re-routing) to modify it. */
if (skb_cow(skb, sizeof(*unicast_packet)) < 0)
- return 0;
+ return false;
unicast_packet = (struct batadv_unicast_packet *)skb->data;
vid = batadv_get_vid(skb, hdr_len);
@@ -758,7 +788,7 @@ static int batadv_check_unicast_ttvn(struct batadv_priv *bat_priv,
* table. If not, let the packet go untouched anyway because
* there is nothing the node can do
*/
- return 1;
+ return true;
}
/* retrieve the TTVN known by this node for the packet destination. This
@@ -774,7 +804,7 @@ static int batadv_check_unicast_ttvn(struct batadv_priv *bat_priv,
* not be possible to deliver it
*/
if (!orig_node)
- return 0;
+ return false;
curr_ttvn = (u8)atomic_read(&orig_node->last_ttvn);
batadv_orig_node_put(orig_node);
@@ -785,7 +815,7 @@ static int batadv_check_unicast_ttvn(struct batadv_priv *bat_priv,
*/
is_old_ttvn = batadv_seq_before(unicast_packet->ttvn, curr_ttvn);
if (!is_old_ttvn)
- return 1;
+ return true;
old_ttvn = unicast_packet->ttvn;
/* the packet was forged based on outdated network information. Its
@@ -798,7 +828,7 @@ static int batadv_check_unicast_ttvn(struct batadv_priv *bat_priv,
"Rerouting unicast packet to %pM (dst=%pM): TTVN mismatch old_ttvn=%u new_ttvn=%u\n",
unicast_packet->dest, ethhdr->h_dest,
old_ttvn, curr_ttvn);
- return 1;
+ return true;
}
/* the packet has not been re-routed: either the destination is
@@ -806,14 +836,14 @@ static int batadv_check_unicast_ttvn(struct batadv_priv *bat_priv,
* it is possible to drop the packet
*/
if (!batadv_is_my_client(bat_priv, ethhdr->h_dest, vid))
- return 0;
+ return false;
/* update the header in order to let the packet be delivered to this
* node's soft interface
*/
primary_if = batadv_primary_if_get_selected(bat_priv);
if (!primary_if)
- return 0;
+ return false;
ether_addr_copy(unicast_packet->dest, primary_if->net_dev->dev_addr);
@@ -821,7 +851,7 @@ static int batadv_check_unicast_ttvn(struct batadv_priv *bat_priv,
unicast_packet->ttvn = curr_ttvn;
- return 1;
+ return true;
}
/**
@@ -912,7 +942,7 @@ int batadv_recv_unicast_packet(struct sk_buff *skb,
hdr_size))
goto rx_success;
- batadv_interface_rx(recv_if->soft_iface, skb, recv_if, hdr_size,
+ batadv_interface_rx(recv_if->soft_iface, skb, hdr_size,
orig_node);
rx_success:
@@ -1122,8 +1152,7 @@ int batadv_recv_bcast_packet(struct sk_buff *skb,
goto rx_success;
/* broadcast for me */
- batadv_interface_rx(recv_if->soft_iface, skb, recv_if, hdr_size,
- orig_node);
+ batadv_interface_rx(recv_if->soft_iface, skb, hdr_size, orig_node);
rx_success:
ret = NET_RX_SUCCESS;
diff --git a/net/batman-adv/routing.h b/net/batman-adv/routing.h
index 02a5caa84..05c3ff42e 100644
--- a/net/batman-adv/routing.h
+++ b/net/batman-adv/routing.h
@@ -51,8 +51,8 @@ struct batadv_neigh_node *
batadv_find_router(struct batadv_priv *bat_priv,
struct batadv_orig_node *orig_node,
struct batadv_hard_iface *recv_if);
-int batadv_window_protected(struct batadv_priv *bat_priv, s32 seq_num_diff,
- s32 seq_old_max_diff, unsigned long *last_reset,
- bool *protection_started);
+bool batadv_window_protected(struct batadv_priv *bat_priv, s32 seq_num_diff,
+ s32 seq_old_max_diff, unsigned long *last_reset,
+ bool *protection_started);
#endif /* _NET_BATMAN_ADV_ROUTING_H_ */
diff --git a/net/batman-adv/send.c b/net/batman-adv/send.c
index 76417850d..010397650 100644
--- a/net/batman-adv/send.c
+++ b/net/batman-adv/send.c
@@ -26,6 +26,7 @@
#include <linux/if.h>
#include <linux/jiffies.h>
#include <linux/kernel.h>
+#include <linux/kref.h>
#include <linux/list.h>
#include <linux/netdevice.h>
#include <linux/printk.h>
@@ -423,8 +424,8 @@ int batadv_send_skb_via_gw(struct batadv_priv *bat_priv, struct sk_buff *skb,
struct batadv_orig_node *orig_node;
orig_node = batadv_gw_get_selected_orig(bat_priv);
- return batadv_send_skb_unicast(bat_priv, skb, BATADV_UNICAST, 0,
- orig_node, vid);
+ return batadv_send_skb_unicast(bat_priv, skb, BATADV_UNICAST_4ADDR,
+ BATADV_P_DATA, orig_node, vid);
}
void batadv_schedule_bat_ogm(struct batadv_hard_iface *hard_iface)
@@ -552,7 +553,7 @@ static void batadv_send_outstanding_bcast_packet(struct work_struct *work)
struct net_device *soft_iface;
struct batadv_priv *bat_priv;
- delayed_work = container_of(work, struct delayed_work, work);
+ delayed_work = to_delayed_work(work);
forw_packet = container_of(delayed_work, struct batadv_forw_packet,
delayed_work);
soft_iface = forw_packet->if_incoming->soft_iface;
@@ -577,10 +578,15 @@ static void batadv_send_outstanding_bcast_packet(struct work_struct *work)
if (forw_packet->num_packets >= hard_iface->num_bcasts)
continue;
+ if (!kref_get_unless_zero(&hard_iface->refcount))
+ continue;
+
/* send a copy of the saved skb */
skb1 = skb_clone(forw_packet->skb, GFP_ATOMIC);
if (skb1)
batadv_send_broadcast_skb(skb1, hard_iface);
+
+ batadv_hardif_put(hard_iface);
}
rcu_read_unlock();
@@ -604,7 +610,7 @@ void batadv_send_outstanding_bat_ogm_packet(struct work_struct *work)
struct batadv_forw_packet *forw_packet;
struct batadv_priv *bat_priv;
- delayed_work = container_of(work, struct delayed_work, work);
+ delayed_work = to_delayed_work(work);
forw_packet = container_of(delayed_work, struct batadv_forw_packet,
delayed_work);
bat_priv = netdev_priv(forw_packet->if_incoming->soft_iface);
diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c
index 8a136b6a1..287a3879e 100644
--- a/net/batman-adv/soft-interface.c
+++ b/net/batman-adv/soft-interface.c
@@ -186,7 +186,6 @@ static int batadv_interface_tx(struct sk_buff *skb,
struct batadv_priv *bat_priv = netdev_priv(soft_iface);
struct batadv_hard_iface *primary_if = NULL;
struct batadv_bcast_packet *bcast_packet;
- __be16 ethertype = htons(ETH_P_BATMAN);
static const u8 stp_addr[ETH_ALEN] = {0x01, 0x80, 0xC2, 0x00,
0x00, 0x00};
static const u8 ectp_addr[ETH_ALEN] = {0xCF, 0x00, 0x00, 0x00,
@@ -208,7 +207,7 @@ static int batadv_interface_tx(struct sk_buff *skb,
if (atomic_read(&bat_priv->mesh_state) != BATADV_MESH_ACTIVE)
goto dropped;
- soft_iface->trans_start = jiffies;
+ netif_trans_update(soft_iface);
vid = batadv_get_vid(skb, 0);
ethhdr = eth_hdr(skb);
@@ -216,7 +215,8 @@ static int batadv_interface_tx(struct sk_buff *skb,
case ETH_P_8021Q:
vhdr = vlan_eth_hdr(skb);
- if (vhdr->h_vlan_encapsulated_proto != ethertype) {
+ /* drop batman-in-batman packets to prevent loops */
+ if (vhdr->h_vlan_encapsulated_proto != htons(ETH_P_BATMAN)) {
network_offset += VLAN_HLEN;
break;
}
@@ -381,13 +381,29 @@ end:
return NETDEV_TX_OK;
}
+/**
+ * batadv_interface_rx - receive ethernet frame on local batman-adv interface
+ * @soft_iface: local interface which will receive the ethernet frame
+ * @skb: ethernet frame for @soft_iface
+ * @hdr_size: size of already parsed batman-adv header
+ * @orig_node: originator from which the batman-adv packet was sent
+ *
+ * Sends a ethernet frame to the receive path of the local @soft_iface.
+ * skb->data has still point to the batman-adv header with the size @hdr_size.
+ * The caller has to have parsed this header already and made sure that at least
+ * @hdr_size bytes are still available for pull in @skb.
+ *
+ * The packet may still get dropped. This can happen when the encapsulated
+ * ethernet frame is invalid or contains again an batman-adv packet. Also
+ * unicast packets will be dropped directly when it was sent between two
+ * isolated clients.
+ */
void batadv_interface_rx(struct net_device *soft_iface,
- struct sk_buff *skb, struct batadv_hard_iface *recv_if,
- int hdr_size, struct batadv_orig_node *orig_node)
+ struct sk_buff *skb, int hdr_size,
+ struct batadv_orig_node *orig_node)
{
struct batadv_bcast_packet *batadv_bcast_packet;
struct batadv_priv *bat_priv = netdev_priv(soft_iface);
- __be16 ethertype = htons(ETH_P_BATMAN);
struct vlan_ethhdr *vhdr;
struct ethhdr *ethhdr;
unsigned short vid;
@@ -396,10 +412,6 @@ void batadv_interface_rx(struct net_device *soft_iface,
batadv_bcast_packet = (struct batadv_bcast_packet *)skb->data;
is_bcast = (batadv_bcast_packet->packet_type == BATADV_BCAST);
- /* check if enough space is available for pulling, and pull */
- if (!pskb_may_pull(skb, hdr_size))
- goto dropped;
-
skb_pull_rcsum(skb, hdr_size);
skb_reset_mac_header(skb);
@@ -421,7 +433,8 @@ void batadv_interface_rx(struct net_device *soft_iface,
vhdr = (struct vlan_ethhdr *)skb->data;
- if (vhdr->h_vlan_encapsulated_proto != ethertype)
+ /* drop batman-in-batman packets to prevent loops */
+ if (vhdr->h_vlan_encapsulated_proto != htons(ETH_P_BATMAN))
break;
/* fall through */
@@ -543,7 +556,7 @@ struct batadv_softif_vlan *batadv_softif_vlan_get(struct batadv_priv *bat_priv,
}
/**
- * batadv_create_vlan - allocate the needed resources for a new vlan
+ * batadv_softif_create_vlan - allocate the needed resources for a new vlan
* @bat_priv: the bat priv with all the soft interface information
* @vid: the VLAN identifier
*
@@ -872,13 +885,14 @@ static int batadv_softif_slave_add(struct net_device *dev,
struct net_device *slave_dev)
{
struct batadv_hard_iface *hard_iface;
+ struct net *net = dev_net(dev);
int ret = -EINVAL;
hard_iface = batadv_hardif_get_by_netdev(slave_dev);
if (!hard_iface || hard_iface->soft_iface)
goto out;
- ret = batadv_hardif_enable_interface(hard_iface, dev->name);
+ ret = batadv_hardif_enable_interface(hard_iface, net, dev->name);
out:
if (hard_iface)
@@ -959,7 +973,7 @@ static void batadv_softif_init_early(struct net_device *dev)
dev->netdev_ops = &batadv_netdev_ops;
dev->destructor = batadv_softif_free;
- dev->features |= NETIF_F_HW_VLAN_CTAG_FILTER;
+ dev->features |= NETIF_F_HW_VLAN_CTAG_FILTER | NETIF_F_NETNS_LOCAL;
dev->priv_flags |= IFF_NO_QUEUE;
/* can't call min_mtu, because the needed variables
@@ -975,7 +989,7 @@ static void batadv_softif_init_early(struct net_device *dev)
memset(priv, 0, sizeof(*priv));
}
-struct net_device *batadv_softif_create(const char *name)
+struct net_device *batadv_softif_create(struct net *net, const char *name)
{
struct net_device *soft_iface;
int ret;
@@ -985,6 +999,8 @@ struct net_device *batadv_softif_create(const char *name)
if (!soft_iface)
return NULL;
+ dev_net_set(soft_iface, net);
+
soft_iface->rtnl_link_ops = &batadv_link_ops;
ret = register_netdevice(soft_iface);
@@ -1017,7 +1033,9 @@ void batadv_softif_destroy_sysfs(struct net_device *soft_iface)
static void batadv_softif_destroy_netlink(struct net_device *soft_iface,
struct list_head *head)
{
+ struct batadv_priv *bat_priv = netdev_priv(soft_iface);
struct batadv_hard_iface *hard_iface;
+ struct batadv_softif_vlan *vlan;
list_for_each_entry(hard_iface, &batadv_hardif_list, list) {
if (hard_iface->soft_iface == soft_iface)
@@ -1025,16 +1043,23 @@ static void batadv_softif_destroy_netlink(struct net_device *soft_iface,
BATADV_IF_CLEANUP_KEEP);
}
+ /* destroy the "untagged" VLAN */
+ vlan = batadv_softif_vlan_get(bat_priv, BATADV_NO_FLAGS);
+ if (vlan) {
+ batadv_softif_destroy_vlan(bat_priv, vlan);
+ batadv_softif_vlan_put(vlan);
+ }
+
batadv_sysfs_del_meshif(soft_iface);
unregister_netdevice_queue(soft_iface, head);
}
-int batadv_softif_is_valid(const struct net_device *net_dev)
+bool batadv_softif_is_valid(const struct net_device *net_dev)
{
if (net_dev->netdev_ops->ndo_start_xmit == batadv_interface_tx)
- return 1;
+ return true;
- return 0;
+ return false;
}
struct rtnl_link_ops batadv_link_ops __read_mostly = {
diff --git a/net/batman-adv/soft-interface.h b/net/batman-adv/soft-interface.h
index 9ae265703..ec303ddbf 100644
--- a/net/batman-adv/soft-interface.h
+++ b/net/batman-adv/soft-interface.h
@@ -20,18 +20,20 @@
#include "main.h"
+#include <linux/types.h>
#include <net/rtnetlink.h>
struct net_device;
+struct net;
struct sk_buff;
int batadv_skb_head_push(struct sk_buff *skb, unsigned int len);
void batadv_interface_rx(struct net_device *soft_iface,
- struct sk_buff *skb, struct batadv_hard_iface *recv_if,
- int hdr_size, struct batadv_orig_node *orig_node);
-struct net_device *batadv_softif_create(const char *name);
+ struct sk_buff *skb, int hdr_size,
+ struct batadv_orig_node *orig_node);
+struct net_device *batadv_softif_create(struct net *net, const char *name);
void batadv_softif_destroy_sysfs(struct net_device *soft_iface);
-int batadv_softif_is_valid(const struct net_device *net_dev);
+bool batadv_softif_is_valid(const struct net_device *net_dev);
extern struct rtnl_link_ops batadv_link_ops;
int batadv_softif_create_vlan(struct batadv_priv *bat_priv, unsigned short vid);
void batadv_softif_vlan_put(struct batadv_softif_vlan *softif_vlan);
diff --git a/net/batman-adv/sysfs.c b/net/batman-adv/sysfs.c
index e7cf51333..414b20741 100644
--- a/net/batman-adv/sysfs.c
+++ b/net/batman-adv/sysfs.c
@@ -116,11 +116,13 @@ batadv_kobj_to_vlan(struct batadv_priv *bat_priv, struct kobject *obj)
static char *batadv_uev_action_str[] = {
"add",
"del",
- "change"
+ "change",
+ "loopdetect",
};
static char *batadv_uev_type_str[] = {
- "gw"
+ "gw",
+ "bla",
};
/* Use this, if you have customized show and store functions for vlan attrs */
@@ -830,6 +832,7 @@ static ssize_t batadv_store_mesh_iface(struct kobject *kobj,
size_t count)
{
struct net_device *net_dev = batadv_kobj_to_netdev(kobj);
+ struct net *net = dev_net(net_dev);
struct batadv_hard_iface *hard_iface;
int status_tmp = -1;
int ret = count;
@@ -873,7 +876,7 @@ static ssize_t batadv_store_mesh_iface(struct kobject *kobj,
batadv_hardif_disable_interface(hard_iface,
BATADV_IF_CLEANUP_AUTO);
- ret = batadv_hardif_enable_interface(hard_iface, buff);
+ ret = batadv_hardif_enable_interface(hard_iface, net, buff);
unlock:
rtnl_unlock();
diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c
index 9b4551a86..57ec87f37 100644
--- a/net/batman-adv/translation-table.c
+++ b/net/batman-adv/translation-table.c
@@ -43,7 +43,6 @@
#include <linux/stddef.h>
#include <linux/string.h>
#include <linux/workqueue.h>
-#include <net/net_namespace.h>
#include "bridge_loop_avoidance.h"
#include "hard-interface.h"
@@ -76,9 +75,9 @@ static void batadv_tt_global_del(struct batadv_priv *bat_priv,
*
* Compare the MAC address and the VLAN ID of the two TT entries and check if
* they are the same TT client.
- * Return: 1 if the two TT clients are the same, 0 otherwise
+ * Return: true if the two TT clients are the same, false otherwise
*/
-static int batadv_compare_tt(const struct hlist_node *node, const void *data2)
+static bool batadv_compare_tt(const struct hlist_node *node, const void *data2)
{
const void *data1 = container_of(node, struct batadv_tt_common_entry,
hash_entry);
@@ -585,6 +584,7 @@ bool batadv_tt_local_add(struct net_device *soft_iface, const u8 *addr,
struct batadv_priv *bat_priv = netdev_priv(soft_iface);
struct batadv_tt_local_entry *tt_local;
struct batadv_tt_global_entry *tt_global = NULL;
+ struct net *net = dev_net(soft_iface);
struct batadv_softif_vlan *vlan;
struct net_device *in_dev = NULL;
struct hlist_head *head;
@@ -596,7 +596,7 @@ bool batadv_tt_local_add(struct net_device *soft_iface, const u8 *addr,
u32 match_mark;
if (ifindex != BATADV_NULL_IFINDEX)
- in_dev = dev_get_by_index(&init_net, ifindex);
+ in_dev = dev_get_by_index(net, ifindex);
tt_local = batadv_tt_local_hash_find(bat_priv, addr, vid);
@@ -650,8 +650,10 @@ bool batadv_tt_local_add(struct net_device *soft_iface, const u8 *addr,
/* increase the refcounter of the related vlan */
vlan = batadv_softif_vlan_get(bat_priv, vid);
- if (WARN(!vlan, "adding TT local entry %pM to non-existent VLAN %d",
- addr, BATADV_PRINT_VID(vid))) {
+ if (!vlan) {
+ net_ratelimited_function(batadv_info, soft_iface,
+ "adding TT local entry %pM to non-existent VLAN %d\n",
+ addr, BATADV_PRINT_VID(vid));
kfree(tt_local);
tt_local = NULL;
goto out;
@@ -691,7 +693,6 @@ bool batadv_tt_local_add(struct net_device *soft_iface, const u8 *addr,
if (unlikely(hash_added != 0)) {
/* remove the reference for the hash */
batadv_tt_local_entry_put(tt_local);
- batadv_softif_vlan_put(vlan);
goto out;
}
@@ -1010,8 +1011,8 @@ int batadv_tt_local_seq_print_text(struct seq_file *seq, void *offset)
seq_printf(seq,
"Locally retrieved addresses (from %s) announced via TT (TTVN: %u):\n",
net_dev->name, (u8)atomic_read(&bat_priv->tt.vn));
- seq_printf(seq, " %-13s %s %-8s %-9s (%-10s)\n", "Client", "VID",
- "Flags", "Last seen", "CRC");
+ seq_puts(seq,
+ " Client VID Flags Last seen (CRC )\n");
for (i = 0; i < hash->size; i++) {
head = &hash->table[i];
@@ -1680,9 +1681,8 @@ int batadv_tt_global_seq_print_text(struct seq_file *seq, void *offset)
seq_printf(seq,
"Globally announced TT entries received via the mesh %s\n",
net_dev->name);
- seq_printf(seq, " %-13s %s %s %-15s %s (%-10s) %s\n",
- "Client", "VID", "(TTVN)", "Originator", "(Curr TTVN)",
- "CRC", "Flags");
+ seq_puts(seq,
+ " Client VID (TTVN) Originator (Curr TTVN) (CRC ) Flags\n");
for (i = 0; i < hash->size; i++) {
head = &hash->table[i];
@@ -2270,6 +2270,29 @@ static u32 batadv_tt_local_crc(struct batadv_priv *bat_priv,
return crc;
}
+/**
+ * batadv_tt_req_node_release - free tt_req node entry
+ * @ref: kref pointer of the tt req_node entry
+ */
+static void batadv_tt_req_node_release(struct kref *ref)
+{
+ struct batadv_tt_req_node *tt_req_node;
+
+ tt_req_node = container_of(ref, struct batadv_tt_req_node, refcount);
+
+ kfree(tt_req_node);
+}
+
+/**
+ * batadv_tt_req_node_put - decrement the tt_req_node refcounter and
+ * possibly release it
+ * @tt_req_node: tt_req_node to be free'd
+ */
+static void batadv_tt_req_node_put(struct batadv_tt_req_node *tt_req_node)
+{
+ kref_put(&tt_req_node->refcount, batadv_tt_req_node_release);
+}
+
static void batadv_tt_req_list_free(struct batadv_priv *bat_priv)
{
struct batadv_tt_req_node *node;
@@ -2279,7 +2302,7 @@ static void batadv_tt_req_list_free(struct batadv_priv *bat_priv)
hlist_for_each_entry_safe(node, safe, &bat_priv->tt.req_list, list) {
hlist_del_init(&node->list);
- kfree(node);
+ batadv_tt_req_node_put(node);
}
spin_unlock_bh(&bat_priv->tt.req_list_lock);
@@ -2316,7 +2339,7 @@ static void batadv_tt_req_purge(struct batadv_priv *bat_priv)
if (batadv_has_timed_out(node->issued_at,
BATADV_TT_REQUEST_TIMEOUT)) {
hlist_del_init(&node->list);
- kfree(node);
+ batadv_tt_req_node_put(node);
}
}
spin_unlock_bh(&bat_priv->tt.req_list_lock);
@@ -2348,9 +2371,11 @@ batadv_tt_req_node_new(struct batadv_priv *bat_priv,
if (!tt_req_node)
goto unlock;
+ kref_init(&tt_req_node->refcount);
ether_addr_copy(tt_req_node->addr, orig_node->orig);
tt_req_node->issued_at = jiffies;
+ kref_get(&tt_req_node->refcount);
hlist_add_head(&tt_req_node->list, &bat_priv->tt.req_list);
unlock:
spin_unlock_bh(&bat_priv->tt.req_list_lock);
@@ -2362,19 +2387,19 @@ unlock:
* @entry_ptr: to be checked local tt entry
* @data_ptr: not used but definition required to satisfy the callback prototype
*
- * Return: 1 if the entry is a valid, 0 otherwise.
+ * Return: true if the entry is a valid, false otherwise.
*/
-static int batadv_tt_local_valid(const void *entry_ptr, const void *data_ptr)
+static bool batadv_tt_local_valid(const void *entry_ptr, const void *data_ptr)
{
const struct batadv_tt_common_entry *tt_common_entry = entry_ptr;
if (tt_common_entry->flags & BATADV_TT_CLIENT_NEW)
- return 0;
- return 1;
+ return false;
+ return true;
}
-static int batadv_tt_global_valid(const void *entry_ptr,
- const void *data_ptr)
+static bool batadv_tt_global_valid(const void *entry_ptr,
+ const void *data_ptr)
{
const struct batadv_tt_common_entry *tt_common_entry = entry_ptr;
const struct batadv_tt_global_entry *tt_global_entry;
@@ -2382,7 +2407,7 @@ static int batadv_tt_global_valid(const void *entry_ptr,
if (tt_common_entry->flags & BATADV_TT_CLIENT_ROAM ||
tt_common_entry->flags & BATADV_TT_CLIENT_TEMP)
- return 0;
+ return false;
tt_global_entry = container_of(tt_common_entry,
struct batadv_tt_global_entry,
@@ -2404,7 +2429,8 @@ static int batadv_tt_global_valid(const void *entry_ptr,
static void batadv_tt_tvlv_generate(struct batadv_priv *bat_priv,
struct batadv_hashtable *hash,
void *tvlv_buff, u16 tt_len,
- int (*valid_cb)(const void *, const void *),
+ bool (*valid_cb)(const void *,
+ const void *),
void *cb_data)
{
struct batadv_tt_common_entry *tt_common_entry;
@@ -2553,11 +2579,11 @@ static void batadv_tt_global_update_crc(struct batadv_priv *bat_priv,
*
* Return: true if the TT Request was sent, false otherwise
*/
-static int batadv_send_tt_request(struct batadv_priv *bat_priv,
- struct batadv_orig_node *dst_orig_node,
- u8 ttvn,
- struct batadv_tvlv_tt_vlan_data *tt_vlan,
- u16 num_vlan, bool full_table)
+static bool batadv_send_tt_request(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *dst_orig_node,
+ u8 ttvn,
+ struct batadv_tvlv_tt_vlan_data *tt_vlan,
+ u16 num_vlan, bool full_table)
{
struct batadv_tvlv_tt_data *tvlv_tt_data = NULL;
struct batadv_tt_req_node *tt_req_node = NULL;
@@ -2613,13 +2639,19 @@ static int batadv_send_tt_request(struct batadv_priv *bat_priv,
out:
if (primary_if)
batadv_hardif_put(primary_if);
+
if (ret && tt_req_node) {
spin_lock_bh(&bat_priv->tt.req_list_lock);
- /* hlist_del_init() verifies tt_req_node still is in the list */
- hlist_del_init(&tt_req_node->list);
+ if (!hlist_unhashed(&tt_req_node->list)) {
+ hlist_del_init(&tt_req_node->list);
+ batadv_tt_req_node_put(tt_req_node);
+ }
spin_unlock_bh(&bat_priv->tt.req_list_lock);
- kfree(tt_req_node);
}
+
+ if (tt_req_node)
+ batadv_tt_req_node_put(tt_req_node);
+
kfree(tvlv_tt_data);
return ret;
}
@@ -3055,7 +3087,7 @@ static void batadv_handle_tt_response(struct batadv_priv *bat_priv,
if (!batadv_compare_eth(node->addr, resp_src))
continue;
hlist_del_init(&node->list);
- kfree(node);
+ batadv_tt_req_node_put(node);
}
spin_unlock_bh(&bat_priv->tt.req_list_lock);
@@ -3201,7 +3233,7 @@ static void batadv_tt_purge(struct work_struct *work)
struct batadv_priv_tt *priv_tt;
struct batadv_priv *bat_priv;
- delayed_work = container_of(work, struct delayed_work, work);
+ delayed_work = to_delayed_work(work);
priv_tt = container_of(delayed_work, struct batadv_priv_tt, work);
bat_priv = container_of(priv_tt, struct batadv_priv, tt);
diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h
index 1e47fbe8b..74d865a4d 100644
--- a/net/batman-adv/types.h
+++ b/net/batman-adv/types.h
@@ -330,7 +330,9 @@ struct batadv_orig_node {
DECLARE_BITMAP(bcast_bits, BATADV_TQ_LOCAL_WINDOW_SIZE);
u32 last_bcast_seqno;
struct hlist_head neigh_list;
- /* neigh_list_lock protects: neigh_list and router */
+ /* neigh_list_lock protects: neigh_list, ifinfo_list,
+ * last_bonding_candidate and router
+ */
spinlock_t neigh_list_lock;
struct hlist_node hash_entry;
struct batadv_priv *bat_priv;
@@ -657,6 +659,9 @@ struct batadv_priv_tt {
* @num_requests: number of bla requests in flight
* @claim_hash: hash table containing mesh nodes this host has claimed
* @backbone_hash: hash table containing all detected backbone gateways
+ * @loopdetect_addr: MAC address used for own loopdetection frames
+ * @loopdetect_lasttime: time when the loopdetection frames were sent
+ * @loopdetect_next: how many periods to wait for the next loopdetect process
* @bcast_duplist: recently received broadcast packets array (for broadcast
* duplicate suppression)
* @bcast_duplist_curr: index of last broadcast packet added to bcast_duplist
@@ -668,6 +673,9 @@ struct batadv_priv_bla {
atomic_t num_requests;
struct batadv_hashtable *claim_hash;
struct batadv_hashtable *backbone_hash;
+ u8 loopdetect_addr[ETH_ALEN];
+ unsigned long loopdetect_lasttime;
+ atomic_t loopdetect_next;
struct batadv_bcast_duplist_entry bcast_duplist[BATADV_DUPLIST_SIZE];
int bcast_duplist_curr;
/* protects bcast_duplist & bcast_duplist_curr */
@@ -1012,6 +1020,7 @@ struct batadv_socket_packet {
* resolved
* @crc: crc16 checksum over all claims
* @crc_lock: lock protecting crc
+ * @report_work: work struct for reporting detected loops
* @refcount: number of contexts the object is used
* @rcu: struct used for freeing in an RCU-safe manner
*/
@@ -1025,6 +1034,7 @@ struct batadv_bla_backbone_gw {
atomic_t request_sent;
u16 crc;
spinlock_t crc_lock; /* protects crc */
+ struct work_struct report_work;
struct kref refcount;
struct rcu_head rcu;
};
@@ -1034,6 +1044,7 @@ struct batadv_bla_backbone_gw {
* @addr: mac address of claimed non-mesh client
* @vid: vlan id this client was detected on
* @backbone_gw: pointer to backbone gw claiming this client
+ * @backbone_lock: lock protecting backbone_gw pointer
* @lasttime: last time we heard of claim (locals only)
* @hash_entry: hlist node for batadv_priv_bla::claim_hash
* @refcount: number of contexts the object is used
@@ -1043,6 +1054,7 @@ struct batadv_bla_claim {
u8 addr[ETH_ALEN];
unsigned short vid;
struct batadv_bla_backbone_gw *backbone_gw;
+ spinlock_t backbone_lock; /* protects backbone_gw */
unsigned long lasttime;
struct hlist_node hash_entry;
struct rcu_head rcu;
@@ -1129,11 +1141,13 @@ struct batadv_tt_change_node {
* struct batadv_tt_req_node - data to keep track of the tt requests in flight
* @addr: mac address address of the originator this request was sent to
* @issued_at: timestamp used for purging stale tt requests
+ * @refcount: number of contexts the object is used by
* @list: list node for batadv_priv_tt::req_list
*/
struct batadv_tt_req_node {
u8 addr[ETH_ALEN];
unsigned long issued_at;
+ struct kref refcount;
struct hlist_node list;
};
diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c
index e45cb9155..780089d75 100644
--- a/net/bluetooth/6lowpan.c
+++ b/net/bluetooth/6lowpan.c
@@ -68,7 +68,7 @@ struct lowpan_peer {
struct in6_addr peer_addr;
};
-struct lowpan_dev {
+struct lowpan_btle_dev {
struct list_head list;
struct hci_dev *hdev;
@@ -80,18 +80,21 @@ struct lowpan_dev {
struct delayed_work notify_peers;
};
-static inline struct lowpan_dev *lowpan_dev(const struct net_device *netdev)
+static inline struct lowpan_btle_dev *
+lowpan_btle_dev(const struct net_device *netdev)
{
- return (struct lowpan_dev *)lowpan_priv(netdev)->priv;
+ return (struct lowpan_btle_dev *)lowpan_dev(netdev)->priv;
}
-static inline void peer_add(struct lowpan_dev *dev, struct lowpan_peer *peer)
+static inline void peer_add(struct lowpan_btle_dev *dev,
+ struct lowpan_peer *peer)
{
list_add_rcu(&peer->list, &dev->peers);
atomic_inc(&dev->peer_count);
}
-static inline bool peer_del(struct lowpan_dev *dev, struct lowpan_peer *peer)
+static inline bool peer_del(struct lowpan_btle_dev *dev,
+ struct lowpan_peer *peer)
{
list_del_rcu(&peer->list);
kfree_rcu(peer, rcu);
@@ -106,7 +109,7 @@ static inline bool peer_del(struct lowpan_dev *dev, struct lowpan_peer *peer)
return false;
}
-static inline struct lowpan_peer *peer_lookup_ba(struct lowpan_dev *dev,
+static inline struct lowpan_peer *peer_lookup_ba(struct lowpan_btle_dev *dev,
bdaddr_t *ba, __u8 type)
{
struct lowpan_peer *peer;
@@ -134,8 +137,8 @@ static inline struct lowpan_peer *peer_lookup_ba(struct lowpan_dev *dev,
return NULL;
}
-static inline struct lowpan_peer *__peer_lookup_chan(struct lowpan_dev *dev,
- struct l2cap_chan *chan)
+static inline struct lowpan_peer *
+__peer_lookup_chan(struct lowpan_btle_dev *dev, struct l2cap_chan *chan)
{
struct lowpan_peer *peer;
@@ -147,8 +150,8 @@ static inline struct lowpan_peer *__peer_lookup_chan(struct lowpan_dev *dev,
return NULL;
}
-static inline struct lowpan_peer *__peer_lookup_conn(struct lowpan_dev *dev,
- struct l2cap_conn *conn)
+static inline struct lowpan_peer *
+__peer_lookup_conn(struct lowpan_btle_dev *dev, struct l2cap_conn *conn)
{
struct lowpan_peer *peer;
@@ -160,7 +163,7 @@ static inline struct lowpan_peer *__peer_lookup_conn(struct lowpan_dev *dev,
return NULL;
}
-static inline struct lowpan_peer *peer_lookup_dst(struct lowpan_dev *dev,
+static inline struct lowpan_peer *peer_lookup_dst(struct lowpan_btle_dev *dev,
struct in6_addr *daddr,
struct sk_buff *skb)
{
@@ -220,7 +223,7 @@ static inline struct lowpan_peer *peer_lookup_dst(struct lowpan_dev *dev,
static struct lowpan_peer *lookup_peer(struct l2cap_conn *conn)
{
- struct lowpan_dev *entry;
+ struct lowpan_btle_dev *entry;
struct lowpan_peer *peer = NULL;
rcu_read_lock();
@@ -236,10 +239,10 @@ static struct lowpan_peer *lookup_peer(struct l2cap_conn *conn)
return peer;
}
-static struct lowpan_dev *lookup_dev(struct l2cap_conn *conn)
+static struct lowpan_btle_dev *lookup_dev(struct l2cap_conn *conn)
{
- struct lowpan_dev *entry;
- struct lowpan_dev *dev = NULL;
+ struct lowpan_btle_dev *entry;
+ struct lowpan_btle_dev *dev = NULL;
rcu_read_lock();
@@ -270,10 +273,10 @@ static int iphc_decompress(struct sk_buff *skb, struct net_device *netdev,
struct l2cap_chan *chan)
{
const u8 *saddr, *daddr;
- struct lowpan_dev *dev;
+ struct lowpan_btle_dev *dev;
struct lowpan_peer *peer;
- dev = lowpan_dev(netdev);
+ dev = lowpan_btle_dev(netdev);
rcu_read_lock();
peer = __peer_lookup_chan(dev, chan);
@@ -375,7 +378,7 @@ drop:
/* Packet from BT LE device */
static int chan_recv_cb(struct l2cap_chan *chan, struct sk_buff *skb)
{
- struct lowpan_dev *dev;
+ struct lowpan_btle_dev *dev;
struct lowpan_peer *peer;
int err;
@@ -432,7 +435,7 @@ static int setup_header(struct sk_buff *skb, struct net_device *netdev,
{
struct in6_addr ipv6_daddr;
struct ipv6hdr *hdr;
- struct lowpan_dev *dev;
+ struct lowpan_btle_dev *dev;
struct lowpan_peer *peer;
bdaddr_t addr, *any = BDADDR_ANY;
u8 *daddr = any->b;
@@ -440,7 +443,7 @@ static int setup_header(struct sk_buff *skb, struct net_device *netdev,
hdr = ipv6_hdr(skb);
- dev = lowpan_dev(netdev);
+ dev = lowpan_btle_dev(netdev);
memcpy(&ipv6_daddr, &hdr->daddr, sizeof(ipv6_daddr));
@@ -540,19 +543,19 @@ static int send_pkt(struct l2cap_chan *chan, struct sk_buff *skb,
static int send_mcast_pkt(struct sk_buff *skb, struct net_device *netdev)
{
struct sk_buff *local_skb;
- struct lowpan_dev *entry;
+ struct lowpan_btle_dev *entry;
int err = 0;
rcu_read_lock();
list_for_each_entry_rcu(entry, &bt_6lowpan_devices, list) {
struct lowpan_peer *pentry;
- struct lowpan_dev *dev;
+ struct lowpan_btle_dev *dev;
if (entry->netdev != netdev)
continue;
- dev = lowpan_dev(entry->netdev);
+ dev = lowpan_btle_dev(entry->netdev);
list_for_each_entry_rcu(pentry, &dev->peers, list) {
int ret;
@@ -720,8 +723,8 @@ static void ifdown(struct net_device *netdev)
static void do_notify_peers(struct work_struct *work)
{
- struct lowpan_dev *dev = container_of(work, struct lowpan_dev,
- notify_peers.work);
+ struct lowpan_btle_dev *dev = container_of(work, struct lowpan_btle_dev,
+ notify_peers.work);
netdev_notify_peers(dev->netdev); /* send neighbour adv at startup */
}
@@ -763,7 +766,7 @@ static void set_ip_addr_bits(u8 addr_type, u8 *addr)
}
static struct l2cap_chan *add_peer_chan(struct l2cap_chan *chan,
- struct lowpan_dev *dev)
+ struct lowpan_btle_dev *dev)
{
struct lowpan_peer *peer;
@@ -800,12 +803,12 @@ static struct l2cap_chan *add_peer_chan(struct l2cap_chan *chan,
return peer->chan;
}
-static int setup_netdev(struct l2cap_chan *chan, struct lowpan_dev **dev)
+static int setup_netdev(struct l2cap_chan *chan, struct lowpan_btle_dev **dev)
{
struct net_device *netdev;
int err = 0;
- netdev = alloc_netdev(LOWPAN_PRIV_SIZE(sizeof(struct lowpan_dev)),
+ netdev = alloc_netdev(LOWPAN_PRIV_SIZE(sizeof(struct lowpan_btle_dev)),
IFACE_NAME_TEMPLATE, NET_NAME_UNKNOWN,
netdev_setup);
if (!netdev)
@@ -817,7 +820,7 @@ static int setup_netdev(struct l2cap_chan *chan, struct lowpan_dev **dev)
SET_NETDEV_DEV(netdev, &chan->conn->hcon->hdev->dev);
SET_NETDEV_DEVTYPE(netdev, &bt_type);
- *dev = lowpan_dev(netdev);
+ *dev = lowpan_btle_dev(netdev);
(*dev)->netdev = netdev;
(*dev)->hdev = chan->conn->hcon->hdev;
INIT_LIST_HEAD(&(*dev)->peers);
@@ -850,7 +853,7 @@ out:
static inline void chan_ready_cb(struct l2cap_chan *chan)
{
- struct lowpan_dev *dev;
+ struct lowpan_btle_dev *dev;
dev = lookup_dev(chan->conn);
@@ -887,8 +890,9 @@ static inline struct l2cap_chan *chan_new_conn_cb(struct l2cap_chan *pchan)
static void delete_netdev(struct work_struct *work)
{
- struct lowpan_dev *entry = container_of(work, struct lowpan_dev,
- delete_netdev);
+ struct lowpan_btle_dev *entry = container_of(work,
+ struct lowpan_btle_dev,
+ delete_netdev);
lowpan_unregister_netdev(entry->netdev);
@@ -897,8 +901,8 @@ static void delete_netdev(struct work_struct *work)
static void chan_close_cb(struct l2cap_chan *chan)
{
- struct lowpan_dev *entry;
- struct lowpan_dev *dev = NULL;
+ struct lowpan_btle_dev *entry;
+ struct lowpan_btle_dev *dev = NULL;
struct lowpan_peer *peer;
int err = -ENOENT;
bool last = false, remove = true;
@@ -918,7 +922,7 @@ static void chan_close_cb(struct l2cap_chan *chan)
spin_lock(&devices_lock);
list_for_each_entry_rcu(entry, &bt_6lowpan_devices, list) {
- dev = lowpan_dev(entry->netdev);
+ dev = lowpan_btle_dev(entry->netdev);
peer = __peer_lookup_chan(dev, chan);
if (peer) {
last = peer_del(dev, peer);
@@ -1128,7 +1132,7 @@ static int get_l2cap_conn(char *buf, bdaddr_t *addr, u8 *addr_type,
static void disconnect_all_peers(void)
{
- struct lowpan_dev *entry;
+ struct lowpan_btle_dev *entry;
struct lowpan_peer *peer, *tmp_peer, *new_peer;
struct list_head peers;
@@ -1288,7 +1292,7 @@ static ssize_t lowpan_control_write(struct file *fp,
static int lowpan_control_show(struct seq_file *f, void *ptr)
{
- struct lowpan_dev *entry;
+ struct lowpan_btle_dev *entry;
struct lowpan_peer *peer;
spin_lock(&devices_lock);
@@ -1319,7 +1323,7 @@ static const struct file_operations lowpan_control_fops = {
static void disconnect_devices(void)
{
- struct lowpan_dev *entry, *tmp, *new_dev;
+ struct lowpan_btle_dev *entry, *tmp, *new_dev;
struct list_head devices;
INIT_LIST_HEAD(&devices);
@@ -1357,7 +1361,7 @@ static int device_event(struct notifier_block *unused,
unsigned long event, void *ptr)
{
struct net_device *netdev = netdev_notifier_info_to_dev(ptr);
- struct lowpan_dev *entry;
+ struct lowpan_btle_dev *entry;
if (netdev->type != ARPHRD_6LOWPAN)
return NOTIFY_DONE;
diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c
index 955eda93e..3df7aefb7 100644
--- a/net/bluetooth/af_bluetooth.c
+++ b/net/bluetooth/af_bluetooth.c
@@ -65,7 +65,7 @@ static const char *const bt_slock_key_strings[BT_MAX_PROTO] = {
void bt_sock_reclassify_lock(struct sock *sk, int proto)
{
BUG_ON(!sk);
- BUG_ON(sock_owned_by_user(sk));
+ BUG_ON(!sock_allow_reclassification(sk));
sock_lock_init_class_and_name(sk,
bt_slock_key_strings[proto], &bt_slock_key[proto],
diff --git a/net/bluetooth/bnep/netdev.c b/net/bluetooth/bnep/netdev.c
index 6ceb5d36a..f4fcb4a9d 100644
--- a/net/bluetooth/bnep/netdev.c
+++ b/net/bluetooth/bnep/netdev.c
@@ -188,7 +188,7 @@ static netdev_tx_t bnep_net_xmit(struct sk_buff *skb,
* So we have to queue them and wake up session thread which is sleeping
* on the sk_sleep(sk).
*/
- dev->trans_start = jiffies;
+ netif_trans_update(dev);
skb_queue_tail(&sk->sk_write_queue, skb);
wake_up_interruptible(sk_sleep(sk));
diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c
index 2713fc86e..45a9fc68c 100644
--- a/net/bluetooth/hci_core.c
+++ b/net/bluetooth/hci_core.c
@@ -3139,10 +3139,10 @@ void hci_unregister_dev(struct hci_dev *hdev)
list_del(&hdev->list);
write_unlock(&hci_dev_list_lock);
- hci_dev_do_close(hdev);
-
cancel_work_sync(&hdev->power_on);
+ hci_dev_do_close(hdev);
+
if (!test_bit(HCI_INIT, &hdev->flags) &&
!hci_dev_test_flag(hdev, HCI_SETUP) &&
!hci_dev_test_flag(hdev, HCI_CONFIG)) {
diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index c162af5d1..d4b3dd541 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -4727,6 +4727,19 @@ static void process_adv_report(struct hci_dev *hdev, u8 type, bdaddr_t *bdaddr,
u32 flags;
u8 *ptr, real_len;
+ switch (type) {
+ case LE_ADV_IND:
+ case LE_ADV_DIRECT_IND:
+ case LE_ADV_SCAN_IND:
+ case LE_ADV_NONCONN_IND:
+ case LE_ADV_SCAN_RSP:
+ break;
+ default:
+ BT_ERR_RATELIMITED("Unknown advetising packet type: 0x%02x",
+ type);
+ return;
+ }
+
/* Find the end of the data in case the report contains padded zero
* bytes at the end causing an invalid length value.
*
diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c
index 6e125d76d..c045b3c54 100644
--- a/net/bluetooth/hci_request.c
+++ b/net/bluetooth/hci_request.c
@@ -1065,6 +1065,9 @@ static u8 create_instance_adv_data(struct hci_dev *hdev, u8 instance, u8 *ptr)
if (instance_flags & MGMT_ADV_FLAG_LIMITED_DISCOV)
flags |= LE_AD_LIMITED;
+ if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED))
+ flags |= LE_AD_NO_BREDR;
+
if (flags || (instance_flags & MGMT_ADV_FLAG_MANAGED_FLAGS)) {
/* If a discovery flag wasn't provided, simply use the global
* settings.
@@ -1072,9 +1075,6 @@ static u8 create_instance_adv_data(struct hci_dev *hdev, u8 instance, u8 *ptr)
if (!flags)
flags |= mgmt_get_adv_discov_flags(hdev);
- if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED))
- flags |= LE_AD_NO_BREDR;
-
/* If flags would still be empty, then there is no need to
* include the "Flags" AD field".
*/
diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c
index e4cae7289..388ee8b59 100644
--- a/net/bluetooth/l2cap_sock.c
+++ b/net/bluetooth/l2cap_sock.c
@@ -778,7 +778,7 @@ static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname,
}
if (sec.level < BT_SECURITY_LOW ||
- sec.level > BT_SECURITY_HIGH) {
+ sec.level > BT_SECURITY_FIPS) {
err = -EINVAL;
break;
}
diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c
index 160797722..28d5ec269 100644
--- a/net/bridge/br_input.c
+++ b/net/bridge/br_input.c
@@ -213,8 +213,7 @@ drop:
}
EXPORT_SYMBOL_GPL(br_handle_frame_finish);
-/* note: already called with rcu_read_lock */
-static int br_handle_local_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
+static void __br_handle_local_finish(struct sk_buff *skb)
{
struct net_bridge_port *p = br_port_get_rcu(skb->dev);
u16 vid = 0;
@@ -222,6 +221,14 @@ static int br_handle_local_finish(struct net *net, struct sock *sk, struct sk_bu
/* check if vlan is allowed, to avoid spoofing */
if (p->flags & BR_LEARNING && br_should_learn(p, skb, &vid))
br_fdb_update(p->br, p, eth_hdr(skb)->h_source, vid, false);
+}
+
+/* note: already called with rcu_read_lock */
+static int br_handle_local_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+ struct net_bridge_port *p = br_port_get_rcu(skb->dev);
+
+ __br_handle_local_finish(skb);
BR_INPUT_SKB_CB(skb)->brdev = p->br->dev;
br_pass_frame_up(skb);
@@ -274,11 +281,21 @@ rx_handler_result_t br_handle_frame(struct sk_buff **pskb)
if (p->br->stp_enabled == BR_NO_STP ||
fwd_mask & (1u << dest[5]))
goto forward;
- break;
+ *pskb = skb;
+ __br_handle_local_finish(skb);
+ return RX_HANDLER_PASS;
case 0x01: /* IEEE MAC (Pause) */
goto drop;
+ case 0x0E: /* 802.1AB LLDP */
+ fwd_mask |= p->br->group_fwd_mask;
+ if (fwd_mask & (1u << dest[5]))
+ goto forward;
+ *pskb = skb;
+ __br_handle_local_finish(skb);
+ return RX_HANDLER_PASS;
+
default:
/* Allow selective forwarding for most other protocols */
fwd_mask |= p->br->group_fwd_mask;
diff --git a/net/bridge/br_ioctl.c b/net/bridge/br_ioctl.c
index 60a3dbfca..d99b20097 100644
--- a/net/bridge/br_ioctl.c
+++ b/net/bridge/br_ioctl.c
@@ -113,7 +113,9 @@ static int add_del_if(struct net_bridge *br, int ifindex, int isadd)
static int old_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
{
struct net_bridge *br = netdev_priv(dev);
+ struct net_bridge_port *p = NULL;
unsigned long args[4];
+ int ret = -EOPNOTSUPP;
if (copy_from_user(args, rq->ifr_data, sizeof(args)))
return -EFAULT;
@@ -183,25 +185,29 @@ static int old_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
if (!ns_capable(dev_net(dev)->user_ns, CAP_NET_ADMIN))
return -EPERM;
- return br_set_forward_delay(br, args[1]);
+ ret = br_set_forward_delay(br, args[1]);
+ break;
case BRCTL_SET_BRIDGE_HELLO_TIME:
if (!ns_capable(dev_net(dev)->user_ns, CAP_NET_ADMIN))
return -EPERM;
- return br_set_hello_time(br, args[1]);
+ ret = br_set_hello_time(br, args[1]);
+ break;
case BRCTL_SET_BRIDGE_MAX_AGE:
if (!ns_capable(dev_net(dev)->user_ns, CAP_NET_ADMIN))
return -EPERM;
- return br_set_max_age(br, args[1]);
+ ret = br_set_max_age(br, args[1]);
+ break;
case BRCTL_SET_AGEING_TIME:
if (!ns_capable(dev_net(dev)->user_ns, CAP_NET_ADMIN))
return -EPERM;
- return br_set_ageing_time(br, args[1]);
+ ret = br_set_ageing_time(br, args[1]);
+ break;
case BRCTL_GET_PORT_INFO:
{
@@ -241,20 +247,19 @@ static int old_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
return -EPERM;
br_stp_set_enabled(br, args[1]);
- return 0;
+ ret = 0;
+ break;
case BRCTL_SET_BRIDGE_PRIORITY:
if (!ns_capable(dev_net(dev)->user_ns, CAP_NET_ADMIN))
return -EPERM;
br_stp_set_bridge_priority(br, args[1]);
- return 0;
+ ret = 0;
+ break;
case BRCTL_SET_PORT_PRIORITY:
{
- struct net_bridge_port *p;
- int ret;
-
if (!ns_capable(dev_net(dev)->user_ns, CAP_NET_ADMIN))
return -EPERM;
@@ -264,14 +269,11 @@ static int old_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
else
ret = br_stp_set_port_priority(p, args[2]);
spin_unlock_bh(&br->lock);
- return ret;
+ break;
}
case BRCTL_SET_PATH_COST:
{
- struct net_bridge_port *p;
- int ret;
-
if (!ns_capable(dev_net(dev)->user_ns, CAP_NET_ADMIN))
return -EPERM;
@@ -281,8 +283,7 @@ static int old_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
else
ret = br_stp_set_path_cost(p, args[2]);
spin_unlock_bh(&br->lock);
-
- return ret;
+ break;
}
case BRCTL_GET_FDB_ENTRIES:
@@ -290,7 +291,14 @@ static int old_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
args[2], args[3]);
}
- return -EOPNOTSUPP;
+ if (!ret) {
+ if (p)
+ br_ifinfo_notify(RTM_NEWLINK, p);
+ else
+ netdev_state_change(br->dev);
+ }
+
+ return ret;
}
static int old_deviceless(struct net *net, void __user *uarg)
diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c
index 44114a94c..77e7f69bf 100644
--- a/net/bridge/br_netfilter_hooks.c
+++ b/net/bridge/br_netfilter_hooks.c
@@ -217,13 +217,13 @@ static int br_validate_ipv4(struct net *net, struct sk_buff *skb)
len = ntohs(iph->tot_len);
if (skb->len < len) {
- IP_INC_STATS_BH(net, IPSTATS_MIB_INTRUNCATEDPKTS);
+ __IP_INC_STATS(net, IPSTATS_MIB_INTRUNCATEDPKTS);
goto drop;
} else if (len < (iph->ihl*4))
goto inhdr_error;
if (pskb_trim_rcsum(skb, len)) {
- IP_INC_STATS_BH(net, IPSTATS_MIB_INDISCARDS);
+ __IP_INC_STATS(net, IPSTATS_MIB_INDISCARDS);
goto drop;
}
@@ -236,7 +236,7 @@ static int br_validate_ipv4(struct net *net, struct sk_buff *skb)
return 0;
inhdr_error:
- IP_INC_STATS_BH(net, IPSTATS_MIB_INHDRERRORS);
+ __IP_INC_STATS(net, IPSTATS_MIB_INHDRERRORS);
drop:
return -1;
}
@@ -700,7 +700,7 @@ static int
br_nf_ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
int (*output)(struct net *, struct sock *, struct sk_buff *))
{
- unsigned int mtu = ip_skb_dst_mtu(skb);
+ unsigned int mtu = ip_skb_dst_mtu(sk, skb);
struct iphdr *iph = ip_hdr(skb);
if (unlikely(((iph->frag_off & htons(IP_DF)) && !skb->ignore_df) ||
diff --git a/net/bridge/br_netfilter_ipv6.c b/net/bridge/br_netfilter_ipv6.c
index d61f56efc..5e59a8457 100644
--- a/net/bridge/br_netfilter_ipv6.c
+++ b/net/bridge/br_netfilter_ipv6.c
@@ -122,13 +122,13 @@ int br_validate_ipv6(struct net *net, struct sk_buff *skb)
if (pkt_len || hdr->nexthdr != NEXTHDR_HOP) {
if (pkt_len + ip6h_len > skb->len) {
- IP6_INC_STATS_BH(net, idev,
- IPSTATS_MIB_INTRUNCATEDPKTS);
+ __IP6_INC_STATS(net, idev,
+ IPSTATS_MIB_INTRUNCATEDPKTS);
goto drop;
}
if (pskb_trim_rcsum(skb, pkt_len + ip6h_len)) {
- IP6_INC_STATS_BH(net, idev,
- IPSTATS_MIB_INDISCARDS);
+ __IP6_INC_STATS(net, idev,
+ IPSTATS_MIB_INDISCARDS);
goto drop;
}
}
@@ -142,7 +142,7 @@ int br_validate_ipv6(struct net *net, struct sk_buff *skb)
return 0;
inhdr_error:
- IP6_INC_STATS_BH(net, idev, IPSTATS_MIB_INHDRERRORS);
+ __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
drop:
return -1;
}
diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c
index e9c635eae..85e89f693 100644
--- a/net/bridge/br_netlink.c
+++ b/net/bridge/br_netlink.c
@@ -135,9 +135,9 @@ static inline size_t br_port_info_size(void)
+ nla_total_size(sizeof(u16)) /* IFLA_BRPORT_NO */
+ nla_total_size(sizeof(u8)) /* IFLA_BRPORT_TOPOLOGY_CHANGE_ACK */
+ nla_total_size(sizeof(u8)) /* IFLA_BRPORT_CONFIG_PENDING */
- + nla_total_size(sizeof(u64)) /* IFLA_BRPORT_MESSAGE_AGE_TIMER */
- + nla_total_size(sizeof(u64)) /* IFLA_BRPORT_FORWARD_DELAY_TIMER */
- + nla_total_size(sizeof(u64)) /* IFLA_BRPORT_HOLD_TIMER */
+ + nla_total_size_64bit(sizeof(u64)) /* IFLA_BRPORT_MESSAGE_AGE_TIMER */
+ + nla_total_size_64bit(sizeof(u64)) /* IFLA_BRPORT_FORWARD_DELAY_TIMER */
+ + nla_total_size_64bit(sizeof(u64)) /* IFLA_BRPORT_HOLD_TIMER */
#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
+ nla_total_size(sizeof(u8)) /* IFLA_BRPORT_MULTICAST_ROUTER */
#endif
@@ -190,13 +190,16 @@ static int br_port_fill_attrs(struct sk_buff *skb,
return -EMSGSIZE;
timerval = br_timer_value(&p->message_age_timer);
- if (nla_put_u64(skb, IFLA_BRPORT_MESSAGE_AGE_TIMER, timerval))
+ if (nla_put_u64_64bit(skb, IFLA_BRPORT_MESSAGE_AGE_TIMER, timerval,
+ IFLA_BRPORT_PAD))
return -EMSGSIZE;
timerval = br_timer_value(&p->forward_delay_timer);
- if (nla_put_u64(skb, IFLA_BRPORT_FORWARD_DELAY_TIMER, timerval))
+ if (nla_put_u64_64bit(skb, IFLA_BRPORT_FORWARD_DELAY_TIMER, timerval,
+ IFLA_BRPORT_PAD))
return -EMSGSIZE;
timerval = br_timer_value(&p->hold_timer);
- if (nla_put_u64(skb, IFLA_BRPORT_HOLD_TIMER, timerval))
+ if (nla_put_u64_64bit(skb, IFLA_BRPORT_HOLD_TIMER, timerval,
+ IFLA_BRPORT_PAD))
return -EMSGSIZE;
#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
@@ -847,6 +850,7 @@ static const struct nla_policy br_policy[IFLA_BR_MAX + 1] = {
[IFLA_BR_NF_CALL_IP6TABLES] = { .type = NLA_U8 },
[IFLA_BR_NF_CALL_ARPTABLES] = { .type = NLA_U8 },
[IFLA_BR_VLAN_DEFAULT_PVID] = { .type = NLA_U16 },
+ [IFLA_BR_VLAN_STATS_ENABLED] = { .type = NLA_U8 },
};
static int br_changelink(struct net_device *brdev, struct nlattr *tb[],
@@ -918,6 +922,14 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[],
if (err)
return err;
}
+
+ if (data[IFLA_BR_VLAN_STATS_ENABLED]) {
+ __u8 vlan_stats = nla_get_u8(data[IFLA_BR_VLAN_STATS_ENABLED]);
+
+ err = br_vlan_set_stats(br, vlan_stats);
+ if (err)
+ return err;
+ }
#endif
if (data[IFLA_BR_GROUP_FWD_MASK]) {
@@ -1079,6 +1091,7 @@ static size_t br_get_size(const struct net_device *brdev)
#ifdef CONFIG_BRIDGE_VLAN_FILTERING
nla_total_size(sizeof(__be16)) + /* IFLA_BR_VLAN_PROTOCOL */
nla_total_size(sizeof(u16)) + /* IFLA_BR_VLAN_DEFAULT_PVID */
+ nla_total_size(sizeof(u8)) + /* IFLA_BR_VLAN_STATS_ENABLED */
#endif
nla_total_size(sizeof(u16)) + /* IFLA_BR_GROUP_FWD_MASK */
nla_total_size(sizeof(struct ifla_bridge_id)) + /* IFLA_BR_ROOT_ID */
@@ -1087,10 +1100,10 @@ static size_t br_get_size(const struct net_device *brdev)
nla_total_size(sizeof(u32)) + /* IFLA_BR_ROOT_PATH_COST */
nla_total_size(sizeof(u8)) + /* IFLA_BR_TOPOLOGY_CHANGE */
nla_total_size(sizeof(u8)) + /* IFLA_BR_TOPOLOGY_CHANGE_DETECTED */
- nla_total_size(sizeof(u64)) + /* IFLA_BR_HELLO_TIMER */
- nla_total_size(sizeof(u64)) + /* IFLA_BR_TCN_TIMER */
- nla_total_size(sizeof(u64)) + /* IFLA_BR_TOPOLOGY_CHANGE_TIMER */
- nla_total_size(sizeof(u64)) + /* IFLA_BR_GC_TIMER */
+ nla_total_size_64bit(sizeof(u64)) + /* IFLA_BR_HELLO_TIMER */
+ nla_total_size_64bit(sizeof(u64)) + /* IFLA_BR_TCN_TIMER */
+ nla_total_size_64bit(sizeof(u64)) + /* IFLA_BR_TOPOLOGY_CHANGE_TIMER */
+ nla_total_size_64bit(sizeof(u64)) + /* IFLA_BR_GC_TIMER */
nla_total_size(ETH_ALEN) + /* IFLA_BR_GROUP_ADDR */
#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
nla_total_size(sizeof(u8)) + /* IFLA_BR_MCAST_ROUTER */
@@ -1101,12 +1114,12 @@ static size_t br_get_size(const struct net_device *brdev)
nla_total_size(sizeof(u32)) + /* IFLA_BR_MCAST_HASH_MAX */
nla_total_size(sizeof(u32)) + /* IFLA_BR_MCAST_LAST_MEMBER_CNT */
nla_total_size(sizeof(u32)) + /* IFLA_BR_MCAST_STARTUP_QUERY_CNT */
- nla_total_size(sizeof(u64)) + /* IFLA_BR_MCAST_LAST_MEMBER_INTVL */
- nla_total_size(sizeof(u64)) + /* IFLA_BR_MCAST_MEMBERSHIP_INTVL */
- nla_total_size(sizeof(u64)) + /* IFLA_BR_MCAST_QUERIER_INTVL */
- nla_total_size(sizeof(u64)) + /* IFLA_BR_MCAST_QUERY_INTVL */
- nla_total_size(sizeof(u64)) + /* IFLA_BR_MCAST_QUERY_RESPONSE_INTVL */
- nla_total_size(sizeof(u64)) + /* IFLA_BR_MCAST_STARTUP_QUERY_INTVL */
+ nla_total_size_64bit(sizeof(u64)) + /* IFLA_BR_MCAST_LAST_MEMBER_INTVL */
+ nla_total_size_64bit(sizeof(u64)) + /* IFLA_BR_MCAST_MEMBERSHIP_INTVL */
+ nla_total_size_64bit(sizeof(u64)) + /* IFLA_BR_MCAST_QUERIER_INTVL */
+ nla_total_size_64bit(sizeof(u64)) + /* IFLA_BR_MCAST_QUERY_INTVL */
+ nla_total_size_64bit(sizeof(u64)) + /* IFLA_BR_MCAST_QUERY_RESPONSE_INTVL */
+ nla_total_size_64bit(sizeof(u64)) + /* IFLA_BR_MCAST_STARTUP_QUERY_INTVL */
#endif
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
nla_total_size(sizeof(u8)) + /* IFLA_BR_NF_CALL_IPTABLES */
@@ -1129,16 +1142,17 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev)
u64 clockval;
clockval = br_timer_value(&br->hello_timer);
- if (nla_put_u64(skb, IFLA_BR_HELLO_TIMER, clockval))
+ if (nla_put_u64_64bit(skb, IFLA_BR_HELLO_TIMER, clockval, IFLA_BR_PAD))
return -EMSGSIZE;
clockval = br_timer_value(&br->tcn_timer);
- if (nla_put_u64(skb, IFLA_BR_TCN_TIMER, clockval))
+ if (nla_put_u64_64bit(skb, IFLA_BR_TCN_TIMER, clockval, IFLA_BR_PAD))
return -EMSGSIZE;
clockval = br_timer_value(&br->topology_change_timer);
- if (nla_put_u64(skb, IFLA_BR_TOPOLOGY_CHANGE_TIMER, clockval))
+ if (nla_put_u64_64bit(skb, IFLA_BR_TOPOLOGY_CHANGE_TIMER, clockval,
+ IFLA_BR_PAD))
return -EMSGSIZE;
clockval = br_timer_value(&br->gc_timer);
- if (nla_put_u64(skb, IFLA_BR_GC_TIMER, clockval))
+ if (nla_put_u64_64bit(skb, IFLA_BR_GC_TIMER, clockval, IFLA_BR_PAD))
return -EMSGSIZE;
if (nla_put_u32(skb, IFLA_BR_FORWARD_DELAY, forward_delay) ||
@@ -1163,7 +1177,8 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev)
#ifdef CONFIG_BRIDGE_VLAN_FILTERING
if (nla_put_be16(skb, IFLA_BR_VLAN_PROTOCOL, br->vlan_proto) ||
- nla_put_u16(skb, IFLA_BR_VLAN_DEFAULT_PVID, br->default_pvid))
+ nla_put_u16(skb, IFLA_BR_VLAN_DEFAULT_PVID, br->default_pvid) ||
+ nla_put_u8(skb, IFLA_BR_VLAN_STATS_ENABLED, br->vlan_stats_enabled))
return -EMSGSIZE;
#endif
#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
@@ -1182,22 +1197,28 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev)
return -EMSGSIZE;
clockval = jiffies_to_clock_t(br->multicast_last_member_interval);
- if (nla_put_u64(skb, IFLA_BR_MCAST_LAST_MEMBER_INTVL, clockval))
+ if (nla_put_u64_64bit(skb, IFLA_BR_MCAST_LAST_MEMBER_INTVL, clockval,
+ IFLA_BR_PAD))
return -EMSGSIZE;
clockval = jiffies_to_clock_t(br->multicast_membership_interval);
- if (nla_put_u64(skb, IFLA_BR_MCAST_MEMBERSHIP_INTVL, clockval))
+ if (nla_put_u64_64bit(skb, IFLA_BR_MCAST_MEMBERSHIP_INTVL, clockval,
+ IFLA_BR_PAD))
return -EMSGSIZE;
clockval = jiffies_to_clock_t(br->multicast_querier_interval);
- if (nla_put_u64(skb, IFLA_BR_MCAST_QUERIER_INTVL, clockval))
+ if (nla_put_u64_64bit(skb, IFLA_BR_MCAST_QUERIER_INTVL, clockval,
+ IFLA_BR_PAD))
return -EMSGSIZE;
clockval = jiffies_to_clock_t(br->multicast_query_interval);
- if (nla_put_u64(skb, IFLA_BR_MCAST_QUERY_INTVL, clockval))
+ if (nla_put_u64_64bit(skb, IFLA_BR_MCAST_QUERY_INTVL, clockval,
+ IFLA_BR_PAD))
return -EMSGSIZE;
clockval = jiffies_to_clock_t(br->multicast_query_response_interval);
- if (nla_put_u64(skb, IFLA_BR_MCAST_QUERY_RESPONSE_INTVL, clockval))
+ if (nla_put_u64_64bit(skb, IFLA_BR_MCAST_QUERY_RESPONSE_INTVL, clockval,
+ IFLA_BR_PAD))
return -EMSGSIZE;
clockval = jiffies_to_clock_t(br->multicast_startup_query_interval);
- if (nla_put_u64(skb, IFLA_BR_MCAST_STARTUP_QUERY_INTVL, clockval))
+ if (nla_put_u64_64bit(skb, IFLA_BR_MCAST_STARTUP_QUERY_INTVL, clockval,
+ IFLA_BR_PAD))
return -EMSGSIZE;
#endif
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
@@ -1213,6 +1234,69 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev)
return 0;
}
+static size_t br_get_linkxstats_size(const struct net_device *dev)
+{
+ struct net_bridge *br = netdev_priv(dev);
+ struct net_bridge_vlan_group *vg;
+ struct net_bridge_vlan *v;
+ int numvls = 0;
+
+ vg = br_vlan_group(br);
+ if (!vg)
+ return 0;
+
+ /* we need to count all, even placeholder entries */
+ list_for_each_entry(v, &vg->vlan_list, vlist)
+ numvls++;
+
+ /* account for the vlans and the link xstats type nest attribute */
+ return numvls * nla_total_size(sizeof(struct bridge_vlan_xstats)) +
+ nla_total_size(0);
+}
+
+static int br_fill_linkxstats(struct sk_buff *skb, const struct net_device *dev,
+ int *prividx)
+{
+ struct net_bridge *br = netdev_priv(dev);
+ struct net_bridge_vlan_group *vg;
+ struct net_bridge_vlan *v;
+ struct nlattr *nest;
+ int vl_idx = 0;
+
+ vg = br_vlan_group(br);
+ if (!vg)
+ goto out;
+ nest = nla_nest_start(skb, LINK_XSTATS_TYPE_BRIDGE);
+ if (!nest)
+ return -EMSGSIZE;
+ list_for_each_entry(v, &vg->vlan_list, vlist) {
+ struct bridge_vlan_xstats vxi;
+ struct br_vlan_stats stats;
+
+ if (++vl_idx < *prividx)
+ continue;
+ memset(&vxi, 0, sizeof(vxi));
+ vxi.vid = v->vid;
+ br_vlan_get_stats(v, &stats);
+ vxi.rx_bytes = stats.rx_bytes;
+ vxi.rx_packets = stats.rx_packets;
+ vxi.tx_bytes = stats.tx_bytes;
+ vxi.tx_packets = stats.tx_packets;
+
+ if (nla_put(skb, BRIDGE_XSTATS_VLAN, sizeof(vxi), &vxi))
+ goto nla_put_failure;
+ }
+ nla_nest_end(skb, nest);
+ *prividx = 0;
+out:
+ return 0;
+
+nla_put_failure:
+ nla_nest_end(skb, nest);
+ *prividx = vl_idx;
+
+ return -EMSGSIZE;
+}
static struct rtnl_af_ops br_af_ops __read_mostly = {
.family = AF_BRIDGE,
@@ -1231,6 +1315,8 @@ struct rtnl_link_ops br_link_ops __read_mostly = {
.dellink = br_dev_delete,
.get_size = br_get_size,
.fill_info = br_fill_info,
+ .fill_linkxstats = br_fill_linkxstats,
+ .get_linkxstats_size = br_get_linkxstats_size,
.slave_maxtype = IFLA_BRPORT_MAX,
.slave_policy = br_port_policy,
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index f516c53ba..52edecf3c 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -77,12 +77,21 @@ struct bridge_mcast_querier {
};
#endif
+struct br_vlan_stats {
+ u64 rx_bytes;
+ u64 rx_packets;
+ u64 tx_bytes;
+ u64 tx_packets;
+ struct u64_stats_sync syncp;
+};
+
/**
* struct net_bridge_vlan - per-vlan entry
*
* @vnode: rhashtable member
* @vid: VLAN id
* @flags: bridge vlan flags
+ * @stats: per-cpu VLAN statistics
* @br: if MASTER flag set, this points to a bridge struct
* @port: if MASTER flag unset, this points to a port struct
* @refcnt: if MASTER flag set, this is bumped for each port referencing it
@@ -100,6 +109,7 @@ struct net_bridge_vlan {
struct rhash_head vnode;
u16 vid;
u16 flags;
+ struct br_vlan_stats __percpu *stats;
union {
struct net_bridge *br;
struct net_bridge_port *port;
@@ -343,6 +353,7 @@ struct net_bridge
#ifdef CONFIG_BRIDGE_VLAN_FILTERING
struct net_bridge_vlan_group __rcu *vlgrp;
u8 vlan_enabled;
+ u8 vlan_stats_enabled;
__be16 vlan_proto;
u16 default_pvid;
#endif
@@ -706,6 +717,7 @@ int __br_vlan_filter_toggle(struct net_bridge *br, unsigned long val);
int br_vlan_filter_toggle(struct net_bridge *br, unsigned long val);
int __br_vlan_set_proto(struct net_bridge *br, __be16 proto);
int br_vlan_set_proto(struct net_bridge *br, unsigned long val);
+int br_vlan_set_stats(struct net_bridge *br, unsigned long val);
int br_vlan_init(struct net_bridge *br);
int br_vlan_set_default_pvid(struct net_bridge *br, unsigned long val);
int __br_vlan_set_default_pvid(struct net_bridge *br, u16 pvid);
@@ -714,6 +726,8 @@ int nbp_vlan_delete(struct net_bridge_port *port, u16 vid);
void nbp_vlan_flush(struct net_bridge_port *port);
int nbp_vlan_init(struct net_bridge_port *port);
int nbp_get_num_vlan_infos(struct net_bridge_port *p, u32 filter_mask);
+void br_vlan_get_stats(const struct net_bridge_vlan *v,
+ struct br_vlan_stats *stats);
static inline struct net_bridge_vlan_group *br_vlan_group(
const struct net_bridge *br)
@@ -896,6 +910,10 @@ static inline struct net_bridge_vlan_group *nbp_vlan_group_rcu(
return NULL;
}
+static inline void br_vlan_get_stats(const struct net_bridge_vlan *v,
+ struct br_vlan_stats *stats)
+{
+}
#endif
struct nf_br_ops {
diff --git a/net/bridge/br_sysfs_br.c b/net/bridge/br_sysfs_br.c
index 6b8091407..beb47071e 100644
--- a/net/bridge/br_sysfs_br.c
+++ b/net/bridge/br_sysfs_br.c
@@ -43,7 +43,14 @@ static ssize_t store_bridge_parm(struct device *d,
if (endp == buf)
return -EINVAL;
+ if (!rtnl_trylock())
+ return restart_syscall();
+
err = (*set)(br, val);
+ if (!err)
+ netdev_state_change(br->dev);
+ rtnl_unlock();
+
return err ? err : len;
}
@@ -101,15 +108,7 @@ static ssize_t ageing_time_show(struct device *d,
static int set_ageing_time(struct net_bridge *br, unsigned long val)
{
- int ret;
-
- if (!rtnl_trylock())
- return restart_syscall();
-
- ret = br_set_ageing_time(br, val);
- rtnl_unlock();
-
- return ret;
+ return br_set_ageing_time(br, val);
}
static ssize_t ageing_time_store(struct device *d,
@@ -128,27 +127,18 @@ static ssize_t stp_state_show(struct device *d,
}
+static int set_stp_state(struct net_bridge *br, unsigned long val)
+{
+ br_stp_set_enabled(br, val);
+
+ return 0;
+}
+
static ssize_t stp_state_store(struct device *d,
struct device_attribute *attr, const char *buf,
size_t len)
{
- struct net_bridge *br = to_bridge(d);
- char *endp;
- unsigned long val;
-
- if (!ns_capable(dev_net(br->dev)->user_ns, CAP_NET_ADMIN))
- return -EPERM;
-
- val = simple_strtoul(buf, &endp, 0);
- if (endp == buf)
- return -EINVAL;
-
- if (!rtnl_trylock())
- return restart_syscall();
- br_stp_set_enabled(br, val);
- rtnl_unlock();
-
- return len;
+ return store_bridge_parm(d, buf, len, set_stp_state);
}
static DEVICE_ATTR_RW(stp_state);
@@ -160,29 +150,22 @@ static ssize_t group_fwd_mask_show(struct device *d,
return sprintf(buf, "%#x\n", br->group_fwd_mask);
}
-
-static ssize_t group_fwd_mask_store(struct device *d,
- struct device_attribute *attr,
- const char *buf,
- size_t len)
+static int set_group_fwd_mask(struct net_bridge *br, unsigned long val)
{
- struct net_bridge *br = to_bridge(d);
- char *endp;
- unsigned long val;
-
- if (!ns_capable(dev_net(br->dev)->user_ns, CAP_NET_ADMIN))
- return -EPERM;
-
- val = simple_strtoul(buf, &endp, 0);
- if (endp == buf)
- return -EINVAL;
-
if (val & BR_GROUPFWD_RESTRICTED)
return -EINVAL;
br->group_fwd_mask = val;
- return len;
+ return 0;
+}
+
+static ssize_t group_fwd_mask_store(struct device *d,
+ struct device_attribute *attr,
+ const char *buf,
+ size_t len)
+{
+ return store_bridge_parm(d, buf, len, set_group_fwd_mask);
}
static DEVICE_ATTR_RW(group_fwd_mask);
@@ -328,6 +311,7 @@ static ssize_t group_addr_store(struct device *d,
br->group_addr_set = true;
br_recalculate_fwd_mask(br);
+ netdev_state_change(br->dev);
rtnl_unlock();
@@ -336,17 +320,17 @@ static ssize_t group_addr_store(struct device *d,
static DEVICE_ATTR_RW(group_addr);
+static int set_flush(struct net_bridge *br, unsigned long val)
+{
+ br_fdb_flush(br);
+ return 0;
+}
+
static ssize_t flush_store(struct device *d,
struct device_attribute *attr,
const char *buf, size_t len)
{
- struct net_bridge *br = to_bridge(d);
-
- if (!ns_capable(dev_net(br->dev)->user_ns, CAP_NET_ADMIN))
- return -EPERM;
-
- br_fdb_flush(br);
- return len;
+ return store_bridge_parm(d, buf, len, set_flush);
}
static DEVICE_ATTR_WO(flush);
@@ -747,6 +731,22 @@ static ssize_t default_pvid_store(struct device *d,
return store_bridge_parm(d, buf, len, br_vlan_set_default_pvid);
}
static DEVICE_ATTR_RW(default_pvid);
+
+static ssize_t vlan_stats_enabled_show(struct device *d,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct net_bridge *br = to_bridge(d);
+ return sprintf(buf, "%u\n", br->vlan_stats_enabled);
+}
+
+static ssize_t vlan_stats_enabled_store(struct device *d,
+ struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ return store_bridge_parm(d, buf, len, br_vlan_set_stats);
+}
+static DEVICE_ATTR_RW(vlan_stats_enabled);
#endif
static struct attribute *bridge_attrs[] = {
@@ -794,6 +794,7 @@ static struct attribute *bridge_attrs[] = {
&dev_attr_vlan_filtering.attr,
&dev_attr_vlan_protocol.attr,
&dev_attr_default_pvid.attr,
+ &dev_attr_vlan_stats_enabled.attr,
#endif
NULL
};
diff --git a/net/bridge/br_sysfs_if.c b/net/bridge/br_sysfs_if.c
index efe415ad8..1e04d4d44 100644
--- a/net/bridge/br_sysfs_if.c
+++ b/net/bridge/br_sysfs_if.c
@@ -61,7 +61,6 @@ static int store_flag(struct net_bridge_port *p, unsigned long v,
if (flags != p->flags) {
p->flags = flags;
br_port_flags_change(p, mask);
- br_ifinfo_notify(RTM_NEWLINK, p);
}
return 0;
}
@@ -253,8 +252,10 @@ static ssize_t brport_store(struct kobject *kobj,
spin_lock_bh(&p->br->lock);
ret = brport_attr->store(p, val);
spin_unlock_bh(&p->br->lock);
- if (ret == 0)
+ if (!ret) {
+ br_ifinfo_notify(RTM_NEWLINK, p);
ret = count;
+ }
}
rtnl_unlock();
}
diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c
index 9309bb4f2..b6de4f457 100644
--- a/net/bridge/br_vlan.c
+++ b/net/bridge/br_vlan.c
@@ -162,6 +162,17 @@ static struct net_bridge_vlan *br_vlan_get_master(struct net_bridge *br, u16 vid
return masterv;
}
+static void br_master_vlan_rcu_free(struct rcu_head *rcu)
+{
+ struct net_bridge_vlan *v;
+
+ v = container_of(rcu, struct net_bridge_vlan, rcu);
+ WARN_ON(!br_vlan_is_master(v));
+ free_percpu(v->stats);
+ v->stats = NULL;
+ kfree(v);
+}
+
static void br_vlan_put_master(struct net_bridge_vlan *masterv)
{
struct net_bridge_vlan_group *vg;
@@ -174,7 +185,7 @@ static void br_vlan_put_master(struct net_bridge_vlan *masterv)
rhashtable_remove_fast(&vg->vlan_hash,
&masterv->vnode, br_vlan_rht_params);
__vlan_del_list(masterv);
- kfree_rcu(masterv, rcu);
+ call_rcu(&masterv->rcu, br_master_vlan_rcu_free);
}
}
@@ -230,6 +241,7 @@ static int __vlan_add(struct net_bridge_vlan *v, u16 flags)
if (!masterv)
goto out_filt;
v->brvlan = masterv;
+ v->stats = masterv->stats;
}
/* Add the dev mac and count the vlan only if it's usable */
@@ -329,6 +341,7 @@ struct sk_buff *br_handle_vlan(struct net_bridge *br,
struct net_bridge_vlan_group *vg,
struct sk_buff *skb)
{
+ struct br_vlan_stats *stats;
struct net_bridge_vlan *v;
u16 vid;
@@ -355,18 +368,27 @@ struct sk_buff *br_handle_vlan(struct net_bridge *br,
return NULL;
}
}
+ if (br->vlan_stats_enabled) {
+ stats = this_cpu_ptr(v->stats);
+ u64_stats_update_begin(&stats->syncp);
+ stats->tx_bytes += skb->len;
+ stats->tx_packets++;
+ u64_stats_update_end(&stats->syncp);
+ }
+
if (v->flags & BRIDGE_VLAN_INFO_UNTAGGED)
skb->vlan_tci = 0;
-
out:
return skb;
}
/* Called under RCU */
-static bool __allowed_ingress(struct net_bridge_vlan_group *vg, __be16 proto,
+static bool __allowed_ingress(const struct net_bridge *br,
+ struct net_bridge_vlan_group *vg,
struct sk_buff *skb, u16 *vid)
{
- const struct net_bridge_vlan *v;
+ struct br_vlan_stats *stats;
+ struct net_bridge_vlan *v;
bool tagged;
BR_INPUT_SKB_CB(skb)->vlan_filtered = true;
@@ -375,7 +397,7 @@ static bool __allowed_ingress(struct net_bridge_vlan_group *vg, __be16 proto,
* HW accelerated vlan tag.
*/
if (unlikely(!skb_vlan_tag_present(skb) &&
- skb->protocol == proto)) {
+ skb->protocol == br->vlan_proto)) {
skb = skb_vlan_untag(skb);
if (unlikely(!skb))
return false;
@@ -383,7 +405,7 @@ static bool __allowed_ingress(struct net_bridge_vlan_group *vg, __be16 proto,
if (!br_vlan_get_tag(skb, vid)) {
/* Tagged frame */
- if (skb->vlan_proto != proto) {
+ if (skb->vlan_proto != br->vlan_proto) {
/* Protocol-mismatch, empty out vlan_tci for new tag */
skb_push(skb, ETH_HLEN);
skb = vlan_insert_tag_set_proto(skb, skb->vlan_proto,
@@ -419,7 +441,7 @@ static bool __allowed_ingress(struct net_bridge_vlan_group *vg, __be16 proto,
*vid = pvid;
if (likely(!tagged))
/* Untagged Frame. */
- __vlan_hwaccel_put_tag(skb, proto, pvid);
+ __vlan_hwaccel_put_tag(skb, br->vlan_proto, pvid);
else
/* Priority-tagged Frame.
* At this point, We know that skb->vlan_tci had
@@ -428,13 +450,24 @@ static bool __allowed_ingress(struct net_bridge_vlan_group *vg, __be16 proto,
*/
skb->vlan_tci |= pvid;
- return true;
+ /* if stats are disabled we can avoid the lookup */
+ if (!br->vlan_stats_enabled)
+ return true;
}
-
- /* Frame had a valid vlan tag. See if vlan is allowed */
v = br_vlan_find(vg, *vid);
- if (v && br_vlan_should_use(v))
- return true;
+ if (!v || !br_vlan_should_use(v))
+ goto drop;
+
+ if (br->vlan_stats_enabled) {
+ stats = this_cpu_ptr(v->stats);
+ u64_stats_update_begin(&stats->syncp);
+ stats->rx_bytes += skb->len;
+ stats->rx_packets++;
+ u64_stats_update_end(&stats->syncp);
+ }
+
+ return true;
+
drop:
kfree_skb(skb);
return false;
@@ -452,7 +485,7 @@ bool br_allowed_ingress(const struct net_bridge *br,
return true;
}
- return __allowed_ingress(vg, br->vlan_proto, skb, vid);
+ return __allowed_ingress(br, vg, skb, vid);
}
/* Called under RCU. */
@@ -542,6 +575,11 @@ int br_vlan_add(struct net_bridge *br, u16 vid, u16 flags)
if (!vlan)
return -ENOMEM;
+ vlan->stats = netdev_alloc_pcpu_stats(struct br_vlan_stats);
+ if (!vlan->stats) {
+ kfree(vlan);
+ return -ENOMEM;
+ }
vlan->vid = vid;
vlan->flags = flags | BRIDGE_VLAN_INFO_MASTER;
vlan->flags &= ~BRIDGE_VLAN_INFO_PVID;
@@ -549,8 +587,10 @@ int br_vlan_add(struct net_bridge *br, u16 vid, u16 flags)
if (flags & BRIDGE_VLAN_INFO_BRENTRY)
atomic_set(&vlan->refcnt, 1);
ret = __vlan_add(vlan, flags);
- if (ret)
+ if (ret) {
+ free_percpu(vlan->stats);
kfree(vlan);
+ }
return ret;
}
@@ -651,15 +691,7 @@ int __br_vlan_filter_toggle(struct net_bridge *br, unsigned long val)
int br_vlan_filter_toggle(struct net_bridge *br, unsigned long val)
{
- int err;
-
- if (!rtnl_trylock())
- return restart_syscall();
-
- err = __br_vlan_filter_toggle(br, val);
- rtnl_unlock();
-
- return err;
+ return __br_vlan_filter_toggle(br, val);
}
int __br_vlan_set_proto(struct net_bridge *br, __be16 proto)
@@ -713,18 +745,24 @@ err_filt:
int br_vlan_set_proto(struct net_bridge *br, unsigned long val)
{
- int err;
-
if (val != ETH_P_8021Q && val != ETH_P_8021AD)
return -EPROTONOSUPPORT;
- if (!rtnl_trylock())
- return restart_syscall();
+ return __br_vlan_set_proto(br, htons(val));
+}
- err = __br_vlan_set_proto(br, htons(val));
- rtnl_unlock();
+int br_vlan_set_stats(struct net_bridge *br, unsigned long val)
+{
+ switch (val) {
+ case 0:
+ case 1:
+ br->vlan_stats_enabled = val;
+ break;
+ default:
+ return -EINVAL;
+ }
- return err;
+ return 0;
}
static bool vlan_default_pvid(struct net_bridge_vlan_group *vg, u16 vid)
@@ -855,21 +893,17 @@ int br_vlan_set_default_pvid(struct net_bridge *br, unsigned long val)
if (val >= VLAN_VID_MASK)
return -EINVAL;
- if (!rtnl_trylock())
- return restart_syscall();
-
if (pvid == br->default_pvid)
- goto unlock;
+ goto out;
/* Only allow default pvid change when filtering is disabled */
if (br->vlan_enabled) {
pr_info_once("Please disable vlan filtering to change default_pvid\n");
err = -EPERM;
- goto unlock;
+ goto out;
}
err = __br_vlan_set_default_pvid(br, pvid);
-unlock:
- rtnl_unlock();
+out:
return err;
}
@@ -1020,3 +1054,30 @@ void nbp_vlan_flush(struct net_bridge_port *port)
synchronize_rcu();
__vlan_group_free(vg);
}
+
+void br_vlan_get_stats(const struct net_bridge_vlan *v,
+ struct br_vlan_stats *stats)
+{
+ int i;
+
+ memset(stats, 0, sizeof(*stats));
+ for_each_possible_cpu(i) {
+ u64 rxpackets, rxbytes, txpackets, txbytes;
+ struct br_vlan_stats *cpu_stats;
+ unsigned int start;
+
+ cpu_stats = per_cpu_ptr(v->stats, i);
+ do {
+ start = u64_stats_fetch_begin_irq(&cpu_stats->syncp);
+ rxpackets = cpu_stats->rx_packets;
+ rxbytes = cpu_stats->rx_bytes;
+ txbytes = cpu_stats->tx_bytes;
+ txpackets = cpu_stats->tx_packets;
+ } while (u64_stats_fetch_retry_irq(&cpu_stats->syncp, start));
+
+ stats->rx_packets += rxpackets;
+ stats->rx_bytes += rxbytes;
+ stats->tx_bytes += txbytes;
+ stats->tx_packets += txpackets;
+ }
+}
diff --git a/net/bridge/netfilter/nf_tables_bridge.c b/net/bridge/netfilter/nf_tables_bridge.c
index 7fcdd7261..a78c4e282 100644
--- a/net/bridge/netfilter/nf_tables_bridge.c
+++ b/net/bridge/netfilter/nf_tables_bridge.c
@@ -162,15 +162,57 @@ static const struct nf_chain_type filter_bridge = {
(1 << NF_BR_POST_ROUTING),
};
+static void nf_br_saveroute(const struct sk_buff *skb,
+ struct nf_queue_entry *entry)
+{
+}
+
+static int nf_br_reroute(struct net *net, struct sk_buff *skb,
+ const struct nf_queue_entry *entry)
+{
+ return 0;
+}
+
+static __sum16 nf_br_checksum(struct sk_buff *skb, unsigned int hook,
+ unsigned int dataoff, u_int8_t protocol)
+{
+ return 0;
+}
+
+static __sum16 nf_br_checksum_partial(struct sk_buff *skb, unsigned int hook,
+ unsigned int dataoff, unsigned int len,
+ u_int8_t protocol)
+{
+ return 0;
+}
+
+static int nf_br_route(struct net *net, struct dst_entry **dst,
+ struct flowi *fl, bool strict __always_unused)
+{
+ return 0;
+}
+
+static const struct nf_afinfo nf_br_afinfo = {
+ .family = AF_BRIDGE,
+ .checksum = nf_br_checksum,
+ .checksum_partial = nf_br_checksum_partial,
+ .route = nf_br_route,
+ .saveroute = nf_br_saveroute,
+ .reroute = nf_br_reroute,
+ .route_key_size = 0,
+};
+
static int __init nf_tables_bridge_init(void)
{
int ret;
+ nf_register_afinfo(&nf_br_afinfo);
nft_register_chain_type(&filter_bridge);
ret = register_pernet_subsys(&nf_tables_bridge_net_ops);
- if (ret < 0)
+ if (ret < 0) {
nft_unregister_chain_type(&filter_bridge);
-
+ nf_unregister_afinfo(&nf_br_afinfo);
+ }
return ret;
}
@@ -178,6 +220,7 @@ static void __exit nf_tables_bridge_exit(void)
{
unregister_pernet_subsys(&nf_tables_bridge_net_ops);
nft_unregister_chain_type(&filter_bridge);
+ nf_unregister_afinfo(&nf_br_afinfo);
}
module_init(nf_tables_bridge_init);
diff --git a/net/can/raw.c b/net/can/raw.c
index 2e67b1423..972c187d4 100644
--- a/net/can/raw.c
+++ b/net/can/raw.c
@@ -755,7 +755,7 @@ static int raw_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
if (err < 0)
goto free_skb;
- sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
+ sock_tx_timestamp(sk, sk->sk_tsflags, &skb_shinfo(skb)->tx_flags);
skb->dev = dev;
skb->sk = sk;
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
index dcc18c6f7..55d2bfee1 100644
--- a/net/ceph/ceph_common.c
+++ b/net/ceph/ceph_common.c
@@ -651,7 +651,7 @@ EXPORT_SYMBOL(ceph_destroy_client);
/*
* true if we have the mon map (and have thus joined the cluster)
*/
-static int have_mon_and_osd_map(struct ceph_client *client)
+static bool have_mon_and_osd_map(struct ceph_client *client)
{
return client->monc.monmap && client->monc.monmap->epoch &&
client->osdc.osdmap && client->osdc.osdmap->epoch;
diff --git a/net/ceph/ceph_strings.c b/net/ceph/ceph_strings.c
index 139a9cb19..3773a4fa1 100644
--- a/net/ceph/ceph_strings.c
+++ b/net/ceph/ceph_strings.c
@@ -27,6 +27,22 @@ __CEPH_FORALL_OSD_OPS(GENERATE_CASE)
}
}
+const char *ceph_osd_watch_op_name(int o)
+{
+ switch (o) {
+ case CEPH_OSD_WATCH_OP_UNWATCH:
+ return "unwatch";
+ case CEPH_OSD_WATCH_OP_WATCH:
+ return "watch";
+ case CEPH_OSD_WATCH_OP_RECONNECT:
+ return "reconnect";
+ case CEPH_OSD_WATCH_OP_PING:
+ return "ping";
+ default:
+ return "???";
+ }
+}
+
const char *ceph_osd_state_name(int s)
{
switch (s) {
diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c
index b902fbc78..e77b04ca7 100644
--- a/net/ceph/debugfs.c
+++ b/net/ceph/debugfs.c
@@ -54,24 +54,25 @@ static int osdmap_show(struct seq_file *s, void *p)
{
int i;
struct ceph_client *client = s->private;
- struct ceph_osdmap *map = client->osdc.osdmap;
+ struct ceph_osd_client *osdc = &client->osdc;
+ struct ceph_osdmap *map = osdc->osdmap;
struct rb_node *n;
if (map == NULL)
return 0;
- seq_printf(s, "epoch %d\n", map->epoch);
- seq_printf(s, "flags%s%s\n",
- (map->flags & CEPH_OSDMAP_NEARFULL) ? " NEARFULL" : "",
- (map->flags & CEPH_OSDMAP_FULL) ? " FULL" : "");
+ down_read(&osdc->lock);
+ seq_printf(s, "epoch %d flags 0x%x\n", map->epoch, map->flags);
for (n = rb_first(&map->pg_pools); n; n = rb_next(n)) {
- struct ceph_pg_pool_info *pool =
+ struct ceph_pg_pool_info *pi =
rb_entry(n, struct ceph_pg_pool_info, node);
- seq_printf(s, "pool %lld pg_num %u (%d) read_tier %lld write_tier %lld\n",
- pool->id, pool->pg_num, pool->pg_num_mask,
- pool->read_tier, pool->write_tier);
+ seq_printf(s, "pool %lld '%s' type %d size %d min_size %d pg_num %u pg_num_mask %d flags 0x%llx lfor %u read_tier %lld write_tier %lld\n",
+ pi->id, pi->name, pi->type, pi->size, pi->min_size,
+ pi->pg_num, pi->pg_num_mask, pi->flags,
+ pi->last_force_request_resend, pi->read_tier,
+ pi->write_tier);
}
for (i = 0; i < map->max_osd; i++) {
struct ceph_entity_addr *addr = &map->osd_addr[i];
@@ -103,6 +104,7 @@ static int osdmap_show(struct seq_file *s, void *p)
pg->pgid.seed, pg->primary_temp.osd);
}
+ up_read(&osdc->lock);
return 0;
}
@@ -126,6 +128,7 @@ static int monc_show(struct seq_file *s, void *p)
CEPH_SUBSCRIBE_ONETIME ? "" : "+"));
seq_putc(s, '\n');
}
+ seq_printf(s, "fs_cluster_id %d\n", monc->fs_cluster_id);
for (rp = rb_first(&monc->generic_request_tree); rp; rp = rb_next(rp)) {
__u16 op;
@@ -143,43 +146,113 @@ static int monc_show(struct seq_file *s, void *p)
return 0;
}
-static int osdc_show(struct seq_file *s, void *pp)
+static void dump_target(struct seq_file *s, struct ceph_osd_request_target *t)
{
- struct ceph_client *client = s->private;
- struct ceph_osd_client *osdc = &client->osdc;
- struct rb_node *p;
+ int i;
- mutex_lock(&osdc->request_mutex);
- for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
- struct ceph_osd_request *req;
- unsigned int i;
- int opcode;
+ seq_printf(s, "osd%d\t%llu.%x\t[", t->osd, t->pgid.pool, t->pgid.seed);
+ for (i = 0; i < t->up.size; i++)
+ seq_printf(s, "%s%d", (!i ? "" : ","), t->up.osds[i]);
+ seq_printf(s, "]/%d\t[", t->up.primary);
+ for (i = 0; i < t->acting.size; i++)
+ seq_printf(s, "%s%d", (!i ? "" : ","), t->acting.osds[i]);
+ seq_printf(s, "]/%d\t%*pE\t0x%x", t->acting.primary,
+ t->target_oid.name_len, t->target_oid.name, t->flags);
+ if (t->paused)
+ seq_puts(s, "\tP");
+}
- req = rb_entry(p, struct ceph_osd_request, r_node);
+static void dump_request(struct seq_file *s, struct ceph_osd_request *req)
+{
+ int i;
- seq_printf(s, "%lld\tosd%d\t%lld.%x\t", req->r_tid,
- req->r_osd ? req->r_osd->o_osd : -1,
- req->r_pgid.pool, req->r_pgid.seed);
+ seq_printf(s, "%llu\t", req->r_tid);
+ dump_target(s, &req->r_t);
- seq_printf(s, "%.*s", req->r_base_oid.name_len,
- req->r_base_oid.name);
+ seq_printf(s, "\t%d\t%u'%llu", req->r_attempts,
+ le32_to_cpu(req->r_replay_version.epoch),
+ le64_to_cpu(req->r_replay_version.version));
- if (req->r_reassert_version.epoch)
- seq_printf(s, "\t%u'%llu",
- (unsigned int)le32_to_cpu(req->r_reassert_version.epoch),
- le64_to_cpu(req->r_reassert_version.version));
- else
- seq_printf(s, "\t");
+ for (i = 0; i < req->r_num_ops; i++) {
+ struct ceph_osd_req_op *op = &req->r_ops[i];
+
+ seq_printf(s, "%s%s", (i == 0 ? "\t" : ","),
+ ceph_osd_op_name(op->op));
+ if (op->op == CEPH_OSD_OP_WATCH)
+ seq_printf(s, "-%s",
+ ceph_osd_watch_op_name(op->watch.op));
+ }
+
+ seq_putc(s, '\n');
+}
+
+static void dump_requests(struct seq_file *s, struct ceph_osd *osd)
+{
+ struct rb_node *n;
+
+ mutex_lock(&osd->lock);
+ for (n = rb_first(&osd->o_requests); n; n = rb_next(n)) {
+ struct ceph_osd_request *req =
+ rb_entry(n, struct ceph_osd_request, r_node);
+
+ dump_request(s, req);
+ }
+
+ mutex_unlock(&osd->lock);
+}
- for (i = 0; i < req->r_num_ops; i++) {
- opcode = req->r_ops[i].op;
- seq_printf(s, "%s%s", (i == 0 ? "\t" : ","),
- ceph_osd_op_name(opcode));
- }
+static void dump_linger_request(struct seq_file *s,
+ struct ceph_osd_linger_request *lreq)
+{
+ seq_printf(s, "%llu\t", lreq->linger_id);
+ dump_target(s, &lreq->t);
+
+ seq_printf(s, "\t%u\t%s%s/%d\n", lreq->register_gen,
+ lreq->is_watch ? "W" : "N", lreq->committed ? "C" : "",
+ lreq->last_error);
+}
+
+static void dump_linger_requests(struct seq_file *s, struct ceph_osd *osd)
+{
+ struct rb_node *n;
+
+ mutex_lock(&osd->lock);
+ for (n = rb_first(&osd->o_linger_requests); n; n = rb_next(n)) {
+ struct ceph_osd_linger_request *lreq =
+ rb_entry(n, struct ceph_osd_linger_request, node);
+
+ dump_linger_request(s, lreq);
+ }
+
+ mutex_unlock(&osd->lock);
+}
- seq_printf(s, "\n");
+static int osdc_show(struct seq_file *s, void *pp)
+{
+ struct ceph_client *client = s->private;
+ struct ceph_osd_client *osdc = &client->osdc;
+ struct rb_node *n;
+
+ down_read(&osdc->lock);
+ seq_printf(s, "REQUESTS %d homeless %d\n",
+ atomic_read(&osdc->num_requests),
+ atomic_read(&osdc->num_homeless));
+ for (n = rb_first(&osdc->osds); n; n = rb_next(n)) {
+ struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);
+
+ dump_requests(s, osd);
}
- mutex_unlock(&osdc->request_mutex);
+ dump_requests(s, &osdc->homeless_osd);
+
+ seq_puts(s, "LINGER REQUESTS\n");
+ for (n = rb_first(&osdc->osds); n; n = rb_next(n)) {
+ struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);
+
+ dump_linger_requests(s, osd);
+ }
+ dump_linger_requests(s, &osdc->homeless_osd);
+
+ up_read(&osdc->lock);
return 0;
}
diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
index cf638c009..37c38a7fb 100644
--- a/net/ceph/mon_client.c
+++ b/net/ceph/mon_client.c
@@ -260,20 +260,26 @@ static void __send_subscribe(struct ceph_mon_client *monc)
BUG_ON(num < 1); /* monmap sub is always there */
ceph_encode_32(&p, num);
for (i = 0; i < ARRAY_SIZE(monc->subs); i++) {
- const char *s = ceph_sub_str[i];
+ char buf[32];
+ int len;
if (!monc->subs[i].want)
continue;
- dout("%s %s start %llu flags 0x%x\n", __func__, s,
+ len = sprintf(buf, "%s", ceph_sub_str[i]);
+ if (i == CEPH_SUB_MDSMAP &&
+ monc->fs_cluster_id != CEPH_FS_CLUSTER_ID_NONE)
+ len += sprintf(buf + len, ".%d", monc->fs_cluster_id);
+
+ dout("%s %s start %llu flags 0x%x\n", __func__, buf,
le64_to_cpu(monc->subs[i].item.start),
monc->subs[i].item.flags);
- ceph_encode_string(&p, end, s, strlen(s));
+ ceph_encode_string(&p, end, buf, len);
memcpy(p, &monc->subs[i].item, sizeof(monc->subs[i].item));
p += sizeof(monc->subs[i].item);
}
- BUG_ON(p != (end - 35 - (ARRAY_SIZE(monc->subs) - num) * 19));
+ BUG_ON(p > end);
msg->front.iov_len = p - msg->front.iov_base;
msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
ceph_msg_revoke(msg);
@@ -376,19 +382,13 @@ void ceph_monc_got_map(struct ceph_mon_client *monc, int sub, u32 epoch)
}
EXPORT_SYMBOL(ceph_monc_got_map);
-/*
- * Register interest in the next osdmap
- */
-void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc)
+void ceph_monc_renew_subs(struct ceph_mon_client *monc)
{
- dout("%s have %u\n", __func__, monc->subs[CEPH_SUB_OSDMAP].have);
mutex_lock(&monc->mutex);
- if (__ceph_monc_want_map(monc, CEPH_SUB_OSDMAP,
- monc->subs[CEPH_SUB_OSDMAP].have + 1, false))
- __send_subscribe(monc);
+ __send_subscribe(monc);
mutex_unlock(&monc->mutex);
}
-EXPORT_SYMBOL(ceph_monc_request_next_osdmap);
+EXPORT_SYMBOL(ceph_monc_renew_subs);
/*
* Wait for an osdmap with a given epoch.
@@ -478,51 +478,17 @@ out:
/*
* generic requests (currently statfs, mon_get_version)
*/
-static struct ceph_mon_generic_request *__lookup_generic_req(
- struct ceph_mon_client *monc, u64 tid)
-{
- struct ceph_mon_generic_request *req;
- struct rb_node *n = monc->generic_request_tree.rb_node;
-
- while (n) {
- req = rb_entry(n, struct ceph_mon_generic_request, node);
- if (tid < req->tid)
- n = n->rb_left;
- else if (tid > req->tid)
- n = n->rb_right;
- else
- return req;
- }
- return NULL;
-}
-
-static void __insert_generic_request(struct ceph_mon_client *monc,
- struct ceph_mon_generic_request *new)
-{
- struct rb_node **p = &monc->generic_request_tree.rb_node;
- struct rb_node *parent = NULL;
- struct ceph_mon_generic_request *req = NULL;
-
- while (*p) {
- parent = *p;
- req = rb_entry(parent, struct ceph_mon_generic_request, node);
- if (new->tid < req->tid)
- p = &(*p)->rb_left;
- else if (new->tid > req->tid)
- p = &(*p)->rb_right;
- else
- BUG();
- }
-
- rb_link_node(&new->node, parent, p);
- rb_insert_color(&new->node, &monc->generic_request_tree);
-}
+DEFINE_RB_FUNCS(generic_request, struct ceph_mon_generic_request, tid, node)
static void release_generic_request(struct kref *kref)
{
struct ceph_mon_generic_request *req =
container_of(kref, struct ceph_mon_generic_request, kref);
+ dout("%s greq %p request %p reply %p\n", __func__, req, req->request,
+ req->reply);
+ WARN_ON(!RB_EMPTY_NODE(&req->node));
+
if (req->reply)
ceph_msg_put(req->reply);
if (req->request)
@@ -533,7 +499,8 @@ static void release_generic_request(struct kref *kref)
static void put_generic_request(struct ceph_mon_generic_request *req)
{
- kref_put(&req->kref, release_generic_request);
+ if (req)
+ kref_put(&req->kref, release_generic_request);
}
static void get_generic_request(struct ceph_mon_generic_request *req)
@@ -541,6 +508,103 @@ static void get_generic_request(struct ceph_mon_generic_request *req)
kref_get(&req->kref);
}
+static struct ceph_mon_generic_request *
+alloc_generic_request(struct ceph_mon_client *monc, gfp_t gfp)
+{
+ struct ceph_mon_generic_request *req;
+
+ req = kzalloc(sizeof(*req), gfp);
+ if (!req)
+ return NULL;
+
+ req->monc = monc;
+ kref_init(&req->kref);
+ RB_CLEAR_NODE(&req->node);
+ init_completion(&req->completion);
+
+ dout("%s greq %p\n", __func__, req);
+ return req;
+}
+
+static void register_generic_request(struct ceph_mon_generic_request *req)
+{
+ struct ceph_mon_client *monc = req->monc;
+
+ WARN_ON(req->tid);
+
+ get_generic_request(req);
+ req->tid = ++monc->last_tid;
+ insert_generic_request(&monc->generic_request_tree, req);
+}
+
+static void send_generic_request(struct ceph_mon_client *monc,
+ struct ceph_mon_generic_request *req)
+{
+ WARN_ON(!req->tid);
+
+ dout("%s greq %p tid %llu\n", __func__, req, req->tid);
+ req->request->hdr.tid = cpu_to_le64(req->tid);
+ ceph_con_send(&monc->con, ceph_msg_get(req->request));
+}
+
+static void __finish_generic_request(struct ceph_mon_generic_request *req)
+{
+ struct ceph_mon_client *monc = req->monc;
+
+ dout("%s greq %p tid %llu\n", __func__, req, req->tid);
+ erase_generic_request(&monc->generic_request_tree, req);
+
+ ceph_msg_revoke(req->request);
+ ceph_msg_revoke_incoming(req->reply);
+}
+
+static void finish_generic_request(struct ceph_mon_generic_request *req)
+{
+ __finish_generic_request(req);
+ put_generic_request(req);
+}
+
+static void complete_generic_request(struct ceph_mon_generic_request *req)
+{
+ if (req->complete_cb)
+ req->complete_cb(req);
+ else
+ complete_all(&req->completion);
+ put_generic_request(req);
+}
+
+void cancel_generic_request(struct ceph_mon_generic_request *req)
+{
+ struct ceph_mon_client *monc = req->monc;
+ struct ceph_mon_generic_request *lookup_req;
+
+ dout("%s greq %p tid %llu\n", __func__, req, req->tid);
+
+ mutex_lock(&monc->mutex);
+ lookup_req = lookup_generic_request(&monc->generic_request_tree,
+ req->tid);
+ if (lookup_req) {
+ WARN_ON(lookup_req != req);
+ finish_generic_request(req);
+ }
+
+ mutex_unlock(&monc->mutex);
+}
+
+static int wait_generic_request(struct ceph_mon_generic_request *req)
+{
+ int ret;
+
+ dout("%s greq %p tid %llu\n", __func__, req, req->tid);
+ ret = wait_for_completion_interruptible(&req->completion);
+ if (ret)
+ cancel_generic_request(req);
+ else
+ ret = req->result; /* completed */
+
+ return ret;
+}
+
static struct ceph_msg *get_generic_reply(struct ceph_connection *con,
struct ceph_msg_header *hdr,
int *skip)
@@ -551,7 +615,7 @@ static struct ceph_msg *get_generic_reply(struct ceph_connection *con,
struct ceph_msg *m;
mutex_lock(&monc->mutex);
- req = __lookup_generic_req(monc, tid);
+ req = lookup_generic_request(&monc->generic_request_tree, tid);
if (!req) {
dout("get_generic_reply %lld dne\n", tid);
*skip = 1;
@@ -570,42 +634,6 @@ static struct ceph_msg *get_generic_reply(struct ceph_connection *con,
return m;
}
-static int __do_generic_request(struct ceph_mon_client *monc, u64 tid,
- struct ceph_mon_generic_request *req)
-{
- int err;
-
- /* register request */
- req->tid = tid != 0 ? tid : ++monc->last_tid;
- req->request->hdr.tid = cpu_to_le64(req->tid);
- __insert_generic_request(monc, req);
- monc->num_generic_requests++;
- ceph_con_send(&monc->con, ceph_msg_get(req->request));
- mutex_unlock(&monc->mutex);
-
- err = wait_for_completion_interruptible(&req->completion);
-
- mutex_lock(&monc->mutex);
- rb_erase(&req->node, &monc->generic_request_tree);
- monc->num_generic_requests--;
-
- if (!err)
- err = req->result;
- return err;
-}
-
-static int do_generic_request(struct ceph_mon_client *monc,
- struct ceph_mon_generic_request *req)
-{
- int err;
-
- mutex_lock(&monc->mutex);
- err = __do_generic_request(monc, 0, req);
- mutex_unlock(&monc->mutex);
-
- return err;
-}
-
/*
* statfs
*/
@@ -616,22 +644,24 @@ static void handle_statfs_reply(struct ceph_mon_client *monc,
struct ceph_mon_statfs_reply *reply = msg->front.iov_base;
u64 tid = le64_to_cpu(msg->hdr.tid);
+ dout("%s msg %p tid %llu\n", __func__, msg, tid);
+
if (msg->front.iov_len != sizeof(*reply))
goto bad;
- dout("handle_statfs_reply %p tid %llu\n", msg, tid);
mutex_lock(&monc->mutex);
- req = __lookup_generic_req(monc, tid);
- if (req) {
- *(struct ceph_statfs *)req->buf = reply->st;
- req->result = 0;
- get_generic_request(req);
+ req = lookup_generic_request(&monc->generic_request_tree, tid);
+ if (!req) {
+ mutex_unlock(&monc->mutex);
+ return;
}
+
+ req->result = 0;
+ *req->u.st = reply->st; /* struct */
+ __finish_generic_request(req);
mutex_unlock(&monc->mutex);
- if (req) {
- complete_all(&req->completion);
- put_generic_request(req);
- }
+
+ complete_generic_request(req);
return;
bad:
@@ -646,38 +676,38 @@ int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf)
{
struct ceph_mon_generic_request *req;
struct ceph_mon_statfs *h;
- int err;
+ int ret = -ENOMEM;
- req = kzalloc(sizeof(*req), GFP_NOFS);
+ req = alloc_generic_request(monc, GFP_NOFS);
if (!req)
- return -ENOMEM;
-
- kref_init(&req->kref);
- req->buf = buf;
- init_completion(&req->completion);
+ goto out;
- err = -ENOMEM;
req->request = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), GFP_NOFS,
true);
if (!req->request)
goto out;
- req->reply = ceph_msg_new(CEPH_MSG_STATFS_REPLY, 1024, GFP_NOFS,
- true);
+
+ req->reply = ceph_msg_new(CEPH_MSG_STATFS_REPLY, 64, GFP_NOFS, true);
if (!req->reply)
goto out;
+ req->u.st = buf;
+
+ mutex_lock(&monc->mutex);
+ register_generic_request(req);
/* fill out request */
h = req->request->front.iov_base;
h->monhdr.have_version = 0;
h->monhdr.session_mon = cpu_to_le16(-1);
h->monhdr.session_mon_tid = 0;
h->fsid = monc->monmap->fsid;
+ send_generic_request(monc, req);
+ mutex_unlock(&monc->mutex);
- err = do_generic_request(monc, req);
-
+ ret = wait_generic_request(req);
out:
put_generic_request(req);
- return err;
+ return ret;
}
EXPORT_SYMBOL(ceph_monc_do_statfs);
@@ -690,7 +720,7 @@ static void handle_get_version_reply(struct ceph_mon_client *monc,
void *end = p + msg->front_alloc_len;
u64 handle;
- dout("%s %p tid %llu\n", __func__, msg, tid);
+ dout("%s msg %p tid %llu\n", __func__, msg, tid);
ceph_decode_need(&p, end, 2*sizeof(u64), bad);
handle = ceph_decode_64(&p);
@@ -698,77 +728,111 @@ static void handle_get_version_reply(struct ceph_mon_client *monc,
goto bad;
mutex_lock(&monc->mutex);
- req = __lookup_generic_req(monc, handle);
- if (req) {
- *(u64 *)req->buf = ceph_decode_64(&p);
- req->result = 0;
- get_generic_request(req);
+ req = lookup_generic_request(&monc->generic_request_tree, handle);
+ if (!req) {
+ mutex_unlock(&monc->mutex);
+ return;
}
+
+ req->result = 0;
+ req->u.newest = ceph_decode_64(&p);
+ __finish_generic_request(req);
mutex_unlock(&monc->mutex);
- if (req) {
- complete_all(&req->completion);
- put_generic_request(req);
- }
+ complete_generic_request(req);
return;
+
bad:
pr_err("corrupt mon_get_version reply, tid %llu\n", tid);
ceph_msg_dump(msg);
}
-/*
- * Send MMonGetVersion and wait for the reply.
- *
- * @what: one of "mdsmap", "osdmap" or "monmap"
- */
-int ceph_monc_do_get_version(struct ceph_mon_client *monc, const char *what,
- u64 *newest)
+static struct ceph_mon_generic_request *
+__ceph_monc_get_version(struct ceph_mon_client *monc, const char *what,
+ ceph_monc_callback_t cb, u64 private_data)
{
struct ceph_mon_generic_request *req;
- void *p, *end;
- u64 tid;
- int err;
- req = kzalloc(sizeof(*req), GFP_NOFS);
+ req = alloc_generic_request(monc, GFP_NOIO);
if (!req)
- return -ENOMEM;
-
- kref_init(&req->kref);
- req->buf = newest;
- init_completion(&req->completion);
+ goto err_put_req;
req->request = ceph_msg_new(CEPH_MSG_MON_GET_VERSION,
sizeof(u64) + sizeof(u32) + strlen(what),
- GFP_NOFS, true);
- if (!req->request) {
- err = -ENOMEM;
- goto out;
- }
+ GFP_NOIO, true);
+ if (!req->request)
+ goto err_put_req;
- req->reply = ceph_msg_new(CEPH_MSG_MON_GET_VERSION_REPLY, 1024,
- GFP_NOFS, true);
- if (!req->reply) {
- err = -ENOMEM;
- goto out;
- }
+ req->reply = ceph_msg_new(CEPH_MSG_MON_GET_VERSION_REPLY, 32, GFP_NOIO,
+ true);
+ if (!req->reply)
+ goto err_put_req;
- p = req->request->front.iov_base;
- end = p + req->request->front_alloc_len;
+ req->complete_cb = cb;
+ req->private_data = private_data;
- /* fill out request */
mutex_lock(&monc->mutex);
- tid = ++monc->last_tid;
- ceph_encode_64(&p, tid); /* handle */
- ceph_encode_string(&p, end, what, strlen(what));
+ register_generic_request(req);
+ {
+ void *p = req->request->front.iov_base;
+ void *const end = p + req->request->front_alloc_len;
+
+ ceph_encode_64(&p, req->tid); /* handle */
+ ceph_encode_string(&p, end, what, strlen(what));
+ WARN_ON(p != end);
+ }
+ send_generic_request(monc, req);
+ mutex_unlock(&monc->mutex);
- err = __do_generic_request(monc, tid, req);
+ return req;
- mutex_unlock(&monc->mutex);
-out:
+err_put_req:
put_generic_request(req);
- return err;
+ return ERR_PTR(-ENOMEM);
+}
+
+/*
+ * Send MMonGetVersion and wait for the reply.
+ *
+ * @what: one of "mdsmap", "osdmap" or "monmap"
+ */
+int ceph_monc_get_version(struct ceph_mon_client *monc, const char *what,
+ u64 *newest)
+{
+ struct ceph_mon_generic_request *req;
+ int ret;
+
+ req = __ceph_monc_get_version(monc, what, NULL, 0);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
+
+ ret = wait_generic_request(req);
+ if (!ret)
+ *newest = req->u.newest;
+
+ put_generic_request(req);
+ return ret;
}
-EXPORT_SYMBOL(ceph_monc_do_get_version);
+EXPORT_SYMBOL(ceph_monc_get_version);
+
+/*
+ * Send MMonGetVersion,
+ *
+ * @what: one of "mdsmap", "osdmap" or "monmap"
+ */
+int ceph_monc_get_version_async(struct ceph_mon_client *monc, const char *what,
+ ceph_monc_callback_t cb, u64 private_data)
+{
+ struct ceph_mon_generic_request *req;
+
+ req = __ceph_monc_get_version(monc, what, cb, private_data);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
+
+ put_generic_request(req);
+ return 0;
+}
+EXPORT_SYMBOL(ceph_monc_get_version_async);
/*
* Resend pending generic requests.
@@ -890,7 +954,7 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
if (!monc->m_subscribe_ack)
goto out_auth;
- monc->m_subscribe = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, GFP_NOFS,
+ monc->m_subscribe = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 128, GFP_NOFS,
true);
if (!monc->m_subscribe)
goto out_subscribe_ack;
@@ -914,9 +978,10 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
INIT_DELAYED_WORK(&monc->delayed_work, delayed_work);
monc->generic_request_tree = RB_ROOT;
- monc->num_generic_requests = 0;
monc->last_tid = 0;
+ monc->fs_cluster_id = CEPH_FS_CLUSTER_ID_NONE;
+
return 0;
out_auth_reply:
@@ -954,6 +1019,8 @@ void ceph_monc_stop(struct ceph_mon_client *monc)
ceph_auth_destroy(monc->auth);
+ WARN_ON(!RB_EMPTY_ROOT(&monc->generic_request_tree));
+
ceph_msg_put(monc->m_auth);
ceph_msg_put(monc->m_auth_reply);
ceph_msg_put(monc->m_subscribe);
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 40a53a70e..894695920 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -19,25 +19,12 @@
#include <linux/ceph/auth.h>
#include <linux/ceph/pagelist.h>
-#define OSD_OP_FRONT_LEN 4096
#define OSD_OPREPLY_FRONT_LEN 512
static struct kmem_cache *ceph_osd_request_cache;
static const struct ceph_connection_operations osd_con_ops;
-static void __send_queued(struct ceph_osd_client *osdc);
-static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd);
-static void __register_request(struct ceph_osd_client *osdc,
- struct ceph_osd_request *req);
-static void __unregister_request(struct ceph_osd_client *osdc,
- struct ceph_osd_request *req);
-static void __unregister_linger_request(struct ceph_osd_client *osdc,
- struct ceph_osd_request *req);
-static void __enqueue_request(struct ceph_osd_request *req);
-static void __send_request(struct ceph_osd_client *osdc,
- struct ceph_osd_request *req);
-
/*
* Implement client access to distributed object storage cluster.
*
@@ -56,6 +43,52 @@ static void __send_request(struct ceph_osd_client *osdc,
* channel with an OSD is reset.
*/
+static void link_request(struct ceph_osd *osd, struct ceph_osd_request *req);
+static void unlink_request(struct ceph_osd *osd, struct ceph_osd_request *req);
+static void link_linger(struct ceph_osd *osd,
+ struct ceph_osd_linger_request *lreq);
+static void unlink_linger(struct ceph_osd *osd,
+ struct ceph_osd_linger_request *lreq);
+
+#if 1
+static inline bool rwsem_is_wrlocked(struct rw_semaphore *sem)
+{
+ bool wrlocked = true;
+
+ if (unlikely(down_read_trylock(sem))) {
+ wrlocked = false;
+ up_read(sem);
+ }
+
+ return wrlocked;
+}
+static inline void verify_osdc_locked(struct ceph_osd_client *osdc)
+{
+ WARN_ON(!rwsem_is_locked(&osdc->lock));
+}
+static inline void verify_osdc_wrlocked(struct ceph_osd_client *osdc)
+{
+ WARN_ON(!rwsem_is_wrlocked(&osdc->lock));
+}
+static inline void verify_osd_locked(struct ceph_osd *osd)
+{
+ struct ceph_osd_client *osdc = osd->o_osdc;
+
+ WARN_ON(!(mutex_is_locked(&osd->lock) &&
+ rwsem_is_locked(&osdc->lock)) &&
+ !rwsem_is_wrlocked(&osdc->lock));
+}
+static inline void verify_lreq_locked(struct ceph_osd_linger_request *lreq)
+{
+ WARN_ON(!mutex_is_locked(&lreq->lock));
+}
+#else
+static inline void verify_osdc_locked(struct ceph_osd_client *osdc) { }
+static inline void verify_osdc_wrlocked(struct ceph_osd_client *osdc) { }
+static inline void verify_osd_locked(struct ceph_osd *osd) { }
+static inline void verify_lreq_locked(struct ceph_osd_linger_request *lreq) { }
+#endif
+
/*
* calculate the mapping of a file extent onto an object, and fill out the
* request accordingly. shorten extent as necessary if it crosses an
@@ -144,14 +177,6 @@ osd_req_op_extent_osd_data(struct ceph_osd_request *osd_req,
}
EXPORT_SYMBOL(osd_req_op_extent_osd_data);
-struct ceph_osd_data *
-osd_req_op_cls_response_data(struct ceph_osd_request *osd_req,
- unsigned int which)
-{
- return osd_req_op_data(osd_req, which, cls, response_data);
-}
-EXPORT_SYMBOL(osd_req_op_cls_response_data); /* ??? */
-
void osd_req_op_raw_data_in_pages(struct ceph_osd_request *osd_req,
unsigned int which, struct page **pages,
u64 length, u32 alignment,
@@ -218,6 +243,8 @@ void osd_req_op_cls_request_data_pagelist(
osd_data = osd_req_op_data(osd_req, which, cls, request_data);
ceph_osd_data_pagelist_init(osd_data, pagelist);
+ osd_req->r_ops[which].cls.indata_len += pagelist->length;
+ osd_req->r_ops[which].indata_len += pagelist->length;
}
EXPORT_SYMBOL(osd_req_op_cls_request_data_pagelist);
@@ -230,6 +257,8 @@ void osd_req_op_cls_request_data_pages(struct ceph_osd_request *osd_req,
osd_data = osd_req_op_data(osd_req, which, cls, request_data);
ceph_osd_data_pages_init(osd_data, pages, length, alignment,
pages_from_pool, own_pages);
+ osd_req->r_ops[which].cls.indata_len += length;
+ osd_req->r_ops[which].indata_len += length;
}
EXPORT_SYMBOL(osd_req_op_cls_request_data_pages);
@@ -302,14 +331,76 @@ static void osd_req_op_data_release(struct ceph_osd_request *osd_req,
case CEPH_OSD_OP_STAT:
ceph_osd_data_release(&op->raw_data_in);
break;
+ case CEPH_OSD_OP_NOTIFY_ACK:
+ ceph_osd_data_release(&op->notify_ack.request_data);
+ break;
+ case CEPH_OSD_OP_NOTIFY:
+ ceph_osd_data_release(&op->notify.request_data);
+ ceph_osd_data_release(&op->notify.response_data);
+ break;
default:
break;
}
}
/*
+ * Assumes @t is zero-initialized.
+ */
+static void target_init(struct ceph_osd_request_target *t)
+{
+ ceph_oid_init(&t->base_oid);
+ ceph_oloc_init(&t->base_oloc);
+ ceph_oid_init(&t->target_oid);
+ ceph_oloc_init(&t->target_oloc);
+
+ ceph_osds_init(&t->acting);
+ ceph_osds_init(&t->up);
+ t->size = -1;
+ t->min_size = -1;
+
+ t->osd = CEPH_HOMELESS_OSD;
+}
+
+static void target_copy(struct ceph_osd_request_target *dest,
+ const struct ceph_osd_request_target *src)
+{
+ ceph_oid_copy(&dest->base_oid, &src->base_oid);
+ ceph_oloc_copy(&dest->base_oloc, &src->base_oloc);
+ ceph_oid_copy(&dest->target_oid, &src->target_oid);
+ ceph_oloc_copy(&dest->target_oloc, &src->target_oloc);
+
+ dest->pgid = src->pgid; /* struct */
+ dest->pg_num = src->pg_num;
+ dest->pg_num_mask = src->pg_num_mask;
+ ceph_osds_copy(&dest->acting, &src->acting);
+ ceph_osds_copy(&dest->up, &src->up);
+ dest->size = src->size;
+ dest->min_size = src->min_size;
+ dest->sort_bitwise = src->sort_bitwise;
+
+ dest->flags = src->flags;
+ dest->paused = src->paused;
+
+ dest->osd = src->osd;
+}
+
+static void target_destroy(struct ceph_osd_request_target *t)
+{
+ ceph_oid_destroy(&t->base_oid);
+ ceph_oid_destroy(&t->target_oid);
+}
+
+/*
* requests
*/
+static void request_release_checks(struct ceph_osd_request *req)
+{
+ WARN_ON(!RB_EMPTY_NODE(&req->r_node));
+ WARN_ON(!RB_EMPTY_NODE(&req->r_mc_node));
+ WARN_ON(!list_empty(&req->r_unsafe_item));
+ WARN_ON(req->r_osd);
+}
+
static void ceph_osdc_release_request(struct kref *kref)
{
struct ceph_osd_request *req = container_of(kref,
@@ -318,24 +409,19 @@ static void ceph_osdc_release_request(struct kref *kref)
dout("%s %p (r_request %p r_reply %p)\n", __func__, req,
req->r_request, req->r_reply);
- WARN_ON(!RB_EMPTY_NODE(&req->r_node));
- WARN_ON(!list_empty(&req->r_req_lru_item));
- WARN_ON(!list_empty(&req->r_osd_item));
- WARN_ON(!list_empty(&req->r_linger_item));
- WARN_ON(!list_empty(&req->r_linger_osd_item));
- WARN_ON(req->r_osd);
+ request_release_checks(req);
if (req->r_request)
ceph_msg_put(req->r_request);
- if (req->r_reply) {
- ceph_msg_revoke_incoming(req->r_reply);
+ if (req->r_reply)
ceph_msg_put(req->r_reply);
- }
for (which = 0; which < req->r_num_ops; which++)
osd_req_op_data_release(req, which);
+ target_destroy(&req->r_t);
ceph_put_snap_context(req->r_snapc);
+
if (req->r_mempool)
mempool_free(req, req->r_osdc->req_mempool);
else if (req->r_num_ops <= CEPH_OSD_SLAB_OPS)
@@ -354,12 +440,66 @@ EXPORT_SYMBOL(ceph_osdc_get_request);
void ceph_osdc_put_request(struct ceph_osd_request *req)
{
- dout("%s %p (was %d)\n", __func__, req,
- atomic_read(&req->r_kref.refcount));
- kref_put(&req->r_kref, ceph_osdc_release_request);
+ if (req) {
+ dout("%s %p (was %d)\n", __func__, req,
+ atomic_read(&req->r_kref.refcount));
+ kref_put(&req->r_kref, ceph_osdc_release_request);
+ }
}
EXPORT_SYMBOL(ceph_osdc_put_request);
+static void request_init(struct ceph_osd_request *req)
+{
+ /* req only, each op is zeroed in _osd_req_op_init() */
+ memset(req, 0, sizeof(*req));
+
+ kref_init(&req->r_kref);
+ init_completion(&req->r_completion);
+ init_completion(&req->r_safe_completion);
+ RB_CLEAR_NODE(&req->r_node);
+ RB_CLEAR_NODE(&req->r_mc_node);
+ INIT_LIST_HEAD(&req->r_unsafe_item);
+
+ target_init(&req->r_t);
+}
+
+/*
+ * This is ugly, but it allows us to reuse linger registration and ping
+ * requests, keeping the structure of the code around send_linger{_ping}()
+ * reasonable. Setting up a min_nr=2 mempool for each linger request
+ * and dealing with copying ops (this blasts req only, watch op remains
+ * intact) isn't any better.
+ */
+static void request_reinit(struct ceph_osd_request *req)
+{
+ struct ceph_osd_client *osdc = req->r_osdc;
+ bool mempool = req->r_mempool;
+ unsigned int num_ops = req->r_num_ops;
+ u64 snapid = req->r_snapid;
+ struct ceph_snap_context *snapc = req->r_snapc;
+ bool linger = req->r_linger;
+ struct ceph_msg *request_msg = req->r_request;
+ struct ceph_msg *reply_msg = req->r_reply;
+
+ dout("%s req %p\n", __func__, req);
+ WARN_ON(atomic_read(&req->r_kref.refcount) != 1);
+ request_release_checks(req);
+
+ WARN_ON(atomic_read(&request_msg->kref.refcount) != 1);
+ WARN_ON(atomic_read(&reply_msg->kref.refcount) != 1);
+ target_destroy(&req->r_t);
+
+ request_init(req);
+ req->r_osdc = osdc;
+ req->r_mempool = mempool;
+ req->r_num_ops = num_ops;
+ req->r_snapid = snapid;
+ req->r_snapc = snapc;
+ req->r_linger = linger;
+ req->r_request = request_msg;
+ req->r_reply = reply_msg;
+}
+
struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
struct ceph_snap_context *snapc,
unsigned int num_ops,
@@ -367,8 +507,6 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
gfp_t gfp_flags)
{
struct ceph_osd_request *req;
- struct ceph_msg *msg;
- size_t msg_size;
if (use_mempool) {
BUG_ON(num_ops > CEPH_OSD_SLAB_OPS);
@@ -383,73 +521,65 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
if (unlikely(!req))
return NULL;
- /* req only, each op is zeroed in _osd_req_op_init() */
- memset(req, 0, sizeof(*req));
-
+ request_init(req);
req->r_osdc = osdc;
req->r_mempool = use_mempool;
req->r_num_ops = num_ops;
+ req->r_snapid = CEPH_NOSNAP;
+ req->r_snapc = ceph_get_snap_context(snapc);
- kref_init(&req->r_kref);
- init_completion(&req->r_completion);
- init_completion(&req->r_safe_completion);
- RB_CLEAR_NODE(&req->r_node);
- INIT_LIST_HEAD(&req->r_unsafe_item);
- INIT_LIST_HEAD(&req->r_linger_item);
- INIT_LIST_HEAD(&req->r_linger_osd_item);
- INIT_LIST_HEAD(&req->r_req_lru_item);
- INIT_LIST_HEAD(&req->r_osd_item);
-
- req->r_base_oloc.pool = -1;
- req->r_target_oloc.pool = -1;
+ dout("%s req %p\n", __func__, req);
+ return req;
+}
+EXPORT_SYMBOL(ceph_osdc_alloc_request);
- msg_size = OSD_OPREPLY_FRONT_LEN;
- if (num_ops > CEPH_OSD_SLAB_OPS) {
- /* ceph_osd_op and rval */
- msg_size += (num_ops - CEPH_OSD_SLAB_OPS) *
- (sizeof(struct ceph_osd_op) + 4);
- }
+int ceph_osdc_alloc_messages(struct ceph_osd_request *req, gfp_t gfp)
+{
+ struct ceph_osd_client *osdc = req->r_osdc;
+ struct ceph_msg *msg;
+ int msg_size;
- /* create reply message */
- if (use_mempool)
- msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0);
- else
- msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, msg_size,
- gfp_flags, true);
- if (!msg) {
- ceph_osdc_put_request(req);
- return NULL;
- }
- req->r_reply = msg;
+ WARN_ON(ceph_oid_empty(&req->r_base_oid));
+ /* create request message */
msg_size = 4 + 4 + 4; /* client_inc, osdmap_epoch, flags */
msg_size += 4 + 4 + 4 + 8; /* mtime, reassert_version */
msg_size += 2 + 4 + 8 + 4 + 4; /* oloc */
msg_size += 1 + 8 + 4 + 4; /* pgid */
- msg_size += 4 + CEPH_MAX_OID_NAME_LEN; /* oid */
- msg_size += 2 + num_ops * sizeof(struct ceph_osd_op);
+ msg_size += 4 + req->r_base_oid.name_len; /* oid */
+ msg_size += 2 + req->r_num_ops * sizeof(struct ceph_osd_op);
msg_size += 8; /* snapid */
msg_size += 8; /* snap_seq */
- msg_size += 4 + 8 * (snapc ? snapc->num_snaps : 0); /* snaps */
+ msg_size += 4 + 8 * (req->r_snapc ? req->r_snapc->num_snaps : 0);
msg_size += 4; /* retry_attempt */
- /* create request message; allow space for oid */
- if (use_mempool)
+ if (req->r_mempool)
msg = ceph_msgpool_get(&osdc->msgpool_op, 0);
else
- msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, gfp_flags, true);
- if (!msg) {
- ceph_osdc_put_request(req);
- return NULL;
- }
+ msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, gfp, true);
+ if (!msg)
+ return -ENOMEM;
memset(msg->front.iov_base, 0, msg->front.iov_len);
-
req->r_request = msg;
- return req;
+ /* create reply message */
+ msg_size = OSD_OPREPLY_FRONT_LEN;
+ msg_size += req->r_base_oid.name_len;
+ msg_size += req->r_num_ops * sizeof(struct ceph_osd_op);
+
+ if (req->r_mempool)
+ msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0);
+ else
+ msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, msg_size, gfp, true);
+ if (!msg)
+ return -ENOMEM;
+
+ req->r_reply = msg;
+
+ return 0;
}
-EXPORT_SYMBOL(ceph_osdc_alloc_request);
+EXPORT_SYMBOL(ceph_osdc_alloc_messages);
static bool osd_req_opcode_valid(u16 opcode)
{
@@ -587,8 +717,6 @@ void osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which,
osd_req_op_cls_request_info_pagelist(osd_req, which, pagelist);
- op->cls.argc = 0; /* currently unused */
-
op->indata_len = payload_len;
}
EXPORT_SYMBOL(osd_req_op_cls_init);
@@ -627,21 +755,19 @@ int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which,
}
EXPORT_SYMBOL(osd_req_op_xattr_init);
-void osd_req_op_watch_init(struct ceph_osd_request *osd_req,
- unsigned int which, u16 opcode,
- u64 cookie, u64 version, int flag)
+/*
+ * @watch_opcode: CEPH_OSD_WATCH_OP_*
+ */
+static void osd_req_op_watch_init(struct ceph_osd_request *req, int which,
+ u64 cookie, u8 watch_opcode)
{
- struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which,
- opcode, 0);
-
- BUG_ON(opcode != CEPH_OSD_OP_NOTIFY_ACK && opcode != CEPH_OSD_OP_WATCH);
+ struct ceph_osd_req_op *op;
+ op = _osd_req_op_init(req, which, CEPH_OSD_OP_WATCH, 0);
op->watch.cookie = cookie;
- op->watch.ver = version;
- if (opcode == CEPH_OSD_OP_WATCH && flag)
- op->watch.flag = (u8)1;
+ op->watch.op = watch_opcode;
+ op->watch.gen = 0;
}
-EXPORT_SYMBOL(osd_req_op_watch_init);
void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req,
unsigned int which,
@@ -686,16 +812,9 @@ static void ceph_osdc_msg_data_add(struct ceph_msg *msg,
}
}
-static u64 osd_req_encode_op(struct ceph_osd_request *req,
- struct ceph_osd_op *dst, unsigned int which)
+static u32 osd_req_encode_op(struct ceph_osd_op *dst,
+ const struct ceph_osd_req_op *src)
{
- struct ceph_osd_req_op *src;
- struct ceph_osd_data *osd_data;
- u64 request_data_len = 0;
- u64 data_length;
-
- BUG_ON(which >= req->r_num_ops);
- src = &req->r_ops[which];
if (WARN_ON(!osd_req_opcode_valid(src->op))) {
pr_err("unrecognized osd opcode %d\n", src->op);
@@ -704,57 +823,36 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req,
switch (src->op) {
case CEPH_OSD_OP_STAT:
- osd_data = &src->raw_data_in;
- ceph_osdc_msg_data_add(req->r_reply, osd_data);
break;
case CEPH_OSD_OP_READ:
case CEPH_OSD_OP_WRITE:
case CEPH_OSD_OP_WRITEFULL:
case CEPH_OSD_OP_ZERO:
case CEPH_OSD_OP_TRUNCATE:
- if (src->op == CEPH_OSD_OP_WRITE ||
- src->op == CEPH_OSD_OP_WRITEFULL)
- request_data_len = src->extent.length;
dst->extent.offset = cpu_to_le64(src->extent.offset);
dst->extent.length = cpu_to_le64(src->extent.length);
dst->extent.truncate_size =
cpu_to_le64(src->extent.truncate_size);
dst->extent.truncate_seq =
cpu_to_le32(src->extent.truncate_seq);
- osd_data = &src->extent.osd_data;
- if (src->op == CEPH_OSD_OP_WRITE ||
- src->op == CEPH_OSD_OP_WRITEFULL)
- ceph_osdc_msg_data_add(req->r_request, osd_data);
- else
- ceph_osdc_msg_data_add(req->r_reply, osd_data);
break;
case CEPH_OSD_OP_CALL:
dst->cls.class_len = src->cls.class_len;
dst->cls.method_len = src->cls.method_len;
- osd_data = &src->cls.request_info;
- ceph_osdc_msg_data_add(req->r_request, osd_data);
- BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGELIST);
- request_data_len = osd_data->pagelist->length;
-
- osd_data = &src->cls.request_data;
- data_length = ceph_osd_data_length(osd_data);
- if (data_length) {
- BUG_ON(osd_data->type == CEPH_OSD_DATA_TYPE_NONE);
- dst->cls.indata_len = cpu_to_le32(data_length);
- ceph_osdc_msg_data_add(req->r_request, osd_data);
- src->indata_len += data_length;
- request_data_len += data_length;
- }
- osd_data = &src->cls.response_data;
- ceph_osdc_msg_data_add(req->r_reply, osd_data);
+ dst->cls.indata_len = cpu_to_le32(src->cls.indata_len);
break;
case CEPH_OSD_OP_STARTSYNC:
break;
- case CEPH_OSD_OP_NOTIFY_ACK:
case CEPH_OSD_OP_WATCH:
dst->watch.cookie = cpu_to_le64(src->watch.cookie);
- dst->watch.ver = cpu_to_le64(src->watch.ver);
- dst->watch.flag = src->watch.flag;
+ dst->watch.ver = cpu_to_le64(0);
+ dst->watch.op = src->watch.op;
+ dst->watch.gen = cpu_to_le32(src->watch.gen);
+ break;
+ case CEPH_OSD_OP_NOTIFY_ACK:
+ break;
+ case CEPH_OSD_OP_NOTIFY:
+ dst->notify.cookie = cpu_to_le64(src->notify.cookie);
break;
case CEPH_OSD_OP_SETALLOCHINT:
dst->alloc_hint.expected_object_size =
@@ -768,9 +866,6 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req,
dst->xattr.value_len = cpu_to_le32(src->xattr.value_len);
dst->xattr.cmp_op = src->xattr.cmp_op;
dst->xattr.cmp_mode = src->xattr.cmp_mode;
- osd_data = &src->xattr.osd_data;
- ceph_osdc_msg_data_add(req->r_request, osd_data);
- request_data_len = osd_data->pagelist->length;
break;
case CEPH_OSD_OP_CREATE:
case CEPH_OSD_OP_DELETE:
@@ -787,7 +882,7 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req,
dst->flags = cpu_to_le32(src->flags);
dst->payload_len = cpu_to_le32(src->indata_len);
- return request_data_len;
+ return src->indata_len;
}
/*
@@ -824,17 +919,15 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
req = ceph_osdc_alloc_request(osdc, snapc, num_ops, use_mempool,
GFP_NOFS);
- if (!req)
- return ERR_PTR(-ENOMEM);
-
- req->r_flags = flags;
+ if (!req) {
+ r = -ENOMEM;
+ goto fail;
+ }
/* calculate max write size */
r = calc_layout(layout, off, plen, &objnum, &objoff, &objlen);
- if (r < 0) {
- ceph_osdc_put_request(req);
- return ERR_PTR(r);
- }
+ if (r)
+ goto fail;
if (opcode == CEPH_OSD_OP_CREATE || opcode == CEPH_OSD_OP_DELETE) {
osd_req_op_init(req, which, opcode, 0);
@@ -854,194 +947,71 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
truncate_size, truncate_seq);
}
+ req->r_flags = flags;
req->r_base_oloc.pool = ceph_file_layout_pg_pool(*layout);
+ ceph_oid_printf(&req->r_base_oid, "%llx.%08llx", vino.ino, objnum);
+
+ req->r_snapid = vino.snap;
+ if (flags & CEPH_OSD_FLAG_WRITE)
+ req->r_data_offset = off;
- snprintf(req->r_base_oid.name, sizeof(req->r_base_oid.name),
- "%llx.%08llx", vino.ino, objnum);
- req->r_base_oid.name_len = strlen(req->r_base_oid.name);
+ r = ceph_osdc_alloc_messages(req, GFP_NOFS);
+ if (r)
+ goto fail;
return req;
+
+fail:
+ ceph_osdc_put_request(req);
+ return ERR_PTR(r);
}
EXPORT_SYMBOL(ceph_osdc_new_request);
/*
* We keep osd requests in an rbtree, sorted by ->r_tid.
*/
-static void __insert_request(struct ceph_osd_client *osdc,
- struct ceph_osd_request *new)
-{
- struct rb_node **p = &osdc->requests.rb_node;
- struct rb_node *parent = NULL;
- struct ceph_osd_request *req = NULL;
-
- while (*p) {
- parent = *p;
- req = rb_entry(parent, struct ceph_osd_request, r_node);
- if (new->r_tid < req->r_tid)
- p = &(*p)->rb_left;
- else if (new->r_tid > req->r_tid)
- p = &(*p)->rb_right;
- else
- BUG();
- }
-
- rb_link_node(&new->r_node, parent, p);
- rb_insert_color(&new->r_node, &osdc->requests);
-}
+DEFINE_RB_FUNCS(request, struct ceph_osd_request, r_tid, r_node)
+DEFINE_RB_FUNCS(request_mc, struct ceph_osd_request, r_tid, r_mc_node)
-static struct ceph_osd_request *__lookup_request(struct ceph_osd_client *osdc,
- u64 tid)
+static bool osd_homeless(struct ceph_osd *osd)
{
- struct ceph_osd_request *req;
- struct rb_node *n = osdc->requests.rb_node;
-
- while (n) {
- req = rb_entry(n, struct ceph_osd_request, r_node);
- if (tid < req->r_tid)
- n = n->rb_left;
- else if (tid > req->r_tid)
- n = n->rb_right;
- else
- return req;
- }
- return NULL;
+ return osd->o_osd == CEPH_HOMELESS_OSD;
}
-static struct ceph_osd_request *
-__lookup_request_ge(struct ceph_osd_client *osdc,
- u64 tid)
+static bool osd_registered(struct ceph_osd *osd)
{
- struct ceph_osd_request *req;
- struct rb_node *n = osdc->requests.rb_node;
-
- while (n) {
- req = rb_entry(n, struct ceph_osd_request, r_node);
- if (tid < req->r_tid) {
- if (!n->rb_left)
- return req;
- n = n->rb_left;
- } else if (tid > req->r_tid) {
- n = n->rb_right;
- } else {
- return req;
- }
- }
- return NULL;
-}
-
-static void __kick_linger_request(struct ceph_osd_request *req)
-{
- struct ceph_osd_client *osdc = req->r_osdc;
- struct ceph_osd *osd = req->r_osd;
+ verify_osdc_locked(osd->o_osdc);
- /*
- * Linger requests need to be resent with a new tid to avoid
- * the dup op detection logic on the OSDs. Achieve this with
- * a re-register dance instead of open-coding.
- */
- ceph_osdc_get_request(req);
- if (!list_empty(&req->r_linger_item))
- __unregister_linger_request(osdc, req);
- else
- __unregister_request(osdc, req);
- __register_request(osdc, req);
- ceph_osdc_put_request(req);
-
- /*
- * Unless request has been registered as both normal and
- * lingering, __unregister{,_linger}_request clears r_osd.
- * However, here we need to preserve r_osd to make sure we
- * requeue on the same OSD.
- */
- WARN_ON(req->r_osd || !osd);
- req->r_osd = osd;
-
- dout("%s requeueing %p tid %llu\n", __func__, req, req->r_tid);
- __enqueue_request(req);
+ return !RB_EMPTY_NODE(&osd->o_node);
}
/*
- * Resubmit requests pending on the given osd.
+ * Assumes @osd is zero-initialized.
*/
-static void __kick_osd_requests(struct ceph_osd_client *osdc,
- struct ceph_osd *osd)
+static void osd_init(struct ceph_osd *osd)
{
- struct ceph_osd_request *req, *nreq;
- LIST_HEAD(resend);
- LIST_HEAD(resend_linger);
- int err;
-
- dout("%s osd%d\n", __func__, osd->o_osd);
- err = __reset_osd(osdc, osd);
- if (err)
- return;
-
- /*
- * Build up a list of requests to resend by traversing the
- * osd's list of requests. Requests for a given object are
- * sent in tid order, and that is also the order they're
- * kept on this list. Therefore all requests that are in
- * flight will be found first, followed by all requests that
- * have not yet been sent. And to resend requests while
- * preserving this order we will want to put any sent
- * requests back on the front of the osd client's unsent
- * list.
- *
- * So we build a separate ordered list of already-sent
- * requests for the affected osd and splice it onto the
- * front of the osd client's unsent list. Once we've seen a
- * request that has not yet been sent we're done. Those
- * requests are already sitting right where they belong.
- */
- list_for_each_entry(req, &osd->o_requests, r_osd_item) {
- if (!req->r_sent)
- break;
-
- if (!req->r_linger) {
- dout("%s requeueing %p tid %llu\n", __func__, req,
- req->r_tid);
- list_move_tail(&req->r_req_lru_item, &resend);
- req->r_flags |= CEPH_OSD_FLAG_RETRY;
- } else {
- list_move_tail(&req->r_req_lru_item, &resend_linger);
- }
- }
- list_splice(&resend, &osdc->req_unsent);
-
- /*
- * Both registered and not yet registered linger requests are
- * enqueued with a new tid on the same OSD. We add/move them
- * to req_unsent/o_requests at the end to keep things in tid
- * order.
- */
- list_for_each_entry_safe(req, nreq, &osd->o_linger_requests,
- r_linger_osd_item) {
- WARN_ON(!list_empty(&req->r_req_lru_item));
- __kick_linger_request(req);
- }
-
- list_for_each_entry_safe(req, nreq, &resend_linger, r_req_lru_item)
- __kick_linger_request(req);
+ atomic_set(&osd->o_ref, 1);
+ RB_CLEAR_NODE(&osd->o_node);
+ osd->o_requests = RB_ROOT;
+ osd->o_linger_requests = RB_ROOT;
+ INIT_LIST_HEAD(&osd->o_osd_lru);
+ INIT_LIST_HEAD(&osd->o_keepalive_item);
+ osd->o_incarnation = 1;
+ mutex_init(&osd->lock);
}
-/*
- * If the osd connection drops, we need to resubmit all requests.
- */
-static void osd_reset(struct ceph_connection *con)
+static void osd_cleanup(struct ceph_osd *osd)
{
- struct ceph_osd *osd = con->private;
- struct ceph_osd_client *osdc;
-
- if (!osd)
- return;
- dout("osd_reset osd%d\n", osd->o_osd);
- osdc = osd->o_osdc;
- down_read(&osdc->map_sem);
- mutex_lock(&osdc->request_mutex);
- __kick_osd_requests(osdc, osd);
- __send_queued(osdc);
- mutex_unlock(&osdc->request_mutex);
- up_read(&osdc->map_sem);
+ WARN_ON(!RB_EMPTY_NODE(&osd->o_node));
+ WARN_ON(!RB_EMPTY_ROOT(&osd->o_requests));
+ WARN_ON(!RB_EMPTY_ROOT(&osd->o_linger_requests));
+ WARN_ON(!list_empty(&osd->o_osd_lru));
+ WARN_ON(!list_empty(&osd->o_keepalive_item));
+
+ if (osd->o_auth.authorizer) {
+ WARN_ON(osd_homeless(osd));
+ ceph_auth_destroy_authorizer(osd->o_auth.authorizer);
+ }
}
/*
@@ -1051,22 +1021,15 @@ static struct ceph_osd *create_osd(struct ceph_osd_client *osdc, int onum)
{
struct ceph_osd *osd;
- osd = kzalloc(sizeof(*osd), GFP_NOFS);
- if (!osd)
- return NULL;
+ WARN_ON(onum == CEPH_HOMELESS_OSD);
- atomic_set(&osd->o_ref, 1);
+ osd = kzalloc(sizeof(*osd), GFP_NOIO | __GFP_NOFAIL);
+ osd_init(osd);
osd->o_osdc = osdc;
osd->o_osd = onum;
- RB_CLEAR_NODE(&osd->o_node);
- INIT_LIST_HEAD(&osd->o_requests);
- INIT_LIST_HEAD(&osd->o_linger_requests);
- INIT_LIST_HEAD(&osd->o_osd_lru);
- osd->o_incarnation = 1;
ceph_con_init(&osd->o_con, osd, &osd_con_ops, &osdc->client->msgr);
- INIT_LIST_HEAD(&osd->o_keepalive_item);
return osd;
}
@@ -1087,114 +1050,115 @@ static void put_osd(struct ceph_osd *osd)
dout("put_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref),
atomic_read(&osd->o_ref) - 1);
if (atomic_dec_and_test(&osd->o_ref)) {
- if (osd->o_auth.authorizer)
- ceph_auth_destroy_authorizer(osd->o_auth.authorizer);
+ osd_cleanup(osd);
kfree(osd);
}
}
-/*
- * remove an osd from our map
- */
-static void __remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
-{
- dout("%s %p osd%d\n", __func__, osd, osd->o_osd);
- WARN_ON(!list_empty(&osd->o_requests));
- WARN_ON(!list_empty(&osd->o_linger_requests));
-
- list_del_init(&osd->o_osd_lru);
- rb_erase(&osd->o_node, &osdc->osds);
- RB_CLEAR_NODE(&osd->o_node);
-}
-
-static void remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
-{
- dout("%s %p osd%d\n", __func__, osd, osd->o_osd);
-
- if (!RB_EMPTY_NODE(&osd->o_node)) {
- ceph_con_close(&osd->o_con);
- __remove_osd(osdc, osd);
- put_osd(osd);
- }
-}
+DEFINE_RB_FUNCS(osd, struct ceph_osd, o_osd, o_node)
-static void remove_all_osds(struct ceph_osd_client *osdc)
+static void __move_osd_to_lru(struct ceph_osd *osd)
{
- dout("%s %p\n", __func__, osdc);
- mutex_lock(&osdc->request_mutex);
- while (!RB_EMPTY_ROOT(&osdc->osds)) {
- struct ceph_osd *osd = rb_entry(rb_first(&osdc->osds),
- struct ceph_osd, o_node);
- remove_osd(osdc, osd);
- }
- mutex_unlock(&osdc->request_mutex);
-}
+ struct ceph_osd_client *osdc = osd->o_osdc;
-static void __move_osd_to_lru(struct ceph_osd_client *osdc,
- struct ceph_osd *osd)
-{
- dout("%s %p\n", __func__, osd);
+ dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd);
BUG_ON(!list_empty(&osd->o_osd_lru));
+ spin_lock(&osdc->osd_lru_lock);
list_add_tail(&osd->o_osd_lru, &osdc->osd_lru);
+ spin_unlock(&osdc->osd_lru_lock);
+
osd->lru_ttl = jiffies + osdc->client->options->osd_idle_ttl;
}
-static void maybe_move_osd_to_lru(struct ceph_osd_client *osdc,
- struct ceph_osd *osd)
+static void maybe_move_osd_to_lru(struct ceph_osd *osd)
{
- dout("%s %p\n", __func__, osd);
-
- if (list_empty(&osd->o_requests) &&
- list_empty(&osd->o_linger_requests))
- __move_osd_to_lru(osdc, osd);
+ if (RB_EMPTY_ROOT(&osd->o_requests) &&
+ RB_EMPTY_ROOT(&osd->o_linger_requests))
+ __move_osd_to_lru(osd);
}
static void __remove_osd_from_lru(struct ceph_osd *osd)
{
- dout("__remove_osd_from_lru %p\n", osd);
+ struct ceph_osd_client *osdc = osd->o_osdc;
+
+ dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd);
+
+ spin_lock(&osdc->osd_lru_lock);
if (!list_empty(&osd->o_osd_lru))
list_del_init(&osd->o_osd_lru);
+ spin_unlock(&osdc->osd_lru_lock);
}
-static void remove_old_osds(struct ceph_osd_client *osdc)
+/*
+ * Close the connection and assign any leftover requests to the
+ * homeless session.
+ */
+static void close_osd(struct ceph_osd *osd)
{
- struct ceph_osd *osd, *nosd;
+ struct ceph_osd_client *osdc = osd->o_osdc;
+ struct rb_node *n;
- dout("__remove_old_osds %p\n", osdc);
- mutex_lock(&osdc->request_mutex);
- list_for_each_entry_safe(osd, nosd, &osdc->osd_lru, o_osd_lru) {
- if (time_before(jiffies, osd->lru_ttl))
- break;
- remove_osd(osdc, osd);
+ verify_osdc_wrlocked(osdc);
+ dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd);
+
+ ceph_con_close(&osd->o_con);
+
+ for (n = rb_first(&osd->o_requests); n; ) {
+ struct ceph_osd_request *req =
+ rb_entry(n, struct ceph_osd_request, r_node);
+
+ n = rb_next(n); /* unlink_request() */
+
+ dout(" reassigning req %p tid %llu\n", req, req->r_tid);
+ unlink_request(osd, req);
+ link_request(&osdc->homeless_osd, req);
+ }
+ for (n = rb_first(&osd->o_linger_requests); n; ) {
+ struct ceph_osd_linger_request *lreq =
+ rb_entry(n, struct ceph_osd_linger_request, node);
+
+ n = rb_next(n); /* unlink_linger() */
+
+ dout(" reassigning lreq %p linger_id %llu\n", lreq,
+ lreq->linger_id);
+ unlink_linger(osd, lreq);
+ link_linger(&osdc->homeless_osd, lreq);
}
- mutex_unlock(&osdc->request_mutex);
+
+ __remove_osd_from_lru(osd);
+ erase_osd(&osdc->osds, osd);
+ put_osd(osd);
}
/*
* reset osd connect
*/
-static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
+static int reopen_osd(struct ceph_osd *osd)
{
struct ceph_entity_addr *peer_addr;
- dout("__reset_osd %p osd%d\n", osd, osd->o_osd);
- if (list_empty(&osd->o_requests) &&
- list_empty(&osd->o_linger_requests)) {
- remove_osd(osdc, osd);
+ dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd);
+
+ if (RB_EMPTY_ROOT(&osd->o_requests) &&
+ RB_EMPTY_ROOT(&osd->o_linger_requests)) {
+ close_osd(osd);
return -ENODEV;
}
- peer_addr = &osdc->osdmap->osd_addr[osd->o_osd];
+ peer_addr = &osd->o_osdc->osdmap->osd_addr[osd->o_osd];
if (!memcmp(peer_addr, &osd->o_con.peer_addr, sizeof (*peer_addr)) &&
!ceph_con_opened(&osd->o_con)) {
- struct ceph_osd_request *req;
+ struct rb_node *n;
dout("osd addr hasn't changed and connection never opened, "
"letting msgr retry\n");
/* touch each r_stamp for handle_timeout()'s benfit */
- list_for_each_entry(req, &osd->o_requests, r_osd_item)
+ for (n = rb_first(&osd->o_requests); n; n = rb_next(n)) {
+ struct ceph_osd_request *req =
+ rb_entry(n, struct ceph_osd_request, r_node);
req->r_stamp = jiffies;
+ }
return -EAGAIN;
}
@@ -1206,455 +1170,1369 @@ static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
return 0;
}
-static void __insert_osd(struct ceph_osd_client *osdc, struct ceph_osd *new)
+static struct ceph_osd *lookup_create_osd(struct ceph_osd_client *osdc, int o,
+ bool wrlocked)
{
- struct rb_node **p = &osdc->osds.rb_node;
- struct rb_node *parent = NULL;
- struct ceph_osd *osd = NULL;
+ struct ceph_osd *osd;
- dout("__insert_osd %p osd%d\n", new, new->o_osd);
- while (*p) {
- parent = *p;
- osd = rb_entry(parent, struct ceph_osd, o_node);
- if (new->o_osd < osd->o_osd)
- p = &(*p)->rb_left;
- else if (new->o_osd > osd->o_osd)
- p = &(*p)->rb_right;
- else
- BUG();
+ if (wrlocked)
+ verify_osdc_wrlocked(osdc);
+ else
+ verify_osdc_locked(osdc);
+
+ if (o != CEPH_HOMELESS_OSD)
+ osd = lookup_osd(&osdc->osds, o);
+ else
+ osd = &osdc->homeless_osd;
+ if (!osd) {
+ if (!wrlocked)
+ return ERR_PTR(-EAGAIN);
+
+ osd = create_osd(osdc, o);
+ insert_osd(&osdc->osds, osd);
+ ceph_con_open(&osd->o_con, CEPH_ENTITY_TYPE_OSD, osd->o_osd,
+ &osdc->osdmap->osd_addr[osd->o_osd]);
}
- rb_link_node(&new->o_node, parent, p);
- rb_insert_color(&new->o_node, &osdc->osds);
+ dout("%s osdc %p osd%d -> osd %p\n", __func__, osdc, o, osd);
+ return osd;
}
-static struct ceph_osd *__lookup_osd(struct ceph_osd_client *osdc, int o)
+/*
+ * Create request <-> OSD session relation.
+ *
+ * @req has to be assigned a tid, @osd may be homeless.
+ */
+static void link_request(struct ceph_osd *osd, struct ceph_osd_request *req)
{
- struct ceph_osd *osd;
- struct rb_node *n = osdc->osds.rb_node;
-
- while (n) {
- osd = rb_entry(n, struct ceph_osd, o_node);
- if (o < osd->o_osd)
- n = n->rb_left;
- else if (o > osd->o_osd)
- n = n->rb_right;
- else
- return osd;
- }
- return NULL;
+ verify_osd_locked(osd);
+ WARN_ON(!req->r_tid || req->r_osd);
+ dout("%s osd %p osd%d req %p tid %llu\n", __func__, osd, osd->o_osd,
+ req, req->r_tid);
+
+ if (!osd_homeless(osd))
+ __remove_osd_from_lru(osd);
+ else
+ atomic_inc(&osd->o_osdc->num_homeless);
+
+ get_osd(osd);
+ insert_request(&osd->o_requests, req);
+ req->r_osd = osd;
}
-static void __schedule_osd_timeout(struct ceph_osd_client *osdc)
+static void unlink_request(struct ceph_osd *osd, struct ceph_osd_request *req)
{
- schedule_delayed_work(&osdc->timeout_work,
- osdc->client->options->osd_keepalive_timeout);
+ verify_osd_locked(osd);
+ WARN_ON(req->r_osd != osd);
+ dout("%s osd %p osd%d req %p tid %llu\n", __func__, osd, osd->o_osd,
+ req, req->r_tid);
+
+ req->r_osd = NULL;
+ erase_request(&osd->o_requests, req);
+ put_osd(osd);
+
+ if (!osd_homeless(osd))
+ maybe_move_osd_to_lru(osd);
+ else
+ atomic_dec(&osd->o_osdc->num_homeless);
}
-static void __cancel_osd_timeout(struct ceph_osd_client *osdc)
+static bool __pool_full(struct ceph_pg_pool_info *pi)
{
- cancel_delayed_work(&osdc->timeout_work);
+ return pi->flags & CEPH_POOL_FLAG_FULL;
}
-/*
- * Register request, assign tid. If this is the first request, set up
- * the timeout event.
- */
-static void __register_request(struct ceph_osd_client *osdc,
- struct ceph_osd_request *req)
+static bool have_pool_full(struct ceph_osd_client *osdc)
{
- req->r_tid = ++osdc->last_tid;
- req->r_request->hdr.tid = cpu_to_le64(req->r_tid);
- dout("__register_request %p tid %lld\n", req, req->r_tid);
- __insert_request(osdc, req);
- ceph_osdc_get_request(req);
- osdc->num_requests++;
- if (osdc->num_requests == 1) {
- dout(" first request, scheduling timeout\n");
- __schedule_osd_timeout(osdc);
+ struct rb_node *n;
+
+ for (n = rb_first(&osdc->osdmap->pg_pools); n; n = rb_next(n)) {
+ struct ceph_pg_pool_info *pi =
+ rb_entry(n, struct ceph_pg_pool_info, node);
+
+ if (__pool_full(pi))
+ return true;
}
+
+ return false;
+}
+
+static bool pool_full(struct ceph_osd_client *osdc, s64 pool_id)
+{
+ struct ceph_pg_pool_info *pi;
+
+ pi = ceph_pg_pool_by_id(osdc->osdmap, pool_id);
+ if (!pi)
+ return false;
+
+ return __pool_full(pi);
}
/*
- * called under osdc->request_mutex
+ * Returns whether a request should be blocked from being sent
+ * based on the current osdmap and osd_client settings.
*/
-static void __unregister_request(struct ceph_osd_client *osdc,
- struct ceph_osd_request *req)
+static bool target_should_be_paused(struct ceph_osd_client *osdc,
+ const struct ceph_osd_request_target *t,
+ struct ceph_pg_pool_info *pi)
{
- if (RB_EMPTY_NODE(&req->r_node)) {
- dout("__unregister_request %p tid %lld not registered\n",
- req, req->r_tid);
- return;
+ bool pauserd = ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSERD);
+ bool pausewr = ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR) ||
+ ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) ||
+ __pool_full(pi);
+
+ WARN_ON(pi->id != t->base_oloc.pool);
+ return (t->flags & CEPH_OSD_FLAG_READ && pauserd) ||
+ (t->flags & CEPH_OSD_FLAG_WRITE && pausewr);
+}
+
+enum calc_target_result {
+ CALC_TARGET_NO_ACTION = 0,
+ CALC_TARGET_NEED_RESEND,
+ CALC_TARGET_POOL_DNE,
+};
+
+static enum calc_target_result calc_target(struct ceph_osd_client *osdc,
+ struct ceph_osd_request_target *t,
+ u32 *last_force_resend,
+ bool any_change)
+{
+ struct ceph_pg_pool_info *pi;
+ struct ceph_pg pgid, last_pgid;
+ struct ceph_osds up, acting;
+ bool force_resend = false;
+ bool need_check_tiering = false;
+ bool need_resend = false;
+ bool sort_bitwise = ceph_osdmap_flag(osdc, CEPH_OSDMAP_SORTBITWISE);
+ enum calc_target_result ct_res;
+ int ret;
+
+ pi = ceph_pg_pool_by_id(osdc->osdmap, t->base_oloc.pool);
+ if (!pi) {
+ t->osd = CEPH_HOMELESS_OSD;
+ ct_res = CALC_TARGET_POOL_DNE;
+ goto out;
}
- dout("__unregister_request %p tid %lld\n", req, req->r_tid);
- rb_erase(&req->r_node, &osdc->requests);
- RB_CLEAR_NODE(&req->r_node);
- osdc->num_requests--;
+ if (osdc->osdmap->epoch == pi->last_force_request_resend) {
+ if (last_force_resend &&
+ *last_force_resend < pi->last_force_request_resend) {
+ *last_force_resend = pi->last_force_request_resend;
+ force_resend = true;
+ } else if (!last_force_resend) {
+ force_resend = true;
+ }
+ }
+ if (ceph_oid_empty(&t->target_oid) || force_resend) {
+ ceph_oid_copy(&t->target_oid, &t->base_oid);
+ need_check_tiering = true;
+ }
+ if (ceph_oloc_empty(&t->target_oloc) || force_resend) {
+ ceph_oloc_copy(&t->target_oloc, &t->base_oloc);
+ need_check_tiering = true;
+ }
- if (req->r_osd) {
- /* make sure the original request isn't in flight. */
- ceph_msg_revoke(req->r_request);
+ if (need_check_tiering &&
+ (t->flags & CEPH_OSD_FLAG_IGNORE_OVERLAY) == 0) {
+ if (t->flags & CEPH_OSD_FLAG_READ && pi->read_tier >= 0)
+ t->target_oloc.pool = pi->read_tier;
+ if (t->flags & CEPH_OSD_FLAG_WRITE && pi->write_tier >= 0)
+ t->target_oloc.pool = pi->write_tier;
+ }
- list_del_init(&req->r_osd_item);
- maybe_move_osd_to_lru(osdc, req->r_osd);
- if (list_empty(&req->r_linger_osd_item))
- req->r_osd = NULL;
+ ret = ceph_object_locator_to_pg(osdc->osdmap, &t->target_oid,
+ &t->target_oloc, &pgid);
+ if (ret) {
+ WARN_ON(ret != -ENOENT);
+ t->osd = CEPH_HOMELESS_OSD;
+ ct_res = CALC_TARGET_POOL_DNE;
+ goto out;
+ }
+ last_pgid.pool = pgid.pool;
+ last_pgid.seed = ceph_stable_mod(pgid.seed, t->pg_num, t->pg_num_mask);
+
+ ceph_pg_to_up_acting_osds(osdc->osdmap, &pgid, &up, &acting);
+ if (any_change &&
+ ceph_is_new_interval(&t->acting,
+ &acting,
+ &t->up,
+ &up,
+ t->size,
+ pi->size,
+ t->min_size,
+ pi->min_size,
+ t->pg_num,
+ pi->pg_num,
+ t->sort_bitwise,
+ sort_bitwise,
+ &last_pgid))
+ force_resend = true;
+
+ if (t->paused && !target_should_be_paused(osdc, t, pi)) {
+ t->paused = false;
+ need_resend = true;
}
- list_del_init(&req->r_req_lru_item);
- ceph_osdc_put_request(req);
+ if (ceph_pg_compare(&t->pgid, &pgid) ||
+ ceph_osds_changed(&t->acting, &acting, any_change) ||
+ force_resend) {
+ t->pgid = pgid; /* struct */
+ ceph_osds_copy(&t->acting, &acting);
+ ceph_osds_copy(&t->up, &up);
+ t->size = pi->size;
+ t->min_size = pi->min_size;
+ t->pg_num = pi->pg_num;
+ t->pg_num_mask = pi->pg_num_mask;
+ t->sort_bitwise = sort_bitwise;
+
+ t->osd = acting.primary;
+ need_resend = true;
+ }
+
+ ct_res = need_resend ? CALC_TARGET_NEED_RESEND : CALC_TARGET_NO_ACTION;
+out:
+ dout("%s t %p -> ct_res %d osd %d\n", __func__, t, ct_res, t->osd);
+ return ct_res;
+}
+
+static void setup_request_data(struct ceph_osd_request *req,
+ struct ceph_msg *msg)
+{
+ u32 data_len = 0;
+ int i;
+
+ if (!list_empty(&msg->data))
+ return;
+
+ WARN_ON(msg->data_length);
+ for (i = 0; i < req->r_num_ops; i++) {
+ struct ceph_osd_req_op *op = &req->r_ops[i];
+
+ switch (op->op) {
+ /* request */
+ case CEPH_OSD_OP_WRITE:
+ case CEPH_OSD_OP_WRITEFULL:
+ WARN_ON(op->indata_len != op->extent.length);
+ ceph_osdc_msg_data_add(msg, &op->extent.osd_data);
+ break;
+ case CEPH_OSD_OP_SETXATTR:
+ case CEPH_OSD_OP_CMPXATTR:
+ WARN_ON(op->indata_len != op->xattr.name_len +
+ op->xattr.value_len);
+ ceph_osdc_msg_data_add(msg, &op->xattr.osd_data);
+ break;
+ case CEPH_OSD_OP_NOTIFY_ACK:
+ ceph_osdc_msg_data_add(msg,
+ &op->notify_ack.request_data);
+ break;
+
+ /* reply */
+ case CEPH_OSD_OP_STAT:
+ ceph_osdc_msg_data_add(req->r_reply,
+ &op->raw_data_in);
+ break;
+ case CEPH_OSD_OP_READ:
+ ceph_osdc_msg_data_add(req->r_reply,
+ &op->extent.osd_data);
+ break;
+
+ /* both */
+ case CEPH_OSD_OP_CALL:
+ WARN_ON(op->indata_len != op->cls.class_len +
+ op->cls.method_len +
+ op->cls.indata_len);
+ ceph_osdc_msg_data_add(msg, &op->cls.request_info);
+ /* optional, can be NONE */
+ ceph_osdc_msg_data_add(msg, &op->cls.request_data);
+ /* optional, can be NONE */
+ ceph_osdc_msg_data_add(req->r_reply,
+ &op->cls.response_data);
+ break;
+ case CEPH_OSD_OP_NOTIFY:
+ ceph_osdc_msg_data_add(msg,
+ &op->notify.request_data);
+ ceph_osdc_msg_data_add(req->r_reply,
+ &op->notify.response_data);
+ break;
+ }
+
+ data_len += op->indata_len;
+ }
+
+ WARN_ON(data_len != msg->data_length);
+}
+
+static void encode_request(struct ceph_osd_request *req, struct ceph_msg *msg)
+{
+ void *p = msg->front.iov_base;
+ void *const end = p + msg->front_alloc_len;
+ u32 data_len = 0;
+ int i;
+
+ if (req->r_flags & CEPH_OSD_FLAG_WRITE) {
+ /* snapshots aren't writeable */
+ WARN_ON(req->r_snapid != CEPH_NOSNAP);
+ } else {
+ WARN_ON(req->r_mtime.tv_sec || req->r_mtime.tv_nsec ||
+ req->r_data_offset || req->r_snapc);
+ }
+
+ setup_request_data(req, msg);
+
+ ceph_encode_32(&p, 1); /* client_inc, always 1 */
+ ceph_encode_32(&p, req->r_osdc->osdmap->epoch);
+ ceph_encode_32(&p, req->r_flags);
+ ceph_encode_timespec(p, &req->r_mtime);
+ p += sizeof(struct ceph_timespec);
+ /* aka reassert_version */
+ memcpy(p, &req->r_replay_version, sizeof(req->r_replay_version));
+ p += sizeof(req->r_replay_version);
+
+ /* oloc */
+ ceph_encode_8(&p, 4);
+ ceph_encode_8(&p, 4);
+ ceph_encode_32(&p, 8 + 4 + 4);
+ ceph_encode_64(&p, req->r_t.target_oloc.pool);
+ ceph_encode_32(&p, -1); /* preferred */
+ ceph_encode_32(&p, 0); /* key len */
+
+ /* pgid */
+ ceph_encode_8(&p, 1);
+ ceph_encode_64(&p, req->r_t.pgid.pool);
+ ceph_encode_32(&p, req->r_t.pgid.seed);
+ ceph_encode_32(&p, -1); /* preferred */
+
+ /* oid */
+ ceph_encode_32(&p, req->r_t.target_oid.name_len);
+ memcpy(p, req->r_t.target_oid.name, req->r_t.target_oid.name_len);
+ p += req->r_t.target_oid.name_len;
- if (osdc->num_requests == 0) {
- dout(" no requests, canceling timeout\n");
- __cancel_osd_timeout(osdc);
+ /* ops, can imply data */
+ ceph_encode_16(&p, req->r_num_ops);
+ for (i = 0; i < req->r_num_ops; i++) {
+ data_len += osd_req_encode_op(p, &req->r_ops[i]);
+ p += sizeof(struct ceph_osd_op);
}
+
+ ceph_encode_64(&p, req->r_snapid); /* snapid */
+ if (req->r_snapc) {
+ ceph_encode_64(&p, req->r_snapc->seq);
+ ceph_encode_32(&p, req->r_snapc->num_snaps);
+ for (i = 0; i < req->r_snapc->num_snaps; i++)
+ ceph_encode_64(&p, req->r_snapc->snaps[i]);
+ } else {
+ ceph_encode_64(&p, 0); /* snap_seq */
+ ceph_encode_32(&p, 0); /* snaps len */
+ }
+
+ ceph_encode_32(&p, req->r_attempts); /* retry_attempt */
+
+ BUG_ON(p > end);
+ msg->front.iov_len = p - msg->front.iov_base;
+ msg->hdr.version = cpu_to_le16(4); /* MOSDOp v4 */
+ msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
+ msg->hdr.data_len = cpu_to_le32(data_len);
+ /*
+ * The header "data_off" is a hint to the receiver allowing it
+ * to align received data into its buffers such that there's no
+ * need to re-copy it before writing it to disk (direct I/O).
+ */
+ msg->hdr.data_off = cpu_to_le16(req->r_data_offset);
+
+ dout("%s req %p oid %s oid_len %d front %zu data %u\n", __func__,
+ req, req->r_t.target_oid.name, req->r_t.target_oid.name_len,
+ msg->front.iov_len, data_len);
}
/*
- * Cancel a previously queued request message
+ * @req has to be assigned a tid and registered.
*/
-static void __cancel_request(struct ceph_osd_request *req)
+static void send_request(struct ceph_osd_request *req)
{
- if (req->r_sent && req->r_osd) {
+ struct ceph_osd *osd = req->r_osd;
+
+ verify_osd_locked(osd);
+ WARN_ON(osd->o_osd != req->r_t.osd);
+
+ /*
+ * We may have a previously queued request message hanging
+ * around. Cancel it to avoid corrupting the msgr.
+ */
+ if (req->r_sent)
ceph_msg_revoke(req->r_request);
- req->r_sent = 0;
+
+ req->r_flags |= CEPH_OSD_FLAG_KNOWN_REDIR;
+ if (req->r_attempts)
+ req->r_flags |= CEPH_OSD_FLAG_RETRY;
+ else
+ WARN_ON(req->r_flags & CEPH_OSD_FLAG_RETRY);
+
+ encode_request(req, req->r_request);
+
+ dout("%s req %p tid %llu to pg %llu.%x osd%d flags 0x%x attempt %d\n",
+ __func__, req, req->r_tid, req->r_t.pgid.pool, req->r_t.pgid.seed,
+ req->r_t.osd, req->r_flags, req->r_attempts);
+
+ req->r_t.paused = false;
+ req->r_stamp = jiffies;
+ req->r_attempts++;
+
+ req->r_sent = osd->o_incarnation;
+ req->r_request->hdr.tid = cpu_to_le64(req->r_tid);
+ ceph_con_send(&osd->o_con, ceph_msg_get(req->r_request));
+}
+
+static void maybe_request_map(struct ceph_osd_client *osdc)
+{
+ bool continuous = false;
+
+ verify_osdc_locked(osdc);
+ WARN_ON(!osdc->osdmap->epoch);
+
+ if (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) ||
+ ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSERD) ||
+ ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR)) {
+ dout("%s osdc %p continuous\n", __func__, osdc);
+ continuous = true;
+ } else {
+ dout("%s osdc %p onetime\n", __func__, osdc);
}
+
+ if (ceph_monc_want_map(&osdc->client->monc, CEPH_SUB_OSDMAP,
+ osdc->osdmap->epoch + 1, continuous))
+ ceph_monc_renew_subs(&osdc->client->monc);
}
-static void __register_linger_request(struct ceph_osd_client *osdc,
- struct ceph_osd_request *req)
+static void send_map_check(struct ceph_osd_request *req);
+
+static void __submit_request(struct ceph_osd_request *req, bool wrlocked)
{
- dout("%s %p tid %llu\n", __func__, req, req->r_tid);
- WARN_ON(!req->r_linger);
+ struct ceph_osd_client *osdc = req->r_osdc;
+ struct ceph_osd *osd;
+ enum calc_target_result ct_res;
+ bool need_send = false;
+ bool promoted = false;
+
+ WARN_ON(req->r_tid || req->r_got_reply);
+ dout("%s req %p wrlocked %d\n", __func__, req, wrlocked);
+
+again:
+ ct_res = calc_target(osdc, &req->r_t, &req->r_last_force_resend, false);
+ if (ct_res == CALC_TARGET_POOL_DNE && !wrlocked)
+ goto promote;
+
+ osd = lookup_create_osd(osdc, req->r_t.osd, wrlocked);
+ if (IS_ERR(osd)) {
+ WARN_ON(PTR_ERR(osd) != -EAGAIN || wrlocked);
+ goto promote;
+ }
+
+ if ((req->r_flags & CEPH_OSD_FLAG_WRITE) &&
+ ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR)) {
+ dout("req %p pausewr\n", req);
+ req->r_t.paused = true;
+ maybe_request_map(osdc);
+ } else if ((req->r_flags & CEPH_OSD_FLAG_READ) &&
+ ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSERD)) {
+ dout("req %p pauserd\n", req);
+ req->r_t.paused = true;
+ maybe_request_map(osdc);
+ } else if ((req->r_flags & CEPH_OSD_FLAG_WRITE) &&
+ !(req->r_flags & (CEPH_OSD_FLAG_FULL_TRY |
+ CEPH_OSD_FLAG_FULL_FORCE)) &&
+ (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) ||
+ pool_full(osdc, req->r_t.base_oloc.pool))) {
+ dout("req %p full/pool_full\n", req);
+ pr_warn_ratelimited("FULL or reached pool quota\n");
+ req->r_t.paused = true;
+ maybe_request_map(osdc);
+ } else if (!osd_homeless(osd)) {
+ need_send = true;
+ } else {
+ maybe_request_map(osdc);
+ }
+
+ mutex_lock(&osd->lock);
+ /*
+ * Assign the tid atomically with send_request() to protect
+ * multiple writes to the same object from racing with each
+ * other, resulting in out of order ops on the OSDs.
+ */
+ req->r_tid = atomic64_inc_return(&osdc->last_tid);
+ link_request(osd, req);
+ if (need_send)
+ send_request(req);
+ mutex_unlock(&osd->lock);
+ if (ct_res == CALC_TARGET_POOL_DNE)
+ send_map_check(req);
+
+ if (promoted)
+ downgrade_write(&osdc->lock);
+ return;
+
+promote:
+ up_read(&osdc->lock);
+ down_write(&osdc->lock);
+ wrlocked = true;
+ promoted = true;
+ goto again;
+}
+
+static void account_request(struct ceph_osd_request *req)
+{
+ unsigned int mask = CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK;
+
+ if (req->r_flags & CEPH_OSD_FLAG_READ) {
+ WARN_ON(req->r_flags & mask);
+ req->r_flags |= CEPH_OSD_FLAG_ACK;
+ } else if (req->r_flags & CEPH_OSD_FLAG_WRITE)
+ WARN_ON(!(req->r_flags & mask));
+ else
+ WARN_ON(1);
+
+ WARN_ON(req->r_unsafe_callback && (req->r_flags & mask) != mask);
+ atomic_inc(&req->r_osdc->num_requests);
+}
+
+static void submit_request(struct ceph_osd_request *req, bool wrlocked)
+{
ceph_osdc_get_request(req);
- list_add_tail(&req->r_linger_item, &osdc->req_linger);
- if (req->r_osd)
- list_add_tail(&req->r_linger_osd_item,
- &req->r_osd->o_linger_requests);
+ account_request(req);
+ __submit_request(req, wrlocked);
}
-static void __unregister_linger_request(struct ceph_osd_client *osdc,
- struct ceph_osd_request *req)
+static void __finish_request(struct ceph_osd_request *req)
{
- WARN_ON(!req->r_linger);
+ struct ceph_osd_client *osdc = req->r_osdc;
+ struct ceph_osd *osd = req->r_osd;
- if (list_empty(&req->r_linger_item)) {
- dout("%s %p tid %llu not registered\n", __func__, req,
- req->r_tid);
+ verify_osd_locked(osd);
+ dout("%s req %p tid %llu\n", __func__, req, req->r_tid);
+
+ WARN_ON(lookup_request_mc(&osdc->map_checks, req->r_tid));
+ unlink_request(osd, req);
+ atomic_dec(&osdc->num_requests);
+
+ /*
+ * If an OSD has failed or returned and a request has been sent
+ * twice, it's possible to get a reply and end up here while the
+ * request message is queued for delivery. We will ignore the
+ * reply, so not a big deal, but better to try and catch it.
+ */
+ ceph_msg_revoke(req->r_request);
+ ceph_msg_revoke_incoming(req->r_reply);
+}
+
+static void finish_request(struct ceph_osd_request *req)
+{
+ __finish_request(req);
+ ceph_osdc_put_request(req);
+}
+
+static void __complete_request(struct ceph_osd_request *req)
+{
+ if (req->r_callback)
+ req->r_callback(req);
+ else
+ complete_all(&req->r_completion);
+}
+
+/*
+ * Note that this is open-coded in handle_reply(), which has to deal
+ * with ack vs commit, dup acks, etc.
+ */
+static void complete_request(struct ceph_osd_request *req, int err)
+{
+ dout("%s req %p tid %llu err %d\n", __func__, req, req->r_tid, err);
+
+ req->r_result = err;
+ __finish_request(req);
+ __complete_request(req);
+ complete_all(&req->r_safe_completion);
+ ceph_osdc_put_request(req);
+}
+
+static void cancel_map_check(struct ceph_osd_request *req)
+{
+ struct ceph_osd_client *osdc = req->r_osdc;
+ struct ceph_osd_request *lookup_req;
+
+ verify_osdc_wrlocked(osdc);
+
+ lookup_req = lookup_request_mc(&osdc->map_checks, req->r_tid);
+ if (!lookup_req)
return;
+
+ WARN_ON(lookup_req != req);
+ erase_request_mc(&osdc->map_checks, req);
+ ceph_osdc_put_request(req);
+}
+
+static void cancel_request(struct ceph_osd_request *req)
+{
+ dout("%s req %p tid %llu\n", __func__, req, req->r_tid);
+
+ cancel_map_check(req);
+ finish_request(req);
+}
+
+static void check_pool_dne(struct ceph_osd_request *req)
+{
+ struct ceph_osd_client *osdc = req->r_osdc;
+ struct ceph_osdmap *map = osdc->osdmap;
+
+ verify_osdc_wrlocked(osdc);
+ WARN_ON(!map->epoch);
+
+ if (req->r_attempts) {
+ /*
+ * We sent a request earlier, which means that
+ * previously the pool existed, and now it does not
+ * (i.e., it was deleted).
+ */
+ req->r_map_dne_bound = map->epoch;
+ dout("%s req %p tid %llu pool disappeared\n", __func__, req,
+ req->r_tid);
+ } else {
+ dout("%s req %p tid %llu map_dne_bound %u have %u\n", __func__,
+ req, req->r_tid, req->r_map_dne_bound, map->epoch);
+ }
+
+ if (req->r_map_dne_bound) {
+ if (map->epoch >= req->r_map_dne_bound) {
+ /* we had a new enough map */
+ pr_info_ratelimited("tid %llu pool does not exist\n",
+ req->r_tid);
+ complete_request(req, -ENOENT);
+ }
+ } else {
+ send_map_check(req);
}
+}
+
+static void map_check_cb(struct ceph_mon_generic_request *greq)
+{
+ struct ceph_osd_client *osdc = &greq->monc->client->osdc;
+ struct ceph_osd_request *req;
+ u64 tid = greq->private_data;
- dout("%s %p tid %llu\n", __func__, req, req->r_tid);
- list_del_init(&req->r_linger_item);
+ WARN_ON(greq->result || !greq->u.newest);
- if (req->r_osd) {
- list_del_init(&req->r_linger_osd_item);
- maybe_move_osd_to_lru(osdc, req->r_osd);
- if (list_empty(&req->r_osd_item))
- req->r_osd = NULL;
+ down_write(&osdc->lock);
+ req = lookup_request_mc(&osdc->map_checks, tid);
+ if (!req) {
+ dout("%s tid %llu dne\n", __func__, tid);
+ goto out_unlock;
}
+
+ dout("%s req %p tid %llu map_dne_bound %u newest %llu\n", __func__,
+ req, req->r_tid, req->r_map_dne_bound, greq->u.newest);
+ if (!req->r_map_dne_bound)
+ req->r_map_dne_bound = greq->u.newest;
+ erase_request_mc(&osdc->map_checks, req);
+ check_pool_dne(req);
+
ceph_osdc_put_request(req);
+out_unlock:
+ up_write(&osdc->lock);
}
-void ceph_osdc_set_request_linger(struct ceph_osd_client *osdc,
- struct ceph_osd_request *req)
+static void send_map_check(struct ceph_osd_request *req)
{
- if (!req->r_linger) {
- dout("set_request_linger %p\n", req);
- req->r_linger = 1;
+ struct ceph_osd_client *osdc = req->r_osdc;
+ struct ceph_osd_request *lookup_req;
+ int ret;
+
+ verify_osdc_wrlocked(osdc);
+
+ lookup_req = lookup_request_mc(&osdc->map_checks, req->r_tid);
+ if (lookup_req) {
+ WARN_ON(lookup_req != req);
+ return;
}
+
+ ceph_osdc_get_request(req);
+ insert_request_mc(&osdc->map_checks, req);
+ ret = ceph_monc_get_version_async(&osdc->client->monc, "osdmap",
+ map_check_cb, req->r_tid);
+ WARN_ON(ret);
}
-EXPORT_SYMBOL(ceph_osdc_set_request_linger);
/*
- * Returns whether a request should be blocked from being sent
- * based on the current osdmap and osd_client settings.
- *
- * Caller should hold map_sem for read.
+ * lingering requests, watch/notify v2 infrastructure
*/
-static bool __req_should_be_paused(struct ceph_osd_client *osdc,
- struct ceph_osd_request *req)
+static void linger_release(struct kref *kref)
+{
+ struct ceph_osd_linger_request *lreq =
+ container_of(kref, struct ceph_osd_linger_request, kref);
+
+ dout("%s lreq %p reg_req %p ping_req %p\n", __func__, lreq,
+ lreq->reg_req, lreq->ping_req);
+ WARN_ON(!RB_EMPTY_NODE(&lreq->node));
+ WARN_ON(!RB_EMPTY_NODE(&lreq->osdc_node));
+ WARN_ON(!RB_EMPTY_NODE(&lreq->mc_node));
+ WARN_ON(!list_empty(&lreq->scan_item));
+ WARN_ON(!list_empty(&lreq->pending_lworks));
+ WARN_ON(lreq->osd);
+
+ if (lreq->reg_req)
+ ceph_osdc_put_request(lreq->reg_req);
+ if (lreq->ping_req)
+ ceph_osdc_put_request(lreq->ping_req);
+ target_destroy(&lreq->t);
+ kfree(lreq);
+}
+
+static void linger_put(struct ceph_osd_linger_request *lreq)
{
- bool pauserd = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD);
- bool pausewr = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR) ||
- ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL);
- return (req->r_flags & CEPH_OSD_FLAG_READ && pauserd) ||
- (req->r_flags & CEPH_OSD_FLAG_WRITE && pausewr);
+ if (lreq)
+ kref_put(&lreq->kref, linger_release);
+}
+
+static struct ceph_osd_linger_request *
+linger_get(struct ceph_osd_linger_request *lreq)
+{
+ kref_get(&lreq->kref);
+ return lreq;
+}
+
+static struct ceph_osd_linger_request *
+linger_alloc(struct ceph_osd_client *osdc)
+{
+ struct ceph_osd_linger_request *lreq;
+
+ lreq = kzalloc(sizeof(*lreq), GFP_NOIO);
+ if (!lreq)
+ return NULL;
+
+ kref_init(&lreq->kref);
+ mutex_init(&lreq->lock);
+ RB_CLEAR_NODE(&lreq->node);
+ RB_CLEAR_NODE(&lreq->osdc_node);
+ RB_CLEAR_NODE(&lreq->mc_node);
+ INIT_LIST_HEAD(&lreq->scan_item);
+ INIT_LIST_HEAD(&lreq->pending_lworks);
+ init_completion(&lreq->reg_commit_wait);
+ init_completion(&lreq->notify_finish_wait);
+
+ lreq->osdc = osdc;
+ target_init(&lreq->t);
+
+ dout("%s lreq %p\n", __func__, lreq);
+ return lreq;
}
+DEFINE_RB_INSDEL_FUNCS(linger, struct ceph_osd_linger_request, linger_id, node)
+DEFINE_RB_FUNCS(linger_osdc, struct ceph_osd_linger_request, linger_id, osdc_node)
+DEFINE_RB_FUNCS(linger_mc, struct ceph_osd_linger_request, linger_id, mc_node)
+
/*
- * Calculate mapping of a request to a PG. Takes tiering into account.
+ * Create linger request <-> OSD session relation.
+ *
+ * @lreq has to be registered, @osd may be homeless.
*/
-static int __calc_request_pg(struct ceph_osdmap *osdmap,
- struct ceph_osd_request *req,
- struct ceph_pg *pg_out)
+static void link_linger(struct ceph_osd *osd,
+ struct ceph_osd_linger_request *lreq)
{
- bool need_check_tiering;
+ verify_osd_locked(osd);
+ WARN_ON(!lreq->linger_id || lreq->osd);
+ dout("%s osd %p osd%d lreq %p linger_id %llu\n", __func__, osd,
+ osd->o_osd, lreq, lreq->linger_id);
- need_check_tiering = false;
- if (req->r_target_oloc.pool == -1) {
- req->r_target_oloc = req->r_base_oloc; /* struct */
- need_check_tiering = true;
+ if (!osd_homeless(osd))
+ __remove_osd_from_lru(osd);
+ else
+ atomic_inc(&osd->o_osdc->num_homeless);
+
+ get_osd(osd);
+ insert_linger(&osd->o_linger_requests, lreq);
+ lreq->osd = osd;
+}
+
+static void unlink_linger(struct ceph_osd *osd,
+ struct ceph_osd_linger_request *lreq)
+{
+ verify_osd_locked(osd);
+ WARN_ON(lreq->osd != osd);
+ dout("%s osd %p osd%d lreq %p linger_id %llu\n", __func__, osd,
+ osd->o_osd, lreq, lreq->linger_id);
+
+ lreq->osd = NULL;
+ erase_linger(&osd->o_linger_requests, lreq);
+ put_osd(osd);
+
+ if (!osd_homeless(osd))
+ maybe_move_osd_to_lru(osd);
+ else
+ atomic_dec(&osd->o_osdc->num_homeless);
+}
+
+static bool __linger_registered(struct ceph_osd_linger_request *lreq)
+{
+ verify_osdc_locked(lreq->osdc);
+
+ return !RB_EMPTY_NODE(&lreq->osdc_node);
+}
+
+static bool linger_registered(struct ceph_osd_linger_request *lreq)
+{
+ struct ceph_osd_client *osdc = lreq->osdc;
+ bool registered;
+
+ down_read(&osdc->lock);
+ registered = __linger_registered(lreq);
+ up_read(&osdc->lock);
+
+ return registered;
+}
+
+static void linger_register(struct ceph_osd_linger_request *lreq)
+{
+ struct ceph_osd_client *osdc = lreq->osdc;
+
+ verify_osdc_wrlocked(osdc);
+ WARN_ON(lreq->linger_id);
+
+ linger_get(lreq);
+ lreq->linger_id = ++osdc->last_linger_id;
+ insert_linger_osdc(&osdc->linger_requests, lreq);
+}
+
+static void linger_unregister(struct ceph_osd_linger_request *lreq)
+{
+ struct ceph_osd_client *osdc = lreq->osdc;
+
+ verify_osdc_wrlocked(osdc);
+
+ erase_linger_osdc(&osdc->linger_requests, lreq);
+ linger_put(lreq);
+}
+
+static void cancel_linger_request(struct ceph_osd_request *req)
+{
+ struct ceph_osd_linger_request *lreq = req->r_priv;
+
+ WARN_ON(!req->r_linger);
+ cancel_request(req);
+ linger_put(lreq);
+}
+
+struct linger_work {
+ struct work_struct work;
+ struct ceph_osd_linger_request *lreq;
+ struct list_head pending_item;
+ unsigned long queued_stamp;
+
+ union {
+ struct {
+ u64 notify_id;
+ u64 notifier_id;
+ void *payload; /* points into @msg front */
+ size_t payload_len;
+
+ struct ceph_msg *msg; /* for ceph_msg_put() */
+ } notify;
+ struct {
+ int err;
+ } error;
+ };
+};
+
+static struct linger_work *lwork_alloc(struct ceph_osd_linger_request *lreq,
+ work_func_t workfn)
+{
+ struct linger_work *lwork;
+
+ lwork = kzalloc(sizeof(*lwork), GFP_NOIO);
+ if (!lwork)
+ return NULL;
+
+ INIT_WORK(&lwork->work, workfn);
+ INIT_LIST_HEAD(&lwork->pending_item);
+ lwork->lreq = linger_get(lreq);
+
+ return lwork;
+}
+
+static void lwork_free(struct linger_work *lwork)
+{
+ struct ceph_osd_linger_request *lreq = lwork->lreq;
+
+ mutex_lock(&lreq->lock);
+ list_del(&lwork->pending_item);
+ mutex_unlock(&lreq->lock);
+
+ linger_put(lreq);
+ kfree(lwork);
+}
+
+static void lwork_queue(struct linger_work *lwork)
+{
+ struct ceph_osd_linger_request *lreq = lwork->lreq;
+ struct ceph_osd_client *osdc = lreq->osdc;
+
+ verify_lreq_locked(lreq);
+ WARN_ON(!list_empty(&lwork->pending_item));
+
+ lwork->queued_stamp = jiffies;
+ list_add_tail(&lwork->pending_item, &lreq->pending_lworks);
+ queue_work(osdc->notify_wq, &lwork->work);
+}
+
+static void do_watch_notify(struct work_struct *w)
+{
+ struct linger_work *lwork = container_of(w, struct linger_work, work);
+ struct ceph_osd_linger_request *lreq = lwork->lreq;
+
+ if (!linger_registered(lreq)) {
+ dout("%s lreq %p not registered\n", __func__, lreq);
+ goto out;
}
- if (req->r_target_oid.name_len == 0) {
- ceph_oid_copy(&req->r_target_oid, &req->r_base_oid);
- need_check_tiering = true;
+
+ WARN_ON(!lreq->is_watch);
+ dout("%s lreq %p notify_id %llu notifier_id %llu payload_len %zu\n",
+ __func__, lreq, lwork->notify.notify_id, lwork->notify.notifier_id,
+ lwork->notify.payload_len);
+ lreq->wcb(lreq->data, lwork->notify.notify_id, lreq->linger_id,
+ lwork->notify.notifier_id, lwork->notify.payload,
+ lwork->notify.payload_len);
+
+out:
+ ceph_msg_put(lwork->notify.msg);
+ lwork_free(lwork);
+}
+
+static void do_watch_error(struct work_struct *w)
+{
+ struct linger_work *lwork = container_of(w, struct linger_work, work);
+ struct ceph_osd_linger_request *lreq = lwork->lreq;
+
+ if (!linger_registered(lreq)) {
+ dout("%s lreq %p not registered\n", __func__, lreq);
+ goto out;
}
- if (need_check_tiering &&
- (req->r_flags & CEPH_OSD_FLAG_IGNORE_OVERLAY) == 0) {
- struct ceph_pg_pool_info *pi;
-
- pi = ceph_pg_pool_by_id(osdmap, req->r_target_oloc.pool);
- if (pi) {
- if ((req->r_flags & CEPH_OSD_FLAG_READ) &&
- pi->read_tier >= 0)
- req->r_target_oloc.pool = pi->read_tier;
- if ((req->r_flags & CEPH_OSD_FLAG_WRITE) &&
- pi->write_tier >= 0)
- req->r_target_oloc.pool = pi->write_tier;
+ dout("%s lreq %p err %d\n", __func__, lreq, lwork->error.err);
+ lreq->errcb(lreq->data, lreq->linger_id, lwork->error.err);
+
+out:
+ lwork_free(lwork);
+}
+
+static void queue_watch_error(struct ceph_osd_linger_request *lreq)
+{
+ struct linger_work *lwork;
+
+ lwork = lwork_alloc(lreq, do_watch_error);
+ if (!lwork) {
+ pr_err("failed to allocate error-lwork\n");
+ return;
+ }
+
+ lwork->error.err = lreq->last_error;
+ lwork_queue(lwork);
+}
+
+static void linger_reg_commit_complete(struct ceph_osd_linger_request *lreq,
+ int result)
+{
+ if (!completion_done(&lreq->reg_commit_wait)) {
+ lreq->reg_commit_error = (result <= 0 ? result : 0);
+ complete_all(&lreq->reg_commit_wait);
+ }
+}
+
+static void linger_commit_cb(struct ceph_osd_request *req)
+{
+ struct ceph_osd_linger_request *lreq = req->r_priv;
+
+ mutex_lock(&lreq->lock);
+ dout("%s lreq %p linger_id %llu result %d\n", __func__, lreq,
+ lreq->linger_id, req->r_result);
+ WARN_ON(!__linger_registered(lreq));
+ linger_reg_commit_complete(lreq, req->r_result);
+ lreq->committed = true;
+
+ if (!lreq->is_watch) {
+ struct ceph_osd_data *osd_data =
+ osd_req_op_data(req, 0, notify, response_data);
+ void *p = page_address(osd_data->pages[0]);
+
+ WARN_ON(req->r_ops[0].op != CEPH_OSD_OP_NOTIFY ||
+ osd_data->type != CEPH_OSD_DATA_TYPE_PAGES);
+
+ /* make note of the notify_id */
+ if (req->r_ops[0].outdata_len >= sizeof(u64)) {
+ lreq->notify_id = ceph_decode_64(&p);
+ dout("lreq %p notify_id %llu\n", lreq,
+ lreq->notify_id);
+ } else {
+ dout("lreq %p no notify_id\n", lreq);
}
- /* !pi is caught in ceph_oloc_oid_to_pg() */
}
- return ceph_oloc_oid_to_pg(osdmap, &req->r_target_oloc,
- &req->r_target_oid, pg_out);
+ mutex_unlock(&lreq->lock);
+ linger_put(lreq);
}
-static void __enqueue_request(struct ceph_osd_request *req)
+static int normalize_watch_error(int err)
{
- struct ceph_osd_client *osdc = req->r_osdc;
+ /*
+ * Translate ENOENT -> ENOTCONN so that a delete->disconnection
+ * notification and a failure to reconnect because we raced with
+ * the delete appear the same to the user.
+ */
+ if (err == -ENOENT)
+ err = -ENOTCONN;
+
+ return err;
+}
+
+static void linger_reconnect_cb(struct ceph_osd_request *req)
+{
+ struct ceph_osd_linger_request *lreq = req->r_priv;
+
+ mutex_lock(&lreq->lock);
+ dout("%s lreq %p linger_id %llu result %d last_error %d\n", __func__,
+ lreq, lreq->linger_id, req->r_result, lreq->last_error);
+ if (req->r_result < 0) {
+ if (!lreq->last_error) {
+ lreq->last_error = normalize_watch_error(req->r_result);
+ queue_watch_error(lreq);
+ }
+ }
+
+ mutex_unlock(&lreq->lock);
+ linger_put(lreq);
+}
+
+static void send_linger(struct ceph_osd_linger_request *lreq)
+{
+ struct ceph_osd_request *req = lreq->reg_req;
+ struct ceph_osd_req_op *op = &req->r_ops[0];
- dout("%s %p tid %llu to osd%d\n", __func__, req, req->r_tid,
- req->r_osd ? req->r_osd->o_osd : -1);
+ verify_osdc_wrlocked(req->r_osdc);
+ dout("%s lreq %p linger_id %llu\n", __func__, lreq, lreq->linger_id);
- if (req->r_osd) {
- __remove_osd_from_lru(req->r_osd);
- list_add_tail(&req->r_osd_item, &req->r_osd->o_requests);
- list_move_tail(&req->r_req_lru_item, &osdc->req_unsent);
+ if (req->r_osd)
+ cancel_linger_request(req);
+
+ request_reinit(req);
+ ceph_oid_copy(&req->r_base_oid, &lreq->t.base_oid);
+ ceph_oloc_copy(&req->r_base_oloc, &lreq->t.base_oloc);
+ req->r_flags = lreq->t.flags;
+ req->r_mtime = lreq->mtime;
+
+ mutex_lock(&lreq->lock);
+ if (lreq->is_watch && lreq->committed) {
+ WARN_ON(op->op != CEPH_OSD_OP_WATCH ||
+ op->watch.cookie != lreq->linger_id);
+ op->watch.op = CEPH_OSD_WATCH_OP_RECONNECT;
+ op->watch.gen = ++lreq->register_gen;
+ dout("lreq %p reconnect register_gen %u\n", lreq,
+ op->watch.gen);
+ req->r_callback = linger_reconnect_cb;
} else {
- list_move_tail(&req->r_req_lru_item, &osdc->req_notarget);
+ if (!lreq->is_watch)
+ lreq->notify_id = 0;
+ else
+ WARN_ON(op->watch.op != CEPH_OSD_WATCH_OP_WATCH);
+ dout("lreq %p register\n", lreq);
+ req->r_callback = linger_commit_cb;
}
+ mutex_unlock(&lreq->lock);
+
+ req->r_priv = linger_get(lreq);
+ req->r_linger = true;
+
+ submit_request(req, true);
}
-/*
- * Pick an osd (the first 'up' osd in the pg), allocate the osd struct
- * (as needed), and set the request r_osd appropriately. If there is
- * no up osd, set r_osd to NULL. Move the request to the appropriate list
- * (unsent, homeless) or leave on in-flight lru.
- *
- * Return 0 if unchanged, 1 if changed, or negative on error.
- *
- * Caller should hold map_sem for read and request_mutex.
- */
-static int __map_request(struct ceph_osd_client *osdc,
- struct ceph_osd_request *req, int force_resend)
+static void linger_ping_cb(struct ceph_osd_request *req)
{
- struct ceph_pg pgid;
- int acting[CEPH_PG_MAX_SIZE];
- int num, o;
- int err;
- bool was_paused;
-
- dout("map_request %p tid %lld\n", req, req->r_tid);
-
- err = __calc_request_pg(osdc->osdmap, req, &pgid);
- if (err) {
- list_move(&req->r_req_lru_item, &osdc->req_notarget);
- return err;
- }
- req->r_pgid = pgid;
-
- num = ceph_calc_pg_acting(osdc->osdmap, pgid, acting, &o);
- if (num < 0)
- num = 0;
-
- was_paused = req->r_paused;
- req->r_paused = __req_should_be_paused(osdc, req);
- if (was_paused && !req->r_paused)
- force_resend = 1;
-
- if ((!force_resend &&
- req->r_osd && req->r_osd->o_osd == o &&
- req->r_sent >= req->r_osd->o_incarnation &&
- req->r_num_pg_osds == num &&
- memcmp(req->r_pg_osds, acting, sizeof(acting[0])*num) == 0) ||
- (req->r_osd == NULL && o == -1) ||
- req->r_paused)
- return 0; /* no change */
-
- dout("map_request tid %llu pgid %lld.%x osd%d (was osd%d)\n",
- req->r_tid, pgid.pool, pgid.seed, o,
- req->r_osd ? req->r_osd->o_osd : -1);
-
- /* record full pg acting set */
- memcpy(req->r_pg_osds, acting, sizeof(acting[0]) * num);
- req->r_num_pg_osds = num;
-
- if (req->r_osd) {
- __cancel_request(req);
- list_del_init(&req->r_osd_item);
- list_del_init(&req->r_linger_osd_item);
- req->r_osd = NULL;
- }
-
- req->r_osd = __lookup_osd(osdc, o);
- if (!req->r_osd && o >= 0) {
- err = -ENOMEM;
- req->r_osd = create_osd(osdc, o);
- if (!req->r_osd) {
- list_move(&req->r_req_lru_item, &osdc->req_notarget);
- goto out;
+ struct ceph_osd_linger_request *lreq = req->r_priv;
+
+ mutex_lock(&lreq->lock);
+ dout("%s lreq %p linger_id %llu result %d ping_sent %lu last_error %d\n",
+ __func__, lreq, lreq->linger_id, req->r_result, lreq->ping_sent,
+ lreq->last_error);
+ if (lreq->register_gen == req->r_ops[0].watch.gen) {
+ if (!req->r_result) {
+ lreq->watch_valid_thru = lreq->ping_sent;
+ } else if (!lreq->last_error) {
+ lreq->last_error = normalize_watch_error(req->r_result);
+ queue_watch_error(lreq);
}
+ } else {
+ dout("lreq %p register_gen %u ignoring old pong %u\n", lreq,
+ lreq->register_gen, req->r_ops[0].watch.gen);
+ }
+
+ mutex_unlock(&lreq->lock);
+ linger_put(lreq);
+}
- dout("map_request osd %p is osd%d\n", req->r_osd, o);
- __insert_osd(osdc, req->r_osd);
+static void send_linger_ping(struct ceph_osd_linger_request *lreq)
+{
+ struct ceph_osd_client *osdc = lreq->osdc;
+ struct ceph_osd_request *req = lreq->ping_req;
+ struct ceph_osd_req_op *op = &req->r_ops[0];
- ceph_con_open(&req->r_osd->o_con,
- CEPH_ENTITY_TYPE_OSD, o,
- &osdc->osdmap->osd_addr[o]);
+ if (ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSERD)) {
+ dout("%s PAUSERD\n", __func__);
+ return;
}
- __enqueue_request(req);
- err = 1; /* osd or pg changed */
+ lreq->ping_sent = jiffies;
+ dout("%s lreq %p linger_id %llu ping_sent %lu register_gen %u\n",
+ __func__, lreq, lreq->linger_id, lreq->ping_sent,
+ lreq->register_gen);
-out:
- return err;
+ if (req->r_osd)
+ cancel_linger_request(req);
+
+ request_reinit(req);
+ target_copy(&req->r_t, &lreq->t);
+
+ WARN_ON(op->op != CEPH_OSD_OP_WATCH ||
+ op->watch.cookie != lreq->linger_id ||
+ op->watch.op != CEPH_OSD_WATCH_OP_PING);
+ op->watch.gen = lreq->register_gen;
+ req->r_callback = linger_ping_cb;
+ req->r_priv = linger_get(lreq);
+ req->r_linger = true;
+
+ ceph_osdc_get_request(req);
+ account_request(req);
+ req->r_tid = atomic64_inc_return(&osdc->last_tid);
+ link_request(lreq->osd, req);
+ send_request(req);
}
-/*
- * caller should hold map_sem (for read) and request_mutex
- */
-static void __send_request(struct ceph_osd_client *osdc,
- struct ceph_osd_request *req)
+static void linger_submit(struct ceph_osd_linger_request *lreq)
{
- void *p;
+ struct ceph_osd_client *osdc = lreq->osdc;
+ struct ceph_osd *osd;
- dout("send_request %p tid %llu to osd%d flags %d pg %lld.%x\n",
- req, req->r_tid, req->r_osd->o_osd, req->r_flags,
- (unsigned long long)req->r_pgid.pool, req->r_pgid.seed);
+ calc_target(osdc, &lreq->t, &lreq->last_force_resend, false);
+ osd = lookup_create_osd(osdc, lreq->t.osd, true);
+ link_linger(osd, lreq);
- /* fill in message content that changes each time we send it */
- put_unaligned_le32(osdc->osdmap->epoch, req->r_request_osdmap_epoch);
- put_unaligned_le32(req->r_flags, req->r_request_flags);
- put_unaligned_le64(req->r_target_oloc.pool, req->r_request_pool);
- p = req->r_request_pgid;
- ceph_encode_64(&p, req->r_pgid.pool);
- ceph_encode_32(&p, req->r_pgid.seed);
- put_unaligned_le64(1, req->r_request_attempts); /* FIXME */
- memcpy(req->r_request_reassert_version, &req->r_reassert_version,
- sizeof(req->r_reassert_version));
+ send_linger(lreq);
+}
- req->r_stamp = jiffies;
- list_move_tail(&req->r_req_lru_item, &osdc->req_lru);
+static void cancel_linger_map_check(struct ceph_osd_linger_request *lreq)
+{
+ struct ceph_osd_client *osdc = lreq->osdc;
+ struct ceph_osd_linger_request *lookup_lreq;
- ceph_msg_get(req->r_request); /* send consumes a ref */
+ verify_osdc_wrlocked(osdc);
- req->r_sent = req->r_osd->o_incarnation;
+ lookup_lreq = lookup_linger_mc(&osdc->linger_map_checks,
+ lreq->linger_id);
+ if (!lookup_lreq)
+ return;
- ceph_con_send(&req->r_osd->o_con, req->r_request);
+ WARN_ON(lookup_lreq != lreq);
+ erase_linger_mc(&osdc->linger_map_checks, lreq);
+ linger_put(lreq);
}
/*
- * Send any requests in the queue (req_unsent).
+ * @lreq has to be both registered and linked.
*/
-static void __send_queued(struct ceph_osd_client *osdc)
+static void __linger_cancel(struct ceph_osd_linger_request *lreq)
{
- struct ceph_osd_request *req, *tmp;
+ if (lreq->is_watch && lreq->ping_req->r_osd)
+ cancel_linger_request(lreq->ping_req);
+ if (lreq->reg_req->r_osd)
+ cancel_linger_request(lreq->reg_req);
+ cancel_linger_map_check(lreq);
+ unlink_linger(lreq->osd, lreq);
+ linger_unregister(lreq);
+}
+
+static void linger_cancel(struct ceph_osd_linger_request *lreq)
+{
+ struct ceph_osd_client *osdc = lreq->osdc;
- dout("__send_queued\n");
- list_for_each_entry_safe(req, tmp, &osdc->req_unsent, r_req_lru_item)
- __send_request(osdc, req);
+ down_write(&osdc->lock);
+ if (__linger_registered(lreq))
+ __linger_cancel(lreq);
+ up_write(&osdc->lock);
}
-/*
- * Caller should hold map_sem for read and request_mutex.
- */
-static int __ceph_osdc_start_request(struct ceph_osd_client *osdc,
- struct ceph_osd_request *req,
- bool nofail)
-{
- int rc;
-
- __register_request(osdc, req);
- req->r_sent = 0;
- req->r_got_reply = 0;
- rc = __map_request(osdc, req, 0);
- if (rc < 0) {
- if (nofail) {
- dout("osdc_start_request failed map, "
- " will retry %lld\n", req->r_tid);
- rc = 0;
- } else {
- __unregister_request(osdc, req);
- }
- return rc;
+static void send_linger_map_check(struct ceph_osd_linger_request *lreq);
+
+static void check_linger_pool_dne(struct ceph_osd_linger_request *lreq)
+{
+ struct ceph_osd_client *osdc = lreq->osdc;
+ struct ceph_osdmap *map = osdc->osdmap;
+
+ verify_osdc_wrlocked(osdc);
+ WARN_ON(!map->epoch);
+
+ if (lreq->register_gen) {
+ lreq->map_dne_bound = map->epoch;
+ dout("%s lreq %p linger_id %llu pool disappeared\n", __func__,
+ lreq, lreq->linger_id);
+ } else {
+ dout("%s lreq %p linger_id %llu map_dne_bound %u have %u\n",
+ __func__, lreq, lreq->linger_id, lreq->map_dne_bound,
+ map->epoch);
}
- if (req->r_osd == NULL) {
- dout("send_request %p no up osds in pg\n", req);
- ceph_monc_request_next_osdmap(&osdc->client->monc);
+ if (lreq->map_dne_bound) {
+ if (map->epoch >= lreq->map_dne_bound) {
+ /* we had a new enough map */
+ pr_info("linger_id %llu pool does not exist\n",
+ lreq->linger_id);
+ linger_reg_commit_complete(lreq, -ENOENT);
+ __linger_cancel(lreq);
+ }
} else {
- __send_queued(osdc);
+ send_linger_map_check(lreq);
}
+}
- return 0;
+static void linger_map_check_cb(struct ceph_mon_generic_request *greq)
+{
+ struct ceph_osd_client *osdc = &greq->monc->client->osdc;
+ struct ceph_osd_linger_request *lreq;
+ u64 linger_id = greq->private_data;
+
+ WARN_ON(greq->result || !greq->u.newest);
+
+ down_write(&osdc->lock);
+ lreq = lookup_linger_mc(&osdc->linger_map_checks, linger_id);
+ if (!lreq) {
+ dout("%s linger_id %llu dne\n", __func__, linger_id);
+ goto out_unlock;
+ }
+
+ dout("%s lreq %p linger_id %llu map_dne_bound %u newest %llu\n",
+ __func__, lreq, lreq->linger_id, lreq->map_dne_bound,
+ greq->u.newest);
+ if (!lreq->map_dne_bound)
+ lreq->map_dne_bound = greq->u.newest;
+ erase_linger_mc(&osdc->linger_map_checks, lreq);
+ check_linger_pool_dne(lreq);
+
+ linger_put(lreq);
+out_unlock:
+ up_write(&osdc->lock);
+}
+
+static void send_linger_map_check(struct ceph_osd_linger_request *lreq)
+{
+ struct ceph_osd_client *osdc = lreq->osdc;
+ struct ceph_osd_linger_request *lookup_lreq;
+ int ret;
+
+ verify_osdc_wrlocked(osdc);
+
+ lookup_lreq = lookup_linger_mc(&osdc->linger_map_checks,
+ lreq->linger_id);
+ if (lookup_lreq) {
+ WARN_ON(lookup_lreq != lreq);
+ return;
+ }
+
+ linger_get(lreq);
+ insert_linger_mc(&osdc->linger_map_checks, lreq);
+ ret = ceph_monc_get_version_async(&osdc->client->monc, "osdmap",
+ linger_map_check_cb, lreq->linger_id);
+ WARN_ON(ret);
+}
+
+static int linger_reg_commit_wait(struct ceph_osd_linger_request *lreq)
+{
+ int ret;
+
+ dout("%s lreq %p linger_id %llu\n", __func__, lreq, lreq->linger_id);
+ ret = wait_for_completion_interruptible(&lreq->reg_commit_wait);
+ return ret ?: lreq->reg_commit_error;
+}
+
+static int linger_notify_finish_wait(struct ceph_osd_linger_request *lreq)
+{
+ int ret;
+
+ dout("%s lreq %p linger_id %llu\n", __func__, lreq, lreq->linger_id);
+ ret = wait_for_completion_interruptible(&lreq->notify_finish_wait);
+ return ret ?: lreq->notify_finish_error;
}
/*
- * Timeout callback, called every N seconds when 1 or more osd
- * requests has been active for more than N seconds. When this
- * happens, we ping all OSDs with requests who have timed out to
- * ensure any communications channel reset is detected. Reset the
- * request timeouts another N seconds in the future as we go.
- * Reschedule the timeout event another N seconds in future (unless
- * there are no open requests).
+ * Timeout callback, called every N seconds. When 1 or more OSD
+ * requests has been active for more than N seconds, we send a keepalive
+ * (tag + timestamp) to its OSD to ensure any communications channel
+ * reset is detected.
*/
static void handle_timeout(struct work_struct *work)
{
struct ceph_osd_client *osdc =
container_of(work, struct ceph_osd_client, timeout_work.work);
struct ceph_options *opts = osdc->client->options;
- struct ceph_osd_request *req;
- struct ceph_osd *osd;
- struct list_head slow_osds;
- dout("timeout\n");
- down_read(&osdc->map_sem);
-
- ceph_monc_request_next_osdmap(&osdc->client->monc);
+ unsigned long cutoff = jiffies - opts->osd_keepalive_timeout;
+ LIST_HEAD(slow_osds);
+ struct rb_node *n, *p;
- mutex_lock(&osdc->request_mutex);
+ dout("%s osdc %p\n", __func__, osdc);
+ down_write(&osdc->lock);
/*
* ping osds that are a bit slow. this ensures that if there
* is a break in the TCP connection we will notice, and reopen
* a connection with that osd (from the fault callback).
*/
- INIT_LIST_HEAD(&slow_osds);
- list_for_each_entry(req, &osdc->req_lru, r_req_lru_item) {
- if (time_before(jiffies,
- req->r_stamp + opts->osd_keepalive_timeout))
- break;
+ for (n = rb_first(&osdc->osds); n; n = rb_next(n)) {
+ struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);
+ bool found = false;
+
+ for (p = rb_first(&osd->o_requests); p; p = rb_next(p)) {
+ struct ceph_osd_request *req =
+ rb_entry(p, struct ceph_osd_request, r_node);
+
+ if (time_before(req->r_stamp, cutoff)) {
+ dout(" req %p tid %llu on osd%d is laggy\n",
+ req, req->r_tid, osd->o_osd);
+ found = true;
+ }
+ }
+ for (p = rb_first(&osd->o_linger_requests); p; p = rb_next(p)) {
+ struct ceph_osd_linger_request *lreq =
+ rb_entry(p, struct ceph_osd_linger_request, node);
+
+ dout(" lreq %p linger_id %llu is served by osd%d\n",
+ lreq, lreq->linger_id, osd->o_osd);
+ found = true;
+
+ mutex_lock(&lreq->lock);
+ if (lreq->is_watch && lreq->committed && !lreq->last_error)
+ send_linger_ping(lreq);
+ mutex_unlock(&lreq->lock);
+ }
- osd = req->r_osd;
- BUG_ON(!osd);
- dout(" tid %llu is slow, will send keepalive on osd%d\n",
- req->r_tid, osd->o_osd);
- list_move_tail(&osd->o_keepalive_item, &slow_osds);
+ if (found)
+ list_move_tail(&osd->o_keepalive_item, &slow_osds);
}
+
+ if (atomic_read(&osdc->num_homeless) || !list_empty(&slow_osds))
+ maybe_request_map(osdc);
+
while (!list_empty(&slow_osds)) {
- osd = list_entry(slow_osds.next, struct ceph_osd,
- o_keepalive_item);
+ struct ceph_osd *osd = list_first_entry(&slow_osds,
+ struct ceph_osd,
+ o_keepalive_item);
list_del_init(&osd->o_keepalive_item);
ceph_con_keepalive(&osd->o_con);
}
- __schedule_osd_timeout(osdc);
- __send_queued(osdc);
- mutex_unlock(&osdc->request_mutex);
- up_read(&osdc->map_sem);
+ up_write(&osdc->lock);
+ schedule_delayed_work(&osdc->timeout_work,
+ osdc->client->options->osd_keepalive_timeout);
}
static void handle_osds_timeout(struct work_struct *work)
@@ -1663,12 +2541,20 @@ static void handle_osds_timeout(struct work_struct *work)
container_of(work, struct ceph_osd_client,
osds_timeout_work.work);
unsigned long delay = osdc->client->options->osd_idle_ttl / 4;
+ struct ceph_osd *osd, *nosd;
- dout("osds timeout\n");
- down_read(&osdc->map_sem);
- remove_old_osds(osdc);
- up_read(&osdc->map_sem);
+ dout("%s osdc %p\n", __func__, osdc);
+ down_write(&osdc->lock);
+ list_for_each_entry_safe(osd, nosd, &osdc->osd_lru, o_osd_lru) {
+ if (time_before(jiffies, osd->lru_ttl))
+ break;
+ WARN_ON(!RB_EMPTY_ROOT(&osd->o_requests));
+ WARN_ON(!RB_EMPTY_ROOT(&osd->o_linger_requests));
+ close_osd(osd);
+ }
+
+ up_write(&osdc->lock);
schedule_delayed_work(&osdc->osds_timeout_work,
round_jiffies_relative(delay));
}
@@ -1776,107 +2662,76 @@ e_inval:
goto out;
}
-static void complete_request(struct ceph_osd_request *req)
-{
- complete_all(&req->r_safe_completion); /* fsync waiter */
-}
+struct MOSDOpReply {
+ struct ceph_pg pgid;
+ u64 flags;
+ int result;
+ u32 epoch;
+ int num_ops;
+ u32 outdata_len[CEPH_OSD_MAX_OPS];
+ s32 rval[CEPH_OSD_MAX_OPS];
+ int retry_attempt;
+ struct ceph_eversion replay_version;
+ u64 user_version;
+ struct ceph_request_redirect redirect;
+};
-/*
- * handle osd op reply. either call the callback if it is specified,
- * or do the completion to wake up the waiting thread.
- */
-static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg)
+static int decode_MOSDOpReply(const struct ceph_msg *msg, struct MOSDOpReply *m)
{
- void *p, *end;
- struct ceph_osd_request *req;
- struct ceph_request_redirect redir;
- u64 tid;
- int object_len;
- unsigned int numops;
- int payload_len, flags;
- s32 result;
- s32 retry_attempt;
- struct ceph_pg pg;
- int err;
- u32 reassert_epoch;
- u64 reassert_version;
- u32 osdmap_epoch;
- int already_completed;
- u32 bytes;
+ void *p = msg->front.iov_base;
+ void *const end = p + msg->front.iov_len;
+ u16 version = le16_to_cpu(msg->hdr.version);
+ struct ceph_eversion bad_replay_version;
u8 decode_redir;
- unsigned int i;
-
- tid = le64_to_cpu(msg->hdr.tid);
- dout("handle_reply %p tid %llu\n", msg, tid);
+ u32 len;
+ int ret;
+ int i;
- p = msg->front.iov_base;
- end = p + msg->front.iov_len;
+ ceph_decode_32_safe(&p, end, len, e_inval);
+ ceph_decode_need(&p, end, len, e_inval);
+ p += len; /* skip oid */
- ceph_decode_need(&p, end, 4, bad);
- object_len = ceph_decode_32(&p);
- ceph_decode_need(&p, end, object_len, bad);
- p += object_len;
+ ret = ceph_decode_pgid(&p, end, &m->pgid);
+ if (ret)
+ return ret;
- err = ceph_decode_pgid(&p, end, &pg);
- if (err)
- goto bad;
+ ceph_decode_64_safe(&p, end, m->flags, e_inval);
+ ceph_decode_32_safe(&p, end, m->result, e_inval);
+ ceph_decode_need(&p, end, sizeof(bad_replay_version), e_inval);
+ memcpy(&bad_replay_version, p, sizeof(bad_replay_version));
+ p += sizeof(bad_replay_version);
+ ceph_decode_32_safe(&p, end, m->epoch, e_inval);
- ceph_decode_need(&p, end, 8 + 4 + 4 + 8 + 4, bad);
- flags = ceph_decode_64(&p);
- result = ceph_decode_32(&p);
- reassert_epoch = ceph_decode_32(&p);
- reassert_version = ceph_decode_64(&p);
- osdmap_epoch = ceph_decode_32(&p);
-
- /* lookup */
- down_read(&osdc->map_sem);
- mutex_lock(&osdc->request_mutex);
- req = __lookup_request(osdc, tid);
- if (req == NULL) {
- dout("handle_reply tid %llu dne\n", tid);
- goto bad_mutex;
- }
- ceph_osdc_get_request(req);
+ ceph_decode_32_safe(&p, end, m->num_ops, e_inval);
+ if (m->num_ops > ARRAY_SIZE(m->outdata_len))
+ goto e_inval;
- dout("handle_reply %p tid %llu req %p result %d\n", msg, tid,
- req, result);
-
- ceph_decode_need(&p, end, 4, bad_put);
- numops = ceph_decode_32(&p);
- if (numops > CEPH_OSD_MAX_OPS)
- goto bad_put;
- if (numops != req->r_num_ops)
- goto bad_put;
- payload_len = 0;
- ceph_decode_need(&p, end, numops * sizeof(struct ceph_osd_op), bad_put);
- for (i = 0; i < numops; i++) {
+ ceph_decode_need(&p, end, m->num_ops * sizeof(struct ceph_osd_op),
+ e_inval);
+ for (i = 0; i < m->num_ops; i++) {
struct ceph_osd_op *op = p;
- int len;
- len = le32_to_cpu(op->payload_len);
- req->r_ops[i].outdata_len = len;
- dout(" op %d has %d bytes\n", i, len);
- payload_len += len;
+ m->outdata_len[i] = le32_to_cpu(op->payload_len);
p += sizeof(*op);
}
- bytes = le32_to_cpu(msg->hdr.data_len);
- if (payload_len != bytes) {
- pr_warn("sum of op payload lens %d != data_len %d\n",
- payload_len, bytes);
- goto bad_put;
- }
- ceph_decode_need(&p, end, 4 + numops * 4, bad_put);
- retry_attempt = ceph_decode_32(&p);
- for (i = 0; i < numops; i++)
- req->r_ops[i].rval = ceph_decode_32(&p);
+ ceph_decode_32_safe(&p, end, m->retry_attempt, e_inval);
+ for (i = 0; i < m->num_ops; i++)
+ ceph_decode_32_safe(&p, end, m->rval[i], e_inval);
- if (le16_to_cpu(msg->hdr.version) >= 6) {
- p += 8 + 4; /* skip replay_version */
- p += 8; /* skip user_version */
+ if (version >= 5) {
+ ceph_decode_need(&p, end, sizeof(m->replay_version), e_inval);
+ memcpy(&m->replay_version, p, sizeof(m->replay_version));
+ p += sizeof(m->replay_version);
+ ceph_decode_64_safe(&p, end, m->user_version, e_inval);
+ } else {
+ m->replay_version = bad_replay_version; /* struct */
+ m->user_version = le64_to_cpu(m->replay_version.version);
+ }
- if (le16_to_cpu(msg->hdr.version) >= 7)
- ceph_decode_8_safe(&p, end, decode_redir, bad_put);
+ if (version >= 6) {
+ if (version >= 7)
+ ceph_decode_8_safe(&p, end, decode_redir, e_inval);
else
decode_redir = 1;
} else {
@@ -1884,228 +2739,410 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg)
}
if (decode_redir) {
- err = ceph_redirect_decode(&p, end, &redir);
- if (err)
- goto bad_put;
+ ret = ceph_redirect_decode(&p, end, &m->redirect);
+ if (ret)
+ return ret;
} else {
- redir.oloc.pool = -1;
+ ceph_oloc_init(&m->redirect.oloc);
}
- if (redir.oloc.pool != -1) {
- dout("redirect pool %lld\n", redir.oloc.pool);
+ return 0;
- __unregister_request(osdc, req);
+e_inval:
+ return -EINVAL;
+}
- req->r_target_oloc = redir.oloc; /* struct */
+/*
+ * We are done with @req if
+ * - @m is a safe reply, or
+ * - @m is an unsafe reply and we didn't want a safe one
+ */
+static bool done_request(const struct ceph_osd_request *req,
+ const struct MOSDOpReply *m)
+{
+ return (m->result < 0 ||
+ (m->flags & CEPH_OSD_FLAG_ONDISK) ||
+ !(req->r_flags & CEPH_OSD_FLAG_ONDISK));
+}
- /*
- * Start redirect requests with nofail=true. If
- * mapping fails, request will end up on the notarget
- * list, waiting for the new osdmap (which can take
- * a while), even though the original request mapped
- * successfully. In the future we might want to follow
- * original request's nofail setting here.
- */
- err = __ceph_osdc_start_request(osdc, req, true);
- BUG_ON(err);
+/*
+ * handle osd op reply. either call the callback if it is specified,
+ * or do the completion to wake up the waiting thread.
+ *
+ * ->r_unsafe_callback is set? yes no
+ *
+ * first reply is OK (needed r_cb/r_completion, r_cb/r_completion,
+ * any or needed/got safe) r_safe_completion r_safe_completion
+ *
+ * first reply is unsafe r_unsafe_cb(true) (nothing)
+ *
+ * when we get the safe reply r_unsafe_cb(false), r_cb/r_completion,
+ * r_safe_completion r_safe_completion
+ */
+static void handle_reply(struct ceph_osd *osd, struct ceph_msg *msg)
+{
+ struct ceph_osd_client *osdc = osd->o_osdc;
+ struct ceph_osd_request *req;
+ struct MOSDOpReply m;
+ u64 tid = le64_to_cpu(msg->hdr.tid);
+ u32 data_len = 0;
+ bool already_acked;
+ int ret;
+ int i;
- goto out_unlock;
- }
+ dout("%s msg %p tid %llu\n", __func__, msg, tid);
- already_completed = req->r_got_reply;
- if (!req->r_got_reply) {
- req->r_result = result;
- dout("handle_reply result %d bytes %d\n", req->r_result,
- bytes);
- if (req->r_result == 0)
- req->r_result = bytes;
+ down_read(&osdc->lock);
+ if (!osd_registered(osd)) {
+ dout("%s osd%d unknown\n", __func__, osd->o_osd);
+ goto out_unlock_osdc;
+ }
+ WARN_ON(osd->o_osd != le64_to_cpu(msg->hdr.src.num));
- /* in case this is a write and we need to replay, */
- req->r_reassert_version.epoch = cpu_to_le32(reassert_epoch);
- req->r_reassert_version.version = cpu_to_le64(reassert_version);
+ mutex_lock(&osd->lock);
+ req = lookup_request(&osd->o_requests, tid);
+ if (!req) {
+ dout("%s osd%d tid %llu unknown\n", __func__, osd->o_osd, tid);
+ goto out_unlock_session;
+ }
- req->r_got_reply = 1;
- } else if ((flags & CEPH_OSD_FLAG_ONDISK) == 0) {
- dout("handle_reply tid %llu dup ack\n", tid);
- goto out_unlock;
+ ret = decode_MOSDOpReply(msg, &m);
+ if (ret) {
+ pr_err("failed to decode MOSDOpReply for tid %llu: %d\n",
+ req->r_tid, ret);
+ ceph_msg_dump(msg);
+ goto fail_request;
+ }
+ dout("%s req %p tid %llu flags 0x%llx pgid %llu.%x epoch %u attempt %d v %u'%llu uv %llu\n",
+ __func__, req, req->r_tid, m.flags, m.pgid.pool, m.pgid.seed,
+ m.epoch, m.retry_attempt, le32_to_cpu(m.replay_version.epoch),
+ le64_to_cpu(m.replay_version.version), m.user_version);
+
+ if (m.retry_attempt >= 0) {
+ if (m.retry_attempt != req->r_attempts - 1) {
+ dout("req %p tid %llu retry_attempt %d != %d, ignoring\n",
+ req, req->r_tid, m.retry_attempt,
+ req->r_attempts - 1);
+ goto out_unlock_session;
+ }
+ } else {
+ WARN_ON(1); /* MOSDOpReply v4 is assumed */
}
- dout("handle_reply tid %llu flags %d\n", tid, flags);
+ if (!ceph_oloc_empty(&m.redirect.oloc)) {
+ dout("req %p tid %llu redirect pool %lld\n", req, req->r_tid,
+ m.redirect.oloc.pool);
+ unlink_request(osd, req);
+ mutex_unlock(&osd->lock);
+
+ ceph_oloc_copy(&req->r_t.target_oloc, &m.redirect.oloc);
+ req->r_flags |= CEPH_OSD_FLAG_REDIRECTED;
+ req->r_tid = 0;
+ __submit_request(req, false);
+ goto out_unlock_osdc;
+ }
- if (req->r_linger && (flags & CEPH_OSD_FLAG_ONDISK))
- __register_linger_request(osdc, req);
+ if (m.num_ops != req->r_num_ops) {
+ pr_err("num_ops %d != %d for tid %llu\n", m.num_ops,
+ req->r_num_ops, req->r_tid);
+ goto fail_request;
+ }
+ for (i = 0; i < req->r_num_ops; i++) {
+ dout(" req %p tid %llu op %d rval %d len %u\n", req,
+ req->r_tid, i, m.rval[i], m.outdata_len[i]);
+ req->r_ops[i].rval = m.rval[i];
+ req->r_ops[i].outdata_len = m.outdata_len[i];
+ data_len += m.outdata_len[i];
+ }
+ if (data_len != le32_to_cpu(msg->hdr.data_len)) {
+ pr_err("sum of lens %u != %u for tid %llu\n", data_len,
+ le32_to_cpu(msg->hdr.data_len), req->r_tid);
+ goto fail_request;
+ }
+ dout("%s req %p tid %llu acked %d result %d data_len %u\n", __func__,
+ req, req->r_tid, req->r_got_reply, m.result, data_len);
+
+ already_acked = req->r_got_reply;
+ if (!already_acked) {
+ req->r_result = m.result ?: data_len;
+ req->r_replay_version = m.replay_version; /* struct */
+ req->r_got_reply = true;
+ } else if (!(m.flags & CEPH_OSD_FLAG_ONDISK)) {
+ dout("req %p tid %llu dup ack\n", req, req->r_tid);
+ goto out_unlock_session;
+ }
- /* either this is a read, or we got the safe response */
- if (result < 0 ||
- (flags & CEPH_OSD_FLAG_ONDISK) ||
- ((flags & CEPH_OSD_FLAG_WRITE) == 0))
- __unregister_request(osdc, req);
+ if (done_request(req, &m)) {
+ __finish_request(req);
+ if (req->r_linger) {
+ WARN_ON(req->r_unsafe_callback);
+ dout("req %p tid %llu cb (locked)\n", req, req->r_tid);
+ __complete_request(req);
+ }
+ }
- mutex_unlock(&osdc->request_mutex);
- up_read(&osdc->map_sem);
+ mutex_unlock(&osd->lock);
+ up_read(&osdc->lock);
- if (!already_completed) {
- if (req->r_unsafe_callback &&
- result >= 0 && !(flags & CEPH_OSD_FLAG_ONDISK))
+ if (done_request(req, &m)) {
+ if (already_acked && req->r_unsafe_callback) {
+ dout("req %p tid %llu safe-cb\n", req, req->r_tid);
+ req->r_unsafe_callback(req, false);
+ } else if (!req->r_linger) {
+ dout("req %p tid %llu cb\n", req, req->r_tid);
+ __complete_request(req);
+ }
+ if (m.flags & CEPH_OSD_FLAG_ONDISK)
+ complete_all(&req->r_safe_completion);
+ ceph_osdc_put_request(req);
+ } else {
+ if (req->r_unsafe_callback) {
+ dout("req %p tid %llu unsafe-cb\n", req, req->r_tid);
req->r_unsafe_callback(req, true);
- if (req->r_callback)
- req->r_callback(req, msg);
- else
- complete_all(&req->r_completion);
+ } else {
+ WARN_ON(1);
+ }
}
- if (flags & CEPH_OSD_FLAG_ONDISK) {
- if (req->r_unsafe_callback && already_completed)
- req->r_unsafe_callback(req, false);
- complete_request(req);
+ return;
+
+fail_request:
+ complete_request(req, -EIO);
+out_unlock_session:
+ mutex_unlock(&osd->lock);
+out_unlock_osdc:
+ up_read(&osdc->lock);
+}
+
+static void set_pool_was_full(struct ceph_osd_client *osdc)
+{
+ struct rb_node *n;
+
+ for (n = rb_first(&osdc->osdmap->pg_pools); n; n = rb_next(n)) {
+ struct ceph_pg_pool_info *pi =
+ rb_entry(n, struct ceph_pg_pool_info, node);
+
+ pi->was_full = __pool_full(pi);
}
+}
-out:
- dout("req=%p req->r_linger=%d\n", req, req->r_linger);
- ceph_osdc_put_request(req);
- return;
-out_unlock:
- mutex_unlock(&osdc->request_mutex);
- up_read(&osdc->map_sem);
- goto out;
+static bool pool_cleared_full(struct ceph_osd_client *osdc, s64 pool_id)
+{
+ struct ceph_pg_pool_info *pi;
-bad_put:
- req->r_result = -EIO;
- __unregister_request(osdc, req);
- if (req->r_callback)
- req->r_callback(req, msg);
- else
- complete_all(&req->r_completion);
- complete_request(req);
- ceph_osdc_put_request(req);
-bad_mutex:
- mutex_unlock(&osdc->request_mutex);
- up_read(&osdc->map_sem);
-bad:
- pr_err("corrupt osd_op_reply got %d %d\n",
- (int)msg->front.iov_len, le32_to_cpu(msg->hdr.front_len));
- ceph_msg_dump(msg);
+ pi = ceph_pg_pool_by_id(osdc->osdmap, pool_id);
+ if (!pi)
+ return false;
+
+ return pi->was_full && !__pool_full(pi);
}
-static void reset_changed_osds(struct ceph_osd_client *osdc)
+static enum calc_target_result
+recalc_linger_target(struct ceph_osd_linger_request *lreq)
{
- struct rb_node *p, *n;
+ struct ceph_osd_client *osdc = lreq->osdc;
+ enum calc_target_result ct_res;
- dout("%s %p\n", __func__, osdc);
- for (p = rb_first(&osdc->osds); p; p = n) {
- struct ceph_osd *osd = rb_entry(p, struct ceph_osd, o_node);
+ ct_res = calc_target(osdc, &lreq->t, &lreq->last_force_resend, true);
+ if (ct_res == CALC_TARGET_NEED_RESEND) {
+ struct ceph_osd *osd;
- n = rb_next(p);
- if (!ceph_osd_is_up(osdc->osdmap, osd->o_osd) ||
- memcmp(&osd->o_con.peer_addr,
- ceph_osd_addr(osdc->osdmap,
- osd->o_osd),
- sizeof(struct ceph_entity_addr)) != 0)
- __reset_osd(osdc, osd);
+ osd = lookup_create_osd(osdc, lreq->t.osd, true);
+ if (osd != lreq->osd) {
+ unlink_linger(lreq->osd, lreq);
+ link_linger(osd, lreq);
+ }
}
+
+ return ct_res;
}
/*
- * Requeue requests whose mapping to an OSD has changed. If requests map to
- * no osd, request a new map.
- *
- * Caller should hold map_sem for read.
+ * Requeue requests whose mapping to an OSD has changed.
*/
-static void kick_requests(struct ceph_osd_client *osdc, bool force_resend,
- bool force_resend_writes)
+static void scan_requests(struct ceph_osd *osd,
+ bool force_resend,
+ bool cleared_full,
+ bool check_pool_cleared_full,
+ struct rb_root *need_resend,
+ struct list_head *need_resend_linger)
{
- struct ceph_osd_request *req, *nreq;
- struct rb_node *p;
- int needmap = 0;
- int err;
- bool force_resend_req;
+ struct ceph_osd_client *osdc = osd->o_osdc;
+ struct rb_node *n;
+ bool force_resend_writes;
+
+ for (n = rb_first(&osd->o_linger_requests); n; ) {
+ struct ceph_osd_linger_request *lreq =
+ rb_entry(n, struct ceph_osd_linger_request, node);
+ enum calc_target_result ct_res;
+
+ n = rb_next(n); /* recalc_linger_target() */
+
+ dout("%s lreq %p linger_id %llu\n", __func__, lreq,
+ lreq->linger_id);
+ ct_res = recalc_linger_target(lreq);
+ switch (ct_res) {
+ case CALC_TARGET_NO_ACTION:
+ force_resend_writes = cleared_full ||
+ (check_pool_cleared_full &&
+ pool_cleared_full(osdc, lreq->t.base_oloc.pool));
+ if (!force_resend && !force_resend_writes)
+ break;
+
+ /* fall through */
+ case CALC_TARGET_NEED_RESEND:
+ cancel_linger_map_check(lreq);
+ /*
+ * scan_requests() for the previous epoch(s)
+ * may have already added it to the list, since
+ * it's not unlinked here.
+ */
+ if (list_empty(&lreq->scan_item))
+ list_add_tail(&lreq->scan_item, need_resend_linger);
+ break;
+ case CALC_TARGET_POOL_DNE:
+ check_linger_pool_dne(lreq);
+ break;
+ }
+ }
- dout("kick_requests %s %s\n", force_resend ? " (force resend)" : "",
- force_resend_writes ? " (force resend writes)" : "");
- mutex_lock(&osdc->request_mutex);
- for (p = rb_first(&osdc->requests); p; ) {
- req = rb_entry(p, struct ceph_osd_request, r_node);
- p = rb_next(p);
+ for (n = rb_first(&osd->o_requests); n; ) {
+ struct ceph_osd_request *req =
+ rb_entry(n, struct ceph_osd_request, r_node);
+ enum calc_target_result ct_res;
+
+ n = rb_next(n); /* unlink_request(), check_pool_dne() */
+
+ dout("%s req %p tid %llu\n", __func__, req, req->r_tid);
+ ct_res = calc_target(osdc, &req->r_t,
+ &req->r_last_force_resend, false);
+ switch (ct_res) {
+ case CALC_TARGET_NO_ACTION:
+ force_resend_writes = cleared_full ||
+ (check_pool_cleared_full &&
+ pool_cleared_full(osdc, req->r_t.base_oloc.pool));
+ if (!force_resend &&
+ (!(req->r_flags & CEPH_OSD_FLAG_WRITE) ||
+ !force_resend_writes))
+ break;
+
+ /* fall through */
+ case CALC_TARGET_NEED_RESEND:
+ cancel_map_check(req);
+ unlink_request(osd, req);
+ insert_request(need_resend, req);
+ break;
+ case CALC_TARGET_POOL_DNE:
+ check_pool_dne(req);
+ break;
+ }
+ }
+}
+static int handle_one_map(struct ceph_osd_client *osdc,
+ void *p, void *end, bool incremental,
+ struct rb_root *need_resend,
+ struct list_head *need_resend_linger)
+{
+ struct ceph_osdmap *newmap;
+ struct rb_node *n;
+ bool skipped_map = false;
+ bool was_full;
+
+ was_full = ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL);
+ set_pool_was_full(osdc);
+
+ if (incremental)
+ newmap = osdmap_apply_incremental(&p, end, osdc->osdmap);
+ else
+ newmap = ceph_osdmap_decode(&p, end);
+ if (IS_ERR(newmap))
+ return PTR_ERR(newmap);
+
+ if (newmap != osdc->osdmap) {
/*
- * For linger requests that have not yet been
- * registered, move them to the linger list; they'll
- * be sent to the osd in the loop below. Unregister
- * the request before re-registering it as a linger
- * request to ensure the __map_request() below
- * will decide it needs to be sent.
+ * Preserve ->was_full before destroying the old map.
+ * For pools that weren't in the old map, ->was_full
+ * should be false.
*/
- if (req->r_linger && list_empty(&req->r_linger_item)) {
- dout("%p tid %llu restart on osd%d\n",
- req, req->r_tid,
- req->r_osd ? req->r_osd->o_osd : -1);
- ceph_osdc_get_request(req);
- __unregister_request(osdc, req);
- __register_linger_request(osdc, req);
- ceph_osdc_put_request(req);
- continue;
+ for (n = rb_first(&newmap->pg_pools); n; n = rb_next(n)) {
+ struct ceph_pg_pool_info *pi =
+ rb_entry(n, struct ceph_pg_pool_info, node);
+ struct ceph_pg_pool_info *old_pi;
+
+ old_pi = ceph_pg_pool_by_id(osdc->osdmap, pi->id);
+ if (old_pi)
+ pi->was_full = old_pi->was_full;
+ else
+ WARN_ON(pi->was_full);
}
- force_resend_req = force_resend ||
- (force_resend_writes &&
- req->r_flags & CEPH_OSD_FLAG_WRITE);
- err = __map_request(osdc, req, force_resend_req);
- if (err < 0)
- continue; /* error */
- if (req->r_osd == NULL) {
- dout("%p tid %llu maps to no osd\n", req, req->r_tid);
- needmap++; /* request a newer map */
- } else if (err > 0) {
- if (!req->r_linger) {
- dout("%p tid %llu requeued on osd%d\n", req,
- req->r_tid,
- req->r_osd ? req->r_osd->o_osd : -1);
- req->r_flags |= CEPH_OSD_FLAG_RETRY;
- }
+ if (osdc->osdmap->epoch &&
+ osdc->osdmap->epoch + 1 < newmap->epoch) {
+ WARN_ON(incremental);
+ skipped_map = true;
}
+
+ ceph_osdmap_destroy(osdc->osdmap);
+ osdc->osdmap = newmap;
}
- list_for_each_entry_safe(req, nreq, &osdc->req_linger,
- r_linger_item) {
- dout("linger req=%p req->r_osd=%p\n", req, req->r_osd);
-
- err = __map_request(osdc, req,
- force_resend || force_resend_writes);
- dout("__map_request returned %d\n", err);
- if (err < 0)
- continue; /* hrm! */
- if (req->r_osd == NULL || err > 0) {
- if (req->r_osd == NULL) {
- dout("lingering %p tid %llu maps to no osd\n",
- req, req->r_tid);
- /*
- * A homeless lingering request makes
- * no sense, as it's job is to keep
- * a particular OSD connection open.
- * Request a newer map and kick the
- * request, knowing that it won't be
- * resent until we actually get a map
- * that can tell us where to send it.
- */
- needmap++;
- }
+ was_full &= !ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL);
+ scan_requests(&osdc->homeless_osd, skipped_map, was_full, true,
+ need_resend, need_resend_linger);
+
+ for (n = rb_first(&osdc->osds); n; ) {
+ struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);
+
+ n = rb_next(n); /* close_osd() */
+
+ scan_requests(osd, skipped_map, was_full, true, need_resend,
+ need_resend_linger);
+ if (!ceph_osd_is_up(osdc->osdmap, osd->o_osd) ||
+ memcmp(&osd->o_con.peer_addr,
+ ceph_osd_addr(osdc->osdmap, osd->o_osd),
+ sizeof(struct ceph_entity_addr)))
+ close_osd(osd);
+ }
- dout("kicking lingering %p tid %llu osd%d\n", req,
- req->r_tid, req->r_osd ? req->r_osd->o_osd : -1);
- __register_request(osdc, req);
- __unregister_linger_request(osdc, req);
+ return 0;
+}
+
+static void kick_requests(struct ceph_osd_client *osdc,
+ struct rb_root *need_resend,
+ struct list_head *need_resend_linger)
+{
+ struct ceph_osd_linger_request *lreq, *nlreq;
+ struct rb_node *n;
+
+ for (n = rb_first(need_resend); n; ) {
+ struct ceph_osd_request *req =
+ rb_entry(n, struct ceph_osd_request, r_node);
+ struct ceph_osd *osd;
+
+ n = rb_next(n);
+ erase_request(need_resend, req); /* before link_request() */
+
+ WARN_ON(req->r_osd);
+ calc_target(osdc, &req->r_t, NULL, false);
+ osd = lookup_create_osd(osdc, req->r_t.osd, true);
+ link_request(osd, req);
+ if (!req->r_linger) {
+ if (!osd_homeless(osd) && !req->r_t.paused)
+ send_request(req);
+ } else {
+ cancel_linger_request(req);
}
}
- reset_changed_osds(osdc);
- mutex_unlock(&osdc->request_mutex);
- if (needmap) {
- dout("%d requests for down osds, need new map\n", needmap);
- ceph_monc_request_next_osdmap(&osdc->client->monc);
+ list_for_each_entry_safe(lreq, nlreq, need_resend_linger, scan_item) {
+ if (!osd_homeless(lreq->osd))
+ send_linger(lreq);
+
+ list_del_init(&lreq->scan_item);
}
}
-
/*
* Process updated osd map.
*
@@ -2115,27 +3152,31 @@ static void kick_requests(struct ceph_osd_client *osdc, bool force_resend,
*/
void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
{
- void *p, *end, *next;
+ void *p = msg->front.iov_base;
+ void *const end = p + msg->front.iov_len;
u32 nr_maps, maplen;
u32 epoch;
- struct ceph_osdmap *newmap = NULL, *oldmap;
- int err;
struct ceph_fsid fsid;
- bool was_full;
+ struct rb_root need_resend = RB_ROOT;
+ LIST_HEAD(need_resend_linger);
+ bool handled_incremental = false;
+ bool was_pauserd, was_pausewr;
+ bool pauserd, pausewr;
+ int err;
- dout("handle_map have %u\n", osdc->osdmap ? osdc->osdmap->epoch : 0);
- p = msg->front.iov_base;
- end = p + msg->front.iov_len;
+ dout("%s have %u\n", __func__, osdc->osdmap->epoch);
+ down_write(&osdc->lock);
/* verify fsid */
ceph_decode_need(&p, end, sizeof(fsid), bad);
ceph_decode_copy(&p, &fsid, sizeof(fsid));
if (ceph_check_fsid(osdc->client, &fsid) < 0)
- return;
-
- down_write(&osdc->map_sem);
+ goto bad;
- was_full = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL);
+ was_pauserd = ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSERD);
+ was_pausewr = ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR) ||
+ ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) ||
+ have_pool_full(osdc);
/* incremental maps */
ceph_decode_32_safe(&p, end, nr_maps, bad);
@@ -2145,34 +3186,23 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
epoch = ceph_decode_32(&p);
maplen = ceph_decode_32(&p);
ceph_decode_need(&p, end, maplen, bad);
- next = p + maplen;
- if (osdc->osdmap && osdc->osdmap->epoch+1 == epoch) {
+ if (osdc->osdmap->epoch &&
+ osdc->osdmap->epoch + 1 == epoch) {
dout("applying incremental map %u len %d\n",
epoch, maplen);
- newmap = osdmap_apply_incremental(&p, next,
- osdc->osdmap,
- &osdc->client->msgr);
- if (IS_ERR(newmap)) {
- err = PTR_ERR(newmap);
+ err = handle_one_map(osdc, p, p + maplen, true,
+ &need_resend, &need_resend_linger);
+ if (err)
goto bad;
- }
- BUG_ON(!newmap);
- if (newmap != osdc->osdmap) {
- ceph_osdmap_destroy(osdc->osdmap);
- osdc->osdmap = newmap;
- }
- was_full = was_full ||
- ceph_osdmap_flag(osdc->osdmap,
- CEPH_OSDMAP_FULL);
- kick_requests(osdc, 0, was_full);
+ handled_incremental = true;
} else {
dout("ignoring incremental map %u len %d\n",
epoch, maplen);
}
- p = next;
+ p += maplen;
nr_maps--;
}
- if (newmap)
+ if (handled_incremental)
goto done;
/* full maps */
@@ -2186,455 +3216,647 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
if (nr_maps > 1) {
dout("skipping non-latest full map %u len %d\n",
epoch, maplen);
- } else if (osdc->osdmap && osdc->osdmap->epoch >= epoch) {
+ } else if (osdc->osdmap->epoch >= epoch) {
dout("skipping full map %u len %d, "
"older than our %u\n", epoch, maplen,
osdc->osdmap->epoch);
} else {
- int skipped_map = 0;
-
dout("taking full map %u len %d\n", epoch, maplen);
- newmap = ceph_osdmap_decode(&p, p+maplen);
- if (IS_ERR(newmap)) {
- err = PTR_ERR(newmap);
+ err = handle_one_map(osdc, p, p + maplen, false,
+ &need_resend, &need_resend_linger);
+ if (err)
goto bad;
- }
- BUG_ON(!newmap);
- oldmap = osdc->osdmap;
- osdc->osdmap = newmap;
- if (oldmap) {
- if (oldmap->epoch + 1 < newmap->epoch)
- skipped_map = 1;
- ceph_osdmap_destroy(oldmap);
- }
- was_full = was_full ||
- ceph_osdmap_flag(osdc->osdmap,
- CEPH_OSDMAP_FULL);
- kick_requests(osdc, skipped_map, was_full);
}
p += maplen;
nr_maps--;
}
- if (!osdc->osdmap)
- goto bad;
done:
- downgrade_write(&osdc->map_sem);
- ceph_monc_got_map(&osdc->client->monc, CEPH_SUB_OSDMAP,
- osdc->osdmap->epoch);
-
/*
* subscribe to subsequent osdmap updates if full to ensure
* we find out when we are no longer full and stop returning
* ENOSPC.
*/
- if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) ||
- ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD) ||
- ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR))
- ceph_monc_request_next_osdmap(&osdc->client->monc);
-
- mutex_lock(&osdc->request_mutex);
- __send_queued(osdc);
- mutex_unlock(&osdc->request_mutex);
- up_read(&osdc->map_sem);
+ pauserd = ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSERD);
+ pausewr = ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR) ||
+ ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) ||
+ have_pool_full(osdc);
+ if (was_pauserd || was_pausewr || pauserd || pausewr)
+ maybe_request_map(osdc);
+
+ kick_requests(osdc, &need_resend, &need_resend_linger);
+
+ ceph_monc_got_map(&osdc->client->monc, CEPH_SUB_OSDMAP,
+ osdc->osdmap->epoch);
+ up_write(&osdc->lock);
wake_up_all(&osdc->client->auth_wq);
return;
bad:
pr_err("osdc handle_map corrupt msg\n");
ceph_msg_dump(msg);
- up_write(&osdc->map_sem);
+ up_write(&osdc->lock);
}
/*
- * watch/notify callback event infrastructure
- *
- * These callbacks are used both for watch and notify operations.
+ * Resubmit requests pending on the given osd.
*/
-static void __release_event(struct kref *kref)
+static void kick_osd_requests(struct ceph_osd *osd)
{
- struct ceph_osd_event *event =
- container_of(kref, struct ceph_osd_event, kref);
+ struct rb_node *n;
- dout("__release_event %p\n", event);
- kfree(event);
-}
+ for (n = rb_first(&osd->o_requests); n; ) {
+ struct ceph_osd_request *req =
+ rb_entry(n, struct ceph_osd_request, r_node);
-static void get_event(struct ceph_osd_event *event)
-{
- kref_get(&event->kref);
-}
+ n = rb_next(n); /* cancel_linger_request() */
-void ceph_osdc_put_event(struct ceph_osd_event *event)
-{
- kref_put(&event->kref, __release_event);
+ if (!req->r_linger) {
+ if (!req->r_t.paused)
+ send_request(req);
+ } else {
+ cancel_linger_request(req);
+ }
+ }
+ for (n = rb_first(&osd->o_linger_requests); n; n = rb_next(n)) {
+ struct ceph_osd_linger_request *lreq =
+ rb_entry(n, struct ceph_osd_linger_request, node);
+
+ send_linger(lreq);
+ }
}
-EXPORT_SYMBOL(ceph_osdc_put_event);
-static void __insert_event(struct ceph_osd_client *osdc,
- struct ceph_osd_event *new)
+/*
+ * If the osd connection drops, we need to resubmit all requests.
+ */
+static void osd_fault(struct ceph_connection *con)
{
- struct rb_node **p = &osdc->event_tree.rb_node;
- struct rb_node *parent = NULL;
- struct ceph_osd_event *event = NULL;
+ struct ceph_osd *osd = con->private;
+ struct ceph_osd_client *osdc = osd->o_osdc;
- while (*p) {
- parent = *p;
- event = rb_entry(parent, struct ceph_osd_event, node);
- if (new->cookie < event->cookie)
- p = &(*p)->rb_left;
- else if (new->cookie > event->cookie)
- p = &(*p)->rb_right;
- else
- BUG();
+ dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd);
+
+ down_write(&osdc->lock);
+ if (!osd_registered(osd)) {
+ dout("%s osd%d unknown\n", __func__, osd->o_osd);
+ goto out_unlock;
}
- rb_link_node(&new->node, parent, p);
- rb_insert_color(&new->node, &osdc->event_tree);
+ if (!reopen_osd(osd))
+ kick_osd_requests(osd);
+ maybe_request_map(osdc);
+
+out_unlock:
+ up_write(&osdc->lock);
}
-static struct ceph_osd_event *__find_event(struct ceph_osd_client *osdc,
- u64 cookie)
+/*
+ * Process osd watch notifications
+ */
+static void handle_watch_notify(struct ceph_osd_client *osdc,
+ struct ceph_msg *msg)
{
- struct rb_node **p = &osdc->event_tree.rb_node;
- struct rb_node *parent = NULL;
- struct ceph_osd_event *event = NULL;
+ void *p = msg->front.iov_base;
+ void *const end = p + msg->front.iov_len;
+ struct ceph_osd_linger_request *lreq;
+ struct linger_work *lwork;
+ u8 proto_ver, opcode;
+ u64 cookie, notify_id;
+ u64 notifier_id = 0;
+ s32 return_code = 0;
+ void *payload = NULL;
+ u32 payload_len = 0;
- while (*p) {
- parent = *p;
- event = rb_entry(parent, struct ceph_osd_event, node);
- if (cookie < event->cookie)
- p = &(*p)->rb_left;
- else if (cookie > event->cookie)
- p = &(*p)->rb_right;
- else
- return event;
+ ceph_decode_8_safe(&p, end, proto_ver, bad);
+ ceph_decode_8_safe(&p, end, opcode, bad);
+ ceph_decode_64_safe(&p, end, cookie, bad);
+ p += 8; /* skip ver */
+ ceph_decode_64_safe(&p, end, notify_id, bad);
+
+ if (proto_ver >= 1) {
+ ceph_decode_32_safe(&p, end, payload_len, bad);
+ ceph_decode_need(&p, end, payload_len, bad);
+ payload = p;
+ p += payload_len;
}
- return NULL;
-}
-static void __remove_event(struct ceph_osd_event *event)
-{
- struct ceph_osd_client *osdc = event->osdc;
+ if (le16_to_cpu(msg->hdr.version) >= 2)
+ ceph_decode_32_safe(&p, end, return_code, bad);
+
+ if (le16_to_cpu(msg->hdr.version) >= 3)
+ ceph_decode_64_safe(&p, end, notifier_id, bad);
- if (!RB_EMPTY_NODE(&event->node)) {
- dout("__remove_event removed %p\n", event);
- rb_erase(&event->node, &osdc->event_tree);
- ceph_osdc_put_event(event);
+ down_read(&osdc->lock);
+ lreq = lookup_linger_osdc(&osdc->linger_requests, cookie);
+ if (!lreq) {
+ dout("%s opcode %d cookie %llu dne\n", __func__, opcode,
+ cookie);
+ goto out_unlock_osdc;
+ }
+
+ mutex_lock(&lreq->lock);
+ dout("%s opcode %d cookie %llu lreq %p is_watch %d\n", __func__,
+ opcode, cookie, lreq, lreq->is_watch);
+ if (opcode == CEPH_WATCH_EVENT_DISCONNECT) {
+ if (!lreq->last_error) {
+ lreq->last_error = -ENOTCONN;
+ queue_watch_error(lreq);
+ }
+ } else if (!lreq->is_watch) {
+ /* CEPH_WATCH_EVENT_NOTIFY_COMPLETE */
+ if (lreq->notify_id && lreq->notify_id != notify_id) {
+ dout("lreq %p notify_id %llu != %llu, ignoring\n", lreq,
+ lreq->notify_id, notify_id);
+ } else if (!completion_done(&lreq->notify_finish_wait)) {
+ struct ceph_msg_data *data =
+ list_first_entry_or_null(&msg->data,
+ struct ceph_msg_data,
+ links);
+
+ if (data) {
+ if (lreq->preply_pages) {
+ WARN_ON(data->type !=
+ CEPH_MSG_DATA_PAGES);
+ *lreq->preply_pages = data->pages;
+ *lreq->preply_len = data->length;
+ } else {
+ ceph_release_page_vector(data->pages,
+ calc_pages_for(0, data->length));
+ }
+ }
+ lreq->notify_finish_error = return_code;
+ complete_all(&lreq->notify_finish_wait);
+ }
} else {
- dout("__remove_event didn't remove %p\n", event);
+ /* CEPH_WATCH_EVENT_NOTIFY */
+ lwork = lwork_alloc(lreq, do_watch_notify);
+ if (!lwork) {
+ pr_err("failed to allocate notify-lwork\n");
+ goto out_unlock_lreq;
+ }
+
+ lwork->notify.notify_id = notify_id;
+ lwork->notify.notifier_id = notifier_id;
+ lwork->notify.payload = payload;
+ lwork->notify.payload_len = payload_len;
+ lwork->notify.msg = ceph_msg_get(msg);
+ lwork_queue(lwork);
}
+
+out_unlock_lreq:
+ mutex_unlock(&lreq->lock);
+out_unlock_osdc:
+ up_read(&osdc->lock);
+ return;
+
+bad:
+ pr_err("osdc handle_watch_notify corrupt msg\n");
}
-int ceph_osdc_create_event(struct ceph_osd_client *osdc,
- void (*event_cb)(u64, u64, u8, void *),
- void *data, struct ceph_osd_event **pevent)
+/*
+ * Register request, send initial attempt.
+ */
+int ceph_osdc_start_request(struct ceph_osd_client *osdc,
+ struct ceph_osd_request *req,
+ bool nofail)
{
- struct ceph_osd_event *event;
+ down_read(&osdc->lock);
+ submit_request(req, false);
+ up_read(&osdc->lock);
- event = kmalloc(sizeof(*event), GFP_NOIO);
- if (!event)
- return -ENOMEM;
-
- dout("create_event %p\n", event);
- event->cb = event_cb;
- event->one_shot = 0;
- event->data = data;
- event->osdc = osdc;
- INIT_LIST_HEAD(&event->osd_node);
- RB_CLEAR_NODE(&event->node);
- kref_init(&event->kref); /* one ref for us */
- kref_get(&event->kref); /* one ref for the caller */
-
- spin_lock(&osdc->event_lock);
- event->cookie = ++osdc->event_count;
- __insert_event(osdc, event);
- spin_unlock(&osdc->event_lock);
-
- *pevent = event;
return 0;
}
-EXPORT_SYMBOL(ceph_osdc_create_event);
+EXPORT_SYMBOL(ceph_osdc_start_request);
-void ceph_osdc_cancel_event(struct ceph_osd_event *event)
+/*
+ * Unregister a registered request. The request is not completed (i.e.
+ * no callbacks or wakeups) - higher layers are supposed to know what
+ * they are canceling.
+ */
+void ceph_osdc_cancel_request(struct ceph_osd_request *req)
{
- struct ceph_osd_client *osdc = event->osdc;
+ struct ceph_osd_client *osdc = req->r_osdc;
- dout("cancel_event %p\n", event);
- spin_lock(&osdc->event_lock);
- __remove_event(event);
- spin_unlock(&osdc->event_lock);
- ceph_osdc_put_event(event); /* caller's */
+ down_write(&osdc->lock);
+ if (req->r_osd)
+ cancel_request(req);
+ up_write(&osdc->lock);
}
-EXPORT_SYMBOL(ceph_osdc_cancel_event);
-
+EXPORT_SYMBOL(ceph_osdc_cancel_request);
-static void do_event_work(struct work_struct *work)
+/*
+ * @timeout: in jiffies, 0 means "wait forever"
+ */
+static int wait_request_timeout(struct ceph_osd_request *req,
+ unsigned long timeout)
{
- struct ceph_osd_event_work *event_work =
- container_of(work, struct ceph_osd_event_work, work);
- struct ceph_osd_event *event = event_work->event;
- u64 ver = event_work->ver;
- u64 notify_id = event_work->notify_id;
- u8 opcode = event_work->opcode;
+ long left;
+
+ dout("%s req %p tid %llu\n", __func__, req, req->r_tid);
+ left = wait_for_completion_killable_timeout(&req->r_completion,
+ ceph_timeout_jiffies(timeout));
+ if (left <= 0) {
+ left = left ?: -ETIMEDOUT;
+ ceph_osdc_cancel_request(req);
- dout("do_event_work completing %p\n", event);
- event->cb(ver, notify_id, opcode, event->data);
- dout("do_event_work completed %p\n", event);
- ceph_osdc_put_event(event);
- kfree(event_work);
+ /* kludge - need to to wake ceph_osdc_sync() */
+ complete_all(&req->r_safe_completion);
+ } else {
+ left = req->r_result; /* completed */
+ }
+
+ return left;
}
+/*
+ * wait for a request to complete
+ */
+int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
+ struct ceph_osd_request *req)
+{
+ return wait_request_timeout(req, 0);
+}
+EXPORT_SYMBOL(ceph_osdc_wait_request);
/*
- * Process osd watch notifications
+ * sync - wait for all in-flight requests to flush. avoid starvation.
*/
-static void handle_watch_notify(struct ceph_osd_client *osdc,
- struct ceph_msg *msg)
+void ceph_osdc_sync(struct ceph_osd_client *osdc)
{
- void *p, *end;
- u8 proto_ver;
- u64 cookie, ver, notify_id;
- u8 opcode;
- struct ceph_osd_event *event;
- struct ceph_osd_event_work *event_work;
+ struct rb_node *n, *p;
+ u64 last_tid = atomic64_read(&osdc->last_tid);
- p = msg->front.iov_base;
- end = p + msg->front.iov_len;
+again:
+ down_read(&osdc->lock);
+ for (n = rb_first(&osdc->osds); n; n = rb_next(n)) {
+ struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);
- ceph_decode_8_safe(&p, end, proto_ver, bad);
- ceph_decode_8_safe(&p, end, opcode, bad);
- ceph_decode_64_safe(&p, end, cookie, bad);
- ceph_decode_64_safe(&p, end, ver, bad);
- ceph_decode_64_safe(&p, end, notify_id, bad);
+ mutex_lock(&osd->lock);
+ for (p = rb_first(&osd->o_requests); p; p = rb_next(p)) {
+ struct ceph_osd_request *req =
+ rb_entry(p, struct ceph_osd_request, r_node);
+
+ if (req->r_tid > last_tid)
+ break;
- spin_lock(&osdc->event_lock);
- event = __find_event(osdc, cookie);
- if (event) {
- BUG_ON(event->one_shot);
- get_event(event);
- }
- spin_unlock(&osdc->event_lock);
- dout("handle_watch_notify cookie %lld ver %lld event %p\n",
- cookie, ver, event);
- if (event) {
- event_work = kmalloc(sizeof(*event_work), GFP_NOIO);
- if (!event_work) {
- pr_err("couldn't allocate event_work\n");
- ceph_osdc_put_event(event);
- return;
+ if (!(req->r_flags & CEPH_OSD_FLAG_WRITE))
+ continue;
+
+ ceph_osdc_get_request(req);
+ mutex_unlock(&osd->lock);
+ up_read(&osdc->lock);
+ dout("%s waiting on req %p tid %llu last_tid %llu\n",
+ __func__, req, req->r_tid, last_tid);
+ wait_for_completion(&req->r_safe_completion);
+ ceph_osdc_put_request(req);
+ goto again;
}
- INIT_WORK(&event_work->work, do_event_work);
- event_work->event = event;
- event_work->ver = ver;
- event_work->notify_id = notify_id;
- event_work->opcode = opcode;
- queue_work(osdc->notify_wq, &event_work->work);
+ mutex_unlock(&osd->lock);
}
- return;
+ up_read(&osdc->lock);
+ dout("%s done last_tid %llu\n", __func__, last_tid);
+}
+EXPORT_SYMBOL(ceph_osdc_sync);
-bad:
- pr_err("osdc handle_watch_notify corrupt msg\n");
+static struct ceph_osd_request *
+alloc_linger_request(struct ceph_osd_linger_request *lreq)
+{
+ struct ceph_osd_request *req;
+
+ req = ceph_osdc_alloc_request(lreq->osdc, NULL, 1, false, GFP_NOIO);
+ if (!req)
+ return NULL;
+
+ ceph_oid_copy(&req->r_base_oid, &lreq->t.base_oid);
+ ceph_oloc_copy(&req->r_base_oloc, &lreq->t.base_oloc);
+
+ if (ceph_osdc_alloc_messages(req, GFP_NOIO)) {
+ ceph_osdc_put_request(req);
+ return NULL;
+ }
+
+ return req;
}
/*
- * build new request AND message
- *
+ * Returns a handle, caller owns a ref.
*/
-void ceph_osdc_build_request(struct ceph_osd_request *req, u64 off,
- struct ceph_snap_context *snapc, u64 snap_id,
- struct timespec *mtime)
-{
- struct ceph_msg *msg = req->r_request;
- void *p;
- size_t msg_size;
- int flags = req->r_flags;
- u64 data_len;
- unsigned int i;
-
- req->r_snapid = snap_id;
- req->r_snapc = ceph_get_snap_context(snapc);
-
- /* encode request */
- msg->hdr.version = cpu_to_le16(4);
-
- p = msg->front.iov_base;
- ceph_encode_32(&p, 1); /* client_inc is always 1 */
- req->r_request_osdmap_epoch = p;
- p += 4;
- req->r_request_flags = p;
- p += 4;
- if (req->r_flags & CEPH_OSD_FLAG_WRITE)
- ceph_encode_timespec(p, mtime);
- p += sizeof(struct ceph_timespec);
- req->r_request_reassert_version = p;
- p += sizeof(struct ceph_eversion); /* will get filled in */
-
- /* oloc */
- ceph_encode_8(&p, 4);
- ceph_encode_8(&p, 4);
- ceph_encode_32(&p, 8 + 4 + 4);
- req->r_request_pool = p;
- p += 8;
- ceph_encode_32(&p, -1); /* preferred */
- ceph_encode_32(&p, 0); /* key len */
+struct ceph_osd_linger_request *
+ceph_osdc_watch(struct ceph_osd_client *osdc,
+ struct ceph_object_id *oid,
+ struct ceph_object_locator *oloc,
+ rados_watchcb2_t wcb,
+ rados_watcherrcb_t errcb,
+ void *data)
+{
+ struct ceph_osd_linger_request *lreq;
+ int ret;
- ceph_encode_8(&p, 1);
- req->r_request_pgid = p;
- p += 8 + 4;
- ceph_encode_32(&p, -1); /* preferred */
+ lreq = linger_alloc(osdc);
+ if (!lreq)
+ return ERR_PTR(-ENOMEM);
- /* oid */
- ceph_encode_32(&p, req->r_base_oid.name_len);
- memcpy(p, req->r_base_oid.name, req->r_base_oid.name_len);
- dout("oid '%.*s' len %d\n", req->r_base_oid.name_len,
- req->r_base_oid.name, req->r_base_oid.name_len);
- p += req->r_base_oid.name_len;
-
- /* ops--can imply data */
- ceph_encode_16(&p, (u16)req->r_num_ops);
- data_len = 0;
- for (i = 0; i < req->r_num_ops; i++) {
- data_len += osd_req_encode_op(req, p, i);
- p += sizeof(struct ceph_osd_op);
+ lreq->is_watch = true;
+ lreq->wcb = wcb;
+ lreq->errcb = errcb;
+ lreq->data = data;
+ lreq->watch_valid_thru = jiffies;
+
+ ceph_oid_copy(&lreq->t.base_oid, oid);
+ ceph_oloc_copy(&lreq->t.base_oloc, oloc);
+ lreq->t.flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
+ lreq->mtime = CURRENT_TIME;
+
+ lreq->reg_req = alloc_linger_request(lreq);
+ if (!lreq->reg_req) {
+ ret = -ENOMEM;
+ goto err_put_lreq;
}
- /* snaps */
- ceph_encode_64(&p, req->r_snapid);
- ceph_encode_64(&p, req->r_snapc ? req->r_snapc->seq : 0);
- ceph_encode_32(&p, req->r_snapc ? req->r_snapc->num_snaps : 0);
- if (req->r_snapc) {
- for (i = 0; i < snapc->num_snaps; i++) {
- ceph_encode_64(&p, req->r_snapc->snaps[i]);
- }
+ lreq->ping_req = alloc_linger_request(lreq);
+ if (!lreq->ping_req) {
+ ret = -ENOMEM;
+ goto err_put_lreq;
}
- req->r_request_attempts = p;
- p += 4;
-
- /* data */
- if (flags & CEPH_OSD_FLAG_WRITE) {
- u16 data_off;
-
- /*
- * The header "data_off" is a hint to the receiver
- * allowing it to align received data into its
- * buffers such that there's no need to re-copy
- * it before writing it to disk (direct I/O).
- */
- data_off = (u16) (off & 0xffff);
- req->r_request->hdr.data_off = cpu_to_le16(data_off);
+ down_write(&osdc->lock);
+ linger_register(lreq); /* before osd_req_op_* */
+ osd_req_op_watch_init(lreq->reg_req, 0, lreq->linger_id,
+ CEPH_OSD_WATCH_OP_WATCH);
+ osd_req_op_watch_init(lreq->ping_req, 0, lreq->linger_id,
+ CEPH_OSD_WATCH_OP_PING);
+ linger_submit(lreq);
+ up_write(&osdc->lock);
+
+ ret = linger_reg_commit_wait(lreq);
+ if (ret) {
+ linger_cancel(lreq);
+ goto err_put_lreq;
}
- req->r_request->hdr.data_len = cpu_to_le32(data_len);
- BUG_ON(p > msg->front.iov_base + msg->front.iov_len);
- msg_size = p - msg->front.iov_base;
- msg->front.iov_len = msg_size;
- msg->hdr.front_len = cpu_to_le32(msg_size);
+ return lreq;
- dout("build_request msg_size was %d\n", (int)msg_size);
+err_put_lreq:
+ linger_put(lreq);
+ return ERR_PTR(ret);
}
-EXPORT_SYMBOL(ceph_osdc_build_request);
+EXPORT_SYMBOL(ceph_osdc_watch);
/*
- * Register request, send initial attempt.
+ * Releases a ref.
+ *
+ * Times out after mount_timeout to preserve rbd unmap behaviour
+ * introduced in 2894e1d76974 ("rbd: timeout watch teardown on unmap
+ * with mount_timeout").
*/
-int ceph_osdc_start_request(struct ceph_osd_client *osdc,
- struct ceph_osd_request *req,
- bool nofail)
+int ceph_osdc_unwatch(struct ceph_osd_client *osdc,
+ struct ceph_osd_linger_request *lreq)
{
- int rc;
+ struct ceph_options *opts = osdc->client->options;
+ struct ceph_osd_request *req;
+ int ret;
- down_read(&osdc->map_sem);
- mutex_lock(&osdc->request_mutex);
+ req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_NOIO);
+ if (!req)
+ return -ENOMEM;
- rc = __ceph_osdc_start_request(osdc, req, nofail);
+ ceph_oid_copy(&req->r_base_oid, &lreq->t.base_oid);
+ ceph_oloc_copy(&req->r_base_oloc, &lreq->t.base_oloc);
+ req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
+ req->r_mtime = CURRENT_TIME;
+ osd_req_op_watch_init(req, 0, lreq->linger_id,
+ CEPH_OSD_WATCH_OP_UNWATCH);
- mutex_unlock(&osdc->request_mutex);
- up_read(&osdc->map_sem);
+ ret = ceph_osdc_alloc_messages(req, GFP_NOIO);
+ if (ret)
+ goto out_put_req;
- return rc;
+ ceph_osdc_start_request(osdc, req, false);
+ linger_cancel(lreq);
+ linger_put(lreq);
+ ret = wait_request_timeout(req, opts->mount_timeout);
+
+out_put_req:
+ ceph_osdc_put_request(req);
+ return ret;
}
-EXPORT_SYMBOL(ceph_osdc_start_request);
+EXPORT_SYMBOL(ceph_osdc_unwatch);
-/*
- * Unregister a registered request. The request is not completed (i.e.
- * no callbacks or wakeups) - higher layers are supposed to know what
- * they are canceling.
- */
-void ceph_osdc_cancel_request(struct ceph_osd_request *req)
+static int osd_req_op_notify_ack_init(struct ceph_osd_request *req, int which,
+ u64 notify_id, u64 cookie, void *payload,
+ size_t payload_len)
{
- struct ceph_osd_client *osdc = req->r_osdc;
+ struct ceph_osd_req_op *op;
+ struct ceph_pagelist *pl;
+ int ret;
+
+ op = _osd_req_op_init(req, which, CEPH_OSD_OP_NOTIFY_ACK, 0);
+
+ pl = kmalloc(sizeof(*pl), GFP_NOIO);
+ if (!pl)
+ return -ENOMEM;
- mutex_lock(&osdc->request_mutex);
- if (req->r_linger)
- __unregister_linger_request(osdc, req);
- __unregister_request(osdc, req);
- mutex_unlock(&osdc->request_mutex);
+ ceph_pagelist_init(pl);
+ ret = ceph_pagelist_encode_64(pl, notify_id);
+ ret |= ceph_pagelist_encode_64(pl, cookie);
+ if (payload) {
+ ret |= ceph_pagelist_encode_32(pl, payload_len);
+ ret |= ceph_pagelist_append(pl, payload, payload_len);
+ } else {
+ ret |= ceph_pagelist_encode_32(pl, 0);
+ }
+ if (ret) {
+ ceph_pagelist_release(pl);
+ return -ENOMEM;
+ }
- dout("%s %p tid %llu canceled\n", __func__, req, req->r_tid);
+ ceph_osd_data_pagelist_init(&op->notify_ack.request_data, pl);
+ op->indata_len = pl->length;
+ return 0;
}
-EXPORT_SYMBOL(ceph_osdc_cancel_request);
-/*
- * wait for a request to complete
- */
-int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
- struct ceph_osd_request *req)
+int ceph_osdc_notify_ack(struct ceph_osd_client *osdc,
+ struct ceph_object_id *oid,
+ struct ceph_object_locator *oloc,
+ u64 notify_id,
+ u64 cookie,
+ void *payload,
+ size_t payload_len)
+{
+ struct ceph_osd_request *req;
+ int ret;
+
+ req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_NOIO);
+ if (!req)
+ return -ENOMEM;
+
+ ceph_oid_copy(&req->r_base_oid, oid);
+ ceph_oloc_copy(&req->r_base_oloc, oloc);
+ req->r_flags = CEPH_OSD_FLAG_READ;
+
+ ret = ceph_osdc_alloc_messages(req, GFP_NOIO);
+ if (ret)
+ goto out_put_req;
+
+ ret = osd_req_op_notify_ack_init(req, 0, notify_id, cookie, payload,
+ payload_len);
+ if (ret)
+ goto out_put_req;
+
+ ceph_osdc_start_request(osdc, req, false);
+ ret = ceph_osdc_wait_request(osdc, req);
+
+out_put_req:
+ ceph_osdc_put_request(req);
+ return ret;
+}
+EXPORT_SYMBOL(ceph_osdc_notify_ack);
+
+static int osd_req_op_notify_init(struct ceph_osd_request *req, int which,
+ u64 cookie, u32 prot_ver, u32 timeout,
+ void *payload, size_t payload_len)
{
- int rc;
+ struct ceph_osd_req_op *op;
+ struct ceph_pagelist *pl;
+ int ret;
- dout("%s %p tid %llu\n", __func__, req, req->r_tid);
+ op = _osd_req_op_init(req, which, CEPH_OSD_OP_NOTIFY, 0);
+ op->notify.cookie = cookie;
- rc = wait_for_completion_interruptible(&req->r_completion);
- if (rc < 0) {
- dout("%s %p tid %llu interrupted\n", __func__, req, req->r_tid);
- ceph_osdc_cancel_request(req);
- complete_request(req);
- return rc;
+ pl = kmalloc(sizeof(*pl), GFP_NOIO);
+ if (!pl)
+ return -ENOMEM;
+
+ ceph_pagelist_init(pl);
+ ret = ceph_pagelist_encode_32(pl, 1); /* prot_ver */
+ ret |= ceph_pagelist_encode_32(pl, timeout);
+ ret |= ceph_pagelist_encode_32(pl, payload_len);
+ ret |= ceph_pagelist_append(pl, payload, payload_len);
+ if (ret) {
+ ceph_pagelist_release(pl);
+ return -ENOMEM;
}
- dout("%s %p tid %llu result %d\n", __func__, req, req->r_tid,
- req->r_result);
- return req->r_result;
+ ceph_osd_data_pagelist_init(&op->notify.request_data, pl);
+ op->indata_len = pl->length;
+ return 0;
}
-EXPORT_SYMBOL(ceph_osdc_wait_request);
/*
- * sync - wait for all in-flight requests to flush. avoid starvation.
+ * @timeout: in seconds
+ *
+ * @preply_{pages,len} are initialized both on success and error.
+ * The caller is responsible for:
+ *
+ * ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len))
*/
-void ceph_osdc_sync(struct ceph_osd_client *osdc)
+int ceph_osdc_notify(struct ceph_osd_client *osdc,
+ struct ceph_object_id *oid,
+ struct ceph_object_locator *oloc,
+ void *payload,
+ size_t payload_len,
+ u32 timeout,
+ struct page ***preply_pages,
+ size_t *preply_len)
{
- struct ceph_osd_request *req;
- u64 last_tid, next_tid = 0;
+ struct ceph_osd_linger_request *lreq;
+ struct page **pages;
+ int ret;
- mutex_lock(&osdc->request_mutex);
- last_tid = osdc->last_tid;
- while (1) {
- req = __lookup_request_ge(osdc, next_tid);
- if (!req)
- break;
- if (req->r_tid > last_tid)
- break;
+ WARN_ON(!timeout);
+ if (preply_pages) {
+ *preply_pages = NULL;
+ *preply_len = 0;
+ }
- next_tid = req->r_tid + 1;
- if ((req->r_flags & CEPH_OSD_FLAG_WRITE) == 0)
- continue;
+ lreq = linger_alloc(osdc);
+ if (!lreq)
+ return -ENOMEM;
- ceph_osdc_get_request(req);
- mutex_unlock(&osdc->request_mutex);
- dout("sync waiting on tid %llu (last is %llu)\n",
- req->r_tid, last_tid);
- wait_for_completion(&req->r_safe_completion);
- mutex_lock(&osdc->request_mutex);
- ceph_osdc_put_request(req);
+ lreq->preply_pages = preply_pages;
+ lreq->preply_len = preply_len;
+
+ ceph_oid_copy(&lreq->t.base_oid, oid);
+ ceph_oloc_copy(&lreq->t.base_oloc, oloc);
+ lreq->t.flags = CEPH_OSD_FLAG_READ;
+
+ lreq->reg_req = alloc_linger_request(lreq);
+ if (!lreq->reg_req) {
+ ret = -ENOMEM;
+ goto out_put_lreq;
}
- mutex_unlock(&osdc->request_mutex);
- dout("sync done (thru tid %llu)\n", last_tid);
+
+ /* for notify_id */
+ pages = ceph_alloc_page_vector(1, GFP_NOIO);
+ if (IS_ERR(pages)) {
+ ret = PTR_ERR(pages);
+ goto out_put_lreq;
+ }
+
+ down_write(&osdc->lock);
+ linger_register(lreq); /* before osd_req_op_* */
+ ret = osd_req_op_notify_init(lreq->reg_req, 0, lreq->linger_id, 1,
+ timeout, payload, payload_len);
+ if (ret) {
+ linger_unregister(lreq);
+ up_write(&osdc->lock);
+ ceph_release_page_vector(pages, 1);
+ goto out_put_lreq;
+ }
+ ceph_osd_data_pages_init(osd_req_op_data(lreq->reg_req, 0, notify,
+ response_data),
+ pages, PAGE_SIZE, 0, false, true);
+ linger_submit(lreq);
+ up_write(&osdc->lock);
+
+ ret = linger_reg_commit_wait(lreq);
+ if (!ret)
+ ret = linger_notify_finish_wait(lreq);
+ else
+ dout("lreq %p failed to initiate notify %d\n", lreq, ret);
+
+ linger_cancel(lreq);
+out_put_lreq:
+ linger_put(lreq);
+ return ret;
+}
+EXPORT_SYMBOL(ceph_osdc_notify);
+
+/*
+ * Return the number of milliseconds since the watch was last
+ * confirmed, or an error. If there is an error, the watch is no
+ * longer valid, and should be destroyed with ceph_osdc_unwatch().
+ */
+int ceph_osdc_watch_check(struct ceph_osd_client *osdc,
+ struct ceph_osd_linger_request *lreq)
+{
+ unsigned long stamp, age;
+ int ret;
+
+ down_read(&osdc->lock);
+ mutex_lock(&lreq->lock);
+ stamp = lreq->watch_valid_thru;
+ if (!list_empty(&lreq->pending_lworks)) {
+ struct linger_work *lwork =
+ list_first_entry(&lreq->pending_lworks,
+ struct linger_work,
+ pending_item);
+
+ if (time_before(lwork->queued_stamp, stamp))
+ stamp = lwork->queued_stamp;
+ }
+ age = jiffies - stamp;
+ dout("%s lreq %p linger_id %llu age %lu last_error %d\n", __func__,
+ lreq, lreq->linger_id, age, lreq->last_error);
+ /* we are truncating to msecs, so return a safe upper bound */
+ ret = lreq->last_error ?: 1 + jiffies_to_msecs(age);
+
+ mutex_unlock(&lreq->lock);
+ up_read(&osdc->lock);
+ return ret;
}
-EXPORT_SYMBOL(ceph_osdc_sync);
/*
* Call all pending notify callbacks - for use after a watch is
@@ -2646,6 +3868,13 @@ void ceph_osdc_flush_notifies(struct ceph_osd_client *osdc)
}
EXPORT_SYMBOL(ceph_osdc_flush_notifies);
+void ceph_osdc_maybe_request_map(struct ceph_osd_client *osdc)
+{
+ down_read(&osdc->lock);
+ maybe_request_map(osdc);
+ up_read(&osdc->lock);
+}
+EXPORT_SYMBOL(ceph_osdc_maybe_request_map);
/*
* init, shutdown
@@ -2656,43 +3885,35 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
dout("init\n");
osdc->client = client;
- osdc->osdmap = NULL;
- init_rwsem(&osdc->map_sem);
- init_completion(&osdc->map_waiters);
- osdc->last_requested_map = 0;
- mutex_init(&osdc->request_mutex);
- osdc->last_tid = 0;
+ init_rwsem(&osdc->lock);
osdc->osds = RB_ROOT;
INIT_LIST_HEAD(&osdc->osd_lru);
- osdc->requests = RB_ROOT;
- INIT_LIST_HEAD(&osdc->req_lru);
- INIT_LIST_HEAD(&osdc->req_unsent);
- INIT_LIST_HEAD(&osdc->req_notarget);
- INIT_LIST_HEAD(&osdc->req_linger);
- osdc->num_requests = 0;
+ spin_lock_init(&osdc->osd_lru_lock);
+ osd_init(&osdc->homeless_osd);
+ osdc->homeless_osd.o_osdc = osdc;
+ osdc->homeless_osd.o_osd = CEPH_HOMELESS_OSD;
+ osdc->linger_requests = RB_ROOT;
+ osdc->map_checks = RB_ROOT;
+ osdc->linger_map_checks = RB_ROOT;
INIT_DELAYED_WORK(&osdc->timeout_work, handle_timeout);
INIT_DELAYED_WORK(&osdc->osds_timeout_work, handle_osds_timeout);
- spin_lock_init(&osdc->event_lock);
- osdc->event_tree = RB_ROOT;
- osdc->event_count = 0;
-
- schedule_delayed_work(&osdc->osds_timeout_work,
- round_jiffies_relative(osdc->client->options->osd_idle_ttl));
err = -ENOMEM;
+ osdc->osdmap = ceph_osdmap_alloc();
+ if (!osdc->osdmap)
+ goto out;
+
osdc->req_mempool = mempool_create_slab_pool(10,
ceph_osd_request_cache);
if (!osdc->req_mempool)
- goto out;
+ goto out_map;
err = ceph_msgpool_init(&osdc->msgpool_op, CEPH_MSG_OSD_OP,
- OSD_OP_FRONT_LEN, 10, true,
- "osd_op");
+ PAGE_SIZE, 10, true, "osd_op");
if (err < 0)
goto out_mempool;
err = ceph_msgpool_init(&osdc->msgpool_op_reply, CEPH_MSG_OSD_OPREPLY,
- OSD_OPREPLY_FRONT_LEN, 10, true,
- "osd_op_reply");
+ PAGE_SIZE, 10, true, "osd_op_reply");
if (err < 0)
goto out_msgpool;
@@ -2701,6 +3922,11 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
if (!osdc->notify_wq)
goto out_msgpool_reply;
+ schedule_delayed_work(&osdc->timeout_work,
+ osdc->client->options->osd_keepalive_timeout);
+ schedule_delayed_work(&osdc->osds_timeout_work,
+ round_jiffies_relative(osdc->client->options->osd_idle_ttl));
+
return 0;
out_msgpool_reply:
@@ -2709,6 +3935,8 @@ out_msgpool:
ceph_msgpool_destroy(&osdc->msgpool_op);
out_mempool:
mempool_destroy(osdc->req_mempool);
+out_map:
+ ceph_osdmap_destroy(osdc->osdmap);
out:
return err;
}
@@ -2719,11 +3947,25 @@ void ceph_osdc_stop(struct ceph_osd_client *osdc)
destroy_workqueue(osdc->notify_wq);
cancel_delayed_work_sync(&osdc->timeout_work);
cancel_delayed_work_sync(&osdc->osds_timeout_work);
- if (osdc->osdmap) {
- ceph_osdmap_destroy(osdc->osdmap);
- osdc->osdmap = NULL;
+
+ down_write(&osdc->lock);
+ while (!RB_EMPTY_ROOT(&osdc->osds)) {
+ struct ceph_osd *osd = rb_entry(rb_first(&osdc->osds),
+ struct ceph_osd, o_node);
+ close_osd(osd);
}
- remove_all_osds(osdc);
+ up_write(&osdc->lock);
+ WARN_ON(atomic_read(&osdc->homeless_osd.o_ref) != 1);
+ osd_cleanup(&osdc->homeless_osd);
+
+ WARN_ON(!list_empty(&osdc->osd_lru));
+ WARN_ON(!RB_EMPTY_ROOT(&osdc->linger_requests));
+ WARN_ON(!RB_EMPTY_ROOT(&osdc->map_checks));
+ WARN_ON(!RB_EMPTY_ROOT(&osdc->linger_map_checks));
+ WARN_ON(atomic_read(&osdc->num_requests));
+ WARN_ON(atomic_read(&osdc->num_homeless));
+
+ ceph_osdmap_destroy(osdc->osdmap);
mempool_destroy(osdc->req_mempool);
ceph_msgpool_destroy(&osdc->msgpool_op);
ceph_msgpool_destroy(&osdc->msgpool_op_reply);
@@ -2752,15 +3994,12 @@ int ceph_osdc_readpages(struct ceph_osd_client *osdc,
return PTR_ERR(req);
/* it may be a short read due to an object boundary */
-
osd_req_op_extent_osd_data_pages(req, 0,
pages, *plen, page_align, false, false);
dout("readpages final extent is %llu~%llu (%llu bytes align %d)\n",
off, *plen, *plen, page_align);
- ceph_osdc_build_request(req, off, NULL, vino.snap, NULL);
-
rc = ceph_osdc_start_request(osdc, req, false);
if (!rc)
rc = ceph_osdc_wait_request(osdc, req);
@@ -2786,7 +4025,6 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
int rc = 0;
int page_align = off & ~PAGE_MASK;
- BUG_ON(vino.snap != CEPH_NOSNAP); /* snapshots aren't writeable */
req = ceph_osdc_new_request(osdc, layout, vino, off, &len, 0, 1,
CEPH_OSD_OP_WRITE,
CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE,
@@ -2800,8 +4038,7 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
false, false);
dout("writepages %llu~%llu (%llu bytes)\n", off, len, len);
- ceph_osdc_build_request(req, off, snapc, CEPH_NOSNAP, mtime);
-
+ req->r_mtime = *mtime;
rc = ceph_osdc_start_request(osdc, req, true);
if (!rc)
rc = ceph_osdc_wait_request(osdc, req);
@@ -2841,19 +4078,15 @@ EXPORT_SYMBOL(ceph_osdc_cleanup);
static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
{
struct ceph_osd *osd = con->private;
- struct ceph_osd_client *osdc;
+ struct ceph_osd_client *osdc = osd->o_osdc;
int type = le16_to_cpu(msg->hdr.type);
- if (!osd)
- goto out;
- osdc = osd->o_osdc;
-
switch (type) {
case CEPH_MSG_OSD_MAP:
ceph_osdc_handle_map(osdc, msg);
break;
case CEPH_MSG_OSD_OPREPLY:
- handle_reply(osdc, msg);
+ handle_reply(osd, msg);
break;
case CEPH_MSG_WATCH_NOTIFY:
handle_watch_notify(osdc, msg);
@@ -2863,7 +4096,7 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
pr_err("received unknown message type %d %s\n", type,
ceph_msg_type_name(type));
}
-out:
+
ceph_msg_put(msg);
}
@@ -2878,21 +4111,27 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
{
struct ceph_osd *osd = con->private;
struct ceph_osd_client *osdc = osd->o_osdc;
- struct ceph_msg *m;
+ struct ceph_msg *m = NULL;
struct ceph_osd_request *req;
int front_len = le32_to_cpu(hdr->front_len);
int data_len = le32_to_cpu(hdr->data_len);
- u64 tid;
+ u64 tid = le64_to_cpu(hdr->tid);
- tid = le64_to_cpu(hdr->tid);
- mutex_lock(&osdc->request_mutex);
- req = __lookup_request(osdc, tid);
+ down_read(&osdc->lock);
+ if (!osd_registered(osd)) {
+ dout("%s osd%d unknown, skipping\n", __func__, osd->o_osd);
+ *skip = 1;
+ goto out_unlock_osdc;
+ }
+ WARN_ON(osd->o_osd != le64_to_cpu(hdr->src.num));
+
+ mutex_lock(&osd->lock);
+ req = lookup_request(&osd->o_requests, tid);
if (!req) {
dout("%s osd%d tid %llu unknown, skipping\n", __func__,
osd->o_osd, tid);
- m = NULL;
*skip = 1;
- goto out;
+ goto out_unlock_session;
}
ceph_msg_revoke_incoming(req->r_reply);
@@ -2904,7 +4143,7 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front_len, GFP_NOFS,
false);
if (!m)
- goto out;
+ goto out_unlock_session;
ceph_msg_put(req->r_reply);
req->r_reply = m;
}
@@ -2915,14 +4154,49 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
req->r_reply->data_length);
m = NULL;
*skip = 1;
- goto out;
+ goto out_unlock_session;
}
m = ceph_msg_get(req->r_reply);
dout("get_reply tid %lld %p\n", tid, m);
-out:
- mutex_unlock(&osdc->request_mutex);
+out_unlock_session:
+ mutex_unlock(&osd->lock);
+out_unlock_osdc:
+ up_read(&osdc->lock);
+ return m;
+}
+
+/*
+ * TODO: switch to a msg-owned pagelist
+ */
+static struct ceph_msg *alloc_msg_with_page_vector(struct ceph_msg_header *hdr)
+{
+ struct ceph_msg *m;
+ int type = le16_to_cpu(hdr->type);
+ u32 front_len = le32_to_cpu(hdr->front_len);
+ u32 data_len = le32_to_cpu(hdr->data_len);
+
+ m = ceph_msg_new(type, front_len, GFP_NOIO, false);
+ if (!m)
+ return NULL;
+
+ if (data_len) {
+ struct page **pages;
+ struct ceph_osd_data osd_data;
+
+ pages = ceph_alloc_page_vector(calc_pages_for(0, data_len),
+ GFP_NOIO);
+ if (!pages) {
+ ceph_msg_put(m);
+ return NULL;
+ }
+
+ ceph_osd_data_pages_init(&osd_data, pages, data_len, 0, false,
+ false);
+ ceph_osdc_msg_data_add(m, &osd_data);
+ }
+
return m;
}
@@ -2932,18 +4206,17 @@ static struct ceph_msg *alloc_msg(struct ceph_connection *con,
{
struct ceph_osd *osd = con->private;
int type = le16_to_cpu(hdr->type);
- int front = le32_to_cpu(hdr->front_len);
*skip = 0;
switch (type) {
case CEPH_MSG_OSD_MAP:
case CEPH_MSG_WATCH_NOTIFY:
- return ceph_msg_new(type, front, GFP_NOFS, false);
+ return alloc_msg_with_page_vector(hdr);
case CEPH_MSG_OSD_OPREPLY:
return get_reply(con, hdr, skip);
default:
- pr_info("alloc_msg unexpected msg type %d from osd%d\n", type,
- osd->o_osd);
+ pr_warn("%s osd%d unknown msg type %d, skipping\n", __func__,
+ osd->o_osd, type);
*skip = 1;
return NULL;
}
@@ -3047,5 +4320,5 @@ static const struct ceph_connection_operations osd_con_ops = {
.alloc_msg = alloc_msg,
.sign_message = osd_sign_message,
.check_message_signature = osd_check_message_signature,
- .fault = osd_reset,
+ .fault = osd_fault,
};
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index 243574c8c..7e480bf75 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -380,23 +380,24 @@ bad:
return ERR_PTR(err);
}
-/*
- * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid
- * to a set of osds) and primary_temp (explicit primary setting)
- */
-static int pgid_cmp(struct ceph_pg l, struct ceph_pg r)
+int ceph_pg_compare(const struct ceph_pg *lhs, const struct ceph_pg *rhs)
{
- if (l.pool < r.pool)
+ if (lhs->pool < rhs->pool)
return -1;
- if (l.pool > r.pool)
+ if (lhs->pool > rhs->pool)
return 1;
- if (l.seed < r.seed)
+ if (lhs->seed < rhs->seed)
return -1;
- if (l.seed > r.seed)
+ if (lhs->seed > rhs->seed)
return 1;
+
return 0;
}
+/*
+ * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid
+ * to a set of osds) and primary_temp (explicit primary setting)
+ */
static int __insert_pg_mapping(struct ceph_pg_mapping *new,
struct rb_root *root)
{
@@ -409,7 +410,7 @@ static int __insert_pg_mapping(struct ceph_pg_mapping *new,
while (*p) {
parent = *p;
pg = rb_entry(parent, struct ceph_pg_mapping, node);
- c = pgid_cmp(new->pgid, pg->pgid);
+ c = ceph_pg_compare(&new->pgid, &pg->pgid);
if (c < 0)
p = &(*p)->rb_left;
else if (c > 0)
@@ -432,7 +433,7 @@ static struct ceph_pg_mapping *__lookup_pg_mapping(struct rb_root *root,
while (n) {
pg = rb_entry(n, struct ceph_pg_mapping, node);
- c = pgid_cmp(pgid, pg->pgid);
+ c = ceph_pg_compare(&pgid, &pg->pgid);
if (c < 0) {
n = n->rb_left;
} else if (c > 0) {
@@ -596,7 +597,9 @@ static int decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi)
*p += 4; /* skip crash_replay_interval */
if (ev >= 7)
- *p += 1; /* skip min_size */
+ pi->min_size = ceph_decode_8(p);
+ else
+ pi->min_size = pi->size - pi->size / 2;
if (ev >= 8)
*p += 8 + 8; /* skip quota_max_* */
@@ -616,6 +619,50 @@ static int decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi)
pi->write_tier = -1;
}
+ if (ev >= 10) {
+ /* skip properties */
+ num = ceph_decode_32(p);
+ while (num--) {
+ len = ceph_decode_32(p);
+ *p += len; /* key */
+ len = ceph_decode_32(p);
+ *p += len; /* val */
+ }
+ }
+
+ if (ev >= 11) {
+ /* skip hit_set_params */
+ *p += 1 + 1; /* versions */
+ len = ceph_decode_32(p);
+ *p += len;
+
+ *p += 4; /* skip hit_set_period */
+ *p += 4; /* skip hit_set_count */
+ }
+
+ if (ev >= 12)
+ *p += 4; /* skip stripe_width */
+
+ if (ev >= 13) {
+ *p += 8; /* skip target_max_bytes */
+ *p += 8; /* skip target_max_objects */
+ *p += 4; /* skip cache_target_dirty_ratio_micro */
+ *p += 4; /* skip cache_target_full_ratio_micro */
+ *p += 4; /* skip cache_min_flush_age */
+ *p += 4; /* skip cache_min_evict_age */
+ }
+
+ if (ev >= 14) {
+ /* skip erasure_code_profile */
+ len = ceph_decode_32(p);
+ *p += len;
+ }
+
+ if (ev >= 15)
+ pi->last_force_request_resend = ceph_decode_32(p);
+ else
+ pi->last_force_request_resend = 0;
+
/* ignore the rest */
*p = pool_end;
@@ -660,6 +707,23 @@ bad:
/*
* osd map
*/
+struct ceph_osdmap *ceph_osdmap_alloc(void)
+{
+ struct ceph_osdmap *map;
+
+ map = kzalloc(sizeof(*map), GFP_NOIO);
+ if (!map)
+ return NULL;
+
+ map->pg_pools = RB_ROOT;
+ map->pool_max = -1;
+ map->pg_temp = RB_ROOT;
+ map->primary_temp = RB_ROOT;
+ mutex_init(&map->crush_scratch_mutex);
+
+ return map;
+}
+
void ceph_osdmap_destroy(struct ceph_osdmap *map)
{
dout("osdmap_destroy %p\n", map);
@@ -1183,14 +1247,10 @@ struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end)
struct ceph_osdmap *map;
int ret;
- map = kzalloc(sizeof(*map), GFP_NOFS);
+ map = ceph_osdmap_alloc();
if (!map)
return ERR_PTR(-ENOMEM);
- map->pg_temp = RB_ROOT;
- map->primary_temp = RB_ROOT;
- mutex_init(&map->crush_scratch_mutex);
-
ret = osdmap_decode(p, end, map);
if (ret) {
ceph_osdmap_destroy(map);
@@ -1201,11 +1261,119 @@ struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end)
}
/*
+ * Encoding order is (new_up_client, new_state, new_weight). Need to
+ * apply in the (new_weight, new_state, new_up_client) order, because
+ * an incremental map may look like e.g.
+ *
+ * new_up_client: { osd=6, addr=... } # set osd_state and addr
+ * new_state: { osd=6, xorstate=EXISTS } # clear osd_state
+ */
+static int decode_new_up_state_weight(void **p, void *end,
+ struct ceph_osdmap *map)
+{
+ void *new_up_client;
+ void *new_state;
+ void *new_weight_end;
+ u32 len;
+
+ new_up_client = *p;
+ ceph_decode_32_safe(p, end, len, e_inval);
+ len *= sizeof(u32) + sizeof(struct ceph_entity_addr);
+ ceph_decode_need(p, end, len, e_inval);
+ *p += len;
+
+ new_state = *p;
+ ceph_decode_32_safe(p, end, len, e_inval);
+ len *= sizeof(u32) + sizeof(u8);
+ ceph_decode_need(p, end, len, e_inval);
+ *p += len;
+
+ /* new_weight */
+ ceph_decode_32_safe(p, end, len, e_inval);
+ while (len--) {
+ s32 osd;
+ u32 w;
+
+ ceph_decode_need(p, end, 2*sizeof(u32), e_inval);
+ osd = ceph_decode_32(p);
+ w = ceph_decode_32(p);
+ BUG_ON(osd >= map->max_osd);
+ pr_info("osd%d weight 0x%x %s\n", osd, w,
+ w == CEPH_OSD_IN ? "(in)" :
+ (w == CEPH_OSD_OUT ? "(out)" : ""));
+ map->osd_weight[osd] = w;
+
+ /*
+ * If we are marking in, set the EXISTS, and clear the
+ * AUTOOUT and NEW bits.
+ */
+ if (w) {
+ map->osd_state[osd] |= CEPH_OSD_EXISTS;
+ map->osd_state[osd] &= ~(CEPH_OSD_AUTOOUT |
+ CEPH_OSD_NEW);
+ }
+ }
+ new_weight_end = *p;
+
+ /* new_state (up/down) */
+ *p = new_state;
+ len = ceph_decode_32(p);
+ while (len--) {
+ s32 osd;
+ u8 xorstate;
+ int ret;
+
+ osd = ceph_decode_32(p);
+ xorstate = ceph_decode_8(p);
+ if (xorstate == 0)
+ xorstate = CEPH_OSD_UP;
+ BUG_ON(osd >= map->max_osd);
+ if ((map->osd_state[osd] & CEPH_OSD_UP) &&
+ (xorstate & CEPH_OSD_UP))
+ pr_info("osd%d down\n", osd);
+ if ((map->osd_state[osd] & CEPH_OSD_EXISTS) &&
+ (xorstate & CEPH_OSD_EXISTS)) {
+ pr_info("osd%d does not exist\n", osd);
+ map->osd_weight[osd] = CEPH_OSD_IN;
+ ret = set_primary_affinity(map, osd,
+ CEPH_OSD_DEFAULT_PRIMARY_AFFINITY);
+ if (ret)
+ return ret;
+ memset(map->osd_addr + osd, 0, sizeof(*map->osd_addr));
+ map->osd_state[osd] = 0;
+ } else {
+ map->osd_state[osd] ^= xorstate;
+ }
+ }
+
+ /* new_up_client */
+ *p = new_up_client;
+ len = ceph_decode_32(p);
+ while (len--) {
+ s32 osd;
+ struct ceph_entity_addr addr;
+
+ osd = ceph_decode_32(p);
+ ceph_decode_copy(p, &addr, sizeof(addr));
+ ceph_decode_addr(&addr);
+ BUG_ON(osd >= map->max_osd);
+ pr_info("osd%d up\n", osd);
+ map->osd_state[osd] |= CEPH_OSD_EXISTS | CEPH_OSD_UP;
+ map->osd_addr[osd] = addr;
+ }
+
+ *p = new_weight_end;
+ return 0;
+
+e_inval:
+ return -EINVAL;
+}
+
+/*
* decode and apply an incremental map update.
*/
struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
- struct ceph_osdmap *map,
- struct ceph_messenger *msgr)
+ struct ceph_osdmap *map)
{
struct crush_map *newcrush = NULL;
struct ceph_fsid fsid;
@@ -1299,49 +1467,10 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
__remove_pg_pool(&map->pg_pools, pi);
}
- /* new_up */
- ceph_decode_32_safe(p, end, len, e_inval);
- while (len--) {
- u32 osd;
- struct ceph_entity_addr addr;
- ceph_decode_32_safe(p, end, osd, e_inval);
- ceph_decode_copy_safe(p, end, &addr, sizeof(addr), e_inval);
- ceph_decode_addr(&addr);
- pr_info("osd%d up\n", osd);
- BUG_ON(osd >= map->max_osd);
- map->osd_state[osd] |= CEPH_OSD_UP | CEPH_OSD_EXISTS;
- map->osd_addr[osd] = addr;
- }
-
- /* new_state */
- ceph_decode_32_safe(p, end, len, e_inval);
- while (len--) {
- u32 osd;
- u8 xorstate;
- ceph_decode_32_safe(p, end, osd, e_inval);
- xorstate = **(u8 **)p;
- (*p)++; /* clean flag */
- if (xorstate == 0)
- xorstate = CEPH_OSD_UP;
- if (xorstate & CEPH_OSD_UP)
- pr_info("osd%d down\n", osd);
- if (osd < map->max_osd)
- map->osd_state[osd] ^= xorstate;
- }
-
- /* new_weight */
- ceph_decode_32_safe(p, end, len, e_inval);
- while (len--) {
- u32 osd, off;
- ceph_decode_need(p, end, sizeof(u32)*2, e_inval);
- osd = ceph_decode_32(p);
- off = ceph_decode_32(p);
- pr_info("osd%d weight 0x%x %s\n", osd, off,
- off == CEPH_OSD_IN ? "(in)" :
- (off == CEPH_OSD_OUT ? "(out)" : ""));
- if (osd < map->max_osd)
- map->osd_weight[osd] = off;
- }
+ /* new_up_client, new_state, new_weight */
+ err = decode_new_up_state_weight(p, end, map);
+ if (err)
+ goto bad;
/* new_pg_temp */
err = decode_new_pg_temp(p, end, map);
@@ -1381,8 +1510,252 @@ bad:
return ERR_PTR(err);
}
+void ceph_oid_copy(struct ceph_object_id *dest,
+ const struct ceph_object_id *src)
+{
+ WARN_ON(!ceph_oid_empty(dest));
+
+ if (src->name != src->inline_name) {
+ /* very rare, see ceph_object_id definition */
+ dest->name = kmalloc(src->name_len + 1,
+ GFP_NOIO | __GFP_NOFAIL);
+ }
+ memcpy(dest->name, src->name, src->name_len + 1);
+ dest->name_len = src->name_len;
+}
+EXPORT_SYMBOL(ceph_oid_copy);
+static __printf(2, 0)
+int oid_printf_vargs(struct ceph_object_id *oid, const char *fmt, va_list ap)
+{
+ int len;
+
+ WARN_ON(!ceph_oid_empty(oid));
+
+ len = vsnprintf(oid->inline_name, sizeof(oid->inline_name), fmt, ap);
+ if (len >= sizeof(oid->inline_name))
+ return len;
+
+ oid->name_len = len;
+ return 0;
+}
+
+/*
+ * If oid doesn't fit into inline buffer, BUG.
+ */
+void ceph_oid_printf(struct ceph_object_id *oid, const char *fmt, ...)
+{
+ va_list ap;
+
+ va_start(ap, fmt);
+ BUG_ON(oid_printf_vargs(oid, fmt, ap));
+ va_end(ap);
+}
+EXPORT_SYMBOL(ceph_oid_printf);
+
+static __printf(3, 0)
+int oid_aprintf_vargs(struct ceph_object_id *oid, gfp_t gfp,
+ const char *fmt, va_list ap)
+{
+ va_list aq;
+ int len;
+
+ va_copy(aq, ap);
+ len = oid_printf_vargs(oid, fmt, aq);
+ va_end(aq);
+
+ if (len) {
+ char *external_name;
+
+ external_name = kmalloc(len + 1, gfp);
+ if (!external_name)
+ return -ENOMEM;
+
+ oid->name = external_name;
+ WARN_ON(vsnprintf(oid->name, len + 1, fmt, ap) != len);
+ oid->name_len = len;
+ }
+
+ return 0;
+}
+
+/*
+ * If oid doesn't fit into inline buffer, allocate.
+ */
+int ceph_oid_aprintf(struct ceph_object_id *oid, gfp_t gfp,
+ const char *fmt, ...)
+{
+ va_list ap;
+ int ret;
+
+ va_start(ap, fmt);
+ ret = oid_aprintf_vargs(oid, gfp, fmt, ap);
+ va_end(ap);
+
+ return ret;
+}
+EXPORT_SYMBOL(ceph_oid_aprintf);
+
+void ceph_oid_destroy(struct ceph_object_id *oid)
+{
+ if (oid->name != oid->inline_name)
+ kfree(oid->name);
+}
+EXPORT_SYMBOL(ceph_oid_destroy);
+
+/*
+ * osds only
+ */
+static bool __osds_equal(const struct ceph_osds *lhs,
+ const struct ceph_osds *rhs)
+{
+ if (lhs->size == rhs->size &&
+ !memcmp(lhs->osds, rhs->osds, rhs->size * sizeof(rhs->osds[0])))
+ return true;
+
+ return false;
+}
+
+/*
+ * osds + primary
+ */
+static bool osds_equal(const struct ceph_osds *lhs,
+ const struct ceph_osds *rhs)
+{
+ if (__osds_equal(lhs, rhs) &&
+ lhs->primary == rhs->primary)
+ return true;
+
+ return false;
+}
+
+static bool osds_valid(const struct ceph_osds *set)
+{
+ /* non-empty set */
+ if (set->size > 0 && set->primary >= 0)
+ return true;
+
+ /* empty can_shift_osds set */
+ if (!set->size && set->primary == -1)
+ return true;
+
+ /* empty !can_shift_osds set - all NONE */
+ if (set->size > 0 && set->primary == -1) {
+ int i;
+
+ for (i = 0; i < set->size; i++) {
+ if (set->osds[i] != CRUSH_ITEM_NONE)
+ break;
+ }
+ if (i == set->size)
+ return true;
+ }
+
+ return false;
+}
+
+void ceph_osds_copy(struct ceph_osds *dest, const struct ceph_osds *src)
+{
+ memcpy(dest->osds, src->osds, src->size * sizeof(src->osds[0]));
+ dest->size = src->size;
+ dest->primary = src->primary;
+}
+
+static bool is_split(const struct ceph_pg *pgid,
+ u32 old_pg_num,
+ u32 new_pg_num)
+{
+ int old_bits = calc_bits_of(old_pg_num);
+ int old_mask = (1 << old_bits) - 1;
+ int n;
+
+ WARN_ON(pgid->seed >= old_pg_num);
+ if (new_pg_num <= old_pg_num)
+ return false;
+
+ for (n = 1; ; n++) {
+ int next_bit = n << (old_bits - 1);
+ u32 s = next_bit | pgid->seed;
+
+ if (s < old_pg_num || s == pgid->seed)
+ continue;
+ if (s >= new_pg_num)
+ break;
+
+ s = ceph_stable_mod(s, old_pg_num, old_mask);
+ if (s == pgid->seed)
+ return true;
+ }
+
+ return false;
+}
+
+bool ceph_is_new_interval(const struct ceph_osds *old_acting,
+ const struct ceph_osds *new_acting,
+ const struct ceph_osds *old_up,
+ const struct ceph_osds *new_up,
+ int old_size,
+ int new_size,
+ int old_min_size,
+ int new_min_size,
+ u32 old_pg_num,
+ u32 new_pg_num,
+ bool old_sort_bitwise,
+ bool new_sort_bitwise,
+ const struct ceph_pg *pgid)
+{
+ return !osds_equal(old_acting, new_acting) ||
+ !osds_equal(old_up, new_up) ||
+ old_size != new_size ||
+ old_min_size != new_min_size ||
+ is_split(pgid, old_pg_num, new_pg_num) ||
+ old_sort_bitwise != new_sort_bitwise;
+}
+
+static int calc_pg_rank(int osd, const struct ceph_osds *acting)
+{
+ int i;
+
+ for (i = 0; i < acting->size; i++) {
+ if (acting->osds[i] == osd)
+ return i;
+ }
+
+ return -1;
+}
+
+static bool primary_changed(const struct ceph_osds *old_acting,
+ const struct ceph_osds *new_acting)
+{
+ if (!old_acting->size && !new_acting->size)
+ return false; /* both still empty */
+
+ if (!old_acting->size ^ !new_acting->size)
+ return true; /* was empty, now not, or vice versa */
+
+ if (old_acting->primary != new_acting->primary)
+ return true; /* primary changed */
+
+ if (calc_pg_rank(old_acting->primary, old_acting) !=
+ calc_pg_rank(new_acting->primary, new_acting))
+ return true;
+
+ return false; /* same primary (tho replicas may have changed) */
+}
+
+bool ceph_osds_changed(const struct ceph_osds *old_acting,
+ const struct ceph_osds *new_acting,
+ bool any_change)
+{
+ if (primary_changed(old_acting, new_acting))
+ return true;
+
+ if (any_change && !__osds_equal(old_acting, new_acting))
+ return true;
+
+ return false;
+}
/*
* calculate file layout from given offset, length.
@@ -1455,30 +1828,71 @@ invalid:
EXPORT_SYMBOL(ceph_calc_file_object_mapping);
/*
- * Calculate mapping of a (oloc, oid) pair to a PG. Should only be
- * called with target's (oloc, oid), since tiering isn't taken into
- * account.
+ * Map an object into a PG.
+ *
+ * Should only be called with target_oid and target_oloc (as opposed to
+ * base_oid and base_oloc), since tiering isn't taken into account.
*/
-int ceph_oloc_oid_to_pg(struct ceph_osdmap *osdmap,
- struct ceph_object_locator *oloc,
- struct ceph_object_id *oid,
- struct ceph_pg *pg_out)
+int ceph_object_locator_to_pg(struct ceph_osdmap *osdmap,
+ struct ceph_object_id *oid,
+ struct ceph_object_locator *oloc,
+ struct ceph_pg *raw_pgid)
{
struct ceph_pg_pool_info *pi;
- pi = __lookup_pg_pool(&osdmap->pg_pools, oloc->pool);
+ pi = ceph_pg_pool_by_id(osdmap, oloc->pool);
if (!pi)
- return -EIO;
+ return -ENOENT;
- pg_out->pool = oloc->pool;
- pg_out->seed = ceph_str_hash(pi->object_hash, oid->name,
- oid->name_len);
+ raw_pgid->pool = oloc->pool;
+ raw_pgid->seed = ceph_str_hash(pi->object_hash, oid->name,
+ oid->name_len);
- dout("%s '%.*s' pgid %llu.%x\n", __func__, oid->name_len, oid->name,
- pg_out->pool, pg_out->seed);
+ dout("%s %s -> raw_pgid %llu.%x\n", __func__, oid->name,
+ raw_pgid->pool, raw_pgid->seed);
return 0;
}
-EXPORT_SYMBOL(ceph_oloc_oid_to_pg);
+EXPORT_SYMBOL(ceph_object_locator_to_pg);
+
+/*
+ * Map a raw PG (full precision ps) into an actual PG.
+ */
+static void raw_pg_to_pg(struct ceph_pg_pool_info *pi,
+ const struct ceph_pg *raw_pgid,
+ struct ceph_pg *pgid)
+{
+ pgid->pool = raw_pgid->pool;
+ pgid->seed = ceph_stable_mod(raw_pgid->seed, pi->pg_num,
+ pi->pg_num_mask);
+}
+
+/*
+ * Map a raw PG (full precision ps) into a placement ps (placement
+ * seed). Include pool id in that value so that different pools don't
+ * use the same seeds.
+ */
+static u32 raw_pg_to_pps(struct ceph_pg_pool_info *pi,
+ const struct ceph_pg *raw_pgid)
+{
+ if (pi->flags & CEPH_POOL_FLAG_HASHPSPOOL) {
+ /* hash pool id and seed so that pool PGs do not overlap */
+ return crush_hash32_2(CRUSH_HASH_RJENKINS1,
+ ceph_stable_mod(raw_pgid->seed,
+ pi->pgp_num,
+ pi->pgp_num_mask),
+ raw_pgid->pool);
+ } else {
+ /*
+ * legacy behavior: add ps and pool together. this is
+ * not a great approach because the PGs from each pool
+ * will overlap on top of each other: 0.5 == 1.4 ==
+ * 2.3 == ...
+ */
+ return ceph_stable_mod(raw_pgid->seed, pi->pgp_num,
+ pi->pgp_num_mask) +
+ (unsigned)raw_pgid->pool;
+ }
+}
static int do_crush(struct ceph_osdmap *map, int ruleno, int x,
int *result, int result_max,
@@ -1497,84 +1911,92 @@ static int do_crush(struct ceph_osdmap *map, int ruleno, int x,
}
/*
- * Calculate raw (crush) set for given pgid.
+ * Calculate raw set (CRUSH output) for given PG. The result may
+ * contain nonexistent OSDs. ->primary is undefined for a raw set.
*
- * Return raw set length, or error.
+ * Placement seed (CRUSH input) is returned through @ppps.
*/
-static int pg_to_raw_osds(struct ceph_osdmap *osdmap,
- struct ceph_pg_pool_info *pool,
- struct ceph_pg pgid, u32 pps, int *osds)
+static void pg_to_raw_osds(struct ceph_osdmap *osdmap,
+ struct ceph_pg_pool_info *pi,
+ const struct ceph_pg *raw_pgid,
+ struct ceph_osds *raw,
+ u32 *ppps)
{
+ u32 pps = raw_pg_to_pps(pi, raw_pgid);
int ruleno;
int len;
- /* crush */
- ruleno = crush_find_rule(osdmap->crush, pool->crush_ruleset,
- pool->type, pool->size);
+ ceph_osds_init(raw);
+ if (ppps)
+ *ppps = pps;
+
+ ruleno = crush_find_rule(osdmap->crush, pi->crush_ruleset, pi->type,
+ pi->size);
if (ruleno < 0) {
pr_err("no crush rule: pool %lld ruleset %d type %d size %d\n",
- pgid.pool, pool->crush_ruleset, pool->type,
- pool->size);
- return -ENOENT;
+ pi->id, pi->crush_ruleset, pi->type, pi->size);
+ return;
}
- len = do_crush(osdmap, ruleno, pps, osds,
- min_t(int, pool->size, CEPH_PG_MAX_SIZE),
+ len = do_crush(osdmap, ruleno, pps, raw->osds,
+ min_t(int, pi->size, ARRAY_SIZE(raw->osds)),
osdmap->osd_weight, osdmap->max_osd);
if (len < 0) {
pr_err("error %d from crush rule %d: pool %lld ruleset %d type %d size %d\n",
- len, ruleno, pgid.pool, pool->crush_ruleset,
- pool->type, pool->size);
- return len;
+ len, ruleno, pi->id, pi->crush_ruleset, pi->type,
+ pi->size);
+ return;
}
- return len;
+ raw->size = len;
}
/*
- * Given raw set, calculate up set and up primary.
+ * Given raw set, calculate up set and up primary. By definition of an
+ * up set, the result won't contain nonexistent or down OSDs.
*
- * Return up set length. *primary is set to up primary osd id, or -1
- * if up set is empty.
+ * This is done in-place - on return @set is the up set. If it's
+ * empty, ->primary will remain undefined.
*/
-static int raw_to_up_osds(struct ceph_osdmap *osdmap,
- struct ceph_pg_pool_info *pool,
- int *osds, int len, int *primary)
+static void raw_to_up_osds(struct ceph_osdmap *osdmap,
+ struct ceph_pg_pool_info *pi,
+ struct ceph_osds *set)
{
- int up_primary = -1;
int i;
- if (ceph_can_shift_osds(pool)) {
+ /* ->primary is undefined for a raw set */
+ BUG_ON(set->primary != -1);
+
+ if (ceph_can_shift_osds(pi)) {
int removed = 0;
- for (i = 0; i < len; i++) {
- if (ceph_osd_is_down(osdmap, osds[i])) {
+ /* shift left */
+ for (i = 0; i < set->size; i++) {
+ if (ceph_osd_is_down(osdmap, set->osds[i])) {
removed++;
continue;
}
if (removed)
- osds[i - removed] = osds[i];
+ set->osds[i - removed] = set->osds[i];
}
-
- len -= removed;
- if (len > 0)
- up_primary = osds[0];
+ set->size -= removed;
+ if (set->size > 0)
+ set->primary = set->osds[0];
} else {
- for (i = len - 1; i >= 0; i--) {
- if (ceph_osd_is_down(osdmap, osds[i]))
- osds[i] = CRUSH_ITEM_NONE;
+ /* set down/dne devices to NONE */
+ for (i = set->size - 1; i >= 0; i--) {
+ if (ceph_osd_is_down(osdmap, set->osds[i]))
+ set->osds[i] = CRUSH_ITEM_NONE;
else
- up_primary = osds[i];
+ set->primary = set->osds[i];
}
}
-
- *primary = up_primary;
- return len;
}
-static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps,
- struct ceph_pg_pool_info *pool,
- int *osds, int len, int *primary)
+static void apply_primary_affinity(struct ceph_osdmap *osdmap,
+ struct ceph_pg_pool_info *pi,
+ u32 pps,
+ struct ceph_osds *up)
{
int i;
int pos = -1;
@@ -1586,8 +2008,8 @@ static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps,
if (!osdmap->osd_primary_affinity)
return;
- for (i = 0; i < len; i++) {
- int osd = osds[i];
+ for (i = 0; i < up->size; i++) {
+ int osd = up->osds[i];
if (osd != CRUSH_ITEM_NONE &&
osdmap->osd_primary_affinity[osd] !=
@@ -1595,7 +2017,7 @@ static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps,
break;
}
}
- if (i == len)
+ if (i == up->size)
return;
/*
@@ -1603,8 +2025,8 @@ static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps,
* osd into the hash/rng so that a proportional fraction of an
* osd's pgs get rejected as primary.
*/
- for (i = 0; i < len; i++) {
- int osd = osds[i];
+ for (i = 0; i < up->size; i++) {
+ int osd = up->osds[i];
u32 aff;
if (osd == CRUSH_ITEM_NONE)
@@ -1629,135 +2051,110 @@ static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps,
if (pos < 0)
return;
- *primary = osds[pos];
+ up->primary = up->osds[pos];
- if (ceph_can_shift_osds(pool) && pos > 0) {
+ if (ceph_can_shift_osds(pi) && pos > 0) {
/* move the new primary to the front */
for (i = pos; i > 0; i--)
- osds[i] = osds[i - 1];
- osds[0] = *primary;
+ up->osds[i] = up->osds[i - 1];
+ up->osds[0] = up->primary;
}
}
/*
- * Given up set, apply pg_temp and primary_temp mappings.
+ * Get pg_temp and primary_temp mappings for given PG.
*
- * Return acting set length. *primary is set to acting primary osd id,
- * or -1 if acting set is empty.
+ * Note that a PG may have none, only pg_temp, only primary_temp or
+ * both pg_temp and primary_temp mappings. This means @temp isn't
+ * always a valid OSD set on return: in the "only primary_temp" case,
+ * @temp will have its ->primary >= 0 but ->size == 0.
*/
-static int apply_temps(struct ceph_osdmap *osdmap,
- struct ceph_pg_pool_info *pool, struct ceph_pg pgid,
- int *osds, int len, int *primary)
+static void get_temp_osds(struct ceph_osdmap *osdmap,
+ struct ceph_pg_pool_info *pi,
+ const struct ceph_pg *raw_pgid,
+ struct ceph_osds *temp)
{
+ struct ceph_pg pgid;
struct ceph_pg_mapping *pg;
- int temp_len;
- int temp_primary;
int i;
- /* raw_pg -> pg */
- pgid.seed = ceph_stable_mod(pgid.seed, pool->pg_num,
- pool->pg_num_mask);
+ raw_pg_to_pg(pi, raw_pgid, &pgid);
+ ceph_osds_init(temp);
/* pg_temp? */
pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid);
if (pg) {
- temp_len = 0;
- temp_primary = -1;
-
for (i = 0; i < pg->pg_temp.len; i++) {
if (ceph_osd_is_down(osdmap, pg->pg_temp.osds[i])) {
- if (ceph_can_shift_osds(pool))
+ if (ceph_can_shift_osds(pi))
continue;
- else
- osds[temp_len++] = CRUSH_ITEM_NONE;
+
+ temp->osds[temp->size++] = CRUSH_ITEM_NONE;
} else {
- osds[temp_len++] = pg->pg_temp.osds[i];
+ temp->osds[temp->size++] = pg->pg_temp.osds[i];
}
}
/* apply pg_temp's primary */
- for (i = 0; i < temp_len; i++) {
- if (osds[i] != CRUSH_ITEM_NONE) {
- temp_primary = osds[i];
+ for (i = 0; i < temp->size; i++) {
+ if (temp->osds[i] != CRUSH_ITEM_NONE) {
+ temp->primary = temp->osds[i];
break;
}
}
- } else {
- temp_len = len;
- temp_primary = *primary;
}
/* primary_temp? */
pg = __lookup_pg_mapping(&osdmap->primary_temp, pgid);
if (pg)
- temp_primary = pg->primary_temp.osd;
-
- *primary = temp_primary;
- return temp_len;
+ temp->primary = pg->primary_temp.osd;
}
/*
- * Calculate acting set for given pgid.
+ * Map a PG to its acting set as well as its up set.
*
- * Return acting set length, or error. *primary is set to acting
- * primary osd id, or -1 if acting set is empty or on error.
+ * Acting set is used for data mapping purposes, while up set can be
+ * recorded for detecting interval changes and deciding whether to
+ * resend a request.
*/
-int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
- int *osds, int *primary)
+void ceph_pg_to_up_acting_osds(struct ceph_osdmap *osdmap,
+ const struct ceph_pg *raw_pgid,
+ struct ceph_osds *up,
+ struct ceph_osds *acting)
{
- struct ceph_pg_pool_info *pool;
+ struct ceph_pg_pool_info *pi;
u32 pps;
- int len;
- pool = __lookup_pg_pool(&osdmap->pg_pools, pgid.pool);
- if (!pool) {
- *primary = -1;
- return -ENOENT;
+ pi = ceph_pg_pool_by_id(osdmap, raw_pgid->pool);
+ if (!pi) {
+ ceph_osds_init(up);
+ ceph_osds_init(acting);
+ goto out;
}
- if (pool->flags & CEPH_POOL_FLAG_HASHPSPOOL) {
- /* hash pool id and seed so that pool PGs do not overlap */
- pps = crush_hash32_2(CRUSH_HASH_RJENKINS1,
- ceph_stable_mod(pgid.seed, pool->pgp_num,
- pool->pgp_num_mask),
- pgid.pool);
- } else {
- /*
- * legacy behavior: add ps and pool together. this is
- * not a great approach because the PGs from each pool
- * will overlap on top of each other: 0.5 == 1.4 ==
- * 2.3 == ...
- */
- pps = ceph_stable_mod(pgid.seed, pool->pgp_num,
- pool->pgp_num_mask) +
- (unsigned)pgid.pool;
+ pg_to_raw_osds(osdmap, pi, raw_pgid, up, &pps);
+ raw_to_up_osds(osdmap, pi, up);
+ apply_primary_affinity(osdmap, pi, pps, up);
+ get_temp_osds(osdmap, pi, raw_pgid, acting);
+ if (!acting->size) {
+ memcpy(acting->osds, up->osds, up->size * sizeof(up->osds[0]));
+ acting->size = up->size;
+ if (acting->primary == -1)
+ acting->primary = up->primary;
}
-
- len = pg_to_raw_osds(osdmap, pool, pgid, pps, osds);
- if (len < 0) {
- *primary = -1;
- return len;
- }
-
- len = raw_to_up_osds(osdmap, pool, osds, len, primary);
-
- apply_primary_affinity(osdmap, pps, pool, osds, len, primary);
-
- len = apply_temps(osdmap, pool, pgid, osds, len, primary);
-
- return len;
+out:
+ WARN_ON(!osds_valid(up) || !osds_valid(acting));
}
/*
- * Return primary osd for given pgid, or -1 if none.
+ * Return acting primary for given PG, or -1 if none.
*/
-int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid)
+int ceph_pg_to_acting_primary(struct ceph_osdmap *osdmap,
+ const struct ceph_pg *raw_pgid)
{
- int osds[CEPH_PG_MAX_SIZE];
- int primary;
-
- ceph_calc_pg_acting(osdmap, pgid, osds, &primary);
+ struct ceph_osds up, acting;
- return primary;
+ ceph_pg_to_up_acting_osds(osdmap, raw_pgid, &up, &acting);
+ return acting.primary;
}
-EXPORT_SYMBOL(ceph_calc_pg_primary);
+EXPORT_SYMBOL(ceph_pg_to_acting_primary);
diff --git a/net/compat.c b/net/compat.c
index 5cfd26a00..1cd2ec046 100644
--- a/net/compat.c
+++ b/net/compat.c
@@ -309,8 +309,8 @@ void scm_detach_fds_compat(struct msghdr *kmsg, struct scm_cookie *scm)
__scm_destroy(scm);
}
-static int do_set_attach_filter(struct socket *sock, int level, int optname,
- char __user *optval, unsigned int optlen)
+/* allocate a 64-bit sock_fprog on the user stack for duration of syscall. */
+struct sock_fprog __user *get_compat_bpf_fprog(char __user *optval)
{
struct compat_sock_fprog __user *fprog32 = (struct compat_sock_fprog __user *)optval;
struct sock_fprog __user *kfprog = compat_alloc_user_space(sizeof(struct sock_fprog));
@@ -323,6 +323,19 @@ static int do_set_attach_filter(struct socket *sock, int level, int optname,
__get_user(ptr, &fprog32->filter) ||
__put_user(len, &kfprog->len) ||
__put_user(compat_ptr(ptr), &kfprog->filter))
+ return NULL;
+
+ return kfprog;
+}
+EXPORT_SYMBOL_GPL(get_compat_bpf_fprog);
+
+static int do_set_attach_filter(struct socket *sock, int level, int optname,
+ char __user *optval, unsigned int optlen)
+{
+ struct sock_fprog __user *kfprog;
+
+ kfprog = get_compat_bpf_fprog(optval);
+ if (!kfprog)
return -EFAULT;
return sock_setsockopt(sock, level, optname, (char __user *)kfprog,
@@ -354,7 +367,8 @@ static int do_set_sock_timeout(struct socket *sock, int level,
static int compat_sock_setsockopt(struct socket *sock, int level, int optname,
char __user *optval, unsigned int optlen)
{
- if (optname == SO_ATTACH_FILTER)
+ if (optname == SO_ATTACH_FILTER ||
+ optname == SO_ATTACH_REUSEPORT_CBPF)
return do_set_attach_filter(sock, level, optname,
optval, optlen);
if (optname == SO_RCVTIMEO || optname == SO_SNDTIMEO)
diff --git a/net/core/datagram.c b/net/core/datagram.c
index fa9dc6450..b7de71f8d 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -301,16 +301,19 @@ void skb_free_datagram(struct sock *sk, struct sk_buff *skb)
}
EXPORT_SYMBOL(skb_free_datagram);
-void skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb)
+void __skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb, int len)
{
bool slow;
if (likely(atomic_read(&skb->users) == 1))
smp_rmb();
- else if (likely(!atomic_dec_and_test(&skb->users)))
+ else if (likely(!atomic_dec_and_test(&skb->users))) {
+ sk_peek_offset_bwd(sk, len);
return;
+ }
slow = lock_sock_fast(sk);
+ sk_peek_offset_bwd(sk, len);
skb_orphan(skb);
sk_mem_reclaim_partial(sk);
unlock_sock_fast(sk, slow);
@@ -318,7 +321,7 @@ void skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb)
/* skb is now orphaned, can be freed outside of locked section */
__kfree_skb(skb);
}
-EXPORT_SYMBOL(skb_free_datagram_locked);
+EXPORT_SYMBOL(__skb_free_datagram_locked);
/**
* skb_kill_datagram - Free a datagram skbuff forcibly
diff --git a/net/core/dev.c b/net/core/dev.c
index 5c925ac50..904ff431d 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1741,7 +1741,7 @@ static inline void net_timestamp_set(struct sk_buff *skb)
__net_timestamp(SKB); \
} \
-bool is_skb_forwardable(struct net_device *dev, struct sk_buff *skb)
+bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb)
{
unsigned int len;
@@ -1850,7 +1850,7 @@ static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
* taps currently in use.
*/
-static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
+void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
{
struct packet_type *ptype;
struct sk_buff *skb2 = NULL;
@@ -1907,6 +1907,7 @@ out_unlock:
pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
rcu_read_unlock();
}
+EXPORT_SYMBOL_GPL(dev_queue_xmit_nit);
/**
* netif_setup_tc - Handle tc mappings on real_num_tx_queues change
@@ -2711,6 +2712,19 @@ struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
return ERR_PTR(err);
}
+ /* Only report GSO partial support if it will enable us to
+ * support segmentation on this frame without needing additional
+ * work.
+ */
+ if (features & NETIF_F_GSO_PARTIAL) {
+ netdev_features_t partial_features = NETIF_F_GSO_ROBUST;
+ struct net_device *dev = skb->dev;
+
+ partial_features |= dev->features & dev->gso_partial_features;
+ if (!skb_gso_ok(skb, features | partial_features))
+ features &= ~NETIF_F_GSO_PARTIAL;
+ }
+
BUILD_BUG_ON(SKB_SGO_CB_OFFSET +
sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb));
@@ -2825,14 +2839,45 @@ static netdev_features_t dflt_features_check(const struct sk_buff *skb,
return vlan_features_check(skb, features);
}
+static netdev_features_t gso_features_check(const struct sk_buff *skb,
+ struct net_device *dev,
+ netdev_features_t features)
+{
+ u16 gso_segs = skb_shinfo(skb)->gso_segs;
+
+ if (gso_segs > dev->gso_max_segs)
+ return features & ~NETIF_F_GSO_MASK;
+
+ /* Support for GSO partial features requires software
+ * intervention before we can actually process the packets
+ * so we need to strip support for any partial features now
+ * and we can pull them back in after we have partially
+ * segmented the frame.
+ */
+ if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL))
+ features &= ~dev->gso_partial_features;
+
+ /* Make sure to clear the IPv4 ID mangling feature if the
+ * IPv4 header has the potential to be fragmented.
+ */
+ if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) {
+ struct iphdr *iph = skb->encapsulation ?
+ inner_ip_hdr(skb) : ip_hdr(skb);
+
+ if (!(iph->frag_off & htons(IP_DF)))
+ features &= ~NETIF_F_TSO_MANGLEID;
+ }
+
+ return features;
+}
+
netdev_features_t netif_skb_features(struct sk_buff *skb)
{
struct net_device *dev = skb->dev;
netdev_features_t features = dev->features;
- u16 gso_segs = skb_shinfo(skb)->gso_segs;
- if (gso_segs > dev->gso_max_segs || gso_segs < dev->gso_min_segs)
- features &= ~NETIF_F_GSO_MASK;
+ if (skb_is_gso(skb))
+ features = gso_features_check(skb, dev, features);
/* If encapsulation offload request, verify we are testing
* hardware encapsulation features instead of standard
@@ -2915,9 +2960,6 @@ static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device
{
netdev_features_t features;
- if (skb->next)
- return skb;
-
features = netif_skb_features(skb);
skb = validate_xmit_vlan(skb, features);
if (unlikely(!skb))
@@ -2960,6 +3002,7 @@ static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device
out_kfree_skb:
kfree_skb(skb);
out_null:
+ atomic_long_inc(&dev->tx_dropped);
return NULL;
}
@@ -3143,12 +3186,12 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
case TC_ACT_SHOT:
qdisc_qstats_cpu_drop(cl->q);
*ret = NET_XMIT_DROP;
- goto drop;
+ kfree_skb(skb);
+ return NULL;
case TC_ACT_STOLEN:
case TC_ACT_QUEUED:
*ret = NET_XMIT_SUCCESS;
-drop:
- kfree_skb(skb);
+ consume_skb(skb);
return NULL;
case TC_ACT_REDIRECT:
/* No need to push/pop skb's mac_header here on egress! */
@@ -3349,7 +3392,7 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
skb = validate_xmit_skb(skb, dev);
if (!skb)
- goto drop;
+ goto out;
HARD_TX_LOCK(dev, txq, cpu);
@@ -3376,7 +3419,6 @@ recursion_alert:
}
rc = -ENETDOWN;
-drop:
rcu_read_unlock_bh();
atomic_long_inc(&dev->tx_dropped);
@@ -3428,6 +3470,7 @@ u32 rps_cpu_mask __read_mostly;
EXPORT_SYMBOL(rps_cpu_mask);
struct static_key rps_needed __read_mostly;
+EXPORT_SYMBOL(rps_needed);
static struct rps_dev_flow *
set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
@@ -3914,9 +3957,11 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
break;
case TC_ACT_SHOT:
qdisc_qstats_cpu_drop(cl->q);
+ kfree_skb(skb);
+ return NULL;
case TC_ACT_STOLEN:
case TC_ACT_QUEUED:
- kfree_skb(skb);
+ consume_skb(skb);
return NULL;
case TC_ACT_REDIRECT:
/* skb_mac_header check was done by cls/act_bpf, so
@@ -4440,6 +4485,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
NAPI_GRO_CB(skb)->free = 0;
NAPI_GRO_CB(skb)->encap_mark = 0;
NAPI_GRO_CB(skb)->is_fou = 0;
+ NAPI_GRO_CB(skb)->is_atomic = 1;
NAPI_GRO_CB(skb)->gro_remcsum_start = 0;
/* Setup for GRO checksum validation */
@@ -4664,6 +4710,8 @@ static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
if (unlikely(skb_gro_header_hard(skb, hlen))) {
eth = skb_gro_header_slow(skb, hlen, 0);
if (unlikely(!eth)) {
+ net_warn_ratelimited("%s: dropping impossible skb from %s\n",
+ __func__, napi->dev->name);
napi_reuse_skb(napi, skb);
return NULL;
}
@@ -4938,8 +4986,8 @@ bool sk_busy_loop(struct sock *sk, int nonblock)
netpoll_poll_unlock(have);
}
if (rc > 0)
- NET_ADD_STATS_BH(sock_net(sk),
- LINUX_MIB_BUSYPOLLRXPACKETS, rc);
+ __NET_ADD_STATS(sock_net(sk),
+ LINUX_MIB_BUSYPOLLRXPACKETS, rc);
local_bh_enable();
if (rc == LL_FLUSH_FAILED)
@@ -6676,6 +6724,10 @@ static netdev_features_t netdev_fix_features(struct net_device *dev,
features &= ~NETIF_F_TSO6;
}
+ /* TSO with IPv4 ID mangling requires IPv4 TSO be enabled */
+ if ((features & NETIF_F_TSO_MANGLEID) && !(features & NETIF_F_TSO))
+ features &= ~NETIF_F_TSO_MANGLEID;
+
/* TSO ECN requires that TSO is present as well. */
if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
features &= ~NETIF_F_TSO_ECN;
@@ -6704,6 +6756,14 @@ static netdev_features_t netdev_fix_features(struct net_device *dev,
}
}
+ /* GSO partial features require GSO partial be set */
+ if ((features & dev->gso_partial_features) &&
+ !(features & NETIF_F_GSO_PARTIAL)) {
+ netdev_dbg(dev,
+ "Dropping partially supported GSO features since no GSO partial.\n");
+ features &= ~dev->gso_partial_features;
+ }
+
#ifdef CONFIG_NET_RX_BUSY_POLL
if (dev->netdev_ops->ndo_busy_poll)
features |= NETIF_F_BUSY_POLL;
@@ -6974,9 +7034,22 @@ int register_netdevice(struct net_device *dev)
dev->features |= NETIF_F_SOFT_FEATURES;
dev->wanted_features = dev->features & dev->hw_features;
- if (!(dev->flags & IFF_LOOPBACK)) {
+ if (!(dev->flags & IFF_LOOPBACK))
dev->hw_features |= NETIF_F_NOCACHE_COPY;
- }
+
+ /* If IPv4 TCP segmentation offload is supported we should also
+ * allow the device to enable segmenting the frame with the option
+ * of ignoring a static IP ID value. This doesn't enable the
+ * feature itself but allows the user to enable it later.
+ */
+ if (dev->hw_features & NETIF_F_TSO)
+ dev->hw_features |= NETIF_F_TSO_MANGLEID;
+ if (dev->vlan_features & NETIF_F_TSO)
+ dev->vlan_features |= NETIF_F_TSO_MANGLEID;
+ if (dev->mpls_features & NETIF_F_TSO)
+ dev->mpls_features |= NETIF_F_TSO_MANGLEID;
+ if (dev->hw_enc_features & NETIF_F_TSO)
+ dev->hw_enc_features |= NETIF_F_TSO_MANGLEID;
/* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
*/
@@ -6984,7 +7057,7 @@ int register_netdevice(struct net_device *dev)
/* Make NETIF_F_SG inheritable to tunnel devices.
*/
- dev->hw_enc_features |= NETIF_F_SG;
+ dev->hw_enc_features |= NETIF_F_SG | NETIF_F_GSO_PARTIAL;
/* Make NETIF_F_SG inheritable to MPLS.
*/
@@ -7427,7 +7500,6 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
dev->gso_max_size = GSO_MAX_SIZE;
dev->gso_max_segs = GSO_MAX_SEGS;
- dev->gso_min_segs = 0;
INIT_LIST_HEAD(&dev->napi_list);
INIT_LIST_HEAD(&dev->unreg_list);
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 590fa561c..933e8d4d3 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -119,7 +119,171 @@ static struct devlink_port *devlink_port_get_from_info(struct devlink *devlink,
return devlink_port_get_from_attrs(devlink, info->attrs);
}
-#define DEVLINK_NL_FLAG_NEED_PORT BIT(0)
+struct devlink_sb {
+ struct list_head list;
+ unsigned int index;
+ u32 size;
+ u16 ingress_pools_count;
+ u16 egress_pools_count;
+ u16 ingress_tc_count;
+ u16 egress_tc_count;
+};
+
+static u16 devlink_sb_pool_count(struct devlink_sb *devlink_sb)
+{
+ return devlink_sb->ingress_pools_count + devlink_sb->egress_pools_count;
+}
+
+static struct devlink_sb *devlink_sb_get_by_index(struct devlink *devlink,
+ unsigned int sb_index)
+{
+ struct devlink_sb *devlink_sb;
+
+ list_for_each_entry(devlink_sb, &devlink->sb_list, list) {
+ if (devlink_sb->index == sb_index)
+ return devlink_sb;
+ }
+ return NULL;
+}
+
+static bool devlink_sb_index_exists(struct devlink *devlink,
+ unsigned int sb_index)
+{
+ return devlink_sb_get_by_index(devlink, sb_index);
+}
+
+static struct devlink_sb *devlink_sb_get_from_attrs(struct devlink *devlink,
+ struct nlattr **attrs)
+{
+ if (attrs[DEVLINK_ATTR_SB_INDEX]) {
+ u32 sb_index = nla_get_u32(attrs[DEVLINK_ATTR_SB_INDEX]);
+ struct devlink_sb *devlink_sb;
+
+ devlink_sb = devlink_sb_get_by_index(devlink, sb_index);
+ if (!devlink_sb)
+ return ERR_PTR(-ENODEV);
+ return devlink_sb;
+ }
+ return ERR_PTR(-EINVAL);
+}
+
+static struct devlink_sb *devlink_sb_get_from_info(struct devlink *devlink,
+ struct genl_info *info)
+{
+ return devlink_sb_get_from_attrs(devlink, info->attrs);
+}
+
+static int devlink_sb_pool_index_get_from_attrs(struct devlink_sb *devlink_sb,
+ struct nlattr **attrs,
+ u16 *p_pool_index)
+{
+ u16 val;
+
+ if (!attrs[DEVLINK_ATTR_SB_POOL_INDEX])
+ return -EINVAL;
+
+ val = nla_get_u16(attrs[DEVLINK_ATTR_SB_POOL_INDEX]);
+ if (val >= devlink_sb_pool_count(devlink_sb))
+ return -EINVAL;
+ *p_pool_index = val;
+ return 0;
+}
+
+static int devlink_sb_pool_index_get_from_info(struct devlink_sb *devlink_sb,
+ struct genl_info *info,
+ u16 *p_pool_index)
+{
+ return devlink_sb_pool_index_get_from_attrs(devlink_sb, info->attrs,
+ p_pool_index);
+}
+
+static int
+devlink_sb_pool_type_get_from_attrs(struct nlattr **attrs,
+ enum devlink_sb_pool_type *p_pool_type)
+{
+ u8 val;
+
+ if (!attrs[DEVLINK_ATTR_SB_POOL_TYPE])
+ return -EINVAL;
+
+ val = nla_get_u8(attrs[DEVLINK_ATTR_SB_POOL_TYPE]);
+ if (val != DEVLINK_SB_POOL_TYPE_INGRESS &&
+ val != DEVLINK_SB_POOL_TYPE_EGRESS)
+ return -EINVAL;
+ *p_pool_type = val;
+ return 0;
+}
+
+static int
+devlink_sb_pool_type_get_from_info(struct genl_info *info,
+ enum devlink_sb_pool_type *p_pool_type)
+{
+ return devlink_sb_pool_type_get_from_attrs(info->attrs, p_pool_type);
+}
+
+static int
+devlink_sb_th_type_get_from_attrs(struct nlattr **attrs,
+ enum devlink_sb_threshold_type *p_th_type)
+{
+ u8 val;
+
+ if (!attrs[DEVLINK_ATTR_SB_POOL_THRESHOLD_TYPE])
+ return -EINVAL;
+
+ val = nla_get_u8(attrs[DEVLINK_ATTR_SB_POOL_THRESHOLD_TYPE]);
+ if (val != DEVLINK_SB_THRESHOLD_TYPE_STATIC &&
+ val != DEVLINK_SB_THRESHOLD_TYPE_DYNAMIC)
+ return -EINVAL;
+ *p_th_type = val;
+ return 0;
+}
+
+static int
+devlink_sb_th_type_get_from_info(struct genl_info *info,
+ enum devlink_sb_threshold_type *p_th_type)
+{
+ return devlink_sb_th_type_get_from_attrs(info->attrs, p_th_type);
+}
+
+static int
+devlink_sb_tc_index_get_from_attrs(struct devlink_sb *devlink_sb,
+ struct nlattr **attrs,
+ enum devlink_sb_pool_type pool_type,
+ u16 *p_tc_index)
+{
+ u16 val;
+
+ if (!attrs[DEVLINK_ATTR_SB_TC_INDEX])
+ return -EINVAL;
+
+ val = nla_get_u16(attrs[DEVLINK_ATTR_SB_TC_INDEX]);
+ if (pool_type == DEVLINK_SB_POOL_TYPE_INGRESS &&
+ val >= devlink_sb->ingress_tc_count)
+ return -EINVAL;
+ if (pool_type == DEVLINK_SB_POOL_TYPE_EGRESS &&
+ val >= devlink_sb->egress_tc_count)
+ return -EINVAL;
+ *p_tc_index = val;
+ return 0;
+}
+
+static int
+devlink_sb_tc_index_get_from_info(struct devlink_sb *devlink_sb,
+ struct genl_info *info,
+ enum devlink_sb_pool_type pool_type,
+ u16 *p_tc_index)
+{
+ return devlink_sb_tc_index_get_from_attrs(devlink_sb, info->attrs,
+ pool_type, p_tc_index);
+}
+
+#define DEVLINK_NL_FLAG_NEED_DEVLINK BIT(0)
+#define DEVLINK_NL_FLAG_NEED_PORT BIT(1)
+#define DEVLINK_NL_FLAG_NEED_SB BIT(2)
+#define DEVLINK_NL_FLAG_LOCK_PORTS BIT(3)
+ /* port is not needed but we need to ensure they don't
+ * change in the middle of command
+ */
static int devlink_nl_pre_doit(const struct genl_ops *ops,
struct sk_buff *skb, struct genl_info *info)
@@ -132,8 +296,9 @@ static int devlink_nl_pre_doit(const struct genl_ops *ops,
mutex_unlock(&devlink_mutex);
return PTR_ERR(devlink);
}
- info->user_ptr[0] = devlink;
- if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_PORT) {
+ if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_DEVLINK) {
+ info->user_ptr[0] = devlink;
+ } else if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_PORT) {
struct devlink_port *devlink_port;
mutex_lock(&devlink_port_mutex);
@@ -143,7 +308,22 @@ static int devlink_nl_pre_doit(const struct genl_ops *ops,
mutex_unlock(&devlink_mutex);
return PTR_ERR(devlink_port);
}
- info->user_ptr[1] = devlink_port;
+ info->user_ptr[0] = devlink_port;
+ }
+ if (ops->internal_flags & DEVLINK_NL_FLAG_LOCK_PORTS) {
+ mutex_lock(&devlink_port_mutex);
+ }
+ if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_SB) {
+ struct devlink_sb *devlink_sb;
+
+ devlink_sb = devlink_sb_get_from_info(devlink, info);
+ if (IS_ERR(devlink_sb)) {
+ if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_PORT)
+ mutex_unlock(&devlink_port_mutex);
+ mutex_unlock(&devlink_mutex);
+ return PTR_ERR(devlink_sb);
+ }
+ info->user_ptr[1] = devlink_sb;
}
return 0;
}
@@ -151,7 +331,8 @@ static int devlink_nl_pre_doit(const struct genl_ops *ops,
static void devlink_nl_post_doit(const struct genl_ops *ops,
struct sk_buff *skb, struct genl_info *info)
{
- if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_PORT)
+ if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_PORT ||
+ ops->internal_flags & DEVLINK_NL_FLAG_LOCK_PORTS)
mutex_unlock(&devlink_port_mutex);
mutex_unlock(&devlink_mutex);
}
@@ -356,8 +537,8 @@ out:
static int devlink_nl_cmd_port_get_doit(struct sk_buff *skb,
struct genl_info *info)
{
- struct devlink *devlink = info->user_ptr[0];
- struct devlink_port *devlink_port = info->user_ptr[1];
+ struct devlink_port *devlink_port = info->user_ptr[0];
+ struct devlink *devlink = devlink_port->devlink;
struct sk_buff *msg;
int err;
@@ -436,8 +617,8 @@ static int devlink_port_type_set(struct devlink *devlink,
static int devlink_nl_cmd_port_set_doit(struct sk_buff *skb,
struct genl_info *info)
{
- struct devlink *devlink = info->user_ptr[0];
- struct devlink_port *devlink_port = info->user_ptr[1];
+ struct devlink_port *devlink_port = info->user_ptr[0];
+ struct devlink *devlink = devlink_port->devlink;
int err;
if (info->attrs[DEVLINK_ATTR_PORT_TYPE]) {
@@ -497,12 +678,735 @@ static int devlink_nl_cmd_port_unsplit_doit(struct sk_buff *skb,
return devlink_port_unsplit(devlink, port_index);
}
+static int devlink_nl_sb_fill(struct sk_buff *msg, struct devlink *devlink,
+ struct devlink_sb *devlink_sb,
+ enum devlink_command cmd, u32 portid,
+ u32 seq, int flags)
+{
+ void *hdr;
+
+ hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ if (devlink_nl_put_handle(msg, devlink))
+ goto nla_put_failure;
+ if (nla_put_u32(msg, DEVLINK_ATTR_SB_INDEX, devlink_sb->index))
+ goto nla_put_failure;
+ if (nla_put_u32(msg, DEVLINK_ATTR_SB_SIZE, devlink_sb->size))
+ goto nla_put_failure;
+ if (nla_put_u16(msg, DEVLINK_ATTR_SB_INGRESS_POOL_COUNT,
+ devlink_sb->ingress_pools_count))
+ goto nla_put_failure;
+ if (nla_put_u16(msg, DEVLINK_ATTR_SB_EGRESS_POOL_COUNT,
+ devlink_sb->egress_pools_count))
+ goto nla_put_failure;
+ if (nla_put_u16(msg, DEVLINK_ATTR_SB_INGRESS_TC_COUNT,
+ devlink_sb->ingress_tc_count))
+ goto nla_put_failure;
+ if (nla_put_u16(msg, DEVLINK_ATTR_SB_EGRESS_TC_COUNT,
+ devlink_sb->egress_tc_count))
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+}
+
+static int devlink_nl_cmd_sb_get_doit(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct devlink *devlink = info->user_ptr[0];
+ struct devlink_sb *devlink_sb = info->user_ptr[1];
+ struct sk_buff *msg;
+ int err;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ err = devlink_nl_sb_fill(msg, devlink, devlink_sb,
+ DEVLINK_CMD_SB_NEW,
+ info->snd_portid, info->snd_seq, 0);
+ if (err) {
+ nlmsg_free(msg);
+ return err;
+ }
+
+ return genlmsg_reply(msg, info);
+}
+
+static int devlink_nl_cmd_sb_get_dumpit(struct sk_buff *msg,
+ struct netlink_callback *cb)
+{
+ struct devlink *devlink;
+ struct devlink_sb *devlink_sb;
+ int start = cb->args[0];
+ int idx = 0;
+ int err;
+
+ mutex_lock(&devlink_mutex);
+ list_for_each_entry(devlink, &devlink_list, list) {
+ if (!net_eq(devlink_net(devlink), sock_net(msg->sk)))
+ continue;
+ list_for_each_entry(devlink_sb, &devlink->sb_list, list) {
+ if (idx < start) {
+ idx++;
+ continue;
+ }
+ err = devlink_nl_sb_fill(msg, devlink, devlink_sb,
+ DEVLINK_CMD_SB_NEW,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ NLM_F_MULTI);
+ if (err)
+ goto out;
+ idx++;
+ }
+ }
+out:
+ mutex_unlock(&devlink_mutex);
+
+ cb->args[0] = idx;
+ return msg->len;
+}
+
+static int devlink_nl_sb_pool_fill(struct sk_buff *msg, struct devlink *devlink,
+ struct devlink_sb *devlink_sb,
+ u16 pool_index, enum devlink_command cmd,
+ u32 portid, u32 seq, int flags)
+{
+ struct devlink_sb_pool_info pool_info;
+ void *hdr;
+ int err;
+
+ err = devlink->ops->sb_pool_get(devlink, devlink_sb->index,
+ pool_index, &pool_info);
+ if (err)
+ return err;
+
+ hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ if (devlink_nl_put_handle(msg, devlink))
+ goto nla_put_failure;
+ if (nla_put_u32(msg, DEVLINK_ATTR_SB_INDEX, devlink_sb->index))
+ goto nla_put_failure;
+ if (nla_put_u16(msg, DEVLINK_ATTR_SB_POOL_INDEX, pool_index))
+ goto nla_put_failure;
+ if (nla_put_u8(msg, DEVLINK_ATTR_SB_POOL_TYPE, pool_info.pool_type))
+ goto nla_put_failure;
+ if (nla_put_u32(msg, DEVLINK_ATTR_SB_POOL_SIZE, pool_info.size))
+ goto nla_put_failure;
+ if (nla_put_u8(msg, DEVLINK_ATTR_SB_POOL_THRESHOLD_TYPE,
+ pool_info.threshold_type))
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+}
+
+static int devlink_nl_cmd_sb_pool_get_doit(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct devlink *devlink = info->user_ptr[0];
+ struct devlink_sb *devlink_sb = info->user_ptr[1];
+ struct sk_buff *msg;
+ u16 pool_index;
+ int err;
+
+ err = devlink_sb_pool_index_get_from_info(devlink_sb, info,
+ &pool_index);
+ if (err)
+ return err;
+
+ if (!devlink->ops || !devlink->ops->sb_pool_get)
+ return -EOPNOTSUPP;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ err = devlink_nl_sb_pool_fill(msg, devlink, devlink_sb, pool_index,
+ DEVLINK_CMD_SB_POOL_NEW,
+ info->snd_portid, info->snd_seq, 0);
+ if (err) {
+ nlmsg_free(msg);
+ return err;
+ }
+
+ return genlmsg_reply(msg, info);
+}
+
+static int __sb_pool_get_dumpit(struct sk_buff *msg, int start, int *p_idx,
+ struct devlink *devlink,
+ struct devlink_sb *devlink_sb,
+ u32 portid, u32 seq)
+{
+ u16 pool_count = devlink_sb_pool_count(devlink_sb);
+ u16 pool_index;
+ int err;
+
+ for (pool_index = 0; pool_index < pool_count; pool_index++) {
+ if (*p_idx < start) {
+ (*p_idx)++;
+ continue;
+ }
+ err = devlink_nl_sb_pool_fill(msg, devlink,
+ devlink_sb,
+ pool_index,
+ DEVLINK_CMD_SB_POOL_NEW,
+ portid, seq, NLM_F_MULTI);
+ if (err)
+ return err;
+ (*p_idx)++;
+ }
+ return 0;
+}
+
+static int devlink_nl_cmd_sb_pool_get_dumpit(struct sk_buff *msg,
+ struct netlink_callback *cb)
+{
+ struct devlink *devlink;
+ struct devlink_sb *devlink_sb;
+ int start = cb->args[0];
+ int idx = 0;
+ int err;
+
+ mutex_lock(&devlink_mutex);
+ list_for_each_entry(devlink, &devlink_list, list) {
+ if (!net_eq(devlink_net(devlink), sock_net(msg->sk)) ||
+ !devlink->ops || !devlink->ops->sb_pool_get)
+ continue;
+ list_for_each_entry(devlink_sb, &devlink->sb_list, list) {
+ err = __sb_pool_get_dumpit(msg, start, &idx, devlink,
+ devlink_sb,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq);
+ if (err && err != -EOPNOTSUPP)
+ goto out;
+ }
+ }
+out:
+ mutex_unlock(&devlink_mutex);
+
+ cb->args[0] = idx;
+ return msg->len;
+}
+
+static int devlink_sb_pool_set(struct devlink *devlink, unsigned int sb_index,
+ u16 pool_index, u32 size,
+ enum devlink_sb_threshold_type threshold_type)
+
+{
+ const struct devlink_ops *ops = devlink->ops;
+
+ if (ops && ops->sb_pool_set)
+ return ops->sb_pool_set(devlink, sb_index, pool_index,
+ size, threshold_type);
+ return -EOPNOTSUPP;
+}
+
+static int devlink_nl_cmd_sb_pool_set_doit(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct devlink *devlink = info->user_ptr[0];
+ struct devlink_sb *devlink_sb = info->user_ptr[1];
+ enum devlink_sb_threshold_type threshold_type;
+ u16 pool_index;
+ u32 size;
+ int err;
+
+ err = devlink_sb_pool_index_get_from_info(devlink_sb, info,
+ &pool_index);
+ if (err)
+ return err;
+
+ err = devlink_sb_th_type_get_from_info(info, &threshold_type);
+ if (err)
+ return err;
+
+ if (!info->attrs[DEVLINK_ATTR_SB_POOL_SIZE])
+ return -EINVAL;
+
+ size = nla_get_u32(info->attrs[DEVLINK_ATTR_SB_POOL_SIZE]);
+ return devlink_sb_pool_set(devlink, devlink_sb->index,
+ pool_index, size, threshold_type);
+}
+
+static int devlink_nl_sb_port_pool_fill(struct sk_buff *msg,
+ struct devlink *devlink,
+ struct devlink_port *devlink_port,
+ struct devlink_sb *devlink_sb,
+ u16 pool_index,
+ enum devlink_command cmd,
+ u32 portid, u32 seq, int flags)
+{
+ const struct devlink_ops *ops = devlink->ops;
+ u32 threshold;
+ void *hdr;
+ int err;
+
+ err = ops->sb_port_pool_get(devlink_port, devlink_sb->index,
+ pool_index, &threshold);
+ if (err)
+ return err;
+
+ hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ if (devlink_nl_put_handle(msg, devlink))
+ goto nla_put_failure;
+ if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, devlink_port->index))
+ goto nla_put_failure;
+ if (nla_put_u32(msg, DEVLINK_ATTR_SB_INDEX, devlink_sb->index))
+ goto nla_put_failure;
+ if (nla_put_u16(msg, DEVLINK_ATTR_SB_POOL_INDEX, pool_index))
+ goto nla_put_failure;
+ if (nla_put_u32(msg, DEVLINK_ATTR_SB_THRESHOLD, threshold))
+ goto nla_put_failure;
+
+ if (ops->sb_occ_port_pool_get) {
+ u32 cur;
+ u32 max;
+
+ err = ops->sb_occ_port_pool_get(devlink_port, devlink_sb->index,
+ pool_index, &cur, &max);
+ if (err && err != -EOPNOTSUPP)
+ return err;
+ if (!err) {
+ if (nla_put_u32(msg, DEVLINK_ATTR_SB_OCC_CUR, cur))
+ goto nla_put_failure;
+ if (nla_put_u32(msg, DEVLINK_ATTR_SB_OCC_MAX, max))
+ goto nla_put_failure;
+ }
+ }
+
+ genlmsg_end(msg, hdr);
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+}
+
+static int devlink_nl_cmd_sb_port_pool_get_doit(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct devlink_port *devlink_port = info->user_ptr[0];
+ struct devlink *devlink = devlink_port->devlink;
+ struct devlink_sb *devlink_sb = info->user_ptr[1];
+ struct sk_buff *msg;
+ u16 pool_index;
+ int err;
+
+ err = devlink_sb_pool_index_get_from_info(devlink_sb, info,
+ &pool_index);
+ if (err)
+ return err;
+
+ if (!devlink->ops || !devlink->ops->sb_port_pool_get)
+ return -EOPNOTSUPP;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ err = devlink_nl_sb_port_pool_fill(msg, devlink, devlink_port,
+ devlink_sb, pool_index,
+ DEVLINK_CMD_SB_PORT_POOL_NEW,
+ info->snd_portid, info->snd_seq, 0);
+ if (err) {
+ nlmsg_free(msg);
+ return err;
+ }
+
+ return genlmsg_reply(msg, info);
+}
+
+static int __sb_port_pool_get_dumpit(struct sk_buff *msg, int start, int *p_idx,
+ struct devlink *devlink,
+ struct devlink_sb *devlink_sb,
+ u32 portid, u32 seq)
+{
+ struct devlink_port *devlink_port;
+ u16 pool_count = devlink_sb_pool_count(devlink_sb);
+ u16 pool_index;
+ int err;
+
+ list_for_each_entry(devlink_port, &devlink->port_list, list) {
+ for (pool_index = 0; pool_index < pool_count; pool_index++) {
+ if (*p_idx < start) {
+ (*p_idx)++;
+ continue;
+ }
+ err = devlink_nl_sb_port_pool_fill(msg, devlink,
+ devlink_port,
+ devlink_sb,
+ pool_index,
+ DEVLINK_CMD_SB_PORT_POOL_NEW,
+ portid, seq,
+ NLM_F_MULTI);
+ if (err)
+ return err;
+ (*p_idx)++;
+ }
+ }
+ return 0;
+}
+
+static int devlink_nl_cmd_sb_port_pool_get_dumpit(struct sk_buff *msg,
+ struct netlink_callback *cb)
+{
+ struct devlink *devlink;
+ struct devlink_sb *devlink_sb;
+ int start = cb->args[0];
+ int idx = 0;
+ int err;
+
+ mutex_lock(&devlink_mutex);
+ mutex_lock(&devlink_port_mutex);
+ list_for_each_entry(devlink, &devlink_list, list) {
+ if (!net_eq(devlink_net(devlink), sock_net(msg->sk)) ||
+ !devlink->ops || !devlink->ops->sb_port_pool_get)
+ continue;
+ list_for_each_entry(devlink_sb, &devlink->sb_list, list) {
+ err = __sb_port_pool_get_dumpit(msg, start, &idx,
+ devlink, devlink_sb,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq);
+ if (err && err != -EOPNOTSUPP)
+ goto out;
+ }
+ }
+out:
+ mutex_unlock(&devlink_port_mutex);
+ mutex_unlock(&devlink_mutex);
+
+ cb->args[0] = idx;
+ return msg->len;
+}
+
+static int devlink_sb_port_pool_set(struct devlink_port *devlink_port,
+ unsigned int sb_index, u16 pool_index,
+ u32 threshold)
+
+{
+ const struct devlink_ops *ops = devlink_port->devlink->ops;
+
+ if (ops && ops->sb_port_pool_set)
+ return ops->sb_port_pool_set(devlink_port, sb_index,
+ pool_index, threshold);
+ return -EOPNOTSUPP;
+}
+
+static int devlink_nl_cmd_sb_port_pool_set_doit(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct devlink_port *devlink_port = info->user_ptr[0];
+ struct devlink_sb *devlink_sb = info->user_ptr[1];
+ u16 pool_index;
+ u32 threshold;
+ int err;
+
+ err = devlink_sb_pool_index_get_from_info(devlink_sb, info,
+ &pool_index);
+ if (err)
+ return err;
+
+ if (!info->attrs[DEVLINK_ATTR_SB_THRESHOLD])
+ return -EINVAL;
+
+ threshold = nla_get_u32(info->attrs[DEVLINK_ATTR_SB_THRESHOLD]);
+ return devlink_sb_port_pool_set(devlink_port, devlink_sb->index,
+ pool_index, threshold);
+}
+
+static int
+devlink_nl_sb_tc_pool_bind_fill(struct sk_buff *msg, struct devlink *devlink,
+ struct devlink_port *devlink_port,
+ struct devlink_sb *devlink_sb, u16 tc_index,
+ enum devlink_sb_pool_type pool_type,
+ enum devlink_command cmd,
+ u32 portid, u32 seq, int flags)
+{
+ const struct devlink_ops *ops = devlink->ops;
+ u16 pool_index;
+ u32 threshold;
+ void *hdr;
+ int err;
+
+ err = ops->sb_tc_pool_bind_get(devlink_port, devlink_sb->index,
+ tc_index, pool_type,
+ &pool_index, &threshold);
+ if (err)
+ return err;
+
+ hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ if (devlink_nl_put_handle(msg, devlink))
+ goto nla_put_failure;
+ if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, devlink_port->index))
+ goto nla_put_failure;
+ if (nla_put_u32(msg, DEVLINK_ATTR_SB_INDEX, devlink_sb->index))
+ goto nla_put_failure;
+ if (nla_put_u16(msg, DEVLINK_ATTR_SB_TC_INDEX, tc_index))
+ goto nla_put_failure;
+ if (nla_put_u8(msg, DEVLINK_ATTR_SB_POOL_TYPE, pool_type))
+ goto nla_put_failure;
+ if (nla_put_u16(msg, DEVLINK_ATTR_SB_POOL_INDEX, pool_index))
+ goto nla_put_failure;
+ if (nla_put_u32(msg, DEVLINK_ATTR_SB_THRESHOLD, threshold))
+ goto nla_put_failure;
+
+ if (ops->sb_occ_tc_port_bind_get) {
+ u32 cur;
+ u32 max;
+
+ err = ops->sb_occ_tc_port_bind_get(devlink_port,
+ devlink_sb->index,
+ tc_index, pool_type,
+ &cur, &max);
+ if (err && err != -EOPNOTSUPP)
+ return err;
+ if (!err) {
+ if (nla_put_u32(msg, DEVLINK_ATTR_SB_OCC_CUR, cur))
+ goto nla_put_failure;
+ if (nla_put_u32(msg, DEVLINK_ATTR_SB_OCC_MAX, max))
+ goto nla_put_failure;
+ }
+ }
+
+ genlmsg_end(msg, hdr);
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+}
+
+static int devlink_nl_cmd_sb_tc_pool_bind_get_doit(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct devlink_port *devlink_port = info->user_ptr[0];
+ struct devlink *devlink = devlink_port->devlink;
+ struct devlink_sb *devlink_sb = info->user_ptr[1];
+ struct sk_buff *msg;
+ enum devlink_sb_pool_type pool_type;
+ u16 tc_index;
+ int err;
+
+ err = devlink_sb_pool_type_get_from_info(info, &pool_type);
+ if (err)
+ return err;
+
+ err = devlink_sb_tc_index_get_from_info(devlink_sb, info,
+ pool_type, &tc_index);
+ if (err)
+ return err;
+
+ if (!devlink->ops || !devlink->ops->sb_tc_pool_bind_get)
+ return -EOPNOTSUPP;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ err = devlink_nl_sb_tc_pool_bind_fill(msg, devlink, devlink_port,
+ devlink_sb, tc_index, pool_type,
+ DEVLINK_CMD_SB_TC_POOL_BIND_NEW,
+ info->snd_portid,
+ info->snd_seq, 0);
+ if (err) {
+ nlmsg_free(msg);
+ return err;
+ }
+
+ return genlmsg_reply(msg, info);
+}
+
+static int __sb_tc_pool_bind_get_dumpit(struct sk_buff *msg,
+ int start, int *p_idx,
+ struct devlink *devlink,
+ struct devlink_sb *devlink_sb,
+ u32 portid, u32 seq)
+{
+ struct devlink_port *devlink_port;
+ u16 tc_index;
+ int err;
+
+ list_for_each_entry(devlink_port, &devlink->port_list, list) {
+ for (tc_index = 0;
+ tc_index < devlink_sb->ingress_tc_count; tc_index++) {
+ if (*p_idx < start) {
+ (*p_idx)++;
+ continue;
+ }
+ err = devlink_nl_sb_tc_pool_bind_fill(msg, devlink,
+ devlink_port,
+ devlink_sb,
+ tc_index,
+ DEVLINK_SB_POOL_TYPE_INGRESS,
+ DEVLINK_CMD_SB_TC_POOL_BIND_NEW,
+ portid, seq,
+ NLM_F_MULTI);
+ if (err)
+ return err;
+ (*p_idx)++;
+ }
+ for (tc_index = 0;
+ tc_index < devlink_sb->egress_tc_count; tc_index++) {
+ if (*p_idx < start) {
+ (*p_idx)++;
+ continue;
+ }
+ err = devlink_nl_sb_tc_pool_bind_fill(msg, devlink,
+ devlink_port,
+ devlink_sb,
+ tc_index,
+ DEVLINK_SB_POOL_TYPE_EGRESS,
+ DEVLINK_CMD_SB_TC_POOL_BIND_NEW,
+ portid, seq,
+ NLM_F_MULTI);
+ if (err)
+ return err;
+ (*p_idx)++;
+ }
+ }
+ return 0;
+}
+
+static int
+devlink_nl_cmd_sb_tc_pool_bind_get_dumpit(struct sk_buff *msg,
+ struct netlink_callback *cb)
+{
+ struct devlink *devlink;
+ struct devlink_sb *devlink_sb;
+ int start = cb->args[0];
+ int idx = 0;
+ int err;
+
+ mutex_lock(&devlink_mutex);
+ mutex_lock(&devlink_port_mutex);
+ list_for_each_entry(devlink, &devlink_list, list) {
+ if (!net_eq(devlink_net(devlink), sock_net(msg->sk)) ||
+ !devlink->ops || !devlink->ops->sb_tc_pool_bind_get)
+ continue;
+ list_for_each_entry(devlink_sb, &devlink->sb_list, list) {
+ err = __sb_tc_pool_bind_get_dumpit(msg, start, &idx,
+ devlink,
+ devlink_sb,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq);
+ if (err && err != -EOPNOTSUPP)
+ goto out;
+ }
+ }
+out:
+ mutex_unlock(&devlink_port_mutex);
+ mutex_unlock(&devlink_mutex);
+
+ cb->args[0] = idx;
+ return msg->len;
+}
+
+static int devlink_sb_tc_pool_bind_set(struct devlink_port *devlink_port,
+ unsigned int sb_index, u16 tc_index,
+ enum devlink_sb_pool_type pool_type,
+ u16 pool_index, u32 threshold)
+
+{
+ const struct devlink_ops *ops = devlink_port->devlink->ops;
+
+ if (ops && ops->sb_tc_pool_bind_set)
+ return ops->sb_tc_pool_bind_set(devlink_port, sb_index,
+ tc_index, pool_type,
+ pool_index, threshold);
+ return -EOPNOTSUPP;
+}
+
+static int devlink_nl_cmd_sb_tc_pool_bind_set_doit(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct devlink_port *devlink_port = info->user_ptr[0];
+ struct devlink_sb *devlink_sb = info->user_ptr[1];
+ enum devlink_sb_pool_type pool_type;
+ u16 tc_index;
+ u16 pool_index;
+ u32 threshold;
+ int err;
+
+ err = devlink_sb_pool_type_get_from_info(info, &pool_type);
+ if (err)
+ return err;
+
+ err = devlink_sb_tc_index_get_from_info(devlink_sb, info,
+ pool_type, &tc_index);
+ if (err)
+ return err;
+
+ err = devlink_sb_pool_index_get_from_info(devlink_sb, info,
+ &pool_index);
+ if (err)
+ return err;
+
+ if (!info->attrs[DEVLINK_ATTR_SB_THRESHOLD])
+ return -EINVAL;
+
+ threshold = nla_get_u32(info->attrs[DEVLINK_ATTR_SB_THRESHOLD]);
+ return devlink_sb_tc_pool_bind_set(devlink_port, devlink_sb->index,
+ tc_index, pool_type,
+ pool_index, threshold);
+}
+
+static int devlink_nl_cmd_sb_occ_snapshot_doit(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct devlink *devlink = info->user_ptr[0];
+ struct devlink_sb *devlink_sb = info->user_ptr[1];
+ const struct devlink_ops *ops = devlink->ops;
+
+ if (ops && ops->sb_occ_snapshot)
+ return ops->sb_occ_snapshot(devlink, devlink_sb->index);
+ return -EOPNOTSUPP;
+}
+
+static int devlink_nl_cmd_sb_occ_max_clear_doit(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct devlink *devlink = info->user_ptr[0];
+ struct devlink_sb *devlink_sb = info->user_ptr[1];
+ const struct devlink_ops *ops = devlink->ops;
+
+ if (ops && ops->sb_occ_max_clear)
+ return ops->sb_occ_max_clear(devlink, devlink_sb->index);
+ return -EOPNOTSUPP;
+}
+
static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = {
[DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING },
[DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING },
[DEVLINK_ATTR_PORT_INDEX] = { .type = NLA_U32 },
[DEVLINK_ATTR_PORT_TYPE] = { .type = NLA_U16 },
[DEVLINK_ATTR_PORT_SPLIT_COUNT] = { .type = NLA_U32 },
+ [DEVLINK_ATTR_SB_INDEX] = { .type = NLA_U32 },
+ [DEVLINK_ATTR_SB_POOL_INDEX] = { .type = NLA_U16 },
+ [DEVLINK_ATTR_SB_POOL_TYPE] = { .type = NLA_U8 },
+ [DEVLINK_ATTR_SB_POOL_SIZE] = { .type = NLA_U32 },
+ [DEVLINK_ATTR_SB_POOL_THRESHOLD_TYPE] = { .type = NLA_U8 },
+ [DEVLINK_ATTR_SB_THRESHOLD] = { .type = NLA_U32 },
+ [DEVLINK_ATTR_SB_TC_INDEX] = { .type = NLA_U16 },
};
static const struct genl_ops devlink_nl_ops[] = {
@@ -511,6 +1415,7 @@ static const struct genl_ops devlink_nl_ops[] = {
.doit = devlink_nl_cmd_get_doit,
.dumpit = devlink_nl_cmd_get_dumpit,
.policy = devlink_nl_policy,
+ .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
/* can be retrieved by unprivileged users */
},
{
@@ -533,12 +1438,92 @@ static const struct genl_ops devlink_nl_ops[] = {
.doit = devlink_nl_cmd_port_split_doit,
.policy = devlink_nl_policy,
.flags = GENL_ADMIN_PERM,
+ .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
},
{
.cmd = DEVLINK_CMD_PORT_UNSPLIT,
.doit = devlink_nl_cmd_port_unsplit_doit,
.policy = devlink_nl_policy,
.flags = GENL_ADMIN_PERM,
+ .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
+ },
+ {
+ .cmd = DEVLINK_CMD_SB_GET,
+ .doit = devlink_nl_cmd_sb_get_doit,
+ .dumpit = devlink_nl_cmd_sb_get_dumpit,
+ .policy = devlink_nl_policy,
+ .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK |
+ DEVLINK_NL_FLAG_NEED_SB,
+ /* can be retrieved by unprivileged users */
+ },
+ {
+ .cmd = DEVLINK_CMD_SB_POOL_GET,
+ .doit = devlink_nl_cmd_sb_pool_get_doit,
+ .dumpit = devlink_nl_cmd_sb_pool_get_dumpit,
+ .policy = devlink_nl_policy,
+ .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK |
+ DEVLINK_NL_FLAG_NEED_SB,
+ /* can be retrieved by unprivileged users */
+ },
+ {
+ .cmd = DEVLINK_CMD_SB_POOL_SET,
+ .doit = devlink_nl_cmd_sb_pool_set_doit,
+ .policy = devlink_nl_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK |
+ DEVLINK_NL_FLAG_NEED_SB,
+ },
+ {
+ .cmd = DEVLINK_CMD_SB_PORT_POOL_GET,
+ .doit = devlink_nl_cmd_sb_port_pool_get_doit,
+ .dumpit = devlink_nl_cmd_sb_port_pool_get_dumpit,
+ .policy = devlink_nl_policy,
+ .internal_flags = DEVLINK_NL_FLAG_NEED_PORT |
+ DEVLINK_NL_FLAG_NEED_SB,
+ /* can be retrieved by unprivileged users */
+ },
+ {
+ .cmd = DEVLINK_CMD_SB_PORT_POOL_SET,
+ .doit = devlink_nl_cmd_sb_port_pool_set_doit,
+ .policy = devlink_nl_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = DEVLINK_NL_FLAG_NEED_PORT |
+ DEVLINK_NL_FLAG_NEED_SB,
+ },
+ {
+ .cmd = DEVLINK_CMD_SB_TC_POOL_BIND_GET,
+ .doit = devlink_nl_cmd_sb_tc_pool_bind_get_doit,
+ .dumpit = devlink_nl_cmd_sb_tc_pool_bind_get_dumpit,
+ .policy = devlink_nl_policy,
+ .internal_flags = DEVLINK_NL_FLAG_NEED_PORT |
+ DEVLINK_NL_FLAG_NEED_SB,
+ /* can be retrieved by unprivileged users */
+ },
+ {
+ .cmd = DEVLINK_CMD_SB_TC_POOL_BIND_SET,
+ .doit = devlink_nl_cmd_sb_tc_pool_bind_set_doit,
+ .policy = devlink_nl_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = DEVLINK_NL_FLAG_NEED_PORT |
+ DEVLINK_NL_FLAG_NEED_SB,
+ },
+ {
+ .cmd = DEVLINK_CMD_SB_OCC_SNAPSHOT,
+ .doit = devlink_nl_cmd_sb_occ_snapshot_doit,
+ .policy = devlink_nl_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK |
+ DEVLINK_NL_FLAG_NEED_SB |
+ DEVLINK_NL_FLAG_LOCK_PORTS,
+ },
+ {
+ .cmd = DEVLINK_CMD_SB_OCC_MAX_CLEAR,
+ .doit = devlink_nl_cmd_sb_occ_max_clear_doit,
+ .policy = devlink_nl_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK |
+ DEVLINK_NL_FLAG_NEED_SB |
+ DEVLINK_NL_FLAG_LOCK_PORTS,
},
};
@@ -561,6 +1546,7 @@ struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size)
devlink->ops = ops;
devlink_net_set(devlink, &init_net);
INIT_LIST_HEAD(&devlink->port_list);
+ INIT_LIST_HEAD(&devlink->sb_list);
return devlink;
}
EXPORT_SYMBOL_GPL(devlink_alloc);
@@ -630,7 +1616,6 @@ int devlink_port_register(struct devlink *devlink,
}
devlink_port->devlink = devlink;
devlink_port->index = port_index;
- devlink_port->type = DEVLINK_PORT_TYPE_NOTSET;
devlink_port->registered = true;
list_add_tail(&devlink_port->list, &devlink->port_list);
mutex_unlock(&devlink_port_mutex);
@@ -717,6 +1702,51 @@ void devlink_port_split_set(struct devlink_port *devlink_port,
}
EXPORT_SYMBOL_GPL(devlink_port_split_set);
+int devlink_sb_register(struct devlink *devlink, unsigned int sb_index,
+ u32 size, u16 ingress_pools_count,
+ u16 egress_pools_count, u16 ingress_tc_count,
+ u16 egress_tc_count)
+{
+ struct devlink_sb *devlink_sb;
+ int err = 0;
+
+ mutex_lock(&devlink_mutex);
+ if (devlink_sb_index_exists(devlink, sb_index)) {
+ err = -EEXIST;
+ goto unlock;
+ }
+
+ devlink_sb = kzalloc(sizeof(*devlink_sb), GFP_KERNEL);
+ if (!devlink_sb) {
+ err = -ENOMEM;
+ goto unlock;
+ }
+ devlink_sb->index = sb_index;
+ devlink_sb->size = size;
+ devlink_sb->ingress_pools_count = ingress_pools_count;
+ devlink_sb->egress_pools_count = egress_pools_count;
+ devlink_sb->ingress_tc_count = ingress_tc_count;
+ devlink_sb->egress_tc_count = egress_tc_count;
+ list_add_tail(&devlink_sb->list, &devlink->sb_list);
+unlock:
+ mutex_unlock(&devlink_mutex);
+ return err;
+}
+EXPORT_SYMBOL_GPL(devlink_sb_register);
+
+void devlink_sb_unregister(struct devlink *devlink, unsigned int sb_index)
+{
+ struct devlink_sb *devlink_sb;
+
+ mutex_lock(&devlink_mutex);
+ devlink_sb = devlink_sb_get_by_index(devlink, sb_index);
+ WARN_ON(!devlink_sb);
+ list_del(&devlink_sb->list);
+ mutex_unlock(&devlink_mutex);
+ kfree(devlink_sb);
+}
+EXPORT_SYMBOL_GPL(devlink_sb_unregister);
+
static int __init devlink_module_init(void)
{
return genl_register_family_with_ops_groups(&devlink_nl_family,
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index f426c5ad6..f4034817d 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -79,12 +79,16 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN]
[NETIF_F_UFO_BIT] = "tx-udp-fragmentation",
[NETIF_F_GSO_ROBUST_BIT] = "tx-gso-robust",
[NETIF_F_TSO_ECN_BIT] = "tx-tcp-ecn-segmentation",
+ [NETIF_F_TSO_MANGLEID_BIT] = "tx-tcp-mangleid-segmentation",
[NETIF_F_TSO6_BIT] = "tx-tcp6-segmentation",
[NETIF_F_FSO_BIT] = "tx-fcoe-segmentation",
[NETIF_F_GSO_GRE_BIT] = "tx-gre-segmentation",
- [NETIF_F_GSO_IPIP_BIT] = "tx-ipip-segmentation",
- [NETIF_F_GSO_SIT_BIT] = "tx-sit-segmentation",
+ [NETIF_F_GSO_GRE_CSUM_BIT] = "tx-gre-csum-segmentation",
+ [NETIF_F_GSO_IPXIP4_BIT] = "tx-ipxip4-segmentation",
+ [NETIF_F_GSO_IPXIP6_BIT] = "tx-ipxip6-segmentation",
[NETIF_F_GSO_UDP_TUNNEL_BIT] = "tx-udp_tnl-segmentation",
+ [NETIF_F_GSO_UDP_TUNNEL_CSUM_BIT] = "tx-udp_tnl-csum-segmentation",
+ [NETIF_F_GSO_PARTIAL_BIT] = "tx-gso-partial",
[NETIF_F_FCOE_CRC_BIT] = "tx-checksum-fcoe-crc",
[NETIF_F_SCTP_CRC_BIT] = "tx-checksum-sctp",
@@ -387,15 +391,17 @@ static int __ethtool_set_flags(struct net_device *dev, u32 data)
return 0;
}
-static void convert_legacy_u32_to_link_mode(unsigned long *dst, u32 legacy_u32)
+void ethtool_convert_legacy_u32_to_link_mode(unsigned long *dst,
+ u32 legacy_u32)
{
bitmap_zero(dst, __ETHTOOL_LINK_MODE_MASK_NBITS);
dst[0] = legacy_u32;
}
+EXPORT_SYMBOL(ethtool_convert_legacy_u32_to_link_mode);
/* return false if src had higher bits set. lower bits always updated. */
-static bool convert_link_mode_to_legacy_u32(u32 *legacy_u32,
- const unsigned long *src)
+bool ethtool_convert_link_mode_to_legacy_u32(u32 *legacy_u32,
+ const unsigned long *src)
{
bool retval = true;
@@ -415,6 +421,7 @@ static bool convert_link_mode_to_legacy_u32(u32 *legacy_u32,
*legacy_u32 = src[0];
return retval;
}
+EXPORT_SYMBOL(ethtool_convert_link_mode_to_legacy_u32);
/* return false if legacy contained non-0 deprecated fields
* transceiver/maxtxpkt/maxrxpkt. rest of ksettings always updated
@@ -437,13 +444,13 @@ convert_legacy_settings_to_link_ksettings(
legacy_settings->maxrxpkt)
retval = false;
- convert_legacy_u32_to_link_mode(
+ ethtool_convert_legacy_u32_to_link_mode(
link_ksettings->link_modes.supported,
legacy_settings->supported);
- convert_legacy_u32_to_link_mode(
+ ethtool_convert_legacy_u32_to_link_mode(
link_ksettings->link_modes.advertising,
legacy_settings->advertising);
- convert_legacy_u32_to_link_mode(
+ ethtool_convert_legacy_u32_to_link_mode(
link_ksettings->link_modes.lp_advertising,
legacy_settings->lp_advertising);
link_ksettings->base.speed
@@ -482,13 +489,13 @@ convert_link_ksettings_to_legacy_settings(
* __u32 maxrxpkt;
*/
- retval &= convert_link_mode_to_legacy_u32(
+ retval &= ethtool_convert_link_mode_to_legacy_u32(
&legacy_settings->supported,
link_ksettings->link_modes.supported);
- retval &= convert_link_mode_to_legacy_u32(
+ retval &= ethtool_convert_link_mode_to_legacy_u32(
&legacy_settings->advertising,
link_ksettings->link_modes.advertising);
- retval &= convert_link_mode_to_legacy_u32(
+ retval &= ethtool_convert_link_mode_to_legacy_u32(
&legacy_settings->lp_advertising,
link_ksettings->link_modes.lp_advertising);
ethtool_cmd_speed_set(legacy_settings, link_ksettings->base.speed);
diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index 365de6643..840acebbb 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -549,7 +549,7 @@ static inline size_t fib_rule_nlmsg_size(struct fib_rules_ops *ops,
+ nla_total_size(4) /* FRA_SUPPRESS_IFGROUP */
+ nla_total_size(4) /* FRA_FWMARK */
+ nla_total_size(4) /* FRA_FWMASK */
- + nla_total_size(8); /* FRA_TUN_ID */
+ + nla_total_size_64bit(8); /* FRA_TUN_ID */
if (ops->nlmsg_payload)
payload += ops->nlmsg_payload(rule);
@@ -607,7 +607,7 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule,
(rule->target &&
nla_put_u32(skb, FRA_GOTO, rule->target)) ||
(rule->tun_id &&
- nla_put_be64(skb, FRA_TUN_ID, rule->tun_id)))
+ nla_put_be64(skb, FRA_TUN_ID, rule->tun_id, FRA_PAD)))
goto nla_put_failure;
if (rule->suppress_ifgroup != -1) {
diff --git a/net/core/filter.c b/net/core/filter.c
index ca7f832b2..e759d90e8 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -53,9 +53,10 @@
#include <net/sock_reuseport.h>
/**
- * sk_filter - run a packet through a socket filter
+ * sk_filter_trim_cap - run a packet through a socket filter
* @sk: sock associated with &sk_buff
* @skb: buffer to filter
+ * @cap: limit on how short the eBPF program may trim the packet
*
* Run the eBPF program and then cut skb->data to correct size returned by
* the program. If pkt_len is 0 we toss packet. If skb->len is smaller
@@ -64,7 +65,7 @@
* be accepted or -EPERM if the packet should be tossed.
*
*/
-int sk_filter(struct sock *sk, struct sk_buff *skb)
+int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap)
{
int err;
struct sk_filter *filter;
@@ -85,14 +86,13 @@ int sk_filter(struct sock *sk, struct sk_buff *skb)
filter = rcu_dereference(sk->sk_filter);
if (filter) {
unsigned int pkt_len = bpf_prog_run_save_cb(filter->prog, skb);
-
- err = pkt_len ? pskb_trim(skb, pkt_len) : -EPERM;
+ err = pkt_len ? pskb_trim(skb, max(cap, pkt_len)) : -EPERM;
}
rcu_read_unlock();
return err;
}
-EXPORT_SYMBOL(sk_filter);
+EXPORT_SYMBOL(sk_filter_trim_cap);
static u64 __skb_get_pay_offset(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
{
@@ -994,7 +994,11 @@ static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp)
*/
goto out_err_free;
- bpf_prog_select_runtime(fp);
+ /* We are guaranteed to never error here with cBPF to eBPF
+ * transitions, since there's no issue with type compatibility
+ * checks on program arrays.
+ */
+ fp = bpf_prog_select_runtime(fp, &err);
kfree(old_prog);
return fp;
@@ -1149,8 +1153,7 @@ void bpf_prog_destroy(struct bpf_prog *fp)
}
EXPORT_SYMBOL_GPL(bpf_prog_destroy);
-static int __sk_attach_prog(struct bpf_prog *prog, struct sock *sk,
- bool locked)
+static int __sk_attach_prog(struct bpf_prog *prog, struct sock *sk)
{
struct sk_filter *fp, *old_fp;
@@ -1166,8 +1169,10 @@ static int __sk_attach_prog(struct bpf_prog *prog, struct sock *sk,
return -ENOMEM;
}
- old_fp = rcu_dereference_protected(sk->sk_filter, locked);
+ old_fp = rcu_dereference_protected(sk->sk_filter,
+ lockdep_sock_is_held(sk));
rcu_assign_pointer(sk->sk_filter, fp);
+
if (old_fp)
sk_filter_uncharge(sk, old_fp);
@@ -1246,8 +1251,7 @@ struct bpf_prog *__get_filter(struct sock_fprog *fprog, struct sock *sk)
* occurs or there is insufficient memory for the filter a negative
* errno code is returned. On success the return is zero.
*/
-int __sk_attach_filter(struct sock_fprog *fprog, struct sock *sk,
- bool locked)
+int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
{
struct bpf_prog *prog = __get_filter(fprog, sk);
int err;
@@ -1255,7 +1259,7 @@ int __sk_attach_filter(struct sock_fprog *fprog, struct sock *sk,
if (IS_ERR(prog))
return PTR_ERR(prog);
- err = __sk_attach_prog(prog, sk, locked);
+ err = __sk_attach_prog(prog, sk);
if (err < 0) {
__bpf_prog_release(prog);
return err;
@@ -1263,12 +1267,7 @@ int __sk_attach_filter(struct sock_fprog *fprog, struct sock *sk,
return 0;
}
-EXPORT_SYMBOL_GPL(__sk_attach_filter);
-
-int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
-{
- return __sk_attach_filter(fprog, sk, sock_owned_by_user(sk));
-}
+EXPORT_SYMBOL_GPL(sk_attach_filter);
int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk)
{
@@ -1314,7 +1313,7 @@ int sk_attach_bpf(u32 ufd, struct sock *sk)
if (IS_ERR(prog))
return PTR_ERR(prog);
- err = __sk_attach_prog(prog, sk, sock_owned_by_user(sk));
+ err = __sk_attach_prog(prog, sk);
if (err < 0) {
bpf_prog_put(prog);
return err;
@@ -1349,6 +1348,21 @@ struct bpf_scratchpad {
static DEFINE_PER_CPU(struct bpf_scratchpad, bpf_sp);
+static inline int bpf_try_make_writable(struct sk_buff *skb,
+ unsigned int write_len)
+{
+ int err;
+
+ if (!skb_cloned(skb))
+ return 0;
+ if (skb_clone_writable(skb, write_len))
+ return 0;
+ err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
+ if (!err)
+ bpf_compute_data_end(skb);
+ return err;
+}
+
static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags)
{
struct bpf_scratchpad *sp = this_cpu_ptr(&bpf_sp);
@@ -1371,7 +1385,7 @@ static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags)
*/
if (unlikely((u32) offset > 0xffff || len > sizeof(sp->buff)))
return -EFAULT;
- if (unlikely(skb_try_make_writable(skb, offset + len)))
+ if (unlikely(bpf_try_make_writable(skb, offset + len)))
return -EFAULT;
ptr = skb_header_pointer(skb, offset, len, sp->buff);
@@ -1414,16 +1428,19 @@ static u64 bpf_skb_load_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
unsigned int len = (unsigned int) r4;
void *ptr;
- if (unlikely((u32) offset > 0xffff || len > MAX_BPF_STACK))
- return -EFAULT;
+ if (unlikely((u32) offset > 0xffff))
+ goto err_clear;
ptr = skb_header_pointer(skb, offset, len, to);
if (unlikely(!ptr))
- return -EFAULT;
+ goto err_clear;
if (ptr != to)
memcpy(to, ptr, len);
return 0;
+err_clear:
+ memset(to, 0, len);
+ return -EFAULT;
}
static const struct bpf_func_proto bpf_skb_load_bytes_proto = {
@@ -1432,7 +1449,7 @@ static const struct bpf_func_proto bpf_skb_load_bytes_proto = {
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_ANYTHING,
- .arg3_type = ARG_PTR_TO_STACK,
+ .arg3_type = ARG_PTR_TO_RAW_STACK,
.arg4_type = ARG_CONST_STACK_SIZE,
};
@@ -1446,7 +1463,7 @@ static u64 bpf_l3_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags)
return -EINVAL;
if (unlikely((u32) offset > 0xffff))
return -EFAULT;
- if (unlikely(skb_try_make_writable(skb, offset + sizeof(sum))))
+ if (unlikely(bpf_try_make_writable(skb, offset + sizeof(sum))))
return -EFAULT;
ptr = skb_header_pointer(skb, offset, sizeof(sum), &sum);
@@ -1501,7 +1518,7 @@ static u64 bpf_l4_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags)
return -EINVAL;
if (unlikely((u32) offset > 0xffff))
return -EFAULT;
- if (unlikely(skb_try_make_writable(skb, offset + sizeof(sum))))
+ if (unlikely(bpf_try_make_writable(skb, offset + sizeof(sum))))
return -EFAULT;
ptr = skb_header_pointer(skb, offset, sizeof(sum), &sum);
@@ -1701,12 +1718,15 @@ static u64 bpf_skb_vlan_push(u64 r1, u64 r2, u64 vlan_tci, u64 r4, u64 r5)
{
struct sk_buff *skb = (struct sk_buff *) (long) r1;
__be16 vlan_proto = (__force __be16) r2;
+ int ret;
if (unlikely(vlan_proto != htons(ETH_P_8021Q) &&
vlan_proto != htons(ETH_P_8021AD)))
vlan_proto = htons(ETH_P_8021Q);
- return skb_vlan_push(skb, vlan_proto, vlan_tci);
+ ret = skb_vlan_push(skb, vlan_proto, vlan_tci);
+ bpf_compute_data_end(skb);
+ return ret;
}
const struct bpf_func_proto bpf_skb_vlan_push_proto = {
@@ -1722,8 +1742,11 @@ EXPORT_SYMBOL_GPL(bpf_skb_vlan_push_proto);
static u64 bpf_skb_vlan_pop(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
{
struct sk_buff *skb = (struct sk_buff *) (long) r1;
+ int ret;
- return skb_vlan_pop(skb);
+ ret = skb_vlan_pop(skb);
+ bpf_compute_data_end(skb);
+ return ret;
}
const struct bpf_func_proto bpf_skb_vlan_pop_proto = {
@@ -1761,12 +1784,19 @@ static u64 bpf_skb_get_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5)
struct bpf_tunnel_key *to = (struct bpf_tunnel_key *) (long) r2;
const struct ip_tunnel_info *info = skb_tunnel_info(skb);
u8 compat[sizeof(struct bpf_tunnel_key)];
+ void *to_orig = to;
+ int err;
- if (unlikely(!info || (flags & ~(BPF_F_TUNINFO_IPV6))))
- return -EINVAL;
- if (ip_tunnel_info_af(info) != bpf_tunnel_key_af(flags))
- return -EPROTO;
+ if (unlikely(!info || (flags & ~(BPF_F_TUNINFO_IPV6)))) {
+ err = -EINVAL;
+ goto err_clear;
+ }
+ if (ip_tunnel_info_af(info) != bpf_tunnel_key_af(flags)) {
+ err = -EPROTO;
+ goto err_clear;
+ }
if (unlikely(size != sizeof(struct bpf_tunnel_key))) {
+ err = -EINVAL;
switch (size) {
case offsetof(struct bpf_tunnel_key, tunnel_label):
case offsetof(struct bpf_tunnel_key, tunnel_ext):
@@ -1776,12 +1806,12 @@ static u64 bpf_skb_get_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5)
* a common path later on.
*/
if (ip_tunnel_info_af(info) != AF_INET)
- return -EINVAL;
+ goto err_clear;
set_compat:
to = (struct bpf_tunnel_key *)compat;
break;
default:
- return -EINVAL;
+ goto err_clear;
}
}
@@ -1798,9 +1828,12 @@ set_compat:
}
if (unlikely(size != sizeof(struct bpf_tunnel_key)))
- memcpy((void *)(long) r2, to, size);
+ memcpy(to_orig, to, size);
return 0;
+err_clear:
+ memset(to_orig, 0, size);
+ return err;
}
static const struct bpf_func_proto bpf_skb_get_tunnel_key_proto = {
@@ -1808,7 +1841,7 @@ static const struct bpf_func_proto bpf_skb_get_tunnel_key_proto = {
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
- .arg2_type = ARG_PTR_TO_STACK,
+ .arg2_type = ARG_PTR_TO_RAW_STACK,
.arg3_type = ARG_CONST_STACK_SIZE,
.arg4_type = ARG_ANYTHING,
};
@@ -1818,16 +1851,26 @@ static u64 bpf_skb_get_tunnel_opt(u64 r1, u64 r2, u64 size, u64 r4, u64 r5)
struct sk_buff *skb = (struct sk_buff *) (long) r1;
u8 *to = (u8 *) (long) r2;
const struct ip_tunnel_info *info = skb_tunnel_info(skb);
+ int err;
if (unlikely(!info ||
- !(info->key.tun_flags & TUNNEL_OPTIONS_PRESENT)))
- return -ENOENT;
- if (unlikely(size < info->options_len))
- return -ENOMEM;
+ !(info->key.tun_flags & TUNNEL_OPTIONS_PRESENT))) {
+ err = -ENOENT;
+ goto err_clear;
+ }
+ if (unlikely(size < info->options_len)) {
+ err = -ENOMEM;
+ goto err_clear;
+ }
ip_tunnel_info_opts_get(to, info);
+ if (size > info->options_len)
+ memset(to + info->options_len, 0, size - info->options_len);
return info->options_len;
+err_clear:
+ memset(to, 0, size);
+ return err;
}
static const struct bpf_func_proto bpf_skb_get_tunnel_opt_proto = {
@@ -1835,7 +1878,7 @@ static const struct bpf_func_proto bpf_skb_get_tunnel_opt_proto = {
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
- .arg2_type = ARG_PTR_TO_STACK,
+ .arg2_type = ARG_PTR_TO_RAW_STACK,
.arg3_type = ARG_CONST_STACK_SIZE,
};
@@ -2021,6 +2064,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id)
return &bpf_redirect_proto;
case BPF_FUNC_get_route_realm:
return &bpf_get_route_realm_proto;
+ case BPF_FUNC_perf_event_output:
+ return bpf_get_event_output_proto();
default:
return sk_filter_func_proto(func_id);
}
@@ -2028,31 +2073,32 @@ tc_cls_act_func_proto(enum bpf_func_id func_id)
static bool __is_valid_access(int off, int size, enum bpf_access_type type)
{
- /* check bounds */
if (off < 0 || off >= sizeof(struct __sk_buff))
return false;
-
- /* disallow misaligned access */
+ /* The verifier guarantees that size > 0. */
if (off % size != 0)
return false;
-
- /* all __sk_buff fields are __u32 */
- if (size != 4)
+ if (size != sizeof(__u32))
return false;
return true;
}
static bool sk_filter_is_valid_access(int off, int size,
- enum bpf_access_type type)
+ enum bpf_access_type type,
+ enum bpf_reg_type *reg_type)
{
- if (off == offsetof(struct __sk_buff, tc_classid))
+ switch (off) {
+ case offsetof(struct __sk_buff, tc_classid):
+ case offsetof(struct __sk_buff, data):
+ case offsetof(struct __sk_buff, data_end):
return false;
+ }
if (type == BPF_WRITE) {
switch (off) {
case offsetof(struct __sk_buff, cb[0]) ...
- offsetof(struct __sk_buff, cb[4]):
+ offsetof(struct __sk_buff, cb[4]):
break;
default:
return false;
@@ -2063,7 +2109,8 @@ static bool sk_filter_is_valid_access(int off, int size,
}
static bool tc_cls_act_is_valid_access(int off, int size,
- enum bpf_access_type type)
+ enum bpf_access_type type,
+ enum bpf_reg_type *reg_type)
{
if (type == BPF_WRITE) {
switch (off) {
@@ -2078,6 +2125,16 @@ static bool tc_cls_act_is_valid_access(int off, int size,
return false;
}
}
+
+ switch (off) {
+ case offsetof(struct __sk_buff, data):
+ *reg_type = PTR_TO_PACKET;
+ break;
+ case offsetof(struct __sk_buff, data_end):
+ *reg_type = PTR_TO_PACKET_END;
+ break;
+ }
+
return __is_valid_access(off, size, type);
}
@@ -2195,6 +2252,20 @@ static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg,
*insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, ctx_off);
break;
+ case offsetof(struct __sk_buff, data):
+ *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct sk_buff, data)),
+ dst_reg, src_reg,
+ offsetof(struct sk_buff, data));
+ break;
+
+ case offsetof(struct __sk_buff, data_end):
+ ctx_off -= offsetof(struct __sk_buff, data_end);
+ ctx_off += offsetof(struct sk_buff, cb);
+ ctx_off += offsetof(struct bpf_skb_data_end, data_end);
+ *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(sizeof(void *)),
+ dst_reg, src_reg, ctx_off);
+ break;
+
case offsetof(struct __sk_buff, tc_index):
#ifdef CONFIG_NET_SCHED
BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, tc_index) != 2);
@@ -2219,30 +2290,30 @@ static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg,
}
static const struct bpf_verifier_ops sk_filter_ops = {
- .get_func_proto = sk_filter_func_proto,
- .is_valid_access = sk_filter_is_valid_access,
- .convert_ctx_access = bpf_net_convert_ctx_access,
+ .get_func_proto = sk_filter_func_proto,
+ .is_valid_access = sk_filter_is_valid_access,
+ .convert_ctx_access = bpf_net_convert_ctx_access,
};
static const struct bpf_verifier_ops tc_cls_act_ops = {
- .get_func_proto = tc_cls_act_func_proto,
- .is_valid_access = tc_cls_act_is_valid_access,
- .convert_ctx_access = bpf_net_convert_ctx_access,
+ .get_func_proto = tc_cls_act_func_proto,
+ .is_valid_access = tc_cls_act_is_valid_access,
+ .convert_ctx_access = bpf_net_convert_ctx_access,
};
static struct bpf_prog_type_list sk_filter_type __read_mostly = {
- .ops = &sk_filter_ops,
- .type = BPF_PROG_TYPE_SOCKET_FILTER,
+ .ops = &sk_filter_ops,
+ .type = BPF_PROG_TYPE_SOCKET_FILTER,
};
static struct bpf_prog_type_list sched_cls_type __read_mostly = {
- .ops = &tc_cls_act_ops,
- .type = BPF_PROG_TYPE_SCHED_CLS,
+ .ops = &tc_cls_act_ops,
+ .type = BPF_PROG_TYPE_SCHED_CLS,
};
static struct bpf_prog_type_list sched_act_type __read_mostly = {
- .ops = &tc_cls_act_ops,
- .type = BPF_PROG_TYPE_SCHED_ACT,
+ .ops = &tc_cls_act_ops,
+ .type = BPF_PROG_TYPE_SCHED_ACT,
};
static int __init register_sk_filter_ops(void)
@@ -2255,7 +2326,7 @@ static int __init register_sk_filter_ops(void)
}
late_initcall(register_sk_filter_ops);
-int __sk_detach_filter(struct sock *sk, bool locked)
+int sk_detach_filter(struct sock *sk)
{
int ret = -ENOENT;
struct sk_filter *filter;
@@ -2263,7 +2334,8 @@ int __sk_detach_filter(struct sock *sk, bool locked)
if (sock_flag(sk, SOCK_FILTER_LOCKED))
return -EPERM;
- filter = rcu_dereference_protected(sk->sk_filter, locked);
+ filter = rcu_dereference_protected(sk->sk_filter,
+ lockdep_sock_is_held(sk));
if (filter) {
RCU_INIT_POINTER(sk->sk_filter, NULL);
sk_filter_uncharge(sk, filter);
@@ -2272,12 +2344,7 @@ int __sk_detach_filter(struct sock *sk, bool locked)
return ret;
}
-EXPORT_SYMBOL_GPL(__sk_detach_filter);
-
-int sk_detach_filter(struct sock *sk)
-{
- return __sk_detach_filter(sk, sock_owned_by_user(sk));
-}
+EXPORT_SYMBOL_GPL(sk_detach_filter);
int sk_get_filter(struct sock *sk, struct sock_filter __user *ubuf,
unsigned int len)
@@ -2288,7 +2355,7 @@ int sk_get_filter(struct sock *sk, struct sock_filter __user *ubuf,
lock_sock(sk);
filter = rcu_dereference_protected(sk->sk_filter,
- sock_owned_by_user(sk));
+ lockdep_sock_is_held(sk));
if (!filter)
goto out;
diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c
index e640462ea..be873e4e3 100644
--- a/net/core/gen_stats.c
+++ b/net/core/gen_stats.c
@@ -25,9 +25,9 @@
static inline int
-gnet_stats_copy(struct gnet_dump *d, int type, void *buf, int size)
+gnet_stats_copy(struct gnet_dump *d, int type, void *buf, int size, int padattr)
{
- if (nla_put(d->skb, type, size, buf))
+ if (nla_put_64bit(d->skb, type, size, buf, padattr))
goto nla_put_failure;
return 0;
@@ -47,6 +47,7 @@ nla_put_failure:
* @xstats_type: TLV type for backward compatibility xstats TLV
* @lock: statistics lock
* @d: dumping handle
+ * @padattr: padding attribute
*
* Initializes the dumping handle, grabs the statistic lock and appends
* an empty TLV header to the socket buffer for use a container for all
@@ -59,7 +60,8 @@ nla_put_failure:
*/
int
gnet_stats_start_copy_compat(struct sk_buff *skb, int type, int tc_stats_type,
- int xstats_type, spinlock_t *lock, struct gnet_dump *d)
+ int xstats_type, spinlock_t *lock,
+ struct gnet_dump *d, int padattr)
__acquires(lock)
{
memset(d, 0, sizeof(*d));
@@ -71,20 +73,22 @@ gnet_stats_start_copy_compat(struct sk_buff *skb, int type, int tc_stats_type,
d->skb = skb;
d->compat_tc_stats = tc_stats_type;
d->compat_xstats = xstats_type;
+ d->padattr = padattr;
if (d->tail)
- return gnet_stats_copy(d, type, NULL, 0);
+ return gnet_stats_copy(d, type, NULL, 0, padattr);
return 0;
}
EXPORT_SYMBOL(gnet_stats_start_copy_compat);
/**
- * gnet_stats_start_copy_compat - start dumping procedure in compatibility mode
+ * gnet_stats_start_copy - start dumping procedure in compatibility mode
* @skb: socket buffer to put statistics TLVs into
* @type: TLV type for top level statistic TLV
* @lock: statistics lock
* @d: dumping handle
+ * @padattr: padding attribute
*
* Initializes the dumping handle, grabs the statistic lock and appends
* an empty TLV header to the socket buffer for use a container for all
@@ -94,9 +98,9 @@ EXPORT_SYMBOL(gnet_stats_start_copy_compat);
*/
int
gnet_stats_start_copy(struct sk_buff *skb, int type, spinlock_t *lock,
- struct gnet_dump *d)
+ struct gnet_dump *d, int padattr)
{
- return gnet_stats_start_copy_compat(skb, type, 0, 0, lock, d);
+ return gnet_stats_start_copy_compat(skb, type, 0, 0, lock, d, padattr);
}
EXPORT_SYMBOL(gnet_stats_start_copy);
@@ -169,7 +173,8 @@ gnet_stats_copy_basic(struct gnet_dump *d,
memset(&sb, 0, sizeof(sb));
sb.bytes = bstats.bytes;
sb.packets = bstats.packets;
- return gnet_stats_copy(d, TCA_STATS_BASIC, &sb, sizeof(sb));
+ return gnet_stats_copy(d, TCA_STATS_BASIC, &sb, sizeof(sb),
+ TCA_STATS_PAD);
}
return 0;
}
@@ -208,11 +213,13 @@ gnet_stats_copy_rate_est(struct gnet_dump *d,
}
if (d->tail) {
- res = gnet_stats_copy(d, TCA_STATS_RATE_EST, &est, sizeof(est));
+ res = gnet_stats_copy(d, TCA_STATS_RATE_EST, &est, sizeof(est),
+ TCA_STATS_PAD);
if (res < 0 || est.bps == r->bps)
return res;
/* emit 64bit stats only if needed */
- return gnet_stats_copy(d, TCA_STATS_RATE_EST64, r, sizeof(*r));
+ return gnet_stats_copy(d, TCA_STATS_RATE_EST64, r, sizeof(*r),
+ TCA_STATS_PAD);
}
return 0;
@@ -286,7 +293,8 @@ gnet_stats_copy_queue(struct gnet_dump *d,
if (d->tail)
return gnet_stats_copy(d, TCA_STATS_QUEUE,
- &qstats, sizeof(qstats));
+ &qstats, sizeof(qstats),
+ TCA_STATS_PAD);
return 0;
}
@@ -316,7 +324,8 @@ gnet_stats_copy_app(struct gnet_dump *d, void *st, int len)
}
if (d->tail)
- return gnet_stats_copy(d, TCA_STATS_APP, st, len);
+ return gnet_stats_copy(d, TCA_STATS_APP, st, len,
+ TCA_STATS_PAD);
return 0;
@@ -347,12 +356,12 @@ gnet_stats_finish_copy(struct gnet_dump *d)
if (d->compat_tc_stats)
if (gnet_stats_copy(d, d->compat_tc_stats, &d->tc_stats,
- sizeof(d->tc_stats)) < 0)
+ sizeof(d->tc_stats), d->padattr) < 0)
return -1;
if (d->compat_xstats && d->xstats) {
if (gnet_stats_copy(d, d->compat_xstats, d->xstats,
- d->xstats_len) < 0)
+ d->xstats_len, d->padattr) < 0)
return -1;
}
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 769cece9b..510cd62fc 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -1763,21 +1763,22 @@ static int neightbl_fill_parms(struct sk_buff *skb, struct neigh_parms *parms)
NEIGH_VAR(parms, MCAST_PROBES)) ||
nla_put_u32(skb, NDTPA_MCAST_REPROBES,
NEIGH_VAR(parms, MCAST_REPROBES)) ||
- nla_put_msecs(skb, NDTPA_REACHABLE_TIME, parms->reachable_time) ||
+ nla_put_msecs(skb, NDTPA_REACHABLE_TIME, parms->reachable_time,
+ NDTPA_PAD) ||
nla_put_msecs(skb, NDTPA_BASE_REACHABLE_TIME,
- NEIGH_VAR(parms, BASE_REACHABLE_TIME)) ||
+ NEIGH_VAR(parms, BASE_REACHABLE_TIME), NDTPA_PAD) ||
nla_put_msecs(skb, NDTPA_GC_STALETIME,
- NEIGH_VAR(parms, GC_STALETIME)) ||
+ NEIGH_VAR(parms, GC_STALETIME), NDTPA_PAD) ||
nla_put_msecs(skb, NDTPA_DELAY_PROBE_TIME,
- NEIGH_VAR(parms, DELAY_PROBE_TIME)) ||
+ NEIGH_VAR(parms, DELAY_PROBE_TIME), NDTPA_PAD) ||
nla_put_msecs(skb, NDTPA_RETRANS_TIME,
- NEIGH_VAR(parms, RETRANS_TIME)) ||
+ NEIGH_VAR(parms, RETRANS_TIME), NDTPA_PAD) ||
nla_put_msecs(skb, NDTPA_ANYCAST_DELAY,
- NEIGH_VAR(parms, ANYCAST_DELAY)) ||
+ NEIGH_VAR(parms, ANYCAST_DELAY), NDTPA_PAD) ||
nla_put_msecs(skb, NDTPA_PROXY_DELAY,
- NEIGH_VAR(parms, PROXY_DELAY)) ||
+ NEIGH_VAR(parms, PROXY_DELAY), NDTPA_PAD) ||
nla_put_msecs(skb, NDTPA_LOCKTIME,
- NEIGH_VAR(parms, LOCKTIME)))
+ NEIGH_VAR(parms, LOCKTIME), NDTPA_PAD))
goto nla_put_failure;
return nla_nest_end(skb, nest);
@@ -1804,7 +1805,7 @@ static int neightbl_fill_info(struct sk_buff *skb, struct neigh_table *tbl,
ndtmsg->ndtm_pad2 = 0;
if (nla_put_string(skb, NDTA_NAME, tbl->id) ||
- nla_put_msecs(skb, NDTA_GC_INTERVAL, tbl->gc_interval) ||
+ nla_put_msecs(skb, NDTA_GC_INTERVAL, tbl->gc_interval, NDTA_PAD) ||
nla_put_u32(skb, NDTA_THRESH1, tbl->gc_thresh1) ||
nla_put_u32(skb, NDTA_THRESH2, tbl->gc_thresh2) ||
nla_put_u32(skb, NDTA_THRESH3, tbl->gc_thresh3))
@@ -1856,7 +1857,8 @@ static int neightbl_fill_info(struct sk_buff *skb, struct neigh_table *tbl,
ndst.ndts_table_fulls += st->table_fulls;
}
- if (nla_put(skb, NDTA_STATS, sizeof(ndst), &ndst))
+ if (nla_put_64bit(skb, NDTA_STATS, sizeof(ndst), &ndst,
+ NDTA_PAD))
goto nla_put_failure;
}
diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c
index 2bf832996..14d09345f 100644
--- a/net/core/net-procfs.c
+++ b/net/core/net-procfs.c
@@ -162,7 +162,8 @@ static int softnet_seq_show(struct seq_file *seq, void *v)
"%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
sd->processed, sd->dropped, sd->time_squeeze, 0,
0, 0, 0, 0, /* was fastroute */
- sd->cpu_collision, sd->received_rps, flow_limit_count);
+ 0, /* was cpu_collision */
+ sd->received_rps, flow_limit_count);
return 0;
}
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 2b3f76fe6..7a0b61655 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -24,6 +24,7 @@
#include <linux/jiffies.h>
#include <linux/pm_runtime.h>
#include <linux/of.h>
+#include <linux/of_net.h>
#include "net-sysfs.h"
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 20999aa59..8b02df0d3 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -2245,10 +2245,8 @@ static void spin(struct pktgen_dev *pkt_dev, ktime_t spin_until)
hrtimer_set_expires(&t.timer, spin_until);
remaining = ktime_to_ns(hrtimer_expires_remaining(&t.timer));
- if (remaining <= 0) {
- pkt_dev->next_tx = ktime_add_ns(spin_until, pkt_dev->delay);
- return;
- }
+ if (remaining <= 0)
+ goto out;
start_time = ktime_get();
if (remaining < 100000) {
@@ -2273,7 +2271,9 @@ static void spin(struct pktgen_dev *pkt_dev, ktime_t spin_until)
}
pkt_dev->idle_acc += ktime_to_ns(ktime_sub(end_time, start_time));
+out:
pkt_dev->next_tx = ktime_add_ns(spin_until, pkt_dev->delay);
+ destroy_hrtimer_on_stack(&t.timer);
}
static inline void set_pkt_overhead(struct pktgen_dev *pkt_dev)
@@ -3472,7 +3472,6 @@ xmit_more:
pkt_dev->odevname, ret);
pkt_dev->errors++;
/* fallthru */
- case NETDEV_TX_LOCKED:
case NETDEV_TX_BUSY:
/* Retry it next time */
atomic_dec(&(pkt_dev->skb->users));
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 65763c29f..d69c4644f 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -808,11 +808,6 @@ static void copy_rtnl_link_stats(struct rtnl_link_stats *a,
a->rx_nohandler = b->rx_nohandler;
}
-static void copy_rtnl_link_stats64(void *v, const struct rtnl_link_stats64 *b)
-{
- memcpy(v, b, sizeof(*b));
-}
-
/* All VF info */
static inline int rtnl_vfinfo_size(const struct net_device *dev,
u32 ext_filter_mask)
@@ -830,17 +825,17 @@ static inline int rtnl_vfinfo_size(const struct net_device *dev,
nla_total_size(sizeof(struct ifla_vf_link_state)) +
nla_total_size(sizeof(struct ifla_vf_rss_query_en)) +
/* IFLA_VF_STATS_RX_PACKETS */
- nla_total_size(sizeof(__u64)) +
+ nla_total_size_64bit(sizeof(__u64)) +
/* IFLA_VF_STATS_TX_PACKETS */
- nla_total_size(sizeof(__u64)) +
+ nla_total_size_64bit(sizeof(__u64)) +
/* IFLA_VF_STATS_RX_BYTES */
- nla_total_size(sizeof(__u64)) +
+ nla_total_size_64bit(sizeof(__u64)) +
/* IFLA_VF_STATS_TX_BYTES */
- nla_total_size(sizeof(__u64)) +
+ nla_total_size_64bit(sizeof(__u64)) +
/* IFLA_VF_STATS_BROADCAST */
- nla_total_size(sizeof(__u64)) +
+ nla_total_size_64bit(sizeof(__u64)) +
/* IFLA_VF_STATS_MULTICAST */
- nla_total_size(sizeof(__u64)) +
+ nla_total_size_64bit(sizeof(__u64)) +
nla_total_size(sizeof(struct ifla_vf_trust)));
return size;
} else
@@ -881,9 +876,9 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev,
+ nla_total_size(IFNAMSIZ) /* IFLA_IFNAME */
+ nla_total_size(IFALIASZ) /* IFLA_IFALIAS */
+ nla_total_size(IFNAMSIZ) /* IFLA_QDISC */
- + nla_total_size(sizeof(struct rtnl_link_ifmap))
+ + nla_total_size_64bit(sizeof(struct rtnl_link_ifmap))
+ nla_total_size(sizeof(struct rtnl_link_stats))
- + nla_total_size(sizeof(struct rtnl_link_stats64))
+ + nla_total_size_64bit(sizeof(struct rtnl_link_stats64))
+ nla_total_size(MAX_ADDR_LEN) /* IFLA_ADDRESS */
+ nla_total_size(MAX_ADDR_LEN) /* IFLA_BROADCAST */
+ nla_total_size(4) /* IFLA_TXQLEN */
@@ -1054,25 +1049,23 @@ static int rtnl_phys_switch_id_fill(struct sk_buff *skb, struct net_device *dev)
static noinline_for_stack int rtnl_fill_stats(struct sk_buff *skb,
struct net_device *dev)
{
- const struct rtnl_link_stats64 *stats;
- struct rtnl_link_stats64 temp;
+ struct rtnl_link_stats64 *sp;
struct nlattr *attr;
- stats = dev_get_stats(dev, &temp);
-
- attr = nla_reserve(skb, IFLA_STATS,
- sizeof(struct rtnl_link_stats));
+ attr = nla_reserve_64bit(skb, IFLA_STATS64,
+ sizeof(struct rtnl_link_stats64), IFLA_PAD);
if (!attr)
return -EMSGSIZE;
- copy_rtnl_link_stats(nla_data(attr), stats);
+ sp = nla_data(attr);
+ dev_get_stats(dev, sp);
- attr = nla_reserve(skb, IFLA_STATS64,
- sizeof(struct rtnl_link_stats64));
+ attr = nla_reserve(skb, IFLA_STATS,
+ sizeof(struct rtnl_link_stats));
if (!attr)
return -EMSGSIZE;
- copy_rtnl_link_stats64(nla_data(attr), stats);
+ copy_rtnl_link_stats(nla_data(attr), sp);
return 0;
}
@@ -1160,18 +1153,18 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb,
nla_nest_cancel(skb, vfinfo);
return -EMSGSIZE;
}
- if (nla_put_u64(skb, IFLA_VF_STATS_RX_PACKETS,
- vf_stats.rx_packets) ||
- nla_put_u64(skb, IFLA_VF_STATS_TX_PACKETS,
- vf_stats.tx_packets) ||
- nla_put_u64(skb, IFLA_VF_STATS_RX_BYTES,
- vf_stats.rx_bytes) ||
- nla_put_u64(skb, IFLA_VF_STATS_TX_BYTES,
- vf_stats.tx_bytes) ||
- nla_put_u64(skb, IFLA_VF_STATS_BROADCAST,
- vf_stats.broadcast) ||
- nla_put_u64(skb, IFLA_VF_STATS_MULTICAST,
- vf_stats.multicast))
+ if (nla_put_u64_64bit(skb, IFLA_VF_STATS_RX_PACKETS,
+ vf_stats.rx_packets, IFLA_VF_STATS_PAD) ||
+ nla_put_u64_64bit(skb, IFLA_VF_STATS_TX_PACKETS,
+ vf_stats.tx_packets, IFLA_VF_STATS_PAD) ||
+ nla_put_u64_64bit(skb, IFLA_VF_STATS_RX_BYTES,
+ vf_stats.rx_bytes, IFLA_VF_STATS_PAD) ||
+ nla_put_u64_64bit(skb, IFLA_VF_STATS_TX_BYTES,
+ vf_stats.tx_bytes, IFLA_VF_STATS_PAD) ||
+ nla_put_u64_64bit(skb, IFLA_VF_STATS_BROADCAST,
+ vf_stats.broadcast, IFLA_VF_STATS_PAD) ||
+ nla_put_u64_64bit(skb, IFLA_VF_STATS_MULTICAST,
+ vf_stats.multicast, IFLA_VF_STATS_PAD))
return -EMSGSIZE;
nla_nest_end(skb, vfstats);
nla_nest_end(skb, vf);
@@ -1190,7 +1183,7 @@ static int rtnl_fill_link_ifmap(struct sk_buff *skb, struct net_device *dev)
map.dma = dev->dma;
map.port = dev->if_port;
- if (nla_put(skb, IFLA_MAP, sizeof(map), &map))
+ if (nla_put_64bit(skb, IFLA_MAP, sizeof(map), &map, IFLA_PAD))
return -EMSGSIZE;
return 0;
@@ -3453,6 +3446,202 @@ out:
return err;
}
+static bool stats_attr_valid(unsigned int mask, int attrid, int idxattr)
+{
+ return (mask & IFLA_STATS_FILTER_BIT(attrid)) &&
+ (!idxattr || idxattr == attrid);
+}
+
+static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev,
+ int type, u32 pid, u32 seq, u32 change,
+ unsigned int flags, unsigned int filter_mask,
+ int *idxattr, int *prividx)
+{
+ struct if_stats_msg *ifsm;
+ struct nlmsghdr *nlh;
+ struct nlattr *attr;
+ int s_prividx = *prividx;
+
+ ASSERT_RTNL();
+
+ nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ifsm), flags);
+ if (!nlh)
+ return -EMSGSIZE;
+
+ ifsm = nlmsg_data(nlh);
+ ifsm->ifindex = dev->ifindex;
+ ifsm->filter_mask = filter_mask;
+
+ if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_64, *idxattr)) {
+ struct rtnl_link_stats64 *sp;
+
+ attr = nla_reserve_64bit(skb, IFLA_STATS_LINK_64,
+ sizeof(struct rtnl_link_stats64),
+ IFLA_STATS_UNSPEC);
+ if (!attr)
+ goto nla_put_failure;
+
+ sp = nla_data(attr);
+ dev_get_stats(dev, sp);
+ }
+
+ if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_XSTATS, *idxattr)) {
+ const struct rtnl_link_ops *ops = dev->rtnl_link_ops;
+
+ if (ops && ops->fill_linkxstats) {
+ int err;
+
+ *idxattr = IFLA_STATS_LINK_XSTATS;
+ attr = nla_nest_start(skb,
+ IFLA_STATS_LINK_XSTATS);
+ if (!attr)
+ goto nla_put_failure;
+
+ err = ops->fill_linkxstats(skb, dev, prividx);
+ nla_nest_end(skb, attr);
+ if (err)
+ goto nla_put_failure;
+ *idxattr = 0;
+ }
+ }
+
+ nlmsg_end(skb, nlh);
+
+ return 0;
+
+nla_put_failure:
+ /* not a multi message or no progress mean a real error */
+ if (!(flags & NLM_F_MULTI) || s_prividx == *prividx)
+ nlmsg_cancel(skb, nlh);
+ else
+ nlmsg_end(skb, nlh);
+
+ return -EMSGSIZE;
+}
+
+static const struct nla_policy ifla_stats_policy[IFLA_STATS_MAX + 1] = {
+ [IFLA_STATS_LINK_64] = { .len = sizeof(struct rtnl_link_stats64) },
+};
+
+static size_t if_nlmsg_stats_size(const struct net_device *dev,
+ u32 filter_mask)
+{
+ size_t size = 0;
+
+ if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_64, 0))
+ size += nla_total_size_64bit(sizeof(struct rtnl_link_stats64));
+
+ if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_XSTATS, 0)) {
+ const struct rtnl_link_ops *ops = dev->rtnl_link_ops;
+
+ if (ops && ops->get_linkxstats_size) {
+ size += nla_total_size(ops->get_linkxstats_size(dev));
+ /* for IFLA_STATS_LINK_XSTATS */
+ size += nla_total_size(0);
+ }
+ }
+
+ return size;
+}
+
+static int rtnl_stats_get(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+ struct net *net = sock_net(skb->sk);
+ struct net_device *dev = NULL;
+ int idxattr = 0, prividx = 0;
+ struct if_stats_msg *ifsm;
+ struct sk_buff *nskb;
+ u32 filter_mask;
+ int err;
+
+ ifsm = nlmsg_data(nlh);
+ if (ifsm->ifindex > 0)
+ dev = __dev_get_by_index(net, ifsm->ifindex);
+ else
+ return -EINVAL;
+
+ if (!dev)
+ return -ENODEV;
+
+ filter_mask = ifsm->filter_mask;
+ if (!filter_mask)
+ return -EINVAL;
+
+ nskb = nlmsg_new(if_nlmsg_stats_size(dev, filter_mask), GFP_KERNEL);
+ if (!nskb)
+ return -ENOBUFS;
+
+ err = rtnl_fill_statsinfo(nskb, dev, RTM_NEWSTATS,
+ NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0,
+ 0, filter_mask, &idxattr, &prividx);
+ if (err < 0) {
+ /* -EMSGSIZE implies BUG in if_nlmsg_stats_size */
+ WARN_ON(err == -EMSGSIZE);
+ kfree_skb(nskb);
+ } else {
+ err = rtnl_unicast(nskb, net, NETLINK_CB(skb).portid);
+ }
+
+ return err;
+}
+
+static int rtnl_stats_dump(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ int h, s_h, err, s_idx, s_idxattr, s_prividx;
+ struct net *net = sock_net(skb->sk);
+ unsigned int flags = NLM_F_MULTI;
+ struct if_stats_msg *ifsm;
+ struct hlist_head *head;
+ struct net_device *dev;
+ u32 filter_mask = 0;
+ int idx = 0;
+
+ s_h = cb->args[0];
+ s_idx = cb->args[1];
+ s_idxattr = cb->args[2];
+ s_prividx = cb->args[3];
+
+ cb->seq = net->dev_base_seq;
+
+ ifsm = nlmsg_data(cb->nlh);
+ filter_mask = ifsm->filter_mask;
+ if (!filter_mask)
+ return -EINVAL;
+
+ for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) {
+ idx = 0;
+ head = &net->dev_index_head[h];
+ hlist_for_each_entry(dev, head, index_hlist) {
+ if (idx < s_idx)
+ goto cont;
+ err = rtnl_fill_statsinfo(skb, dev, RTM_NEWSTATS,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, 0,
+ flags, filter_mask,
+ &s_idxattr, &s_prividx);
+ /* If we ran out of room on the first message,
+ * we're in trouble
+ */
+ WARN_ON((err == -EMSGSIZE) && (skb->len == 0));
+
+ if (err < 0)
+ goto out;
+ s_prividx = 0;
+ s_idxattr = 0;
+ nl_dump_check_consistent(cb, nlmsg_hdr(skb));
+cont:
+ idx++;
+ }
+ }
+out:
+ cb->args[3] = s_prividx;
+ cb->args[2] = s_idxattr;
+ cb->args[1] = idx;
+ cb->args[0] = h;
+
+ return skb->len;
+}
+
/* Process one rtnetlink message. */
static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
@@ -3602,4 +3791,7 @@ void __init rtnetlink_init(void)
rtnl_register(PF_BRIDGE, RTM_GETLINK, NULL, rtnl_bridge_getlink, NULL);
rtnl_register(PF_BRIDGE, RTM_DELLINK, rtnl_bridge_dellink, NULL, NULL);
rtnl_register(PF_BRIDGE, RTM_SETLINK, rtnl_bridge_setlink, NULL, NULL);
+
+ rtnl_register(PF_UNSPEC, RTM_GETSTATS, rtnl_stats_get, rtnl_stats_dump,
+ NULL);
}
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 59bf4d771..eb12d2161 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -3058,11 +3058,11 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb,
struct sk_buff *frag_skb = head_skb;
unsigned int offset = doffset;
unsigned int tnl_hlen = skb_tnl_header_len(head_skb);
+ unsigned int partial_segs = 0;
unsigned int headroom;
- unsigned int len;
+ unsigned int len = head_skb->len;
__be16 proto;
- bool csum;
- int sg = !!(features & NETIF_F_SG);
+ bool csum, sg;
int nfrags = skb_shinfo(head_skb)->nr_frags;
int err = -ENOMEM;
int i = 0;
@@ -3074,8 +3074,21 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb,
if (unlikely(!proto))
return ERR_PTR(-EINVAL);
+ sg = !!(features & NETIF_F_SG);
csum = !!can_checksum_protocol(features, proto);
+ /* GSO partial only requires that we trim off any excess that
+ * doesn't fit into an MSS sized block, so take care of that
+ * now.
+ */
+ if (sg && csum && (features & NETIF_F_GSO_PARTIAL)) {
+ partial_segs = len / mss;
+ if (partial_segs > 1)
+ mss *= partial_segs;
+ else
+ partial_segs = 0;
+ }
+
headroom = skb_headroom(head_skb);
pos = skb_headlen(head_skb);
@@ -3263,6 +3276,23 @@ perform_csum_check:
*/
segs->prev = tail;
+ /* Update GSO info on first skb in partial sequence. */
+ if (partial_segs) {
+ int type = skb_shinfo(head_skb)->gso_type;
+
+ /* Update type to add partial and then remove dodgy if set */
+ type |= SKB_GSO_PARTIAL;
+ type &= ~SKB_GSO_DODGY;
+
+ /* Update GSO info and prepare to start updating headers on
+ * our way back down the stack of protocols.
+ */
+ skb_shinfo(segs)->gso_size = skb_shinfo(head_skb)->gso_size;
+ skb_shinfo(segs)->gso_segs = partial_segs;
+ skb_shinfo(segs)->gso_type = type;
+ SKB_GSO_CB(segs)->data_offset = skb_headroom(segs) + doffset;
+ }
+
/* Following permits correct backpressure, for protocols
* using skb_set_owner_w().
* Idea is to tranfert ownership from head_skb to last segment.
@@ -4577,3 +4607,239 @@ failure:
return NULL;
}
EXPORT_SYMBOL(alloc_skb_with_frags);
+
+/* carve out the first off bytes from skb when off < headlen */
+static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off,
+ const int headlen, gfp_t gfp_mask)
+{
+ int i;
+ int size = skb_end_offset(skb);
+ int new_hlen = headlen - off;
+ u8 *data;
+
+ size = SKB_DATA_ALIGN(size);
+
+ if (skb_pfmemalloc(skb))
+ gfp_mask |= __GFP_MEMALLOC;
+ data = kmalloc_reserve(size +
+ SKB_DATA_ALIGN(sizeof(struct skb_shared_info)),
+ gfp_mask, NUMA_NO_NODE, NULL);
+ if (!data)
+ return -ENOMEM;
+
+ size = SKB_WITH_OVERHEAD(ksize(data));
+
+ /* Copy real data, and all frags */
+ skb_copy_from_linear_data_offset(skb, off, data, new_hlen);
+ skb->len -= off;
+
+ memcpy((struct skb_shared_info *)(data + size),
+ skb_shinfo(skb),
+ offsetof(struct skb_shared_info,
+ frags[skb_shinfo(skb)->nr_frags]));
+ if (skb_cloned(skb)) {
+ /* drop the old head gracefully */
+ if (skb_orphan_frags(skb, gfp_mask)) {
+ kfree(data);
+ return -ENOMEM;
+ }
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
+ skb_frag_ref(skb, i);
+ if (skb_has_frag_list(skb))
+ skb_clone_fraglist(skb);
+ skb_release_data(skb);
+ } else {
+ /* we can reuse existing recount- all we did was
+ * relocate values
+ */
+ skb_free_head(skb);
+ }
+
+ skb->head = data;
+ skb->data = data;
+ skb->head_frag = 0;
+#ifdef NET_SKBUFF_DATA_USES_OFFSET
+ skb->end = size;
+#else
+ skb->end = skb->head + size;
+#endif
+ skb_set_tail_pointer(skb, skb_headlen(skb));
+ skb_headers_offset_update(skb, 0);
+ skb->cloned = 0;
+ skb->hdr_len = 0;
+ skb->nohdr = 0;
+ atomic_set(&skb_shinfo(skb)->dataref, 1);
+
+ return 0;
+}
+
+static int pskb_carve(struct sk_buff *skb, const u32 off, gfp_t gfp);
+
+/* carve out the first eat bytes from skb's frag_list. May recurse into
+ * pskb_carve()
+ */
+static int pskb_carve_frag_list(struct sk_buff *skb,
+ struct skb_shared_info *shinfo, int eat,
+ gfp_t gfp_mask)
+{
+ struct sk_buff *list = shinfo->frag_list;
+ struct sk_buff *clone = NULL;
+ struct sk_buff *insp = NULL;
+
+ do {
+ if (!list) {
+ pr_err("Not enough bytes to eat. Want %d\n", eat);
+ return -EFAULT;
+ }
+ if (list->len <= eat) {
+ /* Eaten as whole. */
+ eat -= list->len;
+ list = list->next;
+ insp = list;
+ } else {
+ /* Eaten partially. */
+ if (skb_shared(list)) {
+ clone = skb_clone(list, gfp_mask);
+ if (!clone)
+ return -ENOMEM;
+ insp = list->next;
+ list = clone;
+ } else {
+ /* This may be pulled without problems. */
+ insp = list;
+ }
+ if (pskb_carve(list, eat, gfp_mask) < 0) {
+ kfree_skb(clone);
+ return -ENOMEM;
+ }
+ break;
+ }
+ } while (eat);
+
+ /* Free pulled out fragments. */
+ while ((list = shinfo->frag_list) != insp) {
+ shinfo->frag_list = list->next;
+ kfree_skb(list);
+ }
+ /* And insert new clone at head. */
+ if (clone) {
+ clone->next = list;
+ shinfo->frag_list = clone;
+ }
+ return 0;
+}
+
+/* carve off first len bytes from skb. Split line (off) is in the
+ * non-linear part of skb
+ */
+static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off,
+ int pos, gfp_t gfp_mask)
+{
+ int i, k = 0;
+ int size = skb_end_offset(skb);
+ u8 *data;
+ const int nfrags = skb_shinfo(skb)->nr_frags;
+ struct skb_shared_info *shinfo;
+
+ size = SKB_DATA_ALIGN(size);
+
+ if (skb_pfmemalloc(skb))
+ gfp_mask |= __GFP_MEMALLOC;
+ data = kmalloc_reserve(size +
+ SKB_DATA_ALIGN(sizeof(struct skb_shared_info)),
+ gfp_mask, NUMA_NO_NODE, NULL);
+ if (!data)
+ return -ENOMEM;
+
+ size = SKB_WITH_OVERHEAD(ksize(data));
+
+ memcpy((struct skb_shared_info *)(data + size),
+ skb_shinfo(skb), offsetof(struct skb_shared_info,
+ frags[skb_shinfo(skb)->nr_frags]));
+ if (skb_orphan_frags(skb, gfp_mask)) {
+ kfree(data);
+ return -ENOMEM;
+ }
+ shinfo = (struct skb_shared_info *)(data + size);
+ for (i = 0; i < nfrags; i++) {
+ int fsize = skb_frag_size(&skb_shinfo(skb)->frags[i]);
+
+ if (pos + fsize > off) {
+ shinfo->frags[k] = skb_shinfo(skb)->frags[i];
+
+ if (pos < off) {
+ /* Split frag.
+ * We have two variants in this case:
+ * 1. Move all the frag to the second
+ * part, if it is possible. F.e.
+ * this approach is mandatory for TUX,
+ * where splitting is expensive.
+ * 2. Split is accurately. We make this.
+ */
+ shinfo->frags[0].page_offset += off - pos;
+ skb_frag_size_sub(&shinfo->frags[0], off - pos);
+ }
+ skb_frag_ref(skb, i);
+ k++;
+ }
+ pos += fsize;
+ }
+ shinfo->nr_frags = k;
+ if (skb_has_frag_list(skb))
+ skb_clone_fraglist(skb);
+
+ if (k == 0) {
+ /* split line is in frag list */
+ pskb_carve_frag_list(skb, shinfo, off - pos, gfp_mask);
+ }
+ skb_release_data(skb);
+
+ skb->head = data;
+ skb->head_frag = 0;
+ skb->data = data;
+#ifdef NET_SKBUFF_DATA_USES_OFFSET
+ skb->end = size;
+#else
+ skb->end = skb->head + size;
+#endif
+ skb_reset_tail_pointer(skb);
+ skb_headers_offset_update(skb, 0);
+ skb->cloned = 0;
+ skb->hdr_len = 0;
+ skb->nohdr = 0;
+ skb->len -= off;
+ skb->data_len = skb->len;
+ atomic_set(&skb_shinfo(skb)->dataref, 1);
+ return 0;
+}
+
+/* remove len bytes from the beginning of the skb */
+static int pskb_carve(struct sk_buff *skb, const u32 len, gfp_t gfp)
+{
+ int headlen = skb_headlen(skb);
+
+ if (len < headlen)
+ return pskb_carve_inside_header(skb, len, headlen, gfp);
+ else
+ return pskb_carve_inside_nonlinear(skb, len, headlen, gfp);
+}
+
+/* Extract to_copy bytes starting at off from skb, and return this in
+ * a new skb
+ */
+struct sk_buff *pskb_extract(struct sk_buff *skb, int off,
+ int to_copy, gfp_t gfp)
+{
+ struct sk_buff *clone = skb_clone(skb, gfp);
+
+ if (!clone)
+ return NULL;
+
+ if (pskb_carve(clone, off, gfp) < 0 ||
+ pskb_trim(clone, to_copy)) {
+ kfree_skb(clone);
+ return NULL;
+ }
+ return clone;
+}
+EXPORT_SYMBOL(pskb_extract);
diff --git a/net/core/sock.c b/net/core/sock.c
index 7e73c26b6..25dab8b60 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -405,9 +405,8 @@ static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
}
-int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
- int err;
unsigned long flags;
struct sk_buff_head *list = &sk->sk_receive_queue;
@@ -417,10 +416,6 @@ int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
return -ENOMEM;
}
- err = sk_filter(sk, skb);
- if (err)
- return err;
-
if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
atomic_inc(&sk->sk_drops);
return -ENOBUFS;
@@ -443,13 +438,26 @@ int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
sk->sk_data_ready(sk);
return 0;
}
+EXPORT_SYMBOL(__sock_queue_rcv_skb);
+
+int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+{
+ int err;
+
+ err = sk_filter(sk, skb);
+ if (err)
+ return err;
+
+ return __sock_queue_rcv_skb(sk, skb);
+}
EXPORT_SYMBOL(sock_queue_rcv_skb);
-int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
+int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
+ const int nested, unsigned int trim_cap)
{
int rc = NET_RX_SUCCESS;
- if (sk_filter(sk, skb))
+ if (sk_filter_trim_cap(sk, skb, trim_cap))
goto discard_and_relse;
skb->dev = NULL;
@@ -485,7 +493,7 @@ discard_and_relse:
kfree_skb(skb);
goto out;
}
-EXPORT_SYMBOL(sk_receive_skb);
+EXPORT_SYMBOL(__sk_receive_skb);
struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
{
@@ -835,7 +843,8 @@ set_rcvbuf:
!(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
if (sk->sk_protocol == IPPROTO_TCP &&
sk->sk_type == SOCK_STREAM) {
- if (sk->sk_state != TCP_ESTABLISHED) {
+ if ((1 << sk->sk_state) &
+ (TCPF_CLOSE | TCPF_LISTEN)) {
ret = -EINVAL;
break;
}
@@ -1421,8 +1430,12 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
}
EXPORT_SYMBOL(sk_alloc);
-void sk_destruct(struct sock *sk)
+/* Sockets having SOCK_RCU_FREE will call this function after one RCU
+ * grace period. This is the case for UDP sockets and TCP listeners.
+ */
+static void __sk_destruct(struct rcu_head *head)
{
+ struct sock *sk = container_of(head, struct sock, sk_rcu);
struct sk_filter *filter;
if (sk->sk_destruct)
@@ -1451,6 +1464,14 @@ void sk_destruct(struct sock *sk)
sk_prot_free(sk->sk_prot_creator, sk);
}
+void sk_destruct(struct sock *sk)
+{
+ if (sock_flag(sk, SOCK_RCU_FREE))
+ call_rcu(&sk->sk_rcu, __sk_destruct);
+ else
+ __sk_destruct(&sk->sk_rcu);
+}
+
static void __sk_free(struct sock *sk)
{
if (unlikely(sock_diag_has_destroy_listeners(sk) && sk->sk_net_refcnt))
@@ -1515,6 +1536,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
newsk->sk_dst_cache = NULL;
newsk->sk_wmem_queued = 0;
newsk->sk_forward_alloc = 0;
+ atomic_set(&newsk->sk_drops, 0);
newsk->sk_send_head = NULL;
newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
@@ -1634,6 +1656,17 @@ void sock_wfree(struct sk_buff *skb)
}
EXPORT_SYMBOL(sock_wfree);
+/* This variant of sock_wfree() is used by TCP,
+ * since it sets SOCK_USE_WRITE_QUEUE.
+ */
+void __sock_wfree(struct sk_buff *skb)
+{
+ struct sock *sk = skb->sk;
+
+ if (atomic_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
+ __sk_free(sk);
+}
+
void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
{
skb_orphan(skb);
@@ -1656,8 +1689,21 @@ void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
}
EXPORT_SYMBOL(skb_set_owner_w);
+/* This helper is used by netem, as it can hold packets in its
+ * delay queue. We want to allow the owner socket to send more
+ * packets, as if they were already TX completed by a typical driver.
+ * But we also want to keep skb->sk set because some packet schedulers
+ * rely on it (sch_fq for example). So we set skb->truesize to a small
+ * amount (1) and decrease sk_wmem_alloc accordingly.
+ */
void skb_orphan_partial(struct sk_buff *skb)
{
+ /* If this skb is a TCP pure ACK or already went here,
+ * we have nothing to do. 2 is already a very small truesize.
+ */
+ if (skb->truesize <= 2)
+ return;
+
/* TCP stack sets skb->ooo_okay based on sk_wmem_alloc,
* so we do not completely orphan skb, but transfert all
* accounted bytes but one, to avoid unexpected reorders.
@@ -1869,27 +1915,55 @@ struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
}
EXPORT_SYMBOL(sock_alloc_send_skb);
+int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
+ struct sockcm_cookie *sockc)
+{
+ u32 tsflags;
+
+ switch (cmsg->cmsg_type) {
+ case SO_MARK:
+ if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+ if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
+ return -EINVAL;
+ sockc->mark = *(u32 *)CMSG_DATA(cmsg);
+ break;
+ case SO_TIMESTAMPING:
+ if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
+ return -EINVAL;
+
+ tsflags = *(u32 *)CMSG_DATA(cmsg);
+ if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
+ return -EINVAL;
+
+ sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
+ sockc->tsflags |= tsflags;
+ break;
+ /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
+ case SCM_RIGHTS:
+ case SCM_CREDENTIALS:
+ break;
+ default:
+ return -EINVAL;
+ }
+ return 0;
+}
+EXPORT_SYMBOL(__sock_cmsg_send);
+
int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
struct sockcm_cookie *sockc)
{
struct cmsghdr *cmsg;
+ int ret;
for_each_cmsghdr(cmsg, msg) {
if (!CMSG_OK(msg, cmsg))
return -EINVAL;
if (cmsg->cmsg_level != SOL_SOCKET)
continue;
- switch (cmsg->cmsg_type) {
- case SO_MARK:
- if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
- return -EPERM;
- if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
- return -EINVAL;
- sockc->mark = *(u32 *)CMSG_DATA(cmsg);
- break;
- default:
- return -EINVAL;
- }
+ ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
+ if (ret)
+ return ret;
}
return 0;
}
@@ -1974,33 +2048,27 @@ static void __release_sock(struct sock *sk)
__releases(&sk->sk_lock.slock)
__acquires(&sk->sk_lock.slock)
{
- struct sk_buff *skb = sk->sk_backlog.head;
+ struct sk_buff *skb, *next;
- do {
+ while ((skb = sk->sk_backlog.head) != NULL) {
sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
- bh_unlock_sock(sk);
- do {
- struct sk_buff *next = skb->next;
+ spin_unlock_bh(&sk->sk_lock.slock);
+ do {
+ next = skb->next;
prefetch(next);
WARN_ON_ONCE(skb_dst_is_noref(skb));
skb->next = NULL;
sk_backlog_rcv(sk, skb);
- /*
- * We are in process context here with softirqs
- * disabled, use cond_resched_softirq() to preempt.
- * This is safe to do because we've taken the backlog
- * queue private:
- */
- cond_resched_softirq();
+ cond_resched();
skb = next;
} while (skb != NULL);
- bh_lock_sock(sk);
- } while ((skb = sk->sk_backlog.head) != NULL);
+ spin_lock_bh(&sk->sk_lock.slock);
+ }
/*
* Doing the zeroing here guarantee we can not loop forever
@@ -2009,6 +2077,13 @@ static void __release_sock(struct sock *sk)
sk->sk_backlog.len = 0;
}
+void __sk_flush_backlog(struct sock *sk)
+{
+ spin_lock_bh(&sk->sk_lock.slock);
+ __release_sock(sk);
+ spin_unlock_bh(&sk->sk_lock.slock);
+}
+
/**
* sk_wait_data - wait for data to arrive at sk_receive_queue
* @sk: sock to wait on
@@ -2145,6 +2220,15 @@ void __sk_mem_reclaim(struct sock *sk, int amount)
}
EXPORT_SYMBOL(__sk_mem_reclaim);
+int sk_set_peek_off(struct sock *sk, int val)
+{
+ if (val < 0)
+ return -EINVAL;
+
+ sk->sk_peek_off = val;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(sk_set_peek_off);
/*
* Set of default routines for initialising struct proto_ops when
@@ -2432,11 +2516,6 @@ EXPORT_SYMBOL(lock_sock_nested);
void release_sock(struct sock *sk)
{
- /*
- * The sk_lock has mutex_unlock() semantics:
- */
- mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
-
spin_lock_bh(&sk->sk_lock.slock);
if (sk->sk_backlog.tail)
__release_sock(sk);
diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c
index a996ce8c8..6b10573cc 100644
--- a/net/core/sock_diag.c
+++ b/net/core/sock_diag.c
@@ -67,6 +67,7 @@ int sock_diag_put_meminfo(struct sock *sk, struct sk_buff *skb, int attrtype)
mem[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued;
mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
mem[SK_MEMINFO_BACKLOG] = sk->sk_backlog.len;
+ mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
return nla_put(skb, attrtype, sizeof(mem), &mem);
}
@@ -119,7 +120,7 @@ static size_t sock_diag_nlmsg_size(void)
{
return NLMSG_ALIGN(sizeof(struct inet_diag_msg)
+ nla_total_size(sizeof(u8)) /* INET_DIAG_PROTOCOL */
- + nla_total_size(sizeof(struct tcp_info))); /* INET_DIAG_INFO */
+ + nla_total_size_64bit(sizeof(struct tcp_info))); /* INET_DIAG_INFO */
}
static void sock_diag_broadcast_destroy_work(struct work_struct *work)
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index a6beb7b6a..0df2aa652 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -294,6 +294,15 @@ static struct ctl_table net_core_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec
},
+# ifdef CONFIG_HAVE_EBPF_JIT
+ {
+ .procname = "bpf_jit_harden",
+ .data = &bpf_jit_harden,
+ .maxlen = sizeof(int),
+ .mode = 0600,
+ .proc_handler = proc_dointvec,
+ },
+# endif
#endif
{
.procname = "netdev_tstamp_prequeue",
diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h
index b0e28d24e..0c55ffb85 100644
--- a/net/dccp/dccp.h
+++ b/net/dccp/dccp.h
@@ -198,9 +198,9 @@ struct dccp_mib {
};
DECLARE_SNMP_STAT(struct dccp_mib, dccp_statistics);
-#define DCCP_INC_STATS(field) SNMP_INC_STATS(dccp_statistics, field)
-#define DCCP_INC_STATS_BH(field) SNMP_INC_STATS_BH(dccp_statistics, field)
-#define DCCP_DEC_STATS(field) SNMP_DEC_STATS(dccp_statistics, field)
+#define DCCP_INC_STATS(field) SNMP_INC_STATS(dccp_statistics, field)
+#define __DCCP_INC_STATS(field) __SNMP_INC_STATS(dccp_statistics, field)
+#define DCCP_DEC_STATS(field) SNMP_DEC_STATS(dccp_statistics, field)
/*
* Checksumming routines
diff --git a/net/dccp/input.c b/net/dccp/input.c
index 3bd14e885..ba347184b 100644
--- a/net/dccp/input.c
+++ b/net/dccp/input.c
@@ -359,7 +359,7 @@ send_sync:
goto discard;
}
- DCCP_INC_STATS_BH(DCCP_MIB_INERRS);
+ DCCP_INC_STATS(DCCP_MIB_INERRS);
discard:
__kfree_skb(skb);
return 0;
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index 9c67a961b..345a3aeb8 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -62,7 +62,7 @@ int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
nexthop = daddr = usin->sin_addr.s_addr;
inet_opt = rcu_dereference_protected(inet->inet_opt,
- sock_owned_by_user(sk));
+ lockdep_sock_is_held(sk));
if (inet_opt != NULL && inet_opt->opt.srr) {
if (daddr == 0)
return -EINVAL;
@@ -205,7 +205,7 @@ void dccp_req_err(struct sock *sk, u64 seq)
* socket here.
*/
if (!between48(seq, dccp_rsk(req)->dreq_iss, dccp_rsk(req)->dreq_gss)) {
- NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
+ __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
} else {
/*
* Still in RESPOND, just remove it silently.
@@ -247,7 +247,7 @@ static void dccp_v4_err(struct sk_buff *skb, u32 info)
if (skb->len < offset + sizeof(*dh) ||
skb->len < offset + __dccp_basic_hdr_len(dh)) {
- ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
+ __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
return;
}
@@ -256,7 +256,7 @@ static void dccp_v4_err(struct sk_buff *skb, u32 info)
iph->saddr, ntohs(dh->dccph_sport),
inet_iif(skb));
if (!sk) {
- ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
+ __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
return;
}
@@ -273,7 +273,7 @@ static void dccp_v4_err(struct sk_buff *skb, u32 info)
* servers this needs to be solved differently.
*/
if (sock_owned_by_user(sk))
- NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
+ __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
if (sk->sk_state == DCCP_CLOSED)
goto out;
@@ -281,7 +281,7 @@ static void dccp_v4_err(struct sk_buff *skb, u32 info)
dp = dccp_sk(sk);
if ((1 << sk->sk_state) & ~(DCCPF_REQUESTING | DCCPF_LISTEN) &&
!between48(seq, dp->dccps_awl, dp->dccps_awh)) {
- NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
+ __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
goto out;
}
@@ -318,7 +318,7 @@ static void dccp_v4_err(struct sk_buff *skb, u32 info)
case DCCP_REQUESTING:
case DCCP_RESPOND:
if (!sock_owned_by_user(sk)) {
- DCCP_INC_STATS_BH(DCCP_MIB_ATTEMPTFAILS);
+ __DCCP_INC_STATS(DCCP_MIB_ATTEMPTFAILS);
sk->sk_err = err;
sk->sk_error_report(sk);
@@ -431,11 +431,11 @@ struct sock *dccp_v4_request_recv_sock(const struct sock *sk,
return newsk;
exit_overflow:
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
+ __NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
exit_nonewsk:
dst_release(dst);
exit:
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
+ __NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENDROPS);
return NULL;
put_and_exit:
inet_csk_prepare_forced_close(newsk);
@@ -462,7 +462,7 @@ static struct dst_entry* dccp_v4_route_skb(struct net *net, struct sock *sk,
security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
rt = ip_route_output_flow(net, &fl4, sk);
if (IS_ERR(rt)) {
- IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES);
+ IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
return NULL;
}
@@ -527,17 +527,19 @@ static void dccp_v4_ctl_send_reset(const struct sock *sk, struct sk_buff *rxskb)
rxiph->daddr);
skb_dst_set(skb, dst_clone(dst));
+ local_bh_disable();
bh_lock_sock(ctl_sk);
err = ip_build_and_send_pkt(skb, ctl_sk,
rxiph->daddr, rxiph->saddr, NULL);
bh_unlock_sock(ctl_sk);
if (net_xmit_eval(err) == 0) {
- DCCP_INC_STATS_BH(DCCP_MIB_OUTSEGS);
- DCCP_INC_STATS_BH(DCCP_MIB_OUTRSTS);
+ __DCCP_INC_STATS(DCCP_MIB_OUTSEGS);
+ __DCCP_INC_STATS(DCCP_MIB_OUTRSTS);
}
+ local_bh_enable();
out:
- dst_release(dst);
+ dst_release(dst);
}
static void dccp_v4_reqsk_destructor(struct request_sock *req)
@@ -637,7 +639,7 @@ int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
drop_and_free:
reqsk_free(req);
drop:
- DCCP_INC_STATS_BH(DCCP_MIB_ATTEMPTFAILS);
+ __DCCP_INC_STATS(DCCP_MIB_ATTEMPTFAILS);
return -1;
}
EXPORT_SYMBOL_GPL(dccp_v4_conn_request);
@@ -764,6 +766,7 @@ static int dccp_v4_rcv(struct sk_buff *skb)
{
const struct dccp_hdr *dh;
const struct iphdr *iph;
+ bool refcounted;
struct sock *sk;
int min_cov;
@@ -801,7 +804,7 @@ static int dccp_v4_rcv(struct sk_buff *skb)
lookup:
sk = __inet_lookup_skb(&dccp_hashinfo, skb, __dccp_hdr_len(dh),
- dh->dccph_sport, dh->dccph_dport);
+ dh->dccph_sport, dh->dccph_dport, &refcounted);
if (!sk) {
dccp_pr_debug("failed to look up flow ID in table and "
"get corresponding socket\n");
@@ -830,6 +833,7 @@ lookup:
goto lookup;
}
sock_hold(sk);
+ refcounted = true;
nsk = dccp_check_req(sk, skb, req);
if (!nsk) {
reqsk_put(req);
@@ -864,7 +868,7 @@ lookup:
goto discard_and_relse;
nf_reset(skb);
- return sk_receive_skb(sk, skb, 1);
+ return __sk_receive_skb(sk, skb, 1, dh->dccph_doff * 4);
no_dccp_socket:
if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
@@ -886,7 +890,8 @@ discard_it:
return 0;
discard_and_relse:
- sock_put(sk);
+ if (refcounted)
+ sock_put(sk);
goto discard_it;
}
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index 4663a01d5..3ff137d94 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -80,8 +80,8 @@ static void dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
if (skb->len < offset + sizeof(*dh) ||
skb->len < offset + __dccp_basic_hdr_len(dh)) {
- ICMP6_INC_STATS_BH(net, __in6_dev_get(skb->dev),
- ICMP6_MIB_INERRORS);
+ __ICMP6_INC_STATS(net, __in6_dev_get(skb->dev),
+ ICMP6_MIB_INERRORS);
return;
}
@@ -91,8 +91,8 @@ static void dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
inet6_iif(skb));
if (!sk) {
- ICMP6_INC_STATS_BH(net, __in6_dev_get(skb->dev),
- ICMP6_MIB_INERRORS);
+ __ICMP6_INC_STATS(net, __in6_dev_get(skb->dev),
+ ICMP6_MIB_INERRORS);
return;
}
@@ -106,7 +106,7 @@ static void dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
bh_lock_sock(sk);
if (sock_owned_by_user(sk))
- NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
+ __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
if (sk->sk_state == DCCP_CLOSED)
goto out;
@@ -114,7 +114,7 @@ static void dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
dp = dccp_sk(sk);
if ((1 << sk->sk_state) & ~(DCCPF_REQUESTING | DCCPF_LISTEN) &&
!between48(seq, dp->dccps_awl, dp->dccps_awh)) {
- NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
+ __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
goto out;
}
@@ -156,7 +156,7 @@ static void dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
case DCCP_RESPOND: /* Cannot happen.
It can, it SYNs are crossed. --ANK */
if (!sock_owned_by_user(sk)) {
- DCCP_INC_STATS_BH(DCCP_MIB_ATTEMPTFAILS);
+ __DCCP_INC_STATS(DCCP_MIB_ATTEMPTFAILS);
sk->sk_err = err;
/*
* Wake people up to see the error
@@ -277,8 +277,8 @@ static void dccp_v6_ctl_send_reset(const struct sock *sk, struct sk_buff *rxskb)
if (!IS_ERR(dst)) {
skb_dst_set(skb, dst);
ip6_xmit(ctl_sk, skb, &fl6, NULL, 0);
- DCCP_INC_STATS_BH(DCCP_MIB_OUTSEGS);
- DCCP_INC_STATS_BH(DCCP_MIB_OUTRSTS);
+ DCCP_INC_STATS(DCCP_MIB_OUTSEGS);
+ DCCP_INC_STATS(DCCP_MIB_OUTRSTS);
return;
}
@@ -378,7 +378,7 @@ static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
drop_and_free:
reqsk_free(req);
drop:
- DCCP_INC_STATS_BH(DCCP_MIB_ATTEMPTFAILS);
+ __DCCP_INC_STATS(DCCP_MIB_ATTEMPTFAILS);
return -1;
}
@@ -527,11 +527,11 @@ static struct sock *dccp_v6_request_recv_sock(const struct sock *sk,
return newsk;
out_overflow:
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
+ __NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
out_nonewsk:
dst_release(dst);
out:
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
+ __NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENDROPS);
return NULL;
}
@@ -642,6 +642,7 @@ discard:
static int dccp_v6_rcv(struct sk_buff *skb)
{
const struct dccp_hdr *dh;
+ bool refcounted;
struct sock *sk;
int min_cov;
@@ -670,7 +671,7 @@ static int dccp_v6_rcv(struct sk_buff *skb)
lookup:
sk = __inet6_lookup_skb(&dccp_hashinfo, skb, __dccp_hdr_len(dh),
dh->dccph_sport, dh->dccph_dport,
- inet6_iif(skb));
+ inet6_iif(skb), &refcounted);
if (!sk) {
dccp_pr_debug("failed to look up flow ID in table and "
"get corresponding socket\n");
@@ -699,6 +700,7 @@ lookup:
goto lookup;
}
sock_hold(sk);
+ refcounted = true;
nsk = dccp_check_req(sk, skb, req);
if (!nsk) {
reqsk_put(req);
@@ -730,7 +732,7 @@ lookup:
if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb))
goto discard_and_relse;
- return sk_receive_skb(sk, skb, 1) ? -1 : 0;
+ return __sk_receive_skb(sk, skb, 1, dh->dccph_doff * 4) ? -1 : 0;
no_dccp_socket:
if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb))
@@ -752,7 +754,8 @@ discard_it:
return 0;
discard_and_relse:
- sock_put(sk);
+ if (refcounted)
+ sock_put(sk);
goto discard_it;
}
@@ -865,7 +868,7 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
fl6.fl6_sport = inet->inet_sport;
security_sk_classify_flow(sk, flowi6_to_flowi(&fl6));
- opt = rcu_dereference_protected(np->opt, sock_owned_by_user(sk));
+ opt = rcu_dereference_protected(np->opt, lockdep_sock_is_held(sk));
final_p = fl6_update_dst(&fl6, opt, &final);
dst = ip6_dst_lookup_flow(sk, &fl6, final_p);
diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c
index 1994f8af6..53eddf99e 100644
--- a/net/dccp/minisocks.c
+++ b/net/dccp/minisocks.c
@@ -127,7 +127,7 @@ struct sock *dccp_create_openreq_child(const struct sock *sk,
}
dccp_init_xmit_timers(newsk);
- DCCP_INC_STATS_BH(DCCP_MIB_PASSIVEOPENS);
+ __DCCP_INC_STATS(DCCP_MIB_PASSIVEOPENS);
}
return newsk;
}
diff --git a/net/dccp/options.c b/net/dccp/options.c
index 9bce31886..74d29c56c 100644
--- a/net/dccp/options.c
+++ b/net/dccp/options.c
@@ -253,7 +253,7 @@ out_nonsensical_length:
return 0;
out_invalid_option:
- DCCP_INC_STATS_BH(DCCP_MIB_INVALIDOPT);
+ DCCP_INC_STATS(DCCP_MIB_INVALIDOPT);
rc = DCCP_RESET_CODE_OPTION_ERROR;
out_featneg_failed:
DCCP_WARN("DCCP(%p): Option %d (len=%d) error=%u\n", sk, opt, len, rc);
diff --git a/net/dccp/timer.c b/net/dccp/timer.c
index 3ef7acef3..3a2c34027 100644
--- a/net/dccp/timer.c
+++ b/net/dccp/timer.c
@@ -28,7 +28,7 @@ static void dccp_write_err(struct sock *sk)
dccp_send_reset(sk, DCCP_RESET_CODE_ABORTED);
dccp_done(sk);
- DCCP_INC_STATS_BH(DCCP_MIB_ABORTONTIMEOUT);
+ __DCCP_INC_STATS(DCCP_MIB_ABORTONTIMEOUT);
}
/* A write timeout has occurred. Process the after effects. */
@@ -100,7 +100,7 @@ static void dccp_retransmit_timer(struct sock *sk)
* total number of retransmissions of clones of original packets.
*/
if (icsk->icsk_retransmits == 0)
- DCCP_INC_STATS_BH(DCCP_MIB_TIMEOUTS);
+ __DCCP_INC_STATS(DCCP_MIB_TIMEOUTS);
if (dccp_retransmit_skb(sk) != 0) {
/*
@@ -179,7 +179,7 @@ static void dccp_delack_timer(unsigned long data)
if (sock_owned_by_user(sk)) {
/* Try again later. */
icsk->icsk_ack.blocked = 1;
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED);
+ __NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED);
sk_reset_timer(sk, &icsk->icsk_delack_timer,
jiffies + TCP_DELACK_MIN);
goto out;
@@ -209,7 +209,7 @@ static void dccp_delack_timer(unsigned long data)
icsk->icsk_ack.ato = TCP_ATO_MIN;
}
dccp_send_ack(sk);
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKS);
+ __NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKS);
}
out:
bh_unlock_sock(sk);
diff --git a/net/decnet/dn_fib.c b/net/decnet/dn_fib.c
index df4803437..a796fc7cb 100644
--- a/net/decnet/dn_fib.c
+++ b/net/decnet/dn_fib.c
@@ -41,6 +41,7 @@
#include <net/dn_fib.h>
#include <net/dn_neigh.h>
#include <net/dn_dev.h>
+#include <net/nexthop.h>
#define RT_MIN_TABLE 1
@@ -150,14 +151,13 @@ static int dn_fib_count_nhs(const struct nlattr *attr)
struct rtnexthop *nhp = nla_data(attr);
int nhs = 0, nhlen = nla_len(attr);
- while(nhlen >= (int)sizeof(struct rtnexthop)) {
- if ((nhlen -= nhp->rtnh_len) < 0)
- return 0;
+ while (rtnh_ok(nhp, nhlen)) {
nhs++;
- nhp = RTNH_NEXT(nhp);
+ nhp = rtnh_next(nhp, &nhlen);
}
- return nhs;
+ /* leftover implies invalid nexthop configuration, discard it */
+ return nhlen > 0 ? 0 : nhs;
}
static int dn_fib_get_nhs(struct dn_fib_info *fi, const struct nlattr *attr,
@@ -167,21 +167,24 @@ static int dn_fib_get_nhs(struct dn_fib_info *fi, const struct nlattr *attr,
int nhlen = nla_len(attr);
change_nexthops(fi) {
- int attrlen = nhlen - sizeof(struct rtnexthop);
- if (attrlen < 0 || (nhlen -= nhp->rtnh_len) < 0)
+ int attrlen;
+
+ if (!rtnh_ok(nhp, nhlen))
return -EINVAL;
nh->nh_flags = (r->rtm_flags&~0xFF) | nhp->rtnh_flags;
nh->nh_oif = nhp->rtnh_ifindex;
nh->nh_weight = nhp->rtnh_hops + 1;
- if (attrlen) {
+ attrlen = rtnh_attrlen(nhp);
+ if (attrlen > 0) {
struct nlattr *gw_attr;
gw_attr = nla_find((struct nlattr *) (nhp + 1), attrlen, RTA_GATEWAY);
nh->nh_gw = gw_attr ? nla_get_le16(gw_attr) : 0;
}
- nhp = RTNH_NEXT(nhp);
+
+ nhp = rtnh_next(nhp, &nhlen);
} endfor_nexthops(fi);
return 0;
diff --git a/net/dns_resolver/dns_key.c b/net/dns_resolver/dns_key.c
index c79b85eb4..8737412c7 100644
--- a/net/dns_resolver/dns_key.c
+++ b/net/dns_resolver/dns_key.c
@@ -281,7 +281,7 @@ static int __init init_dns_resolver(void)
GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred,
(KEY_POS_ALL & ~KEY_POS_SETATTR) |
KEY_USR_VIEW | KEY_USR_READ,
- KEY_ALLOC_NOT_IN_QUOTA, NULL);
+ KEY_ALLOC_NOT_IN_QUOTA, NULL, NULL);
if (IS_ERR(keyring)) {
ret = PTR_ERR(keyring);
goto failed_put_cred;
diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c
index c28c47463..eff5dfc2e 100644
--- a/net/dsa/dsa.c
+++ b/net/dsa/dsa.c
@@ -51,11 +51,12 @@ void unregister_switch_driver(struct dsa_switch_driver *drv)
EXPORT_SYMBOL_GPL(unregister_switch_driver);
static struct dsa_switch_driver *
-dsa_switch_probe(struct device *host_dev, int sw_addr, char **_name)
+dsa_switch_probe(struct device *parent, struct device *host_dev, int sw_addr,
+ const char **_name, void **priv)
{
struct dsa_switch_driver *ret;
struct list_head *list;
- char *name;
+ const char *name;
ret = NULL;
name = NULL;
@@ -66,7 +67,7 @@ dsa_switch_probe(struct device *host_dev, int sw_addr, char **_name)
drv = list_entry(list, struct dsa_switch_driver, list);
- name = drv->probe(host_dev, sw_addr);
+ name = drv->probe(parent, host_dev, sw_addr, priv);
if (name != NULL) {
ret = drv;
break;
@@ -181,7 +182,7 @@ __ATTRIBUTE_GROUPS(dsa_hwmon);
/* basic switch operations **************************************************/
static int dsa_cpu_dsa_setup(struct dsa_switch *ds, struct net_device *master)
{
- struct dsa_chip_data *cd = ds->pd;
+ struct dsa_chip_data *cd = ds->cd;
struct device_node *port_dn;
struct phy_device *phydev;
int ret, port, mode;
@@ -218,7 +219,7 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent)
{
struct dsa_switch_driver *drv = ds->drv;
struct dsa_switch_tree *dst = ds->dst;
- struct dsa_chip_data *pd = ds->pd;
+ struct dsa_chip_data *cd = ds->cd;
bool valid_name_found = false;
int index = ds->index;
int i, ret;
@@ -229,7 +230,7 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent)
for (i = 0; i < DSA_MAX_PORTS; i++) {
char *name;
- name = pd->port_names[i];
+ name = cd->port_names[i];
if (name == NULL)
continue;
@@ -245,7 +246,7 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent)
} else if (!strcmp(name, "dsa")) {
ds->dsa_port_mask |= 1 << i;
} else {
- ds->phys_port_mask |= 1 << i;
+ ds->enabled_port_mask |= 1 << i;
}
valid_name_found = true;
}
@@ -258,7 +259,7 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent)
/* Make the built-in MII bus mask match the number of ports,
* switch drivers can override this later
*/
- ds->phys_mii_mask = ds->phys_port_mask;
+ ds->phys_mii_mask = ds->enabled_port_mask;
/*
* If the CPU connects to this switch, set the switch tree
@@ -266,7 +267,7 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent)
* switch.
*/
if (dst->cpu_switch == index) {
- switch (ds->tag_protocol) {
+ switch (drv->tag_protocol) {
#ifdef CONFIG_NET_DSA_TAG_DSA
case DSA_TAG_PROTO_DSA:
dst->rcv = dsa_netdev_ops.rcv;
@@ -294,7 +295,7 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent)
goto out;
}
- dst->tag_protocol = ds->tag_protocol;
+ dst->tag_protocol = drv->tag_protocol;
}
/*
@@ -324,13 +325,13 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent)
* Create network devices for physical switch ports.
*/
for (i = 0; i < DSA_MAX_PORTS; i++) {
- if (!(ds->phys_port_mask & (1 << i)))
+ if (!(ds->enabled_port_mask & (1 << i)))
continue;
- ret = dsa_slave_create(ds, parent, i, pd->port_names[i]);
+ ret = dsa_slave_create(ds, parent, i, cd->port_names[i]);
if (ret < 0) {
netdev_err(dst->master_netdev, "[%d]: can't create dsa slave device for port %d(%s): %d\n",
- index, i, pd->port_names[i], ret);
+ index, i, cd->port_names[i], ret);
ret = 0;
}
}
@@ -378,16 +379,17 @@ static struct dsa_switch *
dsa_switch_setup(struct dsa_switch_tree *dst, int index,
struct device *parent, struct device *host_dev)
{
- struct dsa_chip_data *pd = dst->pd->chip + index;
+ struct dsa_chip_data *cd = dst->pd->chip + index;
struct dsa_switch_driver *drv;
struct dsa_switch *ds;
int ret;
- char *name;
+ const char *name;
+ void *priv;
/*
* Probe for switch model.
*/
- drv = dsa_switch_probe(host_dev, pd->sw_addr, &name);
+ drv = dsa_switch_probe(parent, host_dev, cd->sw_addr, &name, &priv);
if (drv == NULL) {
netdev_err(dst->master_netdev, "[%d]: could not detect attached switch\n",
index);
@@ -400,16 +402,16 @@ dsa_switch_setup(struct dsa_switch_tree *dst, int index,
/*
* Allocate and initialise switch state.
*/
- ds = devm_kzalloc(parent, sizeof(*ds) + drv->priv_size, GFP_KERNEL);
+ ds = devm_kzalloc(parent, sizeof(*ds), GFP_KERNEL);
if (ds == NULL)
return ERR_PTR(-ENOMEM);
ds->dst = dst;
ds->index = index;
- ds->pd = pd;
+ ds->cd = cd;
ds->drv = drv;
- ds->tag_protocol = drv->tag_protocol;
- ds->master_dev = host_dev;
+ ds->priv = priv;
+ ds->dev = parent;
ret = dsa_switch_setup_one(ds, parent);
if (ret)
@@ -422,7 +424,7 @@ static void dsa_switch_destroy(struct dsa_switch *ds)
{
struct device_node *port_dn;
struct phy_device *phydev;
- struct dsa_chip_data *cd = ds->pd;
+ struct dsa_chip_data *cd = ds->cd;
int port;
#ifdef CONFIG_NET_DSA_HWMON
@@ -432,7 +434,7 @@ static void dsa_switch_destroy(struct dsa_switch *ds)
/* Destroy network devices for physical switch ports. */
for (port = 0; port < DSA_MAX_PORTS; port++) {
- if (!(ds->phys_port_mask & (1 << port)))
+ if (!(ds->enabled_port_mask & (1 << port)))
continue;
if (!ds->ports[port])
@@ -657,9 +659,6 @@ static int dsa_of_probe(struct device *dev)
const char *port_name;
int chip_index, port_index;
const unsigned int *sw_addr, *port_reg;
- int gpio;
- enum of_gpio_flags of_flags;
- unsigned long flags;
u32 eeprom_len;
int ret;
@@ -738,19 +737,6 @@ static int dsa_of_probe(struct device *dev)
put_device(cd->host_dev);
cd->host_dev = &mdio_bus_switch->dev;
}
- gpio = of_get_named_gpio_flags(child, "reset-gpios", 0,
- &of_flags);
- if (gpio_is_valid(gpio)) {
- flags = (of_flags == OF_GPIO_ACTIVE_LOW ?
- GPIOF_ACTIVE_LOW : 0);
- ret = devm_gpio_request_one(dev, gpio, flags,
- "switch_reset");
- if (ret)
- goto out_free_chip;
-
- cd->reset = gpio_to_desc(gpio);
- gpiod_direction_output(cd->reset, 0);
- }
for_each_available_child_of_node(child, port) {
port_reg = of_get_property(port, "reg", NULL);
diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h
index 1d1a54687..dfa33779d 100644
--- a/net/dsa/dsa_priv.h
+++ b/net/dsa/dsa_priv.h
@@ -22,11 +22,6 @@ struct dsa_device_ops {
};
struct dsa_slave_priv {
- /*
- * The linux network interface corresponding to this
- * switch port.
- */
- struct net_device *dev;
struct sk_buff * (*xmit)(struct sk_buff *skb,
struct net_device *dev);
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index a575f0350..152436cda 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -50,8 +50,8 @@ void dsa_slave_mii_bus_init(struct dsa_switch *ds)
ds->slave_mii_bus->read = dsa_slave_phy_read;
ds->slave_mii_bus->write = dsa_slave_phy_write;
snprintf(ds->slave_mii_bus->id, MII_BUS_ID_SIZE, "dsa-%d:%.2x",
- ds->index, ds->pd->sw_addr);
- ds->slave_mii_bus->parent = ds->master_dev;
+ ds->index, ds->cd->sw_addr);
+ ds->slave_mii_bus->parent = ds->dev;
ds->slave_mii_bus->phy_mask = ~ds->phys_mii_mask;
}
@@ -104,8 +104,8 @@ static int dsa_slave_open(struct net_device *dev)
goto clear_promisc;
}
- if (ds->drv->port_stp_update)
- ds->drv->port_stp_update(ds, p->port, stp_state);
+ if (ds->drv->port_stp_state_set)
+ ds->drv->port_stp_state_set(ds, p->port, stp_state);
if (p->phy)
phy_start(p->phy);
@@ -147,8 +147,8 @@ static int dsa_slave_close(struct net_device *dev)
if (ds->drv->port_disable)
ds->drv->port_disable(ds, p->port, p->phy);
- if (ds->drv->port_stp_update)
- ds->drv->port_stp_update(ds, p->port, BR_STATE_DISABLED);
+ if (ds->drv->port_stp_state_set)
+ ds->drv->port_stp_state_set(ds, p->port, BR_STATE_DISABLED);
return 0;
}
@@ -207,21 +207,16 @@ static int dsa_slave_port_vlan_add(struct net_device *dev,
{
struct dsa_slave_priv *p = netdev_priv(dev);
struct dsa_switch *ds = p->parent;
- int err;
if (switchdev_trans_ph_prepare(trans)) {
if (!ds->drv->port_vlan_prepare || !ds->drv->port_vlan_add)
return -EOPNOTSUPP;
- err = ds->drv->port_vlan_prepare(ds, p->port, vlan, trans);
- if (err)
- return err;
- } else {
- err = ds->drv->port_vlan_add(ds, p->port, vlan, trans);
- if (err)
- return err;
+ return ds->drv->port_vlan_prepare(ds, p->port, vlan, trans);
}
+ ds->drv->port_vlan_add(ds, p->port, vlan, trans);
+
return 0;
}
@@ -256,17 +251,17 @@ static int dsa_slave_port_fdb_add(struct net_device *dev,
{
struct dsa_slave_priv *p = netdev_priv(dev);
struct dsa_switch *ds = p->parent;
- int ret;
- if (!ds->drv->port_fdb_prepare || !ds->drv->port_fdb_add)
- return -EOPNOTSUPP;
+ if (switchdev_trans_ph_prepare(trans)) {
+ if (!ds->drv->port_fdb_prepare || !ds->drv->port_fdb_add)
+ return -EOPNOTSUPP;
- if (switchdev_trans_ph_prepare(trans))
- ret = ds->drv->port_fdb_prepare(ds, p->port, fdb, trans);
- else
- ret = ds->drv->port_fdb_add(ds, p->port, fdb, trans);
+ return ds->drv->port_fdb_prepare(ds, p->port, fdb, trans);
+ }
- return ret;
+ ds->drv->port_fdb_add(ds, p->port, fdb, trans);
+
+ return 0;
}
static int dsa_slave_port_fdb_del(struct net_device *dev,
@@ -305,16 +300,19 @@ static int dsa_slave_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
return -EOPNOTSUPP;
}
-static int dsa_slave_stp_update(struct net_device *dev, u8 state)
+static int dsa_slave_stp_state_set(struct net_device *dev,
+ const struct switchdev_attr *attr,
+ struct switchdev_trans *trans)
{
struct dsa_slave_priv *p = netdev_priv(dev);
struct dsa_switch *ds = p->parent;
- int ret = -EOPNOTSUPP;
- if (ds->drv->port_stp_update)
- ret = ds->drv->port_stp_update(ds, p->port, state);
+ if (switchdev_trans_ph_prepare(trans))
+ return ds->drv->port_stp_state_set ? 0 : -EOPNOTSUPP;
- return ret;
+ ds->drv->port_stp_state_set(ds, p->port, attr->u.stp_state);
+
+ return 0;
}
static int dsa_slave_vlan_filtering(struct net_device *dev,
@@ -339,17 +337,11 @@ static int dsa_slave_port_attr_set(struct net_device *dev,
const struct switchdev_attr *attr,
struct switchdev_trans *trans)
{
- struct dsa_slave_priv *p = netdev_priv(dev);
- struct dsa_switch *ds = p->parent;
int ret;
switch (attr->id) {
case SWITCHDEV_ATTR_ID_PORT_STP_STATE:
- if (switchdev_trans_ph_prepare(trans))
- ret = ds->drv->port_stp_update ? 0 : -EOPNOTSUPP;
- else
- ret = ds->drv->port_stp_update(ds, p->port,
- attr->u.stp_state);
+ ret = dsa_slave_stp_state_set(dev, attr, trans);
break;
case SWITCHDEV_ATTR_ID_BRIDGE_VLAN_FILTERING:
ret = dsa_slave_vlan_filtering(dev, attr, trans);
@@ -468,7 +460,8 @@ static void dsa_slave_bridge_port_leave(struct net_device *dev)
/* Port left the bridge, put in BR_STATE_DISABLED by the bridge layer,
* so allow it to be in BR_STATE_FORWARDING to be kept functional
*/
- dsa_slave_stp_update(dev, BR_STATE_FORWARDING);
+ if (ds->drv->port_stp_state_set)
+ ds->drv->port_stp_state_set(ds, p->port, BR_STATE_FORWARDING);
}
static int dsa_slave_port_attr_get(struct net_device *dev,
@@ -622,8 +615,8 @@ static int dsa_slave_get_eeprom_len(struct net_device *dev)
struct dsa_slave_priv *p = netdev_priv(dev);
struct dsa_switch *ds = p->parent;
- if (ds->pd->eeprom_len)
- return ds->pd->eeprom_len;
+ if (ds->cd->eeprom_len)
+ return ds->cd->eeprom_len;
if (ds->drv->get_eeprom_len)
return ds->drv->get_eeprom_len(ds);
@@ -673,6 +666,78 @@ static void dsa_slave_get_strings(struct net_device *dev,
}
}
+static void dsa_cpu_port_get_ethtool_stats(struct net_device *dev,
+ struct ethtool_stats *stats,
+ uint64_t *data)
+{
+ struct dsa_switch_tree *dst = dev->dsa_ptr;
+ struct dsa_switch *ds = dst->ds[0];
+ s8 cpu_port = dst->cpu_port;
+ int count = 0;
+
+ if (dst->master_ethtool_ops.get_sset_count) {
+ count = dst->master_ethtool_ops.get_sset_count(dev,
+ ETH_SS_STATS);
+ dst->master_ethtool_ops.get_ethtool_stats(dev, stats, data);
+ }
+
+ if (ds->drv->get_ethtool_stats)
+ ds->drv->get_ethtool_stats(ds, cpu_port, data + count);
+}
+
+static int dsa_cpu_port_get_sset_count(struct net_device *dev, int sset)
+{
+ struct dsa_switch_tree *dst = dev->dsa_ptr;
+ struct dsa_switch *ds = dst->ds[0];
+ int count = 0;
+
+ if (dst->master_ethtool_ops.get_sset_count)
+ count += dst->master_ethtool_ops.get_sset_count(dev, sset);
+
+ if (sset == ETH_SS_STATS && ds->drv->get_sset_count)
+ count += ds->drv->get_sset_count(ds);
+
+ return count;
+}
+
+static void dsa_cpu_port_get_strings(struct net_device *dev,
+ uint32_t stringset, uint8_t *data)
+{
+ struct dsa_switch_tree *dst = dev->dsa_ptr;
+ struct dsa_switch *ds = dst->ds[0];
+ s8 cpu_port = dst->cpu_port;
+ int len = ETH_GSTRING_LEN;
+ int mcount = 0, count;
+ unsigned int i;
+ uint8_t pfx[4];
+ uint8_t *ndata;
+
+ snprintf(pfx, sizeof(pfx), "p%.2d", cpu_port);
+ /* We do not want to be NULL-terminated, since this is a prefix */
+ pfx[sizeof(pfx) - 1] = '_';
+
+ if (dst->master_ethtool_ops.get_sset_count) {
+ mcount = dst->master_ethtool_ops.get_sset_count(dev,
+ ETH_SS_STATS);
+ dst->master_ethtool_ops.get_strings(dev, stringset, data);
+ }
+
+ if (stringset == ETH_SS_STATS && ds->drv->get_strings) {
+ ndata = data + mcount * len;
+ /* This function copies ETH_GSTRINGS_LEN bytes, we will mangle
+ * the output after to prepend our CPU port prefix we
+ * constructed earlier
+ */
+ ds->drv->get_strings(ds, cpu_port, ndata);
+ count = ds->drv->get_sset_count(ds);
+ for (i = 0; i < count; i++) {
+ memmove(ndata + (i * len + sizeof(pfx)),
+ ndata + i * len, len - sizeof(pfx));
+ memcpy(ndata + i * len, pfx, sizeof(pfx));
+ }
+ }
+}
+
static void dsa_slave_get_ethtool_stats(struct net_device *dev,
struct ethtool_stats *stats,
uint64_t *data)
@@ -680,10 +745,10 @@ static void dsa_slave_get_ethtool_stats(struct net_device *dev,
struct dsa_slave_priv *p = netdev_priv(dev);
struct dsa_switch *ds = p->parent;
- data[0] = p->dev->stats.tx_packets;
- data[1] = p->dev->stats.tx_bytes;
- data[2] = p->dev->stats.rx_packets;
- data[3] = p->dev->stats.rx_bytes;
+ data[0] = dev->stats.tx_packets;
+ data[1] = dev->stats.tx_bytes;
+ data[2] = dev->stats.rx_packets;
+ data[3] = dev->stats.rx_bytes;
if (ds->drv->get_ethtool_stats != NULL)
ds->drv->get_ethtool_stats(ds, p->port, data + 4);
}
@@ -828,6 +893,8 @@ static const struct ethtool_ops dsa_slave_ethtool_ops = {
.get_eee = dsa_slave_get_eee,
};
+static struct ethtool_ops dsa_cpu_port_ethtool_ops;
+
static const struct net_device_ops dsa_slave_netdev_ops = {
.ndo_open = dsa_slave_open,
.ndo_stop = dsa_slave_close,
@@ -932,7 +999,7 @@ static int dsa_slave_phy_setup(struct dsa_slave_priv *p,
struct net_device *slave_dev)
{
struct dsa_switch *ds = p->parent;
- struct dsa_chip_data *cd = ds->pd;
+ struct dsa_chip_data *cd = ds->cd;
struct device_node *phy_dn, *port_dn;
bool phy_is_fixed = false;
u32 phy_flags = 0;
@@ -1045,6 +1112,7 @@ int dsa_slave_create(struct dsa_switch *ds, struct device *parent,
int port, char *name)
{
struct net_device *master = ds->dst->master_netdev;
+ struct dsa_switch_tree *dst = ds->dst;
struct net_device *slave_dev;
struct dsa_slave_priv *p;
int ret;
@@ -1056,6 +1124,19 @@ int dsa_slave_create(struct dsa_switch *ds, struct device *parent,
slave_dev->features = master->vlan_features;
slave_dev->ethtool_ops = &dsa_slave_ethtool_ops;
+ if (master->ethtool_ops != &dsa_cpu_port_ethtool_ops) {
+ memcpy(&dst->master_ethtool_ops, master->ethtool_ops,
+ sizeof(struct ethtool_ops));
+ memcpy(&dsa_cpu_port_ethtool_ops, &dst->master_ethtool_ops,
+ sizeof(struct ethtool_ops));
+ dsa_cpu_port_ethtool_ops.get_sset_count =
+ dsa_cpu_port_get_sset_count;
+ dsa_cpu_port_ethtool_ops.get_ethtool_stats =
+ dsa_cpu_port_get_ethtool_stats;
+ dsa_cpu_port_ethtool_ops.get_strings =
+ dsa_cpu_port_get_strings;
+ master->ethtool_ops = &dsa_cpu_port_ethtool_ops;
+ }
eth_hw_addr_inherit(slave_dev, master);
slave_dev->priv_flags |= IFF_NO_QUEUE;
slave_dev->netdev_ops = &dsa_slave_netdev_ops;
@@ -1066,11 +1147,10 @@ int dsa_slave_create(struct dsa_switch *ds, struct device *parent,
NULL);
SET_NETDEV_DEV(slave_dev, parent);
- slave_dev->dev.of_node = ds->pd->port_dn[port];
+ slave_dev->dev.of_node = ds->cd->port_dn[port];
slave_dev->vlan_features = master->vlan_features;
p = netdev_priv(slave_dev);
- p->dev = slave_dev;
p->parent = ds;
p->port = port;
diff --git a/net/hsr/Kconfig b/net/hsr/Kconfig
index 0d3d70905..4b683fd0a 100644
--- a/net/hsr/Kconfig
+++ b/net/hsr/Kconfig
@@ -18,8 +18,9 @@ config HSR
earlier.
This code is a "best effort" to comply with the HSR standard as
- described in IEC 62439-3:2010 (HSRv0), but no compliancy tests have
- been made.
+ described in IEC 62439-3:2010 (HSRv0) and IEC 62439-3:2012 (HSRv1),
+ but no compliancy tests have been made. Use iproute2 to select
+ the version you desire.
You need to perform any and all necessary tests yourself before
relying on this code in a safety critical system!
diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c
index c7d1adca3..16737cd8d 100644
--- a/net/hsr/hsr_device.c
+++ b/net/hsr/hsr_device.c
@@ -90,7 +90,8 @@ static void hsr_check_announce(struct net_device *hsr_dev,
hsr = netdev_priv(hsr_dev);
- if ((hsr_dev->operstate == IF_OPER_UP) && (old_operstate != IF_OPER_UP)) {
+ if ((hsr_dev->operstate == IF_OPER_UP)
+ && (old_operstate != IF_OPER_UP)) {
/* Went up */
hsr->announce_count = 0;
hsr->announce_timer.expires = jiffies +
@@ -250,31 +251,22 @@ static const struct header_ops hsr_header_ops = {
.parse = eth_header_parse,
};
-
-/* HSR:2010 supervision frames should be padded so that the whole frame,
- * including headers and FCS, is 64 bytes (without VLAN).
- */
-static int hsr_pad(int size)
-{
- const int min_size = ETH_ZLEN - HSR_HLEN - ETH_HLEN;
-
- if (size >= min_size)
- return size;
- return min_size;
-}
-
-static void send_hsr_supervision_frame(struct hsr_port *master, u8 type)
+static void send_hsr_supervision_frame(struct hsr_port *master,
+ u8 type, u8 hsrVer)
{
struct sk_buff *skb;
int hlen, tlen;
+ struct hsr_tag *hsr_tag;
struct hsr_sup_tag *hsr_stag;
struct hsr_sup_payload *hsr_sp;
unsigned long irqflags;
hlen = LL_RESERVED_SPACE(master->dev);
tlen = master->dev->needed_tailroom;
- skb = alloc_skb(hsr_pad(sizeof(struct hsr_sup_payload)) + hlen + tlen,
- GFP_ATOMIC);
+ skb = dev_alloc_skb(
+ sizeof(struct hsr_tag) +
+ sizeof(struct hsr_sup_tag) +
+ sizeof(struct hsr_sup_payload) + hlen + tlen);
if (skb == NULL)
return;
@@ -282,32 +274,48 @@ static void send_hsr_supervision_frame(struct hsr_port *master, u8 type)
skb_reserve(skb, hlen);
skb->dev = master->dev;
- skb->protocol = htons(ETH_P_PRP);
+ skb->protocol = htons(hsrVer ? ETH_P_HSR : ETH_P_PRP);
skb->priority = TC_PRIO_CONTROL;
- if (dev_hard_header(skb, skb->dev, ETH_P_PRP,
+ if (dev_hard_header(skb, skb->dev, (hsrVer ? ETH_P_HSR : ETH_P_PRP),
master->hsr->sup_multicast_addr,
skb->dev->dev_addr, skb->len) <= 0)
goto out;
skb_reset_mac_header(skb);
- hsr_stag = (typeof(hsr_stag)) skb_put(skb, sizeof(*hsr_stag));
+ if (hsrVer > 0) {
+ hsr_tag = (typeof(hsr_tag)) skb_put(skb, sizeof(struct hsr_tag));
+ hsr_tag->encap_proto = htons(ETH_P_PRP);
+ set_hsr_tag_LSDU_size(hsr_tag, HSR_V1_SUP_LSDUSIZE);
+ }
- set_hsr_stag_path(hsr_stag, 0xf);
- set_hsr_stag_HSR_Ver(hsr_stag, 0);
+ hsr_stag = (typeof(hsr_stag)) skb_put(skb, sizeof(struct hsr_sup_tag));
+ set_hsr_stag_path(hsr_stag, (hsrVer ? 0x0 : 0xf));
+ set_hsr_stag_HSR_Ver(hsr_stag, hsrVer);
+ /* From HSRv1 on we have separate supervision sequence numbers. */
spin_lock_irqsave(&master->hsr->seqnr_lock, irqflags);
- hsr_stag->sequence_nr = htons(master->hsr->sequence_nr);
- master->hsr->sequence_nr++;
+ if (hsrVer > 0) {
+ hsr_stag->sequence_nr = htons(master->hsr->sup_sequence_nr);
+ hsr_tag->sequence_nr = htons(master->hsr->sequence_nr);
+ master->hsr->sup_sequence_nr++;
+ master->hsr->sequence_nr++;
+ } else {
+ hsr_stag->sequence_nr = htons(master->hsr->sequence_nr);
+ master->hsr->sequence_nr++;
+ }
spin_unlock_irqrestore(&master->hsr->seqnr_lock, irqflags);
hsr_stag->HSR_TLV_Type = type;
- hsr_stag->HSR_TLV_Length = 12;
+ /* TODO: Why 12 in HSRv0? */
+ hsr_stag->HSR_TLV_Length = hsrVer ? sizeof(struct hsr_sup_payload) : 12;
/* Payload: MacAddressA */
- hsr_sp = (typeof(hsr_sp)) skb_put(skb, sizeof(*hsr_sp));
+ hsr_sp = (typeof(hsr_sp)) skb_put(skb, sizeof(struct hsr_sup_payload));
ether_addr_copy(hsr_sp->MacAddressA, master->dev->dev_addr);
+ skb_put_padto(skb, ETH_ZLEN + HSR_HLEN);
+
hsr_forward_skb(skb, master);
return;
@@ -329,19 +337,20 @@ static void hsr_announce(unsigned long data)
rcu_read_lock();
master = hsr_port_get_hsr(hsr, HSR_PT_MASTER);
- if (hsr->announce_count < 3) {
- send_hsr_supervision_frame(master, HSR_TLV_ANNOUNCE);
+ if (hsr->announce_count < 3 && hsr->protVersion == 0) {
+ send_hsr_supervision_frame(master, HSR_TLV_ANNOUNCE,
+ hsr->protVersion);
hsr->announce_count++;
- } else {
- send_hsr_supervision_frame(master, HSR_TLV_LIFE_CHECK);
- }
- if (hsr->announce_count < 3)
hsr->announce_timer.expires = jiffies +
msecs_to_jiffies(HSR_ANNOUNCE_INTERVAL);
- else
+ } else {
+ send_hsr_supervision_frame(master, HSR_TLV_LIFE_CHECK,
+ hsr->protVersion);
+
hsr->announce_timer.expires = jiffies +
msecs_to_jiffies(HSR_LIFE_CHECK_INTERVAL);
+ }
if (is_admin_up(master->dev))
add_timer(&hsr->announce_timer);
@@ -428,7 +437,7 @@ static const unsigned char def_multicast_addr[ETH_ALEN] __aligned(2) = {
};
int hsr_dev_finalize(struct net_device *hsr_dev, struct net_device *slave[2],
- unsigned char multicast_spec)
+ unsigned char multicast_spec, u8 protocol_version)
{
struct hsr_priv *hsr;
struct hsr_port *port;
@@ -450,18 +459,17 @@ int hsr_dev_finalize(struct net_device *hsr_dev, struct net_device *slave[2],
spin_lock_init(&hsr->seqnr_lock);
/* Overflow soon to find bugs easier: */
hsr->sequence_nr = HSR_SEQNR_START;
+ hsr->sup_sequence_nr = HSR_SUP_SEQNR_START;
- init_timer(&hsr->announce_timer);
- hsr->announce_timer.function = hsr_announce;
- hsr->announce_timer.data = (unsigned long) hsr;
+ setup_timer(&hsr->announce_timer, hsr_announce, (unsigned long)hsr);
- init_timer(&hsr->prune_timer);
- hsr->prune_timer.function = hsr_prune_nodes;
- hsr->prune_timer.data = (unsigned long) hsr;
+ setup_timer(&hsr->prune_timer, hsr_prune_nodes, (unsigned long)hsr);
ether_addr_copy(hsr->sup_multicast_addr, def_multicast_addr);
hsr->sup_multicast_addr[ETH_ALEN - 1] = multicast_spec;
+ hsr->protVersion = protocol_version;
+
/* FIXME: should I modify the value of these?
*
* - hsr_dev->flags - i.e.
@@ -490,8 +498,7 @@ int hsr_dev_finalize(struct net_device *hsr_dev, struct net_device *slave[2],
if (res)
goto fail;
- hsr->prune_timer.expires = jiffies + msecs_to_jiffies(PRUNE_PERIOD);
- add_timer(&hsr->prune_timer);
+ mod_timer(&hsr->prune_timer, jiffies + msecs_to_jiffies(PRUNE_PERIOD));
return 0;
diff --git a/net/hsr/hsr_device.h b/net/hsr/hsr_device.h
index 108a5d59d..9975e31bb 100644
--- a/net/hsr/hsr_device.h
+++ b/net/hsr/hsr_device.h
@@ -17,7 +17,7 @@
void hsr_dev_setup(struct net_device *dev);
int hsr_dev_finalize(struct net_device *hsr_dev, struct net_device *slave[2],
- unsigned char multicast_spec);
+ unsigned char multicast_spec, u8 protocol_version);
void hsr_check_carrier_and_operstate(struct hsr_priv *hsr);
bool is_hsr_master(struct net_device *dev);
int hsr_get_max_mtu(struct hsr_priv *hsr);
diff --git a/net/hsr/hsr_forward.c b/net/hsr/hsr_forward.c
index 7871ed6d3..5ee1d43f1 100644
--- a/net/hsr/hsr_forward.c
+++ b/net/hsr/hsr_forward.c
@@ -50,21 +50,40 @@ struct hsr_frame_info {
*/
static bool is_supervision_frame(struct hsr_priv *hsr, struct sk_buff *skb)
{
- struct hsr_ethhdr_sp *hdr;
+ struct ethhdr *ethHdr;
+ struct hsr_sup_tag *hsrSupTag;
+ struct hsrv1_ethhdr_sp *hsrV1Hdr;
WARN_ON_ONCE(!skb_mac_header_was_set(skb));
- hdr = (struct hsr_ethhdr_sp *) skb_mac_header(skb);
+ ethHdr = (struct ethhdr *) skb_mac_header(skb);
- if (!ether_addr_equal(hdr->ethhdr.h_dest,
+ /* Correct addr? */
+ if (!ether_addr_equal(ethHdr->h_dest,
hsr->sup_multicast_addr))
return false;
- if (get_hsr_stag_path(&hdr->hsr_sup) != 0x0f)
+ /* Correct ether type?. */
+ if (!(ethHdr->h_proto == htons(ETH_P_PRP)
+ || ethHdr->h_proto == htons(ETH_P_HSR)))
return false;
- if ((hdr->hsr_sup.HSR_TLV_Type != HSR_TLV_ANNOUNCE) &&
- (hdr->hsr_sup.HSR_TLV_Type != HSR_TLV_LIFE_CHECK))
+
+ /* Get the supervision header from correct location. */
+ if (ethHdr->h_proto == htons(ETH_P_HSR)) { /* Okay HSRv1. */
+ hsrV1Hdr = (struct hsrv1_ethhdr_sp *) skb_mac_header(skb);
+ if (hsrV1Hdr->hsr.encap_proto != htons(ETH_P_PRP))
+ return false;
+
+ hsrSupTag = &hsrV1Hdr->hsr_sup;
+ } else {
+ hsrSupTag = &((struct hsrv0_ethhdr_sp *) skb_mac_header(skb))->hsr_sup;
+ }
+
+ if ((hsrSupTag->HSR_TLV_Type != HSR_TLV_ANNOUNCE) &&
+ (hsrSupTag->HSR_TLV_Type != HSR_TLV_LIFE_CHECK))
return false;
- if (hdr->hsr_sup.HSR_TLV_Length != 12)
+ if ((hsrSupTag->HSR_TLV_Length != 12) &&
+ (hsrSupTag->HSR_TLV_Length !=
+ sizeof(struct hsr_sup_payload)))
return false;
return true;
@@ -110,7 +129,7 @@ static struct sk_buff *frame_get_stripped_skb(struct hsr_frame_info *frame,
static void hsr_fill_tag(struct sk_buff *skb, struct hsr_frame_info *frame,
- struct hsr_port *port)
+ struct hsr_port *port, u8 protoVersion)
{
struct hsr_ethhdr *hsr_ethhdr;
int lane_id;
@@ -131,7 +150,8 @@ static void hsr_fill_tag(struct sk_buff *skb, struct hsr_frame_info *frame,
set_hsr_tag_LSDU_size(&hsr_ethhdr->hsr_tag, lsdu_size);
hsr_ethhdr->hsr_tag.sequence_nr = htons(frame->sequence_nr);
hsr_ethhdr->hsr_tag.encap_proto = hsr_ethhdr->ethhdr.h_proto;
- hsr_ethhdr->ethhdr.h_proto = htons(ETH_P_PRP);
+ hsr_ethhdr->ethhdr.h_proto = htons(protoVersion ?
+ ETH_P_HSR : ETH_P_PRP);
}
static struct sk_buff *create_tagged_skb(struct sk_buff *skb_o,
@@ -160,7 +180,7 @@ static struct sk_buff *create_tagged_skb(struct sk_buff *skb_o,
memmove(dst, src, movelen);
skb_reset_mac_header(skb);
- hsr_fill_tag(skb, frame, port);
+ hsr_fill_tag(skb, frame, port, port->hsr->protVersion);
return skb;
}
@@ -320,7 +340,8 @@ static int hsr_fill_frame_info(struct hsr_frame_info *frame,
/* FIXME: */
WARN_ONCE(1, "HSR: VLAN not yet supported");
}
- if (ethhdr->h_proto == htons(ETH_P_PRP)) {
+ if (ethhdr->h_proto == htons(ETH_P_PRP)
+ || ethhdr->h_proto == htons(ETH_P_HSR)) {
frame->skb_std = NULL;
frame->skb_hsr = skb;
frame->sequence_nr = hsr_get_skb_sequence_nr(skb);
diff --git a/net/hsr/hsr_framereg.c b/net/hsr/hsr_framereg.c
index bace124d1..7ea925816 100644
--- a/net/hsr/hsr_framereg.c
+++ b/net/hsr/hsr_framereg.c
@@ -177,17 +177,17 @@ struct hsr_node *hsr_get_node(struct list_head *node_db, struct sk_buff *skb,
return node;
}
- if (!is_sup)
- return NULL; /* Only supervision frame may create node entry */
+ /* Everyone may create a node entry, connected node to a HSR device. */
- if (ethhdr->h_proto == htons(ETH_P_PRP)) {
+ if (ethhdr->h_proto == htons(ETH_P_PRP)
+ || ethhdr->h_proto == htons(ETH_P_HSR)) {
/* Use the existing sequence_nr from the tag as starting point
* for filtering duplicate frames.
*/
seq_out = hsr_get_skb_sequence_nr(skb) - 1;
} else {
WARN_ONCE(1, "%s: Non-HSR frame\n", __func__);
- seq_out = 0;
+ seq_out = HSR_SEQNR_START;
}
return hsr_add_node(node_db, ethhdr->h_source, seq_out);
@@ -200,17 +200,25 @@ struct hsr_node *hsr_get_node(struct list_head *node_db, struct sk_buff *skb,
void hsr_handle_sup_frame(struct sk_buff *skb, struct hsr_node *node_curr,
struct hsr_port *port_rcv)
{
+ struct ethhdr *ethhdr;
struct hsr_node *node_real;
struct hsr_sup_payload *hsr_sp;
struct list_head *node_db;
int i;
- skb_pull(skb, sizeof(struct hsr_ethhdr_sp));
- hsr_sp = (struct hsr_sup_payload *) skb->data;
+ ethhdr = (struct ethhdr *) skb_mac_header(skb);
- if (ether_addr_equal(eth_hdr(skb)->h_source, hsr_sp->MacAddressA))
- /* Not sent from MacAddressB of a PICS_SUBS capable node */
- goto done;
+ /* Leave the ethernet header. */
+ skb_pull(skb, sizeof(struct ethhdr));
+
+ /* And leave the HSR tag. */
+ if (ethhdr->h_proto == htons(ETH_P_HSR))
+ skb_pull(skb, sizeof(struct hsr_tag));
+
+ /* And leave the HSR sup tag. */
+ skb_pull(skb, sizeof(struct hsr_sup_tag));
+
+ hsr_sp = (struct hsr_sup_payload *) skb->data;
/* Merge node_curr (registered on MacAddressB) into node_real */
node_db = &port_rcv->hsr->node_db;
@@ -225,7 +233,7 @@ void hsr_handle_sup_frame(struct sk_buff *skb, struct hsr_node *node_curr,
/* Node has already been merged */
goto done;
- ether_addr_copy(node_real->MacAddressB, eth_hdr(skb)->h_source);
+ ether_addr_copy(node_real->MacAddressB, ethhdr->h_source);
for (i = 0; i < HSR_PT_PORTS; i++) {
if (!node_curr->time_in_stale[i] &&
time_after(node_curr->time_in[i], node_real->time_in[i])) {
@@ -241,7 +249,7 @@ void hsr_handle_sup_frame(struct sk_buff *skb, struct hsr_node *node_curr,
kfree_rcu(node_curr, rcu_head);
done:
- skb_push(skb, sizeof(struct hsr_ethhdr_sp));
+ skb_push(skb, sizeof(struct hsrv1_ethhdr_sp));
}
diff --git a/net/hsr/hsr_main.h b/net/hsr/hsr_main.h
index 5a9c69962..9b9909e89 100644
--- a/net/hsr/hsr_main.h
+++ b/net/hsr/hsr_main.h
@@ -30,6 +30,7 @@
*/
#define MAX_SLAVE_DIFF 3000 /* ms */
#define HSR_SEQNR_START (USHRT_MAX - 1024)
+#define HSR_SUP_SEQNR_START (HSR_SEQNR_START / 2)
/* How often shall we check for broken ring and remove node entries older than
@@ -58,6 +59,8 @@ struct hsr_tag {
#define HSR_HLEN 6
+#define HSR_V1_SUP_LSDUSIZE 52
+
/* The helper functions below assumes that 'path' occupies the 4 most
* significant bits of the 16-bit field shared by 'path' and 'LSDU_size' (or
* equivalently, the 4 most significant bits of HSR tag byte 14).
@@ -131,8 +134,14 @@ static inline void set_hsr_stag_HSR_Ver(struct hsr_sup_tag *hst, u16 HSR_Ver)
set_hsr_tag_LSDU_size((struct hsr_tag *) hst, HSR_Ver);
}
-struct hsr_ethhdr_sp {
+struct hsrv0_ethhdr_sp {
+ struct ethhdr ethhdr;
+ struct hsr_sup_tag hsr_sup;
+} __packed;
+
+struct hsrv1_ethhdr_sp {
struct ethhdr ethhdr;
+ struct hsr_tag hsr;
struct hsr_sup_tag hsr_sup;
} __packed;
@@ -162,6 +171,8 @@ struct hsr_priv {
struct timer_list prune_timer;
int announce_count;
u16 sequence_nr;
+ u16 sup_sequence_nr; /* For HSRv1 separate seq_nr for supervision */
+ u8 protVersion; /* Indicate if HSRv0 or HSRv1. */
spinlock_t seqnr_lock; /* locking for sequence_nr */
unsigned char sup_multicast_addr[ETH_ALEN];
};
diff --git a/net/hsr/hsr_netlink.c b/net/hsr/hsr_netlink.c
index a2c7e4c0a..d4d1617f4 100644
--- a/net/hsr/hsr_netlink.c
+++ b/net/hsr/hsr_netlink.c
@@ -23,7 +23,8 @@ static const struct nla_policy hsr_policy[IFLA_HSR_MAX + 1] = {
[IFLA_HSR_SLAVE1] = { .type = NLA_U32 },
[IFLA_HSR_SLAVE2] = { .type = NLA_U32 },
[IFLA_HSR_MULTICAST_SPEC] = { .type = NLA_U8 },
- [IFLA_HSR_SUPERVISION_ADDR] = { .type = NLA_BINARY, .len = ETH_ALEN },
+ [IFLA_HSR_VERSION] = { .type = NLA_U8 },
+ [IFLA_HSR_SUPERVISION_ADDR] = { .len = ETH_ALEN },
[IFLA_HSR_SEQ_NR] = { .type = NLA_U16 },
};
@@ -35,7 +36,7 @@ static int hsr_newlink(struct net *src_net, struct net_device *dev,
struct nlattr *tb[], struct nlattr *data[])
{
struct net_device *link[2];
- unsigned char multicast_spec;
+ unsigned char multicast_spec, hsr_version;
if (!data) {
netdev_info(dev, "HSR: No slave devices specified\n");
@@ -62,7 +63,12 @@ static int hsr_newlink(struct net *src_net, struct net_device *dev,
else
multicast_spec = nla_get_u8(data[IFLA_HSR_MULTICAST_SPEC]);
- return hsr_dev_finalize(dev, link, multicast_spec);
+ if (!data[IFLA_HSR_VERSION])
+ hsr_version = 0;
+ else
+ hsr_version = nla_get_u8(data[IFLA_HSR_VERSION]);
+
+ return hsr_dev_finalize(dev, link, multicast_spec, hsr_version);
}
static int hsr_fill_info(struct sk_buff *skb, const struct net_device *dev)
@@ -115,10 +121,9 @@ static struct rtnl_link_ops hsr_link_ops __read_mostly = {
/* attribute policy */
-/* NLA_BINARY missing in libnl; use NLA_UNSPEC in userspace instead. */
static const struct nla_policy hsr_genl_policy[HSR_A_MAX + 1] = {
- [HSR_A_NODE_ADDR] = { .type = NLA_BINARY, .len = ETH_ALEN },
- [HSR_A_NODE_ADDR_B] = { .type = NLA_BINARY, .len = ETH_ALEN },
+ [HSR_A_NODE_ADDR] = { .len = ETH_ALEN },
+ [HSR_A_NODE_ADDR_B] = { .len = ETH_ALEN },
[HSR_A_IFINDEX] = { .type = NLA_U32 },
[HSR_A_IF1_AGE] = { .type = NLA_U32 },
[HSR_A_IF2_AGE] = { .type = NLA_U32 },
diff --git a/net/hsr/hsr_slave.c b/net/hsr/hsr_slave.c
index 7d37366cc..f5b60388d 100644
--- a/net/hsr/hsr_slave.c
+++ b/net/hsr/hsr_slave.c
@@ -22,6 +22,7 @@ static rx_handler_result_t hsr_handle_frame(struct sk_buff **pskb)
{
struct sk_buff *skb = *pskb;
struct hsr_port *port;
+ u16 protocol;
if (!skb_mac_header_was_set(skb)) {
WARN_ONCE(1, "%s: skb invalid", __func__);
@@ -37,7 +38,8 @@ static rx_handler_result_t hsr_handle_frame(struct sk_buff **pskb)
goto finish_consume;
}
- if (eth_hdr(skb)->h_proto != htons(ETH_P_PRP))
+ protocol = eth_hdr(skb)->h_proto;
+ if (protocol != htons(ETH_P_PRP) && protocol != htons(ETH_P_HSR))
goto finish_pass;
skb_push(skb, ETH_HLEN);
diff --git a/net/ieee802154/6lowpan/6lowpan_i.h b/net/ieee802154/6lowpan/6lowpan_i.h
index b4e17a7c0..5ac778962 100644
--- a/net/ieee802154/6lowpan/6lowpan_i.h
+++ b/net/ieee802154/6lowpan/6lowpan_i.h
@@ -41,24 +41,12 @@ static inline u32 ieee802154_addr_hash(const struct ieee802154_addr *a)
return (((__force u64)a->extended_addr) >> 32) ^
(((__force u64)a->extended_addr) & 0xffffffff);
case IEEE802154_ADDR_SHORT:
- return (__force u32)(a->short_addr);
+ return (__force u32)(a->short_addr + (a->pan_id << 16));
default:
return 0;
}
}
-/* private device info */
-struct lowpan_dev_info {
- struct net_device *wdev; /* wpan device ptr */
- u16 fragment_tag;
-};
-
-static inline struct
-lowpan_dev_info *lowpan_dev_info(const struct net_device *dev)
-{
- return (struct lowpan_dev_info *)lowpan_priv(dev)->priv;
-}
-
int lowpan_frag_rcv(struct sk_buff *skb, const u8 frag_type);
void lowpan_net_frag_exit(void);
int lowpan_net_frag_init(void);
diff --git a/net/ieee802154/6lowpan/core.c b/net/ieee802154/6lowpan/core.c
index 0023c9048..dd085db85 100644
--- a/net/ieee802154/6lowpan/core.c
+++ b/net/ieee802154/6lowpan/core.c
@@ -148,7 +148,7 @@ static int lowpan_newlink(struct net *src_net, struct net_device *ldev,
return -EBUSY;
}
- lowpan_dev_info(ldev)->wdev = wdev;
+ lowpan_802154_dev(ldev)->wdev = wdev;
/* Set the lowpan hardware address to the wpan hardware address. */
memcpy(ldev->dev_addr, wdev->dev_addr, IEEE802154_ADDR_LEN);
/* We need headroom for possible wpan_dev_hard_header call and tailroom
@@ -173,7 +173,7 @@ static int lowpan_newlink(struct net *src_net, struct net_device *ldev,
static void lowpan_dellink(struct net_device *ldev, struct list_head *head)
{
- struct net_device *wdev = lowpan_dev_info(ldev)->wdev;
+ struct net_device *wdev = lowpan_802154_dev(ldev)->wdev;
ASSERT_RTNL();
@@ -184,7 +184,7 @@ static void lowpan_dellink(struct net_device *ldev, struct list_head *head)
static struct rtnl_link_ops lowpan_link_ops __read_mostly = {
.kind = "lowpan",
- .priv_size = LOWPAN_PRIV_SIZE(sizeof(struct lowpan_dev_info)),
+ .priv_size = LOWPAN_PRIV_SIZE(sizeof(struct lowpan_802154_dev)),
.setup = lowpan_setup,
.newlink = lowpan_newlink,
.dellink = lowpan_dellink,
diff --git a/net/ieee802154/6lowpan/tx.c b/net/ieee802154/6lowpan/tx.c
index d4353face..e459afd16 100644
--- a/net/ieee802154/6lowpan/tx.c
+++ b/net/ieee802154/6lowpan/tx.c
@@ -84,7 +84,7 @@ static struct sk_buff*
lowpan_alloc_frag(struct sk_buff *skb, int size,
const struct ieee802154_hdr *master_hdr, bool frag1)
{
- struct net_device *wdev = lowpan_dev_info(skb->dev)->wdev;
+ struct net_device *wdev = lowpan_802154_dev(skb->dev)->wdev;
struct sk_buff *frag;
int rc;
@@ -148,8 +148,8 @@ lowpan_xmit_fragmented(struct sk_buff *skb, struct net_device *ldev,
int frag_cap, frag_len, payload_cap, rc;
int skb_unprocessed, skb_offset;
- frag_tag = htons(lowpan_dev_info(ldev)->fragment_tag);
- lowpan_dev_info(ldev)->fragment_tag++;
+ frag_tag = htons(lowpan_802154_dev(ldev)->fragment_tag);
+ lowpan_802154_dev(ldev)->fragment_tag++;
frag_hdr[0] = LOWPAN_DISPATCH_FRAG1 | ((dgram_size >> 8) & 0x07);
frag_hdr[1] = dgram_size & 0xff;
@@ -208,7 +208,7 @@ err:
static int lowpan_header(struct sk_buff *skb, struct net_device *ldev,
u16 *dgram_size, u16 *dgram_offset)
{
- struct wpan_dev *wpan_dev = lowpan_dev_info(ldev)->wdev->ieee802154_ptr;
+ struct wpan_dev *wpan_dev = lowpan_802154_dev(ldev)->wdev->ieee802154_ptr;
struct ieee802154_addr sa, da;
struct ieee802154_mac_cb *cb = mac_cb_init(skb);
struct lowpan_addr_info info;
@@ -248,8 +248,8 @@ static int lowpan_header(struct sk_buff *skb, struct net_device *ldev,
cb->ackreq = wpan_dev->ackreq;
}
- return wpan_dev_hard_header(skb, lowpan_dev_info(ldev)->wdev, &da, &sa,
- 0);
+ return wpan_dev_hard_header(skb, lowpan_802154_dev(ldev)->wdev, &da,
+ &sa, 0);
}
netdev_tx_t lowpan_xmit(struct sk_buff *skb, struct net_device *ldev)
@@ -283,7 +283,7 @@ netdev_tx_t lowpan_xmit(struct sk_buff *skb, struct net_device *ldev)
max_single = ieee802154_max_payload(&wpan_hdr);
if (skb_tail_pointer(skb) - skb_network_header(skb) <= max_single) {
- skb->dev = lowpan_dev_info(ldev)->wdev;
+ skb->dev = lowpan_802154_dev(ldev)->wdev;
ldev->stats.tx_packets++;
ldev->stats.tx_bytes += dgram_size;
return dev_queue_xmit(skb);
diff --git a/net/ieee802154/nl-mac.c b/net/ieee802154/nl-mac.c
index 3503c3895..d3cbb3258 100644
--- a/net/ieee802154/nl-mac.c
+++ b/net/ieee802154/nl-mac.c
@@ -34,9 +34,11 @@
#include "ieee802154.h"
-static int nla_put_hwaddr(struct sk_buff *msg, int type, __le64 hwaddr)
+static int nla_put_hwaddr(struct sk_buff *msg, int type, __le64 hwaddr,
+ int padattr)
{
- return nla_put_u64(msg, type, swab64((__force u64)hwaddr));
+ return nla_put_u64_64bit(msg, type, swab64((__force u64)hwaddr),
+ padattr);
}
static __le64 nla_get_hwaddr(const struct nlattr *nla)
@@ -623,7 +625,8 @@ ieee802154_llsec_fill_key_id(struct sk_buff *msg,
if (desc->device_addr.mode == IEEE802154_ADDR_LONG &&
nla_put_hwaddr(msg, IEEE802154_ATTR_HW_ADDR,
- desc->device_addr.extended_addr))
+ desc->device_addr.extended_addr,
+ IEEE802154_ATTR_PAD))
return -EMSGSIZE;
}
@@ -638,7 +641,7 @@ ieee802154_llsec_fill_key_id(struct sk_buff *msg,
if (desc->mode == IEEE802154_SCF_KEY_HW_INDEX &&
nla_put_hwaddr(msg, IEEE802154_ATTR_LLSEC_KEY_SOURCE_EXTENDED,
- desc->extended_source))
+ desc->extended_source, IEEE802154_ATTR_PAD))
return -EMSGSIZE;
return 0;
@@ -1063,7 +1066,8 @@ ieee802154_nl_fill_dev(struct sk_buff *msg, u32 portid, u32 seq,
nla_put_shortaddr(msg, IEEE802154_ATTR_PAN_ID, desc->pan_id) ||
nla_put_shortaddr(msg, IEEE802154_ATTR_SHORT_ADDR,
desc->short_addr) ||
- nla_put_hwaddr(msg, IEEE802154_ATTR_HW_ADDR, desc->hwaddr) ||
+ nla_put_hwaddr(msg, IEEE802154_ATTR_HW_ADDR, desc->hwaddr,
+ IEEE802154_ATTR_PAD) ||
nla_put_u32(msg, IEEE802154_ATTR_LLSEC_FRAME_COUNTER,
desc->frame_counter) ||
nla_put_u8(msg, IEEE802154_ATTR_LLSEC_DEV_OVERRIDE,
@@ -1167,7 +1171,8 @@ ieee802154_nl_fill_devkey(struct sk_buff *msg, u32 portid, u32 seq,
if (nla_put_string(msg, IEEE802154_ATTR_DEV_NAME, dev->name) ||
nla_put_u32(msg, IEEE802154_ATTR_DEV_INDEX, dev->ifindex) ||
- nla_put_hwaddr(msg, IEEE802154_ATTR_HW_ADDR, devaddr) ||
+ nla_put_hwaddr(msg, IEEE802154_ATTR_HW_ADDR, devaddr,
+ IEEE802154_ATTR_PAD) ||
nla_put_u32(msg, IEEE802154_ATTR_LLSEC_FRAME_COUNTER,
devkey->frame_counter) ||
ieee802154_llsec_fill_key_id(msg, &devkey->key_id))
diff --git a/net/ieee802154/nl802154.c b/net/ieee802154/nl802154.c
index 16ef0d9f5..116187b5c 100644
--- a/net/ieee802154/nl802154.c
+++ b/net/ieee802154/nl802154.c
@@ -722,7 +722,8 @@ ieee802154_llsec_send_key_id(struct sk_buff *msg,
break;
case NL802154_DEV_ADDR_EXTENDED:
if (nla_put_le64(msg, NL802154_DEV_ADDR_ATTR_EXTENDED,
- desc->device_addr.extended_addr))
+ desc->device_addr.extended_addr,
+ NL802154_DEV_ADDR_ATTR_PAD))
return -ENOBUFS;
break;
default:
@@ -742,7 +743,8 @@ ieee802154_llsec_send_key_id(struct sk_buff *msg,
break;
case NL802154_KEY_ID_MODE_INDEX_EXTENDED:
if (nla_put_le64(msg, NL802154_KEY_ID_ATTR_SOURCE_EXTENDED,
- desc->extended_source))
+ desc->extended_source,
+ NL802154_KEY_ID_ATTR_PAD))
return -ENOBUFS;
break;
default:
@@ -811,7 +813,8 @@ nl802154_send_iface(struct sk_buff *msg, u32 portid, u32 seq, int flags,
if (nla_put_u32(msg, NL802154_ATTR_WPAN_PHY, rdev->wpan_phy_idx) ||
nla_put_u32(msg, NL802154_ATTR_IFTYPE, wpan_dev->iftype) ||
- nla_put_u64(msg, NL802154_ATTR_WPAN_DEV, wpan_dev_id(wpan_dev)) ||
+ nla_put_u64_64bit(msg, NL802154_ATTR_WPAN_DEV,
+ wpan_dev_id(wpan_dev), NL802154_ATTR_PAD) ||
nla_put_u32(msg, NL802154_ATTR_GENERATION,
rdev->devlist_generation ^
(cfg802154_rdev_list_generation << 2)))
@@ -819,7 +822,8 @@ nl802154_send_iface(struct sk_buff *msg, u32 portid, u32 seq, int flags,
/* address settings */
if (nla_put_le64(msg, NL802154_ATTR_EXTENDED_ADDR,
- wpan_dev->extended_addr) ||
+ wpan_dev->extended_addr,
+ NL802154_ATTR_PAD) ||
nla_put_le16(msg, NL802154_ATTR_SHORT_ADDR,
wpan_dev->short_addr) ||
nla_put_le16(msg, NL802154_ATTR_PAN_ID, wpan_dev->pan_id))
@@ -1074,6 +1078,11 @@ static int nl802154_set_pan_id(struct sk_buff *skb, struct genl_info *info)
if (netif_running(dev))
return -EBUSY;
+ if (wpan_dev->lowpan_dev) {
+ if (netif_running(wpan_dev->lowpan_dev))
+ return -EBUSY;
+ }
+
/* don't change address fields on monitor */
if (wpan_dev->iftype == NL802154_IFTYPE_MONITOR ||
!info->attrs[NL802154_ATTR_PAN_ID])
@@ -1105,6 +1114,11 @@ static int nl802154_set_short_addr(struct sk_buff *skb, struct genl_info *info)
if (netif_running(dev))
return -EBUSY;
+ if (wpan_dev->lowpan_dev) {
+ if (netif_running(wpan_dev->lowpan_dev))
+ return -EBUSY;
+ }
+
/* don't change address fields on monitor */
if (wpan_dev->iftype == NL802154_IFTYPE_MONITOR ||
!info->attrs[NL802154_ATTR_SHORT_ADDR])
@@ -1275,8 +1289,8 @@ ieee802154_llsec_parse_dev_addr(struct nlattr *nla,
nl802154_dev_addr_policy))
return -EINVAL;
- if (!attrs[NL802154_DEV_ADDR_ATTR_PAN_ID] &&
- !attrs[NL802154_DEV_ADDR_ATTR_MODE] &&
+ if (!attrs[NL802154_DEV_ADDR_ATTR_PAN_ID] ||
+ !attrs[NL802154_DEV_ADDR_ATTR_MODE] ||
!(attrs[NL802154_DEV_ADDR_ATTR_SHORT] ||
attrs[NL802154_DEV_ADDR_ATTR_EXTENDED]))
return -EINVAL;
@@ -1614,7 +1628,7 @@ static int nl802154_send_device(struct sk_buff *msg, u32 cmd, u32 portid,
nla_put_le16(msg, NL802154_DEV_ATTR_SHORT_ADDR,
dev_desc->short_addr) ||
nla_put_le64(msg, NL802154_DEV_ATTR_EXTENDED_ADDR,
- dev_desc->hwaddr) ||
+ dev_desc->hwaddr, NL802154_DEV_ATTR_PAD) ||
nla_put_u8(msg, NL802154_DEV_ATTR_SECLEVEL_EXEMPT,
dev_desc->seclevel_exempt) ||
nla_put_u32(msg, NL802154_DEV_ATTR_KEY_MODE, dev_desc->key_mode))
@@ -1778,7 +1792,7 @@ static int nl802154_send_devkey(struct sk_buff *msg, u32 cmd, u32 portid,
goto nla_put_failure;
if (nla_put_le64(msg, NL802154_DEVKEY_ATTR_EXTENDED_ADDR,
- extended_addr) ||
+ extended_addr, NL802154_DEVKEY_ATTR_PAD) ||
nla_put_u32(msg, NL802154_DEVKEY_ATTR_FRAME_COUNTER,
devkey->frame_counter))
goto nla_put_failure;
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 7ad0e567c..d39e9e47a 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -948,6 +948,7 @@ const struct proto_ops inet_dgram_ops = {
.recvmsg = inet_recvmsg,
.mmap = sock_no_mmap,
.sendpage = inet_sendpage,
+ .set_peek_off = sk_set_peek_off,
#ifdef CONFIG_COMPAT
.compat_setsockopt = compat_sock_common_setsockopt,
.compat_getsockopt = compat_sock_common_getsockopt,
@@ -1106,7 +1107,7 @@ static int inet_sk_reselect_saddr(struct sock *sk)
struct ip_options_rcu *inet_opt;
inet_opt = rcu_dereference_protected(inet->inet_opt,
- sock_owned_by_user(sk));
+ lockdep_sock_is_held(sk));
if (inet_opt && inet_opt->opt.srr)
daddr = inet_opt->opt.faddr;
@@ -1191,35 +1192,19 @@ int inet_sk_rebuild_header(struct sock *sk)
}
EXPORT_SYMBOL(inet_sk_rebuild_header);
-static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
- netdev_features_t features)
+struct sk_buff *inet_gso_segment(struct sk_buff *skb,
+ netdev_features_t features)
{
+ bool udpfrag = false, fixedid = false, encap;
struct sk_buff *segs = ERR_PTR(-EINVAL);
const struct net_offload *ops;
unsigned int offset = 0;
- bool udpfrag, encap;
struct iphdr *iph;
- int proto;
+ int proto, tot_len;
int nhoff;
int ihl;
int id;
- if (unlikely(skb_shinfo(skb)->gso_type &
- ~(SKB_GSO_TCPV4 |
- SKB_GSO_UDP |
- SKB_GSO_DODGY |
- SKB_GSO_TCP_ECN |
- SKB_GSO_GRE |
- SKB_GSO_GRE_CSUM |
- SKB_GSO_IPIP |
- SKB_GSO_SIT |
- SKB_GSO_TCPV6 |
- SKB_GSO_UDP_TUNNEL |
- SKB_GSO_UDP_TUNNEL_CSUM |
- SKB_GSO_TUNNEL_REMCSUM |
- 0)))
- goto out;
-
skb_reset_network_header(skb);
nhoff = skb_network_header(skb) - skb_mac_header(skb);
if (unlikely(!pskb_may_pull(skb, sizeof(*iph))))
@@ -1247,11 +1232,14 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
segs = ERR_PTR(-EPROTONOSUPPORT);
- if (skb->encapsulation &&
- skb_shinfo(skb)->gso_type & (SKB_GSO_SIT|SKB_GSO_IPIP))
- udpfrag = proto == IPPROTO_UDP && encap;
- else
- udpfrag = proto == IPPROTO_UDP && !skb->encapsulation;
+ if (!skb->encapsulation || encap) {
+ udpfrag = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP);
+ fixedid = !!(skb_shinfo(skb)->gso_type & SKB_GSO_TCP_FIXEDID);
+
+ /* fixed ID is invalid if DF bit is not set */
+ if (fixedid && !(iph->frag_off & htons(IP_DF)))
+ goto out;
+ }
ops = rcu_dereference(inet_offloads[proto]);
if (likely(ops && ops->callbacks.gso_segment))
@@ -1264,15 +1252,25 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
do {
iph = (struct iphdr *)(skb_mac_header(skb) + nhoff);
if (udpfrag) {
- iph->id = htons(id);
iph->frag_off = htons(offset >> 3);
if (skb->next)
iph->frag_off |= htons(IP_MF);
offset += skb->len - nhoff - ihl;
+ tot_len = skb->len - nhoff;
+ } else if (skb_is_gso(skb)) {
+ if (!fixedid) {
+ iph->id = htons(id);
+ id += skb_shinfo(skb)->gso_segs;
+ }
+ tot_len = skb_shinfo(skb)->gso_size +
+ SKB_GSO_CB(skb)->data_offset +
+ skb->head - (unsigned char *)iph;
} else {
- iph->id = htons(id++);
+ if (!fixedid)
+ iph->id = htons(id++);
+ tot_len = skb->len - nhoff;
}
- iph->tot_len = htons(skb->len - nhoff);
+ iph->tot_len = htons(tot_len);
ip_send_check(iph);
if (encap)
skb_reset_inner_headers(skb);
@@ -1282,9 +1280,9 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
out:
return segs;
}
+EXPORT_SYMBOL(inet_gso_segment);
-static struct sk_buff **inet_gro_receive(struct sk_buff **head,
- struct sk_buff *skb)
+struct sk_buff **inet_gro_receive(struct sk_buff **head, struct sk_buff *skb)
{
const struct net_offload *ops;
struct sk_buff **pp = NULL;
@@ -1324,6 +1322,7 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head,
for (p = *head; p; p = p->next) {
struct iphdr *iph2;
+ u16 flush_id;
if (!NAPI_GRO_CB(p)->same_flow)
continue;
@@ -1347,16 +1346,36 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head,
(iph->tos ^ iph2->tos) |
((iph->frag_off ^ iph2->frag_off) & htons(IP_DF));
- /* Save the IP ID check to be included later when we get to
- * the transport layer so only the inner most IP ID is checked.
- * This is because some GSO/TSO implementations do not
- * correctly increment the IP ID for the outer hdrs.
- */
- NAPI_GRO_CB(p)->flush_id =
- ((u16)(ntohs(iph2->id) + NAPI_GRO_CB(p)->count) ^ id);
NAPI_GRO_CB(p)->flush |= flush;
+
+ /* We need to store of the IP ID check to be included later
+ * when we can verify that this packet does in fact belong
+ * to a given flow.
+ */
+ flush_id = (u16)(id - ntohs(iph2->id));
+
+ /* This bit of code makes it much easier for us to identify
+ * the cases where we are doing atomic vs non-atomic IP ID
+ * checks. Specifically an atomic check can return IP ID
+ * values 0 - 0xFFFF, while a non-atomic check can only
+ * return 0 or 0xFFFF.
+ */
+ if (!NAPI_GRO_CB(p)->is_atomic ||
+ !(iph->frag_off & htons(IP_DF))) {
+ flush_id ^= NAPI_GRO_CB(p)->count;
+ flush_id = flush_id ? 0xFFFF : 0;
+ }
+
+ /* If the previous IP ID value was based on an atomic
+ * datagram we can overwrite the value and ignore it.
+ */
+ if (NAPI_GRO_CB(skb)->is_atomic)
+ NAPI_GRO_CB(p)->flush_id = flush_id;
+ else
+ NAPI_GRO_CB(p)->flush_id |= flush_id;
}
+ NAPI_GRO_CB(skb)->is_atomic = !!(iph->frag_off & htons(IP_DF));
NAPI_GRO_CB(skb)->flush |= flush;
skb_set_network_header(skb, off);
/* The above will be needed by the transport layer if there is one
@@ -1379,6 +1398,7 @@ out:
return pp;
}
+EXPORT_SYMBOL(inet_gro_receive);
static struct sk_buff **ipip_gro_receive(struct sk_buff **head,
struct sk_buff *skb)
@@ -1430,7 +1450,7 @@ int inet_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len)
return -EINVAL;
}
-static int inet_gro_complete(struct sk_buff *skb, int nhoff)
+int inet_gro_complete(struct sk_buff *skb, int nhoff)
{
__be16 newlen = htons(skb->len - nhoff);
struct iphdr *iph = (struct iphdr *)(skb->data + nhoff);
@@ -1460,11 +1480,12 @@ out_unlock:
return err;
}
+EXPORT_SYMBOL(inet_gro_complete);
static int ipip_gro_complete(struct sk_buff *skb, int nhoff)
{
skb->encapsulation = 1;
- skb_shinfo(skb)->gso_type |= SKB_GSO_IPIP;
+ skb_shinfo(skb)->gso_type |= SKB_GSO_IPXIP4;
return inet_gro_complete(skb, nhoff);
}
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index c34c7544d..89a8cac47 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -436,7 +436,7 @@ static int arp_filter(__be32 sip, __be32 tip, struct net_device *dev)
if (IS_ERR(rt))
return 1;
if (rt->dst.dev != dev) {
- NET_INC_STATS_BH(net, LINUX_MIB_ARPFILTER);
+ __NET_INC_STATS(net, LINUX_MIB_ARPFILTER);
flag = 1;
}
ip_rt_put(rt);
diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c
index bdb2a07ec..40d6b8771 100644
--- a/net/ipv4/cipso_ipv4.c
+++ b/net/ipv4/cipso_ipv4.c
@@ -1933,7 +1933,8 @@ int cipso_v4_sock_setattr(struct sock *sk,
sk_inet = inet_sk(sk);
- old = rcu_dereference_protected(sk_inet->inet_opt, sock_owned_by_user(sk));
+ old = rcu_dereference_protected(sk_inet->inet_opt,
+ lockdep_sock_is_held(sk));
if (sk_inet->is_icsk) {
sk_conn = inet_csk(sk);
if (old)
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 63566ec54..ef2ebeb89 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -110,6 +110,7 @@ struct fib_table *fib_new_table(struct net *net, u32 id)
hlist_add_head_rcu(&tb->tb_hlist, &net->ipv4.fib_table_hash[h]);
return tb;
}
+EXPORT_SYMBOL_GPL(fib_new_table);
/* caller must hold either rtnl or rcu read lock */
struct fib_table *fib_get_table(struct net *net, u32 id)
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 2b68418c7..539fa264e 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -479,6 +479,9 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
if (!rtnh_ok(rtnh, remaining))
return -EINVAL;
+ if (rtnh->rtnh_flags & (RTNH_F_DEAD | RTNH_F_LINKDOWN))
+ return -EINVAL;
+
nexthop_nh->nh_flags =
(cfg->fc_flags & ~0xFF) | rtnh->rtnh_flags;
nexthop_nh->nh_oif = rtnh->rtnh_ifindex;
@@ -1003,6 +1006,9 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
if (fib_props[cfg->fc_type].scope > cfg->fc_scope)
goto err_inval;
+ if (cfg->fc_flags & (RTNH_F_DEAD | RTNH_F_LINKDOWN))
+ goto err_inval;
+
#ifdef CONFIG_IP_ROUTE_MULTIPATH
if (cfg->fc_mp) {
nhs = fib_count_nexthops(cfg->fc_mp, cfg->fc_mp_len);
@@ -1561,21 +1567,45 @@ int fib_sync_up(struct net_device *dev, unsigned int nh_flags)
}
#ifdef CONFIG_IP_ROUTE_MULTIPATH
+static bool fib_good_nh(const struct fib_nh *nh)
+{
+ int state = NUD_REACHABLE;
+
+ if (nh->nh_scope == RT_SCOPE_LINK) {
+ struct neighbour *n;
+
+ rcu_read_lock_bh();
+
+ n = __ipv4_neigh_lookup_noref(nh->nh_dev, nh->nh_gw);
+ if (n)
+ state = n->nud_state;
+
+ rcu_read_unlock_bh();
+ }
+
+ return !!(state & NUD_VALID);
+}
void fib_select_multipath(struct fib_result *res, int hash)
{
struct fib_info *fi = res->fi;
+ struct net *net = fi->fib_net;
+ bool first = false;
for_nexthops(fi) {
if (hash > atomic_read(&nh->nh_upper_bound))
continue;
- res->nh_sel = nhsel;
- return;
+ if (!net->ipv4.sysctl_fib_multipath_use_neigh ||
+ fib_good_nh(nh)) {
+ res->nh_sel = nhsel;
+ return;
+ }
+ if (!first) {
+ res->nh_sel = nhsel;
+ first = true;
+ }
} endfor_nexthops(fi);
-
- /* Race condition: route has just become dead. */
- res->nh_sel = 0;
}
#endif
diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c
index a6962ccad..5f9207c03 100644
--- a/net/ipv4/fou.c
+++ b/net/ipv4/fou.c
@@ -21,8 +21,8 @@ struct fou {
u8 protocol;
u8 flags;
__be16 port;
+ u8 family;
u16 type;
- struct udp_offload udp_offloads;
struct list_head list;
struct rcu_head rcu;
};
@@ -48,14 +48,17 @@ static inline struct fou *fou_from_sock(struct sock *sk)
return sk->sk_user_data;
}
-static int fou_recv_pull(struct sk_buff *skb, size_t len)
+static int fou_recv_pull(struct sk_buff *skb, struct fou *fou, size_t len)
{
- struct iphdr *iph = ip_hdr(skb);
-
/* Remove 'len' bytes from the packet (UDP header and
* FOU header if present).
*/
- iph->tot_len = htons(ntohs(iph->tot_len) - len);
+ if (fou->family == AF_INET)
+ ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(skb)->tot_len) - len);
+ else
+ ipv6_hdr(skb)->payload_len =
+ htons(ntohs(ipv6_hdr(skb)->payload_len) - len);
+
__skb_pull(skb, len);
skb_postpull_rcsum(skb, udp_hdr(skb), len);
skb_reset_transport_header(skb);
@@ -69,7 +72,7 @@ static int fou_udp_recv(struct sock *sk, struct sk_buff *skb)
if (!fou)
return 1;
- if (fou_recv_pull(skb, sizeof(struct udphdr)))
+ if (fou_recv_pull(skb, fou, sizeof(struct udphdr)))
goto drop;
return -fou->protocol;
@@ -142,7 +145,11 @@ static int gue_udp_recv(struct sock *sk, struct sk_buff *skb)
hdrlen = sizeof(struct guehdr) + optlen;
- ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(skb)->tot_len) - len);
+ if (fou->family == AF_INET)
+ ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(skb)->tot_len) - len);
+ else
+ ipv6_hdr(skb)->payload_len =
+ htons(ntohs(ipv6_hdr(skb)->payload_len) - len);
/* Pull csum through the guehdr now . This can be used if
* there is a remote checksum offload.
@@ -186,13 +193,13 @@ drop:
return 0;
}
-static struct sk_buff **fou_gro_receive(struct sk_buff **head,
- struct sk_buff *skb,
- struct udp_offload *uoff)
+static struct sk_buff **fou_gro_receive(struct sock *sk,
+ struct sk_buff **head,
+ struct sk_buff *skb)
{
const struct net_offload *ops;
struct sk_buff **pp = NULL;
- u8 proto = NAPI_GRO_CB(skb)->proto;
+ u8 proto = fou_from_sock(sk)->protocol;
const struct net_offload **offloads;
/* We can clear the encap_mark for FOU as we are essentially doing
@@ -220,11 +227,11 @@ out_unlock:
return pp;
}
-static int fou_gro_complete(struct sk_buff *skb, int nhoff,
- struct udp_offload *uoff)
+static int fou_gro_complete(struct sock *sk, struct sk_buff *skb,
+ int nhoff)
{
const struct net_offload *ops;
- u8 proto = NAPI_GRO_CB(skb)->proto;
+ u8 proto = fou_from_sock(sk)->protocol;
int err = -ENOSYS;
const struct net_offload **offloads;
@@ -267,9 +274,9 @@ static struct guehdr *gue_gro_remcsum(struct sk_buff *skb, unsigned int off,
return guehdr;
}
-static struct sk_buff **gue_gro_receive(struct sk_buff **head,
- struct sk_buff *skb,
- struct udp_offload *uoff)
+static struct sk_buff **gue_gro_receive(struct sock *sk,
+ struct sk_buff **head,
+ struct sk_buff *skb)
{
const struct net_offload **offloads;
const struct net_offload *ops;
@@ -280,7 +287,7 @@ static struct sk_buff **gue_gro_receive(struct sk_buff **head,
void *data;
u16 doffset = 0;
int flush = 1;
- struct fou *fou = container_of(uoff, struct fou, udp_offloads);
+ struct fou *fou = fou_from_sock(sk);
struct gro_remcsum grc;
skb_gro_remcsum_init(&grc);
@@ -392,8 +399,7 @@ out:
return pp;
}
-static int gue_gro_complete(struct sk_buff *skb, int nhoff,
- struct udp_offload *uoff)
+static int gue_gro_complete(struct sock *sk, struct sk_buff *skb, int nhoff)
{
const struct net_offload **offloads;
struct guehdr *guehdr = (struct guehdr *)(skb->data + nhoff);
@@ -428,7 +434,8 @@ static int fou_add_to_port_list(struct net *net, struct fou *fou)
mutex_lock(&fn->fou_lock);
list_for_each_entry(fout, &fn->fou_list, list) {
- if (fou->port == fout->port) {
+ if (fou->port == fout->port &&
+ fou->family == fout->family) {
mutex_unlock(&fn->fou_lock);
return -EALREADY;
}
@@ -443,44 +450,20 @@ static int fou_add_to_port_list(struct net *net, struct fou *fou)
static void fou_release(struct fou *fou)
{
struct socket *sock = fou->sock;
- struct sock *sk = sock->sk;
- if (sk->sk_family == AF_INET)
- udp_del_offload(&fou->udp_offloads);
list_del(&fou->list);
udp_tunnel_sock_release(sock);
kfree_rcu(fou, rcu);
}
-static int fou_encap_init(struct sock *sk, struct fou *fou, struct fou_cfg *cfg)
-{
- udp_sk(sk)->encap_rcv = fou_udp_recv;
- fou->protocol = cfg->protocol;
- fou->udp_offloads.callbacks.gro_receive = fou_gro_receive;
- fou->udp_offloads.callbacks.gro_complete = fou_gro_complete;
- fou->udp_offloads.port = cfg->udp_config.local_udp_port;
- fou->udp_offloads.ipproto = cfg->protocol;
-
- return 0;
-}
-
-static int gue_encap_init(struct sock *sk, struct fou *fou, struct fou_cfg *cfg)
-{
- udp_sk(sk)->encap_rcv = gue_udp_recv;
- fou->udp_offloads.callbacks.gro_receive = gue_gro_receive;
- fou->udp_offloads.callbacks.gro_complete = gue_gro_complete;
- fou->udp_offloads.port = cfg->udp_config.local_udp_port;
-
- return 0;
-}
-
static int fou_create(struct net *net, struct fou_cfg *cfg,
struct socket **sockp)
{
struct socket *sock = NULL;
struct fou *fou = NULL;
struct sock *sk;
+ struct udp_tunnel_sock_cfg tunnel_cfg;
int err;
/* Open UDP socket */
@@ -497,44 +480,39 @@ static int fou_create(struct net *net, struct fou_cfg *cfg,
sk = sock->sk;
- fou->flags = cfg->flags;
fou->port = cfg->udp_config.local_udp_port;
+ fou->family = cfg->udp_config.family;
+ fou->flags = cfg->flags;
+ fou->type = cfg->type;
+ fou->sock = sock;
+
+ memset(&tunnel_cfg, 0, sizeof(tunnel_cfg));
+ tunnel_cfg.encap_type = 1;
+ tunnel_cfg.sk_user_data = fou;
+ tunnel_cfg.encap_destroy = NULL;
/* Initial for fou type */
switch (cfg->type) {
case FOU_ENCAP_DIRECT:
- err = fou_encap_init(sk, fou, cfg);
- if (err)
- goto error;
+ tunnel_cfg.encap_rcv = fou_udp_recv;
+ tunnel_cfg.gro_receive = fou_gro_receive;
+ tunnel_cfg.gro_complete = fou_gro_complete;
+ fou->protocol = cfg->protocol;
break;
case FOU_ENCAP_GUE:
- err = gue_encap_init(sk, fou, cfg);
- if (err)
- goto error;
+ tunnel_cfg.encap_rcv = gue_udp_recv;
+ tunnel_cfg.gro_receive = gue_gro_receive;
+ tunnel_cfg.gro_complete = gue_gro_complete;
break;
default:
err = -EINVAL;
goto error;
}
- fou->type = cfg->type;
-
- udp_sk(sk)->encap_type = 1;
- udp_encap_enable();
-
- sk->sk_user_data = fou;
- fou->sock = sock;
-
- inet_inc_convert_csum(sk);
+ setup_udp_tunnel_sock(net, sock, &tunnel_cfg);
sk->sk_allocation = GFP_ATOMIC;
- if (cfg->udp_config.family == AF_INET) {
- err = udp_add_offload(net, &fou->udp_offloads);
- if (err)
- goto error;
- }
-
err = fou_add_to_port_list(net, fou);
if (err)
goto error;
@@ -556,12 +534,13 @@ static int fou_destroy(struct net *net, struct fou_cfg *cfg)
{
struct fou_net *fn = net_generic(net, fou_net_id);
__be16 port = cfg->udp_config.local_udp_port;
+ u8 family = cfg->udp_config.family;
int err = -EINVAL;
struct fou *fou;
mutex_lock(&fn->fou_lock);
list_for_each_entry(fou, &fn->fou_list, list) {
- if (fou->port == port) {
+ if (fou->port == port && fou->family == family) {
fou_release(fou);
err = 0;
break;
@@ -599,8 +578,15 @@ static int parse_nl_config(struct genl_info *info,
if (info->attrs[FOU_ATTR_AF]) {
u8 family = nla_get_u8(info->attrs[FOU_ATTR_AF]);
- if (family != AF_INET)
- return -EINVAL;
+ switch (family) {
+ case AF_INET:
+ break;
+ case AF_INET6:
+ cfg->udp_config.ipv6_v6only = 1;
+ break;
+ default:
+ return -EAFNOSUPPORT;
+ }
cfg->udp_config.family = family;
}
@@ -691,6 +677,7 @@ static int fou_nl_cmd_get_port(struct sk_buff *skb, struct genl_info *info)
struct fou_cfg cfg;
struct fou *fout;
__be16 port;
+ u8 family;
int ret;
ret = parse_nl_config(info, &cfg);
@@ -700,6 +687,10 @@ static int fou_nl_cmd_get_port(struct sk_buff *skb, struct genl_info *info)
if (port == 0)
return -EINVAL;
+ family = cfg.udp_config.family;
+ if (family != AF_INET && family != AF_INET6)
+ return -EINVAL;
+
msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
if (!msg)
return -ENOMEM;
@@ -707,7 +698,7 @@ static int fou_nl_cmd_get_port(struct sk_buff *skb, struct genl_info *info)
ret = -ESRCH;
mutex_lock(&fn->fou_lock);
list_for_each_entry(fout, &fn->fou_list, list) {
- if (port == fout->port) {
+ if (port == fout->port && family == fout->family) {
ret = fou_dump_info(fout, info->snd_portid,
info->snd_seq, 0, msg,
info->genlhdr->cmd);
@@ -812,36 +803,48 @@ static void fou_build_udp(struct sk_buff *skb, struct ip_tunnel_encap *e,
*protocol = IPPROTO_UDP;
}
+int __fou_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e,
+ u8 *protocol, __be16 *sport, int type)
+{
+ int err;
+
+ err = iptunnel_handle_offloads(skb, type);
+ if (err)
+ return err;
+
+ *sport = e->sport ? : udp_flow_src_port(dev_net(skb->dev),
+ skb, 0, 0, false);
+
+ return 0;
+}
+EXPORT_SYMBOL(__fou_build_header);
+
int fou_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e,
u8 *protocol, struct flowi4 *fl4)
{
int type = e->flags & TUNNEL_ENCAP_FLAG_CSUM ? SKB_GSO_UDP_TUNNEL_CSUM :
SKB_GSO_UDP_TUNNEL;
__be16 sport;
+ int err;
- skb = iptunnel_handle_offloads(skb, type);
-
- if (IS_ERR(skb))
- return PTR_ERR(skb);
+ err = __fou_build_header(skb, e, protocol, &sport, type);
+ if (err)
+ return err;
- sport = e->sport ? : udp_flow_src_port(dev_net(skb->dev),
- skb, 0, 0, false);
fou_build_udp(skb, e, fl4, protocol, sport);
return 0;
}
EXPORT_SYMBOL(fou_build_header);
-int gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e,
- u8 *protocol, struct flowi4 *fl4)
+int __gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e,
+ u8 *protocol, __be16 *sport, int type)
{
- int type = e->flags & TUNNEL_ENCAP_FLAG_CSUM ? SKB_GSO_UDP_TUNNEL_CSUM :
- SKB_GSO_UDP_TUNNEL;
struct guehdr *guehdr;
size_t hdrlen, optlen = 0;
- __be16 sport;
void *data;
bool need_priv = false;
+ int err;
if ((e->flags & TUNNEL_ENCAP_FLAG_REMCSUM) &&
skb->ip_summed == CHECKSUM_PARTIAL) {
@@ -852,14 +855,13 @@ int gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e,
optlen += need_priv ? GUE_LEN_PRIV : 0;
- skb = iptunnel_handle_offloads(skb, type);
-
- if (IS_ERR(skb))
- return PTR_ERR(skb);
+ err = iptunnel_handle_offloads(skb, type);
+ if (err)
+ return err;
/* Get source port (based on flow hash) before skb_push */
- sport = e->sport ? : udp_flow_src_port(dev_net(skb->dev),
- skb, 0, 0, false);
+ *sport = e->sport ? : udp_flow_src_port(dev_net(skb->dev),
+ skb, 0, 0, false);
hdrlen = sizeof(struct guehdr) + optlen;
@@ -904,6 +906,22 @@ int gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e,
}
+ return 0;
+}
+EXPORT_SYMBOL(__gue_build_header);
+
+int gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e,
+ u8 *protocol, struct flowi4 *fl4)
+{
+ int type = e->flags & TUNNEL_ENCAP_FLAG_CSUM ? SKB_GSO_UDP_TUNNEL_CSUM :
+ SKB_GSO_UDP_TUNNEL;
+ __be16 sport;
+ int err;
+
+ err = __gue_build_header(skb, e, protocol, &sport, type);
+ if (err)
+ return err;
+
fou_build_udp(skb, e, fl4, protocol, sport);
return 0;
diff --git a/net/ipv4/gre_demux.c b/net/ipv4/gre_demux.c
index d9c552a72..de1d119a4 100644
--- a/net/ipv4/gre_demux.c
+++ b/net/ipv4/gre_demux.c
@@ -60,6 +60,67 @@ int gre_del_protocol(const struct gre_protocol *proto, u8 version)
}
EXPORT_SYMBOL_GPL(gre_del_protocol);
+/* Fills in tpi and returns header length to be pulled. */
+int gre_parse_header(struct sk_buff *skb, struct tnl_ptk_info *tpi,
+ bool *csum_err, __be16 proto, int nhs)
+{
+ const struct gre_base_hdr *greh;
+ __be32 *options;
+ int hdr_len;
+
+ if (unlikely(!pskb_may_pull(skb, nhs + sizeof(struct gre_base_hdr))))
+ return -EINVAL;
+
+ greh = (struct gre_base_hdr *)(skb->data + nhs);
+ if (unlikely(greh->flags & (GRE_VERSION | GRE_ROUTING)))
+ return -EINVAL;
+
+ tpi->flags = gre_flags_to_tnl_flags(greh->flags);
+ hdr_len = gre_calc_hlen(tpi->flags);
+
+ if (!pskb_may_pull(skb, nhs + hdr_len))
+ return -EINVAL;
+
+ greh = (struct gre_base_hdr *)(skb->data + nhs);
+ tpi->proto = greh->protocol;
+
+ options = (__be32 *)(greh + 1);
+ if (greh->flags & GRE_CSUM) {
+ if (skb_checksum_simple_validate(skb)) {
+ *csum_err = true;
+ return -EINVAL;
+ }
+
+ skb_checksum_try_convert(skb, IPPROTO_GRE, 0,
+ null_compute_pseudo);
+ options++;
+ }
+
+ if (greh->flags & GRE_KEY) {
+ tpi->key = *options;
+ options++;
+ } else {
+ tpi->key = 0;
+ }
+ if (unlikely(greh->flags & GRE_SEQ)) {
+ tpi->seq = *options;
+ options++;
+ } else {
+ tpi->seq = 0;
+ }
+ /* WCCP version 1 and 2 protocol decoding.
+ * - Change protocol to IPv4/IPv6
+ * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
+ */
+ if (greh->flags == 0 && tpi->proto == htons(ETH_P_WCCP)) {
+ tpi->proto = proto;
+ if ((*(u8 *)options & 0xF0) != 0x40)
+ hdr_len += 4;
+ }
+ return hdr_len;
+}
+EXPORT_SYMBOL(gre_parse_header);
+
static int gre_rcv(struct sk_buff *skb)
{
const struct gre_protocol *proto;
diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c
index 6a5bd4317..ecd1e09db 100644
--- a/net/ipv4/gre_offload.c
+++ b/net/ipv4/gre_offload.c
@@ -26,18 +26,6 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
int gre_offset, outer_hlen;
bool need_csum, ufo;
- if (unlikely(skb_shinfo(skb)->gso_type &
- ~(SKB_GSO_TCPV4 |
- SKB_GSO_TCPV6 |
- SKB_GSO_UDP |
- SKB_GSO_DODGY |
- SKB_GSO_TCP_ECN |
- SKB_GSO_GRE |
- SKB_GSO_GRE_CSUM |
- SKB_GSO_IPIP |
- SKB_GSO_SIT)))
- goto out;
-
if (!skb->encapsulation)
goto out;
@@ -86,7 +74,7 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
skb = segs;
do {
struct gre_base_hdr *greh;
- __be32 *pcsum;
+ __sum16 *pcsum;
/* Set up inner headers if we are offloading inner checksum */
if (skb->ip_summed == CHECKSUM_PARTIAL) {
@@ -106,10 +94,25 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
continue;
greh = (struct gre_base_hdr *)skb_transport_header(skb);
- pcsum = (__be32 *)(greh + 1);
+ pcsum = (__sum16 *)(greh + 1);
+
+ if (skb_is_gso(skb)) {
+ unsigned int partial_adj;
+
+ /* Adjust checksum to account for the fact that
+ * the partial checksum is based on actual size
+ * whereas headers should be based on MSS size.
+ */
+ partial_adj = skb->len + skb_headroom(skb) -
+ SKB_GSO_CB(skb)->data_offset -
+ skb_shinfo(skb)->gso_size;
+ *pcsum = ~csum_fold((__force __wsum)htonl(partial_adj));
+ } else {
+ *pcsum = 0;
+ }
- *pcsum = 0;
- *(__sum16 *)pcsum = gso_make_checksum(skb, 0);
+ *(pcsum + 1) = 0;
+ *pcsum = gso_make_checksum(skb, 0);
} while ((skb = skb->next));
out:
return segs;
@@ -275,6 +278,18 @@ static const struct net_offload gre_offload = {
static int __init gre_offload_init(void)
{
- return inet_add_offload(&gre_offload, IPPROTO_GRE);
+ int err;
+
+ err = inet_add_offload(&gre_offload, IPPROTO_GRE);
+#if IS_ENABLED(CONFIG_IPV6)
+ if (err)
+ return err;
+
+ err = inet6_add_offload(&gre_offload, IPPROTO_GRE);
+ if (err)
+ inet_del_offload(&gre_offload, IPPROTO_GRE);
+#endif
+
+ return err;
}
device_initcall(gre_offload_init);
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 633348977..38abe70e5 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -363,7 +363,7 @@ static void icmp_push_reply(struct icmp_bxm *icmp_param,
icmp_param->data_len+icmp_param->head_len,
icmp_param->head_len,
ipc, rt, MSG_DONTWAIT) < 0) {
- ICMP_INC_STATS_BH(sock_net(sk), ICMP_MIB_OUTERRORS);
+ __ICMP_INC_STATS(sock_net(sk), ICMP_MIB_OUTERRORS);
ip_flush_pending_frames(sk);
} else if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
struct icmphdr *icmph = icmp_hdr(skb);
@@ -744,7 +744,7 @@ static void icmp_socket_deliver(struct sk_buff *skb, u32 info)
* avoid additional coding at protocol handlers.
*/
if (!pskb_may_pull(skb, iph->ihl * 4 + 8)) {
- ICMP_INC_STATS_BH(dev_net(skb->dev), ICMP_MIB_INERRORS);
+ __ICMP_INC_STATS(dev_net(skb->dev), ICMP_MIB_INERRORS);
return;
}
@@ -865,7 +865,7 @@ static bool icmp_unreach(struct sk_buff *skb)
out:
return true;
out_err:
- ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
+ __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
return false;
}
@@ -877,7 +877,7 @@ out_err:
static bool icmp_redirect(struct sk_buff *skb)
{
if (skb->len < sizeof(struct iphdr)) {
- ICMP_INC_STATS_BH(dev_net(skb->dev), ICMP_MIB_INERRORS);
+ __ICMP_INC_STATS(dev_net(skb->dev), ICMP_MIB_INERRORS);
return false;
}
@@ -956,7 +956,7 @@ static bool icmp_timestamp(struct sk_buff *skb)
return true;
out_err:
- ICMP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), ICMP_MIB_INERRORS);
+ __ICMP_INC_STATS(dev_net(skb_dst(skb)->dev), ICMP_MIB_INERRORS);
return false;
}
@@ -996,7 +996,7 @@ int icmp_rcv(struct sk_buff *skb)
skb_set_network_header(skb, nh);
}
- ICMP_INC_STATS_BH(net, ICMP_MIB_INMSGS);
+ __ICMP_INC_STATS(net, ICMP_MIB_INMSGS);
if (skb_checksum_simple_validate(skb))
goto csum_error;
@@ -1006,7 +1006,7 @@ int icmp_rcv(struct sk_buff *skb)
icmph = icmp_hdr(skb);
- ICMPMSGIN_INC_STATS_BH(net, icmph->type);
+ ICMPMSGIN_INC_STATS(net, icmph->type);
/*
* 18 is the highest 'known' ICMP type. Anything else is a mystery
*
@@ -1052,9 +1052,9 @@ drop:
kfree_skb(skb);
return 0;
csum_error:
- ICMP_INC_STATS_BH(net, ICMP_MIB_CSUMERRORS);
+ __ICMP_INC_STATS(net, ICMP_MIB_CSUMERRORS);
error:
- ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
+ __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
goto drop;
}
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index bc5196ea1..fa8c39804 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -427,7 +427,7 @@ struct dst_entry *inet_csk_route_req(const struct sock *sk,
route_err:
ip_rt_put(rt);
no_route:
- IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES);
+ __IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
return NULL;
}
EXPORT_SYMBOL_GPL(inet_csk_route_req);
@@ -466,7 +466,7 @@ route_err:
ip_rt_put(rt);
no_route:
rcu_read_unlock();
- IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES);
+ __IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
return NULL;
}
EXPORT_SYMBOL_GPL(inet_csk_route_child_sock);
@@ -661,6 +661,9 @@ struct sock *inet_csk_clone_lock(const struct sock *sk,
inet_sk(newsk)->inet_sport = htons(inet_rsk(req)->ir_num);
newsk->sk_write_space = sk_stream_write_space;
+ /* listeners have SOCK_RCU_FREE, not the children */
+ sock_reset_flag(newsk, SOCK_RCU_FREE);
+
newsk->sk_mark = inet_rsk(req)->ir_mark;
atomic64_set(&newsk->sk_cookie,
atomic64_read(&inet_rsk(req)->ir_cookie));
@@ -703,7 +706,9 @@ void inet_csk_destroy_sock(struct sock *sk)
sk_refcnt_debug_release(sk);
+ local_bh_disable();
percpu_counter_dec(sk->sk_prot->orphan_count);
+ local_bh_enable();
sock_put(sk);
}
EXPORT_SYMBOL(inet_csk_destroy_sock);
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 5fdb02f55..25af12436 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -66,7 +66,7 @@ static void inet_diag_unlock_handler(const struct inet_diag_handler *handler)
mutex_unlock(&inet_diag_table_mutex);
}
-static void inet_diag_msg_common_fill(struct inet_diag_msg *r, struct sock *sk)
+void inet_diag_msg_common_fill(struct inet_diag_msg *r, struct sock *sk)
{
r->idiag_family = sk->sk_family;
@@ -89,6 +89,7 @@ static void inet_diag_msg_common_fill(struct inet_diag_msg *r, struct sock *sk)
r->id.idiag_dst[0] = sk->sk_daddr;
}
}
+EXPORT_SYMBOL_GPL(inet_diag_msg_common_fill);
static size_t inet_sk_attr_size(void)
{
@@ -104,13 +105,50 @@ static size_t inet_sk_attr_size(void)
+ 64;
}
+int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb,
+ struct inet_diag_msg *r, int ext,
+ struct user_namespace *user_ns)
+{
+ const struct inet_sock *inet = inet_sk(sk);
+
+ if (nla_put_u8(skb, INET_DIAG_SHUTDOWN, sk->sk_shutdown))
+ goto errout;
+
+ /* IPv6 dual-stack sockets use inet->tos for IPv4 connections,
+ * hence this needs to be included regardless of socket family.
+ */
+ if (ext & (1 << (INET_DIAG_TOS - 1)))
+ if (nla_put_u8(skb, INET_DIAG_TOS, inet->tos) < 0)
+ goto errout;
+
+#if IS_ENABLED(CONFIG_IPV6)
+ if (r->idiag_family == AF_INET6) {
+ if (ext & (1 << (INET_DIAG_TCLASS - 1)))
+ if (nla_put_u8(skb, INET_DIAG_TCLASS,
+ inet6_sk(sk)->tclass) < 0)
+ goto errout;
+
+ if (((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) &&
+ nla_put_u8(skb, INET_DIAG_SKV6ONLY, ipv6_only_sock(sk)))
+ goto errout;
+ }
+#endif
+
+ r->idiag_uid = from_kuid_munged(user_ns, sock_i_uid(sk));
+ r->idiag_inode = sock_i_ino(sk);
+
+ return 0;
+errout:
+ return 1;
+}
+EXPORT_SYMBOL_GPL(inet_diag_msg_attrs_fill);
+
int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
struct sk_buff *skb, const struct inet_diag_req_v2 *req,
struct user_namespace *user_ns,
u32 portid, u32 seq, u16 nlmsg_flags,
const struct nlmsghdr *unlh)
{
- const struct inet_sock *inet = inet_sk(sk);
const struct tcp_congestion_ops *ca_ops;
const struct inet_diag_handler *handler;
int ext = req->idiag_ext;
@@ -135,32 +173,9 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
r->idiag_timer = 0;
r->idiag_retrans = 0;
- if (nla_put_u8(skb, INET_DIAG_SHUTDOWN, sk->sk_shutdown))
+ if (inet_diag_msg_attrs_fill(sk, skb, r, ext, user_ns))
goto errout;
- /* IPv6 dual-stack sockets use inet->tos for IPv4 connections,
- * hence this needs to be included regardless of socket family.
- */
- if (ext & (1 << (INET_DIAG_TOS - 1)))
- if (nla_put_u8(skb, INET_DIAG_TOS, inet->tos) < 0)
- goto errout;
-
-#if IS_ENABLED(CONFIG_IPV6)
- if (r->idiag_family == AF_INET6) {
- if (ext & (1 << (INET_DIAG_TCLASS - 1)))
- if (nla_put_u8(skb, INET_DIAG_TCLASS,
- inet6_sk(sk)->tclass) < 0)
- goto errout;
-
- if (((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) &&
- nla_put_u8(skb, INET_DIAG_SKV6ONLY, ipv6_only_sock(sk)))
- goto errout;
- }
-#endif
-
- r->idiag_uid = from_kuid_munged(user_ns, sock_i_uid(sk));
- r->idiag_inode = sock_i_ino(sk);
-
if (ext & (1 << (INET_DIAG_MEMINFO - 1))) {
struct inet_diag_meminfo minfo = {
.idiag_rmem = sk_rmem_alloc_get(sk),
@@ -182,31 +197,32 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
goto out;
}
-#define EXPIRES_IN_MS(tmo) DIV_ROUND_UP((tmo - jiffies) * 1000, HZ)
-
if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
r->idiag_timer = 1;
r->idiag_retrans = icsk->icsk_retransmits;
- r->idiag_expires = EXPIRES_IN_MS(icsk->icsk_timeout);
+ r->idiag_expires =
+ jiffies_to_msecs(icsk->icsk_timeout - jiffies);
} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
r->idiag_timer = 4;
r->idiag_retrans = icsk->icsk_probes_out;
- r->idiag_expires = EXPIRES_IN_MS(icsk->icsk_timeout);
+ r->idiag_expires =
+ jiffies_to_msecs(icsk->icsk_timeout - jiffies);
} else if (timer_pending(&sk->sk_timer)) {
r->idiag_timer = 2;
r->idiag_retrans = icsk->icsk_probes_out;
- r->idiag_expires = EXPIRES_IN_MS(sk->sk_timer.expires);
+ r->idiag_expires =
+ jiffies_to_msecs(sk->sk_timer.expires - jiffies);
} else {
r->idiag_timer = 0;
r->idiag_expires = 0;
}
-#undef EXPIRES_IN_MS
if ((ext & (1 << (INET_DIAG_INFO - 1))) && handler->idiag_info_size) {
- attr = nla_reserve(skb, INET_DIAG_INFO,
- handler->idiag_info_size);
+ attr = nla_reserve_64bit(skb, INET_DIAG_INFO,
+ handler->idiag_info_size,
+ INET_DIAG_PAD);
if (!attr)
goto errout;
@@ -356,6 +372,7 @@ struct sock *inet_diag_find_one_icsk(struct net *net,
{
struct sock *sk;
+ rcu_read_lock();
if (req->sdiag_family == AF_INET)
sk = inet_lookup(net, hashinfo, NULL, 0, req->id.idiag_dst[0],
req->id.idiag_dport, req->id.idiag_src[0],
@@ -376,9 +393,11 @@ struct sock *inet_diag_find_one_icsk(struct net *net,
req->id.idiag_if);
}
#endif
- else
+ else {
+ rcu_read_unlock();
return ERR_PTR(-EINVAL);
-
+ }
+ rcu_read_unlock();
if (!sk)
return ERR_PTR(-ENOENT);
@@ -772,13 +791,12 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
for (i = s_i; i < INET_LHTABLE_SIZE; i++) {
struct inet_listen_hashbucket *ilb;
- struct hlist_nulls_node *node;
struct sock *sk;
num = 0;
ilb = &hashinfo->listening_hash[i];
spin_lock_bh(&ilb->lock);
- sk_nulls_for_each(sk, node, &ilb->head) {
+ sk_for_each(sk, &ilb->head) {
struct inet_sock *inet = inet_sk(sk);
if (!net_eq(sock_net(sk), net))
@@ -1061,7 +1079,9 @@ int inet_diag_handler_get_info(struct sk_buff *skb, struct sock *sk)
}
attr = handler->idiag_info_size
- ? nla_reserve(skb, INET_DIAG_INFO, handler->idiag_info_size)
+ ? nla_reserve_64bit(skb, INET_DIAG_INFO,
+ handler->idiag_info_size,
+ INET_DIAG_PAD)
: NULL;
if (attr)
info = nla_data(attr);
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 0d9e9d7bb..77c20a489 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -198,13 +198,13 @@ static inline int compute_score(struct sock *sk, struct net *net,
}
/*
- * Don't inline this cruft. Here are some nice properties to exploit here. The
- * BSD API does not allow a listening sock to specify the remote port nor the
+ * Here are some nice properties to exploit here. The BSD API
+ * does not allow a listening sock to specify the remote port nor the
* remote address for the connection. So always assume those are both
* wildcarded during the search since they can never be otherwise.
*/
-
+/* called with rcu_read_lock() : No refcount taken on the socket */
struct sock *__inet_lookup_listener(struct net *net,
struct inet_hashinfo *hashinfo,
struct sk_buff *skb, int doff,
@@ -212,38 +212,27 @@ struct sock *__inet_lookup_listener(struct net *net,
const __be32 daddr, const unsigned short hnum,
const int dif)
{
- struct sock *sk, *result;
- struct hlist_nulls_node *node;
unsigned int hash = inet_lhashfn(net, hnum);
struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash];
- int score, hiscore, matches = 0, reuseport = 0;
- bool select_ok = true;
+ int score, hiscore = 0, matches = 0, reuseport = 0;
+ struct sock *sk, *result = NULL;
u32 phash = 0;
- rcu_read_lock();
-begin:
- result = NULL;
- hiscore = 0;
- sk_nulls_for_each_rcu(sk, node, &ilb->head) {
+ sk_for_each_rcu(sk, &ilb->head) {
score = compute_score(sk, net, hnum, daddr, dif);
if (score > hiscore) {
- result = sk;
- hiscore = score;
reuseport = sk->sk_reuseport;
if (reuseport) {
phash = inet_ehashfn(net, daddr, hnum,
saddr, sport);
- if (select_ok) {
- struct sock *sk2;
- sk2 = reuseport_select_sock(sk, phash,
- skb, doff);
- if (sk2) {
- result = sk2;
- goto found;
- }
- }
+ result = reuseport_select_sock(sk, phash,
+ skb, doff);
+ if (result)
+ return result;
matches = 1;
}
+ result = sk;
+ hiscore = score;
} else if (score == hiscore && reuseport) {
matches++;
if (reciprocal_scale(phash, matches) == 0)
@@ -251,25 +240,6 @@ begin:
phash = next_pseudo_random32(phash);
}
}
- /*
- * if the nulls value we got at the end of this lookup is
- * not the expected one, we must restart lookup.
- * We probably met an item that was moved to another chain.
- */
- if (get_nulls_value(node) != hash + LISTENING_NULLS_BASE)
- goto begin;
- if (result) {
-found:
- if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt)))
- result = NULL;
- else if (unlikely(compute_score(result, net, hnum, daddr,
- dif) < hiscore)) {
- sock_put(result);
- select_ok = false;
- goto begin;
- }
- }
- rcu_read_unlock();
return result;
}
EXPORT_SYMBOL_GPL(__inet_lookup_listener);
@@ -312,7 +282,6 @@ struct sock *__inet_lookup_established(struct net *net,
unsigned int slot = hash & hashinfo->ehash_mask;
struct inet_ehash_bucket *head = &hashinfo->ehash[slot];
- rcu_read_lock();
begin:
sk_nulls_for_each_rcu(sk, node, &head->chain) {
if (sk->sk_hash != hash)
@@ -339,7 +308,6 @@ begin:
out:
sk = NULL;
found:
- rcu_read_unlock();
return sk;
}
EXPORT_SYMBOL_GPL(__inet_lookup_established);
@@ -392,7 +360,7 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row,
__sk_nulls_add_node_rcu(sk, &head->chain);
if (tw) {
sk_nulls_del_node_init_rcu((struct sock *)tw);
- NET_INC_STATS_BH(net, LINUX_MIB_TIMEWAITRECYCLED);
+ __NET_INC_STATS(net, LINUX_MIB_TIMEWAITRECYCLED);
}
spin_unlock(lock);
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
@@ -472,10 +440,9 @@ static int inet_reuseport_add_sock(struct sock *sk,
{
struct inet_bind_bucket *tb = inet_csk(sk)->icsk_bind_hash;
struct sock *sk2;
- struct hlist_nulls_node *node;
kuid_t uid = sock_i_uid(sk);
- sk_nulls_for_each_rcu(sk2, node, &ilb->head) {
+ sk_for_each_rcu(sk2, &ilb->head) {
if (sk2 != sk &&
sk2->sk_family == sk->sk_family &&
ipv6_only_sock(sk2) == ipv6_only_sock(sk) &&
@@ -514,7 +481,12 @@ int __inet_hash(struct sock *sk, struct sock *osk,
if (err)
goto unlock;
}
- __sk_nulls_add_node_rcu(sk, &ilb->head);
+ if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport &&
+ sk->sk_family == AF_INET6)
+ hlist_add_tail_rcu(&sk->sk_node, &ilb->head);
+ else
+ hlist_add_head_rcu(&sk->sk_node, &ilb->head);
+ sock_set_flag(sk, SOCK_RCU_FREE);
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
unlock:
spin_unlock(&ilb->lock);
@@ -541,20 +513,25 @@ void inet_unhash(struct sock *sk)
{
struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
spinlock_t *lock;
+ bool listener = false;
int done;
if (sk_unhashed(sk))
return;
- if (sk->sk_state == TCP_LISTEN)
+ if (sk->sk_state == TCP_LISTEN) {
lock = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)].lock;
- else
+ listener = true;
+ } else {
lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
-
+ }
spin_lock_bh(lock);
if (rcu_access_pointer(sk->sk_reuseport_cb))
reuseport_detach_sock(sk);
- done = __sk_nulls_del_node_init_rcu(sk);
+ if (listener)
+ done = __sk_del_node_init(sk);
+ else
+ done = __sk_nulls_del_node_init_rcu(sk);
if (done)
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
spin_unlock_bh(lock);
@@ -690,9 +667,8 @@ void inet_hashinfo_init(struct inet_hashinfo *h)
for (i = 0; i < INET_LHTABLE_SIZE; i++) {
spin_lock_init(&h->listening_hash[i].lock);
- INIT_HLIST_NULLS_HEAD(&h->listening_hash[i].head,
- i + LISTENING_NULLS_BASE);
- }
+ INIT_HLIST_HEAD(&h->listening_hash[i].head);
+ }
}
EXPORT_SYMBOL_GPL(inet_hashinfo_init);
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index c67f9bd76..206581674 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -94,7 +94,7 @@ static void inet_twsk_add_bind_node(struct inet_timewait_sock *tw,
}
/*
- * Enter the time wait state. This is called with locally disabled BH.
+ * Enter the time wait state.
* Essentially we whip up a timewait bucket, copy the relevant info into it
* from the SK, and mess with hash chains and list linkage.
*/
@@ -112,7 +112,7 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
*/
bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), inet->inet_num,
hashinfo->bhash_size)];
- spin_lock(&bhead->lock);
+ spin_lock_bh(&bhead->lock);
tw->tw_tb = icsk->icsk_bind_hash;
WARN_ON(!icsk->icsk_bind_hash);
inet_twsk_add_bind_node(tw, &tw->tw_tb->owners);
@@ -138,7 +138,7 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
if (__sk_nulls_del_node_init_rcu(sk))
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
- spin_unlock(lock);
+ spin_unlock_bh(lock);
}
EXPORT_SYMBOL_GPL(__inet_twsk_hashdance);
@@ -147,9 +147,9 @@ static void tw_timer_handler(unsigned long data)
struct inet_timewait_sock *tw = (struct inet_timewait_sock *)data;
if (tw->tw_kill)
- NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_TIMEWAITKILLED);
+ __NET_INC_STATS(twsk_net(tw), LINUX_MIB_TIMEWAITKILLED);
else
- NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_TIMEWAITED);
+ __NET_INC_STATS(twsk_net(tw), LINUX_MIB_TIMEWAITED);
inet_twsk_kill(tw);
}
diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c
index af18f1e48..cbfb1808f 100644
--- a/net/ipv4/ip_forward.c
+++ b/net/ipv4/ip_forward.c
@@ -65,8 +65,8 @@ static int ip_forward_finish(struct net *net, struct sock *sk, struct sk_buff *s
{
struct ip_options *opt = &(IPCB(skb)->opt);
- IP_INC_STATS_BH(net, IPSTATS_MIB_OUTFORWDATAGRAMS);
- IP_ADD_STATS_BH(net, IPSTATS_MIB_OUTOCTETS, skb->len);
+ __IP_INC_STATS(net, IPSTATS_MIB_OUTFORWDATAGRAMS);
+ __IP_ADD_STATS(net, IPSTATS_MIB_OUTOCTETS, skb->len);
if (unlikely(opt->optlen))
ip_forward_options(skb);
@@ -157,7 +157,7 @@ sr_failed:
too_many_hops:
/* Tell the sender its packet died... */
- IP_INC_STATS_BH(net, IPSTATS_MIB_INHDRERRORS);
+ __IP_INC_STATS(net, IPSTATS_MIB_INHDRERRORS);
icmp_send(skb, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0);
drop:
kfree_skb(skb);
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index efbd47d1a..bbe7f72db 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -204,14 +204,14 @@ static void ip_expire(unsigned long arg)
goto out;
ipq_kill(qp);
- IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS);
+ __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
if (!inet_frag_evicting(&qp->q)) {
struct sk_buff *head = qp->q.fragments;
const struct iphdr *iph;
int err;
- IP_INC_STATS_BH(net, IPSTATS_MIB_REASMTIMEOUT);
+ __IP_INC_STATS(net, IPSTATS_MIB_REASMTIMEOUT);
if (!(qp->q.flags & INET_FRAG_FIRST_IN) || !qp->q.fragments)
goto out;
@@ -291,7 +291,7 @@ static int ip_frag_too_far(struct ipq *qp)
struct net *net;
net = container_of(qp->q.net, struct net, ipv4.frags);
- IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS);
+ __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
}
return rc;
@@ -635,7 +635,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
ip_send_check(iph);
- IP_INC_STATS_BH(net, IPSTATS_MIB_REASMOKS);
+ __IP_INC_STATS(net, IPSTATS_MIB_REASMOKS);
qp->q.fragments = NULL;
qp->q.fragments_tail = NULL;
return 0;
@@ -647,7 +647,7 @@ out_nomem:
out_oversize:
net_info_ratelimited("Oversized IP packet from %pI4\n", &qp->saddr);
out_fail:
- IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS);
+ __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
return err;
}
@@ -658,7 +658,7 @@ int ip_defrag(struct net *net, struct sk_buff *skb, u32 user)
int vif = l3mdev_master_ifindex_rcu(dev);
struct ipq *qp;
- IP_INC_STATS_BH(net, IPSTATS_MIB_REASMREQDS);
+ __IP_INC_STATS(net, IPSTATS_MIB_REASMREQDS);
skb_orphan(skb);
/* Lookup (or create) queue header */
@@ -675,7 +675,7 @@ int ip_defrag(struct net *net, struct sk_buff *skb, u32 user)
return ret;
}
- IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS);
+ __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
kfree_skb(skb);
return -ENOMEM;
}
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 4cc84212c..1d000af7f 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -49,12 +49,6 @@
#include <net/gre.h>
#include <net/dst_metadata.h>
-#if IS_ENABLED(CONFIG_IPV6)
-#include <net/ipv6.h>
-#include <net/ip6_fib.h>
-#include <net/ip6_route.h>
-#endif
-
/*
Problems & solutions
--------------------
@@ -122,126 +116,6 @@ static int ipgre_tunnel_init(struct net_device *dev);
static int ipgre_net_id __read_mostly;
static int gre_tap_net_id __read_mostly;
-static int ip_gre_calc_hlen(__be16 o_flags)
-{
- int addend = 4;
-
- if (o_flags & TUNNEL_CSUM)
- addend += 4;
- if (o_flags & TUNNEL_KEY)
- addend += 4;
- if (o_flags & TUNNEL_SEQ)
- addend += 4;
- return addend;
-}
-
-static __be16 gre_flags_to_tnl_flags(__be16 flags)
-{
- __be16 tflags = 0;
-
- if (flags & GRE_CSUM)
- tflags |= TUNNEL_CSUM;
- if (flags & GRE_ROUTING)
- tflags |= TUNNEL_ROUTING;
- if (flags & GRE_KEY)
- tflags |= TUNNEL_KEY;
- if (flags & GRE_SEQ)
- tflags |= TUNNEL_SEQ;
- if (flags & GRE_STRICT)
- tflags |= TUNNEL_STRICT;
- if (flags & GRE_REC)
- tflags |= TUNNEL_REC;
- if (flags & GRE_VERSION)
- tflags |= TUNNEL_VERSION;
-
- return tflags;
-}
-
-static __be16 tnl_flags_to_gre_flags(__be16 tflags)
-{
- __be16 flags = 0;
-
- if (tflags & TUNNEL_CSUM)
- flags |= GRE_CSUM;
- if (tflags & TUNNEL_ROUTING)
- flags |= GRE_ROUTING;
- if (tflags & TUNNEL_KEY)
- flags |= GRE_KEY;
- if (tflags & TUNNEL_SEQ)
- flags |= GRE_SEQ;
- if (tflags & TUNNEL_STRICT)
- flags |= GRE_STRICT;
- if (tflags & TUNNEL_REC)
- flags |= GRE_REC;
- if (tflags & TUNNEL_VERSION)
- flags |= GRE_VERSION;
-
- return flags;
-}
-
-/* Fills in tpi and returns header length to be pulled. */
-static int parse_gre_header(struct sk_buff *skb, struct tnl_ptk_info *tpi,
- bool *csum_err)
-{
- const struct gre_base_hdr *greh;
- __be32 *options;
- int hdr_len;
-
- if (unlikely(!pskb_may_pull(skb, sizeof(struct gre_base_hdr))))
- return -EINVAL;
-
- greh = (struct gre_base_hdr *)skb_transport_header(skb);
- if (unlikely(greh->flags & (GRE_VERSION | GRE_ROUTING)))
- return -EINVAL;
-
- tpi->flags = gre_flags_to_tnl_flags(greh->flags);
- hdr_len = ip_gre_calc_hlen(tpi->flags);
-
- if (!pskb_may_pull(skb, hdr_len))
- return -EINVAL;
-
- greh = (struct gre_base_hdr *)skb_transport_header(skb);
- tpi->proto = greh->protocol;
-
- options = (__be32 *)(greh + 1);
- if (greh->flags & GRE_CSUM) {
- if (skb_checksum_simple_validate(skb)) {
- *csum_err = true;
- return -EINVAL;
- }
-
- skb_checksum_try_convert(skb, IPPROTO_GRE, 0,
- null_compute_pseudo);
- options++;
- }
-
- if (greh->flags & GRE_KEY) {
- tpi->key = *options;
- options++;
- } else {
- tpi->key = 0;
- }
- if (unlikely(greh->flags & GRE_SEQ)) {
- tpi->seq = *options;
- options++;
- } else {
- tpi->seq = 0;
- }
- /* WCCP version 1 and 2 protocol decoding.
- * - Change protocol to IP
- * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
- */
- if (greh->flags == 0 && tpi->proto == htons(ETH_P_WCCP)) {
- tpi->proto = htons(ETH_P_IP);
- if ((*(u8 *)options & 0xF0) != 0x40) {
- hdr_len += 4;
- if (!pskb_may_pull(skb, hdr_len))
- return -EINVAL;
- }
- }
- return hdr_len;
-}
-
static void ipgre_err(struct sk_buff *skb, u32 info,
const struct tnl_ptk_info *tpi)
{
@@ -337,12 +211,14 @@ static void gre_err(struct sk_buff *skb, u32 info)
* by themselves???
*/
+ const struct iphdr *iph = (struct iphdr *)skb->data;
const int type = icmp_hdr(skb)->type;
const int code = icmp_hdr(skb)->code;
struct tnl_ptk_info tpi;
bool csum_err = false;
- if (parse_gre_header(skb, &tpi, &csum_err) < 0) {
+ if (gre_parse_header(skb, &tpi, &csum_err, htons(ETH_P_IP),
+ iph->ihl * 4) < 0) {
if (!csum_err) /* ignore csum errors. */
return;
}
@@ -380,24 +256,22 @@ static __be32 tunnel_id_to_key(__be64 x)
#endif
}
-static int ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi)
+static int __ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi,
+ struct ip_tunnel_net *itn, int hdr_len, bool raw_proto)
{
- struct net *net = dev_net(skb->dev);
struct metadata_dst *tun_dst = NULL;
- struct ip_tunnel_net *itn;
const struct iphdr *iph;
struct ip_tunnel *tunnel;
- if (tpi->proto == htons(ETH_P_TEB))
- itn = net_generic(net, gre_tap_net_id);
- else
- itn = net_generic(net, ipgre_net_id);
-
iph = ip_hdr(skb);
tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags,
iph->saddr, iph->daddr, tpi->key);
if (tunnel) {
+ if (__iptunnel_pull_header(skb, hdr_len, tpi->proto,
+ raw_proto, false) < 0)
+ goto drop;
+
if (tunnel->dev->type != ARPHRD_NONE)
skb_pop_mac_header(skb);
else
@@ -416,7 +290,34 @@ static int ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi)
ip_tunnel_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error);
return PACKET_RCVD;
}
- return PACKET_REJECT;
+ return PACKET_NEXT;
+
+drop:
+ kfree_skb(skb);
+ return PACKET_RCVD;
+}
+
+static int ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi,
+ int hdr_len)
+{
+ struct net *net = dev_net(skb->dev);
+ struct ip_tunnel_net *itn;
+ int res;
+
+ if (tpi->proto == htons(ETH_P_TEB))
+ itn = net_generic(net, gre_tap_net_id);
+ else
+ itn = net_generic(net, ipgre_net_id);
+
+ res = __ipgre_rcv(skb, tpi, itn, hdr_len, false);
+ if (res == PACKET_NEXT && tpi->proto == htons(ETH_P_TEB)) {
+ /* ipgre tunnels in collect metadata mode should receive
+ * also ETH_P_TEB traffic.
+ */
+ itn = net_generic(net, ipgre_net_id);
+ res = __ipgre_rcv(skb, tpi, itn, hdr_len, true);
+ }
+ return res;
}
static int gre_rcv(struct sk_buff *skb)
@@ -433,13 +334,11 @@ static int gre_rcv(struct sk_buff *skb)
}
#endif
- hdr_len = parse_gre_header(skb, &tpi, &csum_err);
+ hdr_len = gre_parse_header(skb, &tpi, &csum_err, htons(ETH_P_IP), 0);
if (hdr_len < 0)
goto drop;
- if (iptunnel_pull_header(skb, hdr_len, tpi.proto, false) < 0)
- goto drop;
- if (ipgre_rcv(skb, &tpi) == PACKET_RCVD)
+ if (ipgre_rcv(skb, &tpi, hdr_len) == PACKET_RCVD)
return 0;
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
@@ -448,49 +347,6 @@ drop:
return 0;
}
-static __sum16 gre_checksum(struct sk_buff *skb)
-{
- __wsum csum;
-
- if (skb->ip_summed == CHECKSUM_PARTIAL)
- csum = lco_csum(skb);
- else
- csum = skb_checksum(skb, 0, skb->len, 0);
- return csum_fold(csum);
-}
-
-static void build_header(struct sk_buff *skb, int hdr_len, __be16 flags,
- __be16 proto, __be32 key, __be32 seq)
-{
- struct gre_base_hdr *greh;
-
- skb_push(skb, hdr_len);
-
- skb_reset_transport_header(skb);
- greh = (struct gre_base_hdr *)skb->data;
- greh->flags = tnl_flags_to_gre_flags(flags);
- greh->protocol = proto;
-
- if (flags & (TUNNEL_KEY | TUNNEL_CSUM | TUNNEL_SEQ)) {
- __be32 *ptr = (__be32 *)(((u8 *)greh) + hdr_len - 4);
-
- if (flags & TUNNEL_SEQ) {
- *ptr = seq;
- ptr--;
- }
- if (flags & TUNNEL_KEY) {
- *ptr = key;
- ptr--;
- }
- if (flags & TUNNEL_CSUM &&
- !(skb_shinfo(skb)->gso_type &
- (SKB_GSO_GRE | SKB_GSO_GRE_CSUM))) {
- *ptr = 0;
- *(__sum16 *)ptr = gre_checksum(skb);
- }
- }
-}
-
static void __gre_xmit(struct sk_buff *skb, struct net_device *dev,
const struct iphdr *tnl_params,
__be16 proto)
@@ -501,15 +357,15 @@ static void __gre_xmit(struct sk_buff *skb, struct net_device *dev,
tunnel->o_seqno++;
/* Push GRE header. */
- build_header(skb, tunnel->tun_hlen, tunnel->parms.o_flags,
- proto, tunnel->parms.o_key, htonl(tunnel->o_seqno));
+ gre_build_header(skb, tunnel->tun_hlen,
+ tunnel->parms.o_flags, proto, tunnel->parms.o_key,
+ htonl(tunnel->o_seqno));
skb_set_inner_protocol(skb, proto);
ip_tunnel_xmit(skb, dev, tnl_params, tnl_params->protocol);
}
-static struct sk_buff *gre_handle_offloads(struct sk_buff *skb,
- bool csum)
+static int gre_handle_offloads(struct sk_buff *skb, bool csum)
{
return iptunnel_handle_offloads(skb, csum ? SKB_GSO_GRE_CSUM : SKB_GSO_GRE);
}
@@ -562,7 +418,7 @@ static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev,
fl.saddr);
}
- tunnel_hlen = ip_gre_calc_hlen(key->tun_flags);
+ tunnel_hlen = gre_calc_hlen(key->tun_flags);
min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len
+ tunnel_hlen + sizeof(struct iphdr);
@@ -577,15 +433,12 @@ static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev,
}
/* Push Tunnel header. */
- skb = gre_handle_offloads(skb, !!(tun_info->key.tun_flags & TUNNEL_CSUM));
- if (IS_ERR(skb)) {
- skb = NULL;
+ if (gre_handle_offloads(skb, !!(tun_info->key.tun_flags & TUNNEL_CSUM)))
goto err_free_rt;
- }
flags = tun_info->key.tun_flags & (TUNNEL_CSUM | TUNNEL_KEY);
- build_header(skb, tunnel_hlen, flags, proto,
- tunnel_id_to_key(tun_info->key.tun_id), 0);
+ gre_build_header(skb, tunnel_hlen, flags, proto,
+ tunnel_id_to_key(tun_info->key.tun_id), 0);
df = key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0;
@@ -649,16 +502,14 @@ static netdev_tx_t ipgre_xmit(struct sk_buff *skb,
tnl_params = &tunnel->parms.iph;
}
- skb = gre_handle_offloads(skb, !!(tunnel->parms.o_flags&TUNNEL_CSUM));
- if (IS_ERR(skb))
- goto out;
+ if (gre_handle_offloads(skb, !!(tunnel->parms.o_flags & TUNNEL_CSUM)))
+ goto free_skb;
__gre_xmit(skb, dev, tnl_params, skb->protocol);
return NETDEV_TX_OK;
free_skb:
kfree_skb(skb);
-out:
dev->stats.tx_dropped++;
return NETDEV_TX_OK;
}
@@ -673,9 +524,8 @@ static netdev_tx_t gre_tap_xmit(struct sk_buff *skb,
return NETDEV_TX_OK;
}
- skb = gre_handle_offloads(skb, !!(tunnel->parms.o_flags&TUNNEL_CSUM));
- if (IS_ERR(skb))
- goto out;
+ if (gre_handle_offloads(skb, !!(tunnel->parms.o_flags & TUNNEL_CSUM)))
+ goto free_skb;
if (skb_cow_head(skb, dev->needed_headroom))
goto free_skb;
@@ -685,7 +535,6 @@ static netdev_tx_t gre_tap_xmit(struct sk_buff *skb,
free_skb:
kfree_skb(skb);
-out:
dev->stats.tx_dropped++;
return NETDEV_TX_OK;
}
@@ -711,8 +560,8 @@ static int ipgre_tunnel_ioctl(struct net_device *dev,
if (err)
return err;
- p.i_flags = tnl_flags_to_gre_flags(p.i_flags);
- p.o_flags = tnl_flags_to_gre_flags(p.o_flags);
+ p.i_flags = gre_tnl_flags_to_gre_flags(p.i_flags);
+ p.o_flags = gre_tnl_flags_to_gre_flags(p.o_flags);
if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
return -EFAULT;
@@ -756,7 +605,7 @@ static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
iph = (struct iphdr *)skb_push(skb, t->hlen + sizeof(*iph));
greh = (struct gre_base_hdr *)(iph+1);
- greh->flags = tnl_flags_to_gre_flags(t->parms.o_flags);
+ greh->flags = gre_tnl_flags_to_gre_flags(t->parms.o_flags);
greh->protocol = htons(type);
memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
@@ -857,7 +706,7 @@ static void __gre_tunnel_init(struct net_device *dev)
int t_hlen;
tunnel = netdev_priv(dev);
- tunnel->tun_hlen = ip_gre_calc_hlen(tunnel->parms.o_flags);
+ tunnel->tun_hlen = gre_calc_hlen(tunnel->parms.o_flags);
tunnel->parms.iph.protocol = IPPROTO_GRE;
tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen;
@@ -1180,8 +1029,10 @@ static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
struct ip_tunnel_parm *p = &t->parms;
if (nla_put_u32(skb, IFLA_GRE_LINK, p->link) ||
- nla_put_be16(skb, IFLA_GRE_IFLAGS, tnl_flags_to_gre_flags(p->i_flags)) ||
- nla_put_be16(skb, IFLA_GRE_OFLAGS, tnl_flags_to_gre_flags(p->o_flags)) ||
+ nla_put_be16(skb, IFLA_GRE_IFLAGS,
+ gre_tnl_flags_to_gre_flags(p->i_flags)) ||
+ nla_put_be16(skb, IFLA_GRE_OFLAGS,
+ gre_tnl_flags_to_gre_flags(p->o_flags)) ||
nla_put_be32(skb, IFLA_GRE_IKEY, p->i_key) ||
nla_put_be32(skb, IFLA_GRE_OKEY, p->o_key) ||
nla_put_in_addr(skb, IFLA_GRE_LOCAL, p->iph.saddr) ||
@@ -1266,6 +1117,7 @@ struct net_device *gretap_fb_dev_create(struct net *net, const char *name,
{
struct nlattr *tb[IFLA_MAX + 1];
struct net_device *dev;
+ LIST_HEAD(list_kill);
struct ip_tunnel *t;
int err;
@@ -1281,8 +1133,10 @@ struct net_device *gretap_fb_dev_create(struct net *net, const char *name,
t->collect_md = true;
err = ipgre_newlink(net, dev, tb, NULL);
- if (err < 0)
- goto out;
+ if (err < 0) {
+ free_netdev(dev);
+ return ERR_PTR(err);
+ }
/* openvswitch users expect packet sizes to be unrestricted,
* so set the largest MTU we can.
@@ -1291,9 +1145,14 @@ struct net_device *gretap_fb_dev_create(struct net *net, const char *name,
if (err)
goto out;
+ err = rtnl_configure_link(dev, NULL);
+ if (err < 0)
+ goto out;
+
return dev;
out:
- free_netdev(dev);
+ ip_tunnel_dellink(dev, &list_kill);
+ unregister_netdevice_many(&list_kill);
return ERR_PTR(err);
}
EXPORT_SYMBOL_GPL(gretap_fb_dev_create);
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index e3d782746..4b351af3e 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -218,17 +218,17 @@ static int ip_local_deliver_finish(struct net *net, struct sock *sk, struct sk_b
protocol = -ret;
goto resubmit;
}
- IP_INC_STATS_BH(net, IPSTATS_MIB_INDELIVERS);
+ __IP_INC_STATS(net, IPSTATS_MIB_INDELIVERS);
} else {
if (!raw) {
if (xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
- IP_INC_STATS_BH(net, IPSTATS_MIB_INUNKNOWNPROTOS);
+ __IP_INC_STATS(net, IPSTATS_MIB_INUNKNOWNPROTOS);
icmp_send(skb, ICMP_DEST_UNREACH,
ICMP_PROT_UNREACH, 0);
}
kfree_skb(skb);
} else {
- IP_INC_STATS_BH(net, IPSTATS_MIB_INDELIVERS);
+ __IP_INC_STATS(net, IPSTATS_MIB_INDELIVERS);
consume_skb(skb);
}
}
@@ -273,7 +273,7 @@ static inline bool ip_rcv_options(struct sk_buff *skb)
--ANK (980813)
*/
if (skb_cow(skb, skb_headroom(skb))) {
- IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INDISCARDS);
+ __IP_INC_STATS(dev_net(dev), IPSTATS_MIB_INDISCARDS);
goto drop;
}
@@ -282,7 +282,7 @@ static inline bool ip_rcv_options(struct sk_buff *skb)
opt->optlen = iph->ihl*4 - sizeof(struct iphdr);
if (ip_options_compile(dev_net(dev), opt, skb)) {
- IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INHDRERRORS);
+ __IP_INC_STATS(dev_net(dev), IPSTATS_MIB_INHDRERRORS);
goto drop;
}
@@ -313,6 +313,13 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
const struct iphdr *iph = ip_hdr(skb);
struct rtable *rt;
+ /* if ingress device is enslaved to an L3 master device pass the
+ * skb to its handler for processing
+ */
+ skb = l3mdev_ip_rcv(skb);
+ if (!skb)
+ return NET_RX_SUCCESS;
+
if (net->ipv4.sysctl_ip_early_demux &&
!skb_dst(skb) &&
!skb->sk &&
@@ -337,7 +344,7 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
iph->tos, skb->dev);
if (unlikely(err)) {
if (err == -EXDEV)
- NET_INC_STATS_BH(net, LINUX_MIB_IPRPFILTER);
+ __NET_INC_STATS(net, LINUX_MIB_IPRPFILTER);
goto drop;
}
}
@@ -358,9 +365,9 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
rt = skb_rtable(skb);
if (rt->rt_type == RTN_MULTICAST) {
- IP_UPD_PO_STATS_BH(net, IPSTATS_MIB_INMCAST, skb->len);
+ __IP_UPD_PO_STATS(net, IPSTATS_MIB_INMCAST, skb->len);
} else if (rt->rt_type == RTN_BROADCAST) {
- IP_UPD_PO_STATS_BH(net, IPSTATS_MIB_INBCAST, skb->len);
+ __IP_UPD_PO_STATS(net, IPSTATS_MIB_INBCAST, skb->len);
} else if (skb->pkt_type == PACKET_BROADCAST ||
skb->pkt_type == PACKET_MULTICAST) {
struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
@@ -409,11 +416,11 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
net = dev_net(dev);
- IP_UPD_PO_STATS_BH(net, IPSTATS_MIB_IN, skb->len);
+ __IP_UPD_PO_STATS(net, IPSTATS_MIB_IN, skb->len);
skb = skb_share_check(skb, GFP_ATOMIC);
if (!skb) {
- IP_INC_STATS_BH(net, IPSTATS_MIB_INDISCARDS);
+ __IP_INC_STATS(net, IPSTATS_MIB_INDISCARDS);
goto out;
}
@@ -439,9 +446,9 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
BUILD_BUG_ON(IPSTATS_MIB_ECT1PKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_ECT_1);
BUILD_BUG_ON(IPSTATS_MIB_ECT0PKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_ECT_0);
BUILD_BUG_ON(IPSTATS_MIB_CEPKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_CE);
- IP_ADD_STATS_BH(net,
- IPSTATS_MIB_NOECTPKTS + (iph->tos & INET_ECN_MASK),
- max_t(unsigned short, 1, skb_shinfo(skb)->gso_segs));
+ __IP_ADD_STATS(net,
+ IPSTATS_MIB_NOECTPKTS + (iph->tos & INET_ECN_MASK),
+ max_t(unsigned short, 1, skb_shinfo(skb)->gso_segs));
if (!pskb_may_pull(skb, iph->ihl*4))
goto inhdr_error;
@@ -453,7 +460,7 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
len = ntohs(iph->tot_len);
if (skb->len < len) {
- IP_INC_STATS_BH(net, IPSTATS_MIB_INTRUNCATEDPKTS);
+ __IP_INC_STATS(net, IPSTATS_MIB_INTRUNCATEDPKTS);
goto drop;
} else if (len < (iph->ihl*4))
goto inhdr_error;
@@ -463,7 +470,7 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
* Note this now means skb->len holds ntohs(iph->tot_len).
*/
if (pskb_trim_rcsum(skb, len)) {
- IP_INC_STATS_BH(net, IPSTATS_MIB_INDISCARDS);
+ __IP_INC_STATS(net, IPSTATS_MIB_INDISCARDS);
goto drop;
}
@@ -471,6 +478,7 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
/* Remove any debris in the socket control block */
memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
+ IPCB(skb)->iif = skb->skb_iif;
/* Must drop socket now because of tproxy. */
skb_orphan(skb);
@@ -480,9 +488,9 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
ip_rcv_finish);
csum_error:
- IP_INC_STATS_BH(net, IPSTATS_MIB_CSUMERRORS);
+ __IP_INC_STATS(net, IPSTATS_MIB_CSUMERRORS);
inhdr_error:
- IP_INC_STATS_BH(net, IPSTATS_MIB_INHDRERRORS);
+ __IP_INC_STATS(net, IPSTATS_MIB_INHDRERRORS);
drop:
kfree_skb(skb);
out:
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 124bf0a66..4bd492163 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -271,7 +271,7 @@ static int ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *sk
return dst_output(net, sk, skb);
}
#endif
- mtu = ip_skb_dst_mtu(skb);
+ mtu = ip_skb_dst_mtu(sk, skb);
if (skb_is_gso(skb))
return ip_finish_output_gso(net, sk, skb, mtu);
@@ -541,7 +541,7 @@ int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
iph = ip_hdr(skb);
- mtu = ip_skb_dst_mtu(skb);
+ mtu = ip_skb_dst_mtu(sk, skb);
if (IPCB(skb)->frag_max_size && IPCB(skb)->frag_max_size < mtu)
mtu = IPCB(skb)->frag_max_size;
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 035ad645a..71a52f4d4 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -106,7 +106,8 @@ static void ip_cmsg_recv_checksum(struct msghdr *msg, struct sk_buff *skb,
return;
if (offset != 0)
- csum = csum_sub(csum, csum_partial(skb->data, offset, 0));
+ csum = csum_sub(csum, csum_partial(skb_transport_header(skb),
+ offset, 0));
put_cmsg(msg, SOL_IP, IP_CHECKSUM, sizeof(__wsum), &csum);
}
@@ -219,11 +220,12 @@ void ip_cmsg_recv_offset(struct msghdr *msg, struct sk_buff *skb,
}
EXPORT_SYMBOL(ip_cmsg_recv_offset);
-int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc,
+int ip_cmsg_send(struct sock *sk, struct msghdr *msg, struct ipcm_cookie *ipc,
bool allow_ipv6)
{
int err, val;
struct cmsghdr *cmsg;
+ struct net *net = sock_net(sk);
for_each_cmsghdr(cmsg, msg) {
if (!CMSG_OK(msg, cmsg))
@@ -244,6 +246,13 @@ int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc,
continue;
}
#endif
+ if (cmsg->cmsg_level == SOL_SOCKET) {
+ err = __sock_cmsg_send(sk, msg, cmsg, &ipc->sockc);
+ if (err)
+ return err;
+ continue;
+ }
+
if (cmsg->cmsg_level != SOL_IP)
continue;
switch (cmsg->cmsg_type) {
@@ -502,9 +511,10 @@ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len)
copied = len;
}
err = skb_copy_datagram_msg(skb, 0, msg, copied);
- if (err)
- goto out_free_skb;
-
+ if (unlikely(err)) {
+ kfree_skb(skb);
+ return err;
+ }
sock_recv_timestamp(msg, sk, skb);
serr = SKB_EXT_ERR(skb);
@@ -536,8 +546,7 @@ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len)
msg->msg_flags |= MSG_ERRQUEUE;
err = copied;
-out_free_skb:
- kfree_skb(skb);
+ consume_skb(skb);
out:
return err;
}
@@ -635,7 +644,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
if (err)
break;
old = rcu_dereference_protected(inet->inet_opt,
- sock_owned_by_user(sk));
+ lockdep_sock_is_held(sk));
if (inet->is_icsk) {
struct inet_connection_sock *icsk = inet_csk(sk);
#if IS_ENABLED(CONFIG_IPV6)
@@ -1185,7 +1194,12 @@ void ipv4_pktinfo_prepare(const struct sock *sk, struct sk_buff *skb)
ipv6_sk_rxinfo(sk);
if (prepare && skb_rtable(skb)) {
- pktinfo->ipi_ifindex = inet_iif(skb);
+ /* skb->cb is overloaded: prior to this point it is IP{6}CB
+ * which has interface index (iif) as the first member of the
+ * underlying inet{6}_skb_parm struct. This code then overlays
+ * PKTINFO_SKB_CB and in_pktinfo also has iif as the first
+ * element so the iif is picked up from the prior IPCB
+ */
pktinfo->ipi_spec_dst.s_addr = fib_compute_spec_dst(skb);
} else {
pktinfo->ipi_ifindex = 0;
@@ -1295,7 +1309,7 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname,
struct ip_options_rcu *inet_opt;
inet_opt = rcu_dereference_protected(inet->inet_opt,
- sock_owned_by_user(sk));
+ lockdep_sock_is_held(sk));
opt->optlen = 0;
if (inet_opt)
memcpy(optbuf, &inet_opt->opt,
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index a69ed94bd..d8f5e0a26 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -443,29 +443,6 @@ drop:
}
EXPORT_SYMBOL_GPL(ip_tunnel_rcv);
-static int ip_encap_hlen(struct ip_tunnel_encap *e)
-{
- const struct ip_tunnel_encap_ops *ops;
- int hlen = -EINVAL;
-
- if (e->type == TUNNEL_ENCAP_NONE)
- return 0;
-
- if (e->type >= MAX_IPTUN_ENCAP_OPS)
- return -EINVAL;
-
- rcu_read_lock();
- ops = rcu_dereference(iptun_encaps[e->type]);
- if (likely(ops && ops->encap_hlen))
- hlen = ops->encap_hlen(e);
- rcu_read_unlock();
-
- return hlen;
-}
-
-const struct ip_tunnel_encap_ops __rcu *
- iptun_encaps[MAX_IPTUN_ENCAP_OPS] __read_mostly;
-
int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *ops,
unsigned int num)
{
@@ -519,28 +496,6 @@ int ip_tunnel_encap_setup(struct ip_tunnel *t,
}
EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup);
-int ip_tunnel_encap(struct sk_buff *skb, struct ip_tunnel *t,
- u8 *protocol, struct flowi4 *fl4)
-{
- const struct ip_tunnel_encap_ops *ops;
- int ret = -EINVAL;
-
- if (t->encap.type == TUNNEL_ENCAP_NONE)
- return 0;
-
- if (t->encap.type >= MAX_IPTUN_ENCAP_OPS)
- return -EINVAL;
-
- rcu_read_lock();
- ops = rcu_dereference(iptun_encaps[t->encap.type]);
- if (likely(ops && ops->build_header))
- ret = ops->build_header(skb, &t->encap, protocol, fl4);
- rcu_read_unlock();
-
- return ret;
-}
-EXPORT_SYMBOL(ip_tunnel_encap);
-
static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
struct rtable *rt, __be16 df,
const struct iphdr *inner_iph)
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index 6165f30c4..afd6b5968 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -37,6 +37,7 @@
#include <net/icmp.h>
#include <net/protocol.h>
#include <net/ip_tunnels.h>
+#include <net/ip6_tunnel.h>
#include <net/arp.h>
#include <net/checksum.h>
#include <net/dsfield.h>
@@ -47,6 +48,14 @@
#include <net/rtnetlink.h>
#include <net/dst_metadata.h>
+const struct ip_tunnel_encap_ops __rcu *
+ iptun_encaps[MAX_IPTUN_ENCAP_OPS] __read_mostly;
+EXPORT_SYMBOL(iptun_encaps);
+
+const struct ip6_tnl_encap_ops __rcu *
+ ip6tun_encaps[MAX_IPTUN_ENCAP_OPS] __read_mostly;
+EXPORT_SYMBOL(ip6tun_encaps);
+
void iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb,
__be32 src, __be32 dst, __u8 proto,
__u8 tos, __u8 ttl, __be16 df, bool xnet)
@@ -86,15 +95,15 @@ void iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb,
}
EXPORT_SYMBOL_GPL(iptunnel_xmit);
-int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto,
- bool xnet)
+int __iptunnel_pull_header(struct sk_buff *skb, int hdr_len,
+ __be16 inner_proto, bool raw_proto, bool xnet)
{
if (unlikely(!pskb_may_pull(skb, hdr_len)))
return -ENOMEM;
skb_pull_rcsum(skb, hdr_len);
- if (inner_proto == htons(ETH_P_TEB)) {
+ if (!raw_proto && inner_proto == htons(ETH_P_TEB)) {
struct ethhdr *eh;
if (unlikely(!pskb_may_pull(skb, ETH_HLEN)))
@@ -117,7 +126,7 @@ int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto,
return iptunnel_pull_offloads(skb);
}
-EXPORT_SYMBOL_GPL(iptunnel_pull_header);
+EXPORT_SYMBOL_GPL(__iptunnel_pull_header);
struct metadata_dst *iptunnel_metadata_reply(struct metadata_dst *md,
gfp_t flags)
@@ -146,8 +155,8 @@ struct metadata_dst *iptunnel_metadata_reply(struct metadata_dst *md,
}
EXPORT_SYMBOL_GPL(iptunnel_metadata_reply);
-struct sk_buff *iptunnel_handle_offloads(struct sk_buff *skb,
- int gso_type_mask)
+int iptunnel_handle_offloads(struct sk_buff *skb,
+ int gso_type_mask)
{
int err;
@@ -157,11 +166,11 @@ struct sk_buff *iptunnel_handle_offloads(struct sk_buff *skb,
}
if (skb_is_gso(skb)) {
- err = skb_unclone(skb, GFP_ATOMIC);
+ err = skb_header_unclone(skb, GFP_ATOMIC);
if (unlikely(err))
- goto error;
+ return err;
skb_shinfo(skb)->gso_type |= gso_type_mask;
- return skb;
+ return 0;
}
if (skb->ip_summed != CHECKSUM_PARTIAL) {
@@ -174,10 +183,7 @@ struct sk_buff *iptunnel_handle_offloads(struct sk_buff *skb,
skb->encapsulation = 0;
}
- return skb;
-error:
- kfree_skb(skb);
- return ERR_PTR(err);
+ return 0;
}
EXPORT_SYMBOL_GPL(iptunnel_handle_offloads);
@@ -247,10 +253,10 @@ static int ip_tun_build_state(struct net_device *dev, struct nlattr *attr,
tun_info->key.tun_id = nla_get_be64(tb[LWTUNNEL_IP_ID]);
if (tb[LWTUNNEL_IP_DST])
- tun_info->key.u.ipv4.dst = nla_get_be32(tb[LWTUNNEL_IP_DST]);
+ tun_info->key.u.ipv4.dst = nla_get_in_addr(tb[LWTUNNEL_IP_DST]);
if (tb[LWTUNNEL_IP_SRC])
- tun_info->key.u.ipv4.src = nla_get_be32(tb[LWTUNNEL_IP_SRC]);
+ tun_info->key.u.ipv4.src = nla_get_in_addr(tb[LWTUNNEL_IP_SRC]);
if (tb[LWTUNNEL_IP_TTL])
tun_info->key.ttl = nla_get_u8(tb[LWTUNNEL_IP_TTL]);
@@ -274,9 +280,10 @@ static int ip_tun_fill_encap_info(struct sk_buff *skb,
{
struct ip_tunnel_info *tun_info = lwt_tun_info(lwtstate);
- if (nla_put_be64(skb, LWTUNNEL_IP_ID, tun_info->key.tun_id) ||
- nla_put_be32(skb, LWTUNNEL_IP_DST, tun_info->key.u.ipv4.dst) ||
- nla_put_be32(skb, LWTUNNEL_IP_SRC, tun_info->key.u.ipv4.src) ||
+ if (nla_put_be64(skb, LWTUNNEL_IP_ID, tun_info->key.tun_id,
+ LWTUNNEL_IP_PAD) ||
+ nla_put_in_addr(skb, LWTUNNEL_IP_DST, tun_info->key.u.ipv4.dst) ||
+ nla_put_in_addr(skb, LWTUNNEL_IP_SRC, tun_info->key.u.ipv4.src) ||
nla_put_u8(skb, LWTUNNEL_IP_TOS, tun_info->key.tos) ||
nla_put_u8(skb, LWTUNNEL_IP_TTL, tun_info->key.ttl) ||
nla_put_be16(skb, LWTUNNEL_IP_FLAGS, tun_info->key.tun_flags))
@@ -287,7 +294,7 @@ static int ip_tun_fill_encap_info(struct sk_buff *skb,
static int ip_tun_encap_nlsize(struct lwtunnel_state *lwtstate)
{
- return nla_total_size(8) /* LWTUNNEL_IP_ID */
+ return nla_total_size_64bit(8) /* LWTUNNEL_IP_ID */
+ nla_total_size(4) /* LWTUNNEL_IP_DST */
+ nla_total_size(4) /* LWTUNNEL_IP_SRC */
+ nla_total_size(1) /* LWTUNNEL_IP_TOS */
@@ -369,7 +376,8 @@ static int ip6_tun_fill_encap_info(struct sk_buff *skb,
{
struct ip_tunnel_info *tun_info = lwt_tun_info(lwtstate);
- if (nla_put_be64(skb, LWTUNNEL_IP6_ID, tun_info->key.tun_id) ||
+ if (nla_put_be64(skb, LWTUNNEL_IP6_ID, tun_info->key.tun_id,
+ LWTUNNEL_IP6_PAD) ||
nla_put_in6_addr(skb, LWTUNNEL_IP6_DST, &tun_info->key.u.ipv6.dst) ||
nla_put_in6_addr(skb, LWTUNNEL_IP6_SRC, &tun_info->key.u.ipv6.src) ||
nla_put_u8(skb, LWTUNNEL_IP6_TC, tun_info->key.tos) ||
@@ -382,7 +390,7 @@ static int ip6_tun_fill_encap_info(struct sk_buff *skb,
static int ip6_tun_encap_nlsize(struct lwtunnel_state *lwtstate)
{
- return nla_total_size(8) /* LWTUNNEL_IP6_ID */
+ return nla_total_size_64bit(8) /* LWTUNNEL_IP6_ID */
+ nla_total_size(16) /* LWTUNNEL_IP6_DST */
+ nla_total_size(16) /* LWTUNNEL_IP6_SRC */
+ nla_total_size(1) /* LWTUNNEL_IP6_HOPLIMIT */
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index 2ed9dd2b5..1d71c40ea 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -127,7 +127,9 @@ __be32 ic_myaddr = NONE; /* My IP address */
static __be32 ic_netmask = NONE; /* Netmask for local subnet */
__be32 ic_gateway = NONE; /* Gateway IP address */
-__be32 ic_addrservaddr = NONE; /* IP Address of the IP addresses'server */
+#ifdef IPCONFIG_DYNAMIC
+static __be32 ic_addrservaddr = NONE; /* IP Address of the IP addresses'server */
+#endif
__be32 ic_servaddr = NONE; /* Boot server IP address */
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index ec51d0216..978370132 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -219,9 +219,8 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
if (unlikely(skb->protocol != htons(ETH_P_IP)))
goto tx_error;
- skb = iptunnel_handle_offloads(skb, SKB_GSO_IPIP);
- if (IS_ERR(skb))
- goto out;
+ if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP4))
+ goto tx_error;
skb_set_inner_ipproto(skb, IPPROTO_IPIP);
@@ -230,7 +229,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
tx_error:
kfree_skb(skb);
-out:
+
dev->stats.tx_errors++;
return NETDEV_TX_OK;
}
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index a42dd8021..5ad48ec77 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -2106,7 +2106,7 @@ static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
mfcs.mfcs_packets = c->mfc_un.res.pkt;
mfcs.mfcs_bytes = c->mfc_un.res.bytes;
mfcs.mfcs_wrong_if = c->mfc_un.res.wrong_if;
- if (nla_put(skb, RTA_MFC_STATS, sizeof(mfcs), &mfcs) < 0)
+ if (nla_put_64bit(skb, RTA_MFC_STATS, sizeof(mfcs), &mfcs, RTA_PAD) < 0)
return -EMSGSIZE;
rtm->rtm_type = RTN_MULTICAST;
@@ -2239,7 +2239,7 @@ static size_t mroute_msgsize(bool unresolved, int maxvif)
+ nla_total_size(0) /* RTA_MULTIPATH */
+ maxvif * NLA_ALIGN(sizeof(struct rtnexthop))
/* RTA_MFC_STATS */
- + nla_total_size(sizeof(struct rta_mfc_stats))
+ + nla_total_size_64bit(sizeof(struct rta_mfc_stats))
;
return len;
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 85d60c69b..2033f929a 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -34,27 +34,6 @@ MODULE_LICENSE("GPL");
MODULE_AUTHOR("David S. Miller <davem@redhat.com>");
MODULE_DESCRIPTION("arptables core");
-/*#define DEBUG_ARP_TABLES*/
-/*#define DEBUG_ARP_TABLES_USER*/
-
-#ifdef DEBUG_ARP_TABLES
-#define dprintf(format, args...) pr_debug(format, ## args)
-#else
-#define dprintf(format, args...)
-#endif
-
-#ifdef DEBUG_ARP_TABLES_USER
-#define duprintf(format, args...) pr_debug(format, ## args)
-#else
-#define duprintf(format, args...)
-#endif
-
-#ifdef CONFIG_NETFILTER_DEBUG
-#define ARP_NF_ASSERT(x) WARN_ON(!(x))
-#else
-#define ARP_NF_ASSERT(x)
-#endif
-
void *arpt_alloc_initial_table(const struct xt_table *info)
{
return xt_alloc_initial_table(arpt, ARPT);
@@ -113,36 +92,20 @@ static inline int arp_packet_match(const struct arphdr *arphdr,
#define FWINV(bool, invflg) ((bool) ^ !!(arpinfo->invflags & (invflg)))
if (FWINV((arphdr->ar_op & arpinfo->arpop_mask) != arpinfo->arpop,
- ARPT_INV_ARPOP)) {
- dprintf("ARP operation field mismatch.\n");
- dprintf("ar_op: %04x info->arpop: %04x info->arpop_mask: %04x\n",
- arphdr->ar_op, arpinfo->arpop, arpinfo->arpop_mask);
+ ARPT_INV_ARPOP))
return 0;
- }
if (FWINV((arphdr->ar_hrd & arpinfo->arhrd_mask) != arpinfo->arhrd,
- ARPT_INV_ARPHRD)) {
- dprintf("ARP hardware address format mismatch.\n");
- dprintf("ar_hrd: %04x info->arhrd: %04x info->arhrd_mask: %04x\n",
- arphdr->ar_hrd, arpinfo->arhrd, arpinfo->arhrd_mask);
+ ARPT_INV_ARPHRD))
return 0;
- }
if (FWINV((arphdr->ar_pro & arpinfo->arpro_mask) != arpinfo->arpro,
- ARPT_INV_ARPPRO)) {
- dprintf("ARP protocol address format mismatch.\n");
- dprintf("ar_pro: %04x info->arpro: %04x info->arpro_mask: %04x\n",
- arphdr->ar_pro, arpinfo->arpro, arpinfo->arpro_mask);
+ ARPT_INV_ARPPRO))
return 0;
- }
if (FWINV((arphdr->ar_hln & arpinfo->arhln_mask) != arpinfo->arhln,
- ARPT_INV_ARPHLN)) {
- dprintf("ARP hardware address length mismatch.\n");
- dprintf("ar_hln: %02x info->arhln: %02x info->arhln_mask: %02x\n",
- arphdr->ar_hln, arpinfo->arhln, arpinfo->arhln_mask);
+ ARPT_INV_ARPHLN))
return 0;
- }
src_devaddr = arpptr;
arpptr += dev->addr_len;
@@ -155,49 +118,25 @@ static inline int arp_packet_match(const struct arphdr *arphdr,
if (FWINV(arp_devaddr_compare(&arpinfo->src_devaddr, src_devaddr, dev->addr_len),
ARPT_INV_SRCDEVADDR) ||
FWINV(arp_devaddr_compare(&arpinfo->tgt_devaddr, tgt_devaddr, dev->addr_len),
- ARPT_INV_TGTDEVADDR)) {
- dprintf("Source or target device address mismatch.\n");
-
+ ARPT_INV_TGTDEVADDR))
return 0;
- }
if (FWINV((src_ipaddr & arpinfo->smsk.s_addr) != arpinfo->src.s_addr,
ARPT_INV_SRCIP) ||
FWINV(((tgt_ipaddr & arpinfo->tmsk.s_addr) != arpinfo->tgt.s_addr),
- ARPT_INV_TGTIP)) {
- dprintf("Source or target IP address mismatch.\n");
-
- dprintf("SRC: %pI4. Mask: %pI4. Target: %pI4.%s\n",
- &src_ipaddr,
- &arpinfo->smsk.s_addr,
- &arpinfo->src.s_addr,
- arpinfo->invflags & ARPT_INV_SRCIP ? " (INV)" : "");
- dprintf("TGT: %pI4 Mask: %pI4 Target: %pI4.%s\n",
- &tgt_ipaddr,
- &arpinfo->tmsk.s_addr,
- &arpinfo->tgt.s_addr,
- arpinfo->invflags & ARPT_INV_TGTIP ? " (INV)" : "");
+ ARPT_INV_TGTIP))
return 0;
- }
/* Look for ifname matches. */
ret = ifname_compare(indev, arpinfo->iniface, arpinfo->iniface_mask);
- if (FWINV(ret != 0, ARPT_INV_VIA_IN)) {
- dprintf("VIA in mismatch (%s vs %s).%s\n",
- indev, arpinfo->iniface,
- arpinfo->invflags & ARPT_INV_VIA_IN ? " (INV)" : "");
+ if (FWINV(ret != 0, ARPT_INV_VIA_IN))
return 0;
- }
ret = ifname_compare(outdev, arpinfo->outiface, arpinfo->outiface_mask);
- if (FWINV(ret != 0, ARPT_INV_VIA_OUT)) {
- dprintf("VIA out mismatch (%s vs %s).%s\n",
- outdev, arpinfo->outiface,
- arpinfo->invflags & ARPT_INV_VIA_OUT ? " (INV)" : "");
+ if (FWINV(ret != 0, ARPT_INV_VIA_OUT))
return 0;
- }
return 1;
#undef FWINV
@@ -205,16 +144,10 @@ static inline int arp_packet_match(const struct arphdr *arphdr,
static inline int arp_checkentry(const struct arpt_arp *arp)
{
- if (arp->flags & ~ARPT_F_MASK) {
- duprintf("Unknown flag bits set: %08X\n",
- arp->flags & ~ARPT_F_MASK);
+ if (arp->flags & ~ARPT_F_MASK)
return 0;
- }
- if (arp->invflags & ~ARPT_INV_MASK) {
- duprintf("Unknown invflag bits set: %08X\n",
- arp->invflags & ~ARPT_INV_MASK);
+ if (arp->invflags & ~ARPT_INV_MASK)
return 0;
- }
return 1;
}
@@ -406,11 +339,9 @@ static int mark_source_chains(const struct xt_table_info *newinfo,
= (void *)arpt_get_target_c(e);
int visited = e->comefrom & (1 << hook);
- if (e->comefrom & (1 << NF_ARP_NUMHOOKS)) {
- pr_notice("arptables: loop hook %u pos %u %08X.\n",
- hook, pos, e->comefrom);
+ if (e->comefrom & (1 << NF_ARP_NUMHOOKS))
return 0;
- }
+
e->comefrom
|= ((1 << hook) | (1 << NF_ARP_NUMHOOKS));
@@ -423,12 +354,8 @@ static int mark_source_chains(const struct xt_table_info *newinfo,
if ((strcmp(t->target.u.user.name,
XT_STANDARD_TARGET) == 0) &&
- t->verdict < -NF_MAX_VERDICT - 1) {
- duprintf("mark_source_chains: bad "
- "negative verdict (%i)\n",
- t->verdict);
+ t->verdict < -NF_MAX_VERDICT - 1)
return 0;
- }
/* Return: backtrack through the last
* big jump.
@@ -461,17 +388,7 @@ static int mark_source_chains(const struct xt_table_info *newinfo,
if (strcmp(t->target.u.user.name,
XT_STANDARD_TARGET) == 0 &&
newpos >= 0) {
- if (newpos > newinfo->size -
- sizeof(struct arpt_entry)) {
- duprintf("mark_source_chains: "
- "bad verdict (%i)\n",
- newpos);
- return 0;
- }
-
/* This a jump; chase it. */
- duprintf("Jump rule %u -> %u\n",
- pos, newpos);
e = (struct arpt_entry *)
(entry0 + newpos);
if (!find_jump_target(newinfo, e))
@@ -488,8 +405,7 @@ static int mark_source_chains(const struct xt_table_info *newinfo,
pos = newpos;
}
}
-next:
- duprintf("Finished chain %u\n", hook);
+next: ;
}
return 1;
}
@@ -497,7 +413,6 @@ next:
static inline int check_target(struct arpt_entry *e, const char *name)
{
struct xt_entry_target *t = arpt_get_target(e);
- int ret;
struct xt_tgchk_param par = {
.table = name,
.entryinfo = e,
@@ -507,13 +422,7 @@ static inline int check_target(struct arpt_entry *e, const char *name)
.family = NFPROTO_ARP,
};
- ret = xt_check_target(&par, t->u.target_size - sizeof(*t), 0, false);
- if (ret < 0) {
- duprintf("arp_tables: check failed for `%s'.\n",
- t->u.kernel.target->name);
- return ret;
- }
- return 0;
+ return xt_check_target(&par, t->u.target_size - sizeof(*t), 0, false);
}
static inline int
@@ -521,17 +430,18 @@ find_check_entry(struct arpt_entry *e, const char *name, unsigned int size)
{
struct xt_entry_target *t;
struct xt_target *target;
+ unsigned long pcnt;
int ret;
- e->counters.pcnt = xt_percpu_counter_alloc();
- if (IS_ERR_VALUE(e->counters.pcnt))
+ pcnt = xt_percpu_counter_alloc();
+ if (IS_ERR_VALUE(pcnt))
return -ENOMEM;
+ e->counters.pcnt = pcnt;
t = arpt_get_target(e);
target = xt_request_find_target(NFPROTO_ARP, t->u.user.name,
t->u.user.revision);
if (IS_ERR(target)) {
- duprintf("find_check_entry: `%s' not found\n", t->u.user.name);
ret = PTR_ERR(target);
goto out;
}
@@ -577,17 +487,12 @@ static inline int check_entry_size_and_hooks(struct arpt_entry *e,
if ((unsigned long)e % __alignof__(struct arpt_entry) != 0 ||
(unsigned char *)e + sizeof(struct arpt_entry) >= limit ||
- (unsigned char *)e + e->next_offset > limit) {
- duprintf("Bad offset %p\n", e);
+ (unsigned char *)e + e->next_offset > limit)
return -EINVAL;
- }
if (e->next_offset
- < sizeof(struct arpt_entry) + sizeof(struct xt_entry_target)) {
- duprintf("checking: element %p size %u\n",
- e, e->next_offset);
+ < sizeof(struct arpt_entry) + sizeof(struct xt_entry_target))
return -EINVAL;
- }
if (!arp_checkentry(&e->arp))
return -EINVAL;
@@ -604,12 +509,9 @@ static inline int check_entry_size_and_hooks(struct arpt_entry *e,
if ((unsigned char *)e - base == hook_entries[h])
newinfo->hook_entry[h] = hook_entries[h];
if ((unsigned char *)e - base == underflows[h]) {
- if (!check_underflow(e)) {
- pr_debug("Underflows must be unconditional and "
- "use the STANDARD target with "
- "ACCEPT/DROP\n");
+ if (!check_underflow(e))
return -EINVAL;
- }
+
newinfo->underflow[h] = underflows[h];
}
}
@@ -654,7 +556,6 @@ static int translate_table(struct xt_table_info *newinfo, void *entry0,
newinfo->underflow[i] = 0xFFFFFFFF;
}
- duprintf("translate_table: size %u\n", newinfo->size);
i = 0;
/* Walk through entries, checking offsets. */
@@ -671,31 +572,21 @@ static int translate_table(struct xt_table_info *newinfo, void *entry0,
XT_ERROR_TARGET) == 0)
++newinfo->stacksize;
}
- duprintf("translate_table: ARPT_ENTRY_ITERATE gives %d\n", ret);
if (ret != 0)
return ret;
- if (i != repl->num_entries) {
- duprintf("translate_table: %u not %u entries\n",
- i, repl->num_entries);
+ if (i != repl->num_entries)
return -EINVAL;
- }
/* Check hooks all assigned */
for (i = 0; i < NF_ARP_NUMHOOKS; i++) {
/* Only hooks which are valid */
if (!(repl->valid_hooks & (1 << i)))
continue;
- if (newinfo->hook_entry[i] == 0xFFFFFFFF) {
- duprintf("Invalid hook entry %u %u\n",
- i, repl->hook_entry[i]);
+ if (newinfo->hook_entry[i] == 0xFFFFFFFF)
return -EINVAL;
- }
- if (newinfo->underflow[i] == 0xFFFFFFFF) {
- duprintf("Invalid underflow %u %u\n",
- i, repl->underflow[i]);
+ if (newinfo->underflow[i] == 0xFFFFFFFF)
return -EINVAL;
- }
}
if (!mark_source_chains(newinfo, repl->valid_hooks, entry0))
@@ -903,11 +794,8 @@ static int get_info(struct net *net, void __user *user,
struct xt_table *t;
int ret;
- if (*len != sizeof(struct arpt_getinfo)) {
- duprintf("length %u != %Zu\n", *len,
- sizeof(struct arpt_getinfo));
+ if (*len != sizeof(struct arpt_getinfo))
return -EINVAL;
- }
if (copy_from_user(name, user, sizeof(name)) != 0)
return -EFAULT;
@@ -963,33 +851,25 @@ static int get_entries(struct net *net, struct arpt_get_entries __user *uptr,
struct arpt_get_entries get;
struct xt_table *t;
- if (*len < sizeof(get)) {
- duprintf("get_entries: %u < %Zu\n", *len, sizeof(get));
+ if (*len < sizeof(get))
return -EINVAL;
- }
if (copy_from_user(&get, uptr, sizeof(get)) != 0)
return -EFAULT;
- if (*len != sizeof(struct arpt_get_entries) + get.size) {
- duprintf("get_entries: %u != %Zu\n", *len,
- sizeof(struct arpt_get_entries) + get.size);
+ if (*len != sizeof(struct arpt_get_entries) + get.size)
return -EINVAL;
- }
+
get.name[sizeof(get.name) - 1] = '\0';
t = xt_find_table_lock(net, NFPROTO_ARP, get.name);
if (!IS_ERR_OR_NULL(t)) {
const struct xt_table_info *private = t->private;
- duprintf("t->private->number = %u\n",
- private->number);
if (get.size == private->size)
ret = copy_entries_to_user(private->size,
t, uptr->entrytable);
- else {
- duprintf("get_entries: I've got %u not %u!\n",
- private->size, get.size);
+ else
ret = -EAGAIN;
- }
+
module_put(t->me);
xt_table_unlock(t);
} else
@@ -1027,8 +907,6 @@ static int __do_replace(struct net *net, const char *name,
/* You lied! */
if (valid_hooks != t->valid_hooks) {
- duprintf("Valid hook crap: %08X vs %08X\n",
- valid_hooks, t->valid_hooks);
ret = -EINVAL;
goto put_module;
}
@@ -1038,8 +916,6 @@ static int __do_replace(struct net *net, const char *name,
goto put_module;
/* Update module usage count based on number of rules */
- duprintf("do_replace: oldnum=%u, initnum=%u, newnum=%u\n",
- oldinfo->number, oldinfo->initial_entries, newinfo->number);
if ((oldinfo->number > oldinfo->initial_entries) ||
(newinfo->number <= oldinfo->initial_entries))
module_put(t->me);
@@ -1109,8 +985,6 @@ static int do_replace(struct net *net, const void __user *user,
if (ret != 0)
goto free_newinfo;
- duprintf("arp_tables: Translated table\n");
-
ret = __do_replace(net, tmp.name, tmp.valid_hooks, newinfo,
tmp.num_counters, tmp.counters);
if (ret)
@@ -1208,20 +1082,14 @@ check_compat_entry_size_and_hooks(struct compat_arpt_entry *e,
unsigned int entry_offset;
int ret, off;
- duprintf("check_compat_entry_size_and_hooks %p\n", e);
if ((unsigned long)e % __alignof__(struct compat_arpt_entry) != 0 ||
(unsigned char *)e + sizeof(struct compat_arpt_entry) >= limit ||
- (unsigned char *)e + e->next_offset > limit) {
- duprintf("Bad offset %p, limit = %p\n", e, limit);
+ (unsigned char *)e + e->next_offset > limit)
return -EINVAL;
- }
if (e->next_offset < sizeof(struct compat_arpt_entry) +
- sizeof(struct compat_xt_entry_target)) {
- duprintf("checking: element %p size %u\n",
- e, e->next_offset);
+ sizeof(struct compat_xt_entry_target))
return -EINVAL;
- }
if (!arp_checkentry(&e->arp))
return -EINVAL;
@@ -1238,8 +1106,6 @@ check_compat_entry_size_and_hooks(struct compat_arpt_entry *e,
target = xt_request_find_target(NFPROTO_ARP, t->u.user.name,
t->u.user.revision);
if (IS_ERR(target)) {
- duprintf("check_compat_entry_size_and_hooks: `%s' not found\n",
- t->u.user.name);
ret = PTR_ERR(target);
goto out;
}
@@ -1309,7 +1175,6 @@ static int translate_compat_table(struct xt_table_info **pinfo,
size = compatr->size;
info->number = compatr->num_entries;
- duprintf("translate_compat_table: size %u\n", info->size);
j = 0;
xt_compat_lock(NFPROTO_ARP);
xt_compat_init_offsets(NFPROTO_ARP, compatr->num_entries);
@@ -1324,11 +1189,8 @@ static int translate_compat_table(struct xt_table_info **pinfo,
}
ret = -EINVAL;
- if (j != compatr->num_entries) {
- duprintf("translate_compat_table: %u not %u entries\n",
- j, compatr->num_entries);
+ if (j != compatr->num_entries)
goto out_unlock;
- }
ret = -ENOMEM;
newinfo = xt_alloc_table_info(size);
@@ -1398,8 +1260,6 @@ static int compat_do_replace(struct net *net, void __user *user,
return -EFAULT;
/* overflow check */
- if (tmp.size >= INT_MAX / num_possible_cpus())
- return -ENOMEM;
if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters))
return -ENOMEM;
if (tmp.num_counters == 0)
@@ -1421,8 +1281,6 @@ static int compat_do_replace(struct net *net, void __user *user,
if (ret != 0)
goto free_newinfo;
- duprintf("compat_do_replace: Translated table\n");
-
ret = __do_replace(net, tmp.name, tmp.valid_hooks, newinfo,
tmp.num_counters, compat_ptr(tmp.counters));
if (ret)
@@ -1455,7 +1313,6 @@ static int compat_do_arpt_set_ctl(struct sock *sk, int cmd, void __user *user,
break;
default:
- duprintf("do_arpt_set_ctl: unknown request %i\n", cmd);
ret = -EINVAL;
}
@@ -1538,17 +1395,13 @@ static int compat_get_entries(struct net *net,
struct compat_arpt_get_entries get;
struct xt_table *t;
- if (*len < sizeof(get)) {
- duprintf("compat_get_entries: %u < %zu\n", *len, sizeof(get));
+ if (*len < sizeof(get))
return -EINVAL;
- }
if (copy_from_user(&get, uptr, sizeof(get)) != 0)
return -EFAULT;
- if (*len != sizeof(struct compat_arpt_get_entries) + get.size) {
- duprintf("compat_get_entries: %u != %zu\n",
- *len, sizeof(get) + get.size);
+ if (*len != sizeof(struct compat_arpt_get_entries) + get.size)
return -EINVAL;
- }
+
get.name[sizeof(get.name) - 1] = '\0';
xt_compat_lock(NFPROTO_ARP);
@@ -1557,16 +1410,13 @@ static int compat_get_entries(struct net *net,
const struct xt_table_info *private = t->private;
struct xt_table_info info;
- duprintf("t->private->number = %u\n", private->number);
ret = compat_table_info(private, &info);
if (!ret && get.size == info.size) {
ret = compat_copy_entries_to_user(private->size,
t, uptr->entrytable);
- } else if (!ret) {
- duprintf("compat_get_entries: I've got %u not %u!\n",
- private->size, get.size);
+ } else if (!ret)
ret = -EAGAIN;
- }
+
xt_compat_flush_offsets(NFPROTO_ARP);
module_put(t->me);
xt_table_unlock(t);
@@ -1618,7 +1468,6 @@ static int do_arpt_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned
break;
default:
- duprintf("do_arpt_set_ctl: unknown request %i\n", cmd);
ret = -EINVAL;
}
@@ -1661,7 +1510,6 @@ static int do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len
}
default:
- duprintf("do_arpt_get_ctl: unknown request %i\n", cmd);
ret = -EINVAL;
}
@@ -1706,7 +1554,6 @@ int arpt_register_table(struct net *net,
memcpy(loc_cpu_entry, repl->entries, repl->size);
ret = translate_table(newinfo, loc_cpu_entry, repl);
- duprintf("arpt_register_table: translate table gives %d\n", ret);
if (ret != 0)
goto out_free;
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 0984ea3fc..54906e0e8 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -35,34 +35,12 @@ MODULE_LICENSE("GPL");
MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
MODULE_DESCRIPTION("IPv4 packet filter");
-/*#define DEBUG_IP_FIREWALL*/
-/*#define DEBUG_ALLOW_ALL*/ /* Useful for remote debugging */
-/*#define DEBUG_IP_FIREWALL_USER*/
-
-#ifdef DEBUG_IP_FIREWALL
-#define dprintf(format, args...) pr_info(format , ## args)
-#else
-#define dprintf(format, args...)
-#endif
-
-#ifdef DEBUG_IP_FIREWALL_USER
-#define duprintf(format, args...) pr_info(format , ## args)
-#else
-#define duprintf(format, args...)
-#endif
-
#ifdef CONFIG_NETFILTER_DEBUG
#define IP_NF_ASSERT(x) WARN_ON(!(x))
#else
#define IP_NF_ASSERT(x)
#endif
-#if 0
-/* All the better to debug you with... */
-#define static
-#define inline
-#endif
-
void *ipt_alloc_initial_table(const struct xt_table *info)
{
return xt_alloc_initial_table(ipt, IPT);
@@ -85,52 +63,28 @@ ip_packet_match(const struct iphdr *ip,
if (FWINV((ip->saddr&ipinfo->smsk.s_addr) != ipinfo->src.s_addr,
IPT_INV_SRCIP) ||
FWINV((ip->daddr&ipinfo->dmsk.s_addr) != ipinfo->dst.s_addr,
- IPT_INV_DSTIP)) {
- dprintf("Source or dest mismatch.\n");
-
- dprintf("SRC: %pI4. Mask: %pI4. Target: %pI4.%s\n",
- &ip->saddr, &ipinfo->smsk.s_addr, &ipinfo->src.s_addr,
- ipinfo->invflags & IPT_INV_SRCIP ? " (INV)" : "");
- dprintf("DST: %pI4 Mask: %pI4 Target: %pI4.%s\n",
- &ip->daddr, &ipinfo->dmsk.s_addr, &ipinfo->dst.s_addr,
- ipinfo->invflags & IPT_INV_DSTIP ? " (INV)" : "");
+ IPT_INV_DSTIP))
return false;
- }
ret = ifname_compare_aligned(indev, ipinfo->iniface, ipinfo->iniface_mask);
- if (FWINV(ret != 0, IPT_INV_VIA_IN)) {
- dprintf("VIA in mismatch (%s vs %s).%s\n",
- indev, ipinfo->iniface,
- ipinfo->invflags & IPT_INV_VIA_IN ? " (INV)" : "");
+ if (FWINV(ret != 0, IPT_INV_VIA_IN))
return false;
- }
ret = ifname_compare_aligned(outdev, ipinfo->outiface, ipinfo->outiface_mask);
- if (FWINV(ret != 0, IPT_INV_VIA_OUT)) {
- dprintf("VIA out mismatch (%s vs %s).%s\n",
- outdev, ipinfo->outiface,
- ipinfo->invflags & IPT_INV_VIA_OUT ? " (INV)" : "");
+ if (FWINV(ret != 0, IPT_INV_VIA_OUT))
return false;
- }
/* Check specific protocol */
if (ipinfo->proto &&
- FWINV(ip->protocol != ipinfo->proto, IPT_INV_PROTO)) {
- dprintf("Packet protocol %hi does not match %hi.%s\n",
- ip->protocol, ipinfo->proto,
- ipinfo->invflags & IPT_INV_PROTO ? " (INV)" : "");
+ FWINV(ip->protocol != ipinfo->proto, IPT_INV_PROTO))
return false;
- }
/* If we have a fragment rule but the packet is not a fragment
* then we return zero */
- if (FWINV((ipinfo->flags&IPT_F_FRAG) && !isfrag, IPT_INV_FRAG)) {
- dprintf("Fragment rule but not fragment.%s\n",
- ipinfo->invflags & IPT_INV_FRAG ? " (INV)" : "");
+ if (FWINV((ipinfo->flags&IPT_F_FRAG) && !isfrag, IPT_INV_FRAG))
return false;
- }
return true;
}
@@ -138,16 +92,10 @@ ip_packet_match(const struct iphdr *ip,
static bool
ip_checkentry(const struct ipt_ip *ip)
{
- if (ip->flags & ~IPT_F_MASK) {
- duprintf("Unknown flag bits set: %08X\n",
- ip->flags & ~IPT_F_MASK);
+ if (ip->flags & ~IPT_F_MASK)
return false;
- }
- if (ip->invflags & ~IPT_INV_MASK) {
- duprintf("Unknown invflag bits set: %08X\n",
- ip->invflags & ~IPT_INV_MASK);
+ if (ip->invflags & ~IPT_INV_MASK)
return false;
- }
return true;
}
@@ -346,10 +294,6 @@ ipt_do_table(struct sk_buff *skb,
e = get_entry(table_base, private->hook_entry[hook]);
- pr_debug("Entering %s(hook %u), UF %p\n",
- table->name, hook,
- get_entry(table_base, private->underflow[hook]));
-
do {
const struct xt_entry_target *t;
const struct xt_entry_match *ematch;
@@ -396,22 +340,15 @@ ipt_do_table(struct sk_buff *skb,
if (stackidx == 0) {
e = get_entry(table_base,
private->underflow[hook]);
- pr_debug("Underflow (this is normal) "
- "to %p\n", e);
} else {
e = jumpstack[--stackidx];
- pr_debug("Pulled %p out from pos %u\n",
- e, stackidx);
e = ipt_next_entry(e);
}
continue;
}
if (table_base + v != ipt_next_entry(e) &&
- !(e->ip.flags & IPT_F_GOTO)) {
+ !(e->ip.flags & IPT_F_GOTO))
jumpstack[stackidx++] = e;
- pr_debug("Pushed %p into pos %u\n",
- e, stackidx - 1);
- }
e = get_entry(table_base, v);
continue;
@@ -429,18 +366,13 @@ ipt_do_table(struct sk_buff *skb,
/* Verdict */
break;
} while (!acpar.hotdrop);
- pr_debug("Exiting %s; sp at %u\n", __func__, stackidx);
xt_write_recseq_end(addend);
local_bh_enable();
-#ifdef DEBUG_ALLOW_ALL
- return NF_ACCEPT;
-#else
if (acpar.hotdrop)
return NF_DROP;
else return verdict;
-#endif
}
static bool find_jump_target(const struct xt_table_info *t,
@@ -480,11 +412,9 @@ mark_source_chains(const struct xt_table_info *newinfo,
= (void *)ipt_get_target_c(e);
int visited = e->comefrom & (1 << hook);
- if (e->comefrom & (1 << NF_INET_NUMHOOKS)) {
- pr_err("iptables: loop hook %u pos %u %08X.\n",
- hook, pos, e->comefrom);
+ if (e->comefrom & (1 << NF_INET_NUMHOOKS))
return 0;
- }
+
e->comefrom |= ((1 << hook) | (1 << NF_INET_NUMHOOKS));
/* Unconditional return/END. */
@@ -496,26 +426,13 @@ mark_source_chains(const struct xt_table_info *newinfo,
if ((strcmp(t->target.u.user.name,
XT_STANDARD_TARGET) == 0) &&
- t->verdict < -NF_MAX_VERDICT - 1) {
- duprintf("mark_source_chains: bad "
- "negative verdict (%i)\n",
- t->verdict);
+ t->verdict < -NF_MAX_VERDICT - 1)
return 0;
- }
/* Return: backtrack through the last
big jump. */
do {
e->comefrom ^= (1<<NF_INET_NUMHOOKS);
-#ifdef DEBUG_IP_FIREWALL_USER
- if (e->comefrom
- & (1 << NF_INET_NUMHOOKS)) {
- duprintf("Back unset "
- "on hook %u "
- "rule %u\n",
- hook, pos);
- }
-#endif
oldpos = pos;
pos = e->counters.pcnt;
e->counters.pcnt = 0;
@@ -542,16 +459,7 @@ mark_source_chains(const struct xt_table_info *newinfo,
if (strcmp(t->target.u.user.name,
XT_STANDARD_TARGET) == 0 &&
newpos >= 0) {
- if (newpos > newinfo->size -
- sizeof(struct ipt_entry)) {
- duprintf("mark_source_chains: "
- "bad verdict (%i)\n",
- newpos);
- return 0;
- }
/* This a jump; chase it. */
- duprintf("Jump rule %u -> %u\n",
- pos, newpos);
e = (struct ipt_entry *)
(entry0 + newpos);
if (!find_jump_target(newinfo, e))
@@ -568,8 +476,7 @@ mark_source_chains(const struct xt_table_info *newinfo,
pos = newpos;
}
}
-next:
- duprintf("Finished chain %u\n", hook);
+next: ;
}
return 1;
}
@@ -591,18 +498,12 @@ static int
check_match(struct xt_entry_match *m, struct xt_mtchk_param *par)
{
const struct ipt_ip *ip = par->entryinfo;
- int ret;
par->match = m->u.kernel.match;
par->matchinfo = m->data;
- ret = xt_check_match(par, m->u.match_size - sizeof(*m),
- ip->proto, ip->invflags & IPT_INV_PROTO);
- if (ret < 0) {
- duprintf("check failed for `%s'.\n", par->match->name);
- return ret;
- }
- return 0;
+ return xt_check_match(par, m->u.match_size - sizeof(*m),
+ ip->proto, ip->invflags & IPT_INV_PROTO);
}
static int
@@ -613,10 +514,8 @@ find_check_match(struct xt_entry_match *m, struct xt_mtchk_param *par)
match = xt_request_find_match(NFPROTO_IPV4, m->u.user.name,
m->u.user.revision);
- if (IS_ERR(match)) {
- duprintf("find_check_match: `%s' not found\n", m->u.user.name);
+ if (IS_ERR(match))
return PTR_ERR(match);
- }
m->u.kernel.match = match;
ret = check_match(m, par);
@@ -641,16 +540,9 @@ static int check_target(struct ipt_entry *e, struct net *net, const char *name)
.hook_mask = e->comefrom,
.family = NFPROTO_IPV4,
};
- int ret;
- ret = xt_check_target(&par, t->u.target_size - sizeof(*t),
- e->ip.proto, e->ip.invflags & IPT_INV_PROTO);
- if (ret < 0) {
- duprintf("check failed for `%s'.\n",
- t->u.kernel.target->name);
- return ret;
- }
- return 0;
+ return xt_check_target(&par, t->u.target_size - sizeof(*t),
+ e->ip.proto, e->ip.invflags & IPT_INV_PROTO);
}
static int
@@ -663,10 +555,12 @@ find_check_entry(struct ipt_entry *e, struct net *net, const char *name,
unsigned int j;
struct xt_mtchk_param mtpar;
struct xt_entry_match *ematch;
+ unsigned long pcnt;
- e->counters.pcnt = xt_percpu_counter_alloc();
- if (IS_ERR_VALUE(e->counters.pcnt))
+ pcnt = xt_percpu_counter_alloc();
+ if (IS_ERR_VALUE(pcnt))
return -ENOMEM;
+ e->counters.pcnt = pcnt;
j = 0;
mtpar.net = net;
@@ -685,7 +579,6 @@ find_check_entry(struct ipt_entry *e, struct net *net, const char *name,
target = xt_request_find_target(NFPROTO_IPV4, t->u.user.name,
t->u.user.revision);
if (IS_ERR(target)) {
- duprintf("find_check_entry: `%s' not found\n", t->u.user.name);
ret = PTR_ERR(target);
goto cleanup_matches;
}
@@ -739,17 +632,12 @@ check_entry_size_and_hooks(struct ipt_entry *e,
if ((unsigned long)e % __alignof__(struct ipt_entry) != 0 ||
(unsigned char *)e + sizeof(struct ipt_entry) >= limit ||
- (unsigned char *)e + e->next_offset > limit) {
- duprintf("Bad offset %p\n", e);
+ (unsigned char *)e + e->next_offset > limit)
return -EINVAL;
- }
if (e->next_offset
- < sizeof(struct ipt_entry) + sizeof(struct xt_entry_target)) {
- duprintf("checking: element %p size %u\n",
- e, e->next_offset);
+ < sizeof(struct ipt_entry) + sizeof(struct xt_entry_target))
return -EINVAL;
- }
if (!ip_checkentry(&e->ip))
return -EINVAL;
@@ -766,12 +654,9 @@ check_entry_size_and_hooks(struct ipt_entry *e,
if ((unsigned char *)e - base == hook_entries[h])
newinfo->hook_entry[h] = hook_entries[h];
if ((unsigned char *)e - base == underflows[h]) {
- if (!check_underflow(e)) {
- pr_debug("Underflows must be unconditional and "
- "use the STANDARD target with "
- "ACCEPT/DROP\n");
+ if (!check_underflow(e))
return -EINVAL;
- }
+
newinfo->underflow[h] = underflows[h];
}
}
@@ -823,7 +708,6 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0,
newinfo->underflow[i] = 0xFFFFFFFF;
}
- duprintf("translate_table: size %u\n", newinfo->size);
i = 0;
/* Walk through entries, checking offsets. */
xt_entry_foreach(iter, entry0, newinfo->size) {
@@ -840,27 +724,18 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0,
++newinfo->stacksize;
}
- if (i != repl->num_entries) {
- duprintf("translate_table: %u not %u entries\n",
- i, repl->num_entries);
+ if (i != repl->num_entries)
return -EINVAL;
- }
/* Check hooks all assigned */
for (i = 0; i < NF_INET_NUMHOOKS; i++) {
/* Only hooks which are valid */
if (!(repl->valid_hooks & (1 << i)))
continue;
- if (newinfo->hook_entry[i] == 0xFFFFFFFF) {
- duprintf("Invalid hook entry %u %u\n",
- i, repl->hook_entry[i]);
+ if (newinfo->hook_entry[i] == 0xFFFFFFFF)
return -EINVAL;
- }
- if (newinfo->underflow[i] == 0xFFFFFFFF) {
- duprintf("Invalid underflow %u %u\n",
- i, repl->underflow[i]);
+ if (newinfo->underflow[i] == 0xFFFFFFFF)
return -EINVAL;
- }
}
if (!mark_source_chains(newinfo, repl->valid_hooks, entry0))
@@ -1088,11 +963,8 @@ static int get_info(struct net *net, void __user *user,
struct xt_table *t;
int ret;
- if (*len != sizeof(struct ipt_getinfo)) {
- duprintf("length %u != %zu\n", *len,
- sizeof(struct ipt_getinfo));
+ if (*len != sizeof(struct ipt_getinfo))
return -EINVAL;
- }
if (copy_from_user(name, user, sizeof(name)) != 0)
return -EFAULT;
@@ -1150,31 +1022,23 @@ get_entries(struct net *net, struct ipt_get_entries __user *uptr,
struct ipt_get_entries get;
struct xt_table *t;
- if (*len < sizeof(get)) {
- duprintf("get_entries: %u < %zu\n", *len, sizeof(get));
+ if (*len < sizeof(get))
return -EINVAL;
- }
if (copy_from_user(&get, uptr, sizeof(get)) != 0)
return -EFAULT;
- if (*len != sizeof(struct ipt_get_entries) + get.size) {
- duprintf("get_entries: %u != %zu\n",
- *len, sizeof(get) + get.size);
+ if (*len != sizeof(struct ipt_get_entries) + get.size)
return -EINVAL;
- }
get.name[sizeof(get.name) - 1] = '\0';
t = xt_find_table_lock(net, AF_INET, get.name);
if (!IS_ERR_OR_NULL(t)) {
const struct xt_table_info *private = t->private;
- duprintf("t->private->number = %u\n", private->number);
if (get.size == private->size)
ret = copy_entries_to_user(private->size,
t, uptr->entrytable);
- else {
- duprintf("get_entries: I've got %u not %u!\n",
- private->size, get.size);
+ else
ret = -EAGAIN;
- }
+
module_put(t->me);
xt_table_unlock(t);
} else
@@ -1210,8 +1074,6 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks,
/* You lied! */
if (valid_hooks != t->valid_hooks) {
- duprintf("Valid hook crap: %08X vs %08X\n",
- valid_hooks, t->valid_hooks);
ret = -EINVAL;
goto put_module;
}
@@ -1221,8 +1083,6 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks,
goto put_module;
/* Update module usage count based on number of rules */
- duprintf("do_replace: oldnum=%u, initnum=%u, newnum=%u\n",
- oldinfo->number, oldinfo->initial_entries, newinfo->number);
if ((oldinfo->number > oldinfo->initial_entries) ||
(newinfo->number <= oldinfo->initial_entries))
module_put(t->me);
@@ -1291,8 +1151,6 @@ do_replace(struct net *net, const void __user *user, unsigned int len)
if (ret != 0)
goto free_newinfo;
- duprintf("Translated table\n");
-
ret = __do_replace(net, tmp.name, tmp.valid_hooks, newinfo,
tmp.num_counters, tmp.counters);
if (ret)
@@ -1418,11 +1276,9 @@ compat_find_calc_match(struct xt_entry_match *m,
match = xt_request_find_match(NFPROTO_IPV4, m->u.user.name,
m->u.user.revision);
- if (IS_ERR(match)) {
- duprintf("compat_check_calc_match: `%s' not found\n",
- m->u.user.name);
+ if (IS_ERR(match))
return PTR_ERR(match);
- }
+
m->u.kernel.match = match;
*size += xt_compat_match_offset(match);
return 0;
@@ -1454,20 +1310,14 @@ check_compat_entry_size_and_hooks(struct compat_ipt_entry *e,
unsigned int j;
int ret, off;
- duprintf("check_compat_entry_size_and_hooks %p\n", e);
if ((unsigned long)e % __alignof__(struct compat_ipt_entry) != 0 ||
(unsigned char *)e + sizeof(struct compat_ipt_entry) >= limit ||
- (unsigned char *)e + e->next_offset > limit) {
- duprintf("Bad offset %p, limit = %p\n", e, limit);
+ (unsigned char *)e + e->next_offset > limit)
return -EINVAL;
- }
if (e->next_offset < sizeof(struct compat_ipt_entry) +
- sizeof(struct compat_xt_entry_target)) {
- duprintf("checking: element %p size %u\n",
- e, e->next_offset);
+ sizeof(struct compat_xt_entry_target))
return -EINVAL;
- }
if (!ip_checkentry(&e->ip))
return -EINVAL;
@@ -1491,8 +1341,6 @@ check_compat_entry_size_and_hooks(struct compat_ipt_entry *e,
target = xt_request_find_target(NFPROTO_IPV4, t->u.user.name,
t->u.user.revision);
if (IS_ERR(target)) {
- duprintf("check_compat_entry_size_and_hooks: `%s' not found\n",
- t->u.user.name);
ret = PTR_ERR(target);
goto release_matches;
}
@@ -1574,7 +1422,6 @@ translate_compat_table(struct net *net,
size = compatr->size;
info->number = compatr->num_entries;
- duprintf("translate_compat_table: size %u\n", info->size);
j = 0;
xt_compat_lock(AF_INET);
xt_compat_init_offsets(AF_INET, compatr->num_entries);
@@ -1589,11 +1436,8 @@ translate_compat_table(struct net *net,
}
ret = -EINVAL;
- if (j != compatr->num_entries) {
- duprintf("translate_compat_table: %u not %u entries\n",
- j, compatr->num_entries);
+ if (j != compatr->num_entries)
goto out_unlock;
- }
ret = -ENOMEM;
newinfo = xt_alloc_table_info(size);
@@ -1668,8 +1512,6 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len)
return -EFAULT;
/* overflow check */
- if (tmp.size >= INT_MAX / num_possible_cpus())
- return -ENOMEM;
if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters))
return -ENOMEM;
if (tmp.num_counters == 0)
@@ -1692,8 +1534,6 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len)
if (ret != 0)
goto free_newinfo;
- duprintf("compat_do_replace: Translated table\n");
-
ret = __do_replace(net, tmp.name, tmp.valid_hooks, newinfo,
tmp.num_counters, compat_ptr(tmp.counters));
if (ret)
@@ -1727,7 +1567,6 @@ compat_do_ipt_set_ctl(struct sock *sk, int cmd, void __user *user,
break;
default:
- duprintf("do_ipt_set_ctl: unknown request %i\n", cmd);
ret = -EINVAL;
}
@@ -1777,19 +1616,15 @@ compat_get_entries(struct net *net, struct compat_ipt_get_entries __user *uptr,
struct compat_ipt_get_entries get;
struct xt_table *t;
- if (*len < sizeof(get)) {
- duprintf("compat_get_entries: %u < %zu\n", *len, sizeof(get));
+ if (*len < sizeof(get))
return -EINVAL;
- }
if (copy_from_user(&get, uptr, sizeof(get)) != 0)
return -EFAULT;
- if (*len != sizeof(struct compat_ipt_get_entries) + get.size) {
- duprintf("compat_get_entries: %u != %zu\n",
- *len, sizeof(get) + get.size);
+ if (*len != sizeof(struct compat_ipt_get_entries) + get.size)
return -EINVAL;
- }
+
get.name[sizeof(get.name) - 1] = '\0';
xt_compat_lock(AF_INET);
@@ -1797,16 +1632,13 @@ compat_get_entries(struct net *net, struct compat_ipt_get_entries __user *uptr,
if (!IS_ERR_OR_NULL(t)) {
const struct xt_table_info *private = t->private;
struct xt_table_info info;
- duprintf("t->private->number = %u\n", private->number);
ret = compat_table_info(private, &info);
- if (!ret && get.size == info.size) {
+ if (!ret && get.size == info.size)
ret = compat_copy_entries_to_user(private->size,
t, uptr->entrytable);
- } else if (!ret) {
- duprintf("compat_get_entries: I've got %u not %u!\n",
- private->size, get.size);
+ else if (!ret)
ret = -EAGAIN;
- }
+
xt_compat_flush_offsets(AF_INET);
module_put(t->me);
xt_table_unlock(t);
@@ -1859,7 +1691,6 @@ do_ipt_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
break;
default:
- duprintf("do_ipt_set_ctl: unknown request %i\n", cmd);
ret = -EINVAL;
}
@@ -1911,7 +1742,6 @@ do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
}
default:
- duprintf("do_ipt_get_ctl: unknown request %i\n", cmd);
ret = -EINVAL;
}
@@ -2013,7 +1843,6 @@ icmp_match(const struct sk_buff *skb, struct xt_action_param *par)
/* We've been asked to examine this packet, and we
* can't. Hence, no choice but to drop.
*/
- duprintf("Dropping evil ICMP tinygram.\n");
par->hotdrop = true;
return false;
}
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
index e3c46e8e2..ae1a71a97 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
@@ -360,7 +360,7 @@ static int ipv4_init_net(struct net *net)
in->ctl_table[0].data = &nf_conntrack_max;
in->ctl_table[1].data = &net->ct.count;
- in->ctl_table[2].data = &net->ct.htable_size;
+ in->ctl_table[2].data = &nf_conntrack_htable_size;
in->ctl_table[3].data = &net->ct.sysctl_checksum;
in->ctl_table[4].data = &net->ct.sysctl_log_invalid;
#endif
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
index f0dfe92a0..c6f3c406f 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
@@ -31,15 +31,14 @@ struct ct_iter_state {
static struct hlist_nulls_node *ct_get_first(struct seq_file *seq)
{
- struct net *net = seq_file_net(seq);
struct ct_iter_state *st = seq->private;
struct hlist_nulls_node *n;
for (st->bucket = 0;
- st->bucket < net->ct.htable_size;
+ st->bucket < nf_conntrack_htable_size;
st->bucket++) {
n = rcu_dereference(
- hlist_nulls_first_rcu(&net->ct.hash[st->bucket]));
+ hlist_nulls_first_rcu(&nf_conntrack_hash[st->bucket]));
if (!is_a_nulls(n))
return n;
}
@@ -49,17 +48,16 @@ static struct hlist_nulls_node *ct_get_first(struct seq_file *seq)
static struct hlist_nulls_node *ct_get_next(struct seq_file *seq,
struct hlist_nulls_node *head)
{
- struct net *net = seq_file_net(seq);
struct ct_iter_state *st = seq->private;
head = rcu_dereference(hlist_nulls_next_rcu(head));
while (is_a_nulls(head)) {
if (likely(get_nulls_value(head) == st->bucket)) {
- if (++st->bucket >= net->ct.htable_size)
+ if (++st->bucket >= nf_conntrack_htable_size)
return NULL;
}
head = rcu_dereference(
- hlist_nulls_first_rcu(&net->ct.hash[st->bucket]));
+ hlist_nulls_first_rcu(&nf_conntrack_hash[st->bucket]));
}
return head;
}
@@ -114,6 +112,23 @@ static inline void ct_show_secctx(struct seq_file *s, const struct nf_conn *ct)
}
#endif
+static bool ct_seq_should_skip(const struct nf_conn *ct,
+ const struct net *net,
+ const struct nf_conntrack_tuple_hash *hash)
+{
+ /* we only want to print DIR_ORIGINAL */
+ if (NF_CT_DIRECTION(hash))
+ return true;
+
+ if (nf_ct_l3num(ct) != AF_INET)
+ return true;
+
+ if (!net_eq(nf_ct_net(ct), net))
+ return true;
+
+ return false;
+}
+
static int ct_seq_show(struct seq_file *s, void *v)
{
struct nf_conntrack_tuple_hash *hash = v;
@@ -123,14 +138,15 @@ static int ct_seq_show(struct seq_file *s, void *v)
int ret = 0;
NF_CT_ASSERT(ct);
- if (unlikely(!atomic_inc_not_zero(&ct->ct_general.use)))
+ if (ct_seq_should_skip(ct, seq_file_net(s), hash))
return 0;
+ if (unlikely(!atomic_inc_not_zero(&ct->ct_general.use)))
+ return 0;
- /* we only want to print DIR_ORIGINAL */
- if (NF_CT_DIRECTION(hash))
- goto release;
- if (nf_ct_l3num(ct) != AF_INET)
+ /* check if we raced w. object reuse */
+ if (!nf_ct_is_confirmed(ct) ||
+ ct_seq_should_skip(ct, seq_file_net(s), hash))
goto release;
l3proto = __nf_ct_l3proto_find(nf_ct_l3num(ct));
@@ -220,13 +236,12 @@ struct ct_expect_iter_state {
static struct hlist_node *ct_expect_get_first(struct seq_file *seq)
{
- struct net *net = seq_file_net(seq);
struct ct_expect_iter_state *st = seq->private;
struct hlist_node *n;
for (st->bucket = 0; st->bucket < nf_ct_expect_hsize; st->bucket++) {
n = rcu_dereference(
- hlist_first_rcu(&net->ct.expect_hash[st->bucket]));
+ hlist_first_rcu(&nf_ct_expect_hash[st->bucket]));
if (n)
return n;
}
@@ -236,7 +251,6 @@ static struct hlist_node *ct_expect_get_first(struct seq_file *seq)
static struct hlist_node *ct_expect_get_next(struct seq_file *seq,
struct hlist_node *head)
{
- struct net *net = seq_file_net(seq);
struct ct_expect_iter_state *st = seq->private;
head = rcu_dereference(hlist_next_rcu(head));
@@ -244,7 +258,7 @@ static struct hlist_node *ct_expect_get_next(struct seq_file *seq,
if (++st->bucket >= nf_ct_expect_hsize)
return NULL;
head = rcu_dereference(
- hlist_first_rcu(&net->ct.expect_hash[st->bucket]));
+ hlist_first_rcu(&nf_ct_expect_hash[st->bucket]));
}
return head;
}
@@ -285,6 +299,9 @@ static int exp_seq_show(struct seq_file *s, void *v)
exp = hlist_entry(n, struct nf_conntrack_expect, hnode);
+ if (!net_eq(nf_ct_net(exp->master), seq_file_net(s)))
+ return 0;
+
if (exp->tuple.src.l3num != AF_INET)
return 0;
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index cf9700b1a..66ddcb605 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -737,6 +737,7 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
/* no remote port */
}
+ ipc.sockc.tsflags = sk->sk_tsflags;
ipc.addr = inet->inet_saddr;
ipc.opt = NULL;
ipc.oif = sk->sk_bound_dev_if;
@@ -744,10 +745,8 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
ipc.ttl = 0;
ipc.tos = -1;
- sock_tx_timestamp(sk, &ipc.tx_flags);
-
if (msg->msg_controllen) {
- err = ip_cmsg_send(sock_net(sk), msg, &ipc, false);
+ err = ip_cmsg_send(sk, msg, &ipc, false);
if (unlikely(err)) {
kfree(ipc.opt);
return err;
@@ -768,6 +767,8 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
rcu_read_unlock();
}
+ sock_tx_timestamp(sk, ipc.sockc.tsflags, &ipc.tx_flags);
+
saddr = ipc.addr;
ipc.addr = faddr = daddr;
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 8d22de740..438f50c1a 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -339,8 +339,8 @@ int raw_rcv(struct sock *sk, struct sk_buff *skb)
static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4,
struct msghdr *msg, size_t length,
- struct rtable **rtp,
- unsigned int flags)
+ struct rtable **rtp, unsigned int flags,
+ const struct sockcm_cookie *sockc)
{
struct inet_sock *inet = inet_sk(sk);
struct net *net = sock_net(sk);
@@ -379,7 +379,7 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4,
skb->ip_summed = CHECKSUM_NONE;
- sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
+ sock_tx_timestamp(sk, sockc->tsflags, &skb_shinfo(skb)->tx_flags);
skb->transport_header = skb->network_header;
err = -EFAULT;
@@ -540,6 +540,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
daddr = inet->inet_daddr;
}
+ ipc.sockc.tsflags = sk->sk_tsflags;
ipc.addr = inet->inet_saddr;
ipc.opt = NULL;
ipc.tx_flags = 0;
@@ -548,7 +549,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
ipc.oif = sk->sk_bound_dev_if;
if (msg->msg_controllen) {
- err = ip_cmsg_send(net, msg, &ipc, false);
+ err = ip_cmsg_send(sk, msg, &ipc, false);
if (unlikely(err)) {
kfree(ipc.opt);
goto out;
@@ -638,10 +639,10 @@ back_from_confirm:
if (inet->hdrincl)
err = raw_send_hdrinc(sk, &fl4, msg, len,
- &rt, msg->msg_flags);
+ &rt, msg->msg_flags, &ipc.sockc);
else {
- sock_tx_timestamp(sk, &ipc.tx_flags);
+ sock_tx_timestamp(sk, ipc.sockc.tsflags, &ipc.tx_flags);
if (!ipc.addr)
ipc.addr = fl4.daddr;
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 60398a937..a1f2830d8 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -915,11 +915,11 @@ static int ip_error(struct sk_buff *skb)
if (!IN_DEV_FORWARD(in_dev)) {
switch (rt->dst.error) {
case EHOSTUNREACH:
- IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);
+ __IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
break;
case ENETUNREACH:
- IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
+ __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
break;
}
goto out;
@@ -934,7 +934,7 @@ static int ip_error(struct sk_buff *skb)
break;
case ENETUNREACH:
code = ICMP_NET_UNREACH;
- IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
+ __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
break;
case EACCES:
code = ICMP_PKT_FILTERED;
@@ -2146,6 +2146,7 @@ struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
unsigned int flags = 0;
struct fib_result res;
struct rtable *rth;
+ int master_idx;
int orig_oif;
int err = -ENETUNREACH;
@@ -2155,6 +2156,9 @@ struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
orig_oif = fl4->flowi4_oif;
+ master_idx = l3mdev_master_ifindex_by_index(net, fl4->flowi4_oif);
+ if (master_idx)
+ fl4->flowi4_oif = master_idx;
fl4->flowi4_iif = LOOPBACK_IFINDEX;
fl4->flowi4_tos = tos & IPTOS_RT_MASK;
fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 4c04f0933..e3c4043c2 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -312,11 +312,11 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
mss = __cookie_v4_check(ip_hdr(skb), th, cookie);
if (mss == 0) {
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESFAILED);
+ __NET_INC_STATS(sock_net(sk), LINUX_MIB_SYNCOOKIESFAILED);
goto out;
}
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESRECV);
+ __NET_INC_STATS(sock_net(sk), LINUX_MIB_SYNCOOKIESRECV);
/* check for timestamp cookie support */
memset(&tcp_opt, 0, sizeof(tcp_opt));
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 03112a310..1cb67de10 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -960,6 +960,17 @@ static struct ctl_table ipv4_net_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+ {
+ .procname = "fib_multipath_use_neigh",
+ .data = &init_net.ipv4.sysctl_fib_multipath_use_neigh,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
+#endif
{ }
};
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 29bb8a5b9..99fbc1479 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -429,13 +429,16 @@ void tcp_init_sock(struct sock *sk)
}
EXPORT_SYMBOL(tcp_init_sock);
-static void tcp_tx_timestamp(struct sock *sk, struct sk_buff *skb)
+static void tcp_tx_timestamp(struct sock *sk, u16 tsflags, struct sk_buff *skb)
{
- if (sk->sk_tsflags) {
+ if (tsflags) {
struct skb_shared_info *shinfo = skb_shinfo(skb);
+ struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
- sock_tx_timestamp(sk, &shinfo->tx_flags);
- if (shinfo->tx_flags & SKBTX_ANY_TSTAMP)
+ sock_tx_timestamp(sk, tsflags, &shinfo->tx_flags);
+ if (tsflags & SOF_TIMESTAMPING_TX_ACK)
+ tcb->txstamp_ack = 1;
+ if (tsflags & SOF_TIMESTAMPING_TX_RECORD_MASK)
shinfo->tskey = TCP_SKB_CB(skb)->seq + skb->len - 1;
}
}
@@ -907,7 +910,8 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
int copy, i;
bool can_coalesce;
- if (!tcp_send_head(sk) || (copy = size_goal - skb->len) <= 0) {
+ if (!tcp_send_head(sk) || (copy = size_goal - skb->len) <= 0 ||
+ !tcp_skb_can_collapse_to(skb)) {
new_segment:
if (!sk_stream_memory_free(sk))
goto wait_for_sndbuf;
@@ -958,7 +962,7 @@ new_segment:
offset += copy;
size -= copy;
if (!size) {
- tcp_tx_timestamp(sk, skb);
+ tcp_tx_timestamp(sk, sk->sk_tsflags, skb);
goto out;
}
@@ -1078,8 +1082,10 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
+ struct sockcm_cookie sockc;
int flags, err, copied = 0;
int mss_now = 0, size_goal, copied_syn = 0;
+ bool process_backlog = false;
bool sg;
long timeo;
@@ -1120,14 +1126,24 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
/* 'common' sending to sendq */
}
+ sockc.tsflags = sk->sk_tsflags;
+ if (msg->msg_controllen) {
+ err = sock_cmsg_send(sk, msg, &sockc);
+ if (unlikely(err)) {
+ err = -EINVAL;
+ goto out_err;
+ }
+ }
+
/* This should be in poll */
sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
- mss_now = tcp_send_mss(sk, &size_goal, flags);
-
/* Ok commence sending. */
copied = 0;
+restart:
+ mss_now = tcp_send_mss(sk, &size_goal, flags);
+
err = -EPIPE;
if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
goto out_err;
@@ -1145,7 +1161,7 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
copy = max - skb->len;
}
- if (copy <= 0) {
+ if (copy <= 0 || !tcp_skb_can_collapse_to(skb)) {
new_segment:
/* Allocate new segment. If the interface is SG,
* allocate skb fitting to single page.
@@ -1153,6 +1169,10 @@ new_segment:
if (!sk_stream_memory_free(sk))
goto wait_for_sndbuf;
+ if (process_backlog && sk_flush_backlog(sk)) {
+ process_backlog = false;
+ goto restart;
+ }
skb = sk_stream_alloc_skb(sk,
select_size(sk, sg),
sk->sk_allocation,
@@ -1160,6 +1180,7 @@ new_segment:
if (!skb)
goto wait_for_memory;
+ process_backlog = true;
/*
* Check whether we can use HW checksum.
*/
@@ -1238,7 +1259,9 @@ new_segment:
copied += copy;
if (!msg_data_left(msg)) {
- tcp_tx_timestamp(sk, skb);
+ tcp_tx_timestamp(sk, sockc.tsflags, skb);
+ if (unlikely(flags & MSG_EOR))
+ TCP_SKB_CB(skb)->eor = 1;
goto out;
}
@@ -1432,14 +1455,10 @@ static void tcp_prequeue_process(struct sock *sk)
struct sk_buff *skb;
struct tcp_sock *tp = tcp_sk(sk);
- NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPPREQUEUED);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPREQUEUED);
- /* RX process wants to run with disabled BHs, though it is not
- * necessary */
- local_bh_disable();
while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
sk_backlog_rcv(sk, skb);
- local_bh_enable();
/* Clear memory counter. */
tp->ucopy.memory = 0;
@@ -1766,7 +1785,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
chunk = len - tp->ucopy.len;
if (chunk != 0) {
- NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk);
+ NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk);
len -= chunk;
copied += chunk;
}
@@ -1778,7 +1797,7 @@ do_prequeue:
chunk = len - tp->ucopy.len;
if (chunk != 0) {
- NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
+ NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
len -= chunk;
copied += chunk;
}
@@ -1864,7 +1883,7 @@ skip_copy:
tcp_prequeue_process(sk);
if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
- NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
+ NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
len -= chunk;
copied += chunk;
}
@@ -2054,13 +2073,13 @@ void tcp_close(struct sock *sk, long timeout)
sk->sk_prot->disconnect(sk, 0);
} else if (data_was_unread) {
/* Unread data was tossed, zap the connection. */
- NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE);
tcp_set_state(sk, TCP_CLOSE);
tcp_send_active_reset(sk, sk->sk_allocation);
} else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
/* Check zero linger _after_ checking for unread data. */
sk->sk_prot->disconnect(sk, 0);
- NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
} else if (tcp_close_state(sk)) {
/* We FIN if the application ate all the data before
* zapping the connection.
@@ -2137,7 +2156,7 @@ adjudge_to_death:
if (tp->linger2 < 0) {
tcp_set_state(sk, TCP_CLOSE);
tcp_send_active_reset(sk, GFP_ATOMIC);
- NET_INC_STATS_BH(sock_net(sk),
+ __NET_INC_STATS(sock_net(sk),
LINUX_MIB_TCPABORTONLINGER);
} else {
const int tmo = tcp_fin_time(sk);
@@ -2156,7 +2175,7 @@ adjudge_to_death:
if (tcp_check_oom(sk, 0)) {
tcp_set_state(sk, TCP_CLOSE);
tcp_send_active_reset(sk, GFP_ATOMIC);
- NET_INC_STATS_BH(sock_net(sk),
+ __NET_INC_STATS(sock_net(sk),
LINUX_MIB_TCPABORTONMEMORY);
}
}
@@ -3195,7 +3214,7 @@ void tcp_done(struct sock *sk)
struct request_sock *req = tcp_sk(sk)->fastopen_rsk;
if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV)
- TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
+ TCP_INC_STATS(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
tcp_set_state(sk, TCP_CLOSE);
tcp_clear_xmit_timers(sk);
diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c
index fd1405d37..36087bca9 100644
--- a/net/ipv4/tcp_bic.c
+++ b/net/ipv4/tcp_bic.c
@@ -197,15 +197,15 @@ static void bictcp_state(struct sock *sk, u8 new_state)
/* Track delayed acknowledgment ratio using sliding window
* ratio = (15*ratio + sample) / 16
*/
-static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt)
+static void bictcp_acked(struct sock *sk, const struct ack_sample *sample)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
if (icsk->icsk_ca_state == TCP_CA_Open) {
struct bictcp *ca = inet_csk_ca(sk);
- cnt -= ca->delayed_ack >> ACK_RATIO_SHIFT;
- ca->delayed_ack += cnt;
+ ca->delayed_ack += sample->pkts_acked -
+ (ca->delayed_ack >> ACK_RATIO_SHIFT);
}
}
diff --git a/net/ipv4/tcp_cdg.c b/net/ipv4/tcp_cdg.c
index 167b6a3e1..03725b294 100644
--- a/net/ipv4/tcp_cdg.c
+++ b/net/ipv4/tcp_cdg.c
@@ -155,11 +155,11 @@ static void tcp_cdg_hystart_update(struct sock *sk)
ca->last_ack = now_us;
if (after(now_us, ca->round_start + base_owd)) {
- NET_INC_STATS_BH(sock_net(sk),
- LINUX_MIB_TCPHYSTARTTRAINDETECT);
- NET_ADD_STATS_BH(sock_net(sk),
- LINUX_MIB_TCPHYSTARTTRAINCWND,
- tp->snd_cwnd);
+ NET_INC_STATS(sock_net(sk),
+ LINUX_MIB_TCPHYSTARTTRAINDETECT);
+ NET_ADD_STATS(sock_net(sk),
+ LINUX_MIB_TCPHYSTARTTRAINCWND,
+ tp->snd_cwnd);
tp->snd_ssthresh = tp->snd_cwnd;
return;
}
@@ -174,11 +174,11 @@ static void tcp_cdg_hystart_update(struct sock *sk)
125U);
if (ca->rtt.min > thresh) {
- NET_INC_STATS_BH(sock_net(sk),
- LINUX_MIB_TCPHYSTARTDELAYDETECT);
- NET_ADD_STATS_BH(sock_net(sk),
- LINUX_MIB_TCPHYSTARTDELAYCWND,
- tp->snd_cwnd);
+ NET_INC_STATS(sock_net(sk),
+ LINUX_MIB_TCPHYSTARTDELAYDETECT);
+ NET_ADD_STATS(sock_net(sk),
+ LINUX_MIB_TCPHYSTARTDELAYCWND,
+ tp->snd_cwnd);
tp->snd_ssthresh = tp->snd_cwnd;
}
}
@@ -294,12 +294,12 @@ static void tcp_cdg_cong_avoid(struct sock *sk, u32 ack, u32 acked)
ca->shadow_wnd = max(ca->shadow_wnd, ca->shadow_wnd + incr);
}
-static void tcp_cdg_acked(struct sock *sk, u32 num_acked, s32 rtt_us)
+static void tcp_cdg_acked(struct sock *sk, const struct ack_sample *sample)
{
struct cdg *ca = inet_csk_ca(sk);
struct tcp_sock *tp = tcp_sk(sk);
- if (rtt_us <= 0)
+ if (sample->rtt_us <= 0)
return;
/* A heuristic for filtering delayed ACKs, adapted from:
@@ -307,20 +307,20 @@ static void tcp_cdg_acked(struct sock *sk, u32 num_acked, s32 rtt_us)
* delay and rate based TCP mechanisms." TR 100219A. CAIA, 2010.
*/
if (tp->sacked_out == 0) {
- if (num_acked == 1 && ca->delack) {
+ if (sample->pkts_acked == 1 && ca->delack) {
/* A delayed ACK is only used for the minimum if it is
* provenly lower than an existing non-zero minimum.
*/
- ca->rtt.min = min(ca->rtt.min, rtt_us);
+ ca->rtt.min = min(ca->rtt.min, sample->rtt_us);
ca->delack--;
return;
- } else if (num_acked > 1 && ca->delack < 5) {
+ } else if (sample->pkts_acked > 1 && ca->delack < 5) {
ca->delack++;
}
}
- ca->rtt.min = min_not_zero(ca->rtt.min, rtt_us);
- ca->rtt.max = max(ca->rtt.max, rtt_us);
+ ca->rtt.min = min_not_zero(ca->rtt.min, sample->rtt_us);
+ ca->rtt.max = max(ca->rtt.max, sample->rtt_us);
}
static u32 tcp_cdg_ssthresh(struct sock *sk)
diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c
index 448c2615f..c99230efc 100644
--- a/net/ipv4/tcp_cubic.c
+++ b/net/ipv4/tcp_cubic.c
@@ -402,11 +402,11 @@ static void hystart_update(struct sock *sk, u32 delay)
ca->last_ack = now;
if ((s32)(now - ca->round_start) > ca->delay_min >> 4) {
ca->found |= HYSTART_ACK_TRAIN;
- NET_INC_STATS_BH(sock_net(sk),
- LINUX_MIB_TCPHYSTARTTRAINDETECT);
- NET_ADD_STATS_BH(sock_net(sk),
- LINUX_MIB_TCPHYSTARTTRAINCWND,
- tp->snd_cwnd);
+ NET_INC_STATS(sock_net(sk),
+ LINUX_MIB_TCPHYSTARTTRAINDETECT);
+ NET_ADD_STATS(sock_net(sk),
+ LINUX_MIB_TCPHYSTARTTRAINCWND,
+ tp->snd_cwnd);
tp->snd_ssthresh = tp->snd_cwnd;
}
}
@@ -423,11 +423,11 @@ static void hystart_update(struct sock *sk, u32 delay)
if (ca->curr_rtt > ca->delay_min +
HYSTART_DELAY_THRESH(ca->delay_min >> 3)) {
ca->found |= HYSTART_DELAY;
- NET_INC_STATS_BH(sock_net(sk),
- LINUX_MIB_TCPHYSTARTDELAYDETECT);
- NET_ADD_STATS_BH(sock_net(sk),
- LINUX_MIB_TCPHYSTARTDELAYCWND,
- tp->snd_cwnd);
+ NET_INC_STATS(sock_net(sk),
+ LINUX_MIB_TCPHYSTARTDELAYDETECT);
+ NET_ADD_STATS(sock_net(sk),
+ LINUX_MIB_TCPHYSTARTDELAYCWND,
+ tp->snd_cwnd);
tp->snd_ssthresh = tp->snd_cwnd;
}
}
@@ -437,21 +437,21 @@ static void hystart_update(struct sock *sk, u32 delay)
/* Track delayed acknowledgment ratio using sliding window
* ratio = (15*ratio + sample) / 16
*/
-static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt_us)
+static void bictcp_acked(struct sock *sk, const struct ack_sample *sample)
{
const struct tcp_sock *tp = tcp_sk(sk);
struct bictcp *ca = inet_csk_ca(sk);
u32 delay;
/* Some calls are for duplicates without timetamps */
- if (rtt_us < 0)
+ if (sample->rtt_us < 0)
return;
/* Discard delay samples right after fast recovery */
if (ca->epoch_start && (s32)(tcp_time_stamp - ca->epoch_start) < HZ)
return;
- delay = (rtt_us << 3) / USEC_PER_MSEC;
+ delay = (sample->rtt_us << 3) / USEC_PER_MSEC;
if (delay == 0)
delay = 1;
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index cffd8f9ed..54d9f9b01 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -255,9 +255,9 @@ static bool tcp_fastopen_queue_check(struct sock *sk)
spin_lock(&fastopenq->lock);
req1 = fastopenq->rskq_rst_head;
if (!req1 || time_after(req1->rsk_timer.expires, jiffies)) {
+ __NET_INC_STATS(sock_net(sk),
+ LINUX_MIB_TCPFASTOPENLISTENOVERFLOW);
spin_unlock(&fastopenq->lock);
- NET_INC_STATS_BH(sock_net(sk),
- LINUX_MIB_TCPFASTOPENLISTENOVERFLOW);
return false;
}
fastopenq->rskq_rst_head = req1->dl_next;
@@ -282,7 +282,7 @@ struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb,
struct sock *child;
if (foc->len == 0) /* Client requests a cookie */
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENCOOKIEREQD);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENCOOKIEREQD);
if (!((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) &&
(syn_data || foc->len >= 0) &&
@@ -311,13 +311,13 @@ fastopen:
child = tcp_fastopen_create_child(sk, skb, dst, req);
if (child) {
foc->len = -1;
- NET_INC_STATS_BH(sock_net(sk),
- LINUX_MIB_TCPFASTOPENPASSIVE);
+ NET_INC_STATS(sock_net(sk),
+ LINUX_MIB_TCPFASTOPENPASSIVE);
return child;
}
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
} else if (foc->len > 0) /* Client presents an invalid cookie */
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
valid_foc.exp = foc->exp;
*foc = valid_foc;
diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c
index 82f0d9ed6..4a4d8e767 100644
--- a/net/ipv4/tcp_htcp.c
+++ b/net/ipv4/tcp_htcp.c
@@ -99,7 +99,7 @@ static inline void measure_rtt(struct sock *sk, u32 srtt)
}
static void measure_achieved_throughput(struct sock *sk,
- u32 pkts_acked, s32 rtt)
+ const struct ack_sample *sample)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
const struct tcp_sock *tp = tcp_sk(sk);
@@ -107,10 +107,10 @@ static void measure_achieved_throughput(struct sock *sk,
u32 now = tcp_time_stamp;
if (icsk->icsk_ca_state == TCP_CA_Open)
- ca->pkts_acked = pkts_acked;
+ ca->pkts_acked = sample->pkts_acked;
- if (rtt > 0)
- measure_rtt(sk, usecs_to_jiffies(rtt));
+ if (sample->rtt_us > 0)
+ measure_rtt(sk, usecs_to_jiffies(sample->rtt_us));
if (!use_bandwidth_switch)
return;
@@ -122,7 +122,7 @@ static void measure_achieved_throughput(struct sock *sk,
return;
}
- ca->packetcount += pkts_acked;
+ ca->packetcount += sample->pkts_acked;
if (ca->packetcount >= tp->snd_cwnd - (ca->alpha >> 7 ? : 1) &&
now - ca->lasttime >= ca->minRTT &&
diff --git a/net/ipv4/tcp_illinois.c b/net/ipv4/tcp_illinois.c
index 2ab9bbb6f..c8e6d86be 100644
--- a/net/ipv4/tcp_illinois.c
+++ b/net/ipv4/tcp_illinois.c
@@ -82,30 +82,31 @@ static void tcp_illinois_init(struct sock *sk)
}
/* Measure RTT for each ack. */
-static void tcp_illinois_acked(struct sock *sk, u32 pkts_acked, s32 rtt)
+static void tcp_illinois_acked(struct sock *sk, const struct ack_sample *sample)
{
struct illinois *ca = inet_csk_ca(sk);
+ s32 rtt_us = sample->rtt_us;
- ca->acked = pkts_acked;
+ ca->acked = sample->pkts_acked;
/* dup ack, no rtt sample */
- if (rtt < 0)
+ if (rtt_us < 0)
return;
/* ignore bogus values, this prevents wraparound in alpha math */
- if (rtt > RTT_MAX)
- rtt = RTT_MAX;
+ if (rtt_us > RTT_MAX)
+ rtt_us = RTT_MAX;
/* keep track of minimum RTT seen so far */
- if (ca->base_rtt > rtt)
- ca->base_rtt = rtt;
+ if (ca->base_rtt > rtt_us)
+ ca->base_rtt = rtt_us;
/* and max */
- if (ca->max_rtt < rtt)
- ca->max_rtt = rtt;
+ if (ca->max_rtt < rtt_us)
+ ca->max_rtt = rtt_us;
++ca->cnt_rtt;
- ca->sum_rtt += rtt;
+ ca->sum_rtt += rtt_us;
}
/* Maximum queuing delay */
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 131005f82..cd3c4cbbb 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -90,7 +90,7 @@ int sysctl_tcp_adv_win_scale __read_mostly = 1;
EXPORT_SYMBOL(sysctl_tcp_adv_win_scale);
/* rfc5961 challenge ack rate limiting */
-int sysctl_tcp_challenge_ack_limit = 100;
+int sysctl_tcp_challenge_ack_limit = 1000;
int sysctl_tcp_stdurg __read_mostly;
int sysctl_tcp_rfc1337 __read_mostly;
@@ -872,7 +872,7 @@ static void tcp_update_reordering(struct sock *sk, const int metric,
else
mib_idx = LINUX_MIB_TCPSACKREORDER;
- NET_INC_STATS_BH(sock_net(sk), mib_idx);
+ NET_INC_STATS(sock_net(sk), mib_idx);
#if FASTRETRANS_DEBUG > 1
pr_debug("Disorder%d %d %u f%u s%u rr%d\n",
tp->rx_opt.sack_ok, inet_csk(sk)->icsk_ca_state,
@@ -1065,7 +1065,7 @@ static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb,
if (before(start_seq_0, TCP_SKB_CB(ack_skb)->ack_seq)) {
dup_sack = true;
tcp_dsack_seen(tp);
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDSACKRECV);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKRECV);
} else if (num_sacks > 1) {
u32 end_seq_1 = get_unaligned_be32(&sp[1].end_seq);
u32 start_seq_1 = get_unaligned_be32(&sp[1].start_seq);
@@ -1074,7 +1074,7 @@ static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb,
!before(start_seq_0, start_seq_1)) {
dup_sack = true;
tcp_dsack_seen(tp);
- NET_INC_STATS_BH(sock_net(sk),
+ NET_INC_STATS(sock_net(sk),
LINUX_MIB_TCPDSACKOFORECV);
}
}
@@ -1292,7 +1292,7 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
if (skb->len > 0) {
BUG_ON(!tcp_skb_pcount(skb));
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SACKSHIFTED);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_SACKSHIFTED);
return false;
}
@@ -1306,6 +1306,7 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
}
TCP_SKB_CB(prev)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
+ TCP_SKB_CB(prev)->eor = TCP_SKB_CB(skb)->eor;
if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
TCP_SKB_CB(prev)->end_seq++;
@@ -1316,7 +1317,7 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
tcp_unlink_write_queue(skb, sk);
sk_wmem_free_skb(sk, skb);
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SACKMERGED);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_SACKMERGED);
return true;
}
@@ -1371,6 +1372,9 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,
if ((TCP_SKB_CB(prev)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED)
goto fallback;
+ if (!tcp_skb_can_collapse_to(prev))
+ goto fallback;
+
in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) &&
!before(end_seq, TCP_SKB_CB(skb)->end_seq);
@@ -1472,7 +1476,7 @@ noop:
return skb;
fallback:
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SACKSHIFTFALLBACK);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_SACKSHIFTFALLBACK);
return NULL;
}
@@ -1660,7 +1664,7 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
mib_idx = LINUX_MIB_TCPSACKDISCARD;
}
- NET_INC_STATS_BH(sock_net(sk), mib_idx);
+ NET_INC_STATS(sock_net(sk), mib_idx);
if (i == 0)
first_sack_index = -1;
continue;
@@ -1912,7 +1916,7 @@ void tcp_enter_loss(struct sock *sk)
skb = tcp_write_queue_head(sk);
is_reneg = skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED);
if (is_reneg) {
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSACKRENEGING);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSACKRENEGING);
tp->sacked_out = 0;
tp->fackets_out = 0;
}
@@ -2256,16 +2260,6 @@ static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit)
}
}
-/* CWND moderation, preventing bursts due to too big ACKs
- * in dubious situations.
- */
-static inline void tcp_moderate_cwnd(struct tcp_sock *tp)
-{
- tp->snd_cwnd = min(tp->snd_cwnd,
- tcp_packets_in_flight(tp) + tcp_max_burst(tp));
- tp->snd_cwnd_stamp = tcp_time_stamp;
-}
-
static bool tcp_tsopt_ecr_before(const struct tcp_sock *tp, u32 when)
{
return tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
@@ -2408,13 +2402,12 @@ static bool tcp_try_undo_recovery(struct sock *sk)
else
mib_idx = LINUX_MIB_TCPFULLUNDO;
- NET_INC_STATS_BH(sock_net(sk), mib_idx);
+ NET_INC_STATS(sock_net(sk), mib_idx);
}
if (tp->snd_una == tp->high_seq && tcp_is_reno(tp)) {
/* Hold old state until something *above* high_seq
* is ACKed. For Reno it is MUST to prevent false
* fast retransmits (RFC2582). SACK TCP is safe. */
- tcp_moderate_cwnd(tp);
if (!tcp_any_retrans_done(sk))
tp->retrans_stamp = 0;
return true;
@@ -2431,7 +2424,7 @@ static bool tcp_try_undo_dsack(struct sock *sk)
if (tp->undo_marker && !tp->undo_retrans) {
DBGUNDO(sk, "D-SACK");
tcp_undo_cwnd_reduction(sk, false);
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDSACKUNDO);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKUNDO);
return true;
}
return false;
@@ -2446,10 +2439,10 @@ static bool tcp_try_undo_loss(struct sock *sk, bool frto_undo)
tcp_undo_cwnd_reduction(sk, true);
DBGUNDO(sk, "partial loss");
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSSUNDO);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPLOSSUNDO);
if (frto_undo)
- NET_INC_STATS_BH(sock_net(sk),
- LINUX_MIB_TCPSPURIOUSRTOS);
+ NET_INC_STATS(sock_net(sk),
+ LINUX_MIB_TCPSPURIOUSRTOS);
inet_csk(sk)->icsk_retransmits = 0;
if (frto_undo || tcp_is_sack(tp))
tcp_set_ca_state(sk, TCP_CA_Open);
@@ -2573,7 +2566,7 @@ static void tcp_mtup_probe_failed(struct sock *sk)
icsk->icsk_mtup.search_high = icsk->icsk_mtup.probe_size - 1;
icsk->icsk_mtup.probe_size = 0;
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMTUPFAIL);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMTUPFAIL);
}
static void tcp_mtup_probe_success(struct sock *sk)
@@ -2593,7 +2586,7 @@ static void tcp_mtup_probe_success(struct sock *sk)
icsk->icsk_mtup.search_low = icsk->icsk_mtup.probe_size;
icsk->icsk_mtup.probe_size = 0;
tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMTUPSUCCESS);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMTUPSUCCESS);
}
/* Do a simple retransmit without using the backoff mechanisms in
@@ -2657,7 +2650,7 @@ static void tcp_enter_recovery(struct sock *sk, bool ece_ack)
else
mib_idx = LINUX_MIB_TCPSACKRECOVERY;
- NET_INC_STATS_BH(sock_net(sk), mib_idx);
+ NET_INC_STATS(sock_net(sk), mib_idx);
tp->prior_ssthresh = 0;
tcp_init_undo(tp);
@@ -2750,7 +2743,7 @@ static bool tcp_try_undo_partial(struct sock *sk, const int acked)
DBGUNDO(sk, "partial recovery");
tcp_undo_cwnd_reduction(sk, true);
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPPARTIALUNDO);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPARTIALUNDO);
tcp_try_keep_open(sk);
return true;
}
@@ -3097,12 +3090,11 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb,
const struct skb_shared_info *shinfo;
/* Avoid cache line misses to get skb_shinfo() and shinfo->tx_flags */
- if (likely(!(sk->sk_tsflags & SOF_TIMESTAMPING_TX_ACK)))
+ if (likely(!TCP_SKB_CB(skb)->txstamp_ack))
return;
shinfo = skb_shinfo(skb);
- if ((shinfo->tx_flags & SKBTX_ACK_TSTAMP) &&
- !before(shinfo->tskey, prior_snd_una) &&
+ if (!before(shinfo->tskey, prior_snd_una) &&
before(shinfo->tskey, tcp_sk(sk)->snd_una))
__skb_tstamp_tx(skb, NULL, sk, SCM_TSTAMP_ACK);
}
@@ -3259,8 +3251,12 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
tcp_rearm_rto(sk);
}
- if (icsk->icsk_ca_ops->pkts_acked)
- icsk->icsk_ca_ops->pkts_acked(sk, pkts_acked, ca_rtt_us);
+ if (icsk->icsk_ca_ops->pkts_acked) {
+ struct ack_sample sample = { .pkts_acked = pkts_acked,
+ .rtt_us = ca_rtt_us };
+
+ icsk->icsk_ca_ops->pkts_acked(sk, &sample);
+ }
#if FASTRETRANS_DEBUG > 0
WARN_ON((int)tp->sacked_out < 0);
@@ -3366,9 +3362,10 @@ static void tcp_snd_una_update(struct tcp_sock *tp, u32 ack)
{
u32 delta = ack - tp->snd_una;
- u64_stats_update_begin(&tp->syncp);
+ sock_owned_by_me((struct sock *)tp);
+ u64_stats_update_begin_raw(&tp->syncp);
tp->bytes_acked += delta;
- u64_stats_update_end(&tp->syncp);
+ u64_stats_update_end_raw(&tp->syncp);
tp->snd_una = ack;
}
@@ -3377,9 +3374,10 @@ static void tcp_rcv_nxt_update(struct tcp_sock *tp, u32 seq)
{
u32 delta = seq - tp->rcv_nxt;
- u64_stats_update_begin(&tp->syncp);
+ sock_owned_by_me((struct sock *)tp);
+ u64_stats_update_begin_raw(&tp->syncp);
tp->bytes_received += delta;
- u64_stats_update_end(&tp->syncp);
+ u64_stats_update_end_raw(&tp->syncp);
tp->rcv_nxt = seq;
}
@@ -3426,6 +3424,23 @@ static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32
return flag;
}
+static bool __tcp_oow_rate_limited(struct net *net, int mib_idx,
+ u32 *last_oow_ack_time)
+{
+ if (*last_oow_ack_time) {
+ s32 elapsed = (s32)(tcp_time_stamp - *last_oow_ack_time);
+
+ if (0 <= elapsed && elapsed < sysctl_tcp_invalid_ratelimit) {
+ NET_INC_STATS(net, mib_idx);
+ return true; /* rate-limited: don't send yet! */
+ }
+ }
+
+ *last_oow_ack_time = tcp_time_stamp;
+
+ return false; /* not rate-limited: go ahead, send dupack now! */
+}
+
/* Return true if we're currently rate-limiting out-of-window ACKs and
* thus shouldn't send a dupack right now. We rate-limit dupacks in
* response to out-of-window SYNs or ACKs to mitigate ACK loops or DoS
@@ -3439,21 +3454,9 @@ bool tcp_oow_rate_limited(struct net *net, const struct sk_buff *skb,
/* Data packets without SYNs are not likely part of an ACK loop. */
if ((TCP_SKB_CB(skb)->seq != TCP_SKB_CB(skb)->end_seq) &&
!tcp_hdr(skb)->syn)
- goto not_rate_limited;
-
- if (*last_oow_ack_time) {
- s32 elapsed = (s32)(tcp_time_stamp - *last_oow_ack_time);
-
- if (0 <= elapsed && elapsed < sysctl_tcp_invalid_ratelimit) {
- NET_INC_STATS_BH(net, mib_idx);
- return true; /* rate-limited: don't send yet! */
- }
- }
-
- *last_oow_ack_time = tcp_time_stamp;
+ return false;
-not_rate_limited:
- return false; /* not rate-limited: go ahead, send dupack now! */
+ return __tcp_oow_rate_limited(net, mib_idx, last_oow_ack_time);
}
/* RFC 5961 7 [ACK Throttling] */
@@ -3463,22 +3466,27 @@ static void tcp_send_challenge_ack(struct sock *sk, const struct sk_buff *skb)
static u32 challenge_timestamp;
static unsigned int challenge_count;
struct tcp_sock *tp = tcp_sk(sk);
- u32 now;
+ u32 count, now;
/* First check our per-socket dupack rate limit. */
- if (tcp_oow_rate_limited(sock_net(sk), skb,
- LINUX_MIB_TCPACKSKIPPEDCHALLENGE,
- &tp->last_oow_ack_time))
+ if (__tcp_oow_rate_limited(sock_net(sk),
+ LINUX_MIB_TCPACKSKIPPEDCHALLENGE,
+ &tp->last_oow_ack_time))
return;
- /* Then check the check host-wide RFC 5961 rate limit. */
+ /* Then check host-wide RFC 5961 rate limit. */
now = jiffies / HZ;
if (now != challenge_timestamp) {
+ u32 half = (sysctl_tcp_challenge_ack_limit + 1) >> 1;
+
challenge_timestamp = now;
- challenge_count = 0;
+ WRITE_ONCE(challenge_count, half +
+ prandom_u32_max(sysctl_tcp_challenge_ack_limit));
}
- if (++challenge_count <= sysctl_tcp_challenge_ack_limit) {
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPCHALLENGEACK);
+ count = READ_ONCE(challenge_count);
+ if (count > 0) {
+ WRITE_ONCE(challenge_count, count - 1);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPCHALLENGEACK);
tcp_send_ack(sk);
}
}
@@ -3527,8 +3535,8 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag)
tcp_set_ca_state(sk, TCP_CA_CWR);
tcp_end_cwnd_reduction(sk);
tcp_try_keep_open(sk);
- NET_INC_STATS_BH(sock_net(sk),
- LINUX_MIB_TCPLOSSPROBERECOVERY);
+ NET_INC_STATS(sock_net(sk),
+ LINUX_MIB_TCPLOSSPROBERECOVERY);
} else if (!(flag & (FLAG_SND_UNA_ADVANCED |
FLAG_NOT_DUP | FLAG_DATA_SACKED))) {
/* Pure dupack: original and TLP probe arrived; no loss */
@@ -3632,14 +3640,14 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
tcp_in_ack_event(sk, CA_ACK_WIN_UPDATE);
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPACKS);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPACKS);
} else {
u32 ack_ev_flags = CA_ACK_SLOWPATH;
if (ack_seq != TCP_SKB_CB(skb)->end_seq)
flag |= FLAG_DATA;
else
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPPUREACKS);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPUREACKS);
flag |= tcp_ack_update_window(sk, skb, ack, ack_seq);
@@ -4183,7 +4191,7 @@ static void tcp_dsack_set(struct sock *sk, u32 seq, u32 end_seq)
else
mib_idx = LINUX_MIB_TCPDSACKOFOSENT;
- NET_INC_STATS_BH(sock_net(sk), mib_idx);
+ NET_INC_STATS(sock_net(sk), mib_idx);
tp->rx_opt.dsack = 1;
tp->duplicate_sack[0].start_seq = seq;
@@ -4207,7 +4215,7 @@ static void tcp_send_dupack(struct sock *sk, const struct sk_buff *skb)
if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
tcp_enter_quickack_mode(sk);
if (tcp_is_sack(tp) && sysctl_tcp_dsack) {
@@ -4357,13 +4365,19 @@ static bool tcp_try_coalesce(struct sock *sk,
atomic_add(delta, &sk->sk_rmem_alloc);
sk_mem_charge(sk, delta);
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOALESCE);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVCOALESCE);
TCP_SKB_CB(to)->end_seq = TCP_SKB_CB(from)->end_seq;
TCP_SKB_CB(to)->ack_seq = TCP_SKB_CB(from)->ack_seq;
TCP_SKB_CB(to)->tcp_flags |= TCP_SKB_CB(from)->tcp_flags;
return true;
}
+static void tcp_drop(struct sock *sk, struct sk_buff *skb)
+{
+ sk_drops_add(sk, skb);
+ __kfree_skb(skb);
+}
+
/* This one checks to see if we can put data from the
* out_of_order queue into the receive_queue.
*/
@@ -4388,7 +4402,7 @@ static void tcp_ofo_queue(struct sock *sk)
__skb_unlink(skb, &tp->out_of_order_queue);
if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
SOCK_DEBUG(sk, "ofo packet was already received\n");
- __kfree_skb(skb);
+ tcp_drop(sk, skb);
continue;
}
SOCK_DEBUG(sk, "ofo requeuing : rcv_next %X seq %X - %X\n",
@@ -4439,8 +4453,8 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
tcp_ecn_check_ce(tp, skb);
if (unlikely(tcp_try_rmem_schedule(sk, skb, skb->truesize))) {
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFODROP);
- __kfree_skb(skb);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFODROP);
+ tcp_drop(sk, skb);
return;
}
@@ -4448,7 +4462,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
tp->pred_flags = 0;
inet_csk_schedule_ack(sk);
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOQUEUE);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOQUEUE);
SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n",
tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
@@ -4503,8 +4517,8 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
if (skb1 && before(seq, TCP_SKB_CB(skb1)->end_seq)) {
if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
/* All the bits are present. Drop. */
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOMERGE);
- __kfree_skb(skb);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOMERGE);
+ tcp_drop(sk, skb);
skb = NULL;
tcp_dsack_set(sk, seq, end_seq);
goto add_sack;
@@ -4542,8 +4556,8 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
__skb_unlink(skb1, &tp->out_of_order_queue);
tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
TCP_SKB_CB(skb1)->end_seq);
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOMERGE);
- __kfree_skb(skb1);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOMERGE);
+ tcp_drop(sk, skb1);
}
add_sack:
@@ -4651,11 +4665,13 @@ static int __tcp_stealth_integrity_check(struct sock *sk, struct sk_buff *skb)
static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
- int eaten = -1;
bool fragstolen = false;
+ int eaten = -1;
- if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq)
- goto drop;
+ if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) {
+ __kfree_skb(skb);
+ return;
+ }
#ifdef CONFIG_TCP_STEALTH
if (unlikely(tp->stealth.mode & TCP_STEALTH_MODE_INTEGRITY_LEN) &&
@@ -4689,14 +4705,12 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
__set_current_state(TASK_RUNNING);
- local_bh_enable();
if (!skb_copy_datagram_msg(skb, 0, tp->ucopy.msg, chunk)) {
tp->ucopy.len -= chunk;
tp->copied_seq += chunk;
eaten = (chunk == skb->len);
tcp_rcv_space_adjust(sk);
}
- local_bh_disable();
}
if (eaten <= 0) {
@@ -4739,14 +4753,14 @@ queue_and_out:
if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
/* A retransmit, 2nd most common case. Force an immediate ack. */
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
out_of_window:
tcp_enter_quickack_mode(sk);
inet_csk_schedule_ack(sk);
drop:
- __kfree_skb(skb);
+ tcp_drop(sk, skb);
return;
}
@@ -4785,7 +4799,7 @@ static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb,
__skb_unlink(skb, list);
__kfree_skb(skb);
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOLLAPSED);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVCOLLAPSED);
return next;
}
@@ -4944,7 +4958,7 @@ static bool tcp_prune_ofo_queue(struct sock *sk)
bool res = false;
if (!skb_queue_empty(&tp->out_of_order_queue)) {
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_OFOPRUNED);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_OFOPRUNED);
__skb_queue_purge(&tp->out_of_order_queue);
/* Reset SACK state. A conforming SACK implementation will
@@ -4973,7 +4987,7 @@ static int tcp_prune_queue(struct sock *sk)
SOCK_DEBUG(sk, "prune_queue: c=%x\n", tp->copied_seq);
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PRUNECALLED);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_PRUNECALLED);
if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
tcp_clamp_window(sk);
@@ -5003,7 +5017,7 @@ static int tcp_prune_queue(struct sock *sk)
* drop receive data on the floor. It will get retransmitted
* and hopefully then we'll have sufficient space.
*/
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_RCVPRUNED);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_RCVPRUNED);
/* Massive buffer overcommit. */
tp->pred_flags = 0;
@@ -5212,7 +5226,6 @@ static int tcp_copy_to_iovec(struct sock *sk, struct sk_buff *skb, int hlen)
int chunk = skb->len - hlen;
int err;
- local_bh_enable();
if (skb_csum_unnecessary(skb))
err = skb_copy_datagram_msg(skb, hlen, tp->ucopy.msg, chunk);
else
@@ -5224,32 +5237,9 @@ static int tcp_copy_to_iovec(struct sock *sk, struct sk_buff *skb, int hlen)
tcp_rcv_space_adjust(sk);
}
- local_bh_disable();
return err;
}
-static __sum16 __tcp_checksum_complete_user(struct sock *sk,
- struct sk_buff *skb)
-{
- __sum16 result;
-
- if (sock_owned_by_user(sk)) {
- local_bh_enable();
- result = __tcp_checksum_complete(skb);
- local_bh_disable();
- } else {
- result = __tcp_checksum_complete(skb);
- }
- return result;
-}
-
-static inline bool tcp_checksum_complete_user(struct sock *sk,
- struct sk_buff *skb)
-{
- return !skb_csum_unnecessary(skb) &&
- __tcp_checksum_complete_user(sk, skb);
-}
-
/* Does PAWS and seqno based validation of an incoming segment, flags will
* play significant role here.
*/
@@ -5262,7 +5252,7 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp &&
tcp_paws_discard(sk, skb)) {
if (!th->rst) {
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
if (!tcp_oow_rate_limited(sock_net(sk), skb,
LINUX_MIB_TCPACKSKIPPEDPAWS,
&tp->last_oow_ack_time))
@@ -5314,8 +5304,8 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
if (th->syn) {
syn_challenge:
if (syn_inerr)
- TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNCHALLENGE);
+ TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNCHALLENGE);
tcp_send_challenge_ack(sk, skb);
goto discard;
}
@@ -5323,7 +5313,7 @@ syn_challenge:
return true;
discard:
- __kfree_skb(skb);
+ tcp_drop(sk, skb);
return false;
}
@@ -5430,7 +5420,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
tcp_data_snd_check(sk);
return;
} else { /* Header too small */
- TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
+ TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
goto discard;
}
} else {
@@ -5467,12 +5457,13 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
__skb_pull(skb, tcp_header_len);
tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq);
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITSTOUSER);
+ NET_INC_STATS(sock_net(sk),
+ LINUX_MIB_TCPHPHITSTOUSER);
eaten = 1;
}
}
if (!eaten) {
- if (tcp_checksum_complete_user(sk, skb))
+ if (tcp_checksum_complete(skb))
goto csum_error;
if ((int)skb->truesize > sk->sk_forward_alloc)
@@ -5489,7 +5480,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
tcp_rcv_rtt_measure_ts(sk, skb);
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITS);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPHITS);
/* Bulk data transfer: receiver */
eaten = tcp_queue_rcv(sk, skb, tcp_header_len,
@@ -5516,7 +5507,7 @@ no_ack:
}
slow_path:
- if (len < (th->doff << 2) || tcp_checksum_complete_user(sk, skb))
+ if (len < (th->doff << 2) || tcp_checksum_complete(skb))
goto csum_error;
if (!th->ack && !th->rst && !th->syn)
@@ -5546,11 +5537,11 @@ step5:
return;
csum_error:
- TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS);
- TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
+ TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
+ TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
discard:
- __kfree_skb(skb);
+ tcp_drop(sk, skb);
}
EXPORT_SYMBOL(tcp_rcv_established);
@@ -5635,16 +5626,18 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
if (data) { /* Retransmit unacked data in SYN */
tcp_for_write_queue_from(data, sk) {
if (data == tcp_send_head(sk) ||
- __tcp_retransmit_skb(sk, data))
+ __tcp_retransmit_skb(sk, data, 1))
break;
}
tcp_rearm_rto(sk);
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVEFAIL);
+ NET_INC_STATS(sock_net(sk),
+ LINUX_MIB_TCPFASTOPENACTIVEFAIL);
return true;
}
tp->syn_data_acked = tp->syn_data;
if (tp->syn_data_acked)
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVE);
+ NET_INC_STATS(sock_net(sk),
+ LINUX_MIB_TCPFASTOPENACTIVE);
tcp_fastopen_add_skb(sk, synack);
@@ -5679,7 +5672,8 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
!between(tp->rx_opt.rcv_tsecr, tp->retrans_stamp,
tcp_time_stamp)) {
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSACTIVEREJECTED);
+ NET_INC_STATS(sock_net(sk),
+ LINUX_MIB_PAWSACTIVEREJECTED);
goto reset_and_undo;
}
@@ -5781,7 +5775,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
TCP_DELACK_MAX, TCP_RTO_MAX);
discard:
- __kfree_skb(skb);
+ tcp_drop(sk, skb);
return 0;
} else {
tcp_send_ack(sk);
@@ -5888,8 +5882,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
int queued = 0;
bool acceptable;
- tp->rx_opt.saw_tstamp = 0;
-
switch (sk->sk_state) {
case TCP_CLOSE:
goto discard;
@@ -5907,29 +5899,13 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
if (icsk->icsk_af_ops->conn_request(sk, skb) < 0)
return 1;
- /* Now we have several options: In theory there is
- * nothing else in the frame. KA9Q has an option to
- * send data with the syn, BSD accepts data with the
- * syn up to the [to be] advertised window and
- * Solaris 2.1 gives you a protocol error. For now
- * we just ignore it, that fits the spec precisely
- * and avoids incompatibilities. It would be nice in
- * future to drop through and process the data.
- *
- * Now that TTCP is starting to be used we ought to
- * queue this data.
- * But, this leaves one open to an easy denial of
- * service attack, and SYN cookies can't defend
- * against this problem. So, we drop the data
- * in the interest of security over speed unless
- * it's still in use.
- */
- kfree_skb(skb);
+ consume_skb(skb);
return 0;
}
goto discard;
case TCP_SYN_SENT:
+ tp->rx_opt.saw_tstamp = 0;
queued = tcp_rcv_synsent_state_process(sk, skb, th);
if (queued >= 0)
return queued;
@@ -5941,6 +5917,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
return 0;
}
+ tp->rx_opt.saw_tstamp = 0;
req = tp->fastopen_rsk;
if (req) {
WARN_ON_ONCE(sk->sk_state != TCP_SYN_RECV &&
@@ -6065,7 +6042,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
(TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt))) {
tcp_done(sk);
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
return 1;
}
@@ -6122,7 +6099,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
if (sk->sk_shutdown & RCV_SHUTDOWN) {
if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) {
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
tcp_reset(sk);
return 1;
}
@@ -6142,7 +6119,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
if (!queued) {
discard:
- __kfree_skb(skb);
+ tcp_drop(sk, skb);
}
return 0;
}
@@ -6260,10 +6237,10 @@ static bool tcp_syn_flood_action(const struct sock *sk,
if (net->ipv4.sysctl_tcp_syncookies) {
msg = "Sending cookies";
want_cookie = true;
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES);
+ __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES);
} else
#endif
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);
+ __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);
if (!queue->synflood_warned &&
net->ipv4.sysctl_tcp_syncookies != 2 &&
@@ -6324,7 +6301,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
* timeout.
*/
if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) {
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
goto drop;
}
@@ -6371,7 +6348,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
if (dst && strict &&
!tcp_peer_is_proven(req, dst, true,
tmp_opt.saw_tstamp)) {
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
goto drop_and_release;
}
}
@@ -6419,7 +6396,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
}
if (fastopen_sk) {
af_ops->send_synack(fastopen_sk, dst, &fl, req,
- &foc, false);
+ &foc, TCP_SYNACK_FASTOPEN);
/* Add the child socket directly into the accept queue */
inet_csk_reqsk_queue_add(sk, req, fastopen_sk);
sk->sk_data_ready(sk);
@@ -6429,10 +6406,13 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
tcp_rsk(req)->tfo_listener = false;
if (!want_cookie)
inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
- af_ops->send_synack(sk, dst, &fl, req,
- &foc, !want_cookie);
- if (want_cookie)
- goto drop_and_free;
+ af_ops->send_synack(sk, dst, &fl, req, &foc,
+ !want_cookie ? TCP_SYNACK_NORMAL :
+ TCP_SYNACK_COOKIE);
+ if (want_cookie) {
+ reqsk_free(req);
+ return 0;
+ }
}
reqsk_put(req);
return 0;
@@ -6442,7 +6422,7 @@ drop_and_release:
drop_and_free:
reqsk_free(req);
drop:
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
+ tcp_listendrop(sk);
return 0;
}
EXPORT_SYMBOL(tcp_conn_request);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index dfd153fbd..469942798 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -158,7 +158,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
nexthop = daddr = usin->sin_addr.s_addr;
inet_opt = rcu_dereference_protected(inet->inet_opt,
- sock_owned_by_user(sk));
+ lockdep_sock_is_held(sk));
if (inet_opt && inet_opt->opt.srr) {
if (!daddr)
return -EINVAL;
@@ -336,7 +336,7 @@ void tcp_req_err(struct sock *sk, u32 seq, bool abort)
* an established socket here.
*/
if (seq != tcp_rsk(req)->snt_isn) {
- NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
+ __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
} else if (abort) {
/*
* Still in SYN_RECV, just remove it silently.
@@ -345,7 +345,7 @@ void tcp_req_err(struct sock *sk, u32 seq, bool abort)
* errors returned from accept().
*/
inet_csk_reqsk_queue_drop(req->rsk_listener, req);
- NET_INC_STATS_BH(net, LINUX_MIB_LISTENDROPS);
+ tcp_listendrop(req->rsk_listener);
}
reqsk_put(req);
}
@@ -388,7 +388,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
th->dest, iph->saddr, ntohs(th->source),
inet_iif(icmp_skb));
if (!sk) {
- ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
+ __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
return;
}
if (sk->sk_state == TCP_TIME_WAIT) {
@@ -412,13 +412,13 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
*/
if (sock_owned_by_user(sk)) {
if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
- NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
+ __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
}
if (sk->sk_state == TCP_CLOSE)
goto out;
if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
- NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
+ __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
goto out;
}
@@ -429,7 +429,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
if (sk->sk_state != TCP_LISTEN &&
!between(seq, snd_una, tp->snd_nxt)) {
- NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
+ __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
goto out;
}
@@ -644,6 +644,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
#ifdef CONFIG_TCP_MD5SIG
+ rcu_read_lock();
hash_location = tcp_parse_md5sig_option(th);
if (sk && sk_fullsock(sk)) {
key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
@@ -662,16 +663,18 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
ntohs(th->source), inet_iif(skb));
/* don't send rst if it can't find key */
if (!sk1)
- return;
- rcu_read_lock();
+ goto out;
+
key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
&ip_hdr(skb)->saddr, AF_INET);
if (!key)
- goto release_sk1;
+ goto out;
+
genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
if (genhash || memcmp(hash_location, newhash, 16) != 0)
- goto release_sk1;
+ goto out;
+
}
if (key) {
@@ -705,20 +708,19 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
offsetof(struct inet_timewait_sock, tw_bound_dev_if));
arg.tos = ip_hdr(skb)->tos;
+ local_bh_disable();
ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
skb, &TCP_SKB_CB(skb)->header.h4.opt,
ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
&arg, arg.iov[0].iov_len);
- TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
- TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
+ __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
+ __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
+ local_bh_enable();
#ifdef CONFIG_TCP_MD5SIG
-release_sk1:
- if (sk1) {
- rcu_read_unlock();
- sock_put(sk1);
- }
+out:
+ rcu_read_unlock();
#endif
}
@@ -790,12 +792,14 @@ static void tcp_v4_send_ack(struct net *net,
if (oif)
arg.bound_dev_if = oif;
arg.tos = tos;
+ local_bh_disable();
ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
skb, &TCP_SKB_CB(skb)->header.h4.opt,
ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
&arg, arg.iov[0].iov_len);
- TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
+ __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
+ local_bh_enable();
}
static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
@@ -846,7 +850,7 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
struct flowi *fl,
struct request_sock *req,
struct tcp_fastopen_cookie *foc,
- bool attach_req)
+ enum tcp_synack_type synack_type)
{
const struct inet_request_sock *ireq = inet_rsk(req);
struct flowi4 fl4;
@@ -857,7 +861,7 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
return -1;
- skb = tcp_make_synack(sk, dst, req, foc, attach_req);
+ skb = tcp_make_synack(sk, dst, req, foc, synack_type);
if (skb) {
__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
@@ -898,8 +902,7 @@ struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
/* caller either holds rcu_read_lock() or socket lock */
md5sig = rcu_dereference_check(tp->md5sig_info,
- sock_owned_by_user(sk) ||
- lockdep_is_held((spinlock_t *)&sk->sk_lock.slock));
+ lockdep_sock_is_held(sk));
if (!md5sig)
return NULL;
#if IS_ENABLED(CONFIG_IPV6)
@@ -944,8 +947,7 @@ int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
}
md5sig = rcu_dereference_protected(tp->md5sig_info,
- sock_owned_by_user(sk) ||
- lockdep_is_held(&sk->sk_lock.slock));
+ lockdep_sock_is_held(sk));
if (!md5sig) {
md5sig = kmalloc(sizeof(*md5sig), gfp);
if (!md5sig)
@@ -1169,12 +1171,12 @@ static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
return false;
if (hash_expected && !hash_location) {
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
return true;
}
if (!hash_expected && hash_location) {
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
return true;
}
@@ -1262,7 +1264,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
&tcp_request_sock_ipv4_ops, sk, skb);
drop:
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
+ tcp_listendrop(sk);
return 0;
}
EXPORT_SYMBOL(tcp_v4_conn_request);
@@ -1360,11 +1362,11 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
return newsk;
exit_overflow:
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
exit_nonewsk:
dst_release(dst);
exit:
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
+ tcp_listendrop(sk);
return NULL;
put_and_exit:
inet_csk_prepare_forced_close(newsk);
@@ -1461,8 +1463,8 @@ discard:
return 0;
csum_err:
- TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS);
- TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
+ TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
+ TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
goto discard;
}
EXPORT_SYMBOL(tcp_v4_do_rcv);
@@ -1535,16 +1537,16 @@ bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
__skb_queue_tail(&tp->ucopy.prequeue, skb);
tp->ucopy.memory += skb->truesize;
- if (tp->ucopy.memory > sk->sk_rcvbuf) {
+ if (skb_queue_len(&tp->ucopy.prequeue) >= 32 ||
+ tp->ucopy.memory + atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) {
struct sk_buff *skb1;
BUG_ON(sock_owned_by_user(sk));
+ __NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPPREQUEUEDROPPED,
+ skb_queue_len(&tp->ucopy.prequeue));
- while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) {
+ while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
sk_backlog_rcv(sk, skb1);
- NET_INC_STATS_BH(sock_net(sk),
- LINUX_MIB_TCPPREQUEUEDROPPED);
- }
tp->ucopy.memory = 0;
} else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
@@ -1565,24 +1567,25 @@ EXPORT_SYMBOL(tcp_prequeue);
int tcp_v4_rcv(struct sk_buff *skb)
{
+ struct net *net = dev_net(skb->dev);
const struct iphdr *iph;
const struct tcphdr *th;
+ bool refcounted;
struct sock *sk;
int ret;
- struct net *net = dev_net(skb->dev);
if (skb->pkt_type != PACKET_HOST)
goto discard_it;
/* Count it even if it's bad */
- TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
+ __TCP_INC_STATS(net, TCP_MIB_INSEGS);
if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
goto discard_it;
- th = tcp_hdr(skb);
+ th = (const struct tcphdr *)skb->data;
- if (th->doff < sizeof(struct tcphdr) / 4)
+ if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
goto bad_packet;
if (!pskb_may_pull(skb, th->doff * 4))
goto discard_it;
@@ -1595,7 +1598,7 @@ int tcp_v4_rcv(struct sk_buff *skb)
if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
goto csum_error;
- th = tcp_hdr(skb);
+ th = (const struct tcphdr *)skb->data;
iph = ip_hdr(skb);
/* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
* barrier() makes sure compiler wont play fool^Waliasing games.
@@ -1615,7 +1618,7 @@ int tcp_v4_rcv(struct sk_buff *skb)
lookup:
sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
- th->dest);
+ th->dest, &refcounted);
if (!sk)
goto no_tcp_socket;
@@ -1636,7 +1639,11 @@ process:
inet_csk_reqsk_queue_drop_and_put(sk, req);
goto lookup;
}
+ /* We own a reference on the listener, increase it again
+ * as we might lose it too soon.
+ */
sock_hold(sk);
+ refcounted = true;
nsk = tcp_check_req(sk, skb, req, false);
if (!nsk) {
reqsk_put(req);
@@ -1653,7 +1660,7 @@ process:
}
}
if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
- NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
+ __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
goto discard_and_relse;
}
@@ -1686,13 +1693,14 @@ process:
} else if (unlikely(sk_add_backlog(sk, skb,
sk->sk_rcvbuf + sk->sk_sndbuf))) {
bh_unlock_sock(sk);
- NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
+ __NET_INC_STATS(net, LINUX_MIB_TCPBACKLOGDROP);
goto discard_and_relse;
}
bh_unlock_sock(sk);
put_and_return:
- sock_put(sk);
+ if (refcounted)
+ sock_put(sk);
return ret;
@@ -1702,9 +1710,9 @@ no_tcp_socket:
if (tcp_checksum_complete(skb)) {
csum_error:
- TCP_INC_STATS_BH(net, TCP_MIB_CSUMERRORS);
+ __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
bad_packet:
- TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
+ __TCP_INC_STATS(net, TCP_MIB_INERRS);
} else {
tcp_v4_send_reset(NULL, skb);
}
@@ -1715,7 +1723,9 @@ discard_it:
return 0;
discard_and_relse:
- sock_put(sk);
+ sk_drops_add(sk, skb);
+ if (refcounted)
+ sock_put(sk);
goto discard_it;
do_time_wait:
@@ -1739,6 +1749,7 @@ do_time_wait:
if (sk2) {
inet_twsk_deschedule_put(inet_twsk(sk));
sk = sk2;
+ refcounted = false;
goto process;
}
/* Fall through to ACK */
@@ -1855,7 +1866,9 @@ void tcp_v4_destroy_sock(struct sock *sk)
tcp_free_fastopen_req(tp);
tcp_saved_syn_free(tp);
+ local_bh_disable();
sk_sockets_allocated_dec(sk);
+ local_bh_enable();
if (mem_cgroup_sockets_enabled && sk->sk_memcg)
sock_release_memcg(sk);
@@ -1872,17 +1885,17 @@ EXPORT_SYMBOL(tcp_v4_destroy_sock);
*/
static void *listening_get_next(struct seq_file *seq, void *cur)
{
- struct inet_connection_sock *icsk;
- struct hlist_nulls_node *node;
- struct sock *sk = cur;
- struct inet_listen_hashbucket *ilb;
struct tcp_iter_state *st = seq->private;
struct net *net = seq_file_net(seq);
+ struct inet_listen_hashbucket *ilb;
+ struct inet_connection_sock *icsk;
+ struct sock *sk = cur;
if (!sk) {
+get_head:
ilb = &tcp_hashinfo.listening_hash[st->bucket];
spin_lock_bh(&ilb->lock);
- sk = sk_nulls_head(&ilb->head);
+ sk = sk_head(&ilb->head);
st->offset = 0;
goto get_sk;
}
@@ -1890,28 +1903,20 @@ static void *listening_get_next(struct seq_file *seq, void *cur)
++st->num;
++st->offset;
- sk = sk_nulls_next(sk);
+ sk = sk_next(sk);
get_sk:
- sk_nulls_for_each_from(sk, node) {
+ sk_for_each_from(sk) {
if (!net_eq(sock_net(sk), net))
continue;
- if (sk->sk_family == st->family) {
- cur = sk;
- goto out;
- }
+ if (sk->sk_family == st->family)
+ return sk;
icsk = inet_csk(sk);
}
spin_unlock_bh(&ilb->lock);
st->offset = 0;
- if (++st->bucket < INET_LHTABLE_SIZE) {
- ilb = &tcp_hashinfo.listening_hash[st->bucket];
- spin_lock_bh(&ilb->lock);
- sk = sk_nulls_head(&ilb->head);
- goto get_sk;
- }
- cur = NULL;
-out:
- return cur;
+ if (++st->bucket < INET_LHTABLE_SIZE)
+ goto get_head;
+ return NULL;
}
static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
@@ -2410,6 +2415,7 @@ static int __net_init tcp_sk_init(struct net *net)
IPPROTO_TCP, net);
if (res)
goto fail;
+ sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
*per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
}
diff --git a/net/ipv4/tcp_lp.c b/net/ipv4/tcp_lp.c
index 1e70fa8fa..c67ece139 100644
--- a/net/ipv4/tcp_lp.c
+++ b/net/ipv4/tcp_lp.c
@@ -260,13 +260,13 @@ static void tcp_lp_rtt_sample(struct sock *sk, u32 rtt)
* newReno in increase case.
* We work it out by following the idea from TCP-LP's paper directly
*/
-static void tcp_lp_pkts_acked(struct sock *sk, u32 num_acked, s32 rtt_us)
+static void tcp_lp_pkts_acked(struct sock *sk, const struct ack_sample *sample)
{
struct tcp_sock *tp = tcp_sk(sk);
struct lp *lp = inet_csk_ca(sk);
- if (rtt_us > 0)
- tcp_lp_rtt_sample(sk, rtt_us);
+ if (sample->rtt_us > 0)
+ tcp_lp_rtt_sample(sk, sample->rtt_us);
/* calc inference */
if (tcp_time_stamp > tp->rx_opt.rcv_tsecr)
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index 7b7eec439..b617826e2 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -800,7 +800,8 @@ static int tcp_metrics_fill_info(struct sk_buff *msg,
}
if (nla_put_msecs(msg, TCP_METRICS_ATTR_AGE,
- jiffies - tm->tcpm_stamp) < 0)
+ jiffies - tm->tcpm_stamp,
+ TCP_METRICS_ATTR_PAD) < 0)
goto nla_put_failure;
if (tm->tcpm_ts_stamp) {
if (nla_put_s32(msg, TCP_METRICS_ATTR_TW_TS_STAMP,
@@ -864,7 +865,8 @@ static int tcp_metrics_fill_info(struct sk_buff *msg,
(nla_put_u16(msg, TCP_METRICS_ATTR_FOPEN_SYN_DROPS,
tfom->syn_loss) < 0 ||
nla_put_msecs(msg, TCP_METRICS_ATTR_FOPEN_SYN_DROP_TS,
- jiffies - tfom->last_syn_loss) < 0))
+ jiffies - tfom->last_syn_loss,
+ TCP_METRICS_ATTR_PAD) < 0))
goto nla_put_failure;
if (tfom->cookie.len > 0 &&
nla_put(msg, TCP_METRICS_ATTR_FOPEN_COOKIE,
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index acb366dd6..4b95ec4ed 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -235,7 +235,7 @@ kill:
}
if (paws_reject)
- NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_PAWSESTABREJECTED);
+ __NET_INC_STATS(twsk_net(tw), LINUX_MIB_PAWSESTABREJECTED);
if (!th->rst) {
/* In this case we must reset the TIMEWAIT timer.
@@ -337,7 +337,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
* socket up. We've got bigger problems than
* non-graceful socket closings.
*/
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPTIMEWAITOVERFLOW);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPTIMEWAITOVERFLOW);
}
tcp_update_metrics(sk);
@@ -545,7 +545,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
newtp->rack.mstamp.v64 = 0;
newtp->rack.advanced = 0;
- TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_PASSIVEOPENS);
+ __TCP_INC_STATS(sock_net(sk), TCP_MIB_PASSIVEOPENS);
}
return newsk;
}
@@ -704,10 +704,13 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
tcp_rsk(req)->rcv_nxt, tcp_rsk(req)->rcv_nxt + req->rsk_rcv_wnd)) {
/* Out of window: send ACK and drop. */
- if (!(flg & TCP_FLAG_RST))
+ if (!(flg & TCP_FLAG_RST) &&
+ !tcp_oow_rate_limited(sock_net(sk), skb,
+ LINUX_MIB_TCPACKSKIPPEDSYNRECV,
+ &tcp_rsk(req)->last_oow_ack_time))
req->rsk_ops->send_ack(sk, skb, req);
if (paws_reject)
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
+ __NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
return NULL;
}
@@ -726,7 +729,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
* "fourth, check the SYN bit"
*/
if (flg & (TCP_FLAG_RST|TCP_FLAG_SYN)) {
- TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
+ __TCP_INC_STATS(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
goto embryonic_reset;
}
@@ -749,7 +752,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
if (req->num_timeout < inet_csk(sk)->icsk_accept_queue.rskq_defer_accept &&
TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) {
inet_rsk(req)->acked = 1;
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDEFERACCEPTDROP);
+ __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDEFERACCEPTDROP);
return NULL;
}
@@ -788,7 +791,7 @@ embryonic_reset:
}
if (!fastopen) {
inet_csk_reqsk_queue_drop(sk, req);
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_EMBRYONICRSTS);
+ __NET_INC_STATS(sock_net(sk), LINUX_MIB_EMBRYONICRSTS);
}
return NULL;
}
diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
index 773083b7f..5c5964962 100644
--- a/net/ipv4/tcp_offload.c
+++ b/net/ipv4/tcp_offload.c
@@ -83,23 +83,6 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) {
/* Packet is from an untrusted source, reset gso_segs. */
- int type = skb_shinfo(skb)->gso_type;
-
- if (unlikely(type &
- ~(SKB_GSO_TCPV4 |
- SKB_GSO_DODGY |
- SKB_GSO_TCP_ECN |
- SKB_GSO_TCPV6 |
- SKB_GSO_GRE |
- SKB_GSO_GRE_CSUM |
- SKB_GSO_IPIP |
- SKB_GSO_SIT |
- SKB_GSO_UDP_TUNNEL |
- SKB_GSO_UDP_TUNNEL_CSUM |
- SKB_GSO_TUNNEL_REMCSUM |
- 0) ||
- !(type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))))
- goto out;
skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss);
@@ -107,6 +90,12 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
goto out;
}
+ /* GSO partial only requires splitting the frame into an MSS
+ * multiple and possibly a remainder. So update the mss now.
+ */
+ if (features & NETIF_F_GSO_PARTIAL)
+ mss = skb->len - (skb->len % mss);
+
copy_destructor = gso_skb->destructor == tcp_wfree;
ooo_okay = gso_skb->ooo_okay;
/* All segments but the first should have ooo_okay cleared */
@@ -131,7 +120,7 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
newcheck = ~csum_fold((__force __wsum)((__force u32)th->check +
(__force u32)delta));
- do {
+ while (skb->next) {
th->fin = th->psh = 0;
th->check = newcheck;
@@ -151,7 +140,7 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
th->seq = htonl(seq);
th->cwr = 0;
- } while (skb->next);
+ }
/* Following permits TCP Small Queues to work well with GSO :
* The callback to TCP stack will be called at the time last frag
@@ -237,7 +226,7 @@ struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb)
found:
/* Include the IP ID check below from the inner most IP hdr */
- flush = NAPI_GRO_CB(p)->flush | NAPI_GRO_CB(p)->flush_id;
+ flush = NAPI_GRO_CB(p)->flush;
flush |= (__force int)(flags & TCP_FLAG_CWR);
flush |= (__force int)((flags ^ tcp_flag_word(th2)) &
~(TCP_FLAG_CWR | TCP_FLAG_FIN | TCP_FLAG_PSH));
@@ -246,6 +235,17 @@ found:
flush |= *(u32 *)((u8 *)th + i) ^
*(u32 *)((u8 *)th2 + i);
+ /* When we receive our second frame we can made a decision on if we
+ * continue this flow as an atomic flow with a fixed ID or if we use
+ * an incrementing ID.
+ */
+ if (NAPI_GRO_CB(p)->flush_id != 1 ||
+ NAPI_GRO_CB(p)->count != 1 ||
+ !NAPI_GRO_CB(p)->is_atomic)
+ flush |= NAPI_GRO_CB(p)->flush_id;
+ else
+ NAPI_GRO_CB(p)->is_atomic = false;
+
mss = skb_shinfo(p)->gso_size;
flush |= (len - 1) >= mss;
@@ -314,6 +314,9 @@ static int tcp4_gro_complete(struct sk_buff *skb, int thoff)
iph->daddr, 0);
skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV4;
+ if (NAPI_GRO_CB(skb)->is_atomic)
+ skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_FIXEDID;
+
return tcp_gro_complete(skb);
}
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index c0c1dac81..30a7dd4f6 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -236,7 +236,8 @@ void tcp_select_initial_window(int __space, __u32 mss,
/* Set window scaling on max possible window
* See RFC1323 for an explanation of the limit to 14
*/
- space = max_t(u32, sysctl_tcp_rmem[2], sysctl_rmem_max);
+ space = max_t(u32, space, sysctl_tcp_rmem[2]);
+ space = max_t(u32, space, sysctl_rmem_max);
space = min_t(u32, space, *window_clamp);
while (space > 65535 && (*rcv_wscale) < 14) {
space >>= 1;
@@ -364,7 +365,7 @@ tcp_ecn_make_synack(const struct request_sock *req, struct tcphdr *th)
* be sent.
*/
static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb,
- int tcp_header_len)
+ struct tcphdr *th, int tcp_header_len)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -375,7 +376,7 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb,
INET_ECN_xmit(sk);
if (tp->ecn_flags & TCP_ECN_QUEUE_CWR) {
tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR;
- tcp_hdr(skb)->cwr = 1;
+ th->cwr = 1;
skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN;
}
} else if (!tcp_ca_needs_ecn(sk)) {
@@ -383,7 +384,7 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb,
INET_ECN_dontxmit(sk);
}
if (tp->ecn_flags & TCP_ECN_DEMAND_CWR)
- tcp_hdr(skb)->ece = 1;
+ th->ece = 1;
}
}
@@ -956,12 +957,12 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
skb_orphan(skb);
skb->sk = sk;
- skb->destructor = skb_is_tcp_pure_ack(skb) ? sock_wfree : tcp_wfree;
+ skb->destructor = skb_is_tcp_pure_ack(skb) ? __sock_wfree : tcp_wfree;
skb_set_hash_from_sk(skb, sk);
atomic_add(skb->truesize, &sk->sk_wmem_alloc);
/* Build TCP header and checksum it. */
- th = tcp_hdr(skb);
+ th = (struct tcphdr *)skb->data;
th->source = inet->inet_sport;
th->dest = inet->inet_dport;
th->seq = htonl(tcb->seq);
@@ -969,14 +970,6 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
*(((__be16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) |
tcb->tcp_flags);
- if (unlikely(tcb->tcp_flags & TCPHDR_SYN)) {
- /* RFC1323: The window in SYN & SYN/ACK segments
- * is never scaled.
- */
- th->window = htons(min(tp->rcv_wnd, 65535U));
- } else {
- th->window = htons(tcp_select_window(sk));
- }
th->check = 0;
th->urg_ptr = 0;
@@ -993,9 +986,15 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
tcp_options_write((__be32 *)(th + 1), tp, &opts);
skb_shinfo(skb)->gso_type = sk->sk_gso_type;
- if (likely((tcb->tcp_flags & TCPHDR_SYN) == 0))
- tcp_ecn_send(sk, skb, tcp_header_size);
-
+ if (likely(!(tcb->tcp_flags & TCPHDR_SYN))) {
+ th->window = htons(tcp_select_window(sk));
+ tcp_ecn_send(sk, skb, th, tcp_header_size);
+ } else {
+ /* RFC1323: The window in SYN & SYN/ACK segments
+ * is never scaled.
+ */
+ th->window = htons(min(tp->rcv_wnd, 65535U));
+ }
#ifdef CONFIG_TCP_MD5SIG
/* Calculate the MD5 hash, as we have all we need now */
if (md5) {
@@ -1118,11 +1117,17 @@ static void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int de
tcp_verify_left_out(tp);
}
+static bool tcp_has_tx_tstamp(const struct sk_buff *skb)
+{
+ return TCP_SKB_CB(skb)->txstamp_ack ||
+ (skb_shinfo(skb)->tx_flags & SKBTX_ANY_TSTAMP);
+}
+
static void tcp_fragment_tstamp(struct sk_buff *skb, struct sk_buff *skb2)
{
struct skb_shared_info *shinfo = skb_shinfo(skb);
- if (unlikely(shinfo->tx_flags & SKBTX_ANY_TSTAMP) &&
+ if (unlikely(tcp_has_tx_tstamp(skb)) &&
!before(shinfo->tskey, TCP_SKB_CB(skb2)->seq)) {
struct skb_shared_info *shinfo2 = skb_shinfo(skb2);
u8 tsflags = shinfo->tx_flags & SKBTX_ANY_TSTAMP;
@@ -1130,9 +1135,17 @@ static void tcp_fragment_tstamp(struct sk_buff *skb, struct sk_buff *skb2)
shinfo->tx_flags &= ~tsflags;
shinfo2->tx_flags |= tsflags;
swap(shinfo->tskey, shinfo2->tskey);
+ TCP_SKB_CB(skb2)->txstamp_ack = TCP_SKB_CB(skb)->txstamp_ack;
+ TCP_SKB_CB(skb)->txstamp_ack = 0;
}
}
+static void tcp_skb_fragment_eor(struct sk_buff *skb, struct sk_buff *skb2)
+{
+ TCP_SKB_CB(skb2)->eor = TCP_SKB_CB(skb)->eor;
+ TCP_SKB_CB(skb)->eor = 0;
+}
+
/* Function to create two new TCP segments. Shrinks the given segment
* to the specified size and appends a new segment with the rest of the
* packet to the list. This won't be called frequently, I hope.
@@ -1178,6 +1191,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH);
TCP_SKB_CB(buff)->tcp_flags = flags;
TCP_SKB_CB(buff)->sacked = TCP_SKB_CB(skb)->sacked;
+ tcp_skb_fragment_eor(skb, buff);
if (!skb_shinfo(skb)->nr_frags && skb->ip_summed != CHECKSUM_PARTIAL) {
/* Copy and checksum data tail into the new buffer. */
@@ -1738,6 +1752,8 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
/* This packet was never sent out yet, so no SACK bits. */
TCP_SKB_CB(buff)->sacked = 0;
+ tcp_skb_fragment_eor(skb, buff);
+
buff->ip_summed = skb->ip_summed = CHECKSUM_PARTIAL;
skb_split(skb, buff, len);
tcp_fragment_tstamp(skb, buff);
@@ -2211,14 +2227,13 @@ bool tcp_schedule_loss_probe(struct sock *sk)
/* Thanks to skb fast clones, we can detect if a prior transmit of
* a packet is still in a qdisc or driver queue.
* In this case, there is very little point doing a retransmit !
- * Note: This is called from BH context only.
*/
static bool skb_still_in_host_queue(const struct sock *sk,
const struct sk_buff *skb)
{
if (unlikely(skb_fclone_busy(sk, skb))) {
- NET_INC_STATS_BH(sock_net(sk),
- LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES);
+ NET_INC_STATS(sock_net(sk),
+ LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES);
return true;
}
return false;
@@ -2273,14 +2288,14 @@ void tcp_send_loss_probe(struct sock *sk)
if (WARN_ON(!skb || !tcp_skb_pcount(skb)))
goto rearm_timer;
- if (__tcp_retransmit_skb(sk, skb))
+ if (__tcp_retransmit_skb(sk, skb, 1))
goto rearm_timer;
/* Record snd_nxt for loss detection. */
tp->tlp_high_seq = tp->snd_nxt;
probe_sent:
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSSPROBES);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPLOSSPROBES);
/* Reset s.t. tcp_rearm_rto will restart timer from now */
inet_csk(sk)->icsk_pending = 0;
rearm_timer:
@@ -2451,14 +2466,15 @@ u32 __tcp_select_window(struct sock *sk)
void tcp_skb_collapse_tstamp(struct sk_buff *skb,
const struct sk_buff *next_skb)
{
- const struct skb_shared_info *next_shinfo = skb_shinfo(next_skb);
- u8 tsflags = next_shinfo->tx_flags & SKBTX_ANY_TSTAMP;
-
- if (unlikely(tsflags)) {
+ if (unlikely(tcp_has_tx_tstamp(next_skb))) {
+ const struct skb_shared_info *next_shinfo =
+ skb_shinfo(next_skb);
struct skb_shared_info *shinfo = skb_shinfo(skb);
- shinfo->tx_flags |= tsflags;
+ shinfo->tx_flags |= next_shinfo->tx_flags & SKBTX_ANY_TSTAMP;
shinfo->tskey = next_shinfo->tskey;
+ TCP_SKB_CB(skb)->txstamp_ack |=
+ TCP_SKB_CB(next_skb)->txstamp_ack;
}
}
@@ -2497,6 +2513,7 @@ static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
* packet counting does not break.
*/
TCP_SKB_CB(skb)->sacked |= TCP_SKB_CB(next_skb)->sacked & TCPCB_EVER_RETRANS;
+ TCP_SKB_CB(skb)->eor = TCP_SKB_CB(next_skb)->eor;
/* changed transmit queue under us so clear hints */
tcp_clear_retrans_hints_partial(tp);
@@ -2548,6 +2565,9 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,
if (!tcp_can_collapse(sk, skb))
break;
+ if (!tcp_skb_can_collapse_to(to))
+ break;
+
space -= skb->len;
if (first) {
@@ -2574,17 +2594,17 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,
* state updates are done by the caller. Returns non-zero if an
* error occurred which prevented the send.
*/
-int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
+int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
{
- struct tcp_sock *tp = tcp_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
unsigned int cur_mss;
- int err;
+ int diff, len, err;
- /* Inconslusive MTU probe */
- if (icsk->icsk_mtup.probe_size) {
+
+ /* Inconclusive MTU probe */
+ if (icsk->icsk_mtup.probe_size)
icsk->icsk_mtup.probe_size = 0;
- }
/* Do not sent more than we queued. 1/4 is reserved for possible
* copying overhead: fragmentation, tunneling, mangling etc.
@@ -2617,30 +2637,27 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
TCP_SKB_CB(skb)->seq != tp->snd_una)
return -EAGAIN;
- if (skb->len > cur_mss) {
- if (tcp_fragment(sk, skb, cur_mss, cur_mss, GFP_ATOMIC))
+ len = cur_mss * segs;
+ if (skb->len > len) {
+ if (tcp_fragment(sk, skb, len, cur_mss, GFP_ATOMIC))
return -ENOMEM; /* We'll try again later. */
} else {
- int oldpcount = tcp_skb_pcount(skb);
+ if (skb_unclone(skb, GFP_ATOMIC))
+ return -ENOMEM;
- if (unlikely(oldpcount > 1)) {
- if (skb_unclone(skb, GFP_ATOMIC))
- return -ENOMEM;
- tcp_init_tso_segs(skb, cur_mss);
- tcp_adjust_pcount(sk, skb, oldpcount - tcp_skb_pcount(skb));
- }
+ diff = tcp_skb_pcount(skb);
+ tcp_set_skb_tso_segs(skb, cur_mss);
+ diff -= tcp_skb_pcount(skb);
+ if (diff)
+ tcp_adjust_pcount(sk, skb, diff);
+ if (skb->len < cur_mss)
+ tcp_retrans_try_collapse(sk, skb, cur_mss);
}
/* RFC3168, section 6.1.1.1. ECN fallback */
if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN_ECN) == TCPHDR_SYN_ECN)
tcp_ecn_clear_syn(sk, skb);
- tcp_retrans_try_collapse(sk, skb, cur_mss);
-
- /* Make a copy, if the first transmission SKB clone we made
- * is still in somebody's hands, else make a clone.
- */
-
/* make sure skb->data is aligned on arches that require it
* and check if ack-trimming & collapsing extended the headroom
* beyond what csum_start can cover.
@@ -2658,20 +2675,22 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
}
if (likely(!err)) {
+ segs = tcp_skb_pcount(skb);
+
TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS;
/* Update global TCP statistics. */
- TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS);
+ TCP_ADD_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS, segs);
if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
- tp->total_retrans++;
+ __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
+ tp->total_retrans += segs;
}
return err;
}
-int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
+int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
{
struct tcp_sock *tp = tcp_sk(sk);
- int err = __tcp_retransmit_skb(sk, skb);
+ int err = __tcp_retransmit_skb(sk, skb, segs);
if (err == 0) {
#if FASTRETRANS_DEBUG > 0
@@ -2687,7 +2706,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
tp->retrans_stamp = tcp_skb_timestamp(skb);
} else if (err != -EBUSY) {
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL);
}
if (tp->undo_retrans < 0)
@@ -2740,7 +2759,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
struct sk_buff *hole = NULL;
- u32 last_lost;
+ u32 max_segs, last_lost;
int mib_idx;
int fwd_rexmitting = 0;
@@ -2760,8 +2779,10 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
last_lost = tp->snd_una;
}
+ max_segs = tcp_tso_autosize(sk, tcp_current_mss(sk));
tcp_for_write_queue_from(skb, sk) {
__u8 sacked = TCP_SKB_CB(skb)->sacked;
+ int segs;
if (skb == tcp_send_head(sk))
break;
@@ -2769,15 +2790,13 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
if (!hole)
tp->retransmit_skb_hint = skb;
- /* Assume this retransmit will generate
- * only one packet for congestion window
- * calculation purposes. This works because
- * tcp_retransmit_skb() will chop up the
- * packet to be MSS sized and all the
- * packet counting works out.
- */
- if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
+ segs = tp->snd_cwnd - tcp_packets_in_flight(tp);
+ if (segs <= 0)
return;
+ /* In case tcp_shift_skb_data() have aggregated large skbs,
+ * we need to make sure not sending too bigs TSO packets
+ */
+ segs = min_t(int, segs, max_segs);
if (fwd_rexmitting) {
begin_fwd:
@@ -2813,10 +2832,10 @@ begin_fwd:
if (sacked & (TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))
continue;
- if (tcp_retransmit_skb(sk, skb))
+ if (tcp_retransmit_skb(sk, skb, segs))
return;
- NET_INC_STATS_BH(sock_net(sk), mib_idx);
+ NET_INC_STATS(sock_net(sk), mib_idx);
if (tcp_in_cwnd_reduction(sk))
tp->prr_out += tcp_skb_pcount(skb);
@@ -2969,7 +2988,7 @@ int tcp_send_synack(struct sock *sk)
struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
struct request_sock *req,
struct tcp_fastopen_cookie *foc,
- bool attach_req)
+ enum tcp_synack_type synack_type)
{
struct inet_request_sock *ireq = inet_rsk(req);
const struct tcp_sock *tp = tcp_sk(sk);
@@ -2989,14 +3008,22 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
/* Reserve space for headers. */
skb_reserve(skb, MAX_TCP_HEADER);
- if (attach_req) {
+ switch (synack_type) {
+ case TCP_SYNACK_NORMAL:
skb_set_owner_w(skb, req_to_sk(req));
- } else {
+ break;
+ case TCP_SYNACK_COOKIE:
+ /* Under synflood, we do not attach skb to a socket,
+ * to avoid false sharing.
+ */
+ break;
+ case TCP_SYNACK_FASTOPEN:
/* sk is a const pointer, because we want to express multiple
* cpu might call us concurrently.
* sk->sk_wmem_alloc in an atomic, we can promote to rw.
*/
skb_set_owner_w(skb, (struct sock *)sk);
+ break;
}
skb_dst_set(skb, dst);
@@ -3024,7 +3051,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
skb_push(skb, tcp_header_size);
skb_reset_transport_header(skb);
- th = tcp_hdr(skb);
+ th = (struct tcphdr *)skb->data;
memset(th, 0, sizeof(struct tcphdr));
th->syn = 1;
th->ack = 1;
@@ -3045,7 +3072,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
th->window = htons(min(req->rsk_rcv_wnd, 65535U));
tcp_options_write((__be32 *)(th + 1), NULL, &opts);
th->doff = (tcp_header_size >> 2);
- TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_OUTSEGS);
+ __TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS);
#ifdef CONFIG_TCP_MD5SIG
/* Okay, we have all we need - do the md5 hash if needed */
@@ -3549,10 +3576,10 @@ int tcp_rtx_synack(const struct sock *sk, struct request_sock *req)
int res;
tcp_rsk(req)->txhash = net_tx_rndhash();
- res = af_ops->send_synack(sk, NULL, &fl, req, NULL, true);
+ res = af_ops->send_synack(sk, NULL, &fl, req, NULL, TCP_SYNACK_NORMAL);
if (!res) {
- TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
+ __TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS);
+ __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
}
return res;
}
diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c
index 5353085fd..e36df4fcf 100644
--- a/net/ipv4/tcp_recovery.c
+++ b/net/ipv4/tcp_recovery.c
@@ -65,8 +65,8 @@ int tcp_rack_mark_lost(struct sock *sk)
if (scb->sacked & TCPCB_SACKED_RETRANS) {
scb->sacked &= ~TCPCB_SACKED_RETRANS;
tp->retrans_out -= tcp_skb_pcount(skb);
- NET_INC_STATS_BH(sock_net(sk),
- LINUX_MIB_TCPLOSTRETRANSMIT);
+ NET_INC_STATS(sock_net(sk),
+ LINUX_MIB_TCPLOSTRETRANSMIT);
}
} else if (!(scb->sacked & TCPCB_RETRANS)) {
/* Original data are sent sequentially so stop early
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 49bc474f8..debdd8b33 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -30,7 +30,7 @@ static void tcp_write_err(struct sock *sk)
sk->sk_error_report(sk);
tcp_done(sk);
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONTIMEOUT);
+ __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONTIMEOUT);
}
/* Do not allow orphaned sockets to eat all our resources.
@@ -68,7 +68,7 @@ static int tcp_out_of_resources(struct sock *sk, bool do_reset)
if (do_reset)
tcp_send_active_reset(sk, GFP_ATOMIC);
tcp_done(sk);
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONMEMORY);
+ __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONMEMORY);
return 1;
}
return 0;
@@ -162,8 +162,8 @@ static int tcp_write_timeout(struct sock *sk)
if (tp->syn_fastopen || tp->syn_data)
tcp_fastopen_cache_set(sk, 0, NULL, true, 0);
if (tp->syn_data && icsk->icsk_retransmits == 1)
- NET_INC_STATS_BH(sock_net(sk),
- LINUX_MIB_TCPFASTOPENACTIVEFAIL);
+ NET_INC_STATS(sock_net(sk),
+ LINUX_MIB_TCPFASTOPENACTIVEFAIL);
}
retry_until = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_syn_retries;
syn_set = true;
@@ -178,8 +178,8 @@ static int tcp_write_timeout(struct sock *sk)
tp->bytes_acked <= tp->rx_opt.mss_clamp) {
tcp_fastopen_cache_set(sk, 0, NULL, true, 0);
if (icsk->icsk_retransmits == net->ipv4.sysctl_tcp_retries1)
- NET_INC_STATS_BH(sock_net(sk),
- LINUX_MIB_TCPFASTOPENACTIVEFAIL);
+ NET_INC_STATS(sock_net(sk),
+ LINUX_MIB_TCPFASTOPENACTIVEFAIL);
}
/* Black hole detection */
tcp_mtu_probing(icsk, sk);
@@ -209,6 +209,7 @@ static int tcp_write_timeout(struct sock *sk)
return 0;
}
+/* Called with BH disabled */
void tcp_delack_timer_handler(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -228,7 +229,7 @@ void tcp_delack_timer_handler(struct sock *sk)
if (!skb_queue_empty(&tp->ucopy.prequeue)) {
struct sk_buff *skb;
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSCHEDULERFAILED);
+ __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSCHEDULERFAILED);
while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
sk_backlog_rcv(sk, skb);
@@ -248,7 +249,7 @@ void tcp_delack_timer_handler(struct sock *sk)
icsk->icsk_ack.ato = TCP_ATO_MIN;
}
tcp_send_ack(sk);
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKS);
+ __NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKS);
}
out:
@@ -265,7 +266,7 @@ static void tcp_delack_timer(unsigned long data)
tcp_delack_timer_handler(sk);
} else {
inet_csk(sk)->icsk_ack.blocked = 1;
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED);
+ __NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED);
/* deleguate our work to tcp_release_cb() */
if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags))
sock_hold(sk);
@@ -404,7 +405,7 @@ void tcp_retransmit_timer(struct sock *sk)
goto out;
}
tcp_enter_loss(sk);
- tcp_retransmit_skb(sk, tcp_write_queue_head(sk));
+ tcp_retransmit_skb(sk, tcp_write_queue_head(sk), 1);
__sk_dst_reset(sk);
goto out_reset_timer;
}
@@ -431,12 +432,12 @@ void tcp_retransmit_timer(struct sock *sk)
} else {
mib_idx = LINUX_MIB_TCPTIMEOUTS;
}
- NET_INC_STATS_BH(sock_net(sk), mib_idx);
+ __NET_INC_STATS(sock_net(sk), mib_idx);
}
tcp_enter_loss(sk);
- if (tcp_retransmit_skb(sk, tcp_write_queue_head(sk)) > 0) {
+ if (tcp_retransmit_skb(sk, tcp_write_queue_head(sk), 1) > 0) {
/* Retransmission failed because of local congestion,
* do not backoff.
*/
@@ -493,6 +494,7 @@ out_reset_timer:
out:;
}
+/* Called with BH disabled */
void tcp_write_timer_handler(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
@@ -549,7 +551,7 @@ void tcp_syn_ack_timeout(const struct request_sock *req)
{
struct net *net = read_pnet(&inet_rsk(req)->ireq_net);
- NET_INC_STATS_BH(net, LINUX_MIB_TCPTIMEOUTS);
+ __NET_INC_STATS(net, LINUX_MIB_TCPTIMEOUTS);
}
EXPORT_SYMBOL(tcp_syn_ack_timeout);
diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c
index 13951c408..4c4bac1b5 100644
--- a/net/ipv4/tcp_vegas.c
+++ b/net/ipv4/tcp_vegas.c
@@ -107,16 +107,16 @@ EXPORT_SYMBOL_GPL(tcp_vegas_init);
* o min-filter RTT samples from a much longer window (forever for now)
* to find the propagation delay (baseRTT)
*/
-void tcp_vegas_pkts_acked(struct sock *sk, u32 cnt, s32 rtt_us)
+void tcp_vegas_pkts_acked(struct sock *sk, const struct ack_sample *sample)
{
struct vegas *vegas = inet_csk_ca(sk);
u32 vrtt;
- if (rtt_us < 0)
+ if (sample->rtt_us < 0)
return;
/* Never allow zero rtt or baseRTT */
- vrtt = rtt_us + 1;
+ vrtt = sample->rtt_us + 1;
/* Filter to find propagation delay: */
if (vrtt < vegas->baseRTT)
diff --git a/net/ipv4/tcp_vegas.h b/net/ipv4/tcp_vegas.h
index ef9da5306..248cfc0ff 100644
--- a/net/ipv4/tcp_vegas.h
+++ b/net/ipv4/tcp_vegas.h
@@ -17,7 +17,7 @@ struct vegas {
void tcp_vegas_init(struct sock *sk);
void tcp_vegas_state(struct sock *sk, u8 ca_state);
-void tcp_vegas_pkts_acked(struct sock *sk, u32 cnt, s32 rtt_us);
+void tcp_vegas_pkts_acked(struct sock *sk, const struct ack_sample *sample);
void tcp_vegas_cwnd_event(struct sock *sk, enum tcp_ca_event event);
size_t tcp_vegas_get_info(struct sock *sk, u32 ext, int *attr,
union tcp_cc_info *info);
diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c
index 0d094b995..40171e163 100644
--- a/net/ipv4/tcp_veno.c
+++ b/net/ipv4/tcp_veno.c
@@ -69,16 +69,17 @@ static void tcp_veno_init(struct sock *sk)
}
/* Do rtt sampling needed for Veno. */
-static void tcp_veno_pkts_acked(struct sock *sk, u32 cnt, s32 rtt_us)
+static void tcp_veno_pkts_acked(struct sock *sk,
+ const struct ack_sample *sample)
{
struct veno *veno = inet_csk_ca(sk);
u32 vrtt;
- if (rtt_us < 0)
+ if (sample->rtt_us < 0)
return;
/* Never allow zero rtt or baseRTT */
- vrtt = rtt_us + 1;
+ vrtt = sample->rtt_us + 1;
/* Filter to find propagation delay: */
if (vrtt < veno->basertt)
diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c
index c10732e39..4b03a2e2a 100644
--- a/net/ipv4/tcp_westwood.c
+++ b/net/ipv4/tcp_westwood.c
@@ -99,12 +99,13 @@ static void westwood_filter(struct westwood *w, u32 delta)
* Called after processing group of packets.
* but all westwood needs is the last sample of srtt.
*/
-static void tcp_westwood_pkts_acked(struct sock *sk, u32 cnt, s32 rtt)
+static void tcp_westwood_pkts_acked(struct sock *sk,
+ const struct ack_sample *sample)
{
struct westwood *w = inet_csk_ca(sk);
- if (rtt > 0)
- w->rtt = usecs_to_jiffies(rtt);
+ if (sample->rtt_us > 0)
+ w->rtt = usecs_to_jiffies(sample->rtt_us);
}
/*
diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c
index 3e6a472e6..028eb046e 100644
--- a/net/ipv4/tcp_yeah.c
+++ b/net/ipv4/tcp_yeah.c
@@ -56,15 +56,16 @@ static void tcp_yeah_init(struct sock *sk)
tp->snd_cwnd_clamp = min_t(u32, tp->snd_cwnd_clamp, 0xffffffff/128);
}
-static void tcp_yeah_pkts_acked(struct sock *sk, u32 pkts_acked, s32 rtt_us)
+static void tcp_yeah_pkts_acked(struct sock *sk,
+ const struct ack_sample *sample)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
struct yeah *yeah = inet_csk_ca(sk);
if (icsk->icsk_ca_state == TCP_CA_Open)
- yeah->pkts_acked = pkts_acked;
+ yeah->pkts_acked = sample->pkts_acked;
- tcp_vegas_pkts_acked(sk, pkts_acked, rtt_us);
+ tcp_vegas_pkts_acked(sk, sample);
}
static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 acked)
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index e9853dff7..e61f7cd65 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -143,10 +143,9 @@ static int udp_lib_lport_inuse(struct net *net, __u16 num,
unsigned int log)
{
struct sock *sk2;
- struct hlist_nulls_node *node;
kuid_t uid = sock_i_uid(sk);
- sk_nulls_for_each(sk2, node, &hslot->head) {
+ sk_for_each(sk2, &hslot->head) {
if (net_eq(sock_net(sk2), net) &&
sk2 != sk &&
(bitmap || udp_sk(sk2)->udp_port_hash == num) &&
@@ -177,12 +176,11 @@ static int udp_lib_lport_inuse2(struct net *net, __u16 num,
bool match_wildcard))
{
struct sock *sk2;
- struct hlist_nulls_node *node;
kuid_t uid = sock_i_uid(sk);
int res = 0;
spin_lock(&hslot2->lock);
- udp_portaddr_for_each_entry(sk2, node, &hslot2->head) {
+ udp_portaddr_for_each_entry(sk2, &hslot2->head) {
if (net_eq(sock_net(sk2), net) &&
sk2 != sk &&
(udp_sk(sk2)->udp_port_hash == num) &&
@@ -207,11 +205,10 @@ static int udp_reuseport_add_sock(struct sock *sk, struct udp_hslot *hslot,
bool match_wildcard))
{
struct net *net = sock_net(sk);
- struct hlist_nulls_node *node;
kuid_t uid = sock_i_uid(sk);
struct sock *sk2;
- sk_nulls_for_each(sk2, node, &hslot->head) {
+ sk_for_each(sk2, &hslot->head) {
if (net_eq(sock_net(sk2), net) &&
sk2 != sk &&
sk2->sk_family == sk->sk_family &&
@@ -333,22 +330,23 @@ found:
goto fail_unlock;
}
- sk_nulls_add_node_rcu(sk, &hslot->head);
+ sk_add_node_rcu(sk, &hslot->head);
hslot->count++;
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash);
spin_lock(&hslot2->lock);
if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport &&
- sk->sk_family == AF_INET6)
- hlist_nulls_add_tail_rcu(&udp_sk(sk)->udp_portaddr_node,
- &hslot2->head);
+ sk->sk_family == AF_INET6)
+ hlist_add_tail_rcu(&udp_sk(sk)->udp_portaddr_node,
+ &hslot2->head);
else
- hlist_nulls_add_head_rcu(&udp_sk(sk)->udp_portaddr_node,
- &hslot2->head);
+ hlist_add_head_rcu(&udp_sk(sk)->udp_portaddr_node,
+ &hslot2->head);
hslot2->count++;
spin_unlock(&hslot2->lock);
}
+ sock_set_flag(sk, SOCK_RCU_FREE);
error = 0;
fail_unlock:
spin_unlock_bh(&hslot->lock);
@@ -393,9 +391,9 @@ int udp_v4_get_port(struct sock *sk, unsigned short snum)
return udp_lib_get_port(sk, snum, ipv4_rcv_saddr_equal, hash2_nulladdr);
}
-static inline int compute_score(struct sock *sk, struct net *net,
- __be32 saddr, unsigned short hnum, __be16 sport,
- __be32 daddr, __be16 dport, int dif)
+static int compute_score(struct sock *sk, struct net *net,
+ __be32 saddr, __be16 sport,
+ __be32 daddr, unsigned short hnum, int dif)
{
int score;
struct inet_sock *inet;
@@ -436,52 +434,6 @@ static inline int compute_score(struct sock *sk, struct net *net,
return score;
}
-/*
- * In this second variant, we check (daddr, dport) matches (inet_rcv_sadd, inet_num)
- */
-static inline int compute_score2(struct sock *sk, struct net *net,
- __be32 saddr, __be16 sport,
- __be32 daddr, unsigned int hnum, int dif)
-{
- int score;
- struct inet_sock *inet;
-
- if (!net_eq(sock_net(sk), net) ||
- ipv6_only_sock(sk))
- return -1;
-
- inet = inet_sk(sk);
-
- if (inet->inet_rcv_saddr != daddr ||
- inet->inet_num != hnum)
- return -1;
-
- score = (sk->sk_family == PF_INET) ? 2 : 1;
-
- if (inet->inet_daddr) {
- if (inet->inet_daddr != saddr)
- return -1;
- score += 4;
- }
-
- if (inet->inet_dport) {
- if (inet->inet_dport != sport)
- return -1;
- score += 4;
- }
-
- if (sk->sk_bound_dev_if) {
- if (sk->sk_bound_dev_if != dif)
- return -1;
- score += 4;
- }
-
- if (sk->sk_incoming_cpu == raw_smp_processor_id())
- score++;
-
- return score;
-}
-
static u32 udp_ehashfn(const struct net *net, const __be32 laddr,
const __u16 lport, const __be32 faddr,
const __be16 fport)
@@ -494,45 +446,35 @@ static u32 udp_ehashfn(const struct net *net, const __be32 laddr,
udp_ehash_secret + net_hash_mix(net));
}
-/* called with read_rcu_lock() */
+/* called with rcu_read_lock() */
static struct sock *udp4_lib_lookup2(struct net *net,
__be32 saddr, __be16 sport,
__be32 daddr, unsigned int hnum, int dif,
- struct udp_hslot *hslot2, unsigned int slot2,
+ struct udp_hslot *hslot2,
struct sk_buff *skb)
{
struct sock *sk, *result;
- struct hlist_nulls_node *node;
int score, badness, matches = 0, reuseport = 0;
- bool select_ok = true;
u32 hash = 0;
-begin:
result = NULL;
badness = 0;
- udp_portaddr_for_each_entry_rcu(sk, node, &hslot2->head) {
- score = compute_score2(sk, net, saddr, sport,
+ udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) {
+ score = compute_score(sk, net, saddr, sport,
daddr, hnum, dif);
if (score > badness) {
- result = sk;
- badness = score;
reuseport = sk->sk_reuseport;
if (reuseport) {
hash = udp_ehashfn(net, daddr, hnum,
saddr, sport);
- if (select_ok) {
- struct sock *sk2;
-
- sk2 = reuseport_select_sock(sk, hash, skb,
+ result = reuseport_select_sock(sk, hash, skb,
sizeof(struct udphdr));
- if (sk2) {
- result = sk2;
- select_ok = false;
- goto found;
- }
- }
+ if (result)
+ return result;
matches = 1;
}
+ badness = score;
+ result = sk;
} else if (score == badness && reuseport) {
matches++;
if (reciprocal_scale(hash, matches) == 0)
@@ -540,23 +482,6 @@ begin:
hash = next_pseudo_random32(hash);
}
}
- /*
- * if the nulls value we got at the end of this lookup is
- * not the expected one, we must restart lookup.
- * We probably met an item that was moved to another chain.
- */
- if (get_nulls_value(node) != slot2)
- goto begin;
- if (result) {
-found:
- if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2)))
- result = NULL;
- else if (unlikely(compute_score2(result, net, saddr, sport,
- daddr, hnum, dif) < badness)) {
- sock_put(result);
- goto begin;
- }
- }
return result;
}
@@ -568,15 +493,12 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
int dif, struct udp_table *udptable, struct sk_buff *skb)
{
struct sock *sk, *result;
- struct hlist_nulls_node *node;
unsigned short hnum = ntohs(dport);
unsigned int hash2, slot2, slot = udp_hashfn(net, hnum, udptable->mask);
struct udp_hslot *hslot2, *hslot = &udptable->hash[slot];
int score, badness, matches = 0, reuseport = 0;
- bool select_ok = true;
u32 hash = 0;
- rcu_read_lock();
if (hslot->count > 10) {
hash2 = udp4_portaddr_hash(net, daddr, hnum);
slot2 = hash2 & udptable->mask;
@@ -586,47 +508,44 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
result = udp4_lib_lookup2(net, saddr, sport,
daddr, hnum, dif,
- hslot2, slot2, skb);
+ hslot2, skb);
if (!result) {
+ unsigned int old_slot2 = slot2;
hash2 = udp4_portaddr_hash(net, htonl(INADDR_ANY), hnum);
slot2 = hash2 & udptable->mask;
+ /* avoid searching the same slot again. */
+ if (unlikely(slot2 == old_slot2))
+ return result;
+
hslot2 = &udptable->hash2[slot2];
if (hslot->count < hslot2->count)
goto begin;
result = udp4_lib_lookup2(net, saddr, sport,
- htonl(INADDR_ANY), hnum, dif,
- hslot2, slot2, skb);
+ daddr, hnum, dif,
+ hslot2, skb);
}
- rcu_read_unlock();
return result;
}
begin:
result = NULL;
badness = 0;
- sk_nulls_for_each_rcu(sk, node, &hslot->head) {
- score = compute_score(sk, net, saddr, hnum, sport,
- daddr, dport, dif);
+ sk_for_each_rcu(sk, &hslot->head) {
+ score = compute_score(sk, net, saddr, sport,
+ daddr, hnum, dif);
if (score > badness) {
- result = sk;
- badness = score;
reuseport = sk->sk_reuseport;
if (reuseport) {
hash = udp_ehashfn(net, daddr, hnum,
saddr, sport);
- if (select_ok) {
- struct sock *sk2;
-
- sk2 = reuseport_select_sock(sk, hash, skb,
+ result = reuseport_select_sock(sk, hash, skb,
sizeof(struct udphdr));
- if (sk2) {
- result = sk2;
- select_ok = false;
- goto found;
- }
- }
+ if (result)
+ return result;
matches = 1;
}
+ result = sk;
+ badness = score;
} else if (score == badness && reuseport) {
matches++;
if (reciprocal_scale(hash, matches) == 0)
@@ -634,25 +553,6 @@ begin:
hash = next_pseudo_random32(hash);
}
}
- /*
- * if the nulls value we got at the end of this lookup is
- * not the expected one, we must restart lookup.
- * We probably met an item that was moved to another chain.
- */
- if (get_nulls_value(node) != slot)
- goto begin;
-
- if (result) {
-found:
- if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2)))
- result = NULL;
- else if (unlikely(compute_score(result, net, saddr, hnum, sport,
- daddr, dport, dif) < badness)) {
- sock_put(result);
- goto begin;
- }
- }
- rcu_read_unlock();
return result;
}
EXPORT_SYMBOL_GPL(__udp4_lib_lookup);
@@ -663,18 +563,36 @@ static inline struct sock *__udp4_lib_lookup_skb(struct sk_buff *skb,
{
const struct iphdr *iph = ip_hdr(skb);
- return __udp4_lib_lookup(dev_net(skb_dst(skb)->dev), iph->saddr, sport,
+ return __udp4_lib_lookup(dev_net(skb->dev), iph->saddr, sport,
iph->daddr, dport, inet_iif(skb),
udptable, skb);
}
+struct sock *udp4_lib_lookup_skb(struct sk_buff *skb,
+ __be16 sport, __be16 dport)
+{
+ return __udp4_lib_lookup_skb(skb, sport, dport, &udp_table);
+}
+EXPORT_SYMBOL_GPL(udp4_lib_lookup_skb);
+
+/* Must be called under rcu_read_lock().
+ * Does increment socket refcount.
+ */
+#if IS_ENABLED(CONFIG_NETFILTER_XT_MATCH_SOCKET) || \
+ IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TPROXY)
struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport,
__be32 daddr, __be16 dport, int dif)
{
- return __udp4_lib_lookup(net, saddr, sport, daddr, dport, dif,
- &udp_table, NULL);
+ struct sock *sk;
+
+ sk = __udp4_lib_lookup(net, saddr, sport, daddr, dport,
+ dif, &udp_table, NULL);
+ if (sk && !atomic_inc_not_zero(&sk->sk_refcnt))
+ sk = NULL;
+ return sk;
}
EXPORT_SYMBOL_GPL(udp4_lib_lookup);
+#endif
static inline bool __udp_is_mcast_sock(struct net *net, struct sock *sk,
__be16 loc_port, __be32 loc_addr,
@@ -723,7 +641,7 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
iph->saddr, uh->source, skb->dev->ifindex, udptable,
NULL);
if (!sk) {
- ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
+ __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
return; /* No socket for error */
}
@@ -776,7 +694,7 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
sk->sk_err = err;
sk->sk_error_report(sk);
out:
- sock_put(sk);
+ return;
}
void udp_err(struct sk_buff *skb, u32 info)
@@ -917,13 +835,13 @@ send:
err = ip_send_skb(sock_net(sk), skb);
if (err) {
if (err == -ENOBUFS && !inet->recverr) {
- UDP_INC_STATS_USER(sock_net(sk),
- UDP_MIB_SNDBUFERRORS, is_udplite);
+ UDP_INC_STATS(sock_net(sk),
+ UDP_MIB_SNDBUFERRORS, is_udplite);
err = 0;
}
} else
- UDP_INC_STATS_USER(sock_net(sk),
- UDP_MIB_OUTDATAGRAMS, is_udplite);
+ UDP_INC_STATS(sock_net(sk),
+ UDP_MIB_OUTDATAGRAMS, is_udplite);
return err;
}
@@ -1032,15 +950,13 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
*/
connected = 1;
}
- ipc.addr = inet->inet_saddr;
+ ipc.sockc.tsflags = sk->sk_tsflags;
+ ipc.addr = inet->inet_saddr;
ipc.oif = sk->sk_bound_dev_if;
- sock_tx_timestamp(sk, &ipc.tx_flags);
-
if (msg->msg_controllen) {
- err = ip_cmsg_send(sock_net(sk), msg, &ipc,
- sk->sk_family == AF_INET6);
+ err = ip_cmsg_send(sk, msg, &ipc, sk->sk_family == AF_INET6);
if (unlikely(err)) {
kfree(ipc.opt);
return err;
@@ -1065,6 +981,8 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
saddr = ipc.addr;
ipc.addr = faddr = daddr;
+ sock_tx_timestamp(sk, ipc.sockc.tsflags, &ipc.tx_flags);
+
if (ipc.opt && ipc.opt->opt.srr) {
if (!daddr)
return -EINVAL;
@@ -1192,8 +1110,8 @@ out:
* seems like overkill.
*/
if (err == -ENOBUFS || test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
- UDP_INC_STATS_USER(sock_net(sk),
- UDP_MIB_SNDBUFERRORS, is_udplite);
+ UDP_INC_STATS(sock_net(sk),
+ UDP_MIB_SNDBUFERRORS, is_udplite);
}
return err;
@@ -1277,10 +1195,10 @@ static unsigned int first_packet_length(struct sock *sk)
spin_lock_bh(&rcvq->lock);
while ((skb = skb_peek(rcvq)) != NULL &&
udp_lib_checksum_complete(skb)) {
- UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_CSUMERRORS,
- IS_UDPLITE(sk));
- UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS,
- IS_UDPLITE(sk));
+ __UDP_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS,
+ IS_UDPLITE(sk));
+ __UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS,
+ IS_UDPLITE(sk));
atomic_inc(&sk->sk_drops);
__skb_unlink(skb, rcvq);
__skb_queue_tail(&list_kill, skb);
@@ -1316,14 +1234,6 @@ int udp_ioctl(struct sock *sk, int cmd, unsigned long arg)
{
unsigned int amount = first_packet_length(sk);
- if (amount)
- /*
- * We will only return the amount
- * of this packet since that is all
- * that will be read.
- */
- amount -= sizeof(struct udphdr);
-
return put_user(amount, (int __user *)arg);
}
@@ -1347,7 +1257,7 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock,
DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name);
struct sk_buff *skb;
unsigned int ulen, copied;
- int peeked, off = 0;
+ int peeked, peeking, off;
int err;
int is_udplite = IS_UDPLITE(sk);
bool checksum_valid = false;
@@ -1357,15 +1267,16 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock,
return ip_recv_error(sk, msg, len, addr_len);
try_again:
+ peeking = off = sk_peek_offset(sk, flags);
skb = __skb_recv_datagram(sk, flags | (noblock ? MSG_DONTWAIT : 0),
&peeked, &off, &err);
if (!skb)
- goto out;
+ return err;
- ulen = skb->len - sizeof(struct udphdr);
+ ulen = skb->len;
copied = len;
- if (copied > ulen)
- copied = ulen;
+ if (copied > ulen - off)
+ copied = ulen - off;
else if (copied < ulen)
msg->msg_flags |= MSG_TRUNC;
@@ -1375,18 +1286,16 @@ try_again:
* coverage checksum (UDP-Lite), do it before the copy.
*/
- if (copied < ulen || UDP_SKB_CB(skb)->partial_cov) {
+ if (copied < ulen || UDP_SKB_CB(skb)->partial_cov || peeking) {
checksum_valid = !udp_lib_checksum_complete(skb);
if (!checksum_valid)
goto csum_copy_err;
}
if (checksum_valid || skb_csum_unnecessary(skb))
- err = skb_copy_datagram_msg(skb, sizeof(struct udphdr),
- msg, copied);
+ err = skb_copy_datagram_msg(skb, off, msg, copied);
else {
- err = skb_copy_and_csum_datagram_msg(skb, sizeof(struct udphdr),
- msg);
+ err = skb_copy_and_csum_datagram_msg(skb, off, msg);
if (err == -EINVAL)
goto csum_copy_err;
@@ -1396,15 +1305,16 @@ try_again:
trace_kfree_skb(skb, udp_recvmsg);
if (!peeked) {
atomic_inc(&sk->sk_drops);
- UDP_INC_STATS_USER(sock_net(sk),
- UDP_MIB_INERRORS, is_udplite);
+ UDP_INC_STATS(sock_net(sk),
+ UDP_MIB_INERRORS, is_udplite);
}
- goto out_free;
+ skb_free_datagram_locked(sk, skb);
+ return err;
}
if (!peeked)
- UDP_INC_STATS_USER(sock_net(sk),
- UDP_MIB_INDATAGRAMS, is_udplite);
+ UDP_INC_STATS(sock_net(sk),
+ UDP_MIB_INDATAGRAMS, is_udplite);
sock_recv_ts_and_drops(msg, sk, skb);
@@ -1417,22 +1327,20 @@ try_again:
*addr_len = sizeof(*sin);
}
if (inet->cmsg_flags)
- ip_cmsg_recv_offset(msg, skb, sizeof(struct udphdr));
+ ip_cmsg_recv_offset(msg, skb, sizeof(struct udphdr) + off);
err = copied;
if (flags & MSG_TRUNC)
err = ulen;
-out_free:
- skb_free_datagram_locked(sk, skb);
-out:
+ __skb_free_datagram_locked(sk, skb, peeking ? -err : err);
return err;
csum_copy_err:
slow = lock_sock_fast(sk);
if (!skb_kill_datagram(sk, skb, flags)) {
- UDP_INC_STATS_USER(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite);
- UDP_INC_STATS_USER(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
+ UDP_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite);
+ UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
}
unlock_sock_fast(sk, slow);
@@ -1479,13 +1387,13 @@ void udp_lib_unhash(struct sock *sk)
spin_lock_bh(&hslot->lock);
if (rcu_access_pointer(sk->sk_reuseport_cb))
reuseport_detach_sock(sk);
- if (sk_nulls_del_node_init_rcu(sk)) {
+ if (sk_del_node_init_rcu(sk)) {
hslot->count--;
inet_sk(sk)->inet_num = 0;
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
spin_lock(&hslot2->lock);
- hlist_nulls_del_init_rcu(&udp_sk(sk)->udp_portaddr_node);
+ hlist_del_init_rcu(&udp_sk(sk)->udp_portaddr_node);
hslot2->count--;
spin_unlock(&hslot2->lock);
}
@@ -1518,12 +1426,12 @@ void udp_lib_rehash(struct sock *sk, u16 newhash)
if (hslot2 != nhslot2) {
spin_lock(&hslot2->lock);
- hlist_nulls_del_init_rcu(&udp_sk(sk)->udp_portaddr_node);
+ hlist_del_init_rcu(&udp_sk(sk)->udp_portaddr_node);
hslot2->count--;
spin_unlock(&hslot2->lock);
spin_lock(&nhslot2->lock);
- hlist_nulls_add_head_rcu(&udp_sk(sk)->udp_portaddr_node,
+ hlist_add_head_rcu(&udp_sk(sk)->udp_portaddr_node,
&nhslot2->head);
nhslot2->count++;
spin_unlock(&nhslot2->lock);
@@ -1553,15 +1461,15 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
sk_incoming_cpu_update(sk);
}
- rc = sock_queue_rcv_skb(sk, skb);
+ rc = __sock_queue_rcv_skb(sk, skb);
if (rc < 0) {
int is_udplite = IS_UDPLITE(sk);
/* Note that an ENOMEM error is charged twice */
if (rc == -ENOMEM)
- UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_RCVBUFERRORS,
- is_udplite);
- UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
+ UDP_INC_STATS(sock_net(sk), UDP_MIB_RCVBUFERRORS,
+ is_udplite);
+ UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
kfree_skb(skb);
trace_udp_fail_queue_rcv_skb(rc, sk);
return -1;
@@ -1625,9 +1533,9 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
ret = encap_rcv(sk, skb);
if (ret <= 0) {
- UDP_INC_STATS_BH(sock_net(sk),
- UDP_MIB_INDATAGRAMS,
- is_udplite);
+ __UDP_INC_STATS(sock_net(sk),
+ UDP_MIB_INDATAGRAMS,
+ is_udplite);
return -ret;
}
}
@@ -1671,11 +1579,15 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
if (rcu_access_pointer(sk->sk_filter) &&
udp_lib_checksum_complete(skb))
- goto csum_error;
+ goto csum_error;
+ if (sk_filter_trim_cap(sk, skb, sizeof(struct udphdr)))
+ goto drop;
+
+ udp_csum_pull_header(skb);
if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
- UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_RCVBUFERRORS,
- is_udplite);
+ __UDP_INC_STATS(sock_net(sk), UDP_MIB_RCVBUFERRORS,
+ is_udplite);
goto drop;
}
@@ -1694,43 +1606,14 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
return rc;
csum_error:
- UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite);
+ __UDP_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite);
drop:
- UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
+ __UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
atomic_inc(&sk->sk_drops);
kfree_skb(skb);
return -1;
}
-static void flush_stack(struct sock **stack, unsigned int count,
- struct sk_buff *skb, unsigned int final)
-{
- unsigned int i;
- struct sk_buff *skb1 = NULL;
- struct sock *sk;
-
- for (i = 0; i < count; i++) {
- sk = stack[i];
- if (likely(!skb1))
- skb1 = (i == final) ? skb : skb_clone(skb, GFP_ATOMIC);
-
- if (!skb1) {
- atomic_inc(&sk->sk_drops);
- UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_RCVBUFERRORS,
- IS_UDPLITE(sk));
- UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS,
- IS_UDPLITE(sk));
- }
-
- if (skb1 && udp_queue_rcv_skb(sk, skb1) <= 0)
- skb1 = NULL;
-
- sock_put(sk);
- }
- if (unlikely(skb1))
- kfree_skb(skb1);
-}
-
/* For TCP sockets, sk_rx_dst is protected by socket lock
* For UDP, we use xchg() to guard against concurrent changes.
*/
@@ -1754,14 +1637,14 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
struct udp_table *udptable,
int proto)
{
- struct sock *sk, *stack[256 / sizeof(struct sock *)];
- struct hlist_nulls_node *node;
+ struct sock *sk, *first = NULL;
unsigned short hnum = ntohs(uh->dest);
struct udp_hslot *hslot = udp_hashslot(udptable, net, hnum);
- int dif = skb->dev->ifindex;
- unsigned int count = 0, offset = offsetof(typeof(*sk), sk_nulls_node);
unsigned int hash2 = 0, hash2_any = 0, use_hash2 = (hslot->count > 10);
- bool inner_flushed = false;
+ unsigned int offset = offsetof(typeof(*sk), sk_node);
+ int dif = skb->dev->ifindex;
+ struct hlist_node *node;
+ struct sk_buff *nskb;
if (use_hash2) {
hash2_any = udp4_portaddr_hash(net, htonl(INADDR_ANY), hnum) &
@@ -1772,23 +1655,28 @@ start_lookup:
offset = offsetof(typeof(*sk), __sk_common.skc_portaddr_node);
}
- spin_lock(&hslot->lock);
- sk_nulls_for_each_entry_offset(sk, node, &hslot->head, offset) {
- if (__udp_is_mcast_sock(net, sk,
- uh->dest, daddr,
- uh->source, saddr,
- dif, hnum)) {
- if (unlikely(count == ARRAY_SIZE(stack))) {
- flush_stack(stack, count, skb, ~0);
- inner_flushed = true;
- count = 0;
- }
- stack[count++] = sk;
- sock_hold(sk);
+ sk_for_each_entry_offset_rcu(sk, node, &hslot->head, offset) {
+ if (!__udp_is_mcast_sock(net, sk, uh->dest, daddr,
+ uh->source, saddr, dif, hnum))
+ continue;
+
+ if (!first) {
+ first = sk;
+ continue;
}
- }
+ nskb = skb_clone(skb, GFP_ATOMIC);
- spin_unlock(&hslot->lock);
+ if (unlikely(!nskb)) {
+ atomic_inc(&sk->sk_drops);
+ __UDP_INC_STATS(net, UDP_MIB_RCVBUFERRORS,
+ IS_UDPLITE(sk));
+ __UDP_INC_STATS(net, UDP_MIB_INERRORS,
+ IS_UDPLITE(sk));
+ continue;
+ }
+ if (udp_queue_rcv_skb(sk, nskb) > 0)
+ consume_skb(nskb);
+ }
/* Also lookup *:port if we are using hash2 and haven't done so yet. */
if (use_hash2 && hash2 != hash2_any) {
@@ -1796,16 +1684,13 @@ start_lookup:
goto start_lookup;
}
- /*
- * do the slow work with no lock held
- */
- if (count) {
- flush_stack(stack, count, skb, count - 1);
+ if (first) {
+ if (udp_queue_rcv_skb(first, skb) > 0)
+ consume_skb(skb);
} else {
- if (!inner_flushed)
- UDP_INC_STATS_BH(net, UDP_MIB_IGNOREDMULTI,
- proto == IPPROTO_UDPLITE);
- consume_skb(skb);
+ kfree_skb(skb);
+ __UDP_INC_STATS(net, UDP_MIB_IGNOREDMULTI,
+ proto == IPPROTO_UDPLITE);
}
return 0;
}
@@ -1829,8 +1714,11 @@ static inline int udp4_csum_init(struct sk_buff *skb, struct udphdr *uh,
return err;
}
- return skb_checksum_init_zero_check(skb, proto, uh->check,
- inet_compute_pseudo);
+ /* Note, we are only interested in != 0 or == 0, thus the
+ * force to int.
+ */
+ return (__force int)skb_checksum_init_zero_check(skb, proto, uh->check,
+ inet_compute_pseudo);
}
/*
@@ -1902,7 +1790,6 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
inet_compute_pseudo);
ret = udp_queue_rcv_skb(sk, skb);
- sock_put(sk);
/* a return value > 0 means to resubmit the input, but
* it wants the return to be -protocol, or 0
@@ -1920,7 +1807,7 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
if (udp_lib_checksum_complete(skb))
goto csum_error;
- UDP_INC_STATS_BH(net, UDP_MIB_NOPORTS, proto == IPPROTO_UDPLITE);
+ __UDP_INC_STATS(net, UDP_MIB_NOPORTS, proto == IPPROTO_UDPLITE);
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
/*
@@ -1947,9 +1834,9 @@ csum_error:
proto == IPPROTO_UDPLITE ? "Lite" : "",
&saddr, ntohs(uh->source), &daddr, ntohs(uh->dest),
ulen);
- UDP_INC_STATS_BH(net, UDP_MIB_CSUMERRORS, proto == IPPROTO_UDPLITE);
+ __UDP_INC_STATS(net, UDP_MIB_CSUMERRORS, proto == IPPROTO_UDPLITE);
drop:
- UDP_INC_STATS_BH(net, UDP_MIB_INERRORS, proto == IPPROTO_UDPLITE);
+ __UDP_INC_STATS(net, UDP_MIB_INERRORS, proto == IPPROTO_UDPLITE);
kfree_skb(skb);
return 0;
}
@@ -1963,49 +1850,24 @@ static struct sock *__udp4_lib_mcast_demux_lookup(struct net *net,
int dif)
{
struct sock *sk, *result;
- struct hlist_nulls_node *node;
unsigned short hnum = ntohs(loc_port);
- unsigned int count, slot = udp_hashfn(net, hnum, udp_table.mask);
+ unsigned int slot = udp_hashfn(net, hnum, udp_table.mask);
struct udp_hslot *hslot = &udp_table.hash[slot];
/* Do not bother scanning a too big list */
if (hslot->count > 10)
return NULL;
- rcu_read_lock();
-begin:
- count = 0;
result = NULL;
- sk_nulls_for_each_rcu(sk, node, &hslot->head) {
- if (__udp_is_mcast_sock(net, sk,
- loc_port, loc_addr,
- rmt_port, rmt_addr,
- dif, hnum)) {
+ sk_for_each_rcu(sk, &hslot->head) {
+ if (__udp_is_mcast_sock(net, sk, loc_port, loc_addr,
+ rmt_port, rmt_addr, dif, hnum)) {
+ if (result)
+ return NULL;
result = sk;
- ++count;
}
}
- /*
- * if the nulls value we got at the end of this lookup is
- * not the expected one, we must restart lookup.
- * We probably met an item that was moved to another chain.
- */
- if (get_nulls_value(node) != slot)
- goto begin;
-
- if (result) {
- if (count != 1 ||
- unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2)))
- result = NULL;
- else if (unlikely(!__udp_is_mcast_sock(net, result,
- loc_port, loc_addr,
- rmt_port, rmt_addr,
- dif, hnum))) {
- sock_put(result);
- result = NULL;
- }
- }
- rcu_read_unlock();
+
return result;
}
@@ -2018,37 +1880,22 @@ static struct sock *__udp4_lib_demux_lookup(struct net *net,
__be16 rmt_port, __be32 rmt_addr,
int dif)
{
- struct sock *sk, *result;
- struct hlist_nulls_node *node;
unsigned short hnum = ntohs(loc_port);
unsigned int hash2 = udp4_portaddr_hash(net, loc_addr, hnum);
unsigned int slot2 = hash2 & udp_table.mask;
struct udp_hslot *hslot2 = &udp_table.hash2[slot2];
INET_ADDR_COOKIE(acookie, rmt_addr, loc_addr);
const __portpair ports = INET_COMBINED_PORTS(rmt_port, hnum);
+ struct sock *sk;
- rcu_read_lock();
- result = NULL;
- udp_portaddr_for_each_entry_rcu(sk, node, &hslot2->head) {
- if (INET_MATCH(sk, net, acookie,
- rmt_addr, loc_addr, ports, dif))
- result = sk;
+ udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) {
+ if (INET_MATCH(sk, net, acookie, rmt_addr,
+ loc_addr, ports, dif))
+ return sk;
/* Only check first socket in chain */
break;
}
-
- if (result) {
- if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2)))
- result = NULL;
- else if (unlikely(!INET_MATCH(sk, net, acookie,
- rmt_addr, loc_addr,
- ports, dif))) {
- sock_put(result);
- result = NULL;
- }
- }
- rcu_read_unlock();
- return result;
+ return NULL;
}
void udp_v4_early_demux(struct sk_buff *skb)
@@ -2056,7 +1903,7 @@ void udp_v4_early_demux(struct sk_buff *skb)
struct net *net = dev_net(skb->dev);
const struct iphdr *iph;
const struct udphdr *uh;
- struct sock *sk;
+ struct sock *sk = NULL;
struct dst_entry *dst;
int dif = skb->dev->ifindex;
int ours;
@@ -2088,11 +1935,9 @@ void udp_v4_early_demux(struct sk_buff *skb)
} else if (skb->pkt_type == PACKET_HOST) {
sk = __udp4_lib_demux_lookup(net, uh->dest, iph->daddr,
uh->source, iph->saddr, dif);
- } else {
- return;
}
- if (!sk)
+ if (!sk || !atomic_inc_not_zero_hint(&sk->sk_refcnt, 2))
return;
skb->sk = sk;
@@ -2392,14 +2237,13 @@ static struct sock *udp_get_first(struct seq_file *seq, int start)
for (state->bucket = start; state->bucket <= state->udp_table->mask;
++state->bucket) {
- struct hlist_nulls_node *node;
struct udp_hslot *hslot = &state->udp_table->hash[state->bucket];
- if (hlist_nulls_empty(&hslot->head))
+ if (hlist_empty(&hslot->head))
continue;
spin_lock_bh(&hslot->lock);
- sk_nulls_for_each(sk, node, &hslot->head) {
+ sk_for_each(sk, &hslot->head) {
if (!net_eq(sock_net(sk), net))
continue;
if (sk->sk_family == state->family)
@@ -2418,7 +2262,7 @@ static struct sock *udp_get_next(struct seq_file *seq, struct sock *sk)
struct net *net = seq_file_net(seq);
do {
- sk = sk_nulls_next(sk);
+ sk = sk_next(sk);
} while (sk && (!net_eq(sock_net(sk), net) || sk->sk_family != state->family));
if (!sk) {
@@ -2627,12 +2471,12 @@ void __init udp_table_init(struct udp_table *table, const char *name)
table->hash2 = table->hash + (table->mask + 1);
for (i = 0; i <= table->mask; i++) {
- INIT_HLIST_NULLS_HEAD(&table->hash[i].head, i);
+ INIT_HLIST_HEAD(&table->hash[i].head);
table->hash[i].count = 0;
spin_lock_init(&table->hash[i].lock);
}
for (i = 0; i <= table->mask; i++) {
- INIT_HLIST_NULLS_HEAD(&table->hash2[i].head, i);
+ INIT_HLIST_HEAD(&table->hash2[i].head);
table->hash2[i].count = 0;
spin_lock_init(&table->hash2[i].lock);
}
diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c
index df1966f3b..3d5ccf4b1 100644
--- a/net/ipv4/udp_diag.c
+++ b/net/ipv4/udp_diag.c
@@ -36,10 +36,11 @@ static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb,
const struct inet_diag_req_v2 *req)
{
int err = -EINVAL;
- struct sock *sk;
+ struct sock *sk = NULL;
struct sk_buff *rep;
struct net *net = sock_net(in_skb->sk);
+ rcu_read_lock();
if (req->sdiag_family == AF_INET)
sk = __udp4_lib_lookup(net,
req->id.idiag_src[0], req->id.idiag_sport,
@@ -54,9 +55,9 @@ static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb,
req->id.idiag_dport,
req->id.idiag_if, tbl, NULL);
#endif
- else
- goto out_nosk;
-
+ if (sk && !atomic_inc_not_zero(&sk->sk_refcnt))
+ sk = NULL;
+ rcu_read_unlock();
err = -ENOENT;
if (!sk)
goto out_nosk;
@@ -96,24 +97,23 @@ static void udp_dump(struct udp_table *table, struct sk_buff *skb,
struct netlink_callback *cb,
const struct inet_diag_req_v2 *r, struct nlattr *bc)
{
- int num, s_num, slot, s_slot;
struct net *net = sock_net(skb->sk);
+ int num, s_num, slot, s_slot;
s_slot = cb->args[0];
num = s_num = cb->args[1];
for (slot = s_slot; slot <= table->mask; s_num = 0, slot++) {
- struct sock *sk;
- struct hlist_nulls_node *node;
struct udp_hslot *hslot = &table->hash[slot];
+ struct sock *sk;
num = 0;
- if (hlist_nulls_empty(&hslot->head))
+ if (hlist_empty(&hslot->head))
continue;
spin_lock_bh(&hslot->lock);
- sk_nulls_for_each(sk, node, &hslot->head) {
+ sk_for_each(sk, &hslot->head) {
struct inet_sock *inet = inet_sk(sk);
if (!net_eq(sock_net(sk), net))
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index e330c0e56..81f253b6f 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -14,18 +14,6 @@
#include <net/udp.h>
#include <net/protocol.h>
-static DEFINE_SPINLOCK(udp_offload_lock);
-static struct udp_offload_priv __rcu *udp_offload_base __read_mostly;
-
-#define udp_deref_protected(X) rcu_dereference_protected(X, lockdep_is_held(&udp_offload_lock))
-
-struct udp_offload_priv {
- struct udp_offload *offload;
- possible_net_t net;
- struct rcu_head rcu;
- struct udp_offload_priv __rcu *next;
-};
-
static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
netdev_features_t features,
struct sk_buff *(*gso_inner_segment)(struct sk_buff *skb,
@@ -51,8 +39,11 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
* 16 bit length field due to the header being added outside of an
* IP or IPv6 frame that was already limited to 64K - 1.
*/
- partial = csum_sub(csum_unfold(uh->check),
- (__force __wsum)htonl(skb->len));
+ if (skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL)
+ partial = (__force __wsum)uh->len;
+ else
+ partial = (__force __wsum)htonl(skb->len);
+ partial = csum_sub(csum_unfold(uh->check), partial);
/* setup inner skb. */
skb->encapsulation = 0;
@@ -101,7 +92,7 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
udp_offset = outer_hlen - tnl_hlen;
skb = segs;
do {
- __be16 len;
+ unsigned int len;
if (remcsum)
skb->ip_summed = CHECKSUM_NONE;
@@ -119,14 +110,26 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
skb_reset_mac_header(skb);
skb_set_network_header(skb, mac_len);
skb_set_transport_header(skb, udp_offset);
- len = htons(skb->len - udp_offset);
+ len = skb->len - udp_offset;
uh = udp_hdr(skb);
- uh->len = len;
+
+ /* If we are only performing partial GSO the inner header
+ * will be using a length value equal to only one MSS sized
+ * segment instead of the entire frame.
+ */
+ if (skb_is_gso(skb)) {
+ uh->len = htons(skb_shinfo(skb)->gso_size +
+ SKB_GSO_CB(skb)->data_offset +
+ skb->head - (unsigned char *)uh);
+ } else {
+ uh->len = htons(len);
+ }
if (!need_csum)
continue;
- uh->check = ~csum_fold(csum_add(partial, (__force __wsum)len));
+ uh->check = ~csum_fold(csum_add(partial,
+ (__force __wsum)htonl(len)));
if (skb->encapsulation || !offload_csum) {
uh->check = gso_make_checksum(skb, ~uh->check);
@@ -179,6 +182,7 @@ out_unlock:
return segs;
}
+EXPORT_SYMBOL(skb_udp_tunnel_segment);
static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb,
netdev_features_t features)
@@ -205,16 +209,6 @@ static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb,
if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) {
/* Packet is from an untrusted source, reset gso_segs. */
- int type = skb_shinfo(skb)->gso_type;
-
- if (unlikely(type & ~(SKB_GSO_UDP | SKB_GSO_DODGY |
- SKB_GSO_UDP_TUNNEL |
- SKB_GSO_UDP_TUNNEL_CSUM |
- SKB_GSO_TUNNEL_REMCSUM |
- SKB_GSO_IPIP |
- SKB_GSO_GRE | SKB_GSO_GRE_CSUM) ||
- !(type & (SKB_GSO_UDP))))
- goto out;
skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss);
@@ -253,64 +247,14 @@ out:
return segs;
}
-int udp_add_offload(struct net *net, struct udp_offload *uo)
-{
- struct udp_offload_priv *new_offload = kzalloc(sizeof(*new_offload), GFP_ATOMIC);
-
- if (!new_offload)
- return -ENOMEM;
-
- write_pnet(&new_offload->net, net);
- new_offload->offload = uo;
-
- spin_lock(&udp_offload_lock);
- new_offload->next = udp_offload_base;
- rcu_assign_pointer(udp_offload_base, new_offload);
- spin_unlock(&udp_offload_lock);
-
- return 0;
-}
-EXPORT_SYMBOL(udp_add_offload);
-
-static void udp_offload_free_routine(struct rcu_head *head)
-{
- struct udp_offload_priv *ou_priv = container_of(head, struct udp_offload_priv, rcu);
- kfree(ou_priv);
-}
-
-void udp_del_offload(struct udp_offload *uo)
-{
- struct udp_offload_priv __rcu **head = &udp_offload_base;
- struct udp_offload_priv *uo_priv;
-
- spin_lock(&udp_offload_lock);
-
- uo_priv = udp_deref_protected(*head);
- for (; uo_priv != NULL;
- uo_priv = udp_deref_protected(*head)) {
- if (uo_priv->offload == uo) {
- rcu_assign_pointer(*head,
- udp_deref_protected(uo_priv->next));
- goto unlock;
- }
- head = &uo_priv->next;
- }
- pr_warn("udp_del_offload: didn't find offload for port %d\n", ntohs(uo->port));
-unlock:
- spin_unlock(&udp_offload_lock);
- if (uo_priv)
- call_rcu(&uo_priv->rcu, udp_offload_free_routine);
-}
-EXPORT_SYMBOL(udp_del_offload);
-
struct sk_buff **udp_gro_receive(struct sk_buff **head, struct sk_buff *skb,
- struct udphdr *uh)
+ struct udphdr *uh, udp_lookup_t lookup)
{
- struct udp_offload_priv *uo_priv;
struct sk_buff *p, **pp = NULL;
struct udphdr *uh2;
unsigned int off = skb_gro_offset(skb);
int flush = 1;
+ struct sock *sk;
if (NAPI_GRO_CB(skb)->encap_mark ||
(skb->ip_summed != CHECKSUM_PARTIAL &&
@@ -322,13 +266,10 @@ struct sk_buff **udp_gro_receive(struct sk_buff **head, struct sk_buff *skb,
NAPI_GRO_CB(skb)->encap_mark = 1;
rcu_read_lock();
- uo_priv = rcu_dereference(udp_offload_base);
- for (; uo_priv != NULL; uo_priv = rcu_dereference(uo_priv->next)) {
- if (net_eq(read_pnet(&uo_priv->net), dev_net(skb->dev)) &&
- uo_priv->offload->port == uh->dest &&
- uo_priv->offload->callbacks.gro_receive)
- goto unflush;
- }
+ sk = (*lookup)(skb, uh->source, uh->dest);
+
+ if (sk && udp_sk(sk)->gro_receive)
+ goto unflush;
goto out_unlock;
unflush:
@@ -352,9 +293,7 @@ unflush:
skb_gro_pull(skb, sizeof(struct udphdr)); /* pull encapsulating udp header */
skb_gro_postpull_rcsum(skb, uh, sizeof(struct udphdr));
- NAPI_GRO_CB(skb)->proto = uo_priv->offload->ipproto;
- pp = uo_priv->offload->callbacks.gro_receive(head, skb,
- uo_priv->offload);
+ pp = udp_sk(sk)->gro_receive(sk, head, skb);
out_unlock:
rcu_read_unlock();
@@ -362,6 +301,7 @@ out:
NAPI_GRO_CB(skb)->flush |= flush;
return pp;
}
+EXPORT_SYMBOL(udp_gro_receive);
static struct sk_buff **udp4_gro_receive(struct sk_buff **head,
struct sk_buff *skb)
@@ -383,19 +323,20 @@ static struct sk_buff **udp4_gro_receive(struct sk_buff **head,
inet_gro_compute_pseudo);
skip:
NAPI_GRO_CB(skb)->is_ipv6 = 0;
- return udp_gro_receive(head, skb, uh);
+ return udp_gro_receive(head, skb, uh, udp4_lib_lookup_skb);
flush:
NAPI_GRO_CB(skb)->flush = 1;
return NULL;
}
-int udp_gro_complete(struct sk_buff *skb, int nhoff)
+int udp_gro_complete(struct sk_buff *skb, int nhoff,
+ udp_lookup_t lookup)
{
- struct udp_offload_priv *uo_priv;
__be16 newlen = htons(skb->len - nhoff);
struct udphdr *uh = (struct udphdr *)(skb->data + nhoff);
int err = -ENOSYS;
+ struct sock *sk;
uh->len = newlen;
@@ -405,22 +346,10 @@ int udp_gro_complete(struct sk_buff *skb, int nhoff)
skb->encapsulation = 1;
rcu_read_lock();
-
- uo_priv = rcu_dereference(udp_offload_base);
- for (; uo_priv != NULL; uo_priv = rcu_dereference(uo_priv->next)) {
- if (net_eq(read_pnet(&uo_priv->net), dev_net(skb->dev)) &&
- uo_priv->offload->port == uh->dest &&
- uo_priv->offload->callbacks.gro_complete)
- break;
- }
-
- if (uo_priv) {
- NAPI_GRO_CB(skb)->proto = uo_priv->offload->ipproto;
- err = uo_priv->offload->callbacks.gro_complete(skb,
- nhoff + sizeof(struct udphdr),
- uo_priv->offload);
- }
-
+ sk = (*lookup)(skb, uh->source, uh->dest);
+ if (sk && udp_sk(sk)->gro_complete)
+ err = udp_sk(sk)->gro_complete(sk, skb,
+ nhoff + sizeof(struct udphdr));
rcu_read_unlock();
if (skb->remcsum_offload)
@@ -428,6 +357,7 @@ int udp_gro_complete(struct sk_buff *skb, int nhoff)
return err;
}
+EXPORT_SYMBOL(udp_gro_complete);
static int udp4_gro_complete(struct sk_buff *skb, int nhoff)
{
@@ -442,7 +372,7 @@ static int udp4_gro_complete(struct sk_buff *skb, int nhoff)
skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL;
}
- return udp_gro_complete(skb, nhoff);
+ return udp_gro_complete(skb, nhoff, udp4_lib_lookup_skb);
}
static const struct net_offload udpv4_offload = {
diff --git a/net/ipv4/udp_tunnel.c b/net/ipv4/udp_tunnel.c
index 96599d1a1..47f12c73d 100644
--- a/net/ipv4/udp_tunnel.c
+++ b/net/ipv4/udp_tunnel.c
@@ -69,6 +69,8 @@ void setup_udp_tunnel_sock(struct net *net, struct socket *sock,
udp_sk(sk)->encap_type = cfg->encap_type;
udp_sk(sk)->encap_rcv = cfg->encap_rcv;
udp_sk(sk)->encap_destroy = cfg->encap_destroy;
+ udp_sk(sk)->gro_receive = cfg->gro_receive;
+ udp_sk(sk)->gro_complete = cfg->gro_complete;
udp_tunnel_encap_enable(sock);
}
diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig
index 11e875ffd..2343e4f2e 100644
--- a/net/ipv6/Kconfig
+++ b/net/ipv6/Kconfig
@@ -218,6 +218,7 @@ config IPV6_GRE
tristate "IPv6: GRE tunnel"
select IPV6_TUNNEL
select NET_IP_TUNNEL
+ depends on NET_IPGRE_DEMUX
---help---
Tunneling means encapsulating data of one protocol type within
another protocol and sending it over a channel that understands the
@@ -231,6 +232,15 @@ config IPV6_GRE
Saying M here will produce a module called ip6_gre. If unsure, say N.
+config IPV6_FOU
+ tristate
+ default NET_FOU && IPV6
+
+config IPV6_FOU_TUNNEL
+ tristate
+ default NET_FOU_IP_TUNNELS && IPV6_FOU
+ select IPV6_TUNNEL
+
config IPV6_MULTIPLE_TABLES
bool "IPv6: Multiple Routing Tables"
select FIB_RULES
diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile
index 2fbd90bf8..6d8ea0992 100644
--- a/net/ipv6/Makefile
+++ b/net/ipv6/Makefile
@@ -8,9 +8,10 @@ ipv6-objs := af_inet6.o anycast.o ip6_output.o ip6_input.o addrconf.o \
addrlabel.o \
route.o ip6_fib.o ipv6_sockglue.o ndisc.o udp.o udplite.o \
raw.o icmp.o mcast.o reassembly.o tcp_ipv6.o ping.o \
- exthdrs.o datagram.o ip6_flowlabel.o inet6_connection_sock.o
+ exthdrs.o datagram.o ip6_flowlabel.o inet6_connection_sock.o \
+ udp_offload.o
-ipv6-offload := ip6_offload.o tcpv6_offload.o udp_offload.o exthdrs_offload.o
+ipv6-offload := ip6_offload.o tcpv6_offload.o exthdrs_offload.o
ipv6-$(CONFIG_SYSCTL) = sysctl_net_ipv6.o
ipv6-$(CONFIG_IPV6_MROUTE) += ip6mr.o
@@ -41,6 +42,7 @@ obj-$(CONFIG_IPV6_VTI) += ip6_vti.o
obj-$(CONFIG_IPV6_SIT) += sit.o
obj-$(CONFIG_IPV6_TUNNEL) += ip6_tunnel.o
obj-$(CONFIG_IPV6_GRE) += ip6_gre.o
+obj-$(CONFIG_IPV6_FOU) += fou6.o
obj-y += addrconf_core.o exthdrs_core.o ip6_checksum.o ip6_icmp.o
obj-$(CONFIG_INET) += output_core.o protocol.o $(ipv6-offload)
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 8ec4b3089..047c75a79 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -359,7 +359,6 @@ static struct inet6_dev *ipv6_add_dev(struct net_device *dev)
ndev->addr_gen_mode = IN6_ADDR_GEN_MODE_EUI64;
ndev->cnf.mtu6 = dev->mtu;
- ndev->cnf.sysctl = NULL;
ndev->nd_parms = neigh_parms_alloc(dev, &nd_tbl);
if (!ndev->nd_parms) {
kfree(ndev);
@@ -3563,6 +3562,10 @@ restart:
if (state != INET6_IFADDR_STATE_DEAD) {
__ipv6_ifa_notify(RTM_DELADDR, ifa);
inet6addr_notifier_call_chain(NETDEV_DOWN, ifa);
+ } else {
+ if (idev->cnf.forwarding)
+ addrconf_leave_anycast(ifa);
+ addrconf_leave_solict(ifa->idev, &ifa->addr);
}
write_lock_bh(&idev->lock);
@@ -4995,15 +4998,13 @@ static int inet6_set_iftoken(struct inet6_dev *idev, struct in6_addr *token)
{
struct inet6_ifaddr *ifp;
struct net_device *dev = idev->dev;
- bool update_rs = false;
+ bool clear_token, update_rs = false;
struct in6_addr ll_addr;
ASSERT_RTNL();
if (!token)
return -EINVAL;
- if (ipv6_addr_any(token))
- return -EINVAL;
if (dev->flags & (IFF_LOOPBACK | IFF_NOARP))
return -EINVAL;
if (!ipv6_accept_ra(idev))
@@ -5018,10 +5019,13 @@ static int inet6_set_iftoken(struct inet6_dev *idev, struct in6_addr *token)
write_unlock_bh(&idev->lock);
+ clear_token = ipv6_addr_any(token);
+ if (clear_token)
+ goto update_lft;
+
if (!idev->dead && (idev->if_flags & IF_READY) &&
!ipv6_get_lladdr(dev, &ll_addr, IFA_F_TENTATIVE |
IFA_F_OPTIMISTIC)) {
-
/* If we're not ready, then normal ifup will take care
* of this. Otherwise, we need to request our rs here.
*/
@@ -5029,6 +5033,7 @@ static int inet6_set_iftoken(struct inet6_dev *idev, struct in6_addr *token)
update_rs = true;
}
+update_lft:
write_lock_bh(&idev->lock);
if (update_rs) {
@@ -5618,376 +5623,366 @@ int addrconf_sysctl_ignore_routes_with_linkdown(struct ctl_table *ctl,
return ret;
}
-static struct addrconf_sysctl_table
-{
- struct ctl_table_header *sysctl_header;
- struct ctl_table addrconf_vars[DEVCONF_MAX+1];
-} addrconf_sysctl __read_mostly = {
- .sysctl_header = NULL,
- .addrconf_vars = {
- {
- .procname = "forwarding",
- .data = &ipv6_devconf.forwarding,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = addrconf_sysctl_forward,
- },
- {
- .procname = "hop_limit",
- .data = &ipv6_devconf.hop_limit,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = addrconf_sysctl_hop_limit,
- },
- {
- .procname = "mtu",
- .data = &ipv6_devconf.mtu6,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = addrconf_sysctl_mtu,
- },
- {
- .procname = "accept_ra",
- .data = &ipv6_devconf.accept_ra,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
- .procname = "accept_redirects",
- .data = &ipv6_devconf.accept_redirects,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
- .procname = "autoconf",
- .data = &ipv6_devconf.autoconf,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
- .procname = "dad_transmits",
- .data = &ipv6_devconf.dad_transmits,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
- .procname = "router_solicitations",
- .data = &ipv6_devconf.rtr_solicits,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
- .procname = "router_solicitation_interval",
- .data = &ipv6_devconf.rtr_solicit_interval,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_jiffies,
- },
- {
- .procname = "router_solicitation_delay",
- .data = &ipv6_devconf.rtr_solicit_delay,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_jiffies,
- },
- {
- .procname = "force_mld_version",
- .data = &ipv6_devconf.force_mld_version,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
- .procname = "mldv1_unsolicited_report_interval",
- .data =
- &ipv6_devconf.mldv1_unsolicited_report_interval,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_ms_jiffies,
- },
- {
- .procname = "mldv2_unsolicited_report_interval",
- .data =
- &ipv6_devconf.mldv2_unsolicited_report_interval,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_ms_jiffies,
- },
- {
- .procname = "use_tempaddr",
- .data = &ipv6_devconf.use_tempaddr,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
- .procname = "temp_valid_lft",
- .data = &ipv6_devconf.temp_valid_lft,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
- .procname = "temp_prefered_lft",
- .data = &ipv6_devconf.temp_prefered_lft,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
- .procname = "regen_max_retry",
- .data = &ipv6_devconf.regen_max_retry,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
- .procname = "max_desync_factor",
- .data = &ipv6_devconf.max_desync_factor,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
- .procname = "max_addresses",
- .data = &ipv6_devconf.max_addresses,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
- .procname = "accept_ra_defrtr",
- .data = &ipv6_devconf.accept_ra_defrtr,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
- .procname = "accept_ra_min_hop_limit",
- .data = &ipv6_devconf.accept_ra_min_hop_limit,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
- .procname = "accept_ra_pinfo",
- .data = &ipv6_devconf.accept_ra_pinfo,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
+static const struct ctl_table addrconf_sysctl[] = {
+ {
+ .procname = "forwarding",
+ .data = &ipv6_devconf.forwarding,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = addrconf_sysctl_forward,
+ },
+ {
+ .procname = "hop_limit",
+ .data = &ipv6_devconf.hop_limit,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = addrconf_sysctl_hop_limit,
+ },
+ {
+ .procname = "mtu",
+ .data = &ipv6_devconf.mtu6,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = addrconf_sysctl_mtu,
+ },
+ {
+ .procname = "accept_ra",
+ .data = &ipv6_devconf.accept_ra,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "accept_redirects",
+ .data = &ipv6_devconf.accept_redirects,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "autoconf",
+ .data = &ipv6_devconf.autoconf,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "dad_transmits",
+ .data = &ipv6_devconf.dad_transmits,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "router_solicitations",
+ .data = &ipv6_devconf.rtr_solicits,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "router_solicitation_interval",
+ .data = &ipv6_devconf.rtr_solicit_interval,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "router_solicitation_delay",
+ .data = &ipv6_devconf.rtr_solicit_delay,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "force_mld_version",
+ .data = &ipv6_devconf.force_mld_version,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "mldv1_unsolicited_report_interval",
+ .data =
+ &ipv6_devconf.mldv1_unsolicited_report_interval,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_ms_jiffies,
+ },
+ {
+ .procname = "mldv2_unsolicited_report_interval",
+ .data =
+ &ipv6_devconf.mldv2_unsolicited_report_interval,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_ms_jiffies,
+ },
+ {
+ .procname = "use_tempaddr",
+ .data = &ipv6_devconf.use_tempaddr,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "temp_valid_lft",
+ .data = &ipv6_devconf.temp_valid_lft,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "temp_prefered_lft",
+ .data = &ipv6_devconf.temp_prefered_lft,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "regen_max_retry",
+ .data = &ipv6_devconf.regen_max_retry,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "max_desync_factor",
+ .data = &ipv6_devconf.max_desync_factor,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "max_addresses",
+ .data = &ipv6_devconf.max_addresses,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "accept_ra_defrtr",
+ .data = &ipv6_devconf.accept_ra_defrtr,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "accept_ra_min_hop_limit",
+ .data = &ipv6_devconf.accept_ra_min_hop_limit,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "accept_ra_pinfo",
+ .data = &ipv6_devconf.accept_ra_pinfo,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
#ifdef CONFIG_IPV6_ROUTER_PREF
- {
- .procname = "accept_ra_rtr_pref",
- .data = &ipv6_devconf.accept_ra_rtr_pref,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
- .procname = "router_probe_interval",
- .data = &ipv6_devconf.rtr_probe_interval,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_jiffies,
- },
+ {
+ .procname = "accept_ra_rtr_pref",
+ .data = &ipv6_devconf.accept_ra_rtr_pref,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "router_probe_interval",
+ .data = &ipv6_devconf.rtr_probe_interval,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
#ifdef CONFIG_IPV6_ROUTE_INFO
- {
- .procname = "accept_ra_rt_info_max_plen",
- .data = &ipv6_devconf.accept_ra_rt_info_max_plen,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
+ {
+ .procname = "accept_ra_rt_info_max_plen",
+ .data = &ipv6_devconf.accept_ra_rt_info_max_plen,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
#endif
#endif
- {
- .procname = "proxy_ndp",
- .data = &ipv6_devconf.proxy_ndp,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = addrconf_sysctl_proxy_ndp,
- },
- {
- .procname = "accept_source_route",
- .data = &ipv6_devconf.accept_source_route,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
+ {
+ .procname = "proxy_ndp",
+ .data = &ipv6_devconf.proxy_ndp,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = addrconf_sysctl_proxy_ndp,
+ },
+ {
+ .procname = "accept_source_route",
+ .data = &ipv6_devconf.accept_source_route,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
- {
- .procname = "optimistic_dad",
- .data = &ipv6_devconf.optimistic_dad,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
-
- },
- {
- .procname = "use_optimistic",
- .data = &ipv6_devconf.use_optimistic,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
-
- },
+ {
+ .procname = "optimistic_dad",
+ .data = &ipv6_devconf.optimistic_dad,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "use_optimistic",
+ .data = &ipv6_devconf.use_optimistic,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
#endif
#ifdef CONFIG_IPV6_MROUTE
- {
- .procname = "mc_forwarding",
- .data = &ipv6_devconf.mc_forwarding,
- .maxlen = sizeof(int),
- .mode = 0444,
- .proc_handler = proc_dointvec,
- },
+ {
+ .procname = "mc_forwarding",
+ .data = &ipv6_devconf.mc_forwarding,
+ .maxlen = sizeof(int),
+ .mode = 0444,
+ .proc_handler = proc_dointvec,
+ },
#endif
- {
- .procname = "disable_ipv6",
- .data = &ipv6_devconf.disable_ipv6,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = addrconf_sysctl_disable,
- },
- {
- .procname = "accept_dad",
- .data = &ipv6_devconf.accept_dad,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
- .procname = "force_tllao",
- .data = &ipv6_devconf.force_tllao,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec
- },
- {
- .procname = "ndisc_notify",
- .data = &ipv6_devconf.ndisc_notify,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec
- },
- {
- .procname = "suppress_frag_ndisc",
- .data = &ipv6_devconf.suppress_frag_ndisc,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec
- },
- {
- .procname = "accept_ra_from_local",
- .data = &ipv6_devconf.accept_ra_from_local,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
- .procname = "accept_ra_mtu",
- .data = &ipv6_devconf.accept_ra_mtu,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
- .procname = "stable_secret",
- .data = &ipv6_devconf.stable_secret,
- .maxlen = IPV6_MAX_STRLEN,
- .mode = 0600,
- .proc_handler = addrconf_sysctl_stable_secret,
- },
- {
- .procname = "use_oif_addrs_only",
- .data = &ipv6_devconf.use_oif_addrs_only,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
- .procname = "ignore_routes_with_linkdown",
- .data = &ipv6_devconf.ignore_routes_with_linkdown,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = addrconf_sysctl_ignore_routes_with_linkdown,
- },
- {
- .procname = "drop_unicast_in_l2_multicast",
- .data = &ipv6_devconf.drop_unicast_in_l2_multicast,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
- .procname = "drop_unsolicited_na",
- .data = &ipv6_devconf.drop_unsolicited_na,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
- .procname = "keep_addr_on_down",
- .data = &ipv6_devconf.keep_addr_on_down,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
-
- },
- {
- /* sentinel */
- }
+ {
+ .procname = "disable_ipv6",
+ .data = &ipv6_devconf.disable_ipv6,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = addrconf_sysctl_disable,
+ },
+ {
+ .procname = "accept_dad",
+ .data = &ipv6_devconf.accept_dad,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "force_tllao",
+ .data = &ipv6_devconf.force_tllao,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "ndisc_notify",
+ .data = &ipv6_devconf.ndisc_notify,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
},
+ {
+ .procname = "suppress_frag_ndisc",
+ .data = &ipv6_devconf.suppress_frag_ndisc,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "accept_ra_from_local",
+ .data = &ipv6_devconf.accept_ra_from_local,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "accept_ra_mtu",
+ .data = &ipv6_devconf.accept_ra_mtu,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "stable_secret",
+ .data = &ipv6_devconf.stable_secret,
+ .maxlen = IPV6_MAX_STRLEN,
+ .mode = 0600,
+ .proc_handler = addrconf_sysctl_stable_secret,
+ },
+ {
+ .procname = "use_oif_addrs_only",
+ .data = &ipv6_devconf.use_oif_addrs_only,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "ignore_routes_with_linkdown",
+ .data = &ipv6_devconf.ignore_routes_with_linkdown,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = addrconf_sysctl_ignore_routes_with_linkdown,
+ },
+ {
+ .procname = "drop_unicast_in_l2_multicast",
+ .data = &ipv6_devconf.drop_unicast_in_l2_multicast,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "drop_unsolicited_na",
+ .data = &ipv6_devconf.drop_unsolicited_na,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "keep_addr_on_down",
+ .data = &ipv6_devconf.keep_addr_on_down,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+
+ },
+ {
+ /* sentinel */
+ }
};
static int __addrconf_sysctl_register(struct net *net, char *dev_name,
struct inet6_dev *idev, struct ipv6_devconf *p)
{
int i;
- struct addrconf_sysctl_table *t;
+ struct ctl_table *table;
char path[sizeof("net/ipv6/conf/") + IFNAMSIZ];
- t = kmemdup(&addrconf_sysctl, sizeof(*t), GFP_KERNEL);
- if (!t)
+ table = kmemdup(addrconf_sysctl, sizeof(addrconf_sysctl), GFP_KERNEL);
+ if (!table)
goto out;
- for (i = 0; t->addrconf_vars[i].data; i++) {
- t->addrconf_vars[i].data += (char *)p - (char *)&ipv6_devconf;
- t->addrconf_vars[i].extra1 = idev; /* embedded; no ref */
- t->addrconf_vars[i].extra2 = net;
+ for (i = 0; table[i].data; i++) {
+ table[i].data += (char *)p - (char *)&ipv6_devconf;
+ table[i].extra1 = idev; /* embedded; no ref */
+ table[i].extra2 = net;
}
snprintf(path, sizeof(path), "net/ipv6/conf/%s", dev_name);
- t->sysctl_header = register_net_sysctl(net, path, t->addrconf_vars);
- if (!t->sysctl_header)
+ p->sysctl_header = register_net_sysctl(net, path, table);
+ if (!p->sysctl_header)
goto free;
- p->sysctl = t;
return 0;
free:
- kfree(t);
+ kfree(table);
out:
return -ENOBUFS;
}
static void __addrconf_sysctl_unregister(struct ipv6_devconf *p)
{
- struct addrconf_sysctl_table *t;
+ struct ctl_table *table;
- if (!p->sysctl)
+ if (!p->sysctl_header)
return;
- t = p->sysctl;
- p->sysctl = NULL;
- unregister_net_sysctl_table(t->sysctl_header);
- kfree(t);
+ table = p->sysctl_header->ctl_table_arg;
+ unregister_net_sysctl_table(p->sysctl_header);
+ p->sysctl_header = NULL;
+ kfree(table);
}
static int addrconf_sysctl_register(struct inet6_dev *idev)
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index b11c37cfd..bfa86f040 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -64,6 +64,8 @@
#include <asm/uaccess.h>
#include <linux/mroute6.h>
+#include "ip6_offload.h"
+
MODULE_AUTHOR("Cast of dozens");
MODULE_DESCRIPTION("IPv6 protocol stack for Linux");
MODULE_LICENSE("GPL");
@@ -561,6 +563,7 @@ const struct proto_ops inet6_dgram_ops = {
.recvmsg = inet_recvmsg, /* ok */
.mmap = sock_no_mmap,
.sendpage = sock_no_sendpage,
+ .set_peek_off = sk_set_peek_off,
#ifdef CONFIG_COMPAT
.compat_setsockopt = compat_sock_common_setsockopt,
.compat_getsockopt = compat_sock_common_getsockopt,
@@ -958,6 +961,10 @@ static int __init inet6_init(void)
if (err)
goto udplitev6_fail;
+ err = udpv6_offload_init();
+ if (err)
+ goto udpv6_offload_fail;
+
err = tcpv6_init();
if (err)
goto tcpv6_fail;
@@ -987,6 +994,8 @@ pingv6_fail:
ipv6_packet_fail:
tcpv6_exit();
tcpv6_fail:
+ udpv6_offload_exit();
+udpv6_offload_fail:
udplitev6_exit();
udplitev6_fail:
udpv6_exit();
diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index 9dd3882fe..37874e2f3 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -450,9 +450,10 @@ int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len)
copied = len;
}
err = skb_copy_datagram_msg(skb, 0, msg, copied);
- if (err)
- goto out_free_skb;
-
+ if (unlikely(err)) {
+ kfree_skb(skb);
+ return err;
+ }
sock_recv_timestamp(msg, sk, skb);
serr = SKB_EXT_ERR(skb);
@@ -509,8 +510,7 @@ int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len)
msg->msg_flags |= MSG_ERRQUEUE;
err = copied;
-out_free_skb:
- kfree_skb(skb);
+ consume_skb(skb);
out:
return err;
}
@@ -727,13 +727,13 @@ EXPORT_SYMBOL_GPL(ip6_datagram_recv_ctl);
int ip6_datagram_send_ctl(struct net *net, struct sock *sk,
struct msghdr *msg, struct flowi6 *fl6,
- struct ipv6_txoptions *opt,
- int *hlimit, int *tclass, int *dontfrag)
+ struct ipcm6_cookie *ipc6, struct sockcm_cookie *sockc)
{
struct in6_pktinfo *src_info;
struct cmsghdr *cmsg;
struct ipv6_rt_hdr *rthdr;
struct ipv6_opt_hdr *hdr;
+ struct ipv6_txoptions *opt = ipc6->opt;
int len;
int err = 0;
@@ -745,6 +745,13 @@ int ip6_datagram_send_ctl(struct net *net, struct sock *sk,
goto exit_f;
}
+ if (cmsg->cmsg_level == SOL_SOCKET) {
+ err = __sock_cmsg_send(sk, msg, cmsg, sockc);
+ if (err)
+ return err;
+ continue;
+ }
+
if (cmsg->cmsg_level != SOL_IPV6)
continue;
@@ -946,8 +953,8 @@ int ip6_datagram_send_ctl(struct net *net, struct sock *sk,
goto exit_f;
}
- *hlimit = *(int *)CMSG_DATA(cmsg);
- if (*hlimit < -1 || *hlimit > 0xff) {
+ ipc6->hlimit = *(int *)CMSG_DATA(cmsg);
+ if (ipc6->hlimit < -1 || ipc6->hlimit > 0xff) {
err = -EINVAL;
goto exit_f;
}
@@ -967,7 +974,7 @@ int ip6_datagram_send_ctl(struct net *net, struct sock *sk,
goto exit_f;
err = 0;
- *tclass = tc;
+ ipc6->tclass = tc;
break;
}
@@ -985,7 +992,7 @@ int ip6_datagram_send_ctl(struct net *net, struct sock *sk,
goto exit_f;
err = 0;
- *dontfrag = df;
+ ipc6->dontfrag = df;
break;
}
diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c
index ea7c4d64a..8de5dd7aa 100644
--- a/net/ipv6/exthdrs.c
+++ b/net/ipv6/exthdrs.c
@@ -258,8 +258,8 @@ static int ipv6_destopt_rcv(struct sk_buff *skb)
if (!pskb_may_pull(skb, skb_transport_offset(skb) + 8) ||
!pskb_may_pull(skb, (skb_transport_offset(skb) +
((skb_transport_header(skb)[1] + 1) << 3)))) {
- IP6_INC_STATS_BH(dev_net(dst->dev), ip6_dst_idev(dst),
- IPSTATS_MIB_INHDRERRORS);
+ __IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
+ IPSTATS_MIB_INHDRERRORS);
kfree_skb(skb);
return -1;
}
@@ -280,8 +280,8 @@ static int ipv6_destopt_rcv(struct sk_buff *skb)
return 1;
}
- IP6_INC_STATS_BH(dev_net(dst->dev),
- ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);
+ __IP6_INC_STATS(dev_net(dst->dev),
+ ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);
return -1;
}
@@ -309,8 +309,8 @@ static int ipv6_rthdr_rcv(struct sk_buff *skb)
if (!pskb_may_pull(skb, skb_transport_offset(skb) + 8) ||
!pskb_may_pull(skb, (skb_transport_offset(skb) +
((skb_transport_header(skb)[1] + 1) << 3)))) {
- IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)),
- IPSTATS_MIB_INHDRERRORS);
+ __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
+ IPSTATS_MIB_INHDRERRORS);
kfree_skb(skb);
return -1;
}
@@ -319,8 +319,8 @@ static int ipv6_rthdr_rcv(struct sk_buff *skb)
if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr) ||
skb->pkt_type != PACKET_HOST) {
- IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)),
- IPSTATS_MIB_INADDRERRORS);
+ __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
+ IPSTATS_MIB_INADDRERRORS);
kfree_skb(skb);
return -1;
}
@@ -334,8 +334,8 @@ looped_back:
* processed by own
*/
if (!addr) {
- IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)),
- IPSTATS_MIB_INADDRERRORS);
+ __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
+ IPSTATS_MIB_INADDRERRORS);
kfree_skb(skb);
return -1;
}
@@ -360,8 +360,8 @@ looped_back:
goto unknown_rh;
/* Silently discard invalid RTH type 2 */
if (hdr->hdrlen != 2 || hdr->segments_left != 1) {
- IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)),
- IPSTATS_MIB_INHDRERRORS);
+ __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
+ IPSTATS_MIB_INHDRERRORS);
kfree_skb(skb);
return -1;
}
@@ -379,8 +379,8 @@ looped_back:
n = hdr->hdrlen >> 1;
if (hdr->segments_left > n) {
- IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)),
- IPSTATS_MIB_INHDRERRORS);
+ __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
+ IPSTATS_MIB_INHDRERRORS);
icmpv6_param_prob(skb, ICMPV6_HDR_FIELD,
((&hdr->segments_left) -
skb_network_header(skb)));
@@ -393,8 +393,8 @@ looped_back:
if (skb_cloned(skb)) {
/* the copy is a forwarded packet */
if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) {
- IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)),
- IPSTATS_MIB_OUTDISCARDS);
+ __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
+ IPSTATS_MIB_OUTDISCARDS);
kfree_skb(skb);
return -1;
}
@@ -416,14 +416,14 @@ looped_back:
if (xfrm6_input_addr(skb, (xfrm_address_t *)addr,
(xfrm_address_t *)&ipv6_hdr(skb)->saddr,
IPPROTO_ROUTING) < 0) {
- IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)),
- IPSTATS_MIB_INADDRERRORS);
+ __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
+ IPSTATS_MIB_INADDRERRORS);
kfree_skb(skb);
return -1;
}
if (!ipv6_chk_home_addr(dev_net(skb_dst(skb)->dev), addr)) {
- IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)),
- IPSTATS_MIB_INADDRERRORS);
+ __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
+ IPSTATS_MIB_INADDRERRORS);
kfree_skb(skb);
return -1;
}
@@ -434,8 +434,8 @@ looped_back:
}
if (ipv6_addr_is_multicast(addr)) {
- IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)),
- IPSTATS_MIB_INADDRERRORS);
+ __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
+ IPSTATS_MIB_INADDRERRORS);
kfree_skb(skb);
return -1;
}
@@ -454,8 +454,8 @@ looped_back:
if (skb_dst(skb)->dev->flags&IFF_LOOPBACK) {
if (ipv6_hdr(skb)->hop_limit <= 1) {
- IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)),
- IPSTATS_MIB_INHDRERRORS);
+ __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
+ IPSTATS_MIB_INHDRERRORS);
icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT,
0);
kfree_skb(skb);
@@ -470,7 +470,7 @@ looped_back:
return -1;
unknown_rh:
- IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_INHDRERRORS);
+ __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_INHDRERRORS);
icmpv6_param_prob(skb, ICMPV6_HDR_FIELD,
(&hdr->type) - skb_network_header(skb));
return -1;
@@ -568,28 +568,28 @@ static bool ipv6_hop_jumbo(struct sk_buff *skb, int optoff)
if (nh[optoff + 1] != 4 || (optoff & 3) != 2) {
net_dbg_ratelimited("ipv6_hop_jumbo: wrong jumbo opt length/alignment %d\n",
nh[optoff+1]);
- IP6_INC_STATS_BH(net, ipv6_skb_idev(skb),
- IPSTATS_MIB_INHDRERRORS);
+ __IP6_INC_STATS(net, ipv6_skb_idev(skb),
+ IPSTATS_MIB_INHDRERRORS);
goto drop;
}
pkt_len = ntohl(*(__be32 *)(nh + optoff + 2));
if (pkt_len <= IPV6_MAXPLEN) {
- IP6_INC_STATS_BH(net, ipv6_skb_idev(skb),
- IPSTATS_MIB_INHDRERRORS);
+ __IP6_INC_STATS(net, ipv6_skb_idev(skb),
+ IPSTATS_MIB_INHDRERRORS);
icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, optoff+2);
return false;
}
if (ipv6_hdr(skb)->payload_len) {
- IP6_INC_STATS_BH(net, ipv6_skb_idev(skb),
- IPSTATS_MIB_INHDRERRORS);
+ __IP6_INC_STATS(net, ipv6_skb_idev(skb),
+ IPSTATS_MIB_INHDRERRORS);
icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, optoff);
return false;
}
if (pkt_len > skb->len - sizeof(struct ipv6hdr)) {
- IP6_INC_STATS_BH(net, ipv6_skb_idev(skb),
- IPSTATS_MIB_INTRUNCATEDPKTS);
+ __IP6_INC_STATS(net, ipv6_skb_idev(skb),
+ IPSTATS_MIB_INTRUNCATEDPKTS);
goto drop;
}
diff --git a/net/ipv6/fou6.c b/net/ipv6/fou6.c
new file mode 100644
index 000000000..9ea249b94
--- /dev/null
+++ b/net/ipv6/fou6.c
@@ -0,0 +1,140 @@
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/socket.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <linux/udp.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <net/fou.h>
+#include <net/ip.h>
+#include <net/ip6_tunnel.h>
+#include <net/ip6_checksum.h>
+#include <net/protocol.h>
+#include <net/udp.h>
+#include <net/udp_tunnel.h>
+
+static void fou6_build_udp(struct sk_buff *skb, struct ip_tunnel_encap *e,
+ struct flowi6 *fl6, u8 *protocol, __be16 sport)
+{
+ struct udphdr *uh;
+
+ skb_push(skb, sizeof(struct udphdr));
+ skb_reset_transport_header(skb);
+
+ uh = udp_hdr(skb);
+
+ uh->dest = e->dport;
+ uh->source = sport;
+ uh->len = htons(skb->len);
+ udp6_set_csum(!(e->flags & TUNNEL_ENCAP_FLAG_CSUM6), skb,
+ &fl6->saddr, &fl6->daddr, skb->len);
+
+ *protocol = IPPROTO_UDP;
+}
+
+int fou6_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e,
+ u8 *protocol, struct flowi6 *fl6)
+{
+ __be16 sport;
+ int err;
+ int type = e->flags & TUNNEL_ENCAP_FLAG_CSUM6 ?
+ SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL;
+
+ err = __fou_build_header(skb, e, protocol, &sport, type);
+ if (err)
+ return err;
+
+ fou6_build_udp(skb, e, fl6, protocol, sport);
+
+ return 0;
+}
+EXPORT_SYMBOL(fou6_build_header);
+
+int gue6_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e,
+ u8 *protocol, struct flowi6 *fl6)
+{
+ __be16 sport;
+ int err;
+ int type = e->flags & TUNNEL_ENCAP_FLAG_CSUM6 ?
+ SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL;
+
+ err = __gue_build_header(skb, e, protocol, &sport, type);
+ if (err)
+ return err;
+
+ fou6_build_udp(skb, e, fl6, protocol, sport);
+
+ return 0;
+}
+EXPORT_SYMBOL(gue6_build_header);
+
+#if IS_ENABLED(CONFIG_IPV6_FOU_TUNNEL)
+
+static const struct ip6_tnl_encap_ops fou_ip6tun_ops = {
+ .encap_hlen = fou_encap_hlen,
+ .build_header = fou6_build_header,
+};
+
+static const struct ip6_tnl_encap_ops gue_ip6tun_ops = {
+ .encap_hlen = gue_encap_hlen,
+ .build_header = gue6_build_header,
+};
+
+static int ip6_tnl_encap_add_fou_ops(void)
+{
+ int ret;
+
+ ret = ip6_tnl_encap_add_ops(&fou_ip6tun_ops, TUNNEL_ENCAP_FOU);
+ if (ret < 0) {
+ pr_err("can't add fou6 ops\n");
+ return ret;
+ }
+
+ ret = ip6_tnl_encap_add_ops(&gue_ip6tun_ops, TUNNEL_ENCAP_GUE);
+ if (ret < 0) {
+ pr_err("can't add gue6 ops\n");
+ ip6_tnl_encap_del_ops(&fou_ip6tun_ops, TUNNEL_ENCAP_FOU);
+ return ret;
+ }
+
+ return 0;
+}
+
+static void ip6_tnl_encap_del_fou_ops(void)
+{
+ ip6_tnl_encap_del_ops(&fou_ip6tun_ops, TUNNEL_ENCAP_FOU);
+ ip6_tnl_encap_del_ops(&gue_ip6tun_ops, TUNNEL_ENCAP_GUE);
+}
+
+#else
+
+static int ip6_tnl_encap_add_fou_ops(void)
+{
+ return 0;
+}
+
+static void ip6_tnl_encap_del_fou_ops(void)
+{
+}
+
+#endif
+
+static int __init fou6_init(void)
+{
+ int ret;
+
+ ret = ip6_tnl_encap_add_fou_ops();
+
+ return ret;
+}
+
+static void __exit fou6_fini(void)
+{
+ ip6_tnl_encap_del_fou_ops();
+}
+
+module_init(fou6_init);
+module_exit(fou6_fini);
+MODULE_AUTHOR("Tom Herbert <therbert@google.com>");
+MODULE_LICENSE("GPL");
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index 0013cacf7..a4fa84076 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -98,7 +98,7 @@ static void icmpv6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
if (!(type & ICMPV6_INFOMSG_MASK))
if (icmp6->icmp6_type == ICMPV6_ECHO_REQUEST)
- ping_err(skb, offset, info);
+ ping_err(skb, offset, ntohl(info));
}
static int icmpv6_rcv(struct sk_buff *skb);
@@ -400,10 +400,11 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info)
struct icmp6hdr tmp_hdr;
struct flowi6 fl6;
struct icmpv6_msg msg;
+ struct sockcm_cookie sockc_unused = {0};
+ struct ipcm6_cookie ipc6;
int iif = 0;
int addr_type = 0;
int len;
- int hlimit;
int err = 0;
u32 mark = IP6_REPLY_MARK(net, skb->mark);
@@ -505,7 +506,10 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info)
if (IS_ERR(dst))
goto out;
- hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst);
+ ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst);
+ ipc6.tclass = np->tclass;
+ ipc6.dontfrag = np->dontfrag;
+ ipc6.opt = NULL;
msg.skb = skb;
msg.offset = skb_network_offset(skb);
@@ -524,9 +528,9 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info)
err = ip6_append_data(sk, icmpv6_getfrag, &msg,
len + sizeof(struct icmp6hdr),
- sizeof(struct icmp6hdr), hlimit,
- np->tclass, NULL, &fl6, (struct rt6_info *)dst,
- MSG_DONTWAIT, np->dontfrag);
+ sizeof(struct icmp6hdr),
+ &ipc6, &fl6, (struct rt6_info *)dst,
+ MSG_DONTWAIT, &sockc_unused);
if (err) {
ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTERRORS);
ip6_flush_pending_frames(sk);
@@ -561,10 +565,10 @@ static void icmpv6_echo_reply(struct sk_buff *skb)
struct flowi6 fl6;
struct icmpv6_msg msg;
struct dst_entry *dst;
+ struct ipcm6_cookie ipc6;
int err = 0;
- int hlimit;
- u8 tclass;
u32 mark = IP6_REPLY_MARK(net, skb->mark);
+ struct sockcm_cookie sockc_unused = {0};
saddr = &ipv6_hdr(skb)->daddr;
@@ -604,22 +608,24 @@ static void icmpv6_echo_reply(struct sk_buff *skb)
if (IS_ERR(dst))
goto out;
- hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst);
-
idev = __in6_dev_get(skb->dev);
msg.skb = skb;
msg.offset = 0;
msg.type = ICMPV6_ECHO_REPLY;
- tclass = ipv6_get_dsfield(ipv6_hdr(skb));
+ ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst);
+ ipc6.tclass = ipv6_get_dsfield(ipv6_hdr(skb));
+ ipc6.dontfrag = np->dontfrag;
+ ipc6.opt = NULL;
+
err = ip6_append_data(sk, icmpv6_getfrag, &msg, skb->len + sizeof(struct icmp6hdr),
- sizeof(struct icmp6hdr), hlimit, tclass, NULL, &fl6,
+ sizeof(struct icmp6hdr), &ipc6, &fl6,
(struct rt6_info *)dst, MSG_DONTWAIT,
- np->dontfrag);
+ &sockc_unused);
if (err) {
- ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTERRORS);
+ __ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTERRORS);
ip6_flush_pending_frames(sk);
} else {
err = icmpv6_push_pending_frames(sk, &fl6, &tmp_hdr,
@@ -671,7 +677,7 @@ void icmpv6_notify(struct sk_buff *skb, u8 type, u8 code, __be32 info)
return;
out:
- ICMP6_INC_STATS_BH(net, __in6_dev_get(skb->dev), ICMP6_MIB_INERRORS);
+ __ICMP6_INC_STATS(net, __in6_dev_get(skb->dev), ICMP6_MIB_INERRORS);
}
/*
@@ -707,7 +713,7 @@ static int icmpv6_rcv(struct sk_buff *skb)
skb_set_network_header(skb, nh);
}
- ICMP6_INC_STATS_BH(dev_net(dev), idev, ICMP6_MIB_INMSGS);
+ __ICMP6_INC_STATS(dev_net(dev), idev, ICMP6_MIB_INMSGS);
saddr = &ipv6_hdr(skb)->saddr;
daddr = &ipv6_hdr(skb)->daddr;
@@ -725,7 +731,7 @@ static int icmpv6_rcv(struct sk_buff *skb)
type = hdr->icmp6_type;
- ICMP6MSGIN_INC_STATS_BH(dev_net(dev), idev, type);
+ ICMP6MSGIN_INC_STATS(dev_net(dev), idev, type);
switch (type) {
case ICMPV6_ECHO_REQUEST:
@@ -809,9 +815,9 @@ static int icmpv6_rcv(struct sk_buff *skb)
return 0;
csum_error:
- ICMP6_INC_STATS_BH(dev_net(dev), idev, ICMP6_MIB_CSUMERRORS);
+ __ICMP6_INC_STATS(dev_net(dev), idev, ICMP6_MIB_CSUMERRORS);
discard_it:
- ICMP6_INC_STATS_BH(dev_net(dev), idev, ICMP6_MIB_INERRORS);
+ __ICMP6_INC_STATS(dev_net(dev), idev, ICMP6_MIB_INERRORS);
drop_no_count:
kfree_skb(skb);
return 0;
diff --git a/net/ipv6/ila/ila.h b/net/ipv6/ila/ila.h
index 28542cb2b..d08fd2d48 100644
--- a/net/ipv6/ila/ila.h
+++ b/net/ipv6/ila/ila.h
@@ -23,10 +23,76 @@
#include <net/protocol.h>
#include <uapi/linux/ila.h>
+struct ila_locator {
+ union {
+ __u8 v8[8];
+ __be16 v16[4];
+ __be32 v32[2];
+ __be64 v64;
+ };
+};
+
+struct ila_identifier {
+ union {
+ struct {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+ u8 __space:4;
+ u8 csum_neutral:1;
+ u8 type:3;
+#elif defined(__BIG_ENDIAN_BITFIELD)
+ u8 type:3;
+ u8 csum_neutral:1;
+ u8 __space:4;
+#else
+#error "Adjust your <asm/byteorder.h> defines"
+#endif
+ u8 __space2[7];
+ };
+ __u8 v8[8];
+ __be16 v16[4];
+ __be32 v32[2];
+ __be64 v64;
+ };
+};
+
+enum {
+ ILA_ATYPE_IID = 0,
+ ILA_ATYPE_LUID,
+ ILA_ATYPE_VIRT_V4,
+ ILA_ATYPE_VIRT_UNI_V6,
+ ILA_ATYPE_VIRT_MULTI_V6,
+ ILA_ATYPE_RSVD_1,
+ ILA_ATYPE_RSVD_2,
+ ILA_ATYPE_RSVD_3,
+};
+
+#define CSUM_NEUTRAL_FLAG htonl(0x10000000)
+
+struct ila_addr {
+ union {
+ struct in6_addr addr;
+ struct {
+ struct ila_locator loc;
+ struct ila_identifier ident;
+ };
+ };
+};
+
+static inline struct ila_addr *ila_a2i(struct in6_addr *addr)
+{
+ return (struct ila_addr *)addr;
+}
+
+static inline bool ila_addr_is_ila(struct ila_addr *iaddr)
+{
+ return (iaddr->ident.type != ILA_ATYPE_IID);
+}
+
struct ila_params {
- __be64 locator;
- __be64 locator_match;
+ struct ila_locator locator;
+ struct ila_locator locator_match;
__wsum csum_diff;
+ u8 csum_mode;
};
static inline __wsum compute_csum_diff8(const __be32 *from, const __be32 *to)
@@ -38,7 +104,14 @@ static inline __wsum compute_csum_diff8(const __be32 *from, const __be32 *to)
return csum_partial(diff, sizeof(diff), 0);
}
-void update_ipv6_locator(struct sk_buff *skb, struct ila_params *p);
+static inline bool ila_csum_neutral_set(struct ila_identifier ident)
+{
+ return !!(ident.csum_neutral);
+}
+
+void ila_update_ipv6_locator(struct sk_buff *skb, struct ila_params *p);
+
+void ila_init_saved_csum(struct ila_params *p);
int ila_lwt_init(void);
void ila_lwt_fini(void);
diff --git a/net/ipv6/ila/ila_common.c b/net/ipv6/ila/ila_common.c
index 30613050e..0e94042d1 100644
--- a/net/ipv6/ila/ila_common.c
+++ b/net/ipv6/ila/ila_common.c
@@ -15,20 +15,52 @@
static __wsum get_csum_diff(struct ipv6hdr *ip6h, struct ila_params *p)
{
- if (*(__be64 *)&ip6h->daddr == p->locator_match)
+ struct ila_addr *iaddr = ila_a2i(&ip6h->daddr);
+
+ if (p->locator_match.v64)
return p->csum_diff;
else
- return compute_csum_diff8((__be32 *)&ip6h->daddr,
+ return compute_csum_diff8((__be32 *)&iaddr->loc,
+ (__be32 *)&p->locator);
+}
+
+static void ila_csum_do_neutral(struct ila_addr *iaddr,
+ struct ila_params *p)
+{
+ __sum16 *adjust = (__force __sum16 *)&iaddr->ident.v16[3];
+ __wsum diff, fval;
+
+ /* Check if checksum adjust value has been cached */
+ if (p->locator_match.v64) {
+ diff = p->csum_diff;
+ } else {
+ diff = compute_csum_diff8((__be32 *)iaddr,
(__be32 *)&p->locator);
+ }
+
+ fval = (__force __wsum)(ila_csum_neutral_set(iaddr->ident) ?
+ ~CSUM_NEUTRAL_FLAG : CSUM_NEUTRAL_FLAG);
+
+ diff = csum_add(diff, fval);
+
+ *adjust = ~csum_fold(csum_add(diff, csum_unfold(*adjust)));
+
+ /* Flip the csum-neutral bit. Either we are doing a SIR->ILA
+ * translation with ILA_CSUM_NEUTRAL_MAP as the csum_method
+ * and the C-bit is not set, or we are doing an ILA-SIR
+ * tranlsation and the C-bit is set.
+ */
+ iaddr->ident.csum_neutral ^= 1;
}
-void update_ipv6_locator(struct sk_buff *skb, struct ila_params *p)
+static void ila_csum_adjust_transport(struct sk_buff *skb,
+ struct ila_params *p)
{
__wsum diff;
struct ipv6hdr *ip6h = ipv6_hdr(skb);
+ struct ila_addr *iaddr = ila_a2i(&ip6h->daddr);
size_t nhoff = sizeof(struct ipv6hdr);
- /* First update checksum */
switch (ip6h->nexthdr) {
case NEXTHDR_TCP:
if (likely(pskb_may_pull(skb, nhoff + sizeof(struct tcphdr)))) {
@@ -68,7 +100,46 @@ void update_ipv6_locator(struct sk_buff *skb, struct ila_params *p)
}
/* Now change destination address */
- *(__be64 *)&ip6h->daddr = p->locator;
+ iaddr->loc = p->locator;
+}
+
+void ila_update_ipv6_locator(struct sk_buff *skb, struct ila_params *p)
+{
+ struct ipv6hdr *ip6h = ipv6_hdr(skb);
+ struct ila_addr *iaddr = ila_a2i(&ip6h->daddr);
+
+ /* First deal with the transport checksum */
+ if (ila_csum_neutral_set(iaddr->ident)) {
+ /* C-bit is set in the locator indicating that this
+ * is a locator being translated to a SIR address.
+ * Perform (receiver) checksum-neutral translation.
+ */
+ ila_csum_do_neutral(iaddr, p);
+ } else {
+ switch (p->csum_mode) {
+ case ILA_CSUM_ADJUST_TRANSPORT:
+ ila_csum_adjust_transport(skb, p);
+ break;
+ case ILA_CSUM_NEUTRAL_MAP:
+ ila_csum_do_neutral(iaddr, p);
+ break;
+ case ILA_CSUM_NO_ACTION:
+ break;
+ }
+ }
+
+ /* Now change destination address */
+ iaddr->loc = p->locator;
+}
+
+void ila_init_saved_csum(struct ila_params *p)
+{
+ if (!p->locator_match.v64)
+ return;
+
+ p->csum_diff = compute_csum_diff8(
+ (__be32 *)&p->locator_match,
+ (__be32 *)&p->locator);
}
static int __init ila_init(void)
diff --git a/net/ipv6/ila/ila_lwt.c b/net/ipv6/ila/ila_lwt.c
index 41f18de5d..1dfb64166 100644
--- a/net/ipv6/ila/ila_lwt.c
+++ b/net/ipv6/ila/ila_lwt.c
@@ -26,7 +26,7 @@ static int ila_output(struct net *net, struct sock *sk, struct sk_buff *skb)
if (skb->protocol != htons(ETH_P_IPV6))
goto drop;
- update_ipv6_locator(skb, ila_params_lwtunnel(dst->lwtstate));
+ ila_update_ipv6_locator(skb, ila_params_lwtunnel(dst->lwtstate));
return dst->lwtstate->orig_output(net, sk, skb);
@@ -42,7 +42,7 @@ static int ila_input(struct sk_buff *skb)
if (skb->protocol != htons(ETH_P_IPV6))
goto drop;
- update_ipv6_locator(skb, ila_params_lwtunnel(dst->lwtstate));
+ ila_update_ipv6_locator(skb, ila_params_lwtunnel(dst->lwtstate));
return dst->lwtstate->orig_input(skb);
@@ -53,6 +53,7 @@ drop:
static struct nla_policy ila_nl_policy[ILA_ATTR_MAX + 1] = {
[ILA_ATTR_LOCATOR] = { .type = NLA_U64, },
+ [ILA_ATTR_CSUM_MODE] = { .type = NLA_U8, },
};
static int ila_build_state(struct net_device *dev, struct nlattr *nla,
@@ -64,11 +65,28 @@ static int ila_build_state(struct net_device *dev, struct nlattr *nla,
size_t encap_len = sizeof(*p);
struct lwtunnel_state *newts;
const struct fib6_config *cfg6 = cfg;
+ struct ila_addr *iaddr;
int ret;
if (family != AF_INET6)
return -EINVAL;
+ if (cfg6->fc_dst_len < sizeof(struct ila_locator) + 1) {
+ /* Need to have full locator and at least type field
+ * included in destination
+ */
+ return -EINVAL;
+ }
+
+ iaddr = (struct ila_addr *)&cfg6->fc_dst;
+
+ if (!ila_addr_is_ila(iaddr) || ila_csum_neutral_set(iaddr->ident)) {
+ /* Don't allow translation for a non-ILA address or checksum
+ * neutral flag to be set.
+ */
+ return -EINVAL;
+ }
+
ret = nla_parse_nested(tb, ILA_ATTR_MAX, nla,
ila_nl_policy);
if (ret < 0)
@@ -84,16 +102,19 @@ static int ila_build_state(struct net_device *dev, struct nlattr *nla,
newts->len = encap_len;
p = ila_params_lwtunnel(newts);
- p->locator = (__force __be64)nla_get_u64(tb[ILA_ATTR_LOCATOR]);
+ p->locator.v64 = (__force __be64)nla_get_u64(tb[ILA_ATTR_LOCATOR]);
- if (cfg6->fc_dst_len > sizeof(__be64)) {
- /* Precompute checksum difference for translation since we
- * know both the old locator and the new one.
- */
- p->locator_match = *(__be64 *)&cfg6->fc_dst;
- p->csum_diff = compute_csum_diff8(
- (__be32 *)&p->locator_match, (__be32 *)&p->locator);
- }
+ /* Precompute checksum difference for translation since we
+ * know both the old locator and the new one.
+ */
+ p->locator_match = iaddr->loc;
+ p->csum_diff = compute_csum_diff8(
+ (__be32 *)&p->locator_match, (__be32 *)&p->locator);
+
+ if (tb[ILA_ATTR_CSUM_MODE])
+ p->csum_mode = nla_get_u8(tb[ILA_ATTR_CSUM_MODE]);
+
+ ila_init_saved_csum(p);
newts->type = LWTUNNEL_ENCAP_ILA;
newts->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT |
@@ -109,7 +130,10 @@ static int ila_fill_encap_info(struct sk_buff *skb,
{
struct ila_params *p = ila_params_lwtunnel(lwtstate);
- if (nla_put_u64(skb, ILA_ATTR_LOCATOR, (__force u64)p->locator))
+ if (nla_put_u64_64bit(skb, ILA_ATTR_LOCATOR, (__force u64)p->locator.v64,
+ ILA_ATTR_PAD))
+ goto nla_put_failure;
+ if (nla_put_u8(skb, ILA_ATTR_CSUM_MODE, (__force u8)p->csum_mode))
goto nla_put_failure;
return 0;
@@ -120,7 +144,9 @@ nla_put_failure:
static int ila_encap_nlsize(struct lwtunnel_state *lwtstate)
{
- return nla_total_size(sizeof(u64)); /* ILA_ATTR_LOCATOR */
+ return nla_total_size_64bit(sizeof(u64)) + /* ILA_ATTR_LOCATOR */
+ nla_total_size(sizeof(u8)) + /* ILA_ATTR_CSUM_MODE */
+ 0;
}
static int ila_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b)
@@ -128,7 +154,7 @@ static int ila_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b)
struct ila_params *a_p = ila_params_lwtunnel(a);
struct ila_params *b_p = ila_params_lwtunnel(b);
- return (a_p->locator != b_p->locator);
+ return (a_p->locator.v64 != b_p->locator.v64);
}
static const struct lwtunnel_encap_ops ila_encap_ops = {
diff --git a/net/ipv6/ila/ila_xlat.c b/net/ipv6/ila/ila_xlat.c
index 295ca29a2..a90e57229 100644
--- a/net/ipv6/ila/ila_xlat.c
+++ b/net/ipv6/ila/ila_xlat.c
@@ -11,13 +11,11 @@
struct ila_xlat_params {
struct ila_params ip;
- __be64 identifier;
int ifindex;
- unsigned int dir;
};
struct ila_map {
- struct ila_xlat_params p;
+ struct ila_xlat_params xp;
struct rhash_head node;
struct ila_map __rcu *next;
struct rcu_head rcu;
@@ -66,31 +64,29 @@ static __always_inline void __ila_hash_secret_init(void)
net_get_random_once(&hashrnd, sizeof(hashrnd));
}
-static inline u32 ila_identifier_hash(__be64 identifier)
+static inline u32 ila_locator_hash(struct ila_locator loc)
{
- u32 *v = (u32 *)&identifier;
+ u32 *v = (u32 *)loc.v32;
return jhash_2words(v[0], v[1], hashrnd);
}
-static inline spinlock_t *ila_get_lock(struct ila_net *ilan, __be64 identifier)
+static inline spinlock_t *ila_get_lock(struct ila_net *ilan,
+ struct ila_locator loc)
{
- return &ilan->locks[ila_identifier_hash(identifier) & ilan->locks_mask];
+ return &ilan->locks[ila_locator_hash(loc) & ilan->locks_mask];
}
-static inline int ila_cmp_wildcards(struct ila_map *ila, __be64 loc,
- int ifindex, unsigned int dir)
+static inline int ila_cmp_wildcards(struct ila_map *ila,
+ struct ila_addr *iaddr, int ifindex)
{
- return (ila->p.ip.locator_match && ila->p.ip.locator_match != loc) ||
- (ila->p.ifindex && ila->p.ifindex != ifindex) ||
- !(ila->p.dir & dir);
+ return (ila->xp.ifindex && ila->xp.ifindex != ifindex);
}
-static inline int ila_cmp_params(struct ila_map *ila, struct ila_xlat_params *p)
+static inline int ila_cmp_params(struct ila_map *ila,
+ struct ila_xlat_params *xp)
{
- return (ila->p.ip.locator_match != p->ip.locator_match) ||
- (ila->p.ifindex != p->ifindex) ||
- (ila->p.dir != p->dir);
+ return (ila->xp.ifindex != xp->ifindex);
}
static int ila_cmpfn(struct rhashtable_compare_arg *arg,
@@ -98,17 +94,14 @@ static int ila_cmpfn(struct rhashtable_compare_arg *arg,
{
const struct ila_map *ila = obj;
- return (ila->p.identifier != *(__be64 *)arg->key);
+ return (ila->xp.ip.locator_match.v64 != *(__be64 *)arg->key);
}
static inline int ila_order(struct ila_map *ila)
{
int score = 0;
- if (ila->p.ip.locator_match)
- score += 1 << 0;
-
- if (ila->p.ifindex)
+ if (ila->xp.ifindex)
score += 1 << 1;
return score;
@@ -117,7 +110,7 @@ static inline int ila_order(struct ila_map *ila)
static const struct rhashtable_params rht_params = {
.nelem_hint = 1024,
.head_offset = offsetof(struct ila_map, node),
- .key_offset = offsetof(struct ila_map, p.identifier),
+ .key_offset = offsetof(struct ila_map, xp.ip.locator_match),
.key_len = sizeof(u64), /* identifier */
.max_size = 1048576,
.min_size = 256,
@@ -136,50 +129,45 @@ static struct genl_family ila_nl_family = {
};
static struct nla_policy ila_nl_policy[ILA_ATTR_MAX + 1] = {
- [ILA_ATTR_IDENTIFIER] = { .type = NLA_U64, },
[ILA_ATTR_LOCATOR] = { .type = NLA_U64, },
[ILA_ATTR_LOCATOR_MATCH] = { .type = NLA_U64, },
[ILA_ATTR_IFINDEX] = { .type = NLA_U32, },
- [ILA_ATTR_DIR] = { .type = NLA_U32, },
+ [ILA_ATTR_CSUM_MODE] = { .type = NLA_U8, },
};
static int parse_nl_config(struct genl_info *info,
- struct ila_xlat_params *p)
+ struct ila_xlat_params *xp)
{
- memset(p, 0, sizeof(*p));
-
- if (info->attrs[ILA_ATTR_IDENTIFIER])
- p->identifier = (__force __be64)nla_get_u64(
- info->attrs[ILA_ATTR_IDENTIFIER]);
+ memset(xp, 0, sizeof(*xp));
if (info->attrs[ILA_ATTR_LOCATOR])
- p->ip.locator = (__force __be64)nla_get_u64(
+ xp->ip.locator.v64 = (__force __be64)nla_get_u64(
info->attrs[ILA_ATTR_LOCATOR]);
if (info->attrs[ILA_ATTR_LOCATOR_MATCH])
- p->ip.locator_match = (__force __be64)nla_get_u64(
+ xp->ip.locator_match.v64 = (__force __be64)nla_get_u64(
info->attrs[ILA_ATTR_LOCATOR_MATCH]);
- if (info->attrs[ILA_ATTR_IFINDEX])
- p->ifindex = nla_get_s32(info->attrs[ILA_ATTR_IFINDEX]);
+ if (info->attrs[ILA_ATTR_CSUM_MODE])
+ xp->ip.csum_mode = nla_get_u8(info->attrs[ILA_ATTR_CSUM_MODE]);
- if (info->attrs[ILA_ATTR_DIR])
- p->dir = nla_get_u32(info->attrs[ILA_ATTR_DIR]);
+ if (info->attrs[ILA_ATTR_IFINDEX])
+ xp->ifindex = nla_get_s32(info->attrs[ILA_ATTR_IFINDEX]);
return 0;
}
/* Must be called with rcu readlock */
-static inline struct ila_map *ila_lookup_wildcards(__be64 id, __be64 loc,
+static inline struct ila_map *ila_lookup_wildcards(struct ila_addr *iaddr,
int ifindex,
- unsigned int dir,
struct ila_net *ilan)
{
struct ila_map *ila;
- ila = rhashtable_lookup_fast(&ilan->rhash_table, &id, rht_params);
+ ila = rhashtable_lookup_fast(&ilan->rhash_table, &iaddr->loc,
+ rht_params);
while (ila) {
- if (!ila_cmp_wildcards(ila, loc, ifindex, dir))
+ if (!ila_cmp_wildcards(ila, iaddr, ifindex))
return ila;
ila = rcu_access_pointer(ila->next);
}
@@ -188,15 +176,16 @@ static inline struct ila_map *ila_lookup_wildcards(__be64 id, __be64 loc,
}
/* Must be called with rcu readlock */
-static inline struct ila_map *ila_lookup_by_params(struct ila_xlat_params *p,
+static inline struct ila_map *ila_lookup_by_params(struct ila_xlat_params *xp,
struct ila_net *ilan)
{
struct ila_map *ila;
- ila = rhashtable_lookup_fast(&ilan->rhash_table, &p->identifier,
+ ila = rhashtable_lookup_fast(&ilan->rhash_table,
+ &xp->ip.locator_match,
rht_params);
while (ila) {
- if (!ila_cmp_params(ila, p))
+ if (!ila_cmp_params(ila, xp))
return ila;
ila = rcu_access_pointer(ila->next);
}
@@ -221,14 +210,14 @@ static void ila_free_cb(void *ptr, void *arg)
}
}
-static int ila_xlat_addr(struct sk_buff *skb, int dir);
+static int ila_xlat_addr(struct sk_buff *skb);
static unsigned int
ila_nf_input(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state)
{
- ila_xlat_addr(skb, ILA_DIR_IN);
+ ila_xlat_addr(skb);
return NF_ACCEPT;
}
@@ -241,11 +230,11 @@ static struct nf_hook_ops ila_nf_hook_ops[] __read_mostly = {
},
};
-static int ila_add_mapping(struct net *net, struct ila_xlat_params *p)
+static int ila_add_mapping(struct net *net, struct ila_xlat_params *xp)
{
struct ila_net *ilan = net_generic(net, ila_net_id);
struct ila_map *ila, *head;
- spinlock_t *lock = ila_get_lock(ilan, p->identifier);
+ spinlock_t *lock = ila_get_lock(ilan, xp->ip.locator_match);
int err = 0, order;
if (!ilan->hooks_registered) {
@@ -264,22 +253,16 @@ static int ila_add_mapping(struct net *net, struct ila_xlat_params *p)
if (!ila)
return -ENOMEM;
- ila->p = *p;
+ ila_init_saved_csum(&xp->ip);
- if (p->ip.locator_match) {
- /* Precompute checksum difference for translation since we
- * know both the old identifier and the new one.
- */
- ila->p.ip.csum_diff = compute_csum_diff8(
- (__be32 *)&p->ip.locator_match,
- (__be32 *)&p->ip.locator);
- }
+ ila->xp = *xp;
order = ila_order(ila);
spin_lock(lock);
- head = rhashtable_lookup_fast(&ilan->rhash_table, &p->identifier,
+ head = rhashtable_lookup_fast(&ilan->rhash_table,
+ &xp->ip.locator_match,
rht_params);
if (!head) {
/* New entry for the rhash_table */
@@ -289,7 +272,7 @@ static int ila_add_mapping(struct net *net, struct ila_xlat_params *p)
struct ila_map *tila = head, *prev = NULL;
do {
- if (!ila_cmp_params(tila, p)) {
+ if (!ila_cmp_params(tila, xp)) {
err = -EEXIST;
goto out;
}
@@ -326,23 +309,23 @@ out:
return err;
}
-static int ila_del_mapping(struct net *net, struct ila_xlat_params *p)
+static int ila_del_mapping(struct net *net, struct ila_xlat_params *xp)
{
struct ila_net *ilan = net_generic(net, ila_net_id);
struct ila_map *ila, *head, *prev;
- spinlock_t *lock = ila_get_lock(ilan, p->identifier);
+ spinlock_t *lock = ila_get_lock(ilan, xp->ip.locator_match);
int err = -ENOENT;
spin_lock(lock);
head = rhashtable_lookup_fast(&ilan->rhash_table,
- &p->identifier, rht_params);
+ &xp->ip.locator_match, rht_params);
ila = head;
prev = NULL;
while (ila) {
- if (ila_cmp_params(ila, p)) {
+ if (ila_cmp_params(ila, xp)) {
prev = ila;
ila = rcu_dereference_protected(ila->next,
lockdep_is_held(lock));
@@ -404,28 +387,28 @@ static int ila_nl_cmd_add_mapping(struct sk_buff *skb, struct genl_info *info)
static int ila_nl_cmd_del_mapping(struct sk_buff *skb, struct genl_info *info)
{
struct net *net = genl_info_net(info);
- struct ila_xlat_params p;
+ struct ila_xlat_params xp;
int err;
- err = parse_nl_config(info, &p);
+ err = parse_nl_config(info, &xp);
if (err)
return err;
- ila_del_mapping(net, &p);
+ ila_del_mapping(net, &xp);
return 0;
}
static int ila_fill_info(struct ila_map *ila, struct sk_buff *msg)
{
- if (nla_put_u64(msg, ILA_ATTR_IDENTIFIER,
- (__force u64)ila->p.identifier) ||
- nla_put_u64(msg, ILA_ATTR_LOCATOR,
- (__force u64)ila->p.ip.locator) ||
- nla_put_u64(msg, ILA_ATTR_LOCATOR_MATCH,
- (__force u64)ila->p.ip.locator_match) ||
- nla_put_s32(msg, ILA_ATTR_IFINDEX, ila->p.ifindex) ||
- nla_put_u32(msg, ILA_ATTR_DIR, ila->p.dir))
+ if (nla_put_u64_64bit(msg, ILA_ATTR_LOCATOR,
+ (__force u64)ila->xp.ip.locator.v64,
+ ILA_ATTR_PAD) ||
+ nla_put_u64_64bit(msg, ILA_ATTR_LOCATOR_MATCH,
+ (__force u64)ila->xp.ip.locator_match.v64,
+ ILA_ATTR_PAD) ||
+ nla_put_s32(msg, ILA_ATTR_IFINDEX, ila->xp.ifindex) ||
+ nla_put_u32(msg, ILA_ATTR_CSUM_MODE, ila->xp.ip.csum_mode))
return -1;
return 0;
@@ -457,11 +440,11 @@ static int ila_nl_cmd_get_mapping(struct sk_buff *skb, struct genl_info *info)
struct net *net = genl_info_net(info);
struct ila_net *ilan = net_generic(net, ila_net_id);
struct sk_buff *msg;
- struct ila_xlat_params p;
+ struct ila_xlat_params xp;
struct ila_map *ila;
int ret;
- ret = parse_nl_config(info, &p);
+ ret = parse_nl_config(info, &xp);
if (ret)
return ret;
@@ -471,7 +454,7 @@ static int ila_nl_cmd_get_mapping(struct sk_buff *skb, struct genl_info *info)
rcu_read_lock();
- ila = ila_lookup_by_params(&p, ilan);
+ ila = ila_lookup_by_params(&xp, ilan);
if (ila) {
ret = ila_dump_info(ila,
info->snd_portid,
@@ -501,7 +484,8 @@ static int ila_nl_dump_start(struct netlink_callback *cb)
struct ila_net *ilan = net_generic(net, ila_net_id);
struct ila_dump_iter *iter = (struct ila_dump_iter *)cb->args;
- return rhashtable_walk_init(&ilan->rhash_table, &iter->rhiter);
+ return rhashtable_walk_init(&ilan->rhash_table, &iter->rhiter,
+ GFP_KERNEL);
}
static int ila_nl_dump_done(struct netlink_callback *cb)
@@ -613,45 +597,32 @@ static struct pernet_operations ila_net_ops = {
.size = sizeof(struct ila_net),
};
-static int ila_xlat_addr(struct sk_buff *skb, int dir)
+static int ila_xlat_addr(struct sk_buff *skb)
{
struct ila_map *ila;
struct ipv6hdr *ip6h = ipv6_hdr(skb);
struct net *net = dev_net(skb->dev);
struct ila_net *ilan = net_generic(net, ila_net_id);
- __be64 identifier, locator_match;
- size_t nhoff;
+ struct ila_addr *iaddr = ila_a2i(&ip6h->daddr);
/* Assumes skb contains a valid IPv6 header that is pulled */
- identifier = *(__be64 *)&ip6h->daddr.in6_u.u6_addr8[8];
- locator_match = *(__be64 *)&ip6h->daddr.in6_u.u6_addr8[0];
- nhoff = sizeof(struct ipv6hdr);
+ if (!ila_addr_is_ila(iaddr)) {
+ /* Type indicates this is not an ILA address */
+ return 0;
+ }
rcu_read_lock();
- ila = ila_lookup_wildcards(identifier, locator_match,
- skb->dev->ifindex, dir, ilan);
+ ila = ila_lookup_wildcards(iaddr, skb->dev->ifindex, ilan);
if (ila)
- update_ipv6_locator(skb, &ila->p.ip);
+ ila_update_ipv6_locator(skb, &ila->xp.ip);
rcu_read_unlock();
return 0;
}
-int ila_xlat_incoming(struct sk_buff *skb)
-{
- return ila_xlat_addr(skb, ILA_DIR_IN);
-}
-EXPORT_SYMBOL(ila_xlat_incoming);
-
-int ila_xlat_outgoing(struct sk_buff *skb)
-{
- return ila_xlat_addr(skb, ILA_DIR_OUT);
-}
-EXPORT_SYMBOL(ila_xlat_outgoing);
-
int ila_xlat_init(void)
{
int ret;
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
index 70f2628be..00cf28ad4 100644
--- a/net/ipv6/inet6_hashtables.c
+++ b/net/ipv6/inet6_hashtables.c
@@ -69,7 +69,6 @@ struct sock *__inet6_lookup_established(struct net *net,
struct inet_ehash_bucket *head = &hashinfo->ehash[slot];
- rcu_read_lock();
begin:
sk_nulls_for_each_rcu(sk, node, &head->chain) {
if (sk->sk_hash != hash)
@@ -90,7 +89,6 @@ begin:
out:
sk = NULL;
found:
- rcu_read_unlock();
return sk;
}
EXPORT_SYMBOL(__inet6_lookup_established);
@@ -122,6 +120,7 @@ static inline int compute_score(struct sock *sk, struct net *net,
return score;
}
+/* called with rcu_read_lock() */
struct sock *inet6_lookup_listener(struct net *net,
struct inet_hashinfo *hashinfo,
struct sk_buff *skb, int doff,
@@ -129,39 +128,27 @@ struct sock *inet6_lookup_listener(struct net *net,
const __be16 sport, const struct in6_addr *daddr,
const unsigned short hnum, const int dif)
{
- struct sock *sk;
- const struct hlist_nulls_node *node;
- struct sock *result;
- int score, hiscore, matches = 0, reuseport = 0;
- bool select_ok = true;
- u32 phash = 0;
unsigned int hash = inet_lhashfn(net, hnum);
struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash];
+ int score, hiscore = 0, matches = 0, reuseport = 0;
+ struct sock *sk, *result = NULL;
+ u32 phash = 0;
- rcu_read_lock();
-begin:
- result = NULL;
- hiscore = 0;
- sk_nulls_for_each(sk, node, &ilb->head) {
+ sk_for_each(sk, &ilb->head) {
score = compute_score(sk, net, hnum, daddr, dif);
if (score > hiscore) {
- hiscore = score;
- result = sk;
reuseport = sk->sk_reuseport;
if (reuseport) {
phash = inet6_ehashfn(net, daddr, hnum,
saddr, sport);
- if (select_ok) {
- struct sock *sk2;
- sk2 = reuseport_select_sock(sk, phash,
- skb, doff);
- if (sk2) {
- result = sk2;
- goto found;
- }
- }
+ result = reuseport_select_sock(sk, phash,
+ skb, doff);
+ if (result)
+ return result;
matches = 1;
}
+ result = sk;
+ hiscore = score;
} else if (score == hiscore && reuseport) {
matches++;
if (reciprocal_scale(phash, matches) == 0)
@@ -169,25 +156,6 @@ begin:
phash = next_pseudo_random32(phash);
}
}
- /*
- * if the nulls value we got at the end of this lookup is
- * not the expected one, we must restart lookup.
- * We probably met an item that was moved to another chain.
- */
- if (get_nulls_value(node) != hash + LISTENING_NULLS_BASE)
- goto begin;
- if (result) {
-found:
- if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt)))
- result = NULL;
- else if (unlikely(compute_score(result, net, hnum, daddr,
- dif) < hiscore)) {
- sock_put(result);
- select_ok = false;
- goto begin;
- }
- }
- rcu_read_unlock();
return result;
}
EXPORT_SYMBOL_GPL(inet6_lookup_listener);
@@ -199,12 +167,12 @@ struct sock *inet6_lookup(struct net *net, struct inet_hashinfo *hashinfo,
const int dif)
{
struct sock *sk;
+ bool refcounted;
- local_bh_disable();
sk = __inet6_lookup(net, hashinfo, skb, doff, saddr, sport, daddr,
- ntohs(dport), dif);
- local_bh_enable();
-
+ ntohs(dport), dif, &refcounted);
+ if (sk && !refcounted && !atomic_inc_not_zero(&sk->sk_refcnt))
+ sk = NULL;
return sk;
}
EXPORT_SYMBOL_GPL(inet6_lookup);
@@ -254,7 +222,7 @@ static int __inet6_check_established(struct inet_timewait_death_row *death_row,
__sk_nulls_add_node_rcu(sk, &head->chain);
if (tw) {
sk_nulls_del_node_init_rcu((struct sock *)tw);
- NET_INC_STATS_BH(net, LINUX_MIB_TIMEWAITRECYCLED);
+ __NET_INC_STATS(net, LINUX_MIB_TIMEWAITRECYCLED);
}
spin_unlock(lock);
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
diff --git a/net/ipv6/ip6_checksum.c b/net/ipv6/ip6_checksum.c
index b2025bf3d..c0cbcb259 100644
--- a/net/ipv6/ip6_checksum.c
+++ b/net/ipv6/ip6_checksum.c
@@ -78,9 +78,12 @@ int udp6_csum_init(struct sk_buff *skb, struct udphdr *uh, int proto)
* we accept a checksum of zero here. When we find the socket
* for the UDP packet we'll check if that socket allows zero checksum
* for IPv6 (set by socket option).
+ *
+ * Note, we are only interested in != 0 or == 0, thus the
+ * force to int.
*/
- return skb_checksum_init_zero_check(skb, proto, uh->check,
- ip6_compute_pseudo);
+ return (__force int)skb_checksum_init_zero_check(skb, proto, uh->check,
+ ip6_compute_pseudo);
}
EXPORT_SYMBOL(udp6_csum_init);
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index c26fac26b..771be1fa4 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -241,6 +241,7 @@ struct fib6_table *fib6_new_table(struct net *net, u32 id)
return tb;
}
+EXPORT_SYMBOL_GPL(fib6_new_table);
struct fib6_table *fib6_get_table(struct net *net, u32 id)
{
diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c
index dc2db4f7b..b912f0dba 100644
--- a/net/ipv6/ip6_flowlabel.c
+++ b/net/ipv6/ip6_flowlabel.c
@@ -372,7 +372,8 @@ fl_create(struct net *net, struct sock *sk, struct in6_flowlabel_req *freq,
if (olen > 0) {
struct msghdr msg;
struct flowi6 flowi6;
- int junk;
+ struct sockcm_cookie sockc_junk;
+ struct ipcm6_cookie ipc6;
err = -ENOMEM;
fl->opt = kmalloc(sizeof(*fl->opt) + olen, GFP_KERNEL);
@@ -389,8 +390,8 @@ fl_create(struct net *net, struct sock *sk, struct in6_flowlabel_req *freq,
msg.msg_control = (void *)(fl->opt+1);
memset(&flowi6, 0, sizeof(flowi6));
- err = ip6_datagram_send_ctl(net, sk, &msg, &flowi6, fl->opt,
- &junk, &junk, &junk);
+ ipc6.opt = fl->opt;
+ err = ip6_datagram_send_ctl(net, sk, &msg, &flowi6, &ipc6, &sockc_junk);
if (err)
goto done;
err = -EINVAL;
diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index 4e636e60a..776d14511 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -54,6 +54,7 @@
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#include <net/ip6_tunnel.h>
+#include <net/gre.h>
static bool log_ecn_error = true;
@@ -342,7 +343,7 @@ static struct ip6_tnl *ip6gre_tunnel_locate(struct net *net,
goto failed_free;
/* Can use a lockless transmit, unless we generate output sequences */
- if (!(nt->parms.o_flags & GRE_SEQ))
+ if (!(nt->parms.o_flags & TUNNEL_SEQ))
dev->features |= NETIF_F_LLTX;
dev_hold(dev);
@@ -443,137 +444,41 @@ static void ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
t->err_time = jiffies;
}
-static int ip6gre_rcv(struct sk_buff *skb)
+static int ip6gre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi)
{
const struct ipv6hdr *ipv6h;
- u8 *h;
- __be16 flags;
- __sum16 csum = 0;
- __be32 key = 0;
- u32 seqno = 0;
struct ip6_tnl *tunnel;
- int offset = 4;
- __be16 gre_proto;
- int err;
-
- if (!pskb_may_pull(skb, sizeof(struct in6_addr)))
- goto drop;
ipv6h = ipv6_hdr(skb);
- h = skb->data;
- flags = *(__be16 *)h;
-
- if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
- /* - Version must be 0.
- - We do not support routing headers.
- */
- if (flags&(GRE_VERSION|GRE_ROUTING))
- goto drop;
-
- if (flags&GRE_CSUM) {
- csum = skb_checksum_simple_validate(skb);
- offset += 4;
- }
- if (flags&GRE_KEY) {
- key = *(__be32 *)(h + offset);
- offset += 4;
- }
- if (flags&GRE_SEQ) {
- seqno = ntohl(*(__be32 *)(h + offset));
- offset += 4;
- }
- }
-
- gre_proto = *(__be16 *)(h + 2);
-
tunnel = ip6gre_tunnel_lookup(skb->dev,
- &ipv6h->saddr, &ipv6h->daddr, key,
- gre_proto);
+ &ipv6h->saddr, &ipv6h->daddr, tpi->key,
+ tpi->proto);
if (tunnel) {
- struct pcpu_sw_netstats *tstats;
-
- if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb))
- goto drop;
-
- if (!ip6_tnl_rcv_ctl(tunnel, &ipv6h->daddr, &ipv6h->saddr)) {
- tunnel->dev->stats.rx_dropped++;
- goto drop;
- }
-
- skb->protocol = gre_proto;
- /* WCCP version 1 and 2 protocol decoding.
- * - Change protocol to IPv6
- * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
- */
- if (flags == 0 && gre_proto == htons(ETH_P_WCCP)) {
- skb->protocol = htons(ETH_P_IPV6);
- if ((*(h + offset) & 0xF0) != 0x40)
- offset += 4;
- }
-
- skb->mac_header = skb->network_header;
- __pskb_pull(skb, offset);
- skb_postpull_rcsum(skb, skb_transport_header(skb), offset);
-
- if (((flags&GRE_CSUM) && csum) ||
- (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
- tunnel->dev->stats.rx_crc_errors++;
- tunnel->dev->stats.rx_errors++;
- goto drop;
- }
- if (tunnel->parms.i_flags&GRE_SEQ) {
- if (!(flags&GRE_SEQ) ||
- (tunnel->i_seqno &&
- (s32)(seqno - tunnel->i_seqno) < 0)) {
- tunnel->dev->stats.rx_fifo_errors++;
- tunnel->dev->stats.rx_errors++;
- goto drop;
- }
- tunnel->i_seqno = seqno + 1;
- }
-
- /* Warning: All skb pointers will be invalidated! */
- if (tunnel->dev->type == ARPHRD_ETHER) {
- if (!pskb_may_pull(skb, ETH_HLEN)) {
- tunnel->dev->stats.rx_length_errors++;
- tunnel->dev->stats.rx_errors++;
- goto drop;
- }
+ ip6_tnl_rcv(tunnel, skb, tpi, NULL, false);
- ipv6h = ipv6_hdr(skb);
- skb->protocol = eth_type_trans(skb, tunnel->dev);
- skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
- }
-
- __skb_tunnel_rx(skb, tunnel->dev, tunnel->net);
+ return PACKET_RCVD;
+ }
- skb_reset_network_header(skb);
+ return PACKET_REJECT;
+}
- err = IP6_ECN_decapsulate(ipv6h, skb);
- if (unlikely(err)) {
- if (log_ecn_error)
- net_info_ratelimited("non-ECT from %pI6 with dsfield=%#x\n",
- &ipv6h->saddr,
- ipv6_get_dsfield(ipv6h));
- if (err > 1) {
- ++tunnel->dev->stats.rx_frame_errors;
- ++tunnel->dev->stats.rx_errors;
- goto drop;
- }
- }
+static int gre_rcv(struct sk_buff *skb)
+{
+ struct tnl_ptk_info tpi;
+ bool csum_err = false;
+ int hdr_len;
- tstats = this_cpu_ptr(tunnel->dev->tstats);
- u64_stats_update_begin(&tstats->syncp);
- tstats->rx_packets++;
- tstats->rx_bytes += skb->len;
- u64_stats_update_end(&tstats->syncp);
+ hdr_len = gre_parse_header(skb, &tpi, &csum_err, htons(ETH_P_IPV6), 0);
+ if (hdr_len < 0)
+ goto drop;
- netif_rx(skb);
+ if (iptunnel_pull_header(skb, hdr_len, tpi.proto, false))
+ goto drop;
+ if (ip6gre_rcv(skb, &tpi) == PACKET_RCVD)
return 0;
- }
- icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0);
+ icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0);
drop:
kfree_skb(skb);
return 0;
@@ -584,187 +489,40 @@ struct ipv6_tel_txoption {
__u8 dst_opt[8];
};
-static void init_tel_txopt(struct ipv6_tel_txoption *opt, __u8 encap_limit)
+static int gre_handle_offloads(struct sk_buff *skb, bool csum)
{
- memset(opt, 0, sizeof(struct ipv6_tel_txoption));
-
- opt->dst_opt[2] = IPV6_TLV_TNL_ENCAP_LIMIT;
- opt->dst_opt[3] = 1;
- opt->dst_opt[4] = encap_limit;
- opt->dst_opt[5] = IPV6_TLV_PADN;
- opt->dst_opt[6] = 1;
-
- opt->ops.dst0opt = (struct ipv6_opt_hdr *) opt->dst_opt;
- opt->ops.opt_nflen = 8;
+ return iptunnel_handle_offloads(skb,
+ csum ? SKB_GSO_GRE_CSUM : SKB_GSO_GRE);
}
-static netdev_tx_t ip6gre_xmit2(struct sk_buff *skb,
- struct net_device *dev,
- __u8 dsfield,
- struct flowi6 *fl6,
- int encap_limit,
- __u32 *pmtu)
+static netdev_tx_t __gre6_xmit(struct sk_buff *skb,
+ struct net_device *dev, __u8 dsfield,
+ struct flowi6 *fl6, int encap_limit,
+ __u32 *pmtu, __be16 proto)
{
struct ip6_tnl *tunnel = netdev_priv(dev);
- struct net *net = tunnel->net;
- struct net_device *tdev; /* Device to other host */
- struct ipv6hdr *ipv6h; /* Our new IP header */
- unsigned int max_headroom = 0; /* The extra header space needed */
- int gre_hlen;
- struct ipv6_tel_txoption opt;
- int mtu;
- struct dst_entry *dst = NULL, *ndst = NULL;
- struct net_device_stats *stats = &tunnel->dev->stats;
- int err = -1;
- u8 proto;
- struct sk_buff *new_skb;
- __be16 protocol;
+ __be16 protocol = (dev->type == ARPHRD_ETHER) ?
+ htons(ETH_P_TEB) : proto;
if (dev->type == ARPHRD_ETHER)
IPCB(skb)->flags = 0;
- if (dev->header_ops && dev->type == ARPHRD_IP6GRE) {
- gre_hlen = 0;
- ipv6h = (struct ipv6hdr *)skb->data;
- fl6->daddr = ipv6h->daddr;
- } else {
- gre_hlen = tunnel->hlen;
+ if (dev->header_ops && dev->type == ARPHRD_IP6GRE)
+ fl6->daddr = ((struct ipv6hdr *)skb->data)->daddr;
+ else
fl6->daddr = tunnel->parms.raddr;
- }
-
- if (!fl6->flowi6_mark)
- dst = dst_cache_get(&tunnel->dst_cache);
-
- if (!dst) {
- dst = ip6_route_output(net, NULL, fl6);
-
- if (dst->error)
- goto tx_err_link_failure;
- dst = xfrm_lookup(net, dst, flowi6_to_flowi(fl6), NULL, 0);
- if (IS_ERR(dst)) {
- err = PTR_ERR(dst);
- dst = NULL;
- goto tx_err_link_failure;
- }
- ndst = dst;
- }
-
- tdev = dst->dev;
-
- if (tdev == dev) {
- stats->collisions++;
- net_warn_ratelimited("%s: Local routing loop detected!\n",
- tunnel->parms.name);
- goto tx_err_dst_release;
- }
-
- mtu = dst_mtu(dst) - sizeof(*ipv6h);
- if (encap_limit >= 0) {
- max_headroom += 8;
- mtu -= 8;
- }
- if (mtu < IPV6_MIN_MTU)
- mtu = IPV6_MIN_MTU;
- if (skb_dst(skb))
- skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);
- if (skb->len > mtu) {
- *pmtu = mtu;
- err = -EMSGSIZE;
- goto tx_err_dst_release;
- }
-
- if (tunnel->err_count > 0) {
- if (time_before(jiffies,
- tunnel->err_time + IP6TUNNEL_ERR_TIMEO)) {
- tunnel->err_count--;
-
- dst_link_failure(skb);
- } else
- tunnel->err_count = 0;
- }
-
- skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(dev)));
-
- max_headroom += LL_RESERVED_SPACE(tdev) + gre_hlen + dst->header_len;
-
- if (skb_headroom(skb) < max_headroom || skb_shared(skb) ||
- (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
- new_skb = skb_realloc_headroom(skb, max_headroom);
- if (max_headroom > dev->needed_headroom)
- dev->needed_headroom = max_headroom;
- if (!new_skb)
- goto tx_err_dst_release;
-
- if (skb->sk)
- skb_set_owner_w(new_skb, skb->sk);
- consume_skb(skb);
- skb = new_skb;
- }
-
- if (!fl6->flowi6_mark && ndst)
- dst_cache_set_ip6(&tunnel->dst_cache, ndst, &fl6->saddr);
- skb_dst_set(skb, dst);
- proto = NEXTHDR_GRE;
- if (encap_limit >= 0) {
- init_tel_txopt(&opt, encap_limit);
- ipv6_push_nfrag_opts(skb, &opt.ops, &proto, NULL);
- }
-
- if (likely(!skb->encapsulation)) {
- skb_reset_inner_headers(skb);
- skb->encapsulation = 1;
- }
+ if (tunnel->parms.o_flags & TUNNEL_SEQ)
+ tunnel->o_seqno++;
- skb_push(skb, gre_hlen);
- skb_reset_network_header(skb);
- skb_set_transport_header(skb, sizeof(*ipv6h));
-
- /*
- * Push down and install the IP header.
- */
- ipv6h = ipv6_hdr(skb);
- ip6_flow_hdr(ipv6h, INET_ECN_encapsulate(0, dsfield),
- ip6_make_flowlabel(net, skb, fl6->flowlabel, true, fl6));
- ipv6h->hop_limit = tunnel->parms.hop_limit;
- ipv6h->nexthdr = proto;
- ipv6h->saddr = fl6->saddr;
- ipv6h->daddr = fl6->daddr;
-
- ((__be16 *)(ipv6h + 1))[0] = tunnel->parms.o_flags;
- protocol = (dev->type == ARPHRD_ETHER) ?
- htons(ETH_P_TEB) : skb->protocol;
- ((__be16 *)(ipv6h + 1))[1] = protocol;
-
- if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
- __be32 *ptr = (__be32 *)(((u8 *)ipv6h) + tunnel->hlen - 4);
-
- if (tunnel->parms.o_flags&GRE_SEQ) {
- ++tunnel->o_seqno;
- *ptr = htonl(tunnel->o_seqno);
- ptr--;
- }
- if (tunnel->parms.o_flags&GRE_KEY) {
- *ptr = tunnel->parms.o_key;
- ptr--;
- }
- if (tunnel->parms.o_flags&GRE_CSUM) {
- *ptr = 0;
- *(__sum16 *)ptr = ip_compute_csum((void *)(ipv6h+1),
- skb->len - sizeof(struct ipv6hdr));
- }
- }
+ /* Push GRE header. */
+ gre_build_header(skb, tunnel->tun_hlen, tunnel->parms.o_flags,
+ protocol, tunnel->parms.o_key, htonl(tunnel->o_seqno));
skb_set_inner_protocol(skb, protocol);
- ip6tunnel_xmit(NULL, skb, dev);
- return 0;
-tx_err_link_failure:
- stats->tx_carrier_errors++;
- dst_link_failure(skb);
-tx_err_dst_release:
- dst_release(dst);
- return err;
+ return ip6_tnl_xmit(skb, dev, dsfield, fl6, encap_limit, pmtu,
+ NEXTHDR_GRE);
}
static inline int ip6gre_xmit_ipv4(struct sk_buff *skb, struct net_device *dev)
@@ -783,7 +541,6 @@ static inline int ip6gre_xmit_ipv4(struct sk_buff *skb, struct net_device *dev)
encap_limit = t->parms.encap_limit;
memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6));
- fl6.flowi6_proto = IPPROTO_GRE;
dsfield = ipv4_get_dsfield(iph);
@@ -793,7 +550,12 @@ static inline int ip6gre_xmit_ipv4(struct sk_buff *skb, struct net_device *dev)
if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK)
fl6.flowi6_mark = skb->mark;
- err = ip6gre_xmit2(skb, dev, dsfield, &fl6, encap_limit, &mtu);
+ err = gre_handle_offloads(skb, !!(t->parms.o_flags & TUNNEL_CSUM));
+ if (err)
+ return -1;
+
+ err = __gre6_xmit(skb, dev, dsfield, &fl6, encap_limit, &mtu,
+ skb->protocol);
if (err != 0) {
/* XXX: send ICMP error even if DF is not set. */
if (err == -EMSGSIZE)
@@ -833,7 +595,6 @@ static inline int ip6gre_xmit_ipv6(struct sk_buff *skb, struct net_device *dev)
encap_limit = t->parms.encap_limit;
memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6));
- fl6.flowi6_proto = IPPROTO_GRE;
dsfield = ipv6_get_dsfield(ipv6h);
if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS)
@@ -843,7 +604,11 @@ static inline int ip6gre_xmit_ipv6(struct sk_buff *skb, struct net_device *dev)
if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK)
fl6.flowi6_mark = skb->mark;
- err = ip6gre_xmit2(skb, dev, dsfield, &fl6, encap_limit, &mtu);
+ if (gre_handle_offloads(skb, !!(t->parms.o_flags & TUNNEL_CSUM)))
+ return -1;
+
+ err = __gre6_xmit(skb, dev, dsfield, &fl6, encap_limit,
+ &mtu, skb->protocol);
if (err != 0) {
if (err == -EMSGSIZE)
icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
@@ -887,7 +652,11 @@ static int ip6gre_xmit_other(struct sk_buff *skb, struct net_device *dev)
memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6));
fl6.flowi6_proto = skb->protocol;
- err = ip6gre_xmit2(skb, dev, 0, &fl6, encap_limit, &mtu);
+ err = gre_handle_offloads(skb, !!(t->parms.o_flags & TUNNEL_CSUM));
+ if (err)
+ return err;
+
+ err = __gre6_xmit(skb, dev, 0, &fl6, encap_limit, &mtu, skb->protocol);
return err;
}
@@ -931,7 +700,7 @@ static void ip6gre_tnl_link_config(struct ip6_tnl *t, int set_mtu)
struct net_device *dev = t->dev;
struct __ip6_tnl_parm *p = &t->parms;
struct flowi6 *fl6 = &t->fl.u.ip6;
- int addend = sizeof(struct ipv6hdr) + 4;
+ int t_hlen;
if (dev->type != ARPHRD_ETHER) {
memcpy(dev->dev_addr, &p->laddr, sizeof(struct in6_addr));
@@ -943,6 +712,7 @@ static void ip6gre_tnl_link_config(struct ip6_tnl *t, int set_mtu)
fl6->daddr = p->raddr;
fl6->flowi6_oif = p->link;
fl6->flowlabel = 0;
+ fl6->flowi6_proto = IPPROTO_GRE;
if (!(p->flags&IP6_TNL_F_USE_ORIG_TCLASS))
fl6->flowlabel |= IPV6_TCLASS_MASK & p->flowinfo;
@@ -958,16 +728,11 @@ static void ip6gre_tnl_link_config(struct ip6_tnl *t, int set_mtu)
else
dev->flags &= ~IFF_POINTOPOINT;
- /* Precalculate GRE options length */
- if (t->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) {
- if (t->parms.o_flags&GRE_CSUM)
- addend += 4;
- if (t->parms.o_flags&GRE_KEY)
- addend += 4;
- if (t->parms.o_flags&GRE_SEQ)
- addend += 4;
- }
- t->hlen = addend;
+ t->tun_hlen = gre_calc_hlen(t->parms.o_flags);
+
+ t->hlen = t->encap_hlen + t->tun_hlen;
+
+ t_hlen = t->hlen + sizeof(struct ipv6hdr);
if (p->flags & IP6_TNL_F_CAP_XMIT) {
int strict = (ipv6_addr_type(&p->raddr) &
@@ -981,12 +746,15 @@ static void ip6gre_tnl_link_config(struct ip6_tnl *t, int set_mtu)
return;
if (rt->dst.dev) {
- dev->hard_header_len = rt->dst.dev->hard_header_len + addend;
+ dev->hard_header_len = rt->dst.dev->hard_header_len +
+ t_hlen;
if (set_mtu) {
- dev->mtu = rt->dst.dev->mtu - addend;
+ dev->mtu = rt->dst.dev->mtu - t_hlen;
if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
dev->mtu -= 8;
+ if (dev->type == ARPHRD_ETHER)
+ dev->mtu -= ETH_HLEN;
if (dev->mtu < IPV6_MIN_MTU)
dev->mtu = IPV6_MIN_MTU;
@@ -1028,8 +796,8 @@ static void ip6gre_tnl_parm_from_user(struct __ip6_tnl_parm *p,
p->link = u->link;
p->i_key = u->i_key;
p->o_key = u->o_key;
- p->i_flags = u->i_flags;
- p->o_flags = u->o_flags;
+ p->i_flags = gre_flags_to_tnl_flags(u->i_flags);
+ p->o_flags = gre_flags_to_tnl_flags(u->o_flags);
memcpy(p->name, u->name, sizeof(u->name));
}
@@ -1046,8 +814,8 @@ static void ip6gre_tnl_parm_to_user(struct ip6_tnl_parm2 *u,
u->link = p->link;
u->i_key = p->i_key;
u->o_key = p->o_key;
- u->i_flags = p->i_flags;
- u->o_flags = p->o_flags;
+ u->i_flags = gre_tnl_flags_to_gre_flags(p->i_flags);
+ u->o_flags = gre_tnl_flags_to_gre_flags(p->o_flags);
memcpy(u->name, p->name, sizeof(u->name));
}
@@ -1061,6 +829,8 @@ static int ip6gre_tunnel_ioctl(struct net_device *dev,
struct net *net = t->net;
struct ip6gre_net *ign = net_generic(net, ip6gre_net_id);
+ memset(&p1, 0, sizeof(p1));
+
switch (cmd) {
case SIOCGETTUNNEL:
if (dev == ign->fb_tunnel_dev) {
@@ -1160,15 +930,6 @@ done:
return err;
}
-static int ip6gre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
-{
- if (new_mtu < 68 ||
- new_mtu > 0xFFF8 - dev->hard_header_len)
- return -EINVAL;
- dev->mtu = new_mtu;
- return 0;
-}
-
static int ip6gre_header(struct sk_buff *skb, struct net_device *dev,
unsigned short type,
const void *daddr, const void *saddr, unsigned int len)
@@ -1212,7 +973,7 @@ static const struct net_device_ops ip6gre_netdev_ops = {
.ndo_uninit = ip6gre_tunnel_uninit,
.ndo_start_xmit = ip6gre_tunnel_xmit,
.ndo_do_ioctl = ip6gre_tunnel_ioctl,
- .ndo_change_mtu = ip6gre_tunnel_change_mtu,
+ .ndo_change_mtu = ip6_tnl_change_mtu,
.ndo_get_stats64 = ip_tunnel_get_stats64,
.ndo_get_iflink = ip6_tnl_get_iflink,
};
@@ -1228,17 +989,11 @@ static void ip6gre_dev_free(struct net_device *dev)
static void ip6gre_tunnel_setup(struct net_device *dev)
{
- struct ip6_tnl *t;
-
dev->netdev_ops = &ip6gre_netdev_ops;
dev->destructor = ip6gre_dev_free;
dev->type = ARPHRD_IP6GRE;
- dev->hard_header_len = LL_MAX_HEADER + sizeof(struct ipv6hdr) + 4;
- dev->mtu = ETH_DATA_LEN - sizeof(struct ipv6hdr) - 4;
- t = netdev_priv(dev);
- if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
- dev->mtu -= 8;
+
dev->flags |= IFF_NOARP;
dev->addr_len = sizeof(struct in6_addr);
netif_keep_dst(dev);
@@ -1248,6 +1003,7 @@ static int ip6gre_tunnel_init_common(struct net_device *dev)
{
struct ip6_tnl *tunnel;
int ret;
+ int t_hlen;
tunnel = netdev_priv(dev);
@@ -1266,6 +1022,17 @@ static int ip6gre_tunnel_init_common(struct net_device *dev)
return ret;
}
+ tunnel->tun_hlen = gre_calc_hlen(tunnel->parms.o_flags);
+ tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen;
+ t_hlen = tunnel->hlen + sizeof(struct ipv6hdr);
+
+ dev->hard_header_len = LL_MAX_HEADER + t_hlen;
+ dev->mtu = ETH_DATA_LEN - t_hlen;
+ if (dev->type == ARPHRD_ETHER)
+ dev->mtu -= ETH_HLEN;
+ if (!(tunnel->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
+ dev->mtu -= 8;
+
return 0;
}
@@ -1304,7 +1071,7 @@ static void ip6gre_fb_tunnel_init(struct net_device *dev)
static struct inet6_protocol ip6gre_protocol __read_mostly = {
- .handler = ip6gre_rcv,
+ .handler = gre_rcv,
.err_handler = ip6gre_err,
.flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL,
};
@@ -1448,10 +1215,12 @@ static void ip6gre_netlink_parms(struct nlattr *data[],
parms->link = nla_get_u32(data[IFLA_GRE_LINK]);
if (data[IFLA_GRE_IFLAGS])
- parms->i_flags = nla_get_be16(data[IFLA_GRE_IFLAGS]);
+ parms->i_flags = gre_flags_to_tnl_flags(
+ nla_get_be16(data[IFLA_GRE_IFLAGS]));
if (data[IFLA_GRE_OFLAGS])
- parms->o_flags = nla_get_be16(data[IFLA_GRE_OFLAGS]);
+ parms->o_flags = gre_flags_to_tnl_flags(
+ nla_get_be16(data[IFLA_GRE_OFLAGS]));
if (data[IFLA_GRE_IKEY])
parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);
@@ -1487,6 +1256,8 @@ static int ip6gre_tap_init(struct net_device *dev)
if (ret)
return ret;
+ dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
+
tunnel = netdev_priv(dev);
ip6gre_tnl_link_config(tunnel, 1);
@@ -1500,11 +1271,16 @@ static const struct net_device_ops ip6gre_tap_netdev_ops = {
.ndo_start_xmit = ip6gre_tunnel_xmit,
.ndo_set_mac_address = eth_mac_addr,
.ndo_validate_addr = eth_validate_addr,
- .ndo_change_mtu = ip6gre_tunnel_change_mtu,
+ .ndo_change_mtu = ip6_tnl_change_mtu,
.ndo_get_stats64 = ip_tunnel_get_stats64,
.ndo_get_iflink = ip6_tnl_get_iflink,
};
+#define GRE6_FEATURES (NETIF_F_SG | \
+ NETIF_F_FRAGLIST | \
+ NETIF_F_HIGHDMA | \
+ NETIF_F_HW_CSUM)
+
static void ip6gre_tap_setup(struct net_device *dev)
{
@@ -1515,6 +1291,40 @@ static void ip6gre_tap_setup(struct net_device *dev)
dev->features |= NETIF_F_NETNS_LOCAL;
dev->priv_flags &= ~IFF_TX_SKB_SHARING;
+ dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
+}
+
+static bool ip6gre_netlink_encap_parms(struct nlattr *data[],
+ struct ip_tunnel_encap *ipencap)
+{
+ bool ret = false;
+
+ memset(ipencap, 0, sizeof(*ipencap));
+
+ if (!data)
+ return ret;
+
+ if (data[IFLA_GRE_ENCAP_TYPE]) {
+ ret = true;
+ ipencap->type = nla_get_u16(data[IFLA_GRE_ENCAP_TYPE]);
+ }
+
+ if (data[IFLA_GRE_ENCAP_FLAGS]) {
+ ret = true;
+ ipencap->flags = nla_get_u16(data[IFLA_GRE_ENCAP_FLAGS]);
+ }
+
+ if (data[IFLA_GRE_ENCAP_SPORT]) {
+ ret = true;
+ ipencap->sport = nla_get_be16(data[IFLA_GRE_ENCAP_SPORT]);
+ }
+
+ if (data[IFLA_GRE_ENCAP_DPORT]) {
+ ret = true;
+ ipencap->dport = nla_get_be16(data[IFLA_GRE_ENCAP_DPORT]);
+ }
+
+ return ret;
}
static int ip6gre_newlink(struct net *src_net, struct net_device *dev,
@@ -1523,9 +1333,18 @@ static int ip6gre_newlink(struct net *src_net, struct net_device *dev,
struct ip6_tnl *nt;
struct net *net = dev_net(dev);
struct ip6gre_net *ign = net_generic(net, ip6gre_net_id);
+ struct ip_tunnel_encap ipencap;
int err;
nt = netdev_priv(dev);
+
+ if (ip6gre_netlink_encap_parms(data, &ipencap)) {
+ int err = ip6_tnl_encap_setup(nt, &ipencap);
+
+ if (err < 0)
+ return err;
+ }
+
ip6gre_netlink_parms(data, &nt->parms);
if (ip6gre_tunnel_find(net, &nt->parms, dev->type))
@@ -1538,9 +1357,25 @@ static int ip6gre_newlink(struct net *src_net, struct net_device *dev,
nt->net = dev_net(dev);
ip6gre_tnl_link_config(nt, !tb[IFLA_MTU]);
- /* Can use a lockless transmit, unless we generate output sequences */
- if (!(nt->parms.o_flags & GRE_SEQ))
+ dev->features |= GRE6_FEATURES;
+ dev->hw_features |= GRE6_FEATURES;
+
+ if (!(nt->parms.o_flags & TUNNEL_SEQ)) {
+ /* TCP offload with GRE SEQ is not supported, nor
+ * can we support 2 levels of outer headers requiring
+ * an update.
+ */
+ if (!(nt->parms.o_flags & TUNNEL_CSUM) ||
+ (nt->encap.type == TUNNEL_ENCAP_NONE)) {
+ dev->features |= NETIF_F_GSO_SOFTWARE;
+ dev->hw_features |= NETIF_F_GSO_SOFTWARE;
+ }
+
+ /* Can use a lockless transmit, unless we generate
+ * output sequences
+ */
dev->features |= NETIF_F_LLTX;
+ }
err = register_netdevice(dev);
if (err)
@@ -1560,10 +1395,18 @@ static int ip6gre_changelink(struct net_device *dev, struct nlattr *tb[],
struct net *net = nt->net;
struct ip6gre_net *ign = net_generic(net, ip6gre_net_id);
struct __ip6_tnl_parm p;
+ struct ip_tunnel_encap ipencap;
if (dev == ign->fb_tunnel_dev)
return -EINVAL;
+ if (ip6gre_netlink_encap_parms(data, &ipencap)) {
+ int err = ip6_tnl_encap_setup(nt, &ipencap);
+
+ if (err < 0)
+ return err;
+ }
+
ip6gre_netlink_parms(data, &p);
t = ip6gre_tunnel_locate(net, &p, 0);
@@ -1609,14 +1452,20 @@ static size_t ip6gre_get_size(const struct net_device *dev)
nla_total_size(sizeof(struct in6_addr)) +
/* IFLA_GRE_TTL */
nla_total_size(1) +
- /* IFLA_GRE_TOS */
- nla_total_size(1) +
/* IFLA_GRE_ENCAP_LIMIT */
nla_total_size(1) +
/* IFLA_GRE_FLOWINFO */
nla_total_size(4) +
/* IFLA_GRE_FLAGS */
nla_total_size(4) +
+ /* IFLA_GRE_ENCAP_TYPE */
+ nla_total_size(2) +
+ /* IFLA_GRE_ENCAP_FLAGS */
+ nla_total_size(2) +
+ /* IFLA_GRE_ENCAP_SPORT */
+ nla_total_size(2) +
+ /* IFLA_GRE_ENCAP_DPORT */
+ nla_total_size(2) +
0;
}
@@ -1626,18 +1475,30 @@ static int ip6gre_fill_info(struct sk_buff *skb, const struct net_device *dev)
struct __ip6_tnl_parm *p = &t->parms;
if (nla_put_u32(skb, IFLA_GRE_LINK, p->link) ||
- nla_put_be16(skb, IFLA_GRE_IFLAGS, p->i_flags) ||
- nla_put_be16(skb, IFLA_GRE_OFLAGS, p->o_flags) ||
+ nla_put_be16(skb, IFLA_GRE_IFLAGS,
+ gre_tnl_flags_to_gre_flags(p->i_flags)) ||
+ nla_put_be16(skb, IFLA_GRE_OFLAGS,
+ gre_tnl_flags_to_gre_flags(p->o_flags)) ||
nla_put_be32(skb, IFLA_GRE_IKEY, p->i_key) ||
nla_put_be32(skb, IFLA_GRE_OKEY, p->o_key) ||
nla_put_in6_addr(skb, IFLA_GRE_LOCAL, &p->laddr) ||
nla_put_in6_addr(skb, IFLA_GRE_REMOTE, &p->raddr) ||
nla_put_u8(skb, IFLA_GRE_TTL, p->hop_limit) ||
- /*nla_put_u8(skb, IFLA_GRE_TOS, t->priority) ||*/
nla_put_u8(skb, IFLA_GRE_ENCAP_LIMIT, p->encap_limit) ||
nla_put_be32(skb, IFLA_GRE_FLOWINFO, p->flowinfo) ||
nla_put_u32(skb, IFLA_GRE_FLAGS, p->flags))
goto nla_put_failure;
+
+ if (nla_put_u16(skb, IFLA_GRE_ENCAP_TYPE,
+ t->encap.type) ||
+ nla_put_be16(skb, IFLA_GRE_ENCAP_SPORT,
+ t->encap.sport) ||
+ nla_put_be16(skb, IFLA_GRE_ENCAP_DPORT,
+ t->encap.dport) ||
+ nla_put_u16(skb, IFLA_GRE_ENCAP_FLAGS,
+ t->encap.flags))
+ goto nla_put_failure;
+
return 0;
nla_put_failure:
@@ -1656,6 +1517,10 @@ static const struct nla_policy ip6gre_policy[IFLA_GRE_MAX + 1] = {
[IFLA_GRE_ENCAP_LIMIT] = { .type = NLA_U8 },
[IFLA_GRE_FLOWINFO] = { .type = NLA_U32 },
[IFLA_GRE_FLAGS] = { .type = NLA_U32 },
+ [IFLA_GRE_ENCAP_TYPE] = { .type = NLA_U16 },
+ [IFLA_GRE_ENCAP_FLAGS] = { .type = NLA_U16 },
+ [IFLA_GRE_ENCAP_SPORT] = { .type = NLA_U16 },
+ [IFLA_GRE_ENCAP_DPORT] = { .type = NLA_U16 },
};
static struct rtnl_link_ops ip6gre_link_ops __read_mostly = {
diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c
index c05c425c2..94611e450 100644
--- a/net/ipv6/ip6_input.c
+++ b/net/ipv6/ip6_input.c
@@ -49,6 +49,13 @@
int ip6_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
{
+ /* if ingress device is enslaved to an L3 master device pass the
+ * skb to its handler for processing
+ */
+ skb = l3mdev_ip6_rcv(skb);
+ if (!skb)
+ return NET_RX_SUCCESS;
+
if (net->ipv4.sysctl_ip_early_demux && !skb_dst(skb) && skb->sk == NULL) {
const struct inet6_protocol *ipprot;
@@ -78,11 +85,11 @@ int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt
idev = __in6_dev_get(skb->dev);
- IP6_UPD_PO_STATS_BH(net, idev, IPSTATS_MIB_IN, skb->len);
+ __IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_IN, skb->len);
if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL ||
!idev || unlikely(idev->cnf.disable_ipv6)) {
- IP6_INC_STATS_BH(net, idev, IPSTATS_MIB_INDISCARDS);
+ __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
goto drop;
}
@@ -109,10 +116,10 @@ int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt
if (hdr->version != 6)
goto err;
- IP6_ADD_STATS_BH(net, idev,
- IPSTATS_MIB_NOECTPKTS +
+ __IP6_ADD_STATS(net, idev,
+ IPSTATS_MIB_NOECTPKTS +
(ipv6_get_dsfield(hdr) & INET_ECN_MASK),
- max_t(unsigned short, 1, skb_shinfo(skb)->gso_segs));
+ max_t(unsigned short, 1, skb_shinfo(skb)->gso_segs));
/*
* RFC4291 2.5.3
* A packet received on an interface with a destination address
@@ -169,12 +176,12 @@ int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt
/* pkt_len may be zero if Jumbo payload option is present */
if (pkt_len || hdr->nexthdr != NEXTHDR_HOP) {
if (pkt_len + sizeof(struct ipv6hdr) > skb->len) {
- IP6_INC_STATS_BH(net,
- idev, IPSTATS_MIB_INTRUNCATEDPKTS);
+ __IP6_INC_STATS(net,
+ idev, IPSTATS_MIB_INTRUNCATEDPKTS);
goto drop;
}
if (pskb_trim_rcsum(skb, pkt_len + sizeof(struct ipv6hdr))) {
- IP6_INC_STATS_BH(net, idev, IPSTATS_MIB_INHDRERRORS);
+ __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
goto drop;
}
hdr = ipv6_hdr(skb);
@@ -182,7 +189,7 @@ int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt
if (hdr->nexthdr == NEXTHDR_HOP) {
if (ipv6_parse_hopopts(skb) < 0) {
- IP6_INC_STATS_BH(net, idev, IPSTATS_MIB_INHDRERRORS);
+ __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
rcu_read_unlock();
return NET_RX_DROP;
}
@@ -197,7 +204,7 @@ int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt
net, NULL, skb, dev, NULL,
ip6_rcv_finish);
err:
- IP6_INC_STATS_BH(net, idev, IPSTATS_MIB_INHDRERRORS);
+ __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
drop:
rcu_read_unlock();
kfree_skb(skb);
@@ -216,6 +223,7 @@ static int ip6_input_finish(struct net *net, struct sock *sk, struct sk_buff *sk
unsigned int nhoff;
int nexthdr;
bool raw;
+ bool have_final = false;
/*
* Parse extension headers
@@ -229,14 +237,27 @@ resubmit:
nhoff = IP6CB(skb)->nhoff;
nexthdr = skb_network_header(skb)[nhoff];
+resubmit_final:
raw = raw6_local_deliver(skb, nexthdr);
ipprot = rcu_dereference(inet6_protos[nexthdr]);
if (ipprot) {
int ret;
- if (ipprot->flags & INET6_PROTO_FINAL) {
+ if (have_final) {
+ if (!(ipprot->flags & INET6_PROTO_FINAL)) {
+ /* Once we've seen a final protocol don't
+ * allow encapsulation on any non-final
+ * ones. This allows foo in UDP encapsulation
+ * to work.
+ */
+ goto discard;
+ }
+ } else if (ipprot->flags & INET6_PROTO_FINAL) {
const struct ipv6hdr *hdr;
+ /* Only do this once for first final protocol */
+ have_final = true;
+
/* Free reference early: we don't need it any more,
and it may hold ip_conntrack module loaded
indefinitely. */
@@ -256,21 +277,32 @@ resubmit:
goto discard;
ret = ipprot->handler(skb);
- if (ret > 0)
- goto resubmit;
- else if (ret == 0)
- IP6_INC_STATS_BH(net, idev, IPSTATS_MIB_INDELIVERS);
+ if (ret > 0) {
+ if (ipprot->flags & INET6_PROTO_FINAL) {
+ /* Not an extension header, most likely UDP
+ * encapsulation. Use return value as nexthdr
+ * protocol not nhoff (which presumably is
+ * not set by handler).
+ */
+ nexthdr = ret;
+ goto resubmit_final;
+ } else {
+ goto resubmit;
+ }
+ } else if (ret == 0) {
+ __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDELIVERS);
+ }
} else {
if (!raw) {
if (xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) {
- IP6_INC_STATS_BH(net, idev,
- IPSTATS_MIB_INUNKNOWNPROTOS);
+ __IP6_INC_STATS(net, idev,
+ IPSTATS_MIB_INUNKNOWNPROTOS);
icmpv6_send(skb, ICMPV6_PARAMPROB,
ICMPV6_UNK_NEXTHDR, nhoff);
}
kfree_skb(skb);
} else {
- IP6_INC_STATS_BH(net, idev, IPSTATS_MIB_INDELIVERS);
+ __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDELIVERS);
consume_skb(skb);
}
}
@@ -278,7 +310,7 @@ resubmit:
return 0;
discard:
- IP6_INC_STATS_BH(net, idev, IPSTATS_MIB_INDISCARDS);
+ __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
rcu_read_unlock();
kfree_skb(skb);
return 0;
@@ -297,7 +329,7 @@ int ip6_mc_input(struct sk_buff *skb)
const struct ipv6hdr *hdr;
bool deliver;
- IP6_UPD_PO_STATS_BH(dev_net(skb_dst(skb)->dev),
+ __IP6_UPD_PO_STATS(dev_net(skb_dst(skb)->dev),
ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_INMCAST,
skb->len);
diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c
index 82e9f3076..22e90e56b 100644
--- a/net/ipv6/ip6_offload.c
+++ b/net/ipv6/ip6_offload.c
@@ -16,6 +16,7 @@
#include <net/protocol.h>
#include <net/ipv6.h>
+#include <net/inet_common.h>
#include "ip6_offload.h"
@@ -63,27 +64,12 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb,
int proto;
struct frag_hdr *fptr;
unsigned int unfrag_ip6hlen;
+ unsigned int payload_len;
u8 *prevhdr;
int offset = 0;
bool encap, udpfrag;
int nhoff;
- if (unlikely(skb_shinfo(skb)->gso_type &
- ~(SKB_GSO_TCPV4 |
- SKB_GSO_UDP |
- SKB_GSO_DODGY |
- SKB_GSO_TCP_ECN |
- SKB_GSO_GRE |
- SKB_GSO_GRE_CSUM |
- SKB_GSO_IPIP |
- SKB_GSO_SIT |
- SKB_GSO_UDP_TUNNEL |
- SKB_GSO_UDP_TUNNEL_CSUM |
- SKB_GSO_TUNNEL_REMCSUM |
- SKB_GSO_TCPV6 |
- 0)))
- goto out;
-
skb_reset_network_header(skb);
nhoff = skb_network_header(skb) - skb_mac_header(skb);
if (unlikely(!pskb_may_pull(skb, sizeof(*ipv6h))))
@@ -101,7 +87,7 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb,
proto = ipv6_gso_pull_exthdrs(skb, ipv6h->nexthdr);
if (skb->encapsulation &&
- skb_shinfo(skb)->gso_type & (SKB_GSO_SIT|SKB_GSO_IPIP))
+ skb_shinfo(skb)->gso_type & (SKB_GSO_IPXIP4 | SKB_GSO_IPXIP6))
udpfrag = proto == IPPROTO_UDP && encap;
else
udpfrag = proto == IPPROTO_UDP && !skb->encapsulation;
@@ -117,7 +103,13 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb,
for (skb = segs; skb; skb = skb->next) {
ipv6h = (struct ipv6hdr *)(skb_mac_header(skb) + nhoff);
- ipv6h->payload_len = htons(skb->len - nhoff - sizeof(*ipv6h));
+ if (skb_is_gso(skb))
+ payload_len = skb_shinfo(skb)->gso_size +
+ SKB_GSO_CB(skb)->data_offset +
+ skb->head - (unsigned char *)(ipv6h + 1);
+ else
+ payload_len = skb->len - nhoff - sizeof(*ipv6h);
+ ipv6h->payload_len = htons(payload_len);
skb->network_header = (u8 *)ipv6h - skb->head;
if (udpfrag) {
@@ -239,10 +231,14 @@ static struct sk_buff **ipv6_gro_receive(struct sk_buff **head,
NAPI_GRO_CB(p)->flush |= !!(first_word & htonl(0x0FF00000));
NAPI_GRO_CB(p)->flush |= flush;
- /* Clear flush_id, there's really no concept of ID in IPv6. */
- NAPI_GRO_CB(p)->flush_id = 0;
+ /* If the previous IP ID value was based on an atomic
+ * datagram we can overwrite the value and ignore it.
+ */
+ if (NAPI_GRO_CB(skb)->is_atomic)
+ NAPI_GRO_CB(p)->flush_id = 0;
}
+ NAPI_GRO_CB(skb)->is_atomic = true;
NAPI_GRO_CB(skb)->flush |= flush;
skb_gro_postpull_rcsum(skb, iph, nlen);
@@ -258,9 +254,11 @@ out:
return pp;
}
-static struct sk_buff **sit_gro_receive(struct sk_buff **head,
- struct sk_buff *skb)
+static struct sk_buff **sit_ip6ip6_gro_receive(struct sk_buff **head,
+ struct sk_buff *skb)
{
+ /* Common GRO receive for SIT and IP6IP6 */
+
if (NAPI_GRO_CB(skb)->encap_mark) {
NAPI_GRO_CB(skb)->flush = 1;
return NULL;
@@ -271,6 +269,21 @@ static struct sk_buff **sit_gro_receive(struct sk_buff **head,
return ipv6_gro_receive(head, skb);
}
+static struct sk_buff **ip4ip6_gro_receive(struct sk_buff **head,
+ struct sk_buff *skb)
+{
+ /* Common GRO receive for SIT and IP6IP6 */
+
+ if (NAPI_GRO_CB(skb)->encap_mark) {
+ NAPI_GRO_CB(skb)->flush = 1;
+ return NULL;
+ }
+
+ NAPI_GRO_CB(skb)->encap_mark = 1;
+
+ return inet_gro_receive(head, skb);
+}
+
static int ipv6_gro_complete(struct sk_buff *skb, int nhoff)
{
const struct net_offload *ops;
@@ -299,10 +312,24 @@ out_unlock:
static int sit_gro_complete(struct sk_buff *skb, int nhoff)
{
skb->encapsulation = 1;
- skb_shinfo(skb)->gso_type |= SKB_GSO_SIT;
+ skb_shinfo(skb)->gso_type |= SKB_GSO_IPXIP4;
+ return ipv6_gro_complete(skb, nhoff);
+}
+
+static int ip6ip6_gro_complete(struct sk_buff *skb, int nhoff)
+{
+ skb->encapsulation = 1;
+ skb_shinfo(skb)->gso_type |= SKB_GSO_IPXIP6;
return ipv6_gro_complete(skb, nhoff);
}
+static int ip4ip6_gro_complete(struct sk_buff *skb, int nhoff)
+{
+ skb->encapsulation = 1;
+ skb_shinfo(skb)->gso_type |= SKB_GSO_IPXIP6;
+ return inet_gro_complete(skb, nhoff);
+}
+
static struct packet_offload ipv6_packet_offload __read_mostly = {
.type = cpu_to_be16(ETH_P_IPV6),
.callbacks = {
@@ -315,24 +342,39 @@ static struct packet_offload ipv6_packet_offload __read_mostly = {
static const struct net_offload sit_offload = {
.callbacks = {
.gso_segment = ipv6_gso_segment,
- .gro_receive = sit_gro_receive,
+ .gro_receive = sit_ip6ip6_gro_receive,
.gro_complete = sit_gro_complete,
},
};
+static const struct net_offload ip4ip6_offload = {
+ .callbacks = {
+ .gso_segment = inet_gso_segment,
+ .gro_receive = ip4ip6_gro_receive,
+ .gro_complete = ip4ip6_gro_complete,
+ },
+};
+
+static const struct net_offload ip6ip6_offload = {
+ .callbacks = {
+ .gso_segment = ipv6_gso_segment,
+ .gro_receive = sit_ip6ip6_gro_receive,
+ .gro_complete = ip6ip6_gro_complete,
+ },
+};
static int __init ipv6_offload_init(void)
{
if (tcpv6_offload_init() < 0)
pr_crit("%s: Cannot add TCP protocol offload\n", __func__);
- if (udp_offload_init() < 0)
- pr_crit("%s: Cannot add UDP protocol offload\n", __func__);
if (ipv6_exthdrs_offload_init() < 0)
pr_crit("%s: Cannot add EXTHDRS protocol offload\n", __func__);
dev_add_offload(&ipv6_packet_offload);
inet_add_offload(&sit_offload, IPPROTO_IPV6);
+ inet6_add_offload(&ip6ip6_offload, IPPROTO_IPV6);
+ inet6_add_offload(&ip4ip6_offload, IPPROTO_IPIP);
return 0;
}
diff --git a/net/ipv6/ip6_offload.h b/net/ipv6/ip6_offload.h
index 2e155c651..96b40e41a 100644
--- a/net/ipv6/ip6_offload.h
+++ b/net/ipv6/ip6_offload.h
@@ -12,7 +12,8 @@
#define __ip6_offload_h
int ipv6_exthdrs_offload_init(void);
-int udp_offload_init(void);
+int udpv6_offload_init(void);
+int udpv6_offload_exit(void);
int tcpv6_offload_init(void);
#endif
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index da88de82b..635b8d340 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -395,8 +395,8 @@ int ip6_forward(struct sk_buff *skb)
goto drop;
if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
- IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
- IPSTATS_MIB_INDISCARDS);
+ __IP6_INC_STATS(net, ip6_dst_idev(dst),
+ IPSTATS_MIB_INDISCARDS);
goto drop;
}
@@ -427,8 +427,8 @@ int ip6_forward(struct sk_buff *skb)
/* Force OUTPUT device used as source address */
skb->dev = dst->dev;
icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
- IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
- IPSTATS_MIB_INHDRERRORS);
+ __IP6_INC_STATS(net, ip6_dst_idev(dst),
+ IPSTATS_MIB_INHDRERRORS);
kfree_skb(skb);
return -ETIMEDOUT;
@@ -441,15 +441,15 @@ int ip6_forward(struct sk_buff *skb)
if (proxied > 0)
return ip6_input(skb);
else if (proxied < 0) {
- IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
- IPSTATS_MIB_INDISCARDS);
+ __IP6_INC_STATS(net, ip6_dst_idev(dst),
+ IPSTATS_MIB_INDISCARDS);
goto drop;
}
}
if (!xfrm6_route_forward(skb)) {
- IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
- IPSTATS_MIB_INDISCARDS);
+ __IP6_INC_STATS(net, ip6_dst_idev(dst),
+ IPSTATS_MIB_INDISCARDS);
goto drop;
}
dst = skb_dst(skb);
@@ -505,17 +505,17 @@ int ip6_forward(struct sk_buff *skb)
/* Again, force OUTPUT device used as source address */
skb->dev = dst->dev;
icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
- IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
- IPSTATS_MIB_INTOOBIGERRORS);
- IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
- IPSTATS_MIB_FRAGFAILS);
+ __IP6_INC_STATS(net, ip6_dst_idev(dst),
+ IPSTATS_MIB_INTOOBIGERRORS);
+ __IP6_INC_STATS(net, ip6_dst_idev(dst),
+ IPSTATS_MIB_FRAGFAILS);
kfree_skb(skb);
return -EMSGSIZE;
}
if (skb_cow(skb, dst->dev->hard_header_len)) {
- IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
- IPSTATS_MIB_OUTDISCARDS);
+ __IP6_INC_STATS(net, ip6_dst_idev(dst),
+ IPSTATS_MIB_OUTDISCARDS);
goto drop;
}
@@ -525,14 +525,14 @@ int ip6_forward(struct sk_buff *skb)
hdr->hop_limit--;
- IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
- IP6_ADD_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
+ __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
+ __IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
net, NULL, skb, skb->dev, dst->dev,
ip6_forward_finish);
error:
- IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
+ __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
drop:
kfree_skb(skb);
return -EINVAL;
@@ -1177,12 +1177,12 @@ static void ip6_append_data_mtu(unsigned int *mtu,
}
static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
- struct inet6_cork *v6_cork,
- int hlimit, int tclass, struct ipv6_txoptions *opt,
+ struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
struct rt6_info *rt, struct flowi6 *fl6)
{
struct ipv6_pinfo *np = inet6_sk(sk);
unsigned int mtu;
+ struct ipv6_txoptions *opt = ipc6->opt;
/*
* setup for corking
@@ -1224,8 +1224,8 @@ static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
dst_hold(&rt->dst);
cork->base.dst = &rt->dst;
cork->fl.u.ip6 = *fl6;
- v6_cork->hop_limit = hlimit;
- v6_cork->tclass = tclass;
+ v6_cork->hop_limit = ipc6->hlimit;
+ v6_cork->tclass = ipc6->tclass;
if (rt->dst.flags & DST_XFRM_TUNNEL)
mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
rt->dst.dev->mtu : dst_mtu(&rt->dst);
@@ -1253,7 +1253,8 @@ static int __ip6_append_data(struct sock *sk,
int getfrag(void *from, char *to, int offset,
int len, int odd, struct sk_buff *skb),
void *from, int length, int transhdrlen,
- unsigned int flags, int dontfrag)
+ unsigned int flags, struct ipcm6_cookie *ipc6,
+ const struct sockcm_cookie *sockc)
{
struct sk_buff *skb, *skb_prev = NULL;
unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu;
@@ -1292,7 +1293,7 @@ static int __ip6_append_data(struct sock *sk,
sizeof(struct frag_hdr) : 0) +
rt->rt6i_nfheader_len;
- if (cork->length + length > mtu - headersize && dontfrag &&
+ if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
(sk->sk_protocol == IPPROTO_UDP ||
sk->sk_protocol == IPPROTO_RAW)) {
ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
@@ -1324,7 +1325,7 @@ emsgsize:
csummode = CHECKSUM_PARTIAL;
if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_RAW) {
- sock_tx_timestamp(sk, &tx_flags);
+ sock_tx_timestamp(sk, sockc->tsflags, &tx_flags);
if (tx_flags & SKBTX_ANY_SW_TSTAMP &&
sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
tskey = sk->sk_tskey++;
@@ -1558,9 +1559,10 @@ error:
int ip6_append_data(struct sock *sk,
int getfrag(void *from, char *to, int offset, int len,
int odd, struct sk_buff *skb),
- void *from, int length, int transhdrlen, int hlimit,
- int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6,
- struct rt6_info *rt, unsigned int flags, int dontfrag)
+ void *from, int length, int transhdrlen,
+ struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
+ struct rt6_info *rt, unsigned int flags,
+ const struct sockcm_cookie *sockc)
{
struct inet_sock *inet = inet_sk(sk);
struct ipv6_pinfo *np = inet6_sk(sk);
@@ -1573,12 +1575,12 @@ int ip6_append_data(struct sock *sk,
/*
* setup for corking
*/
- err = ip6_setup_cork(sk, &inet->cork, &np->cork, hlimit,
- tclass, opt, rt, fl6);
+ err = ip6_setup_cork(sk, &inet->cork, &np->cork,
+ ipc6, rt, fl6);
if (err)
return err;
- exthdrlen = (opt ? opt->opt_flen : 0);
+ exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
length += exthdrlen;
transhdrlen += exthdrlen;
} else {
@@ -1588,7 +1590,7 @@ int ip6_append_data(struct sock *sk,
return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
&np->cork, sk_page_frag(sk), getfrag,
- from, length, transhdrlen, flags, dontfrag);
+ from, length, transhdrlen, flags, ipc6, sockc);
}
EXPORT_SYMBOL_GPL(ip6_append_data);
@@ -1744,15 +1746,14 @@ struct sk_buff *ip6_make_skb(struct sock *sk,
int getfrag(void *from, char *to, int offset,
int len, int odd, struct sk_buff *skb),
void *from, int length, int transhdrlen,
- int hlimit, int tclass,
- struct ipv6_txoptions *opt, struct flowi6 *fl6,
+ struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
struct rt6_info *rt, unsigned int flags,
- int dontfrag)
+ const struct sockcm_cookie *sockc)
{
struct inet_cork_full cork;
struct inet6_cork v6_cork;
struct sk_buff_head queue;
- int exthdrlen = (opt ? opt->opt_flen : 0);
+ int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
int err;
if (flags & MSG_PROBE)
@@ -1764,17 +1765,17 @@ struct sk_buff *ip6_make_skb(struct sock *sk,
cork.base.addr = 0;
cork.base.opt = NULL;
v6_cork.opt = NULL;
- err = ip6_setup_cork(sk, &cork, &v6_cork, hlimit, tclass, opt, rt, fl6);
+ err = ip6_setup_cork(sk, &cork, &v6_cork, ipc6, rt, fl6);
if (err)
return ERR_PTR(err);
- if (dontfrag < 0)
- dontfrag = inet6_sk(sk)->dontfrag;
+ if (ipc6->dontfrag < 0)
+ ipc6->dontfrag = inet6_sk(sk)->dontfrag;
err = __ip6_append_data(sk, fl6, &queue, &cork.base, &v6_cork,
&current->task_frag, getfrag, from,
length + exthdrlen, transhdrlen + exthdrlen,
- flags, dontfrag);
+ flags, ipc6, sockc);
if (err) {
__ip6_flush_pending_frames(sk, &queue, &cork, &v6_cork);
return ERR_PTR(err);
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index 1f20345cb..7b0481e37 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -238,6 +238,7 @@ static void ip6_dev_free(struct net_device *dev)
{
struct ip6_tnl *t = netdev_priv(dev);
+ gro_cells_destroy(&t->gro_cells);
dst_cache_destroy(&t->dst_cache);
free_percpu(dev->tstats);
free_netdev(dev);
@@ -753,97 +754,157 @@ int ip6_tnl_rcv_ctl(struct ip6_tnl *t,
}
EXPORT_SYMBOL_GPL(ip6_tnl_rcv_ctl);
-/**
- * ip6_tnl_rcv - decapsulate IPv6 packet and retransmit it locally
- * @skb: received socket buffer
- * @protocol: ethernet protocol ID
- * @dscp_ecn_decapsulate: the function to decapsulate DSCP code and ECN
- *
- * Return: 0
- **/
-
-static int ip6_tnl_rcv(struct sk_buff *skb, __u16 protocol,
- __u8 ipproto,
- int (*dscp_ecn_decapsulate)(const struct ip6_tnl *t,
- const struct ipv6hdr *ipv6h,
- struct sk_buff *skb))
+static int __ip6_tnl_rcv(struct ip6_tnl *tunnel, struct sk_buff *skb,
+ const struct tnl_ptk_info *tpi,
+ struct metadata_dst *tun_dst,
+ int (*dscp_ecn_decapsulate)(const struct ip6_tnl *t,
+ const struct ipv6hdr *ipv6h,
+ struct sk_buff *skb),
+ bool log_ecn_err)
{
- struct ip6_tnl *t;
+ struct pcpu_sw_netstats *tstats;
const struct ipv6hdr *ipv6h = ipv6_hdr(skb);
- u8 tproto;
int err;
- rcu_read_lock();
- t = ip6_tnl_lookup(dev_net(skb->dev), &ipv6h->saddr, &ipv6h->daddr);
- if (t) {
- struct pcpu_sw_netstats *tstats;
+ if ((!(tpi->flags & TUNNEL_CSUM) &&
+ (tunnel->parms.i_flags & TUNNEL_CSUM)) ||
+ ((tpi->flags & TUNNEL_CSUM) &&
+ !(tunnel->parms.i_flags & TUNNEL_CSUM))) {
+ tunnel->dev->stats.rx_crc_errors++;
+ tunnel->dev->stats.rx_errors++;
+ goto drop;
+ }
- tproto = ACCESS_ONCE(t->parms.proto);
- if (tproto != ipproto && tproto != 0) {
- rcu_read_unlock();
- goto discard;
+ if (tunnel->parms.i_flags & TUNNEL_SEQ) {
+ if (!(tpi->flags & TUNNEL_SEQ) ||
+ (tunnel->i_seqno &&
+ (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
+ tunnel->dev->stats.rx_fifo_errors++;
+ tunnel->dev->stats.rx_errors++;
+ goto drop;
}
+ tunnel->i_seqno = ntohl(tpi->seq) + 1;
+ }
- if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) {
- rcu_read_unlock();
- goto discard;
- }
+ skb->protocol = tpi->proto;
- if (!ip6_tnl_rcv_ctl(t, &ipv6h->daddr, &ipv6h->saddr)) {
- t->dev->stats.rx_dropped++;
- rcu_read_unlock();
- goto discard;
+ /* Warning: All skb pointers will be invalidated! */
+ if (tunnel->dev->type == ARPHRD_ETHER) {
+ if (!pskb_may_pull(skb, ETH_HLEN)) {
+ tunnel->dev->stats.rx_length_errors++;
+ tunnel->dev->stats.rx_errors++;
+ goto drop;
}
- skb->mac_header = skb->network_header;
- skb_reset_network_header(skb);
- skb->protocol = htons(protocol);
- memset(skb->cb, 0, sizeof(struct inet6_skb_parm));
-
- __skb_tunnel_rx(skb, t->dev, t->net);
-
- err = dscp_ecn_decapsulate(t, ipv6h, skb);
- if (unlikely(err)) {
- if (log_ecn_error)
- net_info_ratelimited("non-ECT from %pI6 with dsfield=%#x\n",
- &ipv6h->saddr,
- ipv6_get_dsfield(ipv6h));
- if (err > 1) {
- ++t->dev->stats.rx_frame_errors;
- ++t->dev->stats.rx_errors;
- rcu_read_unlock();
- goto discard;
- }
+
+ ipv6h = ipv6_hdr(skb);
+ skb->protocol = eth_type_trans(skb, tunnel->dev);
+ skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
+ } else {
+ skb->dev = tunnel->dev;
+ }
+
+ skb_reset_network_header(skb);
+ memset(skb->cb, 0, sizeof(struct inet6_skb_parm));
+
+ __skb_tunnel_rx(skb, tunnel->dev, tunnel->net);
+
+ err = dscp_ecn_decapsulate(tunnel, ipv6h, skb);
+ if (unlikely(err)) {
+ if (log_ecn_err)
+ net_info_ratelimited("non-ECT from %pI6 with DS=%#x\n",
+ &ipv6h->saddr,
+ ipv6_get_dsfield(ipv6h));
+ if (err > 1) {
+ ++tunnel->dev->stats.rx_frame_errors;
+ ++tunnel->dev->stats.rx_errors;
+ goto drop;
}
+ }
- tstats = this_cpu_ptr(t->dev->tstats);
- u64_stats_update_begin(&tstats->syncp);
- tstats->rx_packets++;
- tstats->rx_bytes += skb->len;
- u64_stats_update_end(&tstats->syncp);
+ tstats = this_cpu_ptr(tunnel->dev->tstats);
+ u64_stats_update_begin(&tstats->syncp);
+ tstats->rx_packets++;
+ tstats->rx_bytes += skb->len;
+ u64_stats_update_end(&tstats->syncp);
- netif_rx(skb);
+ skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));
- rcu_read_unlock();
- return 0;
+ gro_cells_receive(&tunnel->gro_cells, skb);
+ return 0;
+
+drop:
+ kfree_skb(skb);
+ return 0;
+}
+
+int ip6_tnl_rcv(struct ip6_tnl *t, struct sk_buff *skb,
+ const struct tnl_ptk_info *tpi,
+ struct metadata_dst *tun_dst,
+ bool log_ecn_err)
+{
+ return __ip6_tnl_rcv(t, skb, tpi, NULL, ip6ip6_dscp_ecn_decapsulate,
+ log_ecn_err);
+}
+EXPORT_SYMBOL(ip6_tnl_rcv);
+
+static const struct tnl_ptk_info tpi_v6 = {
+ /* no tunnel info required for ipxip6. */
+ .proto = htons(ETH_P_IPV6),
+};
+
+static const struct tnl_ptk_info tpi_v4 = {
+ /* no tunnel info required for ipxip6. */
+ .proto = htons(ETH_P_IP),
+};
+
+static int ipxip6_rcv(struct sk_buff *skb, u8 ipproto,
+ const struct tnl_ptk_info *tpi,
+ int (*dscp_ecn_decapsulate)(const struct ip6_tnl *t,
+ const struct ipv6hdr *ipv6h,
+ struct sk_buff *skb))
+{
+ struct ip6_tnl *t;
+ const struct ipv6hdr *ipv6h = ipv6_hdr(skb);
+ int ret = -1;
+
+ rcu_read_lock();
+ t = ip6_tnl_lookup(dev_net(skb->dev), &ipv6h->saddr, &ipv6h->daddr);
+
+ if (t) {
+ u8 tproto = ACCESS_ONCE(t->parms.proto);
+
+ if (tproto != ipproto && tproto != 0)
+ goto drop;
+ if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb))
+ goto drop;
+ if (!ip6_tnl_rcv_ctl(t, &ipv6h->daddr, &ipv6h->saddr))
+ goto drop;
+ if (iptunnel_pull_header(skb, 0, tpi->proto, false))
+ goto drop;
+ ret = __ip6_tnl_rcv(t, skb, tpi, NULL, dscp_ecn_decapsulate,
+ log_ecn_error);
}
+
rcu_read_unlock();
- return 1;
-discard:
+ return ret;
+
+drop:
+ rcu_read_unlock();
kfree_skb(skb);
return 0;
}
static int ip4ip6_rcv(struct sk_buff *skb)
{
- return ip6_tnl_rcv(skb, ETH_P_IP, IPPROTO_IPIP,
- ip4ip6_dscp_ecn_decapsulate);
+ return ipxip6_rcv(skb, IPPROTO_IPIP, &tpi_v4,
+ ip4ip6_dscp_ecn_decapsulate);
}
static int ip6ip6_rcv(struct sk_buff *skb)
{
- return ip6_tnl_rcv(skb, ETH_P_IPV6, IPPROTO_IPV6,
- ip6ip6_dscp_ecn_decapsulate);
+ return ipxip6_rcv(skb, IPPROTO_IPV6, &tpi_v6,
+ ip6ip6_dscp_ecn_decapsulate);
}
struct ipv6_tel_txoption {
@@ -918,13 +979,14 @@ int ip6_tnl_xmit_ctl(struct ip6_tnl *t,
EXPORT_SYMBOL_GPL(ip6_tnl_xmit_ctl);
/**
- * ip6_tnl_xmit2 - encapsulate packet and send
+ * ip6_tnl_xmit - encapsulate packet and send
* @skb: the outgoing socket buffer
* @dev: the outgoing tunnel device
* @dsfield: dscp code for outer header
- * @fl: flow of tunneled packet
+ * @fl6: flow of tunneled packet
* @encap_limit: encapsulation limit
* @pmtu: Path MTU is stored if packet is too big
+ * @proto: next header value
*
* Description:
* Build new header and do some sanity checks on the packet before sending
@@ -936,12 +998,9 @@ EXPORT_SYMBOL_GPL(ip6_tnl_xmit_ctl);
* %-EMSGSIZE message too big. return mtu in this case.
**/
-static int ip6_tnl_xmit2(struct sk_buff *skb,
- struct net_device *dev,
- __u8 dsfield,
- struct flowi6 *fl6,
- int encap_limit,
- __u32 *pmtu)
+int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield,
+ struct flowi6 *fl6, int encap_limit, __u32 *pmtu,
+ __u8 proto)
{
struct ip6_tnl *t = netdev_priv(dev);
struct net *net = t->net;
@@ -951,8 +1010,8 @@ static int ip6_tnl_xmit2(struct sk_buff *skb,
struct dst_entry *dst = NULL, *ndst = NULL;
struct net_device *tdev;
int mtu;
- unsigned int max_headroom = sizeof(struct ipv6hdr);
- u8 proto;
+ unsigned int psh_hlen = sizeof(struct ipv6hdr) + t->encap_hlen;
+ unsigned int max_headroom = psh_hlen;
int err = -1;
/* NBMA tunnel */
@@ -1005,7 +1064,7 @@ static int ip6_tnl_xmit2(struct sk_buff *skb,
t->parms.name);
goto tx_err_dst_release;
}
- mtu = dst_mtu(dst) - sizeof(*ipv6h);
+ mtu = dst_mtu(dst) - psh_hlen;
if (encap_limit >= 0) {
max_headroom += 8;
mtu -= 8;
@@ -1014,12 +1073,23 @@ static int ip6_tnl_xmit2(struct sk_buff *skb,
mtu = IPV6_MIN_MTU;
if (skb_dst(skb))
skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);
- if (skb->len > mtu) {
+ if (skb->len > mtu && !skb_is_gso(skb)) {
*pmtu = mtu;
err = -EMSGSIZE;
goto tx_err_dst_release;
}
+ if (t->err_count > 0) {
+ if (time_before(jiffies,
+ t->err_time + IP6TUNNEL_ERR_TIMEO)) {
+ t->err_count--;
+
+ dst_link_failure(skb);
+ } else {
+ t->err_count = 0;
+ }
+ }
+
skb_scrub_packet(skb, !net_eq(t->net, dev_net(dev)));
/*
@@ -1045,18 +1115,22 @@ static int ip6_tnl_xmit2(struct sk_buff *skb,
dst_cache_set_ip6(&t->dst_cache, ndst, &fl6->saddr);
skb_dst_set(skb, dst);
- skb->transport_header = skb->network_header;
-
- proto = fl6->flowi6_proto;
if (encap_limit >= 0) {
init_tel_txopt(&opt, encap_limit);
ipv6_push_nfrag_opts(skb, &opt.ops, &proto, NULL);
}
- if (likely(!skb->encapsulation)) {
- skb_reset_inner_headers(skb);
- skb->encapsulation = 1;
- }
+ /* Calculate max headroom for all the headers and adjust
+ * needed_headroom if necessary.
+ */
+ max_headroom = LL_RESERVED_SPACE(dst->dev) + sizeof(struct ipv6hdr)
+ + dst->header_len + t->hlen;
+ if (max_headroom > dev->needed_headroom)
+ dev->needed_headroom = max_headroom;
+
+ err = ip6_tnl_encap(skb, t, &proto, fl6);
+ if (err)
+ return err;
skb_push(skb, sizeof(struct ipv6hdr));
skb_reset_network_header(skb);
@@ -1076,6 +1150,7 @@ tx_err_dst_release:
dst_release(dst);
return err;
}
+EXPORT_SYMBOL(ip6_tnl_xmit);
static inline int
ip4ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev)
@@ -1099,7 +1174,6 @@ ip4ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev)
encap_limit = t->parms.encap_limit;
memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6));
- fl6.flowi6_proto = IPPROTO_IPIP;
dsfield = ipv4_get_dsfield(iph);
@@ -1109,7 +1183,13 @@ ip4ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev)
if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK)
fl6.flowi6_mark = skb->mark;
- err = ip6_tnl_xmit2(skb, dev, dsfield, &fl6, encap_limit, &mtu);
+ if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP6))
+ return -1;
+
+ skb_set_inner_ipproto(skb, IPPROTO_IPIP);
+
+ err = ip6_tnl_xmit(skb, dev, dsfield, &fl6, encap_limit, &mtu,
+ IPPROTO_IPIP);
if (err != 0) {
/* XXX: send ICMP error even if DF is not set. */
if (err == -EMSGSIZE)
@@ -1153,7 +1233,6 @@ ip6ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev)
encap_limit = t->parms.encap_limit;
memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6));
- fl6.flowi6_proto = IPPROTO_IPV6;
dsfield = ipv6_get_dsfield(ipv6h);
if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS)
@@ -1163,7 +1242,13 @@ ip6ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev)
if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK)
fl6.flowi6_mark = skb->mark;
- err = ip6_tnl_xmit2(skb, dev, dsfield, &fl6, encap_limit, &mtu);
+ if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP6))
+ return -1;
+
+ skb_set_inner_ipproto(skb, IPPROTO_IPV6);
+
+ err = ip6_tnl_xmit(skb, dev, dsfield, &fl6, encap_limit, &mtu,
+ IPPROTO_IPV6);
if (err != 0) {
if (err == -EMSGSIZE)
icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
@@ -1174,7 +1259,7 @@ ip6ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev)
}
static netdev_tx_t
-ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev)
+ip6_tnl_start_xmit(struct sk_buff *skb, struct net_device *dev)
{
struct ip6_tnl *t = netdev_priv(dev);
struct net_device_stats *stats = &t->dev->stats;
@@ -1208,6 +1293,7 @@ static void ip6_tnl_link_config(struct ip6_tnl *t)
struct net_device *dev = t->dev;
struct __ip6_tnl_parm *p = &t->parms;
struct flowi6 *fl6 = &t->fl.u.ip6;
+ int t_hlen;
memcpy(dev->dev_addr, &p->laddr, sizeof(struct in6_addr));
memcpy(dev->broadcast, &p->raddr, sizeof(struct in6_addr));
@@ -1231,6 +1317,10 @@ static void ip6_tnl_link_config(struct ip6_tnl *t)
else
dev->flags &= ~IFF_POINTOPOINT;
+ t->tun_hlen = 0;
+ t->hlen = t->encap_hlen + t->tun_hlen;
+ t_hlen = t->hlen + sizeof(struct ipv6hdr);
+
if (p->flags & IP6_TNL_F_CAP_XMIT) {
int strict = (ipv6_addr_type(&p->raddr) &
(IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL));
@@ -1244,9 +1334,9 @@ static void ip6_tnl_link_config(struct ip6_tnl *t)
if (rt->dst.dev) {
dev->hard_header_len = rt->dst.dev->hard_header_len +
- sizeof(struct ipv6hdr);
+ t_hlen;
- dev->mtu = rt->dst.dev->mtu - sizeof(struct ipv6hdr);
+ dev->mtu = rt->dst.dev->mtu - t_hlen;
if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
dev->mtu -= 8;
@@ -1370,6 +1460,8 @@ ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
struct net *net = t->net;
struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id);
+ memset(&p1, 0, sizeof(p1));
+
switch (cmd) {
case SIOCGETTUNNEL:
if (dev == ip6n->fb_tnl_dev) {
@@ -1464,8 +1556,7 @@ ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
* %-EINVAL if mtu too small
**/
-static int
-ip6_tnl_change_mtu(struct net_device *dev, int new_mtu)
+int ip6_tnl_change_mtu(struct net_device *dev, int new_mtu)
{
struct ip6_tnl *tnl = netdev_priv(dev);
@@ -1481,6 +1572,7 @@ ip6_tnl_change_mtu(struct net_device *dev, int new_mtu)
dev->mtu = new_mtu;
return 0;
}
+EXPORT_SYMBOL(ip6_tnl_change_mtu);
int ip6_tnl_get_iflink(const struct net_device *dev)
{
@@ -1490,16 +1582,74 @@ int ip6_tnl_get_iflink(const struct net_device *dev)
}
EXPORT_SYMBOL(ip6_tnl_get_iflink);
+int ip6_tnl_encap_add_ops(const struct ip6_tnl_encap_ops *ops,
+ unsigned int num)
+{
+ if (num >= MAX_IPTUN_ENCAP_OPS)
+ return -ERANGE;
+
+ return !cmpxchg((const struct ip6_tnl_encap_ops **)
+ &ip6tun_encaps[num],
+ NULL, ops) ? 0 : -1;
+}
+EXPORT_SYMBOL(ip6_tnl_encap_add_ops);
+
+int ip6_tnl_encap_del_ops(const struct ip6_tnl_encap_ops *ops,
+ unsigned int num)
+{
+ int ret;
+
+ if (num >= MAX_IPTUN_ENCAP_OPS)
+ return -ERANGE;
+
+ ret = (cmpxchg((const struct ip6_tnl_encap_ops **)
+ &ip6tun_encaps[num],
+ ops, NULL) == ops) ? 0 : -1;
+
+ synchronize_net();
+
+ return ret;
+}
+EXPORT_SYMBOL(ip6_tnl_encap_del_ops);
+
+int ip6_tnl_encap_setup(struct ip6_tnl *t,
+ struct ip_tunnel_encap *ipencap)
+{
+ int hlen;
+
+ memset(&t->encap, 0, sizeof(t->encap));
+
+ hlen = ip6_encap_hlen(ipencap);
+ if (hlen < 0)
+ return hlen;
+
+ t->encap.type = ipencap->type;
+ t->encap.sport = ipencap->sport;
+ t->encap.dport = ipencap->dport;
+ t->encap.flags = ipencap->flags;
+
+ t->encap_hlen = hlen;
+ t->hlen = t->encap_hlen + t->tun_hlen;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(ip6_tnl_encap_setup);
+
static const struct net_device_ops ip6_tnl_netdev_ops = {
.ndo_init = ip6_tnl_dev_init,
.ndo_uninit = ip6_tnl_dev_uninit,
- .ndo_start_xmit = ip6_tnl_xmit,
+ .ndo_start_xmit = ip6_tnl_start_xmit,
.ndo_do_ioctl = ip6_tnl_ioctl,
.ndo_change_mtu = ip6_tnl_change_mtu,
.ndo_get_stats = ip6_get_stats,
.ndo_get_iflink = ip6_tnl_get_iflink,
};
+#define IPXIPX_FEATURES (NETIF_F_SG | \
+ NETIF_F_FRAGLIST | \
+ NETIF_F_HIGHDMA | \
+ NETIF_F_GSO_SOFTWARE | \
+ NETIF_F_HW_CSUM)
/**
* ip6_tnl_dev_setup - setup virtual tunnel device
@@ -1511,20 +1661,18 @@ static const struct net_device_ops ip6_tnl_netdev_ops = {
static void ip6_tnl_dev_setup(struct net_device *dev)
{
- struct ip6_tnl *t;
-
dev->netdev_ops = &ip6_tnl_netdev_ops;
dev->destructor = ip6_dev_free;
dev->type = ARPHRD_TUNNEL6;
- dev->hard_header_len = LL_MAX_HEADER + sizeof(struct ipv6hdr);
- dev->mtu = ETH_DATA_LEN - sizeof(struct ipv6hdr);
- t = netdev_priv(dev);
- if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
- dev->mtu -= 8;
dev->flags |= IFF_NOARP;
dev->addr_len = sizeof(struct in6_addr);
+ dev->features |= NETIF_F_LLTX;
netif_keep_dst(dev);
+
+ dev->features |= IPXIPX_FEATURES;
+ dev->hw_features |= IPXIPX_FEATURES;
+
/* This perm addr will be used as interface identifier by IPv6 */
dev->addr_assign_type = NET_ADDR_RANDOM;
eth_random_addr(dev->perm_addr);
@@ -1541,6 +1689,7 @@ ip6_tnl_dev_init_gen(struct net_device *dev)
{
struct ip6_tnl *t = netdev_priv(dev);
int ret;
+ int t_hlen;
t->dev = dev;
t->net = dev_net(dev);
@@ -1549,13 +1698,32 @@ ip6_tnl_dev_init_gen(struct net_device *dev)
return -ENOMEM;
ret = dst_cache_init(&t->dst_cache, GFP_KERNEL);
- if (ret) {
- free_percpu(dev->tstats);
- dev->tstats = NULL;
- return ret;
- }
+ if (ret)
+ goto free_stats;
+
+ ret = gro_cells_init(&t->gro_cells, dev);
+ if (ret)
+ goto destroy_dst;
+
+ t->tun_hlen = 0;
+ t->hlen = t->encap_hlen + t->tun_hlen;
+ t_hlen = t->hlen + sizeof(struct ipv6hdr);
+
+ dev->type = ARPHRD_TUNNEL6;
+ dev->hard_header_len = LL_MAX_HEADER + t_hlen;
+ dev->mtu = ETH_DATA_LEN - t_hlen;
+ if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
+ dev->mtu -= 8;
return 0;
+
+destroy_dst:
+ dst_cache_destroy(&t->dst_cache);
+free_stats:
+ free_percpu(dev->tstats);
+ dev->tstats = NULL;
+
+ return ret;
}
/**
@@ -1643,13 +1811,55 @@ static void ip6_tnl_netlink_parms(struct nlattr *data[],
parms->proto = nla_get_u8(data[IFLA_IPTUN_PROTO]);
}
+static bool ip6_tnl_netlink_encap_parms(struct nlattr *data[],
+ struct ip_tunnel_encap *ipencap)
+{
+ bool ret = false;
+
+ memset(ipencap, 0, sizeof(*ipencap));
+
+ if (!data)
+ return ret;
+
+ if (data[IFLA_IPTUN_ENCAP_TYPE]) {
+ ret = true;
+ ipencap->type = nla_get_u16(data[IFLA_IPTUN_ENCAP_TYPE]);
+ }
+
+ if (data[IFLA_IPTUN_ENCAP_FLAGS]) {
+ ret = true;
+ ipencap->flags = nla_get_u16(data[IFLA_IPTUN_ENCAP_FLAGS]);
+ }
+
+ if (data[IFLA_IPTUN_ENCAP_SPORT]) {
+ ret = true;
+ ipencap->sport = nla_get_be16(data[IFLA_IPTUN_ENCAP_SPORT]);
+ }
+
+ if (data[IFLA_IPTUN_ENCAP_DPORT]) {
+ ret = true;
+ ipencap->dport = nla_get_be16(data[IFLA_IPTUN_ENCAP_DPORT]);
+ }
+
+ return ret;
+}
+
static int ip6_tnl_newlink(struct net *src_net, struct net_device *dev,
struct nlattr *tb[], struct nlattr *data[])
{
struct net *net = dev_net(dev);
struct ip6_tnl *nt, *t;
+ struct ip_tunnel_encap ipencap;
nt = netdev_priv(dev);
+
+ if (ip6_tnl_netlink_encap_parms(data, &ipencap)) {
+ int err = ip6_tnl_encap_setup(nt, &ipencap);
+
+ if (err < 0)
+ return err;
+ }
+
ip6_tnl_netlink_parms(data, &nt->parms);
t = ip6_tnl_locate(net, &nt->parms, 0);
@@ -1666,10 +1876,17 @@ static int ip6_tnl_changelink(struct net_device *dev, struct nlattr *tb[],
struct __ip6_tnl_parm p;
struct net *net = t->net;
struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id);
+ struct ip_tunnel_encap ipencap;
if (dev == ip6n->fb_tnl_dev)
return -EINVAL;
+ if (ip6_tnl_netlink_encap_parms(data, &ipencap)) {
+ int err = ip6_tnl_encap_setup(t, &ipencap);
+
+ if (err < 0)
+ return err;
+ }
ip6_tnl_netlink_parms(data, &p);
t = ip6_tnl_locate(net, &p, 0);
@@ -1710,6 +1927,14 @@ static size_t ip6_tnl_get_size(const struct net_device *dev)
nla_total_size(4) +
/* IFLA_IPTUN_PROTO */
nla_total_size(1) +
+ /* IFLA_IPTUN_ENCAP_TYPE */
+ nla_total_size(2) +
+ /* IFLA_IPTUN_ENCAP_FLAGS */
+ nla_total_size(2) +
+ /* IFLA_IPTUN_ENCAP_SPORT */
+ nla_total_size(2) +
+ /* IFLA_IPTUN_ENCAP_DPORT */
+ nla_total_size(2) +
0;
}
@@ -1727,6 +1952,17 @@ static int ip6_tnl_fill_info(struct sk_buff *skb, const struct net_device *dev)
nla_put_u32(skb, IFLA_IPTUN_FLAGS, parm->flags) ||
nla_put_u8(skb, IFLA_IPTUN_PROTO, parm->proto))
goto nla_put_failure;
+
+ if (nla_put_u16(skb, IFLA_IPTUN_ENCAP_TYPE,
+ tunnel->encap.type) ||
+ nla_put_be16(skb, IFLA_IPTUN_ENCAP_SPORT,
+ tunnel->encap.sport) ||
+ nla_put_be16(skb, IFLA_IPTUN_ENCAP_DPORT,
+ tunnel->encap.dport) ||
+ nla_put_u16(skb, IFLA_IPTUN_ENCAP_FLAGS,
+ tunnel->encap.flags))
+ goto nla_put_failure;
+
return 0;
nla_put_failure:
@@ -1750,6 +1986,10 @@ static const struct nla_policy ip6_tnl_policy[IFLA_IPTUN_MAX + 1] = {
[IFLA_IPTUN_FLOWINFO] = { .type = NLA_U32 },
[IFLA_IPTUN_FLAGS] = { .type = NLA_U32 },
[IFLA_IPTUN_PROTO] = { .type = NLA_U8 },
+ [IFLA_IPTUN_ENCAP_TYPE] = { .type = NLA_U16 },
+ [IFLA_IPTUN_ENCAP_FLAGS] = { .type = NLA_U16 },
+ [IFLA_IPTUN_ENCAP_SPORT] = { .type = NLA_U16 },
+ [IFLA_IPTUN_ENCAP_DPORT] = { .type = NLA_U16 },
};
static struct rtnl_link_ops ip6_link_ops __read_mostly = {
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index e207cb246..487ef3bc7 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -1985,10 +1985,10 @@ int ip6mr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
static inline int ip6mr_forward2_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
{
- IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)),
- IPSTATS_MIB_OUTFORWDATAGRAMS);
- IP6_ADD_STATS_BH(net, ip6_dst_idev(skb_dst(skb)),
- IPSTATS_MIB_OUTOCTETS, skb->len);
+ __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
+ IPSTATS_MIB_OUTFORWDATAGRAMS);
+ __IP6_ADD_STATS(net, ip6_dst_idev(skb_dst(skb)),
+ IPSTATS_MIB_OUTOCTETS, skb->len);
return dst_output(net, sk, skb);
}
@@ -2269,7 +2269,7 @@ static int __ip6mr_fill_mroute(struct mr6_table *mrt, struct sk_buff *skb,
mfcs.mfcs_packets = c->mfc_un.res.pkt;
mfcs.mfcs_bytes = c->mfc_un.res.bytes;
mfcs.mfcs_wrong_if = c->mfc_un.res.wrong_if;
- if (nla_put(skb, RTA_MFC_STATS, sizeof(mfcs), &mfcs) < 0)
+ if (nla_put_64bit(skb, RTA_MFC_STATS, sizeof(mfcs), &mfcs, RTA_PAD) < 0)
return -EMSGSIZE;
rtm->rtm_type = RTN_MULTICAST;
@@ -2412,7 +2412,7 @@ static int mr6_msgsize(bool unresolved, int maxvif)
+ nla_total_size(0) /* RTA_MULTIPATH */
+ maxvif * NLA_ALIGN(sizeof(struct rtnexthop))
/* RTA_MFC_STATS */
- + nla_total_size(sizeof(struct rta_mfc_stats))
+ + nla_total_size_64bit(sizeof(struct rta_mfc_stats))
;
return len;
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index 4449ad1f8..a9895e15e 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -407,7 +407,8 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
if (optname != IPV6_RTHDR && !ns_capable(net->user_ns, CAP_NET_RAW))
break;
- opt = rcu_dereference_protected(np->opt, sock_owned_by_user(sk));
+ opt = rcu_dereference_protected(np->opt,
+ lockdep_sock_is_held(sk));
opt = ipv6_renew_options(sk, opt, optname,
(struct ipv6_opt_hdr __user *)optval,
optlen);
@@ -471,7 +472,8 @@ sticky_done:
struct ipv6_txoptions *opt = NULL;
struct msghdr msg;
struct flowi6 fl6;
- int junk;
+ struct sockcm_cookie sockc_junk;
+ struct ipcm6_cookie ipc6;
memset(&fl6, 0, sizeof(fl6));
fl6.flowi6_oif = sk->sk_bound_dev_if;
@@ -501,9 +503,9 @@ sticky_done:
msg.msg_controllen = optlen;
msg.msg_control = (void *)(opt+1);
+ ipc6.opt = opt;
- retv = ip6_datagram_send_ctl(net, sk, &msg, &fl6, opt, &junk,
- &junk, &junk);
+ retv = ip6_datagram_send_ctl(net, sk, &msg, &fl6, &ipc6, &sockc_junk);
if (retv)
goto done;
update:
@@ -1123,7 +1125,8 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
struct ipv6_txoptions *opt;
lock_sock(sk);
- opt = rcu_dereference_protected(np->opt, sock_owned_by_user(sk));
+ opt = rcu_dereference_protected(np->opt,
+ lockdep_sock_is_held(sk));
len = ipv6_getsockopt_sticky(sk, opt, optname, optval, len);
release_sock(sk);
/* check if ipv6_getsockopt_sticky() returns err code */
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 9021b4355..63e06c3dd 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -39,34 +39,12 @@ MODULE_LICENSE("GPL");
MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
MODULE_DESCRIPTION("IPv6 packet filter");
-/*#define DEBUG_IP_FIREWALL*/
-/*#define DEBUG_ALLOW_ALL*/ /* Useful for remote debugging */
-/*#define DEBUG_IP_FIREWALL_USER*/
-
-#ifdef DEBUG_IP_FIREWALL
-#define dprintf(format, args...) pr_info(format , ## args)
-#else
-#define dprintf(format, args...)
-#endif
-
-#ifdef DEBUG_IP_FIREWALL_USER
-#define duprintf(format, args...) pr_info(format , ## args)
-#else
-#define duprintf(format, args...)
-#endif
-
#ifdef CONFIG_NETFILTER_DEBUG
#define IP_NF_ASSERT(x) WARN_ON(!(x))
#else
#define IP_NF_ASSERT(x)
#endif
-#if 0
-/* All the better to debug you with... */
-#define static
-#define inline
-#endif
-
void *ip6t_alloc_initial_table(const struct xt_table *info)
{
return xt_alloc_initial_table(ip6t, IP6T);
@@ -100,35 +78,18 @@ ip6_packet_match(const struct sk_buff *skb,
if (FWINV(ipv6_masked_addr_cmp(&ipv6->saddr, &ip6info->smsk,
&ip6info->src), IP6T_INV_SRCIP) ||
FWINV(ipv6_masked_addr_cmp(&ipv6->daddr, &ip6info->dmsk,
- &ip6info->dst), IP6T_INV_DSTIP)) {
- dprintf("Source or dest mismatch.\n");
-/*
- dprintf("SRC: %u. Mask: %u. Target: %u.%s\n", ip->saddr,
- ipinfo->smsk.s_addr, ipinfo->src.s_addr,
- ipinfo->invflags & IP6T_INV_SRCIP ? " (INV)" : "");
- dprintf("DST: %u. Mask: %u. Target: %u.%s\n", ip->daddr,
- ipinfo->dmsk.s_addr, ipinfo->dst.s_addr,
- ipinfo->invflags & IP6T_INV_DSTIP ? " (INV)" : "");*/
+ &ip6info->dst), IP6T_INV_DSTIP))
return false;
- }
ret = ifname_compare_aligned(indev, ip6info->iniface, ip6info->iniface_mask);
- if (FWINV(ret != 0, IP6T_INV_VIA_IN)) {
- dprintf("VIA in mismatch (%s vs %s).%s\n",
- indev, ip6info->iniface,
- ip6info->invflags & IP6T_INV_VIA_IN ? " (INV)" : "");
+ if (FWINV(ret != 0, IP6T_INV_VIA_IN))
return false;
- }
ret = ifname_compare_aligned(outdev, ip6info->outiface, ip6info->outiface_mask);
- if (FWINV(ret != 0, IP6T_INV_VIA_OUT)) {
- dprintf("VIA out mismatch (%s vs %s).%s\n",
- outdev, ip6info->outiface,
- ip6info->invflags & IP6T_INV_VIA_OUT ? " (INV)" : "");
+ if (FWINV(ret != 0, IP6T_INV_VIA_OUT))
return false;
- }
/* ... might want to do something with class and flowlabel here ... */
@@ -145,11 +106,6 @@ ip6_packet_match(const struct sk_buff *skb,
}
*fragoff = _frag_off;
- dprintf("Packet protocol %hi ?= %s%hi.\n",
- protohdr,
- ip6info->invflags & IP6T_INV_PROTO ? "!":"",
- ip6info->proto);
-
if (ip6info->proto == protohdr) {
if (ip6info->invflags & IP6T_INV_PROTO)
return false;
@@ -169,16 +125,11 @@ ip6_packet_match(const struct sk_buff *skb,
static bool
ip6_checkentry(const struct ip6t_ip6 *ipv6)
{
- if (ipv6->flags & ~IP6T_F_MASK) {
- duprintf("Unknown flag bits set: %08X\n",
- ipv6->flags & ~IP6T_F_MASK);
+ if (ipv6->flags & ~IP6T_F_MASK)
return false;
- }
- if (ipv6->invflags & ~IP6T_INV_MASK) {
- duprintf("Unknown invflag bits set: %08X\n",
- ipv6->invflags & ~IP6T_INV_MASK);
+ if (ipv6->invflags & ~IP6T_INV_MASK)
return false;
- }
+
return true;
}
@@ -446,13 +397,9 @@ ip6t_do_table(struct sk_buff *skb,
xt_write_recseq_end(addend);
local_bh_enable();
-#ifdef DEBUG_ALLOW_ALL
- return NF_ACCEPT;
-#else
if (acpar.hotdrop)
return NF_DROP;
else return verdict;
-#endif
}
static bool find_jump_target(const struct xt_table_info *t,
@@ -492,11 +439,9 @@ mark_source_chains(const struct xt_table_info *newinfo,
= (void *)ip6t_get_target_c(e);
int visited = e->comefrom & (1 << hook);
- if (e->comefrom & (1 << NF_INET_NUMHOOKS)) {
- pr_err("iptables: loop hook %u pos %u %08X.\n",
- hook, pos, e->comefrom);
+ if (e->comefrom & (1 << NF_INET_NUMHOOKS))
return 0;
- }
+
e->comefrom |= ((1 << hook) | (1 << NF_INET_NUMHOOKS));
/* Unconditional return/END. */
@@ -508,26 +453,13 @@ mark_source_chains(const struct xt_table_info *newinfo,
if ((strcmp(t->target.u.user.name,
XT_STANDARD_TARGET) == 0) &&
- t->verdict < -NF_MAX_VERDICT - 1) {
- duprintf("mark_source_chains: bad "
- "negative verdict (%i)\n",
- t->verdict);
+ t->verdict < -NF_MAX_VERDICT - 1)
return 0;
- }
/* Return: backtrack through the last
big jump. */
do {
e->comefrom ^= (1<<NF_INET_NUMHOOKS);
-#ifdef DEBUG_IP_FIREWALL_USER
- if (e->comefrom
- & (1 << NF_INET_NUMHOOKS)) {
- duprintf("Back unset "
- "on hook %u "
- "rule %u\n",
- hook, pos);
- }
-#endif
oldpos = pos;
pos = e->counters.pcnt;
e->counters.pcnt = 0;
@@ -554,16 +486,7 @@ mark_source_chains(const struct xt_table_info *newinfo,
if (strcmp(t->target.u.user.name,
XT_STANDARD_TARGET) == 0 &&
newpos >= 0) {
- if (newpos > newinfo->size -
- sizeof(struct ip6t_entry)) {
- duprintf("mark_source_chains: "
- "bad verdict (%i)\n",
- newpos);
- return 0;
- }
/* This a jump; chase it. */
- duprintf("Jump rule %u -> %u\n",
- pos, newpos);
e = (struct ip6t_entry *)
(entry0 + newpos);
if (!find_jump_target(newinfo, e))
@@ -580,8 +503,7 @@ mark_source_chains(const struct xt_table_info *newinfo,
pos = newpos;
}
}
-next:
- duprintf("Finished chain %u\n", hook);
+next: ;
}
return 1;
}
@@ -602,19 +524,12 @@ static void cleanup_match(struct xt_entry_match *m, struct net *net)
static int check_match(struct xt_entry_match *m, struct xt_mtchk_param *par)
{
const struct ip6t_ip6 *ipv6 = par->entryinfo;
- int ret;
par->match = m->u.kernel.match;
par->matchinfo = m->data;
- ret = xt_check_match(par, m->u.match_size - sizeof(*m),
- ipv6->proto, ipv6->invflags & IP6T_INV_PROTO);
- if (ret < 0) {
- duprintf("ip_tables: check failed for `%s'.\n",
- par.match->name);
- return ret;
- }
- return 0;
+ return xt_check_match(par, m->u.match_size - sizeof(*m),
+ ipv6->proto, ipv6->invflags & IP6T_INV_PROTO);
}
static int
@@ -625,10 +540,9 @@ find_check_match(struct xt_entry_match *m, struct xt_mtchk_param *par)
match = xt_request_find_match(NFPROTO_IPV6, m->u.user.name,
m->u.user.revision);
- if (IS_ERR(match)) {
- duprintf("find_check_match: `%s' not found\n", m->u.user.name);
+ if (IS_ERR(match))
return PTR_ERR(match);
- }
+
m->u.kernel.match = match;
ret = check_match(m, par);
@@ -653,17 +567,11 @@ static int check_target(struct ip6t_entry *e, struct net *net, const char *name)
.hook_mask = e->comefrom,
.family = NFPROTO_IPV6,
};
- int ret;
t = ip6t_get_target(e);
- ret = xt_check_target(&par, t->u.target_size - sizeof(*t),
- e->ipv6.proto, e->ipv6.invflags & IP6T_INV_PROTO);
- if (ret < 0) {
- duprintf("ip_tables: check failed for `%s'.\n",
- t->u.kernel.target->name);
- return ret;
- }
- return 0;
+ return xt_check_target(&par, t->u.target_size - sizeof(*t),
+ e->ipv6.proto,
+ e->ipv6.invflags & IP6T_INV_PROTO);
}
static int
@@ -676,10 +584,12 @@ find_check_entry(struct ip6t_entry *e, struct net *net, const char *name,
unsigned int j;
struct xt_mtchk_param mtpar;
struct xt_entry_match *ematch;
+ unsigned long pcnt;
- e->counters.pcnt = xt_percpu_counter_alloc();
- if (IS_ERR_VALUE(e->counters.pcnt))
+ pcnt = xt_percpu_counter_alloc();
+ if (IS_ERR_VALUE(pcnt))
return -ENOMEM;
+ e->counters.pcnt = pcnt;
j = 0;
mtpar.net = net;
@@ -698,7 +608,6 @@ find_check_entry(struct ip6t_entry *e, struct net *net, const char *name,
target = xt_request_find_target(NFPROTO_IPV6, t->u.user.name,
t->u.user.revision);
if (IS_ERR(target)) {
- duprintf("find_check_entry: `%s' not found\n", t->u.user.name);
ret = PTR_ERR(target);
goto cleanup_matches;
}
@@ -751,17 +660,12 @@ check_entry_size_and_hooks(struct ip6t_entry *e,
if ((unsigned long)e % __alignof__(struct ip6t_entry) != 0 ||
(unsigned char *)e + sizeof(struct ip6t_entry) >= limit ||
- (unsigned char *)e + e->next_offset > limit) {
- duprintf("Bad offset %p\n", e);
+ (unsigned char *)e + e->next_offset > limit)
return -EINVAL;
- }
if (e->next_offset
- < sizeof(struct ip6t_entry) + sizeof(struct xt_entry_target)) {
- duprintf("checking: element %p size %u\n",
- e, e->next_offset);
+ < sizeof(struct ip6t_entry) + sizeof(struct xt_entry_target))
return -EINVAL;
- }
if (!ip6_checkentry(&e->ipv6))
return -EINVAL;
@@ -778,12 +682,9 @@ check_entry_size_and_hooks(struct ip6t_entry *e,
if ((unsigned char *)e - base == hook_entries[h])
newinfo->hook_entry[h] = hook_entries[h];
if ((unsigned char *)e - base == underflows[h]) {
- if (!check_underflow(e)) {
- pr_debug("Underflows must be unconditional and "
- "use the STANDARD target with "
- "ACCEPT/DROP\n");
+ if (!check_underflow(e))
return -EINVAL;
- }
+
newinfo->underflow[h] = underflows[h];
}
}
@@ -835,7 +736,6 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0,
newinfo->underflow[i] = 0xFFFFFFFF;
}
- duprintf("translate_table: size %u\n", newinfo->size);
i = 0;
/* Walk through entries, checking offsets. */
xt_entry_foreach(iter, entry0, newinfo->size) {
@@ -852,27 +752,18 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0,
++newinfo->stacksize;
}
- if (i != repl->num_entries) {
- duprintf("translate_table: %u not %u entries\n",
- i, repl->num_entries);
+ if (i != repl->num_entries)
return -EINVAL;
- }
/* Check hooks all assigned */
for (i = 0; i < NF_INET_NUMHOOKS; i++) {
/* Only hooks which are valid */
if (!(repl->valid_hooks & (1 << i)))
continue;
- if (newinfo->hook_entry[i] == 0xFFFFFFFF) {
- duprintf("Invalid hook entry %u %u\n",
- i, repl->hook_entry[i]);
+ if (newinfo->hook_entry[i] == 0xFFFFFFFF)
return -EINVAL;
- }
- if (newinfo->underflow[i] == 0xFFFFFFFF) {
- duprintf("Invalid underflow %u %u\n",
- i, repl->underflow[i]);
+ if (newinfo->underflow[i] == 0xFFFFFFFF)
return -EINVAL;
- }
}
if (!mark_source_chains(newinfo, repl->valid_hooks, entry0))
@@ -1100,11 +991,8 @@ static int get_info(struct net *net, void __user *user,
struct xt_table *t;
int ret;
- if (*len != sizeof(struct ip6t_getinfo)) {
- duprintf("length %u != %zu\n", *len,
- sizeof(struct ip6t_getinfo));
+ if (*len != sizeof(struct ip6t_getinfo))
return -EINVAL;
- }
if (copy_from_user(name, user, sizeof(name)) != 0)
return -EFAULT;
@@ -1162,31 +1050,24 @@ get_entries(struct net *net, struct ip6t_get_entries __user *uptr,
struct ip6t_get_entries get;
struct xt_table *t;
- if (*len < sizeof(get)) {
- duprintf("get_entries: %u < %zu\n", *len, sizeof(get));
+ if (*len < sizeof(get))
return -EINVAL;
- }
if (copy_from_user(&get, uptr, sizeof(get)) != 0)
return -EFAULT;
- if (*len != sizeof(struct ip6t_get_entries) + get.size) {
- duprintf("get_entries: %u != %zu\n",
- *len, sizeof(get) + get.size);
+ if (*len != sizeof(struct ip6t_get_entries) + get.size)
return -EINVAL;
- }
+
get.name[sizeof(get.name) - 1] = '\0';
t = xt_find_table_lock(net, AF_INET6, get.name);
if (!IS_ERR_OR_NULL(t)) {
struct xt_table_info *private = t->private;
- duprintf("t->private->number = %u\n", private->number);
if (get.size == private->size)
ret = copy_entries_to_user(private->size,
t, uptr->entrytable);
- else {
- duprintf("get_entries: I've got %u not %u!\n",
- private->size, get.size);
+ else
ret = -EAGAIN;
- }
+
module_put(t->me);
xt_table_unlock(t);
} else
@@ -1222,8 +1103,6 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks,
/* You lied! */
if (valid_hooks != t->valid_hooks) {
- duprintf("Valid hook crap: %08X vs %08X\n",
- valid_hooks, t->valid_hooks);
ret = -EINVAL;
goto put_module;
}
@@ -1233,8 +1112,6 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks,
goto put_module;
/* Update module usage count based on number of rules */
- duprintf("do_replace: oldnum=%u, initnum=%u, newnum=%u\n",
- oldinfo->number, oldinfo->initial_entries, newinfo->number);
if ((oldinfo->number > oldinfo->initial_entries) ||
(newinfo->number <= oldinfo->initial_entries))
module_put(t->me);
@@ -1303,8 +1180,6 @@ do_replace(struct net *net, const void __user *user, unsigned int len)
if (ret != 0)
goto free_newinfo;
- duprintf("ip_tables: Translated table\n");
-
ret = __do_replace(net, tmp.name, tmp.valid_hooks, newinfo,
tmp.num_counters, tmp.counters);
if (ret)
@@ -1429,11 +1304,9 @@ compat_find_calc_match(struct xt_entry_match *m,
match = xt_request_find_match(NFPROTO_IPV6, m->u.user.name,
m->u.user.revision);
- if (IS_ERR(match)) {
- duprintf("compat_check_calc_match: `%s' not found\n",
- m->u.user.name);
+ if (IS_ERR(match))
return PTR_ERR(match);
- }
+
m->u.kernel.match = match;
*size += xt_compat_match_offset(match);
return 0;
@@ -1465,20 +1338,14 @@ check_compat_entry_size_and_hooks(struct compat_ip6t_entry *e,
unsigned int j;
int ret, off;
- duprintf("check_compat_entry_size_and_hooks %p\n", e);
if ((unsigned long)e % __alignof__(struct compat_ip6t_entry) != 0 ||
(unsigned char *)e + sizeof(struct compat_ip6t_entry) >= limit ||
- (unsigned char *)e + e->next_offset > limit) {
- duprintf("Bad offset %p, limit = %p\n", e, limit);
+ (unsigned char *)e + e->next_offset > limit)
return -EINVAL;
- }
if (e->next_offset < sizeof(struct compat_ip6t_entry) +
- sizeof(struct compat_xt_entry_target)) {
- duprintf("checking: element %p size %u\n",
- e, e->next_offset);
+ sizeof(struct compat_xt_entry_target))
return -EINVAL;
- }
if (!ip6_checkentry(&e->ipv6))
return -EINVAL;
@@ -1502,8 +1369,6 @@ check_compat_entry_size_and_hooks(struct compat_ip6t_entry *e,
target = xt_request_find_target(NFPROTO_IPV6, t->u.user.name,
t->u.user.revision);
if (IS_ERR(target)) {
- duprintf("check_compat_entry_size_and_hooks: `%s' not found\n",
- t->u.user.name);
ret = PTR_ERR(target);
goto release_matches;
}
@@ -1582,7 +1447,6 @@ translate_compat_table(struct net *net,
size = compatr->size;
info->number = compatr->num_entries;
- duprintf("translate_compat_table: size %u\n", info->size);
j = 0;
xt_compat_lock(AF_INET6);
xt_compat_init_offsets(AF_INET6, compatr->num_entries);
@@ -1597,11 +1461,8 @@ translate_compat_table(struct net *net,
}
ret = -EINVAL;
- if (j != compatr->num_entries) {
- duprintf("translate_compat_table: %u not %u entries\n",
- j, compatr->num_entries);
+ if (j != compatr->num_entries)
goto out_unlock;
- }
ret = -ENOMEM;
newinfo = xt_alloc_table_info(size);
@@ -1670,8 +1531,6 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len)
return -EFAULT;
/* overflow check */
- if (tmp.size >= INT_MAX / num_possible_cpus())
- return -ENOMEM;
if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters))
return -ENOMEM;
if (tmp.num_counters == 0)
@@ -1694,8 +1553,6 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len)
if (ret != 0)
goto free_newinfo;
- duprintf("compat_do_replace: Translated table\n");
-
ret = __do_replace(net, tmp.name, tmp.valid_hooks, newinfo,
tmp.num_counters, compat_ptr(tmp.counters));
if (ret)
@@ -1729,7 +1586,6 @@ compat_do_ip6t_set_ctl(struct sock *sk, int cmd, void __user *user,
break;
default:
- duprintf("do_ip6t_set_ctl: unknown request %i\n", cmd);
ret = -EINVAL;
}
@@ -1779,19 +1635,15 @@ compat_get_entries(struct net *net, struct compat_ip6t_get_entries __user *uptr,
struct compat_ip6t_get_entries get;
struct xt_table *t;
- if (*len < sizeof(get)) {
- duprintf("compat_get_entries: %u < %zu\n", *len, sizeof(get));
+ if (*len < sizeof(get))
return -EINVAL;
- }
if (copy_from_user(&get, uptr, sizeof(get)) != 0)
return -EFAULT;
- if (*len != sizeof(struct compat_ip6t_get_entries) + get.size) {
- duprintf("compat_get_entries: %u != %zu\n",
- *len, sizeof(get) + get.size);
+ if (*len != sizeof(struct compat_ip6t_get_entries) + get.size)
return -EINVAL;
- }
+
get.name[sizeof(get.name) - 1] = '\0';
xt_compat_lock(AF_INET6);
@@ -1799,16 +1651,13 @@ compat_get_entries(struct net *net, struct compat_ip6t_get_entries __user *uptr,
if (!IS_ERR_OR_NULL(t)) {
const struct xt_table_info *private = t->private;
struct xt_table_info info;
- duprintf("t->private->number = %u\n", private->number);
ret = compat_table_info(private, &info);
- if (!ret && get.size == info.size) {
+ if (!ret && get.size == info.size)
ret = compat_copy_entries_to_user(private->size,
t, uptr->entrytable);
- } else if (!ret) {
- duprintf("compat_get_entries: I've got %u not %u!\n",
- private->size, get.size);
+ else if (!ret)
ret = -EAGAIN;
- }
+
xt_compat_flush_offsets(AF_INET6);
module_put(t->me);
xt_table_unlock(t);
@@ -1861,7 +1710,6 @@ do_ip6t_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
break;
default:
- duprintf("do_ip6t_set_ctl: unknown request %i\n", cmd);
ret = -EINVAL;
}
@@ -1913,7 +1761,6 @@ do_ip6t_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
}
default:
- duprintf("do_ip6t_get_ctl: unknown request %i\n", cmd);
ret = -EINVAL;
}
@@ -2015,7 +1862,6 @@ icmp6_match(const struct sk_buff *skb, struct xt_action_param *par)
/* We've been asked to examine this packet, and we
* can't. Hence, no choice but to drop.
*/
- duprintf("Dropping evil ICMP tinygram.\n");
par->hotdrop = true;
return false;
}
diff --git a/net/ipv6/netfilter/ip6t_SYNPROXY.c b/net/ipv6/netfilter/ip6t_SYNPROXY.c
index 3deed5860..06bed74cf 100644
--- a/net/ipv6/netfilter/ip6t_SYNPROXY.c
+++ b/net/ipv6/netfilter/ip6t_SYNPROXY.c
@@ -20,15 +20,16 @@
#include <net/netfilter/nf_conntrack_synproxy.h>
static struct ipv6hdr *
-synproxy_build_ip(struct sk_buff *skb, const struct in6_addr *saddr,
- const struct in6_addr *daddr)
+synproxy_build_ip(struct net *net, struct sk_buff *skb,
+ const struct in6_addr *saddr,
+ const struct in6_addr *daddr)
{
struct ipv6hdr *iph;
skb_reset_network_header(skb);
iph = (struct ipv6hdr *)skb_put(skb, sizeof(*iph));
ip6_flow_hdr(iph, 0, 0);
- iph->hop_limit = 64; //XXX
+ iph->hop_limit = net->ipv6.devconf_all->hop_limit;
iph->nexthdr = IPPROTO_TCP;
iph->saddr = *saddr;
iph->daddr = *daddr;
@@ -37,13 +38,12 @@ synproxy_build_ip(struct sk_buff *skb, const struct in6_addr *saddr,
}
static void
-synproxy_send_tcp(const struct synproxy_net *snet,
+synproxy_send_tcp(struct net *net,
const struct sk_buff *skb, struct sk_buff *nskb,
struct nf_conntrack *nfct, enum ip_conntrack_info ctinfo,
struct ipv6hdr *niph, struct tcphdr *nth,
unsigned int tcp_hdr_size)
{
- struct net *net = nf_ct_net(snet->tmpl);
struct dst_entry *dst;
struct flowi6 fl6;
@@ -60,7 +60,7 @@ synproxy_send_tcp(const struct synproxy_net *snet,
fl6.fl6_dport = nth->dest;
security_skb_classify_flow((struct sk_buff *)skb, flowi6_to_flowi(&fl6));
dst = ip6_route_output(net, NULL, &fl6);
- if (dst == NULL || dst->error) {
+ if (dst->error) {
dst_release(dst);
goto free_nskb;
}
@@ -84,7 +84,7 @@ free_nskb:
}
static void
-synproxy_send_client_synack(const struct synproxy_net *snet,
+synproxy_send_client_synack(struct net *net,
const struct sk_buff *skb, const struct tcphdr *th,
const struct synproxy_options *opts)
{
@@ -103,7 +103,7 @@ synproxy_send_client_synack(const struct synproxy_net *snet,
return;
skb_reserve(nskb, MAX_TCP_HEADER);
- niph = synproxy_build_ip(nskb, &iph->daddr, &iph->saddr);
+ niph = synproxy_build_ip(net, nskb, &iph->daddr, &iph->saddr);
skb_reset_transport_header(nskb);
nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size);
@@ -121,15 +121,16 @@ synproxy_send_client_synack(const struct synproxy_net *snet,
synproxy_build_options(nth, opts);
- synproxy_send_tcp(snet, skb, nskb, skb->nfct, IP_CT_ESTABLISHED_REPLY,
+ synproxy_send_tcp(net, skb, nskb, skb->nfct, IP_CT_ESTABLISHED_REPLY,
niph, nth, tcp_hdr_size);
}
static void
-synproxy_send_server_syn(const struct synproxy_net *snet,
+synproxy_send_server_syn(struct net *net,
const struct sk_buff *skb, const struct tcphdr *th,
const struct synproxy_options *opts, u32 recv_seq)
{
+ struct synproxy_net *snet = synproxy_pernet(net);
struct sk_buff *nskb;
struct ipv6hdr *iph, *niph;
struct tcphdr *nth;
@@ -144,7 +145,7 @@ synproxy_send_server_syn(const struct synproxy_net *snet,
return;
skb_reserve(nskb, MAX_TCP_HEADER);
- niph = synproxy_build_ip(nskb, &iph->saddr, &iph->daddr);
+ niph = synproxy_build_ip(net, nskb, &iph->saddr, &iph->daddr);
skb_reset_transport_header(nskb);
nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size);
@@ -165,12 +166,12 @@ synproxy_send_server_syn(const struct synproxy_net *snet,
synproxy_build_options(nth, opts);
- synproxy_send_tcp(snet, skb, nskb, &snet->tmpl->ct_general, IP_CT_NEW,
+ synproxy_send_tcp(net, skb, nskb, &snet->tmpl->ct_general, IP_CT_NEW,
niph, nth, tcp_hdr_size);
}
static void
-synproxy_send_server_ack(const struct synproxy_net *snet,
+synproxy_send_server_ack(struct net *net,
const struct ip_ct_tcp *state,
const struct sk_buff *skb, const struct tcphdr *th,
const struct synproxy_options *opts)
@@ -189,7 +190,7 @@ synproxy_send_server_ack(const struct synproxy_net *snet,
return;
skb_reserve(nskb, MAX_TCP_HEADER);
- niph = synproxy_build_ip(nskb, &iph->daddr, &iph->saddr);
+ niph = synproxy_build_ip(net, nskb, &iph->daddr, &iph->saddr);
skb_reset_transport_header(nskb);
nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size);
@@ -205,11 +206,11 @@ synproxy_send_server_ack(const struct synproxy_net *snet,
synproxy_build_options(nth, opts);
- synproxy_send_tcp(snet, skb, nskb, NULL, 0, niph, nth, tcp_hdr_size);
+ synproxy_send_tcp(net, skb, nskb, NULL, 0, niph, nth, tcp_hdr_size);
}
static void
-synproxy_send_client_ack(const struct synproxy_net *snet,
+synproxy_send_client_ack(struct net *net,
const struct sk_buff *skb, const struct tcphdr *th,
const struct synproxy_options *opts)
{
@@ -227,7 +228,7 @@ synproxy_send_client_ack(const struct synproxy_net *snet,
return;
skb_reserve(nskb, MAX_TCP_HEADER);
- niph = synproxy_build_ip(nskb, &iph->saddr, &iph->daddr);
+ niph = synproxy_build_ip(net, nskb, &iph->saddr, &iph->daddr);
skb_reset_transport_header(nskb);
nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size);
@@ -243,15 +244,16 @@ synproxy_send_client_ack(const struct synproxy_net *snet,
synproxy_build_options(nth, opts);
- synproxy_send_tcp(snet, skb, nskb, skb->nfct, IP_CT_ESTABLISHED_REPLY,
+ synproxy_send_tcp(net, skb, nskb, skb->nfct, IP_CT_ESTABLISHED_REPLY,
niph, nth, tcp_hdr_size);
}
static bool
-synproxy_recv_client_ack(const struct synproxy_net *snet,
+synproxy_recv_client_ack(struct net *net,
const struct sk_buff *skb, const struct tcphdr *th,
struct synproxy_options *opts, u32 recv_seq)
{
+ struct synproxy_net *snet = synproxy_pernet(net);
int mss;
mss = __cookie_v6_check(ipv6_hdr(skb), th, ntohl(th->ack_seq) - 1);
@@ -267,7 +269,7 @@ synproxy_recv_client_ack(const struct synproxy_net *snet,
if (opts->options & XT_SYNPROXY_OPT_TIMESTAMP)
synproxy_check_timestamp_cookie(opts);
- synproxy_send_server_syn(snet, skb, th, opts, recv_seq);
+ synproxy_send_server_syn(net, skb, th, opts, recv_seq);
return true;
}
@@ -275,7 +277,8 @@ static unsigned int
synproxy_tg6(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct xt_synproxy_info *info = par->targinfo;
- struct synproxy_net *snet = synproxy_pernet(par->net);
+ struct net *net = par->net;
+ struct synproxy_net *snet = synproxy_pernet(net);
struct synproxy_options opts = {};
struct tcphdr *th, _th;
@@ -304,12 +307,12 @@ synproxy_tg6(struct sk_buff *skb, const struct xt_action_param *par)
XT_SYNPROXY_OPT_SACK_PERM |
XT_SYNPROXY_OPT_ECN);
- synproxy_send_client_synack(snet, skb, th, &opts);
+ synproxy_send_client_synack(net, skb, th, &opts);
return NF_DROP;
} else if (th->ack && !(th->fin || th->rst || th->syn)) {
/* ACK from client */
- synproxy_recv_client_ack(snet, skb, th, &opts, ntohl(th->seq));
+ synproxy_recv_client_ack(net, skb, th, &opts, ntohl(th->seq));
return NF_DROP;
}
@@ -320,7 +323,8 @@ static unsigned int ipv6_synproxy_hook(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *nhs)
{
- struct synproxy_net *snet = synproxy_pernet(nhs->net);
+ struct net *net = nhs->net;
+ struct synproxy_net *snet = synproxy_pernet(net);
enum ip_conntrack_info ctinfo;
struct nf_conn *ct;
struct nf_conn_synproxy *synproxy;
@@ -384,7 +388,7 @@ static unsigned int ipv6_synproxy_hook(void *priv,
* therefore we need to add 1 to make the SYN sequence
* number match the one of first SYN.
*/
- if (synproxy_recv_client_ack(snet, skb, th, &opts,
+ if (synproxy_recv_client_ack(net, skb, th, &opts,
ntohl(th->seq) + 1))
this_cpu_inc(snet->stats->cookie_retrans);
@@ -410,12 +414,12 @@ static unsigned int ipv6_synproxy_hook(void *priv,
XT_SYNPROXY_OPT_SACK_PERM);
swap(opts.tsval, opts.tsecr);
- synproxy_send_server_ack(snet, state, skb, th, &opts);
+ synproxy_send_server_ack(net, state, skb, th, &opts);
nf_ct_seqadj_init(ct, ctinfo, synproxy->isn - ntohl(th->seq));
swap(opts.tsval, opts.tsecr);
- synproxy_send_client_ack(snet, skb, th, &opts);
+ synproxy_send_client_ack(net, skb, th, &opts);
consume_skb(skb);
return NF_STOLEN;
diff --git a/net/ipv6/netfilter/nf_dup_ipv6.c b/net/ipv6/netfilter/nf_dup_ipv6.c
index 6989c70ae..4a84b5ad9 100644
--- a/net/ipv6/netfilter/nf_dup_ipv6.c
+++ b/net/ipv6/netfilter/nf_dup_ipv6.c
@@ -33,6 +33,7 @@ static bool nf_dup_ipv6_route(struct net *net, struct sk_buff *skb,
fl6.daddr = *gw;
fl6.flowlabel = (__force __be32)(((iph->flow_lbl[0] & 0xF) << 16) |
(iph->flow_lbl[1] << 8) | iph->flow_lbl[2]);
+ fl6.flowi6_flags = FLOWI_FLAG_KNOWN_NH;
dst = ip6_route_output(net, NULL, &fl6);
if (dst->error) {
dst_release(dst);
diff --git a/net/ipv6/netfilter/nf_reject_ipv6.c b/net/ipv6/netfilter/nf_reject_ipv6.c
index 4709f657b..a5400223f 100644
--- a/net/ipv6/netfilter/nf_reject_ipv6.c
+++ b/net/ipv6/netfilter/nf_reject_ipv6.c
@@ -158,7 +158,7 @@ void nf_send_reset6(struct net *net, struct sk_buff *oldskb, int hook)
fl6.fl6_dport = otcph->source;
security_skb_classify_flow(oldskb, flowi6_to_flowi(&fl6));
dst = ip6_route_output(net, NULL, &fl6);
- if (dst == NULL || dst->error) {
+ if (dst->error) {
dst_release(dst);
return;
}
diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c
index c382db7a2..3ee3e444a 100644
--- a/net/ipv6/ping.c
+++ b/net/ipv6/ping.c
@@ -58,10 +58,11 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
int iif = 0;
struct flowi6 fl6;
int err;
- int hlimit;
struct dst_entry *dst;
struct rt6_info *rt;
struct pingfakehdr pfh;
+ struct sockcm_cookie junk = {0};
+ struct ipcm6_cookie ipc6;
pr_debug("ping_v6_sendmsg(sk=%p,sk->num=%u)\n", inet, inet->inet_num);
@@ -138,13 +139,15 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
pfh.wcheck = 0;
pfh.family = AF_INET6;
- hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst);
+ ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst);
+ ipc6.tclass = np->tclass;
+ ipc6.dontfrag = np->dontfrag;
+ ipc6.opt = NULL;
lock_sock(sk);
err = ip6_append_data(sk, ping_getfrag, &pfh, len,
- 0, hlimit,
- np->tclass, NULL, &fl6, rt,
- MSG_DONTWAIT, np->dontfrag);
+ 0, &ipc6, &fl6, rt,
+ MSG_DONTWAIT, &junk);
if (err) {
ICMP6_INC_STATS(sock_net(sk), rt->rt6i_idev,
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index fa59dd7a4..896350df6 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -745,10 +745,9 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
struct dst_entry *dst = NULL;
struct raw6_frag_vec rfv;
struct flowi6 fl6;
+ struct sockcm_cookie sockc;
+ struct ipcm6_cookie ipc6;
int addr_len = msg->msg_namelen;
- int hlimit = -1;
- int tclass = -1;
- int dontfrag = -1;
u16 proto;
int err;
@@ -769,6 +768,11 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
fl6.flowi6_mark = sk->sk_mark;
+ ipc6.hlimit = -1;
+ ipc6.tclass = -1;
+ ipc6.dontfrag = -1;
+ ipc6.opt = NULL;
+
if (sin6) {
if (addr_len < SIN6_LEN_RFC2133)
return -EINVAL;
@@ -821,13 +825,14 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
if (fl6.flowi6_oif == 0)
fl6.flowi6_oif = sk->sk_bound_dev_if;
+ sockc.tsflags = sk->sk_tsflags;
if (msg->msg_controllen) {
opt = &opt_space;
memset(opt, 0, sizeof(struct ipv6_txoptions));
opt->tot_len = sizeof(struct ipv6_txoptions);
+ ipc6.opt = opt;
- err = ip6_datagram_send_ctl(sock_net(sk), sk, msg, &fl6, opt,
- &hlimit, &tclass, &dontfrag);
+ err = ip6_datagram_send_ctl(sock_net(sk), sk, msg, &fl6, &ipc6, &sockc);
if (err < 0) {
fl6_sock_release(flowlabel);
return err;
@@ -843,7 +848,7 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
if (!opt) {
opt = txopt_get(np);
opt_to_free = opt;
- }
+ }
if (flowlabel)
opt = fl6_merge_options(&opt_space, flowlabel, opt);
opt = ipv6_fixup_options(&opt_space, opt);
@@ -878,14 +883,14 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
err = PTR_ERR(dst);
goto out;
}
- if (hlimit < 0)
- hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst);
+ if (ipc6.hlimit < 0)
+ ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst);
- if (tclass < 0)
- tclass = np->tclass;
+ if (ipc6.tclass < 0)
+ ipc6.tclass = np->tclass;
- if (dontfrag < 0)
- dontfrag = np->dontfrag;
+ if (ipc6.dontfrag < 0)
+ ipc6.dontfrag = np->dontfrag;
if (msg->msg_flags&MSG_CONFIRM)
goto do_confirm;
@@ -894,10 +899,11 @@ back_from_confirm:
if (inet->hdrincl)
err = rawv6_send_hdrinc(sk, msg, len, &fl6, &dst, msg->msg_flags);
else {
+ ipc6.opt = opt;
lock_sock(sk);
err = ip6_append_data(sk, raw6_getfrag, &rfv,
- len, 0, hlimit, tclass, opt, &fl6, (struct rt6_info *)dst,
- msg->msg_flags, dontfrag);
+ len, 0, &ipc6, &fl6, (struct rt6_info *)dst,
+ msg->msg_flags, &sockc);
if (err)
ip6_flush_pending_frames(sk);
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index e2ea31175..2160d5d00 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -145,12 +145,12 @@ void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq,
if (!dev)
goto out_rcu_unlock;
- IP6_INC_STATS_BH(net, __in6_dev_get(dev), IPSTATS_MIB_REASMFAILS);
+ __IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_REASMFAILS);
if (inet_frag_evicting(&fq->q))
goto out_rcu_unlock;
- IP6_INC_STATS_BH(net, __in6_dev_get(dev), IPSTATS_MIB_REASMTIMEOUT);
+ __IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_REASMTIMEOUT);
/* Don't send error if the first segment did not arrive. */
if (!(fq->q.flags & INET_FRAG_FIRST_IN) || !fq->q.fragments)
@@ -223,8 +223,8 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb,
((u8 *)(fhdr + 1) - (u8 *)(ipv6_hdr(skb) + 1)));
if ((unsigned int)end > IPV6_MAXPLEN) {
- IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)),
- IPSTATS_MIB_INHDRERRORS);
+ __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
+ IPSTATS_MIB_INHDRERRORS);
icmpv6_param_prob(skb, ICMPV6_HDR_FIELD,
((u8 *)&fhdr->frag_off -
skb_network_header(skb)));
@@ -258,8 +258,8 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb,
/* RFC2460 says always send parameter problem in
* this case. -DaveM
*/
- IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)),
- IPSTATS_MIB_INHDRERRORS);
+ __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
+ IPSTATS_MIB_INHDRERRORS);
icmpv6_param_prob(skb, ICMPV6_HDR_FIELD,
offsetof(struct ipv6hdr, payload_len));
return -1;
@@ -361,8 +361,8 @@ found:
discard_fq:
inet_frag_kill(&fq->q, &ip6_frags);
err:
- IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)),
- IPSTATS_MIB_REASMFAILS);
+ __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
+ IPSTATS_MIB_REASMFAILS);
kfree_skb(skb);
return -1;
}
@@ -500,7 +500,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev,
skb_network_header_len(head));
rcu_read_lock();
- IP6_INC_STATS_BH(net, __in6_dev_get(dev), IPSTATS_MIB_REASMOKS);
+ __IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_REASMOKS);
rcu_read_unlock();
fq->q.fragments = NULL;
fq->q.fragments_tail = NULL;
@@ -513,7 +513,7 @@ out_oom:
net_dbg_ratelimited("ip6_frag_reasm: no memory for reassembly\n");
out_fail:
rcu_read_lock();
- IP6_INC_STATS_BH(net, __in6_dev_get(dev), IPSTATS_MIB_REASMFAILS);
+ __IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_REASMFAILS);
rcu_read_unlock();
return -1;
}
@@ -528,7 +528,7 @@ static int ipv6_frag_rcv(struct sk_buff *skb)
if (IP6CB(skb)->flags & IP6SKB_FRAGMENTED)
goto fail_hdr;
- IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_REASMREQDS);
+ __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_REASMREQDS);
/* Jumbo payload inhibits frag. header */
if (hdr->payload_len == 0)
@@ -544,8 +544,8 @@ static int ipv6_frag_rcv(struct sk_buff *skb)
if (!(fhdr->frag_off & htons(0xFFF9))) {
/* It is not a fragmented frame */
skb->transport_header += sizeof(struct frag_hdr);
- IP6_INC_STATS_BH(net,
- ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_REASMOKS);
+ __IP6_INC_STATS(net,
+ ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_REASMOKS);
IP6CB(skb)->nhoff = (u8 *)fhdr - skb_network_header(skb);
IP6CB(skb)->flags |= IP6SKB_FRAGMENTED;
@@ -566,13 +566,13 @@ static int ipv6_frag_rcv(struct sk_buff *skb)
return ret;
}
- IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_REASMFAILS);
+ __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_REASMFAILS);
kfree_skb(skb);
return -1;
fail_hdr:
- IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)),
- IPSTATS_MIB_INHDRERRORS);
+ __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
+ IPSTATS_MIB_INHDRERRORS);
icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, skb_network_header_len(skb));
return -1;
}
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 6f32944e0..520b7884d 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1190,7 +1190,7 @@ struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
struct dst_entry *dst;
bool any_src;
- dst = l3mdev_rt6_dst_by_oif(net, fl6);
+ dst = l3mdev_get_rt6_dst(net, fl6);
if (dst)
return dst;
@@ -1771,6 +1771,37 @@ static int ip6_convert_metrics(struct mx6_config *mxc,
return -EINVAL;
}
+static struct rt6_info *ip6_nh_lookup_table(struct net *net,
+ struct fib6_config *cfg,
+ const struct in6_addr *gw_addr)
+{
+ struct flowi6 fl6 = {
+ .flowi6_oif = cfg->fc_ifindex,
+ .daddr = *gw_addr,
+ .saddr = cfg->fc_prefsrc,
+ };
+ struct fib6_table *table;
+ struct rt6_info *rt;
+ int flags = RT6_LOOKUP_F_IFACE;
+
+ table = fib6_get_table(net, cfg->fc_table);
+ if (!table)
+ return NULL;
+
+ if (!ipv6_addr_any(&cfg->fc_prefsrc))
+ flags |= RT6_LOOKUP_F_HAS_SADDR;
+
+ rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, flags);
+
+ /* if table lookup failed, fall back to full lookup */
+ if (rt == net->ipv6.ip6_null_entry) {
+ ip6_rt_put(rt);
+ rt = NULL;
+ }
+
+ return rt;
+}
+
static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg)
{
struct net *net = cfg->fc_nlinfo.nl_net;
@@ -1942,7 +1973,7 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg)
rt->rt6i_gateway = *gw_addr;
if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
- struct rt6_info *grt;
+ struct rt6_info *grt = NULL;
/* IPv6 strictly inhibits using not link-local
addresses as nexthop address.
@@ -1954,7 +1985,12 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg)
if (!(gwa_type & IPV6_ADDR_UNICAST))
goto out;
- grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
+ if (cfg->fc_table)
+ grt = ip6_nh_lookup_table(net, cfg, gw_addr);
+
+ if (!grt)
+ grt = rt6_lookup(net, gw_addr, NULL,
+ cfg->fc_ifindex, 1);
err = -EHOSTUNREACH;
if (!grt)
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index 6c53e4eb0..0619ac708 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -913,10 +913,9 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb,
goto tx_error;
}
- skb = iptunnel_handle_offloads(skb, SKB_GSO_SIT);
- if (IS_ERR(skb)) {
+ if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP4)) {
ip_rt_put(rt);
- goto out;
+ goto tx_error;
}
if (df) {
@@ -992,7 +991,6 @@ tx_error_icmp:
dst_link_failure(skb);
tx_error:
kfree_skb(skb);
-out:
dev->stats.tx_errors++;
return NETDEV_TX_OK;
}
@@ -1002,15 +1000,15 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
struct ip_tunnel *tunnel = netdev_priv(dev);
const struct iphdr *tiph = &tunnel->parms.iph;
- skb = iptunnel_handle_offloads(skb, SKB_GSO_IPIP);
- if (IS_ERR(skb))
- goto out;
+ if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP4))
+ goto tx_error;
skb_set_inner_ipproto(skb, IPPROTO_IPIP);
ip_tunnel_xmit(skb, dev, tiph, IPPROTO_IPIP);
return NETDEV_TX_OK;
-out:
+tx_error:
+ kfree_skb(skb);
dev->stats.tx_errors++;
return NETDEV_TX_OK;
}
diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c
index aab91fa86..59c483937 100644
--- a/net/ipv6/syncookies.c
+++ b/net/ipv6/syncookies.c
@@ -155,11 +155,11 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
mss = __cookie_v6_check(ipv6_hdr(skb), th, cookie);
if (mss == 0) {
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESFAILED);
+ __NET_INC_STATS(sock_net(sk), LINUX_MIB_SYNCOOKIESFAILED);
goto out;
}
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESRECV);
+ __NET_INC_STATS(sock_net(sk), LINUX_MIB_SYNCOOKIESRECV);
/* check for timestamp cookie support */
memset(&tcp_opt, 0, sizeof(tcp_opt));
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 07c456fe0..90ea274a7 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -235,7 +235,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
fl6.fl6_dport = usin->sin6_port;
fl6.fl6_sport = inet->inet_sport;
- opt = rcu_dereference_protected(np->opt, sock_owned_by_user(sk));
+ opt = rcu_dereference_protected(np->opt, lockdep_sock_is_held(sk));
final_p = fl6_update_dst(&fl6, opt, &final);
security_sk_classify_flow(sk, flowi6_to_flowi(&fl6));
@@ -352,8 +352,8 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
skb->dev->ifindex);
if (!sk) {
- ICMP6_INC_STATS_BH(net, __in6_dev_get(skb->dev),
- ICMP6_MIB_INERRORS);
+ __ICMP6_INC_STATS(net, __in6_dev_get(skb->dev),
+ ICMP6_MIB_INERRORS);
return;
}
@@ -368,13 +368,13 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
bh_lock_sock(sk);
if (sock_owned_by_user(sk) && type != ICMPV6_PKT_TOOBIG)
- NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
+ __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
if (sk->sk_state == TCP_CLOSE)
goto out;
if (ipv6_hdr(skb)->hop_limit < inet6_sk(sk)->min_hopcount) {
- NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
+ __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
goto out;
}
@@ -384,7 +384,7 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
if (sk->sk_state != TCP_LISTEN &&
!between(seq, snd_una, tp->snd_nxt)) {
- NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
+ __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
goto out;
}
@@ -455,7 +455,7 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst,
struct flowi *fl,
struct request_sock *req,
struct tcp_fastopen_cookie *foc,
- bool attach_req)
+ enum tcp_synack_type synack_type)
{
struct inet_request_sock *ireq = inet_rsk(req);
struct ipv6_pinfo *np = inet6_sk(sk);
@@ -468,7 +468,7 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst,
IPPROTO_TCP)) == NULL)
goto done;
- skb = tcp_make_synack(sk, dst, req, foc, attach_req);
+ skb = tcp_make_synack(sk, dst, req, foc, synack_type);
if (skb) {
__tcp_v6_send_check(skb, &ireq->ir_v6_loc_addr,
@@ -665,12 +665,12 @@ static bool tcp_v6_inbound_md5_hash(const struct sock *sk,
return false;
if (hash_expected && !hash_location) {
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
return true;
}
if (!hash_expected && hash_location) {
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
return true;
}
@@ -754,7 +754,7 @@ static const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = {
static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 seq,
u32 ack, u32 win, u32 tsval, u32 tsecr,
int oif, struct tcp_md5sig_key *key, int rst,
- u8 tclass, u32 label)
+ u8 tclass, __be32 label)
{
const struct tcphdr *th = tcp_hdr(skb);
struct tcphdr *t1;
@@ -846,9 +846,9 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32
if (!IS_ERR(dst)) {
skb_dst_set(buff, dst);
ip6_xmit(ctl_sk, buff, &fl6, NULL, tclass);
- TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
+ TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
if (rst)
- TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
+ TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
return;
}
@@ -879,6 +879,7 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb)
return;
#ifdef CONFIG_TCP_MD5SIG
+ rcu_read_lock();
hash_location = tcp_parse_md5sig_option(th);
if (sk && sk_fullsock(sk)) {
key = tcp_v6_md5_do_lookup(sk, &ipv6h->saddr);
@@ -896,16 +897,15 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb)
th->source, &ipv6h->daddr,
ntohs(th->source), tcp_v6_iif(skb));
if (!sk1)
- return;
+ goto out;
- rcu_read_lock();
key = tcp_v6_md5_do_lookup(sk1, &ipv6h->saddr);
if (!key)
- goto release_sk1;
+ goto out;
genhash = tcp_v6_md5_hash_skb(newhash, key, NULL, skb);
if (genhash || memcmp(hash_location, newhash, 16) != 0)
- goto release_sk1;
+ goto out;
}
#endif
@@ -919,18 +919,15 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb)
tcp_v6_send_response(sk, skb, seq, ack_seq, 0, 0, 0, oif, key, 1, 0, 0);
#ifdef CONFIG_TCP_MD5SIG
-release_sk1:
- if (sk1) {
- rcu_read_unlock();
- sock_put(sk1);
- }
+out:
+ rcu_read_unlock();
#endif
}
static void tcp_v6_send_ack(const struct sock *sk, struct sk_buff *skb, u32 seq,
u32 ack, u32 win, u32 tsval, u32 tsecr, int oif,
struct tcp_md5sig_key *key, u8 tclass,
- u32 label)
+ __be32 label)
{
tcp_v6_send_response(sk, skb, seq, ack, win, tsval, tsecr, oif, key, 0,
tclass, label);
@@ -988,7 +985,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
&tcp_request_sock_ipv6_ops, sk, skb);
drop:
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
+ tcp_listendrop(sk);
return 0; /* don't send reset */
}
@@ -1189,11 +1186,11 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *
return newsk;
out_overflow:
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
+ __NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
out_nonewsk:
dst_release(dst);
out:
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
+ tcp_listendrop(sk);
return NULL;
}
@@ -1308,8 +1305,8 @@ discard:
kfree_skb(skb);
return 0;
csum_err:
- TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS);
- TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
+ TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
+ TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
goto discard;
@@ -1380,6 +1377,7 @@ static int tcp_v6_rcv(struct sk_buff *skb)
{
const struct tcphdr *th;
const struct ipv6hdr *hdr;
+ bool refcounted;
struct sock *sk;
int ret;
struct net *net = dev_net(skb->dev);
@@ -1390,14 +1388,14 @@ static int tcp_v6_rcv(struct sk_buff *skb)
/*
* Count it even if it's bad.
*/
- TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
+ __TCP_INC_STATS(net, TCP_MIB_INSEGS);
if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
goto discard_it;
- th = tcp_hdr(skb);
+ th = (const struct tcphdr *)skb->data;
- if (th->doff < sizeof(struct tcphdr)/4)
+ if (unlikely(th->doff < sizeof(struct tcphdr)/4))
goto bad_packet;
if (!pskb_may_pull(skb, th->doff*4))
goto discard_it;
@@ -1405,12 +1403,13 @@ static int tcp_v6_rcv(struct sk_buff *skb)
if (skb_checksum_init(skb, IPPROTO_TCP, ip6_compute_pseudo))
goto csum_error;
- th = tcp_hdr(skb);
+ th = (const struct tcphdr *)skb->data;
hdr = ipv6_hdr(skb);
lookup:
sk = __inet6_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th),
- th->source, th->dest, inet6_iif(skb));
+ th->source, th->dest, inet6_iif(skb),
+ &refcounted);
if (!sk)
goto no_tcp_socket;
@@ -1433,6 +1432,7 @@ process:
goto lookup;
}
sock_hold(sk);
+ refcounted = true;
nsk = tcp_check_req(sk, skb, req, false);
if (!nsk) {
reqsk_put(req);
@@ -1450,7 +1450,7 @@ process:
}
}
if (hdr->hop_limit < inet6_sk(sk)->min_hopcount) {
- NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
+ __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
goto discard_and_relse;
}
@@ -1483,13 +1483,14 @@ process:
} else if (unlikely(sk_add_backlog(sk, skb,
sk->sk_rcvbuf + sk->sk_sndbuf))) {
bh_unlock_sock(sk);
- NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
+ __NET_INC_STATS(net, LINUX_MIB_TCPBACKLOGDROP);
goto discard_and_relse;
}
bh_unlock_sock(sk);
put_and_return:
- sock_put(sk);
+ if (refcounted)
+ sock_put(sk);
return ret ? -1 : 0;
no_tcp_socket:
@@ -1500,9 +1501,9 @@ no_tcp_socket:
if (tcp_checksum_complete(skb)) {
csum_error:
- TCP_INC_STATS_BH(net, TCP_MIB_CSUMERRORS);
+ __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
bad_packet:
- TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
+ __TCP_INC_STATS(net, TCP_MIB_INERRS);
} else {
tcp_v6_send_reset(NULL, skb);
}
@@ -1512,7 +1513,9 @@ discard_it:
return 0;
discard_and_relse:
- sock_put(sk);
+ sk_drops_add(sk, skb);
+ if (refcounted)
+ sock_put(sk);
goto discard_it;
do_time_wait:
@@ -1543,6 +1546,7 @@ do_time_wait:
inet_twsk_deschedule_put(tw);
sk = sk2;
tcp_v6_restore_cb(skb);
+ refcounted = false;
goto process;
}
/* Fall through to ACK */
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index f96831d9d..42a2edf7c 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -115,11 +115,10 @@ static void udp_v6_rehash(struct sock *sk)
udp_lib_rehash(sk, new_hash);
}
-static inline int compute_score(struct sock *sk, struct net *net,
- unsigned short hnum,
- const struct in6_addr *saddr, __be16 sport,
- const struct in6_addr *daddr, __be16 dport,
- int dif)
+static int compute_score(struct sock *sk, struct net *net,
+ const struct in6_addr *saddr, __be16 sport,
+ const struct in6_addr *daddr, unsigned short hnum,
+ int dif)
{
int score;
struct inet_sock *inet;
@@ -162,88 +161,36 @@ static inline int compute_score(struct sock *sk, struct net *net,
return score;
}
-static inline int compute_score2(struct sock *sk, struct net *net,
- const struct in6_addr *saddr, __be16 sport,
- const struct in6_addr *daddr,
- unsigned short hnum, int dif)
-{
- int score;
- struct inet_sock *inet;
-
- if (!net_eq(sock_net(sk), net) ||
- udp_sk(sk)->udp_port_hash != hnum ||
- sk->sk_family != PF_INET6)
- return -1;
-
- if (!ipv6_addr_equal(&sk->sk_v6_rcv_saddr, daddr))
- return -1;
-
- score = 0;
- inet = inet_sk(sk);
-
- if (inet->inet_dport) {
- if (inet->inet_dport != sport)
- return -1;
- score++;
- }
-
- if (!ipv6_addr_any(&sk->sk_v6_daddr)) {
- if (!ipv6_addr_equal(&sk->sk_v6_daddr, saddr))
- return -1;
- score++;
- }
-
- if (sk->sk_bound_dev_if) {
- if (sk->sk_bound_dev_if != dif)
- return -1;
- score++;
- }
-
- if (sk->sk_incoming_cpu == raw_smp_processor_id())
- score++;
-
- return score;
-}
-
-/* called with read_rcu_lock() */
+/* called with rcu_read_lock() */
static struct sock *udp6_lib_lookup2(struct net *net,
const struct in6_addr *saddr, __be16 sport,
const struct in6_addr *daddr, unsigned int hnum, int dif,
- struct udp_hslot *hslot2, unsigned int slot2,
+ struct udp_hslot *hslot2,
struct sk_buff *skb)
{
struct sock *sk, *result;
- struct hlist_nulls_node *node;
int score, badness, matches = 0, reuseport = 0;
- bool select_ok = true;
u32 hash = 0;
-begin:
result = NULL;
badness = -1;
- udp_portaddr_for_each_entry_rcu(sk, node, &hslot2->head) {
- score = compute_score2(sk, net, saddr, sport,
+ udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) {
+ score = compute_score(sk, net, saddr, sport,
daddr, hnum, dif);
if (score > badness) {
- result = sk;
- badness = score;
reuseport = sk->sk_reuseport;
if (reuseport) {
hash = udp6_ehashfn(net, daddr, hnum,
saddr, sport);
- if (select_ok) {
- struct sock *sk2;
- sk2 = reuseport_select_sock(sk, hash, skb,
+ result = reuseport_select_sock(sk, hash, skb,
sizeof(struct udphdr));
- if (sk2) {
- result = sk2;
- select_ok = false;
- goto found;
- }
- }
+ if (result)
+ return result;
matches = 1;
}
+ result = sk;
+ badness = score;
} else if (score == badness && reuseport) {
matches++;
if (reciprocal_scale(hash, matches) == 0)
@@ -251,27 +198,10 @@ begin:
hash = next_pseudo_random32(hash);
}
}
- /*
- * if the nulls value we got at the end of this lookup is
- * not the expected one, we must restart lookup.
- * We probably met an item that was moved to another chain.
- */
- if (get_nulls_value(node) != slot2)
- goto begin;
-
- if (result) {
-found:
- if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2)))
- result = NULL;
- else if (unlikely(compute_score2(result, net, saddr, sport,
- daddr, hnum, dif) < badness)) {
- sock_put(result);
- goto begin;
- }
- }
return result;
}
+/* rcu_read_lock() must be held */
struct sock *__udp6_lib_lookup(struct net *net,
const struct in6_addr *saddr, __be16 sport,
const struct in6_addr *daddr, __be16 dport,
@@ -279,15 +209,12 @@ struct sock *__udp6_lib_lookup(struct net *net,
struct sk_buff *skb)
{
struct sock *sk, *result;
- struct hlist_nulls_node *node;
unsigned short hnum = ntohs(dport);
unsigned int hash2, slot2, slot = udp_hashfn(net, hnum, udptable->mask);
struct udp_hslot *hslot2, *hslot = &udptable->hash[slot];
int score, badness, matches = 0, reuseport = 0;
- bool select_ok = true;
u32 hash = 0;
- rcu_read_lock();
if (hslot->count > 10) {
hash2 = udp6_portaddr_hash(net, daddr, hnum);
slot2 = hash2 & udptable->mask;
@@ -297,46 +224,43 @@ struct sock *__udp6_lib_lookup(struct net *net,
result = udp6_lib_lookup2(net, saddr, sport,
daddr, hnum, dif,
- hslot2, slot2, skb);
+ hslot2, skb);
if (!result) {
+ unsigned int old_slot2 = slot2;
hash2 = udp6_portaddr_hash(net, &in6addr_any, hnum);
slot2 = hash2 & udptable->mask;
+ /* avoid searching the same slot again. */
+ if (unlikely(slot2 == old_slot2))
+ return result;
+
hslot2 = &udptable->hash2[slot2];
if (hslot->count < hslot2->count)
goto begin;
result = udp6_lib_lookup2(net, saddr, sport,
- &in6addr_any, hnum, dif,
- hslot2, slot2, skb);
+ daddr, hnum, dif,
+ hslot2, skb);
}
- rcu_read_unlock();
return result;
}
begin:
result = NULL;
badness = -1;
- sk_nulls_for_each_rcu(sk, node, &hslot->head) {
- score = compute_score(sk, net, hnum, saddr, sport, daddr, dport, dif);
+ sk_for_each_rcu(sk, &hslot->head) {
+ score = compute_score(sk, net, saddr, sport, daddr, hnum, dif);
if (score > badness) {
- result = sk;
- badness = score;
reuseport = sk->sk_reuseport;
if (reuseport) {
hash = udp6_ehashfn(net, daddr, hnum,
saddr, sport);
- if (select_ok) {
- struct sock *sk2;
-
- sk2 = reuseport_select_sock(sk, hash, skb,
+ result = reuseport_select_sock(sk, hash, skb,
sizeof(struct udphdr));
- if (sk2) {
- result = sk2;
- select_ok = false;
- goto found;
- }
- }
+ if (result)
+ return result;
matches = 1;
}
+ result = sk;
+ badness = score;
} else if (score == badness && reuseport) {
matches++;
if (reciprocal_scale(hash, matches) == 0)
@@ -344,25 +268,6 @@ begin:
hash = next_pseudo_random32(hash);
}
}
- /*
- * if the nulls value we got at the end of this lookup is
- * not the expected one, we must restart lookup.
- * We probably met an item that was moved to another chain.
- */
- if (get_nulls_value(node) != slot)
- goto begin;
-
- if (result) {
-found:
- if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2)))
- result = NULL;
- else if (unlikely(compute_score(result, net, hnum, saddr, sport,
- daddr, dport, dif) < badness)) {
- sock_put(result);
- goto begin;
- }
- }
- rcu_read_unlock();
return result;
}
EXPORT_SYMBOL_GPL(__udp6_lib_lookup);
@@ -371,23 +276,46 @@ static struct sock *__udp6_lib_lookup_skb(struct sk_buff *skb,
__be16 sport, __be16 dport,
struct udp_table *udptable)
{
- struct sock *sk;
const struct ipv6hdr *iph = ipv6_hdr(skb);
+ struct sock *sk;
sk = skb_steal_sock(skb);
if (unlikely(sk))
return sk;
- return __udp6_lib_lookup(dev_net(skb_dst(skb)->dev), &iph->saddr, sport,
+ return __udp6_lib_lookup(dev_net(skb->dev), &iph->saddr, sport,
&iph->daddr, dport, inet6_iif(skb),
udptable, skb);
}
+struct sock *udp6_lib_lookup_skb(struct sk_buff *skb,
+ __be16 sport, __be16 dport)
+{
+ const struct ipv6hdr *iph = ipv6_hdr(skb);
+
+ return __udp6_lib_lookup(dev_net(skb->dev), &iph->saddr, sport,
+ &iph->daddr, dport, inet6_iif(skb),
+ &udp_table, skb);
+}
+EXPORT_SYMBOL_GPL(udp6_lib_lookup_skb);
+
+/* Must be called under rcu_read_lock().
+ * Does increment socket refcount.
+ */
+#if IS_ENABLED(CONFIG_NETFILTER_XT_MATCH_SOCKET) || \
+ IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TPROXY)
struct sock *udp6_lib_lookup(struct net *net, const struct in6_addr *saddr, __be16 sport,
const struct in6_addr *daddr, __be16 dport, int dif)
{
- return __udp6_lib_lookup(net, saddr, sport, daddr, dport, dif, &udp_table, NULL);
+ struct sock *sk;
+
+ sk = __udp6_lib_lookup(net, saddr, sport, daddr, dport,
+ dif, &udp_table, NULL);
+ if (sk && !atomic_inc_not_zero(&sk->sk_refcnt))
+ sk = NULL;
+ return sk;
}
EXPORT_SYMBOL_GPL(udp6_lib_lookup);
+#endif
/*
* This should be easy, if there is something there we
@@ -401,7 +329,7 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
struct inet_sock *inet = inet_sk(sk);
struct sk_buff *skb;
unsigned int ulen, copied;
- int peeked, off = 0;
+ int peeked, peeking, off;
int err;
int is_udplite = IS_UDPLITE(sk);
bool checksum_valid = false;
@@ -415,15 +343,16 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
return ipv6_recv_rxpmtu(sk, msg, len, addr_len);
try_again:
+ peeking = off = sk_peek_offset(sk, flags);
skb = __skb_recv_datagram(sk, flags | (noblock ? MSG_DONTWAIT : 0),
&peeked, &off, &err);
if (!skb)
- goto out;
+ return err;
- ulen = skb->len - sizeof(struct udphdr);
+ ulen = skb->len;
copied = len;
- if (copied > ulen)
- copied = ulen;
+ if (copied > ulen - off)
+ copied = ulen - off;
else if (copied < ulen)
msg->msg_flags |= MSG_TRUNC;
@@ -435,17 +364,16 @@ try_again:
* coverage checksum (UDP-Lite), do it before the copy.
*/
- if (copied < ulen || UDP_SKB_CB(skb)->partial_cov) {
+ if (copied < ulen || UDP_SKB_CB(skb)->partial_cov || peeking) {
checksum_valid = !udp_lib_checksum_complete(skb);
if (!checksum_valid)
goto csum_copy_err;
}
if (checksum_valid || skb_csum_unnecessary(skb))
- err = skb_copy_datagram_msg(skb, sizeof(struct udphdr),
- msg, copied);
+ err = skb_copy_datagram_msg(skb, off, msg, copied);
else {
- err = skb_copy_and_csum_datagram_msg(skb, sizeof(struct udphdr), msg);
+ err = skb_copy_and_csum_datagram_msg(skb, off, msg);
if (err == -EINVAL)
goto csum_copy_err;
}
@@ -454,23 +382,22 @@ try_again:
if (!peeked) {
atomic_inc(&sk->sk_drops);
if (is_udp4)
- UDP_INC_STATS_USER(sock_net(sk),
- UDP_MIB_INERRORS,
- is_udplite);
+ UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS,
+ is_udplite);
else
- UDP6_INC_STATS_USER(sock_net(sk),
- UDP_MIB_INERRORS,
- is_udplite);
+ UDP6_INC_STATS(sock_net(sk), UDP_MIB_INERRORS,
+ is_udplite);
}
- goto out_free;
+ skb_free_datagram_locked(sk, skb);
+ return err;
}
if (!peeked) {
if (is_udp4)
- UDP_INC_STATS_USER(sock_net(sk),
- UDP_MIB_INDATAGRAMS, is_udplite);
+ UDP_INC_STATS(sock_net(sk), UDP_MIB_INDATAGRAMS,
+ is_udplite);
else
- UDP6_INC_STATS_USER(sock_net(sk),
- UDP_MIB_INDATAGRAMS, is_udplite);
+ UDP6_INC_STATS(sock_net(sk), UDP_MIB_INDATAGRAMS,
+ is_udplite);
}
sock_recv_ts_and_drops(msg, sk, skb);
@@ -510,24 +437,22 @@ try_again:
if (flags & MSG_TRUNC)
err = ulen;
-out_free:
- skb_free_datagram_locked(sk, skb);
-out:
+ __skb_free_datagram_locked(sk, skb, peeking ? -err : err);
return err;
csum_copy_err:
slow = lock_sock_fast(sk);
if (!skb_kill_datagram(sk, skb, flags)) {
if (is_udp4) {
- UDP_INC_STATS_USER(sock_net(sk),
- UDP_MIB_CSUMERRORS, is_udplite);
- UDP_INC_STATS_USER(sock_net(sk),
- UDP_MIB_INERRORS, is_udplite);
+ UDP_INC_STATS(sock_net(sk),
+ UDP_MIB_CSUMERRORS, is_udplite);
+ UDP_INC_STATS(sock_net(sk),
+ UDP_MIB_INERRORS, is_udplite);
} else {
- UDP6_INC_STATS_USER(sock_net(sk),
- UDP_MIB_CSUMERRORS, is_udplite);
- UDP6_INC_STATS_USER(sock_net(sk),
- UDP_MIB_INERRORS, is_udplite);
+ UDP6_INC_STATS(sock_net(sk),
+ UDP_MIB_CSUMERRORS, is_udplite);
+ UDP6_INC_STATS(sock_net(sk),
+ UDP_MIB_INERRORS, is_udplite);
}
}
unlock_sock_fast(sk, slow);
@@ -555,8 +480,8 @@ void __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
sk = __udp6_lib_lookup(net, daddr, uh->dest, saddr, uh->source,
inet6_iif(skb), udptable, skb);
if (!sk) {
- ICMP6_INC_STATS_BH(net, __in6_dev_get(skb->dev),
- ICMP6_MIB_INERRORS);
+ __ICMP6_INC_STATS(net, __in6_dev_get(skb->dev),
+ ICMP6_MIB_INERRORS);
return;
}
@@ -585,7 +510,7 @@ void __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
sk->sk_err = err;
sk->sk_error_report(sk);
out:
- sock_put(sk);
+ return;
}
static int __udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
@@ -598,15 +523,15 @@ static int __udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
sk_incoming_cpu_update(sk);
}
- rc = sock_queue_rcv_skb(sk, skb);
+ rc = __sock_queue_rcv_skb(sk, skb);
if (rc < 0) {
int is_udplite = IS_UDPLITE(sk);
/* Note that an ENOMEM error is charged twice */
if (rc == -ENOMEM)
- UDP6_INC_STATS_BH(sock_net(sk),
- UDP_MIB_RCVBUFERRORS, is_udplite);
- UDP6_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
+ UDP6_INC_STATS(sock_net(sk),
+ UDP_MIB_RCVBUFERRORS, is_udplite);
+ UDP6_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
kfree_skb(skb);
return -1;
}
@@ -662,9 +587,9 @@ int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
ret = encap_rcv(sk, skb);
if (ret <= 0) {
- UDP_INC_STATS_BH(sock_net(sk),
- UDP_MIB_INDATAGRAMS,
- is_udplite);
+ __UDP_INC_STATS(sock_net(sk),
+ UDP_MIB_INDATAGRAMS,
+ is_udplite);
return -ret;
}
}
@@ -689,14 +614,17 @@ int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
}
}
- if (rcu_access_pointer(sk->sk_filter)) {
- if (udp_lib_checksum_complete(skb))
- goto csum_error;
- }
+ if (rcu_access_pointer(sk->sk_filter) &&
+ udp_lib_checksum_complete(skb))
+ goto csum_error;
+
+ if (sk_filter_trim_cap(sk, skb, sizeof(struct udphdr)))
+ goto drop;
+ udp_csum_pull_header(skb);
if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
- UDP6_INC_STATS_BH(sock_net(sk),
- UDP_MIB_RCVBUFERRORS, is_udplite);
+ __UDP6_INC_STATS(sock_net(sk),
+ UDP_MIB_RCVBUFERRORS, is_udplite);
goto drop;
}
@@ -715,9 +643,9 @@ int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
return rc;
csum_error:
- UDP6_INC_STATS_BH(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite);
+ __UDP6_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite);
drop:
- UDP6_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
+ __UDP6_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
atomic_inc(&sk->sk_drops);
kfree_skb(skb);
return -1;
@@ -747,33 +675,6 @@ static bool __udp_v6_is_mcast_sock(struct net *net, struct sock *sk,
return true;
}
-static void flush_stack(struct sock **stack, unsigned int count,
- struct sk_buff *skb, unsigned int final)
-{
- struct sk_buff *skb1 = NULL;
- struct sock *sk;
- unsigned int i;
-
- for (i = 0; i < count; i++) {
- sk = stack[i];
- if (likely(!skb1))
- skb1 = (i == final) ? skb : skb_clone(skb, GFP_ATOMIC);
- if (!skb1) {
- atomic_inc(&sk->sk_drops);
- UDP6_INC_STATS_BH(sock_net(sk), UDP_MIB_RCVBUFERRORS,
- IS_UDPLITE(sk));
- UDP6_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS,
- IS_UDPLITE(sk));
- }
-
- if (skb1 && udpv6_queue_rcv_skb(sk, skb1) <= 0)
- skb1 = NULL;
- sock_put(sk);
- }
- if (unlikely(skb1))
- kfree_skb(skb1);
-}
-
static void udp6_csum_zero_error(struct sk_buff *skb)
{
/* RFC 2460 section 8.1 says that we SHOULD log
@@ -792,15 +693,15 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
const struct in6_addr *saddr, const struct in6_addr *daddr,
struct udp_table *udptable, int proto)
{
- struct sock *sk, *stack[256 / sizeof(struct sock *)];
+ struct sock *sk, *first = NULL;
const struct udphdr *uh = udp_hdr(skb);
- struct hlist_nulls_node *node;
unsigned short hnum = ntohs(uh->dest);
struct udp_hslot *hslot = udp_hashslot(udptable, net, hnum);
- int dif = inet6_iif(skb);
- unsigned int count = 0, offset = offsetof(typeof(*sk), sk_nulls_node);
+ unsigned int offset = offsetof(typeof(*sk), sk_node);
unsigned int hash2 = 0, hash2_any = 0, use_hash2 = (hslot->count > 10);
- bool inner_flushed = false;
+ int dif = inet6_iif(skb);
+ struct hlist_node *node;
+ struct sk_buff *nskb;
if (use_hash2) {
hash2_any = udp6_portaddr_hash(net, &in6addr_any, hnum) &
@@ -811,27 +712,32 @@ start_lookup:
offset = offsetof(typeof(*sk), __sk_common.skc_portaddr_node);
}
- spin_lock(&hslot->lock);
- sk_nulls_for_each_entry_offset(sk, node, &hslot->head, offset) {
- if (__udp_v6_is_mcast_sock(net, sk,
- uh->dest, daddr,
- uh->source, saddr,
- dif, hnum) &&
- /* If zero checksum and no_check is not on for
- * the socket then skip it.
- */
- (uh->check || udp_sk(sk)->no_check6_rx)) {
- if (unlikely(count == ARRAY_SIZE(stack))) {
- flush_stack(stack, count, skb, ~0);
- inner_flushed = true;
- count = 0;
- }
- stack[count++] = sk;
- sock_hold(sk);
+ sk_for_each_entry_offset_rcu(sk, node, &hslot->head, offset) {
+ if (!__udp_v6_is_mcast_sock(net, sk, uh->dest, daddr,
+ uh->source, saddr, dif, hnum))
+ continue;
+ /* If zero checksum and no_check is not on for
+ * the socket then skip it.
+ */
+ if (!uh->check && !udp_sk(sk)->no_check6_rx)
+ continue;
+ if (!first) {
+ first = sk;
+ continue;
+ }
+ nskb = skb_clone(skb, GFP_ATOMIC);
+ if (unlikely(!nskb)) {
+ atomic_inc(&sk->sk_drops);
+ __UDP6_INC_STATS(net, UDP_MIB_RCVBUFERRORS,
+ IS_UDPLITE(sk));
+ __UDP6_INC_STATS(net, UDP_MIB_INERRORS,
+ IS_UDPLITE(sk));
+ continue;
}
- }
- spin_unlock(&hslot->lock);
+ if (udpv6_queue_rcv_skb(sk, nskb) > 0)
+ consume_skb(nskb);
+ }
/* Also lookup *:port if we are using hash2 and haven't done so yet. */
if (use_hash2 && hash2 != hash2_any) {
@@ -839,13 +745,13 @@ start_lookup:
goto start_lookup;
}
- if (count) {
- flush_stack(stack, count, skb, count - 1);
+ if (first) {
+ if (udpv6_queue_rcv_skb(first, skb) > 0)
+ consume_skb(skb);
} else {
- if (!inner_flushed)
- UDP6_INC_STATS_BH(net, UDP_MIB_IGNOREDMULTI,
- proto == IPPROTO_UDPLITE);
- consume_skb(skb);
+ kfree_skb(skb);
+ __UDP6_INC_STATS(net, UDP_MIB_IGNOREDMULTI,
+ proto == IPPROTO_UDPLITE);
}
return 0;
}
@@ -853,10 +759,10 @@ start_lookup:
int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
int proto)
{
+ const struct in6_addr *saddr, *daddr;
struct net *net = dev_net(skb->dev);
- struct sock *sk;
struct udphdr *uh;
- const struct in6_addr *saddr, *daddr;
+ struct sock *sk;
u32 ulen = 0;
if (!pskb_may_pull(skb, sizeof(struct udphdr)))
@@ -910,7 +816,6 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
int ret;
if (!uh->check && !udp_sk(sk)->no_check6_rx) {
- sock_put(sk);
udp6_csum_zero_error(skb);
goto csum_error;
}
@@ -920,7 +825,6 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
ip6_compute_pseudo);
ret = udpv6_queue_rcv_skb(sk, skb);
- sock_put(sk);
/* a return value > 0 means to resubmit the input */
if (ret > 0)
@@ -940,7 +844,7 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
if (udp_lib_checksum_complete(skb))
goto csum_error;
- UDP6_INC_STATS_BH(net, UDP_MIB_NOPORTS, proto == IPPROTO_UDPLITE);
+ __UDP6_INC_STATS(net, UDP_MIB_NOPORTS, proto == IPPROTO_UDPLITE);
icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0);
kfree_skb(skb);
@@ -954,9 +858,9 @@ short_packet:
daddr, ntohs(uh->dest));
goto discard;
csum_error:
- UDP6_INC_STATS_BH(net, UDP_MIB_CSUMERRORS, proto == IPPROTO_UDPLITE);
+ __UDP6_INC_STATS(net, UDP_MIB_CSUMERRORS, proto == IPPROTO_UDPLITE);
discard:
- UDP6_INC_STATS_BH(net, UDP_MIB_INERRORS, proto == IPPROTO_UDPLITE);
+ __UDP6_INC_STATS(net, UDP_MIB_INERRORS, proto == IPPROTO_UDPLITE);
kfree_skb(skb);
return 0;
}
@@ -1068,13 +972,14 @@ send:
err = ip6_send_skb(skb);
if (err) {
if (err == -ENOBUFS && !inet6_sk(sk)->recverr) {
- UDP6_INC_STATS_USER(sock_net(sk),
- UDP_MIB_SNDBUFERRORS, is_udplite);
+ UDP6_INC_STATS(sock_net(sk),
+ UDP_MIB_SNDBUFERRORS, is_udplite);
err = 0;
}
- } else
- UDP6_INC_STATS_USER(sock_net(sk),
- UDP_MIB_OUTDATAGRAMS, is_udplite);
+ } else {
+ UDP6_INC_STATS(sock_net(sk),
+ UDP_MIB_OUTDATAGRAMS, is_udplite);
+ }
return err;
}
@@ -1118,16 +1023,19 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
struct ip6_flowlabel *flowlabel = NULL;
struct flowi6 fl6;
struct dst_entry *dst;
+ struct ipcm6_cookie ipc6;
int addr_len = msg->msg_namelen;
int ulen = len;
- int hlimit = -1;
- int tclass = -1;
- int dontfrag = -1;
int corkreq = up->corkflag || msg->msg_flags&MSG_MORE;
int err;
int connected = 0;
int is_udplite = IS_UDPLITE(sk);
int (*getfrag)(void *, char *, int, int, int, struct sk_buff *);
+ struct sockcm_cookie sockc;
+
+ ipc6.hlimit = -1;
+ ipc6.tclass = -1;
+ ipc6.dontfrag = -1;
/* destination address check */
if (sin6) {
@@ -1247,14 +1155,15 @@ do_udp_sendmsg:
fl6.flowi6_oif = np->sticky_pktinfo.ipi6_ifindex;
fl6.flowi6_mark = sk->sk_mark;
+ sockc.tsflags = sk->sk_tsflags;
if (msg->msg_controllen) {
opt = &opt_space;
memset(opt, 0, sizeof(struct ipv6_txoptions));
opt->tot_len = sizeof(*opt);
+ ipc6.opt = opt;
- err = ip6_datagram_send_ctl(sock_net(sk), sk, msg, &fl6, opt,
- &hlimit, &tclass, &dontfrag);
+ err = ip6_datagram_send_ctl(sock_net(sk), sk, msg, &fl6, &ipc6, &sockc);
if (err < 0) {
fl6_sock_release(flowlabel);
return err;
@@ -1275,6 +1184,7 @@ do_udp_sendmsg:
if (flowlabel)
opt = fl6_merge_options(&opt_space, flowlabel, opt);
opt = ipv6_fixup_options(&opt_space, opt);
+ ipc6.opt = opt;
fl6.flowi6_proto = sk->sk_protocol;
if (!ipv6_addr_any(daddr))
@@ -1304,11 +1214,11 @@ do_udp_sendmsg:
goto out;
}
- if (hlimit < 0)
- hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst);
+ if (ipc6.hlimit < 0)
+ ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst);
- if (tclass < 0)
- tclass = np->tclass;
+ if (ipc6.tclass < 0)
+ ipc6.tclass = np->tclass;
if (msg->msg_flags&MSG_CONFIRM)
goto do_confirm;
@@ -1319,9 +1229,9 @@ back_from_confirm:
struct sk_buff *skb;
skb = ip6_make_skb(sk, getfrag, msg, ulen,
- sizeof(struct udphdr), hlimit, tclass, opt,
+ sizeof(struct udphdr), &ipc6,
&fl6, (struct rt6_info *)dst,
- msg->msg_flags, dontfrag);
+ msg->msg_flags, &sockc);
err = PTR_ERR(skb);
if (!IS_ERR_OR_NULL(skb))
err = udp_v6_send_skb(skb, &fl6);
@@ -1342,13 +1252,12 @@ back_from_confirm:
up->pending = AF_INET6;
do_append_data:
- if (dontfrag < 0)
- dontfrag = np->dontfrag;
+ if (ipc6.dontfrag < 0)
+ ipc6.dontfrag = np->dontfrag;
up->len += ulen;
- err = ip6_append_data(sk, getfrag, msg, ulen,
- sizeof(struct udphdr), hlimit, tclass, opt, &fl6,
- (struct rt6_info *)dst,
- corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags, dontfrag);
+ err = ip6_append_data(sk, getfrag, msg, ulen, sizeof(struct udphdr),
+ &ipc6, &fl6, (struct rt6_info *)dst,
+ corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags, &sockc);
if (err)
udp_v6_flush_pending_frames(sk);
else if (!corkreq)
@@ -1391,8 +1300,8 @@ out:
* seems like overkill.
*/
if (err == -ENOBUFS || test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
- UDP6_INC_STATS_USER(sock_net(sk),
- UDP_MIB_SNDBUFERRORS, is_udplite);
+ UDP6_INC_STATS(sock_net(sk),
+ UDP_MIB_SNDBUFERRORS, is_udplite);
}
return err;
diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c
index 2b0fbe692..ac858c480 100644
--- a/net/ipv6/udp_offload.c
+++ b/net/ipv6/udp_offload.c
@@ -36,19 +36,6 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb,
if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) {
/* Packet is from an untrusted source, reset gso_segs. */
- int type = skb_shinfo(skb)->gso_type;
-
- if (unlikely(type & ~(SKB_GSO_UDP |
- SKB_GSO_DODGY |
- SKB_GSO_UDP_TUNNEL |
- SKB_GSO_UDP_TUNNEL_CSUM |
- SKB_GSO_TUNNEL_REMCSUM |
- SKB_GSO_GRE |
- SKB_GSO_GRE_CSUM |
- SKB_GSO_IPIP |
- SKB_GSO_SIT) ||
- !(type & (SKB_GSO_UDP))))
- goto out;
skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss);
@@ -153,7 +140,7 @@ static struct sk_buff **udp6_gro_receive(struct sk_buff **head,
skip:
NAPI_GRO_CB(skb)->is_ipv6 = 1;
- return udp_gro_receive(head, skb, uh);
+ return udp_gro_receive(head, skb, uh, udp6_lib_lookup_skb);
flush:
NAPI_GRO_CB(skb)->flush = 1;
@@ -173,7 +160,7 @@ static int udp6_gro_complete(struct sk_buff *skb, int nhoff)
skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL;
}
- return udp_gro_complete(skb, nhoff);
+ return udp_gro_complete(skb, nhoff, udp6_lib_lookup_skb);
}
static const struct net_offload udpv6_offload = {
@@ -184,7 +171,12 @@ static const struct net_offload udpv6_offload = {
},
};
-int __init udp_offload_init(void)
+int udpv6_offload_init(void)
{
return inet6_add_offload(&udpv6_offload, IPPROTO_UDP);
}
+
+int udpv6_offload_exit(void)
+{
+ return inet6_del_offload(&udpv6_offload, IPPROTO_UDP);
+}
diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c
index 923abd6b3..8d2f7c9b4 100644
--- a/net/irda/af_irda.c
+++ b/net/irda/af_irda.c
@@ -1024,8 +1024,11 @@ static int irda_connect(struct socket *sock, struct sockaddr *uaddr,
}
/* Check if we have opened a local TSAP */
- if (!self->tsap)
- irda_open_tsap(self, LSAP_ANY, addr->sir_name);
+ if (!self->tsap) {
+ err = irda_open_tsap(self, LSAP_ANY, addr->sir_name);
+ if (err)
+ goto out;
+ }
/* Move to connecting socket, start sending Connect Requests */
sock->state = SS_CONNECTING;
diff --git a/net/irda/ircomm/ircomm_tty.c b/net/irda/ircomm/ircomm_tty.c
index da126ee6d..873c4b707 100644
--- a/net/irda/ircomm/ircomm_tty.c
+++ b/net/irda/ircomm/ircomm_tty.c
@@ -220,10 +220,11 @@ static int ircomm_tty_startup(struct ircomm_tty_cb *self)
IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return -1;);
/* Check if already open */
- if (test_and_set_bit(ASYNCB_INITIALIZED, &self->port.flags)) {
+ if (tty_port_initialized(&self->port)) {
pr_debug("%s(), already open so break out!\n", __func__);
return 0;
}
+ tty_port_set_initialized(&self->port, 1);
/* Register with IrCOMM */
irda_notify_init(&notify);
@@ -257,7 +258,7 @@ static int ircomm_tty_startup(struct ircomm_tty_cb *self)
return 0;
err:
- clear_bit(ASYNCB_INITIALIZED, &self->port.flags);
+ tty_port_set_initialized(&self->port, 0);
return ret;
}
@@ -280,8 +281,8 @@ static int ircomm_tty_block_til_ready(struct ircomm_tty_cb *self,
* If non-blocking mode is set, or the port is not enabled,
* then make the check up front and then exit.
*/
- if (test_bit(TTY_IO_ERROR, &tty->flags)) {
- port->flags |= ASYNC_NORMAL_ACTIVE;
+ if (tty_io_error(tty)) {
+ tty_port_set_active(port, 1);
return 0;
}
@@ -289,7 +290,7 @@ static int ircomm_tty_block_til_ready(struct ircomm_tty_cb *self,
/* nonblock mode is set */
if (C_BAUD(tty))
tty_port_raise_dtr_rts(port);
- port->flags |= ASYNC_NORMAL_ACTIVE;
+ tty_port_set_active(port, 1);
pr_debug("%s(), O_NONBLOCK requested!\n", __func__);
return 0;
}
@@ -318,13 +319,12 @@ static int ircomm_tty_block_til_ready(struct ircomm_tty_cb *self,
spin_unlock_irqrestore(&port->lock, flags);
while (1) {
- if (C_BAUD(tty) && test_bit(ASYNCB_INITIALIZED, &port->flags))
+ if (C_BAUD(tty) && tty_port_initialized(port))
tty_port_raise_dtr_rts(port);
set_current_state(TASK_INTERRUPTIBLE);
- if (tty_hung_up_p(filp) ||
- !test_bit(ASYNCB_INITIALIZED, &port->flags)) {
+ if (tty_hung_up_p(filp) || !tty_port_initialized(port)) {
retval = (port->flags & ASYNC_HUP_NOTIFY) ?
-EAGAIN : -ERESTARTSYS;
break;
@@ -365,7 +365,7 @@ static int ircomm_tty_block_til_ready(struct ircomm_tty_cb *self,
__FILE__, __LINE__, tty->driver->name, port->count);
if (!retval)
- port->flags |= ASYNC_NORMAL_ACTIVE;
+ tty_port_set_active(port, 1);
return retval;
}
@@ -876,8 +876,9 @@ static void ircomm_tty_shutdown(struct ircomm_tty_cb *self)
IRDA_ASSERT(self != NULL, return;);
IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return;);
- if (!test_and_clear_bit(ASYNCB_INITIALIZED, &self->port.flags))
+ if (!tty_port_initialized(&self->port))
return;
+ tty_port_set_initialized(&self->port, 0);
ircomm_tty_detach_cable(self);
@@ -925,7 +926,6 @@ static void ircomm_tty_hangup(struct tty_struct *tty)
ircomm_tty_shutdown(self);
spin_lock_irqsave(&port->lock, flags);
- port->flags &= ~ASYNC_NORMAL_ACTIVE;
if (port->tty) {
set_bit(TTY_IO_ERROR, &port->tty->flags);
tty_kref_put(port->tty);
@@ -933,6 +933,7 @@ static void ircomm_tty_hangup(struct tty_struct *tty)
port->tty = NULL;
port->count = 0;
spin_unlock_irqrestore(&port->lock, flags);
+ tty_port_set_active(port, 0);
wake_up_interruptible(&port->open_wait);
}
@@ -999,7 +1000,7 @@ void ircomm_tty_check_modem_status(struct ircomm_tty_cb *self)
if (status & IRCOMM_DCE_DELTA_ANY) {
/*wake_up_interruptible(&self->delta_msr_wait);*/
}
- if ((self->port.flags & ASYNC_CHECK_CD) && (status & IRCOMM_DELTA_CD)) {
+ if (tty_port_check_carrier(&self->port) && (status & IRCOMM_DELTA_CD)) {
pr_debug("%s(), ircomm%d CD now %s...\n", __func__ , self->line,
(status & IRCOMM_CD) ? "on" : "off");
@@ -1255,11 +1256,11 @@ static void ircomm_tty_line_info(struct ircomm_tty_cb *self, struct seq_file *m)
seq_printf(m, "%cASYNC_CTS_FLOW", sep);
sep = '|';
}
- if (self->port.flags & ASYNC_CHECK_CD) {
+ if (tty_port_check_carrier(&self->port)) {
seq_printf(m, "%cASYNC_CHECK_CD", sep);
sep = '|';
}
- if (self->port.flags & ASYNC_INITIALIZED) {
+ if (tty_port_initialized(&self->port)) {
seq_printf(m, "%cASYNC_INITIALIZED", sep);
sep = '|';
}
@@ -1267,7 +1268,7 @@ static void ircomm_tty_line_info(struct ircomm_tty_cb *self, struct seq_file *m)
seq_printf(m, "%cASYNC_LOW_LATENCY", sep);
sep = '|';
}
- if (self->port.flags & ASYNC_NORMAL_ACTIVE) {
+ if (tty_port_active(&self->port)) {
seq_printf(m, "%cASYNC_NORMAL_ACTIVE", sep);
sep = '|';
}
diff --git a/net/irda/ircomm/ircomm_tty_attach.c b/net/irda/ircomm/ircomm_tty_attach.c
index 61137f8b5..0a411019c 100644
--- a/net/irda/ircomm/ircomm_tty_attach.c
+++ b/net/irda/ircomm/ircomm_tty_attach.c
@@ -968,7 +968,7 @@ static int ircomm_tty_state_ready(struct ircomm_tty_cb *self,
ircomm_tty_next_state(self, IRCOMM_TTY_SEARCH);
ircomm_tty_start_watchdog_timer(self, 3*HZ);
- if (self->port.flags & ASYNC_CHECK_CD) {
+ if (tty_port_check_carrier(&self->port)) {
/* Drop carrier */
self->settings.dce = IRCOMM_DELTA_CD;
ircomm_tty_check_modem_status(self);
diff --git a/net/irda/ircomm/ircomm_tty_ioctl.c b/net/irda/ircomm/ircomm_tty_ioctl.c
index d3687aaa2..d4fdf8f7b 100644
--- a/net/irda/ircomm/ircomm_tty_ioctl.c
+++ b/net/irda/ircomm/ircomm_tty_ioctl.c
@@ -86,21 +86,17 @@ static void ircomm_tty_change_speed(struct ircomm_tty_cb *self,
ircomm_param_request(self, IRCOMM_DATA_RATE, FALSE);
/* CTS flow control flag and modem status interrupts */
+ tty_port_set_cts_flow(&self->port, cflag & CRTSCTS);
if (cflag & CRTSCTS) {
- self->port.flags |= ASYNC_CTS_FLOW;
self->settings.flow_control |= IRCOMM_RTS_CTS_IN;
/* This got me. Bummer. Jean II */
if (self->service_type == IRCOMM_3_WIRE_RAW)
net_warn_ratelimited("%s(), enabling RTS/CTS on link that doesn't support it (3-wire-raw)\n",
__func__);
} else {
- self->port.flags &= ~ASYNC_CTS_FLOW;
self->settings.flow_control &= ~IRCOMM_RTS_CTS_IN;
}
- if (cflag & CLOCAL)
- self->port.flags &= ~ASYNC_CHECK_CD;
- else
- self->port.flags |= ASYNC_CHECK_CD;
+ tty_port_set_check_carrier(&self->port, ~cflag & CLOCAL);
#if 0
/*
* Set up parity check flag
@@ -166,7 +162,7 @@ void ircomm_tty_set_termios(struct tty_struct *tty,
/* Handle transition away from B0 status */
if (!(old_termios->c_cflag & CBAUD) && (cflag & CBAUD)) {
self->settings.dte |= IRCOMM_DTR;
- if (!C_CRTSCTS(tty) || !test_bit(TTY_THROTTLED, &tty->flags))
+ if (!C_CRTSCTS(tty) || !tty_throttled(tty))
self->settings.dte |= IRCOMM_RTS;
ircomm_param_request(self, IRCOMM_DTE, TRUE);
}
@@ -190,7 +186,7 @@ int ircomm_tty_tiocmget(struct tty_struct *tty)
struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) tty->driver_data;
unsigned int result;
- if (tty->flags & (1 << TTY_IO_ERROR))
+ if (tty_io_error(tty))
return -EIO;
result = ((self->settings.dte & IRCOMM_RTS) ? TIOCM_RTS : 0)
@@ -213,7 +209,7 @@ int ircomm_tty_tiocmset(struct tty_struct *tty,
{
struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) tty->driver_data;
- if (tty->flags & (1 << TTY_IO_ERROR))
+ if (tty_io_error(tty))
return -EIO;
IRDA_ASSERT(self != NULL, return -1;);
@@ -328,7 +324,7 @@ static int ircomm_tty_set_serial_info(struct ircomm_tty_cb *self,
check_and_exit:
- if (self->flags & ASYNC_INITIALIZED) {
+ if (tty_port_initialized(self)) {
if (((old_state.flags & ASYNC_SPD_MASK) !=
(self->flags & ASYNC_SPD_MASK)) ||
(old_driver.custom_divisor != driver->custom_divisor)) {
@@ -362,7 +358,7 @@ int ircomm_tty_ioctl(struct tty_struct *tty,
if ((cmd != TIOCGSERIAL) && (cmd != TIOCSSERIAL) &&
(cmd != TIOCSERCONFIG) && (cmd != TIOCSERGSTRUCT) &&
(cmd != TIOCMIWAIT) && (cmd != TIOCGICOUNT)) {
- if (tty->flags & (1 << TTY_IO_ERROR))
+ if (tty_io_error(tty))
return -EIO;
}
diff --git a/net/irda/irlan/irlan_eth.c b/net/irda/irlan/irlan_eth.c
index fcfbe5794..d8b726728 100644
--- a/net/irda/irlan/irlan_eth.c
+++ b/net/irda/irlan/irlan_eth.c
@@ -181,7 +181,7 @@ static netdev_tx_t irlan_eth_xmit(struct sk_buff *skb,
skb = new_skb;
}
- dev->trans_start = jiffies;
+ netif_trans_update(dev);
len = skb->len;
/* Now queue the packet in the transport layer */
diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c
index 40662d732..0b68ba730 100644
--- a/net/kcm/kcmsock.c
+++ b/net/kcm/kcmsock.c
@@ -1483,7 +1483,7 @@ static ssize_t kcm_splice_read(struct socket *sock, loff_t *ppos,
long timeo;
struct kcm_rx_msg *rxm;
int err = 0;
- size_t copied;
+ ssize_t copied;
struct sk_buff *skb;
/* Only support splice for SOCKSEQPACKET */
diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c
index cd479903d..6c54e03fe 100644
--- a/net/l2tp/l2tp_ip6.c
+++ b/net/l2tp/l2tp_ip6.c
@@ -128,6 +128,7 @@ static inline struct sock *l2tp_ip6_bind_lookup(struct net *net,
*/
static int l2tp_ip6_recv(struct sk_buff *skb)
{
+ struct net *net = dev_net(skb->dev);
struct sock *sk;
u32 session_id;
u32 tunnel_id;
@@ -154,7 +155,7 @@ static int l2tp_ip6_recv(struct sk_buff *skb)
}
/* Ok, this is a data packet. Lookup the session. */
- session = l2tp_session_find(&init_net, NULL, session_id);
+ session = l2tp_session_find(net, NULL, session_id);
if (session == NULL)
goto discard;
@@ -188,14 +189,14 @@ pass_up:
goto discard;
tunnel_id = ntohl(*(__be32 *) &skb->data[4]);
- tunnel = l2tp_tunnel_find(&init_net, tunnel_id);
+ tunnel = l2tp_tunnel_find(net, tunnel_id);
if (tunnel != NULL)
sk = tunnel->sock;
else {
struct ipv6hdr *iph = ipv6_hdr(skb);
read_lock_bh(&l2tp_ip6_lock);
- sk = __l2tp_ip6_bind_lookup(&init_net, &iph->daddr,
+ sk = __l2tp_ip6_bind_lookup(net, &iph->daddr,
0, tunnel_id);
read_unlock_bh(&l2tp_ip6_lock);
}
@@ -263,6 +264,7 @@ static int l2tp_ip6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
struct inet_sock *inet = inet_sk(sk);
struct ipv6_pinfo *np = inet6_sk(sk);
struct sockaddr_l2tpip6 *addr = (struct sockaddr_l2tpip6 *) uaddr;
+ struct net *net = sock_net(sk);
__be32 v4addr = 0;
int addr_type;
int err;
@@ -286,7 +288,7 @@ static int l2tp_ip6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
err = -EADDRINUSE;
read_lock_bh(&l2tp_ip6_lock);
- if (__l2tp_ip6_bind_lookup(&init_net, &addr->l2tp_addr,
+ if (__l2tp_ip6_bind_lookup(net, &addr->l2tp_addr,
sk->sk_bound_dev_if, addr->l2tp_conn_id))
goto out_in_use;
read_unlock_bh(&l2tp_ip6_lock);
@@ -456,7 +458,7 @@ static int l2tp_ip6_backlog_recv(struct sock *sk, struct sk_buff *skb)
return 0;
drop:
- IP_INC_STATS(&init_net, IPSTATS_MIB_INDISCARDS);
+ IP_INC_STATS(sock_net(sk), IPSTATS_MIB_INDISCARDS);
kfree_skb(skb);
return -1;
}
@@ -494,10 +496,9 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
struct ip6_flowlabel *flowlabel = NULL;
struct dst_entry *dst = NULL;
struct flowi6 fl6;
+ struct sockcm_cookie sockc_unused = {0};
+ struct ipcm6_cookie ipc6;
int addr_len = msg->msg_namelen;
- int hlimit = -1;
- int tclass = -1;
- int dontfrag = -1;
int transhdrlen = 4; /* zero session-id */
int ulen = len + transhdrlen;
int err;
@@ -519,6 +520,10 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
fl6.flowi6_mark = sk->sk_mark;
+ ipc6.hlimit = -1;
+ ipc6.tclass = -1;
+ ipc6.dontfrag = -1;
+
if (lsa) {
if (addr_len < SIN6_LEN_RFC2133)
return -EINVAL;
@@ -563,9 +568,10 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
opt = &opt_space;
memset(opt, 0, sizeof(struct ipv6_txoptions));
opt->tot_len = sizeof(struct ipv6_txoptions);
+ ipc6.opt = opt;
- err = ip6_datagram_send_ctl(sock_net(sk), sk, msg, &fl6, opt,
- &hlimit, &tclass, &dontfrag);
+ err = ip6_datagram_send_ctl(sock_net(sk), sk, msg, &fl6, &ipc6,
+ &sockc_unused);
if (err < 0) {
fl6_sock_release(flowlabel);
return err;
@@ -586,6 +592,7 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
if (flowlabel)
opt = fl6_merge_options(&opt_space, flowlabel, opt);
opt = ipv6_fixup_options(&opt_space, opt);
+ ipc6.opt = opt;
fl6.flowi6_proto = sk->sk_protocol;
if (!ipv6_addr_any(daddr))
@@ -610,14 +617,14 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
goto out;
}
- if (hlimit < 0)
- hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst);
+ if (ipc6.hlimit < 0)
+ ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst);
- if (tclass < 0)
- tclass = np->tclass;
+ if (ipc6.tclass < 0)
+ ipc6.tclass = np->tclass;
- if (dontfrag < 0)
- dontfrag = np->dontfrag;
+ if (ipc6.dontfrag < 0)
+ ipc6.dontfrag = np->dontfrag;
if (msg->msg_flags & MSG_CONFIRM)
goto do_confirm;
@@ -625,9 +632,9 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
back_from_confirm:
lock_sock(sk);
err = ip6_append_data(sk, ip_generic_getfrag, msg,
- ulen, transhdrlen, hlimit, tclass, opt,
+ ulen, transhdrlen, &ipc6,
&fl6, (struct rt6_info *)dst,
- msg->msg_flags, dontfrag);
+ msg->msg_flags, &sockc_unused);
if (err)
ip6_flush_pending_frames(sk);
else if (!(msg->msg_flags & MSG_MORE))
diff --git a/net/l2tp/l2tp_netlink.c b/net/l2tp/l2tp_netlink.c
index 2caaa84ce..1d02e8d20 100644
--- a/net/l2tp/l2tp_netlink.c
+++ b/net/l2tp/l2tp_netlink.c
@@ -346,22 +346,30 @@ static int l2tp_nl_tunnel_send(struct sk_buff *skb, u32 portid, u32 seq, int fla
if (nest == NULL)
goto nla_put_failure;
- if (nla_put_u64(skb, L2TP_ATTR_TX_PACKETS,
- atomic_long_read(&tunnel->stats.tx_packets)) ||
- nla_put_u64(skb, L2TP_ATTR_TX_BYTES,
- atomic_long_read(&tunnel->stats.tx_bytes)) ||
- nla_put_u64(skb, L2TP_ATTR_TX_ERRORS,
- atomic_long_read(&tunnel->stats.tx_errors)) ||
- nla_put_u64(skb, L2TP_ATTR_RX_PACKETS,
- atomic_long_read(&tunnel->stats.rx_packets)) ||
- nla_put_u64(skb, L2TP_ATTR_RX_BYTES,
- atomic_long_read(&tunnel->stats.rx_bytes)) ||
- nla_put_u64(skb, L2TP_ATTR_RX_SEQ_DISCARDS,
- atomic_long_read(&tunnel->stats.rx_seq_discards)) ||
- nla_put_u64(skb, L2TP_ATTR_RX_OOS_PACKETS,
- atomic_long_read(&tunnel->stats.rx_oos_packets)) ||
- nla_put_u64(skb, L2TP_ATTR_RX_ERRORS,
- atomic_long_read(&tunnel->stats.rx_errors)))
+ if (nla_put_u64_64bit(skb, L2TP_ATTR_TX_PACKETS,
+ atomic_long_read(&tunnel->stats.tx_packets),
+ L2TP_ATTR_STATS_PAD) ||
+ nla_put_u64_64bit(skb, L2TP_ATTR_TX_BYTES,
+ atomic_long_read(&tunnel->stats.tx_bytes),
+ L2TP_ATTR_STATS_PAD) ||
+ nla_put_u64_64bit(skb, L2TP_ATTR_TX_ERRORS,
+ atomic_long_read(&tunnel->stats.tx_errors),
+ L2TP_ATTR_STATS_PAD) ||
+ nla_put_u64_64bit(skb, L2TP_ATTR_RX_PACKETS,
+ atomic_long_read(&tunnel->stats.rx_packets),
+ L2TP_ATTR_STATS_PAD) ||
+ nla_put_u64_64bit(skb, L2TP_ATTR_RX_BYTES,
+ atomic_long_read(&tunnel->stats.rx_bytes),
+ L2TP_ATTR_STATS_PAD) ||
+ nla_put_u64_64bit(skb, L2TP_ATTR_RX_SEQ_DISCARDS,
+ atomic_long_read(&tunnel->stats.rx_seq_discards),
+ L2TP_ATTR_STATS_PAD) ||
+ nla_put_u64_64bit(skb, L2TP_ATTR_RX_OOS_PACKETS,
+ atomic_long_read(&tunnel->stats.rx_oos_packets),
+ L2TP_ATTR_STATS_PAD) ||
+ nla_put_u64_64bit(skb, L2TP_ATTR_RX_ERRORS,
+ atomic_long_read(&tunnel->stats.rx_errors),
+ L2TP_ATTR_STATS_PAD))
goto nla_put_failure;
nla_nest_end(skb, nest);
@@ -746,29 +754,38 @@ static int l2tp_nl_session_send(struct sk_buff *skb, u32 portid, u32 seq, int fl
nla_put_u8(skb, L2TP_ATTR_USING_IPSEC, 1)) ||
#endif
(session->reorder_timeout &&
- nla_put_msecs(skb, L2TP_ATTR_RECV_TIMEOUT, session->reorder_timeout)))
+ nla_put_msecs(skb, L2TP_ATTR_RECV_TIMEOUT,
+ session->reorder_timeout, L2TP_ATTR_PAD)))
goto nla_put_failure;
nest = nla_nest_start(skb, L2TP_ATTR_STATS);
if (nest == NULL)
goto nla_put_failure;
- if (nla_put_u64(skb, L2TP_ATTR_TX_PACKETS,
- atomic_long_read(&session->stats.tx_packets)) ||
- nla_put_u64(skb, L2TP_ATTR_TX_BYTES,
- atomic_long_read(&session->stats.tx_bytes)) ||
- nla_put_u64(skb, L2TP_ATTR_TX_ERRORS,
- atomic_long_read(&session->stats.tx_errors)) ||
- nla_put_u64(skb, L2TP_ATTR_RX_PACKETS,
- atomic_long_read(&session->stats.rx_packets)) ||
- nla_put_u64(skb, L2TP_ATTR_RX_BYTES,
- atomic_long_read(&session->stats.rx_bytes)) ||
- nla_put_u64(skb, L2TP_ATTR_RX_SEQ_DISCARDS,
- atomic_long_read(&session->stats.rx_seq_discards)) ||
- nla_put_u64(skb, L2TP_ATTR_RX_OOS_PACKETS,
- atomic_long_read(&session->stats.rx_oos_packets)) ||
- nla_put_u64(skb, L2TP_ATTR_RX_ERRORS,
- atomic_long_read(&session->stats.rx_errors)))
+ if (nla_put_u64_64bit(skb, L2TP_ATTR_TX_PACKETS,
+ atomic_long_read(&session->stats.tx_packets),
+ L2TP_ATTR_STATS_PAD) ||
+ nla_put_u64_64bit(skb, L2TP_ATTR_TX_BYTES,
+ atomic_long_read(&session->stats.tx_bytes),
+ L2TP_ATTR_STATS_PAD) ||
+ nla_put_u64_64bit(skb, L2TP_ATTR_TX_ERRORS,
+ atomic_long_read(&session->stats.tx_errors),
+ L2TP_ATTR_STATS_PAD) ||
+ nla_put_u64_64bit(skb, L2TP_ATTR_RX_PACKETS,
+ atomic_long_read(&session->stats.rx_packets),
+ L2TP_ATTR_STATS_PAD) ||
+ nla_put_u64_64bit(skb, L2TP_ATTR_RX_BYTES,
+ atomic_long_read(&session->stats.rx_bytes),
+ L2TP_ATTR_STATS_PAD) ||
+ nla_put_u64_64bit(skb, L2TP_ATTR_RX_SEQ_DISCARDS,
+ atomic_long_read(&session->stats.rx_seq_discards),
+ L2TP_ATTR_STATS_PAD) ||
+ nla_put_u64_64bit(skb, L2TP_ATTR_RX_OOS_PACKETS,
+ atomic_long_read(&session->stats.rx_oos_packets),
+ L2TP_ATTR_STATS_PAD) ||
+ nla_put_u64_64bit(skb, L2TP_ATTR_RX_ERRORS,
+ atomic_long_read(&session->stats.rx_errors),
+ L2TP_ATTR_STATS_PAD))
goto nla_put_failure;
nla_nest_end(skb, nest);
diff --git a/net/l3mdev/l3mdev.c b/net/l3mdev/l3mdev.c
index e925037fa..6651a78e1 100644
--- a/net/l3mdev/l3mdev.c
+++ b/net/l3mdev/l3mdev.c
@@ -97,3 +97,66 @@ u32 l3mdev_fib_table_by_index(struct net *net, int ifindex)
return tb_id;
}
EXPORT_SYMBOL_GPL(l3mdev_fib_table_by_index);
+
+/**
+ * l3mdev_get_rt6_dst - IPv6 route lookup based on flow. Returns
+ * cached route for L3 master device if relevant
+ * to flow
+ * @net: network namespace for device index lookup
+ * @fl6: IPv6 flow struct for lookup
+ */
+
+struct dst_entry *l3mdev_get_rt6_dst(struct net *net,
+ const struct flowi6 *fl6)
+{
+ struct dst_entry *dst = NULL;
+ struct net_device *dev;
+
+ if (fl6->flowi6_oif) {
+ rcu_read_lock();
+
+ dev = dev_get_by_index_rcu(net, fl6->flowi6_oif);
+ if (dev && netif_is_l3_slave(dev))
+ dev = netdev_master_upper_dev_get_rcu(dev);
+
+ if (dev && netif_is_l3_master(dev) &&
+ dev->l3mdev_ops->l3mdev_get_rt6_dst)
+ dst = dev->l3mdev_ops->l3mdev_get_rt6_dst(dev, fl6);
+
+ rcu_read_unlock();
+ }
+
+ return dst;
+}
+EXPORT_SYMBOL_GPL(l3mdev_get_rt6_dst);
+
+/**
+ * l3mdev_get_saddr - get source address for a flow based on an interface
+ * enslaved to an L3 master device
+ * @net: network namespace for device index lookup
+ * @ifindex: Interface index
+ * @fl4: IPv4 flow struct
+ */
+
+int l3mdev_get_saddr(struct net *net, int ifindex, struct flowi4 *fl4)
+{
+ struct net_device *dev;
+ int rc = 0;
+
+ if (ifindex) {
+ rcu_read_lock();
+
+ dev = dev_get_by_index_rcu(net, ifindex);
+ if (dev && netif_is_l3_slave(dev))
+ dev = netdev_master_upper_dev_get_rcu(dev);
+
+ if (dev && netif_is_l3_master(dev) &&
+ dev->l3mdev_ops->l3mdev_get_saddr)
+ rc = dev->l3mdev_ops->l3mdev_get_saddr(dev, fl4);
+
+ rcu_read_unlock();
+ }
+
+ return rc;
+}
+EXPORT_SYMBOL_GPL(l3mdev_get_saddr);
diff --git a/net/lapb/lapb_in.c b/net/lapb/lapb_in.c
index 5dba89913..182470847 100644
--- a/net/lapb/lapb_in.c
+++ b/net/lapb/lapb_in.c
@@ -444,10 +444,9 @@ static void lapb_state3_machine(struct lapb_cb *lapb, struct sk_buff *skb,
break;
case LAPB_FRMR:
- lapb_dbg(1, "(%p) S3 RX FRMR(%d) %02X %02X %02X %02X %02X\n",
+ lapb_dbg(1, "(%p) S3 RX FRMR(%d) %5ph\n",
lapb->dev, frame->pf,
- skb->data[0], skb->data[1], skb->data[2],
- skb->data[3], skb->data[4]);
+ skb->data);
lapb_establish_data_link(lapb);
lapb_dbg(0, "(%p) S3 -> S1\n", lapb->dev);
lapb_requeue_frames(lapb);
diff --git a/net/lapb/lapb_out.c b/net/lapb/lapb_out.c
index ba4d015bd..482c94d9d 100644
--- a/net/lapb/lapb_out.c
+++ b/net/lapb/lapb_out.c
@@ -148,9 +148,7 @@ void lapb_transmit_buffer(struct lapb_cb *lapb, struct sk_buff *skb, int type)
}
}
- lapb_dbg(2, "(%p) S%d TX %02X %02X %02X\n",
- lapb->dev, lapb->state,
- skb->data[0], skb->data[1], skb->data[2]);
+ lapb_dbg(2, "(%p) S%d TX %3ph\n", lapb->dev, lapb->state, skb->data);
if (!lapb_data_transmit(lapb, skb))
kfree_skb(skb);
diff --git a/net/lapb/lapb_subr.c b/net/lapb/lapb_subr.c
index 9d0a426ec..3c1914df6 100644
--- a/net/lapb/lapb_subr.c
+++ b/net/lapb/lapb_subr.c
@@ -113,9 +113,7 @@ int lapb_decode(struct lapb_cb *lapb, struct sk_buff *skb,
{
frame->type = LAPB_ILLEGAL;
- lapb_dbg(2, "(%p) S%d RX %02X %02X %02X\n",
- lapb->dev, lapb->state,
- skb->data[0], skb->data[1], skb->data[2]);
+ lapb_dbg(2, "(%p) S%d RX %3ph\n", lapb->dev, lapb->state, skb->data);
/* We always need to look at 2 bytes, sometimes we need
* to look at 3 and those cases are handled below.
@@ -284,10 +282,9 @@ void lapb_transmit_frmr(struct lapb_cb *lapb)
dptr++;
*dptr++ = lapb->frmr_type;
- lapb_dbg(1, "(%p) S%d TX FRMR %02X %02X %02X %02X %02X\n",
+ lapb_dbg(1, "(%p) S%d TX FRMR %5ph\n",
lapb->dev, lapb->state,
- skb->data[1], skb->data[2], skb->data[3],
- skb->data[4], skb->data[5]);
+ &skb->data[1]);
} else {
dptr = skb_put(skb, 4);
*dptr++ = LAPB_FRMR;
@@ -299,9 +296,8 @@ void lapb_transmit_frmr(struct lapb_cb *lapb)
dptr++;
*dptr++ = lapb->frmr_type;
- lapb_dbg(1, "(%p) S%d TX FRMR %02X %02X %02X\n",
- lapb->dev, lapb->state, skb->data[1],
- skb->data[2], skb->data[3]);
+ lapb_dbg(1, "(%p) S%d TX FRMR %3ph\n",
+ lapb->dev, lapb->state, &skb->data[1]);
}
lapb_transmit_buffer(lapb, skb, LAPB_RESPONSE);
diff --git a/net/llc/llc_proc.c b/net/llc/llc_proc.c
index 1a3c7e0f5..29c509c54 100644
--- a/net/llc/llc_proc.c
+++ b/net/llc/llc_proc.c
@@ -195,7 +195,7 @@ static int llc_seq_core_show(struct seq_file *seq, void *v)
timer_pending(&llc->pf_cycle_timer.timer),
timer_pending(&llc->rej_sent_timer.timer),
timer_pending(&llc->busy_state_timer.timer),
- !!sk->sk_backlog.tail, !!sock_owned_by_user(sk));
+ !!sk->sk_backlog.tail, !!sk->sk_lock.owned);
out:
return 0;
}
diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c
index 4932e9f24..42fa81031 100644
--- a/net/mac80211/agg-tx.c
+++ b/net/mac80211/agg-tx.c
@@ -935,6 +935,7 @@ void ieee80211_process_addba_resp(struct ieee80211_local *local,
size_t len)
{
struct tid_ampdu_tx *tid_tx;
+ struct ieee80211_txq *txq;
u16 capab, tid;
u8 buf_size;
bool amsdu;
@@ -945,6 +946,10 @@ void ieee80211_process_addba_resp(struct ieee80211_local *local,
buf_size = (capab & IEEE80211_ADDBA_PARAM_BUF_SIZE_MASK) >> 6;
buf_size = min(buf_size, local->hw.max_tx_aggregation_subframes);
+ txq = sta->sta.txq[tid];
+ if (!amsdu && txq)
+ set_bit(IEEE80211_TXQ_NO_AMSDU, &to_txq_info(txq)->flags);
+
mutex_lock(&sta->ampdu_mlme.mtx);
tid_tx = rcu_dereference_protected_tid_tx(sta, tid);
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index fe1704c4e..0c12e4001 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -65,11 +65,13 @@ static int ieee80211_change_iface(struct wiphy *wiphy,
return ret;
if (type == NL80211_IFTYPE_AP_VLAN &&
- params && params->use_4addr == 0)
+ params && params->use_4addr == 0) {
RCU_INIT_POINTER(sdata->u.vlan.sta, NULL);
- else if (type == NL80211_IFTYPE_STATION &&
- params && params->use_4addr >= 0)
+ ieee80211_check_fast_rx_iface(sdata);
+ } else if (type == NL80211_IFTYPE_STATION &&
+ params && params->use_4addr >= 0) {
sdata->u.mgd.use_4addr = params->use_4addr;
+ }
if (sdata->vif.type == NL80211_IFTYPE_MONITOR && flags) {
struct ieee80211_local *local = sdata->local;
@@ -732,6 +734,7 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev,
sdata->vif.bss_conf.beacon_int = params->beacon_interval;
sdata->vif.bss_conf.dtim_period = params->dtim_period;
sdata->vif.bss_conf.enable_beacon = true;
+ sdata->vif.bss_conf.allow_p2p_go_ps = sdata->vif.p2p;
sdata->vif.bss_conf.ssid_len = params->ssid_len;
if (params->ssid_len)
@@ -1046,7 +1049,7 @@ static int sta_apply_parameters(struct ieee80211_local *local,
int ret = 0;
struct ieee80211_supported_band *sband;
struct ieee80211_sub_if_data *sdata = sta->sdata;
- enum ieee80211_band band = ieee80211_get_sdata_band(sdata);
+ enum nl80211_band band = ieee80211_get_sdata_band(sdata);
u32 mask, set;
sband = local->hw.wiphy->bands[band];
@@ -1202,6 +1205,9 @@ static int sta_apply_parameters(struct ieee80211_local *local,
params->opmode_notif, band);
}
+ if (params->support_p2p_ps >= 0)
+ sta->sta.support_p2p_ps = params->support_p2p_ps;
+
if (ieee80211_vif_is_mesh(&sdata->vif))
sta_apply_mesh_params(local, sta, params);
@@ -1363,6 +1369,7 @@ static int ieee80211_change_station(struct wiphy *wiphy,
rcu_assign_pointer(vlansdata->u.vlan.sta, sta);
new_4addr = true;
+ __ieee80211_check_fast_rx_iface(vlansdata);
}
if (sta->sdata->vif.type == NL80211_IFTYPE_AP_VLAN &&
@@ -1499,7 +1506,7 @@ static void mpath_set_pinfo(struct mesh_path *mpath, u8 *next_hop,
memset(pinfo, 0, sizeof(*pinfo));
- pinfo->generation = mesh_paths_generation;
+ pinfo->generation = mpath->sdata->u.mesh.mesh_paths_generation;
pinfo->filled = MPATH_INFO_FRAME_QLEN |
MPATH_INFO_SN |
@@ -1577,7 +1584,7 @@ static void mpp_set_pinfo(struct mesh_path *mpath, u8 *mpp,
memset(pinfo, 0, sizeof(*pinfo));
memcpy(mpp, mpath->mpp, ETH_ALEN);
- pinfo->generation = mpp_paths_generation;
+ pinfo->generation = mpath->sdata->u.mesh.mpp_paths_generation;
}
static int ieee80211_get_mpp(struct wiphy *wiphy, struct net_device *dev,
@@ -1841,7 +1848,7 @@ static int ieee80211_change_bss(struct wiphy *wiphy,
struct bss_parameters *params)
{
struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
- enum ieee80211_band band;
+ enum nl80211_band band;
u32 changed = 0;
if (!sdata_dereference(sdata->u.ap.beacon, sdata))
@@ -1860,7 +1867,7 @@ static int ieee80211_change_bss(struct wiphy *wiphy,
}
if (!sdata->vif.bss_conf.use_short_slot &&
- band == IEEE80211_BAND_5GHZ) {
+ band == NL80211_BAND_5GHZ) {
sdata->vif.bss_conf.use_short_slot = true;
changed |= BSS_CHANGED_ERP_SLOT;
}
@@ -1885,6 +1892,7 @@ static int ieee80211_change_bss(struct wiphy *wiphy,
sdata->flags |= IEEE80211_SDATA_DONT_BRIDGE_PACKETS;
else
sdata->flags &= ~IEEE80211_SDATA_DONT_BRIDGE_PACKETS;
+ ieee80211_check_fast_rx_iface(sdata);
}
if (params->ht_opmode >= 0) {
@@ -2089,12 +2097,12 @@ static int ieee80211_leave_ocb(struct wiphy *wiphy, struct net_device *dev)
}
static int ieee80211_set_mcast_rate(struct wiphy *wiphy, struct net_device *dev,
- int rate[IEEE80211_NUM_BANDS])
+ int rate[NUM_NL80211_BANDS])
{
struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
memcpy(sdata->vif.bss_conf.mcast_rate, rate,
- sizeof(int) * IEEE80211_NUM_BANDS);
+ sizeof(int) * NUM_NL80211_BANDS);
return 0;
}
@@ -2499,7 +2507,7 @@ static int ieee80211_set_bitrate_mask(struct wiphy *wiphy,
return ret;
}
- for (i = 0; i < IEEE80211_NUM_BANDS; i++) {
+ for (i = 0; i < NUM_NL80211_BANDS; i++) {
struct ieee80211_supported_band *sband = wiphy->bands[i];
int j;
@@ -3127,7 +3135,7 @@ static int ieee80211_probe_client(struct wiphy *wiphy, struct net_device *dev,
struct ieee80211_tx_info *info;
struct sta_info *sta;
struct ieee80211_chanctx_conf *chanctx_conf;
- enum ieee80211_band band;
+ enum nl80211_band band;
int ret;
/* the lock is needed to assign the cookie later */
diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c
index 4ab5c522c..b251b2f7f 100644
--- a/net/mac80211/debugfs.c
+++ b/net/mac80211/debugfs.c
@@ -127,6 +127,9 @@ static const char *hw_flag_names[] = {
FLAG(BEACON_TX_STATUS),
FLAG(NEEDS_UNIQUE_STA_ADDR),
FLAG(SUPPORTS_REORDERING_BUFFER),
+ FLAG(USES_RSS),
+ FLAG(TX_AMSDU),
+ FLAG(TX_FRAG_LIST),
#undef FLAG
};
diff --git a/net/mac80211/debugfs_netdev.c b/net/mac80211/debugfs_netdev.c
index 37ea30e07..a5ba739cd 100644
--- a/net/mac80211/debugfs_netdev.c
+++ b/net/mac80211/debugfs_netdev.c
@@ -169,21 +169,21 @@ static ssize_t ieee80211_if_write_##name(struct file *file, \
IEEE80211_IF_FILE_R(name)
/* common attributes */
-IEEE80211_IF_FILE(rc_rateidx_mask_2ghz, rc_rateidx_mask[IEEE80211_BAND_2GHZ],
+IEEE80211_IF_FILE(rc_rateidx_mask_2ghz, rc_rateidx_mask[NL80211_BAND_2GHZ],
HEX);
-IEEE80211_IF_FILE(rc_rateidx_mask_5ghz, rc_rateidx_mask[IEEE80211_BAND_5GHZ],
+IEEE80211_IF_FILE(rc_rateidx_mask_5ghz, rc_rateidx_mask[NL80211_BAND_5GHZ],
HEX);
IEEE80211_IF_FILE(rc_rateidx_mcs_mask_2ghz,
- rc_rateidx_mcs_mask[IEEE80211_BAND_2GHZ], HEXARRAY);
+ rc_rateidx_mcs_mask[NL80211_BAND_2GHZ], HEXARRAY);
IEEE80211_IF_FILE(rc_rateidx_mcs_mask_5ghz,
- rc_rateidx_mcs_mask[IEEE80211_BAND_5GHZ], HEXARRAY);
+ rc_rateidx_mcs_mask[NL80211_BAND_5GHZ], HEXARRAY);
static ssize_t ieee80211_if_fmt_rc_rateidx_vht_mcs_mask_2ghz(
const struct ieee80211_sub_if_data *sdata,
char *buf, int buflen)
{
int i, len = 0;
- const u16 *mask = sdata->rc_rateidx_vht_mcs_mask[IEEE80211_BAND_2GHZ];
+ const u16 *mask = sdata->rc_rateidx_vht_mcs_mask[NL80211_BAND_2GHZ];
for (i = 0; i < NL80211_VHT_NSS_MAX; i++)
len += scnprintf(buf + len, buflen - len, "%04x ", mask[i]);
@@ -199,7 +199,7 @@ static ssize_t ieee80211_if_fmt_rc_rateidx_vht_mcs_mask_5ghz(
char *buf, int buflen)
{
int i, len = 0;
- const u16 *mask = sdata->rc_rateidx_vht_mcs_mask[IEEE80211_BAND_5GHZ];
+ const u16 *mask = sdata->rc_rateidx_vht_mcs_mask[NL80211_BAND_5GHZ];
for (i = 0; i < NL80211_VHT_NSS_MAX; i++)
len += scnprintf(buf + len, buflen - len, "%04x ", mask[i]);
diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c
index a39512f09..33dfcbc2b 100644
--- a/net/mac80211/debugfs_sta.c
+++ b/net/mac80211/debugfs_sta.c
@@ -3,6 +3,7 @@
* Copyright (c) 2006 Jiri Benc <jbenc@suse.cz>
* Copyright 2007 Johannes Berg <johannes@sipsolutions.net>
* Copyright 2013-2014 Intel Mobile Communications GmbH
+ * Copyright(c) 2016 Intel Deutschland GmbH
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -51,31 +52,54 @@ static const struct file_operations sta_ ##name## _ops = { \
STA_FILE(aid, sta.aid, D);
+static const char * const sta_flag_names[] = {
+#define FLAG(F) [WLAN_STA_##F] = #F
+ FLAG(AUTH),
+ FLAG(ASSOC),
+ FLAG(PS_STA),
+ FLAG(AUTHORIZED),
+ FLAG(SHORT_PREAMBLE),
+ FLAG(WDS),
+ FLAG(CLEAR_PS_FILT),
+ FLAG(MFP),
+ FLAG(BLOCK_BA),
+ FLAG(PS_DRIVER),
+ FLAG(PSPOLL),
+ FLAG(TDLS_PEER),
+ FLAG(TDLS_PEER_AUTH),
+ FLAG(TDLS_INITIATOR),
+ FLAG(TDLS_CHAN_SWITCH),
+ FLAG(TDLS_OFF_CHANNEL),
+ FLAG(TDLS_WIDER_BW),
+ FLAG(UAPSD),
+ FLAG(SP),
+ FLAG(4ADDR_EVENT),
+ FLAG(INSERTED),
+ FLAG(RATE_CONTROL),
+ FLAG(TOFFSET_KNOWN),
+ FLAG(MPSP_OWNER),
+ FLAG(MPSP_RECIPIENT),
+ FLAG(PS_DELIVER),
+#undef FLAG
+};
+
static ssize_t sta_flags_read(struct file *file, char __user *userbuf,
size_t count, loff_t *ppos)
{
- char buf[121];
+ char buf[16 * NUM_WLAN_STA_FLAGS], *pos = buf;
+ char *end = buf + sizeof(buf) - 1;
struct sta_info *sta = file->private_data;
+ unsigned int flg;
+
+ BUILD_BUG_ON(ARRAY_SIZE(sta_flag_names) != NUM_WLAN_STA_FLAGS);
+
+ for (flg = 0; flg < NUM_WLAN_STA_FLAGS; flg++) {
+ if (test_sta_flag(sta, flg))
+ pos += scnprintf(pos, end - pos, "%s\n",
+ sta_flag_names[flg]);
+ }
-#define TEST(flg) \
- test_sta_flag(sta, WLAN_STA_##flg) ? #flg "\n" : ""
-
- int res = scnprintf(buf, sizeof(buf),
- "%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
- TEST(AUTH), TEST(ASSOC), TEST(PS_STA),
- TEST(PS_DRIVER), TEST(AUTHORIZED),
- TEST(SHORT_PREAMBLE),
- sta->sta.wme ? "WME\n" : "",
- TEST(WDS), TEST(CLEAR_PS_FILT),
- TEST(MFP), TEST(BLOCK_BA), TEST(PSPOLL),
- TEST(UAPSD), TEST(SP), TEST(TDLS_PEER),
- TEST(TDLS_PEER_AUTH), TEST(TDLS_INITIATOR),
- TEST(TDLS_CHAN_SWITCH), TEST(TDLS_OFF_CHANNEL),
- TEST(4ADDR_EVENT), TEST(INSERTED),
- TEST(RATE_CONTROL), TEST(TOFFSET_KNOWN),
- TEST(MPSP_OWNER), TEST(MPSP_RECIPIENT));
-#undef TEST
- return simple_read_from_buffer(userbuf, count, ppos, buf, res);
+ return simple_read_from_buffer(userbuf, count, ppos, buf, strlen(buf));
}
STA_OPS(flags);
@@ -151,11 +175,12 @@ static ssize_t sta_agg_status_read(struct file *file, char __user *userbuf,
static ssize_t sta_agg_status_write(struct file *file, const char __user *userbuf,
size_t count, loff_t *ppos)
{
- char _buf[12] = {}, *buf = _buf;
+ char _buf[25] = {}, *buf = _buf;
struct sta_info *sta = file->private_data;
bool start, tx;
unsigned long tid;
- int ret;
+ char *pos;
+ int ret, timeout = 5000;
if (count > sizeof(_buf))
return -EINVAL;
@@ -164,37 +189,48 @@ static ssize_t sta_agg_status_write(struct file *file, const char __user *userbu
return -EFAULT;
buf[sizeof(_buf) - 1] = '\0';
+ pos = buf;
+ buf = strsep(&pos, " ");
+ if (!buf)
+ return -EINVAL;
- if (strncmp(buf, "tx ", 3) == 0) {
- buf += 3;
+ if (!strcmp(buf, "tx"))
tx = true;
- } else if (strncmp(buf, "rx ", 3) == 0) {
- buf += 3;
+ else if (!strcmp(buf, "rx"))
tx = false;
- } else
+ else
return -EINVAL;
- if (strncmp(buf, "start ", 6) == 0) {
- buf += 6;
+ buf = strsep(&pos, " ");
+ if (!buf)
+ return -EINVAL;
+ if (!strcmp(buf, "start")) {
start = true;
if (!tx)
return -EINVAL;
- } else if (strncmp(buf, "stop ", 5) == 0) {
- buf += 5;
+ } else if (!strcmp(buf, "stop")) {
start = false;
- } else
+ } else {
return -EINVAL;
+ }
- ret = kstrtoul(buf, 0, &tid);
- if (ret)
- return ret;
+ buf = strsep(&pos, " ");
+ if (!buf)
+ return -EINVAL;
+ if (sscanf(buf, "timeout=%d", &timeout) == 1) {
+ buf = strsep(&pos, " ");
+ if (!buf || !tx || !start)
+ return -EINVAL;
+ }
- if (tid >= IEEE80211_NUM_TIDS)
+ ret = kstrtoul(buf, 0, &tid);
+ if (ret || tid >= IEEE80211_NUM_TIDS)
return -EINVAL;
if (tx) {
if (start)
- ret = ieee80211_start_tx_ba_session(&sta->sta, tid, 5000);
+ ret = ieee80211_start_tx_ba_session(&sta->sta, tid,
+ timeout);
else
ret = ieee80211_stop_tx_ba_session(&sta->sta, tid);
} else {
@@ -322,14 +358,14 @@ STA_OPS(vht_capa);
#define DEBUGFS_ADD(name) \
debugfs_create_file(#name, 0400, \
- sta->debugfs.dir, sta, &sta_ ##name## _ops);
+ sta->debugfs_dir, sta, &sta_ ##name## _ops);
#define DEBUGFS_ADD_COUNTER(name, field) \
if (sizeof(sta->field) == sizeof(u32)) \
- debugfs_create_u32(#name, 0400, sta->debugfs.dir, \
+ debugfs_create_u32(#name, 0400, sta->debugfs_dir, \
(u32 *) &sta->field); \
else \
- debugfs_create_u64(#name, 0400, sta->debugfs.dir, \
+ debugfs_create_u64(#name, 0400, sta->debugfs_dir, \
(u64 *) &sta->field);
void ieee80211_sta_debugfs_add(struct sta_info *sta)
@@ -339,8 +375,6 @@ void ieee80211_sta_debugfs_add(struct sta_info *sta)
struct dentry *stations_dir = sta->sdata->debugfs.subdir_stations;
u8 mac[3*ETH_ALEN];
- sta->debugfs.add_has_run = true;
-
if (!stations_dir)
return;
@@ -355,8 +389,8 @@ void ieee80211_sta_debugfs_add(struct sta_info *sta)
* destroyed quickly enough the old station's debugfs
* dir might still be around.
*/
- sta->debugfs.dir = debugfs_create_dir(mac, stations_dir);
- if (!sta->debugfs.dir)
+ sta->debugfs_dir = debugfs_create_dir(mac, stations_dir);
+ if (!sta->debugfs_dir)
return;
DEBUGFS_ADD(flags);
@@ -372,14 +406,14 @@ void ieee80211_sta_debugfs_add(struct sta_info *sta)
if (sizeof(sta->driver_buffered_tids) == sizeof(u32))
debugfs_create_x32("driver_buffered_tids", 0400,
- sta->debugfs.dir,
+ sta->debugfs_dir,
(u32 *)&sta->driver_buffered_tids);
else
debugfs_create_x64("driver_buffered_tids", 0400,
- sta->debugfs.dir,
+ sta->debugfs_dir,
(u64 *)&sta->driver_buffered_tids);
- drv_sta_add_debugfs(local, sdata, &sta->sta, sta->debugfs.dir);
+ drv_sta_add_debugfs(local, sdata, &sta->sta, sta->debugfs_dir);
}
void ieee80211_sta_debugfs_remove(struct sta_info *sta)
@@ -387,7 +421,7 @@ void ieee80211_sta_debugfs_remove(struct sta_info *sta)
struct ieee80211_local *local = sta->local;
struct ieee80211_sub_if_data *sdata = sta->sdata;
- drv_sta_remove_debugfs(local, sdata, &sta->sta, sta->debugfs.dir);
- debugfs_remove_recursive(sta->debugfs.dir);
- sta->debugfs.dir = NULL;
+ drv_sta_remove_debugfs(local, sdata, &sta->sta, sta->debugfs_dir);
+ debugfs_remove_recursive(sta->debugfs_dir);
+ sta->debugfs_dir = NULL;
}
diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h
index 18b0d65ba..184473c25 100644
--- a/net/mac80211/driver-ops.h
+++ b/net/mac80211/driver-ops.h
@@ -1,3 +1,8 @@
+/*
+* Portions of this file
+* Copyright(c) 2016 Intel Deutschland GmbH
+*/
+
#ifndef __MAC80211_DRIVER_OPS
#define __MAC80211_DRIVER_OPS
@@ -29,6 +34,16 @@ static inline void drv_tx(struct ieee80211_local *local,
local->ops->tx(&local->hw, control, skb);
}
+static inline void drv_sync_rx_queues(struct ieee80211_local *local,
+ struct sta_info *sta)
+{
+ if (local->ops->sync_rx_queues) {
+ trace_drv_sync_rx_queues(local, sta->sdata, &sta->sta);
+ local->ops->sync_rx_queues(&local->hw);
+ trace_drv_return_void(local);
+ }
+}
+
static inline void drv_get_et_strings(struct ieee80211_sub_if_data *sdata,
u32 sset, u8 *data)
{
diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c
index fc3238376..a31d30713 100644
--- a/net/mac80211/ibss.c
+++ b/net/mac80211/ibss.c
@@ -126,7 +126,7 @@ ieee80211_ibss_build_presp(struct ieee80211_sub_if_data *sdata,
}
}
- if (sband->band == IEEE80211_BAND_2GHZ) {
+ if (sband->band == NL80211_BAND_2GHZ) {
*pos++ = WLAN_EID_DS_PARAMS;
*pos++ = 1;
*pos++ = ieee80211_frequency_to_channel(
@@ -348,11 +348,11 @@ static void __ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata,
*
* HT follows these specifications (IEEE 802.11-2012 20.3.18)
*/
- sdata->vif.bss_conf.use_short_slot = chan->band == IEEE80211_BAND_5GHZ;
+ sdata->vif.bss_conf.use_short_slot = chan->band == NL80211_BAND_5GHZ;
bss_change |= BSS_CHANGED_ERP_SLOT;
/* cf. IEEE 802.11 9.2.12 */
- if (chan->band == IEEE80211_BAND_2GHZ && have_higher_than_11mbit)
+ if (chan->band == NL80211_BAND_2GHZ && have_higher_than_11mbit)
sdata->flags |= IEEE80211_SDATA_OPERATING_GMODE;
else
sdata->flags &= ~IEEE80211_SDATA_OPERATING_GMODE;
@@ -649,8 +649,6 @@ ieee80211_ibss_add_sta(struct ieee80211_sub_if_data *sdata, const u8 *bssid,
return NULL;
}
- sta->rx_stats.last_rx = jiffies;
-
/* make sure mandatory rates are always added */
sband = local->hw.wiphy->bands[band];
sta->sta.supp_rates[band] = supp_rates |
@@ -670,10 +668,11 @@ static int ieee80211_sta_active_ibss(struct ieee80211_sub_if_data *sdata)
rcu_read_lock();
list_for_each_entry_rcu(sta, &local->sta_list, list) {
+ unsigned long last_active = ieee80211_sta_last_active(sta);
+
if (sta->sdata == sdata &&
- time_after(sta->rx_stats.last_rx +
- IEEE80211_IBSS_MERGE_INTERVAL,
- jiffies)) {
+ time_is_after_jiffies(last_active +
+ IEEE80211_IBSS_MERGE_INTERVAL)) {
active++;
break;
}
@@ -990,7 +989,7 @@ static void ieee80211_update_sta_info(struct ieee80211_sub_if_data *sdata,
struct ieee80211_channel *channel)
{
struct sta_info *sta;
- enum ieee80211_band band = rx_status->band;
+ enum nl80211_band band = rx_status->band;
enum nl80211_bss_scan_width scan_width;
struct ieee80211_local *local = sdata->local;
struct ieee80211_supported_band *sband = local->hw.wiphy->bands[band];
@@ -1110,7 +1109,7 @@ static void ieee80211_rx_bss_info(struct ieee80211_sub_if_data *sdata,
struct ieee80211_channel *channel;
u64 beacon_timestamp, rx_timestamp;
u32 supp_rates = 0;
- enum ieee80211_band band = rx_status->band;
+ enum nl80211_band band = rx_status->band;
channel = ieee80211_get_channel(local->hw.wiphy, rx_status->freq);
if (!channel)
@@ -1236,8 +1235,6 @@ void ieee80211_ibss_rx_no_sta(struct ieee80211_sub_if_data *sdata,
if (!sta)
return;
- sta->rx_stats.last_rx = jiffies;
-
/* make sure mandatory rates are always added */
sband = local->hw.wiphy->bands[band];
sta->sta.supp_rates[band] = supp_rates |
@@ -1259,11 +1256,13 @@ static void ieee80211_ibss_sta_expire(struct ieee80211_sub_if_data *sdata)
mutex_lock(&local->sta_mtx);
list_for_each_entry_safe(sta, tmp, &local->sta_list, list) {
+ unsigned long last_active = ieee80211_sta_last_active(sta);
+
if (sdata != sta->sdata)
continue;
- if (time_after(jiffies, sta->rx_stats.last_rx + exp_time) ||
- (time_after(jiffies, sta->rx_stats.last_rx + exp_rsn) &&
+ if (time_is_before_jiffies(last_active + exp_time) ||
+ (time_is_before_jiffies(last_active + exp_rsn) &&
sta->sta_state != IEEE80211_STA_AUTHORIZED)) {
sta_dbg(sta->sdata, "expiring inactive %sSTA %pM\n",
sta->sta_state != IEEE80211_STA_AUTHORIZED ?
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 422003540..9438c9406 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -696,6 +696,11 @@ struct ieee80211_if_mesh {
/* offset from skb->data while building IE */
int meshconf_offset;
+
+ struct mesh_table *mesh_paths;
+ struct mesh_table *mpp_paths; /* Store paths for MPP&MAP */
+ int mesh_paths_generation;
+ int mpp_paths_generation;
};
#ifdef CONFIG_MAC80211_MESH
@@ -797,6 +802,7 @@ struct mac80211_qos_map {
enum txq_info_flags {
IEEE80211_TXQ_STOP,
IEEE80211_TXQ_AMPDU,
+ IEEE80211_TXQ_NO_AMSDU,
};
struct txq_info {
@@ -890,13 +896,13 @@ struct ieee80211_sub_if_data {
struct ieee80211_if_ap *bss;
/* bitmap of allowed (non-MCS) rate indexes for rate control */
- u32 rc_rateidx_mask[IEEE80211_NUM_BANDS];
+ u32 rc_rateidx_mask[NUM_NL80211_BANDS];
- bool rc_has_mcs_mask[IEEE80211_NUM_BANDS];
- u8 rc_rateidx_mcs_mask[IEEE80211_NUM_BANDS][IEEE80211_HT_MCS_MASK_LEN];
+ bool rc_has_mcs_mask[NUM_NL80211_BANDS];
+ u8 rc_rateidx_mcs_mask[NUM_NL80211_BANDS][IEEE80211_HT_MCS_MASK_LEN];
- bool rc_has_vht_mcs_mask[IEEE80211_NUM_BANDS];
- u16 rc_rateidx_vht_mcs_mask[IEEE80211_NUM_BANDS][NL80211_VHT_NSS_MAX];
+ bool rc_has_vht_mcs_mask[NUM_NL80211_BANDS];
+ u16 rc_rateidx_vht_mcs_mask[NUM_NL80211_BANDS][NL80211_VHT_NSS_MAX];
union {
struct ieee80211_if_ap ap;
@@ -951,10 +957,10 @@ sdata_assert_lock(struct ieee80211_sub_if_data *sdata)
lockdep_assert_held(&sdata->wdev.mtx);
}
-static inline enum ieee80211_band
+static inline enum nl80211_band
ieee80211_get_sdata_band(struct ieee80211_sub_if_data *sdata)
{
- enum ieee80211_band band = IEEE80211_BAND_2GHZ;
+ enum nl80211_band band = NL80211_BAND_2GHZ;
struct ieee80211_chanctx_conf *chanctx_conf;
rcu_read_lock();
@@ -1225,7 +1231,7 @@ struct ieee80211_local {
struct cfg80211_scan_request __rcu *scan_req;
struct ieee80211_scan_request *hw_scan_req;
struct cfg80211_chan_def scan_chandef;
- enum ieee80211_band hw_scan_band;
+ enum nl80211_band hw_scan_band;
int scan_channel_idx;
int scan_ies_len;
int hw_scan_ies_bufsize;
@@ -1489,6 +1495,11 @@ u64 ieee80211_mgmt_tx_cookie(struct ieee80211_local *local);
int ieee80211_attach_ack_skb(struct ieee80211_local *local, struct sk_buff *skb,
u64 *cookie, gfp_t gfp);
+void ieee80211_check_fast_rx(struct sta_info *sta);
+void __ieee80211_check_fast_rx_iface(struct ieee80211_sub_if_data *sdata);
+void ieee80211_check_fast_rx_iface(struct ieee80211_sub_if_data *sdata);
+void ieee80211_clear_fast_rx(struct sta_info *sta);
+
/* STA code */
void ieee80211_sta_setup_sdata(struct ieee80211_sub_if_data *sdata);
int ieee80211_mgd_auth(struct ieee80211_sub_if_data *sdata,
@@ -1727,10 +1738,10 @@ void ieee80211_process_mu_groups(struct ieee80211_sub_if_data *sdata,
struct ieee80211_mgmt *mgmt);
u32 __ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata,
struct sta_info *sta, u8 opmode,
- enum ieee80211_band band);
+ enum nl80211_band band);
void ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata,
struct sta_info *sta, u8 opmode,
- enum ieee80211_band band);
+ enum nl80211_band band);
void ieee80211_apply_vhtcap_overrides(struct ieee80211_sub_if_data *sdata,
struct ieee80211_sta_vht_cap *vht_cap);
void ieee80211_get_vht_mask_from_cap(__le16 vht_cap,
@@ -1758,7 +1769,7 @@ void ieee80211_process_measurement_req(struct ieee80211_sub_if_data *sdata,
*/
int ieee80211_parse_ch_switch_ie(struct ieee80211_sub_if_data *sdata,
struct ieee802_11_elems *elems,
- enum ieee80211_band current_band,
+ enum nl80211_band current_band,
u32 sta_flags, u8 *bssid,
struct ieee80211_csa_ie *csa_ie);
@@ -1783,7 +1794,7 @@ static inline int __ieee80211_resume(struct ieee80211_hw *hw)
/* utility functions/constants */
extern const void *const mac80211_wiphy_privid; /* for wiphy privid */
-int ieee80211_frame_duration(enum ieee80211_band band, size_t len,
+int ieee80211_frame_duration(enum nl80211_band band, size_t len,
int rate, int erp, int short_preamble,
int shift);
void ieee80211_set_wmm_default(struct ieee80211_sub_if_data *sdata,
@@ -1793,12 +1804,12 @@ void ieee80211_xmit(struct ieee80211_sub_if_data *sdata,
void __ieee80211_tx_skb_tid_band(struct ieee80211_sub_if_data *sdata,
struct sk_buff *skb, int tid,
- enum ieee80211_band band);
+ enum nl80211_band band);
static inline void
ieee80211_tx_skb_tid_band(struct ieee80211_sub_if_data *sdata,
struct sk_buff *skb, int tid,
- enum ieee80211_band band)
+ enum nl80211_band band)
{
rcu_read_lock();
__ieee80211_tx_skb_tid_band(sdata, skb, tid, band);
@@ -1953,7 +1964,7 @@ void ieee80211_send_probe_req(struct ieee80211_sub_if_data *sdata,
u32 ieee80211_sta_get_rates(struct ieee80211_sub_if_data *sdata,
struct ieee802_11_elems *elems,
- enum ieee80211_band band, u32 *basic_rates);
+ enum nl80211_band band, u32 *basic_rates);
int __ieee80211_request_smps_mgd(struct ieee80211_sub_if_data *sdata,
enum ieee80211_smps_mode smps_mode);
int __ieee80211_request_smps_ap(struct ieee80211_sub_if_data *sdata,
@@ -1976,10 +1987,10 @@ int ieee80211_parse_bitrates(struct cfg80211_chan_def *chandef,
const u8 *srates, int srates_len, u32 *rates);
int ieee80211_add_srates_ie(struct ieee80211_sub_if_data *sdata,
struct sk_buff *skb, bool need_basic,
- enum ieee80211_band band);
+ enum nl80211_band band);
int ieee80211_add_ext_srates_ie(struct ieee80211_sub_if_data *sdata,
struct sk_buff *skb, bool need_basic,
- enum ieee80211_band band);
+ enum nl80211_band band);
u8 *ieee80211_add_wmm_info_ie(u8 *buf, u8 qosinfo);
/* channel management */
diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c
index e1cb22c16..c59af3eb9 100644
--- a/net/mac80211/iface.c
+++ b/net/mac80211/iface.c
@@ -1093,7 +1093,7 @@ static void ieee80211_teardown_sdata(struct ieee80211_sub_if_data *sdata)
sdata->fragment_next = 0;
if (ieee80211_vif_is_mesh(&sdata->vif))
- mesh_rmc_free(sdata);
+ ieee80211_mesh_teardown_sdata(sdata);
}
static void ieee80211_uninit(struct net_device *dev)
@@ -1800,7 +1800,7 @@ int ieee80211_if_add(struct ieee80211_local *local, const char *name,
INIT_DELAYED_WORK(&sdata->dec_tailroom_needed_wk,
ieee80211_delayed_tailroom_dec);
- for (i = 0; i < IEEE80211_NUM_BANDS; i++) {
+ for (i = 0; i < NUM_NL80211_BANDS; i++) {
struct ieee80211_supported_band *sband;
sband = local->hw.wiphy->bands[i];
sdata->rc_rateidx_mask[i] =
diff --git a/net/mac80211/key.c b/net/mac80211/key.c
index 3df7b0392..edd6f2945 100644
--- a/net/mac80211/key.c
+++ b/net/mac80211/key.c
@@ -338,6 +338,7 @@ static void ieee80211_key_replace(struct ieee80211_sub_if_data *sdata,
} else {
rcu_assign_pointer(sta->gtk[idx], new);
}
+ ieee80211_check_fast_rx(sta);
} else {
defunikey = old &&
old == key_mtx_dereference(sdata->local,
diff --git a/net/mac80211/main.c b/net/mac80211/main.c
index 8190bf27e..7ee91d615 100644
--- a/net/mac80211/main.c
+++ b/net/mac80211/main.c
@@ -558,6 +558,8 @@ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len,
if (!ops->set_key)
wiphy->flags |= WIPHY_FLAG_IBSS_RSN;
+ wiphy_ext_feature_set(wiphy, NL80211_EXT_FEATURE_RRM);
+
wiphy->bss_priv_size = sizeof(struct ieee80211_bss);
local = wiphy_priv(wiphy);
@@ -799,7 +801,7 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
{
struct ieee80211_local *local = hw_to_local(hw);
int result, i;
- enum ieee80211_band band;
+ enum nl80211_band band;
int channels, max_bitrates;
bool supp_ht, supp_vht;
netdev_features_t feature_whitelist;
@@ -854,7 +856,7 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
/* Only HW csum features are currently compatible with mac80211 */
feature_whitelist = NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM |
NETIF_F_HW_CSUM | NETIF_F_SG | NETIF_F_HIGHDMA |
- NETIF_F_GSO_SOFTWARE;
+ NETIF_F_GSO_SOFTWARE | NETIF_F_RXCSUM;
if (WARN_ON(hw->netdev_features & ~feature_whitelist))
return -EINVAL;
@@ -872,7 +874,7 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
max_bitrates = 0;
supp_ht = false;
supp_vht = false;
- for (band = 0; band < IEEE80211_NUM_BANDS; band++) {
+ for (band = 0; band < NUM_NL80211_BANDS; band++) {
struct ieee80211_supported_band *sband;
sband = local->hw.wiphy->bands[band];
@@ -934,7 +936,7 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
if (!local->int_scan_req)
return -ENOMEM;
- for (band = 0; band < IEEE80211_NUM_BANDS; band++) {
+ for (band = 0; band < NUM_NL80211_BANDS; band++) {
if (!local->hw.wiphy->bands[band])
continue;
local->int_scan_req->rates[band] = (u32) -1;
diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c
index 34a5712d4..6a1603bcd 100644
--- a/net/mac80211/mesh.c
+++ b/net/mac80211/mesh.c
@@ -25,7 +25,6 @@ bool mesh_action_is_path_sel(struct ieee80211_mgmt *mgmt)
void ieee80211s_init(void)
{
- mesh_pathtbl_init();
mesh_allocated = 1;
rm_cache = kmem_cache_create("mesh_rmc", sizeof(struct rmc_entry),
0, 0, NULL);
@@ -35,7 +34,6 @@ void ieee80211s_stop(void)
{
if (!mesh_allocated)
return;
- mesh_pathtbl_unregister();
kmem_cache_destroy(rm_cache);
}
@@ -183,22 +181,23 @@ int mesh_rmc_init(struct ieee80211_sub_if_data *sdata)
return -ENOMEM;
sdata->u.mesh.rmc->idx_mask = RMC_BUCKETS - 1;
for (i = 0; i < RMC_BUCKETS; i++)
- INIT_LIST_HEAD(&sdata->u.mesh.rmc->bucket[i]);
+ INIT_HLIST_HEAD(&sdata->u.mesh.rmc->bucket[i]);
return 0;
}
void mesh_rmc_free(struct ieee80211_sub_if_data *sdata)
{
struct mesh_rmc *rmc = sdata->u.mesh.rmc;
- struct rmc_entry *p, *n;
+ struct rmc_entry *p;
+ struct hlist_node *n;
int i;
if (!sdata->u.mesh.rmc)
return;
for (i = 0; i < RMC_BUCKETS; i++) {
- list_for_each_entry_safe(p, n, &rmc->bucket[i], list) {
- list_del(&p->list);
+ hlist_for_each_entry_safe(p, n, &rmc->bucket[i], list) {
+ hlist_del(&p->list);
kmem_cache_free(rm_cache, p);
}
}
@@ -227,16 +226,20 @@ int mesh_rmc_check(struct ieee80211_sub_if_data *sdata,
u32 seqnum = 0;
int entries = 0;
u8 idx;
- struct rmc_entry *p, *n;
+ struct rmc_entry *p;
+ struct hlist_node *n;
+
+ if (!rmc)
+ return -1;
/* Don't care about endianness since only match matters */
memcpy(&seqnum, &mesh_hdr->seqnum, sizeof(mesh_hdr->seqnum));
idx = le32_to_cpu(mesh_hdr->seqnum) & rmc->idx_mask;
- list_for_each_entry_safe(p, n, &rmc->bucket[idx], list) {
+ hlist_for_each_entry_safe(p, n, &rmc->bucket[idx], list) {
++entries;
if (time_after(jiffies, p->exp_time) ||
entries == RMC_QUEUE_MAX_LEN) {
- list_del(&p->list);
+ hlist_del(&p->list);
kmem_cache_free(rm_cache, p);
--entries;
} else if ((seqnum == p->seqnum) && ether_addr_equal(sa, p->sa))
@@ -250,7 +253,7 @@ int mesh_rmc_check(struct ieee80211_sub_if_data *sdata,
p->seqnum = seqnum;
p->exp_time = jiffies + RMC_TIMEOUT;
memcpy(p->sa, sa, ETH_ALEN);
- list_add(&p->list, &rmc->bucket[idx]);
+ hlist_add_head(&p->list, &rmc->bucket[idx]);
return 0;
}
@@ -419,7 +422,7 @@ int mesh_add_ht_cap_ie(struct ieee80211_sub_if_data *sdata,
struct sk_buff *skb)
{
struct ieee80211_local *local = sdata->local;
- enum ieee80211_band band = ieee80211_get_sdata_band(sdata);
+ enum nl80211_band band = ieee80211_get_sdata_band(sdata);
struct ieee80211_supported_band *sband;
u8 *pos;
@@ -482,7 +485,7 @@ int mesh_add_vht_cap_ie(struct ieee80211_sub_if_data *sdata,
struct sk_buff *skb)
{
struct ieee80211_local *local = sdata->local;
- enum ieee80211_band band = ieee80211_get_sdata_band(sdata);
+ enum nl80211_band band = ieee80211_get_sdata_band(sdata);
struct ieee80211_supported_band *sband;
u8 *pos;
@@ -684,7 +687,7 @@ ieee80211_mesh_build_beacon(struct ieee80211_if_mesh *ifmsh)
struct ieee80211_mgmt *mgmt;
struct ieee80211_chanctx_conf *chanctx_conf;
struct mesh_csa_settings *csa;
- enum ieee80211_band band;
+ enum nl80211_band band;
u8 *pos;
struct ieee80211_sub_if_data *sdata;
int hdr_len = offsetof(struct ieee80211_mgmt, u.beacon) +
@@ -934,7 +937,7 @@ ieee80211_mesh_process_chnswitch(struct ieee80211_sub_if_data *sdata,
struct cfg80211_csa_settings params;
struct ieee80211_csa_ie csa_ie;
struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh;
- enum ieee80211_band band = ieee80211_get_sdata_band(sdata);
+ enum nl80211_band band = ieee80211_get_sdata_band(sdata);
int err;
u32 sta_flags;
@@ -1088,7 +1091,7 @@ static void ieee80211_mesh_rx_bcn_presp(struct ieee80211_sub_if_data *sdata,
struct ieee80211_channel *channel;
size_t baselen;
int freq;
- enum ieee80211_band band = rx_status->band;
+ enum nl80211_band band = rx_status->band;
/* ignore ProbeResp to foreign address */
if (stype == IEEE80211_STYPE_PROBE_RESP &&
@@ -1355,12 +1358,6 @@ void ieee80211_mesh_work(struct ieee80211_sub_if_data *sdata)
ifmsh->last_preq + msecs_to_jiffies(ifmsh->mshcfg.dot11MeshHWMPpreqMinInterval)))
mesh_path_start_discovery(sdata);
- if (test_and_clear_bit(MESH_WORK_GROW_MPATH_TABLE, &ifmsh->wrkq_flags))
- mesh_mpath_table_grow();
-
- if (test_and_clear_bit(MESH_WORK_GROW_MPP_TABLE, &ifmsh->wrkq_flags))
- mesh_mpp_table_grow();
-
if (test_and_clear_bit(MESH_WORK_HOUSEKEEPING, &ifmsh->wrkq_flags))
ieee80211_mesh_housekeeping(sdata);
@@ -1395,6 +1392,9 @@ void ieee80211_mesh_init_sdata(struct ieee80211_sub_if_data *sdata)
/* Allocate all mesh structures when creating the first mesh interface. */
if (!mesh_allocated)
ieee80211s_init();
+
+ mesh_pathtbl_init(sdata);
+
setup_timer(&ifmsh->mesh_path_timer,
ieee80211_mesh_path_timer,
(unsigned long) sdata);
@@ -1409,3 +1409,9 @@ void ieee80211_mesh_init_sdata(struct ieee80211_sub_if_data *sdata)
sdata->vif.bss_conf.bssid = zero_addr;
}
+
+void ieee80211_mesh_teardown_sdata(struct ieee80211_sub_if_data *sdata)
+{
+ mesh_rmc_free(sdata);
+ mesh_pathtbl_unregister(sdata);
+}
diff --git a/net/mac80211/mesh.h b/net/mac80211/mesh.h
index 87c017a3b..26b9ccbe1 100644
--- a/net/mac80211/mesh.h
+++ b/net/mac80211/mesh.h
@@ -21,8 +21,6 @@
/**
* enum mesh_path_flags - mac80211 mesh path flags
*
- *
- *
* @MESH_PATH_ACTIVE: the mesh path can be used for forwarding
* @MESH_PATH_RESOLVING: the discovery process is running for this mesh path
* @MESH_PATH_SN_VALID: the mesh path contains a valid destination sequence
@@ -32,6 +30,8 @@
* @MESH_PATH_RESOLVED: the mesh path can has been resolved
* @MESH_PATH_REQ_QUEUED: there is an unsent path request for this destination
* already queued up, waiting for the discovery process to start.
+ * @MESH_PATH_DELETED: the mesh path has been deleted and should no longer
+ * be used
*
* MESH_PATH_RESOLVED is used by the mesh path timer to
* decide when to stop or cancel the mesh path discovery.
@@ -43,6 +43,7 @@ enum mesh_path_flags {
MESH_PATH_FIXED = BIT(3),
MESH_PATH_RESOLVED = BIT(4),
MESH_PATH_REQ_QUEUED = BIT(5),
+ MESH_PATH_DELETED = BIT(6),
};
/**
@@ -51,10 +52,6 @@ enum mesh_path_flags {
*
*
* @MESH_WORK_HOUSEKEEPING: run the periodic mesh housekeeping tasks
- * @MESH_WORK_GROW_MPATH_TABLE: the mesh path table is full and needs
- * to grow.
- * @MESH_WORK_GROW_MPP_TABLE: the mesh portals table is full and needs to
- * grow
* @MESH_WORK_ROOT: the mesh root station needs to send a frame
* @MESH_WORK_DRIFT_ADJUST: time to compensate for clock drift relative to other
* mesh nodes
@@ -62,8 +59,6 @@ enum mesh_path_flags {
*/
enum mesh_deferred_task_flags {
MESH_WORK_HOUSEKEEPING,
- MESH_WORK_GROW_MPATH_TABLE,
- MESH_WORK_GROW_MPP_TABLE,
MESH_WORK_ROOT,
MESH_WORK_DRIFT_ADJUST,
MESH_WORK_MBSS_CHANGED,
@@ -73,12 +68,16 @@ enum mesh_deferred_task_flags {
* struct mesh_path - mac80211 mesh path structure
*
* @dst: mesh path destination mac address
+ * @mpp: mesh proxy mac address
+ * @rhash: rhashtable list pointer
+ * @gate_list: list pointer for known gates list
* @sdata: mesh subif
* @next_hop: mesh neighbor to which frames for this destination will be
* forwarded
* @timer: mesh path discovery timer
* @frame_queue: pending queue for frames sent to this destination while the
* path is unresolved
+ * @rcu: rcu head for freeing mesh path
* @sn: target sequence number
* @metric: current metric to this destination
* @hop_count: hops to destination
@@ -97,14 +96,16 @@ enum mesh_deferred_task_flags {
* @is_gate: the destination station of this path is a mesh gate
*
*
- * The combination of dst and sdata is unique in the mesh path table. Since the
- * next_hop STA is only protected by RCU as well, deleting the STA must also
- * remove/substitute the mesh_path structure and wait until that is no longer
- * reachable before destroying the STA completely.
+ * The dst address is unique in the mesh path table. Since the mesh_path is
+ * protected by RCU, deleting the next_hop STA must remove / substitute the
+ * mesh_path structure and wait until that is no longer reachable before
+ * destroying the STA completely.
*/
struct mesh_path {
u8 dst[ETH_ALEN];
u8 mpp[ETH_ALEN]; /* used for MPP or MAP */
+ struct rhash_head rhash;
+ struct hlist_node gate_list;
struct ieee80211_sub_if_data *sdata;
struct sta_info __rcu *next_hop;
struct timer_list timer;
@@ -128,34 +129,17 @@ struct mesh_path {
/**
* struct mesh_table
*
- * @hash_buckets: array of hash buckets of the table
- * @hashwlock: array of locks to protect write operations, one per bucket
- * @hash_mask: 2^size_order - 1, used to compute hash idx
- * @hash_rnd: random value used for hash computations
- * @entries: number of entries in the table
- * @free_node: function to free nodes of the table
- * @copy_node: function to copy nodes of the table
- * @size_order: determines size of the table, there will be 2^size_order hash
- * buckets
* @known_gates: list of known mesh gates and their mpaths by the station. The
* gate's mpath may or may not be resolved and active.
- *
- * rcu_head: RCU head to free the table
+ * @gates_lock: protects updates to known_gates
+ * @rhead: the rhashtable containing struct mesh_paths, keyed by dest addr
+ * @entries: number of entries in the table
*/
struct mesh_table {
- /* Number of buckets will be 2^N */
- struct hlist_head *hash_buckets;
- spinlock_t *hashwlock; /* One per bucket, for add/del */
- unsigned int hash_mask; /* (2^size_order) - 1 */
- __u32 hash_rnd; /* Used for hash generation */
- atomic_t entries; /* Up to MAX_MESH_NEIGHBOURS */
- void (*free_node) (struct hlist_node *p, bool free_leafs);
- int (*copy_node) (struct hlist_node *p, struct mesh_table *newtbl);
- int size_order;
- struct hlist_head *known_gates;
+ struct hlist_head known_gates;
spinlock_t gates_lock;
-
- struct rcu_head rcu_head;
+ struct rhashtable rhead;
+ atomic_t entries; /* Up to MAX_MESH_NEIGHBOURS */
};
/* Recent multicast cache */
@@ -170,20 +154,21 @@ struct mesh_table {
* @seqnum: mesh sequence number of the frame
* @exp_time: expiration time of the entry, in jiffies
* @sa: source address of the frame
+ * @list: hashtable list pointer
*
* The Recent Multicast Cache keeps track of the latest multicast frames that
* have been received by a mesh interface and discards received multicast frames
* that are found in the cache.
*/
struct rmc_entry {
- struct list_head list;
- u32 seqnum;
+ struct hlist_node list;
unsigned long exp_time;
+ u32 seqnum;
u8 sa[ETH_ALEN];
};
struct mesh_rmc {
- struct list_head bucket[RMC_BUCKETS];
+ struct hlist_head bucket[RMC_BUCKETS];
u32 idx_mask;
};
@@ -234,6 +219,7 @@ void ieee80211s_init(void);
void ieee80211s_update_metric(struct ieee80211_local *local,
struct sta_info *sta, struct sk_buff *skb);
void ieee80211_mesh_init_sdata(struct ieee80211_sub_if_data *sdata);
+void ieee80211_mesh_teardown_sdata(struct ieee80211_sub_if_data *sdata);
int ieee80211_start_mesh(struct ieee80211_sub_if_data *sdata);
void ieee80211_stop_mesh(struct ieee80211_sub_if_data *sdata);
void ieee80211_mesh_root_setup(struct ieee80211_if_mesh *ifmsh);
@@ -299,9 +285,6 @@ void mesh_rx_plink_frame(struct ieee80211_sub_if_data *sdata,
void mesh_sta_cleanup(struct sta_info *sta);
/* Private interfaces */
-/* Mesh tables */
-void mesh_mpath_table_grow(void);
-void mesh_mpp_table_grow(void);
/* Mesh paths */
int mesh_path_error_tx(struct ieee80211_sub_if_data *sdata,
u8 ttl, const u8 *target, u32 target_sn,
@@ -309,8 +292,8 @@ int mesh_path_error_tx(struct ieee80211_sub_if_data *sdata,
void mesh_path_assign_nexthop(struct mesh_path *mpath, struct sta_info *sta);
void mesh_path_flush_pending(struct mesh_path *mpath);
void mesh_path_tx_pending(struct mesh_path *mpath);
-int mesh_pathtbl_init(void);
-void mesh_pathtbl_unregister(void);
+int mesh_pathtbl_init(struct ieee80211_sub_if_data *sdata);
+void mesh_pathtbl_unregister(struct ieee80211_sub_if_data *sdata);
int mesh_path_del(struct ieee80211_sub_if_data *sdata, const u8 *addr);
void mesh_path_timer(unsigned long data);
void mesh_path_flush_by_nexthop(struct sta_info *sta);
@@ -319,8 +302,6 @@ void mesh_path_discard_frame(struct ieee80211_sub_if_data *sdata,
void mesh_path_tx_root_frame(struct ieee80211_sub_if_data *sdata);
bool mesh_action_is_path_sel(struct ieee80211_mgmt *mgmt);
-extern int mesh_paths_generation;
-extern int mpp_paths_generation;
#ifdef CONFIG_MAC80211_MESH
static inline
diff --git a/net/mac80211/mesh_hwmp.c b/net/mac80211/mesh_hwmp.c
index 002244bca..8f9c3bde8 100644
--- a/net/mac80211/mesh_hwmp.c
+++ b/net/mac80211/mesh_hwmp.c
@@ -1012,6 +1012,10 @@ void mesh_path_start_discovery(struct ieee80211_sub_if_data *sdata)
goto enddiscovery;
spin_lock_bh(&mpath->state_lock);
+ if (mpath->flags & MESH_PATH_DELETED) {
+ spin_unlock_bh(&mpath->state_lock);
+ goto enddiscovery;
+ }
mpath->flags &= ~MESH_PATH_REQ_QUEUED;
if (preq_node->flags & PREQ_Q_F_START) {
if (mpath->flags & MESH_PATH_RESOLVING) {
diff --git a/net/mac80211/mesh_pathtbl.c b/net/mac80211/mesh_pathtbl.c
index 2ba7aa56b..6db2ddfa0 100644
--- a/net/mac80211/mesh_pathtbl.c
+++ b/net/mac80211/mesh_pathtbl.c
@@ -18,11 +18,22 @@
#include "ieee80211_i.h"
#include "mesh.h"
-/* There will be initially 2^INIT_PATHS_SIZE_ORDER buckets */
-#define INIT_PATHS_SIZE_ORDER 2
+static void mesh_path_free_rcu(struct mesh_table *tbl, struct mesh_path *mpath);
-/* Keep the mean chain length below this constant */
-#define MEAN_CHAIN_LEN 2
+static u32 mesh_table_hash(const void *addr, u32 len, u32 seed)
+{
+ /* Use last four bytes of hw addr as hash index */
+ return jhash_1word(*(u32 *)(addr+2), seed);
+}
+
+static const struct rhashtable_params mesh_rht_params = {
+ .nelem_hint = 2,
+ .automatic_shrinking = true,
+ .key_len = ETH_ALEN,
+ .key_offset = offsetof(struct mesh_path, dst),
+ .head_offset = offsetof(struct mesh_path, rhash),
+ .hashfn = mesh_table_hash,
+};
static inline bool mpath_expired(struct mesh_path *mpath)
{
@@ -31,173 +42,36 @@ static inline bool mpath_expired(struct mesh_path *mpath)
!(mpath->flags & MESH_PATH_FIXED);
}
-struct mpath_node {
- struct hlist_node list;
- struct rcu_head rcu;
- /* This indirection allows two different tables to point to the same
- * mesh_path structure, useful when resizing
- */
- struct mesh_path *mpath;
-};
-
-static struct mesh_table __rcu *mesh_paths;
-static struct mesh_table __rcu *mpp_paths; /* Store paths for MPP&MAP */
-
-int mesh_paths_generation;
-int mpp_paths_generation;
-
-/* This lock will have the grow table function as writer and add / delete nodes
- * as readers. RCU provides sufficient protection only when reading the table
- * (i.e. doing lookups). Adding or adding or removing nodes requires we take
- * the read lock or we risk operating on an old table. The write lock is only
- * needed when modifying the number of buckets a table.
- */
-static DEFINE_RWLOCK(pathtbl_resize_lock);
-
-
-static inline struct mesh_table *resize_dereference_paths(
- struct mesh_table __rcu *table)
+static void mesh_path_rht_free(void *ptr, void *tblptr)
{
- return rcu_dereference_protected(table,
- lockdep_is_held(&pathtbl_resize_lock));
-}
+ struct mesh_path *mpath = ptr;
+ struct mesh_table *tbl = tblptr;
-static inline struct mesh_table *resize_dereference_mesh_paths(void)
-{
- return resize_dereference_paths(mesh_paths);
+ mesh_path_free_rcu(tbl, mpath);
}
-static inline struct mesh_table *resize_dereference_mpp_paths(void)
+static struct mesh_table *mesh_table_alloc(void)
{
- return resize_dereference_paths(mpp_paths);
-}
-
-/*
- * CAREFUL -- "tbl" must not be an expression,
- * in particular not an rcu_dereference(), since
- * it's used twice. So it is illegal to do
- * for_each_mesh_entry(rcu_dereference(...), ...)
- */
-#define for_each_mesh_entry(tbl, node, i) \
- for (i = 0; i <= tbl->hash_mask; i++) \
- hlist_for_each_entry_rcu(node, &tbl->hash_buckets[i], list)
-
-
-static struct mesh_table *mesh_table_alloc(int size_order)
-{
- int i;
struct mesh_table *newtbl;
newtbl = kmalloc(sizeof(struct mesh_table), GFP_ATOMIC);
if (!newtbl)
return NULL;
- newtbl->hash_buckets = kzalloc(sizeof(struct hlist_head) *
- (1 << size_order), GFP_ATOMIC);
-
- if (!newtbl->hash_buckets) {
- kfree(newtbl);
- return NULL;
- }
-
- newtbl->hashwlock = kmalloc(sizeof(spinlock_t) *
- (1 << size_order), GFP_ATOMIC);
- if (!newtbl->hashwlock) {
- kfree(newtbl->hash_buckets);
- kfree(newtbl);
- return NULL;
- }
-
- newtbl->size_order = size_order;
- newtbl->hash_mask = (1 << size_order) - 1;
+ INIT_HLIST_HEAD(&newtbl->known_gates);
atomic_set(&newtbl->entries, 0);
- get_random_bytes(&newtbl->hash_rnd,
- sizeof(newtbl->hash_rnd));
- for (i = 0; i <= newtbl->hash_mask; i++)
- spin_lock_init(&newtbl->hashwlock[i]);
spin_lock_init(&newtbl->gates_lock);
return newtbl;
}
-static void __mesh_table_free(struct mesh_table *tbl)
+static void mesh_table_free(struct mesh_table *tbl)
{
- kfree(tbl->hash_buckets);
- kfree(tbl->hashwlock);
+ rhashtable_free_and_destroy(&tbl->rhead,
+ mesh_path_rht_free, tbl);
kfree(tbl);
}
-static void mesh_table_free(struct mesh_table *tbl, bool free_leafs)
-{
- struct hlist_head *mesh_hash;
- struct hlist_node *p, *q;
- struct mpath_node *gate;
- int i;
-
- mesh_hash = tbl->hash_buckets;
- for (i = 0; i <= tbl->hash_mask; i++) {
- spin_lock_bh(&tbl->hashwlock[i]);
- hlist_for_each_safe(p, q, &mesh_hash[i]) {
- tbl->free_node(p, free_leafs);
- atomic_dec(&tbl->entries);
- }
- spin_unlock_bh(&tbl->hashwlock[i]);
- }
- if (free_leafs) {
- spin_lock_bh(&tbl->gates_lock);
- hlist_for_each_entry_safe(gate, q,
- tbl->known_gates, list) {
- hlist_del(&gate->list);
- kfree(gate);
- }
- kfree(tbl->known_gates);
- spin_unlock_bh(&tbl->gates_lock);
- }
-
- __mesh_table_free(tbl);
-}
-
-static int mesh_table_grow(struct mesh_table *oldtbl,
- struct mesh_table *newtbl)
-{
- struct hlist_head *oldhash;
- struct hlist_node *p, *q;
- int i;
-
- if (atomic_read(&oldtbl->entries)
- < MEAN_CHAIN_LEN * (oldtbl->hash_mask + 1))
- return -EAGAIN;
-
- newtbl->free_node = oldtbl->free_node;
- newtbl->copy_node = oldtbl->copy_node;
- newtbl->known_gates = oldtbl->known_gates;
- atomic_set(&newtbl->entries, atomic_read(&oldtbl->entries));
-
- oldhash = oldtbl->hash_buckets;
- for (i = 0; i <= oldtbl->hash_mask; i++)
- hlist_for_each(p, &oldhash[i])
- if (oldtbl->copy_node(p, newtbl) < 0)
- goto errcopy;
-
- return 0;
-
-errcopy:
- for (i = 0; i <= newtbl->hash_mask; i++) {
- hlist_for_each_safe(p, q, &newtbl->hash_buckets[i])
- oldtbl->free_node(p, 0);
- }
- return -ENOMEM;
-}
-
-static u32 mesh_table_hash(const u8 *addr, struct ieee80211_sub_if_data *sdata,
- struct mesh_table *tbl)
-{
- /* Use last four bytes of hw addr and interface index as hash index */
- return jhash_2words(*(u32 *)(addr+2), sdata->dev->ifindex,
- tbl->hash_rnd) & tbl->hash_mask;
-}
-
-
/**
*
* mesh_path_assign_nexthop - update mesh path next hop
@@ -340,23 +214,15 @@ static struct mesh_path *mpath_lookup(struct mesh_table *tbl, const u8 *dst,
struct ieee80211_sub_if_data *sdata)
{
struct mesh_path *mpath;
- struct hlist_head *bucket;
- struct mpath_node *node;
-
- bucket = &tbl->hash_buckets[mesh_table_hash(dst, sdata, tbl)];
- hlist_for_each_entry_rcu(node, bucket, list) {
- mpath = node->mpath;
- if (mpath->sdata == sdata &&
- ether_addr_equal(dst, mpath->dst)) {
- if (mpath_expired(mpath)) {
- spin_lock_bh(&mpath->state_lock);
- mpath->flags &= ~MESH_PATH_ACTIVE;
- spin_unlock_bh(&mpath->state_lock);
- }
- return mpath;
- }
+
+ mpath = rhashtable_lookup_fast(&tbl->rhead, dst, mesh_rht_params);
+
+ if (mpath && mpath_expired(mpath)) {
+ spin_lock_bh(&mpath->state_lock);
+ mpath->flags &= ~MESH_PATH_ACTIVE;
+ spin_unlock_bh(&mpath->state_lock);
}
- return NULL;
+ return mpath;
}
/**
@@ -371,15 +237,52 @@ static struct mesh_path *mpath_lookup(struct mesh_table *tbl, const u8 *dst,
struct mesh_path *
mesh_path_lookup(struct ieee80211_sub_if_data *sdata, const u8 *dst)
{
- return mpath_lookup(rcu_dereference(mesh_paths), dst, sdata);
+ return mpath_lookup(sdata->u.mesh.mesh_paths, dst, sdata);
}
struct mesh_path *
mpp_path_lookup(struct ieee80211_sub_if_data *sdata, const u8 *dst)
{
- return mpath_lookup(rcu_dereference(mpp_paths), dst, sdata);
+ return mpath_lookup(sdata->u.mesh.mpp_paths, dst, sdata);
}
+static struct mesh_path *
+__mesh_path_lookup_by_idx(struct mesh_table *tbl, int idx)
+{
+ int i = 0, ret;
+ struct mesh_path *mpath = NULL;
+ struct rhashtable_iter iter;
+
+ ret = rhashtable_walk_init(&tbl->rhead, &iter, GFP_ATOMIC);
+ if (ret)
+ return NULL;
+
+ ret = rhashtable_walk_start(&iter);
+ if (ret && ret != -EAGAIN)
+ goto err;
+
+ while ((mpath = rhashtable_walk_next(&iter))) {
+ if (IS_ERR(mpath) && PTR_ERR(mpath) == -EAGAIN)
+ continue;
+ if (IS_ERR(mpath))
+ break;
+ if (i++ == idx)
+ break;
+ }
+err:
+ rhashtable_walk_stop(&iter);
+ rhashtable_walk_exit(&iter);
+
+ if (IS_ERR(mpath) || !mpath)
+ return NULL;
+
+ if (mpath_expired(mpath)) {
+ spin_lock_bh(&mpath->state_lock);
+ mpath->flags &= ~MESH_PATH_ACTIVE;
+ spin_unlock_bh(&mpath->state_lock);
+ }
+ return mpath;
+}
/**
* mesh_path_lookup_by_idx - look up a path in the mesh path table by its index
@@ -393,25 +296,7 @@ mpp_path_lookup(struct ieee80211_sub_if_data *sdata, const u8 *dst)
struct mesh_path *
mesh_path_lookup_by_idx(struct ieee80211_sub_if_data *sdata, int idx)
{
- struct mesh_table *tbl = rcu_dereference(mesh_paths);
- struct mpath_node *node;
- int i;
- int j = 0;
-
- for_each_mesh_entry(tbl, node, i) {
- if (sdata && node->mpath->sdata != sdata)
- continue;
- if (j++ == idx) {
- if (mpath_expired(node->mpath)) {
- spin_lock_bh(&node->mpath->state_lock);
- node->mpath->flags &= ~MESH_PATH_ACTIVE;
- spin_unlock_bh(&node->mpath->state_lock);
- }
- return node->mpath;
- }
- }
-
- return NULL;
+ return __mesh_path_lookup_by_idx(sdata->u.mesh.mesh_paths, idx);
}
/**
@@ -426,19 +311,7 @@ mesh_path_lookup_by_idx(struct ieee80211_sub_if_data *sdata, int idx)
struct mesh_path *
mpp_path_lookup_by_idx(struct ieee80211_sub_if_data *sdata, int idx)
{
- struct mesh_table *tbl = rcu_dereference(mpp_paths);
- struct mpath_node *node;
- int i;
- int j = 0;
-
- for_each_mesh_entry(tbl, node, i) {
- if (sdata && node->mpath->sdata != sdata)
- continue;
- if (j++ == idx)
- return node->mpath;
- }
-
- return NULL;
+ return __mesh_path_lookup_by_idx(sdata->u.mesh.mpp_paths, idx);
}
/**
@@ -448,30 +321,26 @@ mpp_path_lookup_by_idx(struct ieee80211_sub_if_data *sdata, int idx)
int mesh_path_add_gate(struct mesh_path *mpath)
{
struct mesh_table *tbl;
- struct mpath_node *gate, *new_gate;
int err;
rcu_read_lock();
- tbl = rcu_dereference(mesh_paths);
-
- hlist_for_each_entry_rcu(gate, tbl->known_gates, list)
- if (gate->mpath == mpath) {
- err = -EEXIST;
- goto err_rcu;
- }
+ tbl = mpath->sdata->u.mesh.mesh_paths;
- new_gate = kzalloc(sizeof(struct mpath_node), GFP_ATOMIC);
- if (!new_gate) {
- err = -ENOMEM;
+ spin_lock_bh(&mpath->state_lock);
+ if (mpath->is_gate) {
+ err = -EEXIST;
+ spin_unlock_bh(&mpath->state_lock);
goto err_rcu;
}
-
mpath->is_gate = true;
mpath->sdata->u.mesh.num_gates++;
- new_gate->mpath = mpath;
- spin_lock_bh(&tbl->gates_lock);
- hlist_add_head_rcu(&new_gate->list, tbl->known_gates);
- spin_unlock_bh(&tbl->gates_lock);
+
+ spin_lock(&tbl->gates_lock);
+ hlist_add_head_rcu(&mpath->gate_list, &tbl->known_gates);
+ spin_unlock(&tbl->gates_lock);
+
+ spin_unlock_bh(&mpath->state_lock);
+
mpath_dbg(mpath->sdata,
"Mesh path: Recorded new gate: %pM. %d known gates\n",
mpath->dst, mpath->sdata->u.mesh.num_gates);
@@ -485,28 +354,22 @@ err_rcu:
* mesh_gate_del - remove a mesh gate from the list of known gates
* @tbl: table which holds our list of known gates
* @mpath: gate mpath
- *
- * Locking: must be called inside rcu_read_lock() section
*/
static void mesh_gate_del(struct mesh_table *tbl, struct mesh_path *mpath)
{
- struct mpath_node *gate;
- struct hlist_node *q;
+ lockdep_assert_held(&mpath->state_lock);
+ if (!mpath->is_gate)
+ return;
- hlist_for_each_entry_safe(gate, q, tbl->known_gates, list) {
- if (gate->mpath != mpath)
- continue;
- spin_lock_bh(&tbl->gates_lock);
- hlist_del_rcu(&gate->list);
- kfree_rcu(gate, rcu);
- spin_unlock_bh(&tbl->gates_lock);
- mpath->sdata->u.mesh.num_gates--;
- mpath->is_gate = false;
- mpath_dbg(mpath->sdata,
- "Mesh path: Deleted gate: %pM. %d known gates\n",
- mpath->dst, mpath->sdata->u.mesh.num_gates);
- break;
- }
+ mpath->is_gate = false;
+ spin_lock_bh(&tbl->gates_lock);
+ hlist_del_rcu(&mpath->gate_list);
+ mpath->sdata->u.mesh.num_gates--;
+ spin_unlock_bh(&tbl->gates_lock);
+
+ mpath_dbg(mpath->sdata,
+ "Mesh path: Deleted gate: %pM. %d known gates\n",
+ mpath->dst, mpath->sdata->u.mesh.num_gates);
}
/**
@@ -518,6 +381,31 @@ int mesh_gate_num(struct ieee80211_sub_if_data *sdata)
return sdata->u.mesh.num_gates;
}
+static
+struct mesh_path *mesh_path_new(struct ieee80211_sub_if_data *sdata,
+ const u8 *dst, gfp_t gfp_flags)
+{
+ struct mesh_path *new_mpath;
+
+ new_mpath = kzalloc(sizeof(struct mesh_path), gfp_flags);
+ if (!new_mpath)
+ return NULL;
+
+ memcpy(new_mpath->dst, dst, ETH_ALEN);
+ eth_broadcast_addr(new_mpath->rann_snd_addr);
+ new_mpath->is_root = false;
+ new_mpath->sdata = sdata;
+ new_mpath->flags = 0;
+ skb_queue_head_init(&new_mpath->frame_queue);
+ new_mpath->timer.data = (unsigned long) new_mpath;
+ new_mpath->timer.function = mesh_path_timer;
+ new_mpath->exp_time = jiffies;
+ spin_lock_init(&new_mpath->state_lock);
+ init_timer(&new_mpath->timer);
+
+ return new_mpath;
+}
+
/**
* mesh_path_add - allocate and add a new path to the mesh path table
* @dst: destination address of the path (ETH_ALEN length)
@@ -530,15 +418,9 @@ int mesh_gate_num(struct ieee80211_sub_if_data *sdata)
struct mesh_path *mesh_path_add(struct ieee80211_sub_if_data *sdata,
const u8 *dst)
{
- struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh;
- struct ieee80211_local *local = sdata->local;
struct mesh_table *tbl;
struct mesh_path *mpath, *new_mpath;
- struct mpath_node *node, *new_node;
- struct hlist_head *bucket;
- int grow = 0;
- int err;
- u32 hash_idx;
+ int ret;
if (ether_addr_equal(dst, sdata->vif.addr))
/* never add ourselves as neighbours */
@@ -550,129 +432,44 @@ struct mesh_path *mesh_path_add(struct ieee80211_sub_if_data *sdata,
if (atomic_add_unless(&sdata->u.mesh.mpaths, 1, MESH_MAX_MPATHS) == 0)
return ERR_PTR(-ENOSPC);
- read_lock_bh(&pathtbl_resize_lock);
- tbl = resize_dereference_mesh_paths();
-
- hash_idx = mesh_table_hash(dst, sdata, tbl);
- bucket = &tbl->hash_buckets[hash_idx];
-
- spin_lock(&tbl->hashwlock[hash_idx]);
-
- hlist_for_each_entry(node, bucket, list) {
- mpath = node->mpath;
- if (mpath->sdata == sdata &&
- ether_addr_equal(dst, mpath->dst))
- goto found;
- }
-
- err = -ENOMEM;
- new_mpath = kzalloc(sizeof(struct mesh_path), GFP_ATOMIC);
+ new_mpath = mesh_path_new(sdata, dst, GFP_ATOMIC);
if (!new_mpath)
- goto err_path_alloc;
-
- new_node = kmalloc(sizeof(struct mpath_node), GFP_ATOMIC);
- if (!new_node)
- goto err_node_alloc;
-
- memcpy(new_mpath->dst, dst, ETH_ALEN);
- eth_broadcast_addr(new_mpath->rann_snd_addr);
- new_mpath->is_root = false;
- new_mpath->sdata = sdata;
- new_mpath->flags = 0;
- skb_queue_head_init(&new_mpath->frame_queue);
- new_node->mpath = new_mpath;
- new_mpath->timer.data = (unsigned long) new_mpath;
- new_mpath->timer.function = mesh_path_timer;
- new_mpath->exp_time = jiffies;
- spin_lock_init(&new_mpath->state_lock);
- init_timer(&new_mpath->timer);
-
- hlist_add_head_rcu(&new_node->list, bucket);
- if (atomic_inc_return(&tbl->entries) >=
- MEAN_CHAIN_LEN * (tbl->hash_mask + 1))
- grow = 1;
-
- mesh_paths_generation++;
-
- if (grow) {
- set_bit(MESH_WORK_GROW_MPATH_TABLE, &ifmsh->wrkq_flags);
- ieee80211_queue_work(&local->hw, &sdata->work);
- }
- mpath = new_mpath;
-found:
- spin_unlock(&tbl->hashwlock[hash_idx]);
- read_unlock_bh(&pathtbl_resize_lock);
- return mpath;
+ return ERR_PTR(-ENOMEM);
-err_node_alloc:
- kfree(new_mpath);
-err_path_alloc:
- atomic_dec(&sdata->u.mesh.mpaths);
- spin_unlock(&tbl->hashwlock[hash_idx]);
- read_unlock_bh(&pathtbl_resize_lock);
- return ERR_PTR(err);
-}
+ tbl = sdata->u.mesh.mesh_paths;
+ do {
+ ret = rhashtable_lookup_insert_fast(&tbl->rhead,
+ &new_mpath->rhash,
+ mesh_rht_params);
-static void mesh_table_free_rcu(struct rcu_head *rcu)
-{
- struct mesh_table *tbl = container_of(rcu, struct mesh_table, rcu_head);
+ if (ret == -EEXIST)
+ mpath = rhashtable_lookup_fast(&tbl->rhead,
+ dst,
+ mesh_rht_params);
- mesh_table_free(tbl, false);
-}
+ } while (unlikely(ret == -EEXIST && !mpath));
-void mesh_mpath_table_grow(void)
-{
- struct mesh_table *oldtbl, *newtbl;
-
- write_lock_bh(&pathtbl_resize_lock);
- oldtbl = resize_dereference_mesh_paths();
- newtbl = mesh_table_alloc(oldtbl->size_order + 1);
- if (!newtbl)
- goto out;
- if (mesh_table_grow(oldtbl, newtbl) < 0) {
- __mesh_table_free(newtbl);
- goto out;
- }
- rcu_assign_pointer(mesh_paths, newtbl);
-
- call_rcu(&oldtbl->rcu_head, mesh_table_free_rcu);
-
- out:
- write_unlock_bh(&pathtbl_resize_lock);
-}
+ if (ret && ret != -EEXIST)
+ return ERR_PTR(ret);
-void mesh_mpp_table_grow(void)
-{
- struct mesh_table *oldtbl, *newtbl;
-
- write_lock_bh(&pathtbl_resize_lock);
- oldtbl = resize_dereference_mpp_paths();
- newtbl = mesh_table_alloc(oldtbl->size_order + 1);
- if (!newtbl)
- goto out;
- if (mesh_table_grow(oldtbl, newtbl) < 0) {
- __mesh_table_free(newtbl);
- goto out;
+ /* At this point either new_mpath was added, or we found a
+ * matching entry already in the table; in the latter case
+ * free the unnecessary new entry.
+ */
+ if (ret == -EEXIST) {
+ kfree(new_mpath);
+ new_mpath = mpath;
}
- rcu_assign_pointer(mpp_paths, newtbl);
- call_rcu(&oldtbl->rcu_head, mesh_table_free_rcu);
-
- out:
- write_unlock_bh(&pathtbl_resize_lock);
+ sdata->u.mesh.mesh_paths_generation++;
+ return new_mpath;
}
int mpp_path_add(struct ieee80211_sub_if_data *sdata,
const u8 *dst, const u8 *mpp)
{
- struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh;
- struct ieee80211_local *local = sdata->local;
struct mesh_table *tbl;
- struct mesh_path *mpath, *new_mpath;
- struct mpath_node *node, *new_node;
- struct hlist_head *bucket;
- int grow = 0;
- int err = 0;
- u32 hash_idx;
+ struct mesh_path *new_mpath;
+ int ret;
if (ether_addr_equal(dst, sdata->vif.addr))
/* never add ourselves as neighbours */
@@ -681,65 +478,19 @@ int mpp_path_add(struct ieee80211_sub_if_data *sdata,
if (is_multicast_ether_addr(dst))
return -ENOTSUPP;
- err = -ENOMEM;
- new_mpath = kzalloc(sizeof(struct mesh_path), GFP_ATOMIC);
- if (!new_mpath)
- goto err_path_alloc;
+ new_mpath = mesh_path_new(sdata, dst, GFP_ATOMIC);
- new_node = kmalloc(sizeof(struct mpath_node), GFP_ATOMIC);
- if (!new_node)
- goto err_node_alloc;
+ if (!new_mpath)
+ return -ENOMEM;
- read_lock_bh(&pathtbl_resize_lock);
- memcpy(new_mpath->dst, dst, ETH_ALEN);
memcpy(new_mpath->mpp, mpp, ETH_ALEN);
- new_mpath->sdata = sdata;
- new_mpath->flags = 0;
- skb_queue_head_init(&new_mpath->frame_queue);
- new_node->mpath = new_mpath;
- init_timer(&new_mpath->timer);
- new_mpath->exp_time = jiffies;
- spin_lock_init(&new_mpath->state_lock);
-
- tbl = resize_dereference_mpp_paths();
+ tbl = sdata->u.mesh.mpp_paths;
+ ret = rhashtable_lookup_insert_fast(&tbl->rhead,
+ &new_mpath->rhash,
+ mesh_rht_params);
- hash_idx = mesh_table_hash(dst, sdata, tbl);
- bucket = &tbl->hash_buckets[hash_idx];
-
- spin_lock(&tbl->hashwlock[hash_idx]);
-
- err = -EEXIST;
- hlist_for_each_entry(node, bucket, list) {
- mpath = node->mpath;
- if (mpath->sdata == sdata &&
- ether_addr_equal(dst, mpath->dst))
- goto err_exists;
- }
-
- hlist_add_head_rcu(&new_node->list, bucket);
- if (atomic_inc_return(&tbl->entries) >=
- MEAN_CHAIN_LEN * (tbl->hash_mask + 1))
- grow = 1;
-
- spin_unlock(&tbl->hashwlock[hash_idx]);
- read_unlock_bh(&pathtbl_resize_lock);
-
- mpp_paths_generation++;
-
- if (grow) {
- set_bit(MESH_WORK_GROW_MPP_TABLE, &ifmsh->wrkq_flags);
- ieee80211_queue_work(&local->hw, &sdata->work);
- }
- return 0;
-
-err_exists:
- spin_unlock(&tbl->hashwlock[hash_idx]);
- read_unlock_bh(&pathtbl_resize_lock);
- kfree(new_node);
-err_node_alloc:
- kfree(new_mpath);
-err_path_alloc:
- return err;
+ sdata->u.mesh.mpp_paths_generation++;
+ return ret;
}
@@ -753,17 +504,26 @@ err_path_alloc:
*/
void mesh_plink_broken(struct sta_info *sta)
{
- struct mesh_table *tbl;
+ struct ieee80211_sub_if_data *sdata = sta->sdata;
+ struct mesh_table *tbl = sdata->u.mesh.mesh_paths;
static const u8 bcast[ETH_ALEN] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
struct mesh_path *mpath;
- struct mpath_node *node;
- struct ieee80211_sub_if_data *sdata = sta->sdata;
- int i;
+ struct rhashtable_iter iter;
+ int ret;
- rcu_read_lock();
- tbl = rcu_dereference(mesh_paths);
- for_each_mesh_entry(tbl, node, i) {
- mpath = node->mpath;
+ ret = rhashtable_walk_init(&tbl->rhead, &iter, GFP_ATOMIC);
+ if (ret)
+ return;
+
+ ret = rhashtable_walk_start(&iter);
+ if (ret && ret != -EAGAIN)
+ goto out;
+
+ while ((mpath = rhashtable_walk_next(&iter))) {
+ if (IS_ERR(mpath) && PTR_ERR(mpath) == -EAGAIN)
+ continue;
+ if (IS_ERR(mpath))
+ break;
if (rcu_access_pointer(mpath->next_hop) == sta &&
mpath->flags & MESH_PATH_ACTIVE &&
!(mpath->flags & MESH_PATH_FIXED)) {
@@ -777,33 +537,30 @@ void mesh_plink_broken(struct sta_info *sta)
WLAN_REASON_MESH_PATH_DEST_UNREACHABLE, bcast);
}
}
- rcu_read_unlock();
+out:
+ rhashtable_walk_stop(&iter);
+ rhashtable_walk_exit(&iter);
}
-static void mesh_path_node_reclaim(struct rcu_head *rp)
+static void mesh_path_free_rcu(struct mesh_table *tbl,
+ struct mesh_path *mpath)
{
- struct mpath_node *node = container_of(rp, struct mpath_node, rcu);
+ struct ieee80211_sub_if_data *sdata = mpath->sdata;
- del_timer_sync(&node->mpath->timer);
- kfree(node->mpath);
- kfree(node);
+ spin_lock_bh(&mpath->state_lock);
+ mpath->flags |= MESH_PATH_RESOLVING | MESH_PATH_DELETED;
+ mesh_gate_del(tbl, mpath);
+ spin_unlock_bh(&mpath->state_lock);
+ del_timer_sync(&mpath->timer);
+ atomic_dec(&sdata->u.mesh.mpaths);
+ atomic_dec(&tbl->entries);
+ kfree_rcu(mpath, rcu);
}
-/* needs to be called with the corresponding hashwlock taken */
-static void __mesh_path_del(struct mesh_table *tbl, struct mpath_node *node)
+static void __mesh_path_del(struct mesh_table *tbl, struct mesh_path *mpath)
{
- struct mesh_path *mpath = node->mpath;
- struct ieee80211_sub_if_data *sdata = node->mpath->sdata;
-
- spin_lock(&mpath->state_lock);
- mpath->flags |= MESH_PATH_RESOLVING;
- if (mpath->is_gate)
- mesh_gate_del(tbl, mpath);
- hlist_del_rcu(&node->list);
- call_rcu(&node->rcu, mesh_path_node_reclaim);
- spin_unlock(&mpath->state_lock);
- atomic_dec(&sdata->u.mesh.mpaths);
- atomic_dec(&tbl->entries);
+ rhashtable_remove_fast(&tbl->rhead, &mpath->rhash, mesh_rht_params);
+ mesh_path_free_rcu(tbl, mpath);
}
/**
@@ -819,65 +576,88 @@ static void __mesh_path_del(struct mesh_table *tbl, struct mpath_node *node)
*/
void mesh_path_flush_by_nexthop(struct sta_info *sta)
{
- struct mesh_table *tbl;
+ struct ieee80211_sub_if_data *sdata = sta->sdata;
+ struct mesh_table *tbl = sdata->u.mesh.mesh_paths;
struct mesh_path *mpath;
- struct mpath_node *node;
- int i;
+ struct rhashtable_iter iter;
+ int ret;
- rcu_read_lock();
- read_lock_bh(&pathtbl_resize_lock);
- tbl = resize_dereference_mesh_paths();
- for_each_mesh_entry(tbl, node, i) {
- mpath = node->mpath;
- if (rcu_access_pointer(mpath->next_hop) == sta) {
- spin_lock(&tbl->hashwlock[i]);
- __mesh_path_del(tbl, node);
- spin_unlock(&tbl->hashwlock[i]);
- }
+ ret = rhashtable_walk_init(&tbl->rhead, &iter, GFP_ATOMIC);
+ if (ret)
+ return;
+
+ ret = rhashtable_walk_start(&iter);
+ if (ret && ret != -EAGAIN)
+ goto out;
+
+ while ((mpath = rhashtable_walk_next(&iter))) {
+ if (IS_ERR(mpath) && PTR_ERR(mpath) == -EAGAIN)
+ continue;
+ if (IS_ERR(mpath))
+ break;
+
+ if (rcu_access_pointer(mpath->next_hop) == sta)
+ __mesh_path_del(tbl, mpath);
}
- read_unlock_bh(&pathtbl_resize_lock);
- rcu_read_unlock();
+out:
+ rhashtable_walk_stop(&iter);
+ rhashtable_walk_exit(&iter);
}
static void mpp_flush_by_proxy(struct ieee80211_sub_if_data *sdata,
const u8 *proxy)
{
- struct mesh_table *tbl;
- struct mesh_path *mpp;
- struct mpath_node *node;
- int i;
+ struct mesh_table *tbl = sdata->u.mesh.mpp_paths;
+ struct mesh_path *mpath;
+ struct rhashtable_iter iter;
+ int ret;
- rcu_read_lock();
- read_lock_bh(&pathtbl_resize_lock);
- tbl = resize_dereference_mpp_paths();
- for_each_mesh_entry(tbl, node, i) {
- mpp = node->mpath;
- if (ether_addr_equal(mpp->mpp, proxy)) {
- spin_lock(&tbl->hashwlock[i]);
- __mesh_path_del(tbl, node);
- spin_unlock(&tbl->hashwlock[i]);
- }
+ ret = rhashtable_walk_init(&tbl->rhead, &iter, GFP_ATOMIC);
+ if (ret)
+ return;
+
+ ret = rhashtable_walk_start(&iter);
+ if (ret && ret != -EAGAIN)
+ goto out;
+
+ while ((mpath = rhashtable_walk_next(&iter))) {
+ if (IS_ERR(mpath) && PTR_ERR(mpath) == -EAGAIN)
+ continue;
+ if (IS_ERR(mpath))
+ break;
+
+ if (ether_addr_equal(mpath->mpp, proxy))
+ __mesh_path_del(tbl, mpath);
}
- read_unlock_bh(&pathtbl_resize_lock);
- rcu_read_unlock();
+out:
+ rhashtable_walk_stop(&iter);
+ rhashtable_walk_exit(&iter);
}
-static void table_flush_by_iface(struct mesh_table *tbl,
- struct ieee80211_sub_if_data *sdata)
+static void table_flush_by_iface(struct mesh_table *tbl)
{
struct mesh_path *mpath;
- struct mpath_node *node;
- int i;
+ struct rhashtable_iter iter;
+ int ret;
+
+ ret = rhashtable_walk_init(&tbl->rhead, &iter, GFP_ATOMIC);
+ if (ret)
+ return;
+
+ ret = rhashtable_walk_start(&iter);
+ if (ret && ret != -EAGAIN)
+ goto out;
- WARN_ON(!rcu_read_lock_held());
- for_each_mesh_entry(tbl, node, i) {
- mpath = node->mpath;
- if (mpath->sdata != sdata)
+ while ((mpath = rhashtable_walk_next(&iter))) {
+ if (IS_ERR(mpath) && PTR_ERR(mpath) == -EAGAIN)
continue;
- spin_lock_bh(&tbl->hashwlock[i]);
- __mesh_path_del(tbl, node);
- spin_unlock_bh(&tbl->hashwlock[i]);
+ if (IS_ERR(mpath))
+ break;
+ __mesh_path_del(tbl, mpath);
}
+out:
+ rhashtable_walk_stop(&iter);
+ rhashtable_walk_exit(&iter);
}
/**
@@ -890,16 +670,8 @@ static void table_flush_by_iface(struct mesh_table *tbl,
*/
void mesh_path_flush_by_iface(struct ieee80211_sub_if_data *sdata)
{
- struct mesh_table *tbl;
-
- rcu_read_lock();
- read_lock_bh(&pathtbl_resize_lock);
- tbl = resize_dereference_mesh_paths();
- table_flush_by_iface(tbl, sdata);
- tbl = resize_dereference_mpp_paths();
- table_flush_by_iface(tbl, sdata);
- read_unlock_bh(&pathtbl_resize_lock);
- rcu_read_unlock();
+ table_flush_by_iface(sdata->u.mesh.mesh_paths);
+ table_flush_by_iface(sdata->u.mesh.mpp_paths);
}
/**
@@ -911,37 +683,25 @@ void mesh_path_flush_by_iface(struct ieee80211_sub_if_data *sdata)
*
* Returns: 0 if successful
*/
-static int table_path_del(struct mesh_table __rcu *rcu_tbl,
+static int table_path_del(struct mesh_table *tbl,
struct ieee80211_sub_if_data *sdata,
const u8 *addr)
{
- struct mesh_table *tbl;
struct mesh_path *mpath;
- struct mpath_node *node;
- struct hlist_head *bucket;
- int hash_idx;
- int err = 0;
-
- tbl = resize_dereference_paths(rcu_tbl);
- hash_idx = mesh_table_hash(addr, sdata, tbl);
- bucket = &tbl->hash_buckets[hash_idx];
-
- spin_lock(&tbl->hashwlock[hash_idx]);
- hlist_for_each_entry(node, bucket, list) {
- mpath = node->mpath;
- if (mpath->sdata == sdata &&
- ether_addr_equal(addr, mpath->dst)) {
- __mesh_path_del(tbl, node);
- goto enddel;
- }
+
+ rcu_read_lock();
+ mpath = rhashtable_lookup_fast(&tbl->rhead, addr, mesh_rht_params);
+ if (!mpath) {
+ rcu_read_unlock();
+ return -ENXIO;
}
- err = -ENXIO;
-enddel:
- spin_unlock(&tbl->hashwlock[hash_idx]);
- return err;
+ __mesh_path_del(tbl, mpath);
+ rcu_read_unlock();
+ return 0;
}
+
/**
* mesh_path_del - delete a mesh path from the table
*
@@ -952,36 +712,13 @@ enddel:
*/
int mesh_path_del(struct ieee80211_sub_if_data *sdata, const u8 *addr)
{
- int err = 0;
+ int err;
/* flush relevant mpp entries first */
mpp_flush_by_proxy(sdata, addr);
- read_lock_bh(&pathtbl_resize_lock);
- err = table_path_del(mesh_paths, sdata, addr);
- mesh_paths_generation++;
- read_unlock_bh(&pathtbl_resize_lock);
-
- return err;
-}
-
-/**
- * mpp_path_del - delete a mesh proxy path from the table
- *
- * @addr: addr address (ETH_ALEN length)
- * @sdata: local subif
- *
- * Returns: 0 if successful
- */
-static int mpp_path_del(struct ieee80211_sub_if_data *sdata, const u8 *addr)
-{
- int err = 0;
-
- read_lock_bh(&pathtbl_resize_lock);
- err = table_path_del(mpp_paths, sdata, addr);
- mpp_paths_generation++;
- read_unlock_bh(&pathtbl_resize_lock);
-
+ err = table_path_del(sdata->u.mesh.mesh_paths, sdata, addr);
+ sdata->u.mesh.mesh_paths_generation++;
return err;
}
@@ -1015,39 +752,30 @@ int mesh_path_send_to_gates(struct mesh_path *mpath)
struct ieee80211_sub_if_data *sdata = mpath->sdata;
struct mesh_table *tbl;
struct mesh_path *from_mpath = mpath;
- struct mpath_node *gate = NULL;
+ struct mesh_path *gate;
bool copy = false;
- struct hlist_head *known_gates;
-
- rcu_read_lock();
- tbl = rcu_dereference(mesh_paths);
- known_gates = tbl->known_gates;
- rcu_read_unlock();
- if (!known_gates)
- return -EHOSTUNREACH;
+ tbl = sdata->u.mesh.mesh_paths;
- hlist_for_each_entry_rcu(gate, known_gates, list) {
- if (gate->mpath->sdata != sdata)
- continue;
-
- if (gate->mpath->flags & MESH_PATH_ACTIVE) {
- mpath_dbg(sdata, "Forwarding to %pM\n", gate->mpath->dst);
- mesh_path_move_to_queue(gate->mpath, from_mpath, copy);
- from_mpath = gate->mpath;
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(gate, &tbl->known_gates, gate_list) {
+ if (gate->flags & MESH_PATH_ACTIVE) {
+ mpath_dbg(sdata, "Forwarding to %pM\n", gate->dst);
+ mesh_path_move_to_queue(gate, from_mpath, copy);
+ from_mpath = gate;
copy = true;
} else {
mpath_dbg(sdata,
"Not forwarding to %pM (flags %#x)\n",
- gate->mpath->dst, gate->mpath->flags);
+ gate->dst, gate->flags);
}
}
- hlist_for_each_entry_rcu(gate, known_gates, list)
- if (gate->mpath->sdata == sdata) {
- mpath_dbg(sdata, "Sending to %pM\n", gate->mpath->dst);
- mesh_path_tx_pending(gate->mpath);
- }
+ hlist_for_each_entry_rcu(gate, &tbl->known_gates, gate_list) {
+ mpath_dbg(sdata, "Sending to %pM\n", gate->dst);
+ mesh_path_tx_pending(gate);
+ }
+ rcu_read_unlock();
return (from_mpath == mpath) ? -EHOSTUNREACH : 0;
}
@@ -1104,118 +832,73 @@ void mesh_path_fix_nexthop(struct mesh_path *mpath, struct sta_info *next_hop)
mesh_path_tx_pending(mpath);
}
-static void mesh_path_node_free(struct hlist_node *p, bool free_leafs)
-{
- struct mesh_path *mpath;
- struct mpath_node *node = hlist_entry(p, struct mpath_node, list);
- mpath = node->mpath;
- hlist_del_rcu(p);
- if (free_leafs) {
- del_timer_sync(&mpath->timer);
- kfree(mpath);
- }
- kfree(node);
-}
-
-static int mesh_path_node_copy(struct hlist_node *p, struct mesh_table *newtbl)
-{
- struct mesh_path *mpath;
- struct mpath_node *node, *new_node;
- u32 hash_idx;
-
- new_node = kmalloc(sizeof(struct mpath_node), GFP_ATOMIC);
- if (new_node == NULL)
- return -ENOMEM;
-
- node = hlist_entry(p, struct mpath_node, list);
- mpath = node->mpath;
- new_node->mpath = mpath;
- hash_idx = mesh_table_hash(mpath->dst, mpath->sdata, newtbl);
- hlist_add_head(&new_node->list,
- &newtbl->hash_buckets[hash_idx]);
- return 0;
-}
-
-int mesh_pathtbl_init(void)
+int mesh_pathtbl_init(struct ieee80211_sub_if_data *sdata)
{
struct mesh_table *tbl_path, *tbl_mpp;
int ret;
- tbl_path = mesh_table_alloc(INIT_PATHS_SIZE_ORDER);
+ tbl_path = mesh_table_alloc();
if (!tbl_path)
return -ENOMEM;
- tbl_path->free_node = &mesh_path_node_free;
- tbl_path->copy_node = &mesh_path_node_copy;
- tbl_path->known_gates = kzalloc(sizeof(struct hlist_head), GFP_ATOMIC);
- if (!tbl_path->known_gates) {
- ret = -ENOMEM;
- goto free_path;
- }
- INIT_HLIST_HEAD(tbl_path->known_gates);
-
- tbl_mpp = mesh_table_alloc(INIT_PATHS_SIZE_ORDER);
+ tbl_mpp = mesh_table_alloc();
if (!tbl_mpp) {
ret = -ENOMEM;
goto free_path;
}
- tbl_mpp->free_node = &mesh_path_node_free;
- tbl_mpp->copy_node = &mesh_path_node_copy;
- tbl_mpp->known_gates = kzalloc(sizeof(struct hlist_head), GFP_ATOMIC);
- if (!tbl_mpp->known_gates) {
- ret = -ENOMEM;
- goto free_mpp;
- }
- INIT_HLIST_HEAD(tbl_mpp->known_gates);
- /* Need no locking since this is during init */
- RCU_INIT_POINTER(mesh_paths, tbl_path);
- RCU_INIT_POINTER(mpp_paths, tbl_mpp);
+ rhashtable_init(&tbl_path->rhead, &mesh_rht_params);
+ rhashtable_init(&tbl_mpp->rhead, &mesh_rht_params);
+
+ sdata->u.mesh.mesh_paths = tbl_path;
+ sdata->u.mesh.mpp_paths = tbl_mpp;
return 0;
-free_mpp:
- mesh_table_free(tbl_mpp, true);
free_path:
- mesh_table_free(tbl_path, true);
+ mesh_table_free(tbl_path);
return ret;
}
-void mesh_path_expire(struct ieee80211_sub_if_data *sdata)
+static
+void mesh_path_tbl_expire(struct ieee80211_sub_if_data *sdata,
+ struct mesh_table *tbl)
{
- struct mesh_table *tbl;
struct mesh_path *mpath;
- struct mpath_node *node;
- int i;
+ struct rhashtable_iter iter;
+ int ret;
- rcu_read_lock();
- tbl = rcu_dereference(mesh_paths);
- for_each_mesh_entry(tbl, node, i) {
- if (node->mpath->sdata != sdata)
+ ret = rhashtable_walk_init(&tbl->rhead, &iter, GFP_KERNEL);
+ if (ret)
+ return;
+
+ ret = rhashtable_walk_start(&iter);
+ if (ret && ret != -EAGAIN)
+ goto out;
+
+ while ((mpath = rhashtable_walk_next(&iter))) {
+ if (IS_ERR(mpath) && PTR_ERR(mpath) == -EAGAIN)
continue;
- mpath = node->mpath;
+ if (IS_ERR(mpath))
+ break;
if ((!(mpath->flags & MESH_PATH_RESOLVING)) &&
(!(mpath->flags & MESH_PATH_FIXED)) &&
time_after(jiffies, mpath->exp_time + MESH_PATH_EXPIRE))
- mesh_path_del(mpath->sdata, mpath->dst);
- }
-
- tbl = rcu_dereference(mpp_paths);
- for_each_mesh_entry(tbl, node, i) {
- if (node->mpath->sdata != sdata)
- continue;
- mpath = node->mpath;
- if ((!(mpath->flags & MESH_PATH_FIXED)) &&
- time_after(jiffies, mpath->exp_time + MESH_PATH_EXPIRE))
- mpp_path_del(mpath->sdata, mpath->dst);
+ __mesh_path_del(tbl, mpath);
}
+out:
+ rhashtable_walk_stop(&iter);
+ rhashtable_walk_exit(&iter);
+}
- rcu_read_unlock();
+void mesh_path_expire(struct ieee80211_sub_if_data *sdata)
+{
+ mesh_path_tbl_expire(sdata, sdata->u.mesh.mesh_paths);
+ mesh_path_tbl_expire(sdata, sdata->u.mesh.mpp_paths);
}
-void mesh_pathtbl_unregister(void)
+void mesh_pathtbl_unregister(struct ieee80211_sub_if_data *sdata)
{
- /* no need for locking during exit path */
- mesh_table_free(rcu_dereference_protected(mesh_paths, 1), true);
- mesh_table_free(rcu_dereference_protected(mpp_paths, 1), true);
+ mesh_table_free(sdata->u.mesh.mesh_paths);
+ mesh_table_free(sdata->u.mesh.mpp_paths);
}
diff --git a/net/mac80211/mesh_plink.c b/net/mac80211/mesh_plink.c
index a07e93c21..79f2a0a13 100644
--- a/net/mac80211/mesh_plink.c
+++ b/net/mac80211/mesh_plink.c
@@ -61,7 +61,7 @@ static bool rssi_threshold_check(struct ieee80211_sub_if_data *sdata,
s32 rssi_threshold = sdata->u.mesh.mshcfg.rssi_threshold;
return rssi_threshold == 0 ||
(sta &&
- (s8)-ewma_signal_read(&sta->rx_stats.avg_signal) >
+ (s8)-ewma_signal_read(&sta->rx_stats_avg.signal) >
rssi_threshold);
}
@@ -93,18 +93,18 @@ static inline void mesh_plink_fsm_restart(struct sta_info *sta)
static u32 mesh_set_short_slot_time(struct ieee80211_sub_if_data *sdata)
{
struct ieee80211_local *local = sdata->local;
- enum ieee80211_band band = ieee80211_get_sdata_band(sdata);
+ enum nl80211_band band = ieee80211_get_sdata_band(sdata);
struct ieee80211_supported_band *sband = local->hw.wiphy->bands[band];
struct sta_info *sta;
u32 erp_rates = 0, changed = 0;
int i;
bool short_slot = false;
- if (band == IEEE80211_BAND_5GHZ) {
+ if (band == NL80211_BAND_5GHZ) {
/* (IEEE 802.11-2012 19.4.5) */
short_slot = true;
goto out;
- } else if (band != IEEE80211_BAND_2GHZ)
+ } else if (band != NL80211_BAND_2GHZ)
goto out;
for (i = 0; i < sband->n_bitrates; i++)
@@ -247,7 +247,7 @@ static int mesh_plink_frame_tx(struct ieee80211_sub_if_data *sdata,
mgmt->u.action.u.self_prot.action_code = action;
if (action != WLAN_SP_MESH_PEERING_CLOSE) {
- enum ieee80211_band band = ieee80211_get_sdata_band(sdata);
+ enum nl80211_band band = ieee80211_get_sdata_band(sdata);
/* capability info */
pos = skb_put(skb, 2);
@@ -331,7 +331,9 @@ free:
*
* @sta: mesh peer link to deactivate
*
- * All mesh paths with this peer as next hop will be flushed
+ * Mesh paths with this peer as next hop should be flushed
+ * by the caller outside of plink_lock.
+ *
* Returns beacon changed flag if the beacon content changed.
*
* Locking: the caller must hold sta->mesh->plink_lock
@@ -346,7 +348,6 @@ static u32 __mesh_plink_deactivate(struct sta_info *sta)
if (sta->mesh->plink_state == NL80211_PLINK_ESTAB)
changed = mesh_plink_dec_estab_count(sdata);
sta->mesh->plink_state = NL80211_PLINK_BLOCKED;
- mesh_path_flush_by_nexthop(sta);
ieee80211_mps_sta_status_update(sta);
changed |= ieee80211_mps_set_sta_local_pm(sta,
@@ -374,6 +375,7 @@ u32 mesh_plink_deactivate(struct sta_info *sta)
sta->sta.addr, sta->mesh->llid, sta->mesh->plid,
sta->mesh->reason);
spin_unlock_bh(&sta->mesh->plink_lock);
+ mesh_path_flush_by_nexthop(sta);
return changed;
}
@@ -383,7 +385,7 @@ static void mesh_sta_info_init(struct ieee80211_sub_if_data *sdata,
struct ieee802_11_elems *elems, bool insert)
{
struct ieee80211_local *local = sdata->local;
- enum ieee80211_band band = ieee80211_get_sdata_band(sdata);
+ enum nl80211_band band = ieee80211_get_sdata_band(sdata);
struct ieee80211_supported_band *sband;
u32 rates, basic_rates = 0, changed = 0;
enum ieee80211_sta_rx_bandwidth bw = sta->sta.bandwidth;
@@ -748,6 +750,7 @@ u32 mesh_plink_block(struct sta_info *sta)
changed = __mesh_plink_deactivate(sta);
sta->mesh->plink_state = NL80211_PLINK_BLOCKED;
spin_unlock_bh(&sta->mesh->plink_lock);
+ mesh_path_flush_by_nexthop(sta);
return changed;
}
@@ -797,6 +800,7 @@ static u32 mesh_plink_fsm(struct ieee80211_sub_if_data *sdata,
struct mesh_config *mshcfg = &sdata->u.mesh.mshcfg;
enum ieee80211_self_protected_actioncode action = 0;
u32 changed = 0;
+ bool flush = false;
mpl_dbg(sdata, "peer %pM in state %s got event %s\n", sta->sta.addr,
mplstates[sta->mesh->plink_state], mplevents[event]);
@@ -885,6 +889,7 @@ static u32 mesh_plink_fsm(struct ieee80211_sub_if_data *sdata,
changed |= mesh_set_short_slot_time(sdata);
mesh_plink_close(sdata, sta, event);
action = WLAN_SP_MESH_PEERING_CLOSE;
+ flush = true;
break;
case OPN_ACPT:
action = WLAN_SP_MESH_PEERING_CONFIRM;
@@ -916,6 +921,8 @@ static u32 mesh_plink_fsm(struct ieee80211_sub_if_data *sdata,
break;
}
spin_unlock_bh(&sta->mesh->plink_lock);
+ if (flush)
+ mesh_path_flush_by_nexthop(sta);
if (action) {
mesh_plink_frame_tx(sdata, sta, action, sta->sta.addr,
sta->mesh->llid, sta->mesh->plid,
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 281b8d6e5..8d426f637 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -122,15 +122,16 @@ void ieee80211_sta_reset_conn_monitor(struct ieee80211_sub_if_data *sdata)
{
struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
- if (unlikely(!sdata->u.mgd.associated))
+ if (unlikely(!ifmgd->associated))
return;
- ifmgd->probe_send_count = 0;
+ if (ifmgd->probe_send_count)
+ ifmgd->probe_send_count = 0;
if (ieee80211_hw_check(&sdata->local->hw, CONNECTION_MONITOR))
return;
- mod_timer(&sdata->u.mgd.conn_mon_timer,
+ mod_timer(&ifmgd->conn_mon_timer,
round_jiffies_up(jiffies + IEEE80211_CONNECTION_IDLE_TIME));
}
@@ -660,7 +661,7 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata)
capab = WLAN_CAPABILITY_ESS;
- if (sband->band == IEEE80211_BAND_2GHZ) {
+ if (sband->band == NL80211_BAND_2GHZ) {
capab |= WLAN_CAPABILITY_SHORT_SLOT_TIME;
capab |= WLAN_CAPABILITY_SHORT_PREAMBLE;
}
@@ -1099,7 +1100,7 @@ ieee80211_sta_process_chanswitch(struct ieee80211_sub_if_data *sdata,
struct cfg80211_bss *cbss = ifmgd->associated;
struct ieee80211_chanctx_conf *conf;
struct ieee80211_chanctx *chanctx;
- enum ieee80211_band current_band;
+ enum nl80211_band current_band;
struct ieee80211_csa_ie csa_ie;
struct ieee80211_channel_switch ch_switch;
int res;
@@ -1256,11 +1257,11 @@ ieee80211_find_80211h_pwr_constr(struct ieee80211_sub_if_data *sdata,
default:
WARN_ON_ONCE(1);
/* fall through */
- case IEEE80211_BAND_2GHZ:
- case IEEE80211_BAND_60GHZ:
+ case NL80211_BAND_2GHZ:
+ case NL80211_BAND_60GHZ:
chan_increment = 1;
break;
- case IEEE80211_BAND_5GHZ:
+ case NL80211_BAND_5GHZ:
chan_increment = 4;
break;
}
@@ -1860,7 +1861,7 @@ static u32 ieee80211_handle_bss_capability(struct ieee80211_sub_if_data *sdata,
}
use_short_slot = !!(capab & WLAN_CAPABILITY_SHORT_SLOT_TIME);
- if (ieee80211_get_sdata_band(sdata) == IEEE80211_BAND_5GHZ)
+ if (ieee80211_get_sdata_band(sdata) == NL80211_BAND_5GHZ)
use_short_slot = true;
if (use_protection != bss_conf->use_cts_prot) {
@@ -2216,6 +2217,7 @@ static void ieee80211_mgd_probe_ap_send(struct ieee80211_sub_if_data *sdata)
const u8 *ssid;
u8 *dst = ifmgd->associated->bssid;
u8 unicast_limit = max(1, max_probe_tries - 3);
+ struct sta_info *sta;
/*
* Try sending broadcast probe requests for the last three
@@ -2234,6 +2236,14 @@ static void ieee80211_mgd_probe_ap_send(struct ieee80211_sub_if_data *sdata)
*/
ifmgd->probe_send_count++;
+ if (dst) {
+ mutex_lock(&sdata->local->sta_mtx);
+ sta = sta_info_get(sdata, dst);
+ if (!WARN_ON(!sta))
+ ieee80211_check_fast_rx(sta);
+ mutex_unlock(&sdata->local->sta_mtx);
+ }
+
if (ieee80211_hw_check(&sdata->local->hw, REPORTS_TX_ACK_STATUS)) {
ifmgd->nullfunc_failed = false;
ieee80211_send_nullfunc(sdata->local, sdata, false);
@@ -2389,6 +2399,11 @@ static void __ieee80211_disconnect(struct ieee80211_sub_if_data *sdata)
return;
}
+ /* AP is probably out of range (or not reachable for another reason) so
+ * remove the bss struct for that AP.
+ */
+ cfg80211_unlink_bss(local->hw.wiphy, ifmgd->associated);
+
ieee80211_set_disassoc(sdata, IEEE80211_STYPE_DEAUTH,
WLAN_REASON_DISASSOC_DUE_TO_INACTIVITY,
true, frame_buf);
@@ -4365,7 +4380,7 @@ static int ieee80211_prep_connection(struct ieee80211_sub_if_data *sdata,
sdata->vif.bss_conf.basic_rates = basic_rates;
/* cf. IEEE 802.11 9.2.12 */
- if (cbss->channel->band == IEEE80211_BAND_2GHZ &&
+ if (cbss->channel->band == NL80211_BAND_2GHZ &&
have_higher_than_11mbit)
sdata->flags |= IEEE80211_SDATA_OPERATING_GMODE;
else
diff --git a/net/mac80211/ocb.c b/net/mac80211/ocb.c
index 0be0aadfc..88e6ebbbe 100644
--- a/net/mac80211/ocb.c
+++ b/net/mac80211/ocb.c
@@ -75,8 +75,6 @@ void ieee80211_ocb_rx_no_sta(struct ieee80211_sub_if_data *sdata,
if (!sta)
return;
- sta->rx_stats.last_rx = jiffies;
-
/* Add only mandatory rates for now */
sband = local->hw.wiphy->bands[band];
sta->sta.supp_rates[band] =
diff --git a/net/mac80211/rate.c b/net/mac80211/rate.c
index a4e2f4e67..206698bc9 100644
--- a/net/mac80211/rate.c
+++ b/net/mac80211/rate.c
@@ -287,7 +287,7 @@ static void __rate_control_send_low(struct ieee80211_hw *hw,
u32 rate_flags =
ieee80211_chandef_rate_flags(&hw->conf.chandef);
- if ((sband->band == IEEE80211_BAND_2GHZ) &&
+ if ((sband->band == NL80211_BAND_2GHZ) &&
(info->flags & IEEE80211_TX_CTL_NO_CCK_RATE))
rate_flags |= IEEE80211_RATE_ERP_G;
diff --git a/net/mac80211/rate.h b/net/mac80211/rate.h
index 624fe5b81..8d3260785 100644
--- a/net/mac80211/rate.h
+++ b/net/mac80211/rate.h
@@ -96,9 +96,9 @@ static inline void rate_control_add_sta_debugfs(struct sta_info *sta)
{
#ifdef CONFIG_MAC80211_DEBUGFS
struct rate_control_ref *ref = sta->rate_ctrl;
- if (ref && sta->debugfs.dir && ref->ops->add_sta_debugfs)
+ if (ref && sta->debugfs_dir && ref->ops->add_sta_debugfs)
ref->ops->add_sta_debugfs(ref->priv, sta->rate_ctrl_priv,
- sta->debugfs.dir);
+ sta->debugfs_dir);
#endif
}
diff --git a/net/mac80211/rc80211_minstrel.c b/net/mac80211/rc80211_minstrel.c
index b54f398cd..14c5ba3a1 100644
--- a/net/mac80211/rc80211_minstrel.c
+++ b/net/mac80211/rc80211_minstrel.c
@@ -436,7 +436,7 @@ minstrel_get_rate(void *priv, struct ieee80211_sta *sta,
static void
-calc_rate_durations(enum ieee80211_band band,
+calc_rate_durations(enum nl80211_band band,
struct minstrel_rate *d,
struct ieee80211_rate *rate,
struct cfg80211_chan_def *chandef)
@@ -579,7 +579,7 @@ minstrel_alloc_sta(void *priv, struct ieee80211_sta *sta, gfp_t gfp)
if (!mi)
return NULL;
- for (i = 0; i < IEEE80211_NUM_BANDS; i++) {
+ for (i = 0; i < NUM_NL80211_BANDS; i++) {
sband = hw->wiphy->bands[i];
if (sband && sband->n_bitrates > max_rates)
max_rates = sband->n_bitrates;
@@ -621,7 +621,7 @@ minstrel_init_cck_rates(struct minstrel_priv *mp)
u32 rate_flags = ieee80211_chandef_rate_flags(&mp->hw->conf.chandef);
int i, j;
- sband = mp->hw->wiphy->bands[IEEE80211_BAND_2GHZ];
+ sband = mp->hw->wiphy->bands[NL80211_BAND_2GHZ];
if (!sband)
return;
diff --git a/net/mac80211/rc80211_minstrel_ht.c b/net/mac80211/rc80211_minstrel_ht.c
index 370d677b5..30fbabf4b 100644
--- a/net/mac80211/rc80211_minstrel_ht.c
+++ b/net/mac80211/rc80211_minstrel_ht.c
@@ -883,6 +883,59 @@ minstrel_ht_set_rate(struct minstrel_priv *mp, struct minstrel_ht_sta *mi,
ratetbl->rate[offset].flags = flags;
}
+static inline int
+minstrel_ht_get_prob_ewma(struct minstrel_ht_sta *mi, int rate)
+{
+ int group = rate / MCS_GROUP_RATES;
+ rate %= MCS_GROUP_RATES;
+ return mi->groups[group].rates[rate].prob_ewma;
+}
+
+static int
+minstrel_ht_get_max_amsdu_len(struct minstrel_ht_sta *mi)
+{
+ int group = mi->max_prob_rate / MCS_GROUP_RATES;
+ const struct mcs_group *g = &minstrel_mcs_groups[group];
+ int rate = mi->max_prob_rate % MCS_GROUP_RATES;
+
+ /* Disable A-MSDU if max_prob_rate is bad */
+ if (mi->groups[group].rates[rate].prob_ewma < MINSTREL_FRAC(50, 100))
+ return 1;
+
+ /* If the rate is slower than single-stream MCS1, make A-MSDU limit small */
+ if (g->duration[rate] > MCS_DURATION(1, 0, 52))
+ return 500;
+
+ /*
+ * If the rate is slower than single-stream MCS4, limit A-MSDU to usual
+ * data packet size
+ */
+ if (g->duration[rate] > MCS_DURATION(1, 0, 104))
+ return 1600;
+
+ /*
+ * If the rate is slower than single-stream MCS7, or if the max throughput
+ * rate success probability is less than 75%, limit A-MSDU to twice the usual
+ * data packet size
+ */
+ if (g->duration[rate] > MCS_DURATION(1, 0, 260) ||
+ (minstrel_ht_get_prob_ewma(mi, mi->max_tp_rate[0]) <
+ MINSTREL_FRAC(75, 100)))
+ return 3200;
+
+ /*
+ * HT A-MPDU limits maximum MPDU size under BA agreement to 4095 bytes.
+ * Since aggregation sessions are started/stopped without txq flush, use
+ * the limit here to avoid the complexity of having to de-aggregate
+ * packets in the queue.
+ */
+ if (!mi->sta->vht_cap.vht_supported)
+ return IEEE80211_MAX_MPDU_LEN_HT_BA;
+
+ /* unlimited */
+ return 0;
+}
+
static void
minstrel_ht_update_rates(struct minstrel_priv *mp, struct minstrel_ht_sta *mi)
{
@@ -907,6 +960,7 @@ minstrel_ht_update_rates(struct minstrel_priv *mp, struct minstrel_ht_sta *mi)
minstrel_ht_set_rate(mp, mi, rates, i++, mi->max_prob_rate);
}
+ mi->sta->max_rc_amsdu_len = minstrel_ht_get_max_amsdu_len(mi);
rates->rate[i].idx = -1;
rate_control_set_rates(mp->hw, mi->sta, rates);
}
@@ -924,6 +978,7 @@ minstrel_get_sample_rate(struct minstrel_priv *mp, struct minstrel_ht_sta *mi)
struct minstrel_rate_stats *mrs;
struct minstrel_mcs_group_data *mg;
unsigned int sample_dur, sample_group, cur_max_tp_streams;
+ int tp_rate1, tp_rate2;
int sample_idx = 0;
if (mi->sample_wait > 0) {
@@ -945,14 +1000,22 @@ minstrel_get_sample_rate(struct minstrel_priv *mp, struct minstrel_ht_sta *mi)
mrs = &mg->rates[sample_idx];
sample_idx += sample_group * MCS_GROUP_RATES;
+ /* Set tp_rate1, tp_rate2 to the highest / second highest max_tp_rate */
+ if (minstrel_get_duration(mi->max_tp_rate[0]) >
+ minstrel_get_duration(mi->max_tp_rate[1])) {
+ tp_rate1 = mi->max_tp_rate[1];
+ tp_rate2 = mi->max_tp_rate[0];
+ } else {
+ tp_rate1 = mi->max_tp_rate[0];
+ tp_rate2 = mi->max_tp_rate[1];
+ }
+
/*
* Sampling might add some overhead (RTS, no aggregation)
- * to the frame. Hence, don't use sampling for the currently
- * used rates.
+ * to the frame. Hence, don't use sampling for the highest currently
+ * used highest throughput or probability rate.
*/
- if (sample_idx == mi->max_tp_rate[0] ||
- sample_idx == mi->max_tp_rate[1] ||
- sample_idx == mi->max_prob_rate)
+ if (sample_idx == mi->max_tp_rate[0] || sample_idx == mi->max_prob_rate)
return -1;
/*
@@ -967,10 +1030,10 @@ minstrel_get_sample_rate(struct minstrel_priv *mp, struct minstrel_ht_sta *mi)
* if the link is working perfectly.
*/
- cur_max_tp_streams = minstrel_mcs_groups[mi->max_tp_rate[0] /
+ cur_max_tp_streams = minstrel_mcs_groups[tp_rate1 /
MCS_GROUP_RATES].streams;
sample_dur = minstrel_get_duration(sample_idx);
- if (sample_dur >= minstrel_get_duration(mi->max_tp_rate[1]) &&
+ if (sample_dur >= minstrel_get_duration(tp_rate2) &&
(cur_max_tp_streams - 1 <
minstrel_mcs_groups[sample_group].streams ||
sample_dur >= minstrel_get_duration(mi->max_prob_rate))) {
@@ -1074,7 +1137,7 @@ minstrel_ht_update_cck(struct minstrel_priv *mp, struct minstrel_ht_sta *mi,
{
int i;
- if (sband->band != IEEE80211_BAND_2GHZ)
+ if (sband->band != NL80211_BAND_2GHZ)
return;
if (!ieee80211_hw_check(mp->hw, SUPPORTS_HT_CCK_RATES))
@@ -1272,7 +1335,7 @@ minstrel_ht_alloc_sta(void *priv, struct ieee80211_sta *sta, gfp_t gfp)
int max_rates = 0;
int i;
- for (i = 0; i < IEEE80211_NUM_BANDS; i++) {
+ for (i = 0; i < NUM_NL80211_BANDS; i++) {
sband = hw->wiphy->bands[i];
if (sband && sband->n_bitrates > max_rates)
max_rates = sband->n_bitrates;
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index dc27becb9..5e65e8389 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -322,7 +322,7 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local,
else if (status->flag & RX_FLAG_5MHZ)
channel_flags |= IEEE80211_CHAN_QUARTER;
- if (status->band == IEEE80211_BAND_5GHZ)
+ if (status->band == NL80211_BAND_5GHZ)
channel_flags |= IEEE80211_CHAN_OFDM | IEEE80211_CHAN_5GHZ;
else if (status->flag & (RX_FLAG_HT | RX_FLAG_VHT))
channel_flags |= IEEE80211_CHAN_DYN | IEEE80211_CHAN_2GHZ;
@@ -722,8 +722,8 @@ static int ieee80211_get_mmie_keyidx(struct sk_buff *skb)
return -1;
}
-static int iwl80211_get_cs_keyid(const struct ieee80211_cipher_scheme *cs,
- struct sk_buff *skb)
+static int ieee80211_get_cs_keyid(const struct ieee80211_cipher_scheme *cs,
+ struct sk_buff *skb)
{
struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data;
__le16 fc;
@@ -1319,13 +1319,52 @@ int ieee80211_sta_ps_transition(struct ieee80211_sta *pubsta, bool start)
}
EXPORT_SYMBOL(ieee80211_sta_ps_transition);
+void ieee80211_sta_pspoll(struct ieee80211_sta *pubsta)
+{
+ struct sta_info *sta = container_of(pubsta, struct sta_info, sta);
+
+ if (test_sta_flag(sta, WLAN_STA_SP))
+ return;
+
+ if (!test_sta_flag(sta, WLAN_STA_PS_DRIVER))
+ ieee80211_sta_ps_deliver_poll_response(sta);
+ else
+ set_sta_flag(sta, WLAN_STA_PSPOLL);
+}
+EXPORT_SYMBOL(ieee80211_sta_pspoll);
+
+void ieee80211_sta_uapsd_trigger(struct ieee80211_sta *pubsta, u8 tid)
+{
+ struct sta_info *sta = container_of(pubsta, struct sta_info, sta);
+ u8 ac = ieee802_1d_to_ac[tid & 7];
+
+ /*
+ * If this AC is not trigger-enabled do nothing.
+ *
+ * NB: This could/should check a separate bitmap of trigger-
+ * enabled queues, but for now we only implement uAPSD w/o
+ * TSPEC changes to the ACs, so they're always the same.
+ */
+ if (!(sta->sta.uapsd_queues & BIT(ac)))
+ return;
+
+ /* if we are in a service period, do nothing */
+ if (test_sta_flag(sta, WLAN_STA_SP))
+ return;
+
+ if (!test_sta_flag(sta, WLAN_STA_PS_DRIVER))
+ ieee80211_sta_ps_deliver_uapsd(sta);
+ else
+ set_sta_flag(sta, WLAN_STA_UAPSD);
+}
+EXPORT_SYMBOL(ieee80211_sta_uapsd_trigger);
+
static ieee80211_rx_result debug_noinline
ieee80211_rx_h_uapsd_and_pspoll(struct ieee80211_rx_data *rx)
{
struct ieee80211_sub_if_data *sdata = rx->sdata;
struct ieee80211_hdr *hdr = (void *)rx->skb->data;
struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb);
- int tid, ac;
if (!rx->sta)
return RX_CONTINUE;
@@ -1351,12 +1390,7 @@ ieee80211_rx_h_uapsd_and_pspoll(struct ieee80211_rx_data *rx)
return RX_CONTINUE;
if (unlikely(ieee80211_is_pspoll(hdr->frame_control))) {
- if (!test_sta_flag(rx->sta, WLAN_STA_SP)) {
- if (!test_sta_flag(rx->sta, WLAN_STA_PS_DRIVER))
- ieee80211_sta_ps_deliver_poll_response(rx->sta);
- else
- set_sta_flag(rx->sta, WLAN_STA_PSPOLL);
- }
+ ieee80211_sta_pspoll(&rx->sta->sta);
/* Free PS Poll skb here instead of returning RX_DROP that would
* count as an dropped frame. */
@@ -1368,27 +1402,11 @@ ieee80211_rx_h_uapsd_and_pspoll(struct ieee80211_rx_data *rx)
ieee80211_has_pm(hdr->frame_control) &&
(ieee80211_is_data_qos(hdr->frame_control) ||
ieee80211_is_qos_nullfunc(hdr->frame_control))) {
- tid = *ieee80211_get_qos_ctl(hdr) & IEEE80211_QOS_CTL_TID_MASK;
- ac = ieee802_1d_to_ac[tid & 7];
+ u8 tid;
- /*
- * If this AC is not trigger-enabled do nothing.
- *
- * NB: This could/should check a separate bitmap of trigger-
- * enabled queues, but for now we only implement uAPSD w/o
- * TSPEC changes to the ACs, so they're always the same.
- */
- if (!(rx->sta->sta.uapsd_queues & BIT(ac)))
- return RX_CONTINUE;
-
- /* if we are in a service period, do nothing */
- if (test_sta_flag(rx->sta, WLAN_STA_SP))
- return RX_CONTINUE;
+ tid = *ieee80211_get_qos_ctl(hdr) & IEEE80211_QOS_CTL_TID_MASK;
- if (!test_sta_flag(rx->sta, WLAN_STA_PS_DRIVER))
- ieee80211_sta_ps_deliver_uapsd(rx->sta);
- else
- set_sta_flag(rx->sta, WLAN_STA_UAPSD);
+ ieee80211_sta_uapsd_trigger(&rx->sta->sta, tid);
}
return RX_CONTINUE;
@@ -1421,16 +1439,9 @@ ieee80211_rx_h_sta_process(struct ieee80211_rx_data *rx)
test_sta_flag(sta, WLAN_STA_AUTHORIZED)) {
sta->rx_stats.last_rx = jiffies;
if (ieee80211_is_data(hdr->frame_control) &&
- !is_multicast_ether_addr(hdr->addr1)) {
- sta->rx_stats.last_rate_idx =
- status->rate_idx;
- sta->rx_stats.last_rate_flag =
- status->flag;
- sta->rx_stats.last_rate_vht_flag =
- status->vht_flag;
- sta->rx_stats.last_rate_vht_nss =
- status->vht_nss;
- }
+ !is_multicast_ether_addr(hdr->addr1))
+ sta->rx_stats.last_rate =
+ sta_stats_encode_rate(status);
}
} else if (rx->sdata->vif.type == NL80211_IFTYPE_OCB) {
sta->rx_stats.last_rx = jiffies;
@@ -1440,22 +1451,22 @@ ieee80211_rx_h_sta_process(struct ieee80211_rx_data *rx)
* match the current local configuration when processed.
*/
sta->rx_stats.last_rx = jiffies;
- if (ieee80211_is_data(hdr->frame_control)) {
- sta->rx_stats.last_rate_idx = status->rate_idx;
- sta->rx_stats.last_rate_flag = status->flag;
- sta->rx_stats.last_rate_vht_flag = status->vht_flag;
- sta->rx_stats.last_rate_vht_nss = status->vht_nss;
- }
+ if (ieee80211_is_data(hdr->frame_control))
+ sta->rx_stats.last_rate = sta_stats_encode_rate(status);
}
if (rx->sdata->vif.type == NL80211_IFTYPE_STATION)
ieee80211_sta_rx_notify(rx->sdata, hdr);
sta->rx_stats.fragments++;
+
+ u64_stats_update_begin(&rx->sta->rx_stats.syncp);
sta->rx_stats.bytes += rx->skb->len;
+ u64_stats_update_end(&rx->sta->rx_stats.syncp);
+
if (!(status->flag & RX_FLAG_NO_SIGNAL_VAL)) {
sta->rx_stats.last_signal = status->signal;
- ewma_signal_add(&sta->rx_stats.avg_signal, -status->signal);
+ ewma_signal_add(&sta->rx_stats_avg.signal, -status->signal);
}
if (status->chains) {
@@ -1467,7 +1478,7 @@ ieee80211_rx_h_sta_process(struct ieee80211_rx_data *rx)
continue;
sta->rx_stats.chain_signal_last[i] = signal;
- ewma_signal_add(&sta->rx_stats.chain_signal_avg[i],
+ ewma_signal_add(&sta->rx_stats_avg.chain_signal[i],
-signal);
}
}
@@ -1586,7 +1597,7 @@ ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx)
if (ieee80211_has_protected(fc) && rx->sta->cipher_scheme) {
cs = rx->sta->cipher_scheme;
- keyid = iwl80211_get_cs_keyid(cs, rx->skb);
+ keyid = ieee80211_get_cs_keyid(cs, rx->skb);
if (unlikely(keyid < 0))
return RX_DROP_UNUSABLE;
}
@@ -1670,7 +1681,7 @@ ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx)
hdrlen = ieee80211_hdrlen(fc);
if (cs) {
- keyidx = iwl80211_get_cs_keyid(cs, rx->skb);
+ keyidx = ieee80211_get_cs_keyid(cs, rx->skb);
if (unlikely(keyidx < 0))
return RX_DROP_UNUSABLE;
@@ -2129,6 +2140,17 @@ ieee80211_deliver_skb(struct ieee80211_rx_data *rx)
ieee80211_rx_stats(dev, skb->len);
+ if (rx->sta) {
+ /* The seqno index has the same property as needed
+ * for the rx_msdu field, i.e. it is IEEE80211_NUM_TIDS
+ * for non-QoS-data frames. Here we know it's a data
+ * frame, so count MSDUs.
+ */
+ u64_stats_update_begin(&rx->sta->rx_stats.syncp);
+ rx->sta->rx_stats.msdu[rx->seqno_idx]++;
+ u64_stats_update_end(&rx->sta->rx_stats.syncp);
+ }
+
if ((sdata->vif.type == NL80211_IFTYPE_AP ||
sdata->vif.type == NL80211_IFTYPE_AP_VLAN) &&
!(sdata->flags & IEEE80211_SDATA_DONT_BRIDGE_PACKETS) &&
@@ -2415,15 +2437,6 @@ ieee80211_rx_h_data(struct ieee80211_rx_data *rx)
if (unlikely(!ieee80211_is_data_present(hdr->frame_control)))
return RX_DROP_MONITOR;
- if (rx->sta) {
- /* The seqno index has the same property as needed
- * for the rx_msdu field, i.e. it is IEEE80211_NUM_TIDS
- * for non-QoS-data frames. Here we know it's a data
- * frame, so count MSDUs.
- */
- rx->sta->rx_stats.msdu[rx->seqno_idx]++;
- }
-
/*
* Send unexpected-4addr-frame event to hostapd. For older versions,
* also drop the frame to cooked monitor interfaces.
@@ -2474,14 +2487,14 @@ ieee80211_rx_h_data(struct ieee80211_rx_data *rx)
rx->skb->dev = dev;
- if (local->ps_sdata && local->hw.conf.dynamic_ps_timeout > 0 &&
+ if (!ieee80211_hw_check(&local->hw, SUPPORTS_DYNAMIC_PS) &&
+ local->ps_sdata && local->hw.conf.dynamic_ps_timeout > 0 &&
!is_multicast_ether_addr(
((struct ethhdr *)rx->skb->data)->h_dest) &&
(!local->scanning &&
- !test_bit(SDATA_STATE_OFFCHANNEL, &sdata->state))) {
- mod_timer(&local->dynamic_ps_timer, jiffies +
- msecs_to_jiffies(local->hw.conf.dynamic_ps_timeout));
- }
+ !test_bit(SDATA_STATE_OFFCHANNEL, &sdata->state)))
+ mod_timer(&local->dynamic_ps_timer, jiffies +
+ msecs_to_jiffies(local->hw.conf.dynamic_ps_timeout));
ieee80211_deliver_skb(rx);
@@ -2828,7 +2841,7 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx)
switch (mgmt->u.action.u.measurement.action_code) {
case WLAN_ACTION_SPCT_MSR_REQ:
- if (status->band != IEEE80211_BAND_5GHZ)
+ if (status->band != NL80211_BAND_5GHZ)
break;
if (len < (IEEE80211_MIN_ACTION_SIZE +
@@ -3201,7 +3214,7 @@ static void ieee80211_rx_handlers(struct ieee80211_rx_data *rx,
res = rxh(rx); \
if (res != RX_CONTINUE) \
goto rxh_next; \
- } while (0);
+ } while (0)
/* Lock here to avoid hitting all of the data used in the RX
* path (e.g. key data, station data, ...) concurrently when
@@ -3219,30 +3232,30 @@ static void ieee80211_rx_handlers(struct ieee80211_rx_data *rx,
*/
rx->skb = skb;
- CALL_RXH(ieee80211_rx_h_check_more_data)
- CALL_RXH(ieee80211_rx_h_uapsd_and_pspoll)
- CALL_RXH(ieee80211_rx_h_sta_process)
- CALL_RXH(ieee80211_rx_h_decrypt)
- CALL_RXH(ieee80211_rx_h_defragment)
- CALL_RXH(ieee80211_rx_h_michael_mic_verify)
+ CALL_RXH(ieee80211_rx_h_check_more_data);
+ CALL_RXH(ieee80211_rx_h_uapsd_and_pspoll);
+ CALL_RXH(ieee80211_rx_h_sta_process);
+ CALL_RXH(ieee80211_rx_h_decrypt);
+ CALL_RXH(ieee80211_rx_h_defragment);
+ CALL_RXH(ieee80211_rx_h_michael_mic_verify);
/* must be after MMIC verify so header is counted in MPDU mic */
#ifdef CONFIG_MAC80211_MESH
if (ieee80211_vif_is_mesh(&rx->sdata->vif))
CALL_RXH(ieee80211_rx_h_mesh_fwding);
#endif
- CALL_RXH(ieee80211_rx_h_amsdu)
- CALL_RXH(ieee80211_rx_h_data)
+ CALL_RXH(ieee80211_rx_h_amsdu);
+ CALL_RXH(ieee80211_rx_h_data);
/* special treatment -- needs the queue */
res = ieee80211_rx_h_ctrl(rx, frames);
if (res != RX_CONTINUE)
goto rxh_next;
- CALL_RXH(ieee80211_rx_h_mgmt_check)
- CALL_RXH(ieee80211_rx_h_action)
- CALL_RXH(ieee80211_rx_h_userspace_mgmt)
- CALL_RXH(ieee80211_rx_h_action_return)
- CALL_RXH(ieee80211_rx_h_mgmt)
+ CALL_RXH(ieee80211_rx_h_mgmt_check);
+ CALL_RXH(ieee80211_rx_h_action);
+ CALL_RXH(ieee80211_rx_h_userspace_mgmt);
+ CALL_RXH(ieee80211_rx_h_action_return);
+ CALL_RXH(ieee80211_rx_h_mgmt);
rxh_next:
ieee80211_rx_handlers_result(rx, res);
@@ -3265,10 +3278,10 @@ static void ieee80211_invoke_rx_handlers(struct ieee80211_rx_data *rx)
res = rxh(rx); \
if (res != RX_CONTINUE) \
goto rxh_next; \
- } while (0);
+ } while (0)
- CALL_RXH(ieee80211_rx_h_check_dup)
- CALL_RXH(ieee80211_rx_h_check)
+ CALL_RXH(ieee80211_rx_h_check_dup);
+ CALL_RXH(ieee80211_rx_h_check);
ieee80211_rx_reorder_ampdu(rx, &reorder_release);
@@ -3513,6 +3526,351 @@ static bool ieee80211_accept_frame(struct ieee80211_rx_data *rx)
return false;
}
+void ieee80211_check_fast_rx(struct sta_info *sta)
+{
+ struct ieee80211_sub_if_data *sdata = sta->sdata;
+ struct ieee80211_local *local = sdata->local;
+ struct ieee80211_key *key;
+ struct ieee80211_fast_rx fastrx = {
+ .dev = sdata->dev,
+ .vif_type = sdata->vif.type,
+ .control_port_protocol = sdata->control_port_protocol,
+ }, *old, *new = NULL;
+ bool assign = false;
+
+ /* use sparse to check that we don't return without updating */
+ __acquire(check_fast_rx);
+
+ BUILD_BUG_ON(sizeof(fastrx.rfc1042_hdr) != sizeof(rfc1042_header));
+ BUILD_BUG_ON(sizeof(fastrx.rfc1042_hdr) != ETH_ALEN);
+ ether_addr_copy(fastrx.rfc1042_hdr, rfc1042_header);
+ ether_addr_copy(fastrx.vif_addr, sdata->vif.addr);
+
+ fastrx.uses_rss = ieee80211_hw_check(&local->hw, USES_RSS);
+
+ /* fast-rx doesn't do reordering */
+ if (ieee80211_hw_check(&local->hw, AMPDU_AGGREGATION) &&
+ !ieee80211_hw_check(&local->hw, SUPPORTS_REORDERING_BUFFER))
+ goto clear;
+
+ switch (sdata->vif.type) {
+ case NL80211_IFTYPE_STATION:
+ /* 4-addr is harder to deal with, later maybe */
+ if (sdata->u.mgd.use_4addr)
+ goto clear;
+ /* software powersave is a huge mess, avoid all of it */
+ if (ieee80211_hw_check(&local->hw, PS_NULLFUNC_STACK))
+ goto clear;
+ if (ieee80211_hw_check(&local->hw, SUPPORTS_PS) &&
+ !ieee80211_hw_check(&local->hw, SUPPORTS_DYNAMIC_PS))
+ goto clear;
+ if (sta->sta.tdls) {
+ fastrx.da_offs = offsetof(struct ieee80211_hdr, addr1);
+ fastrx.sa_offs = offsetof(struct ieee80211_hdr, addr2);
+ fastrx.expected_ds_bits = 0;
+ } else {
+ fastrx.sta_notify = sdata->u.mgd.probe_send_count > 0;
+ fastrx.da_offs = offsetof(struct ieee80211_hdr, addr1);
+ fastrx.sa_offs = offsetof(struct ieee80211_hdr, addr3);
+ fastrx.expected_ds_bits =
+ cpu_to_le16(IEEE80211_FCTL_FROMDS);
+ }
+ break;
+ case NL80211_IFTYPE_AP_VLAN:
+ case NL80211_IFTYPE_AP:
+ /* parallel-rx requires this, at least with calls to
+ * ieee80211_sta_ps_transition()
+ */
+ if (!ieee80211_hw_check(&local->hw, AP_LINK_PS))
+ goto clear;
+ fastrx.da_offs = offsetof(struct ieee80211_hdr, addr3);
+ fastrx.sa_offs = offsetof(struct ieee80211_hdr, addr2);
+ fastrx.expected_ds_bits = cpu_to_le16(IEEE80211_FCTL_TODS);
+
+ fastrx.internal_forward =
+ !(sdata->flags & IEEE80211_SDATA_DONT_BRIDGE_PACKETS) &&
+ (sdata->vif.type != NL80211_IFTYPE_AP_VLAN ||
+ !sdata->u.vlan.sta);
+ break;
+ default:
+ goto clear;
+ }
+
+ if (!test_sta_flag(sta, WLAN_STA_AUTHORIZED))
+ goto clear;
+
+ rcu_read_lock();
+ key = rcu_dereference(sta->ptk[sta->ptk_idx]);
+ if (key) {
+ switch (key->conf.cipher) {
+ case WLAN_CIPHER_SUITE_TKIP:
+ /* we don't want to deal with MMIC in fast-rx */
+ goto clear_rcu;
+ case WLAN_CIPHER_SUITE_CCMP:
+ case WLAN_CIPHER_SUITE_CCMP_256:
+ case WLAN_CIPHER_SUITE_GCMP:
+ case WLAN_CIPHER_SUITE_GCMP_256:
+ break;
+ default:
+ /* we also don't want to deal with WEP or cipher scheme
+ * since those require looking up the key idx in the
+ * frame, rather than assuming the PTK is used
+ * (we need to revisit this once we implement the real
+ * PTK index, which is now valid in the spec, but we
+ * haven't implemented that part yet)
+ */
+ goto clear_rcu;
+ }
+
+ fastrx.key = true;
+ fastrx.icv_len = key->conf.icv_len;
+ }
+
+ assign = true;
+ clear_rcu:
+ rcu_read_unlock();
+ clear:
+ __release(check_fast_rx);
+
+ if (assign)
+ new = kmemdup(&fastrx, sizeof(fastrx), GFP_KERNEL);
+
+ spin_lock_bh(&sta->lock);
+ old = rcu_dereference_protected(sta->fast_rx, true);
+ rcu_assign_pointer(sta->fast_rx, new);
+ spin_unlock_bh(&sta->lock);
+
+ if (old)
+ kfree_rcu(old, rcu_head);
+}
+
+void ieee80211_clear_fast_rx(struct sta_info *sta)
+{
+ struct ieee80211_fast_rx *old;
+
+ spin_lock_bh(&sta->lock);
+ old = rcu_dereference_protected(sta->fast_rx, true);
+ RCU_INIT_POINTER(sta->fast_rx, NULL);
+ spin_unlock_bh(&sta->lock);
+
+ if (old)
+ kfree_rcu(old, rcu_head);
+}
+
+void __ieee80211_check_fast_rx_iface(struct ieee80211_sub_if_data *sdata)
+{
+ struct ieee80211_local *local = sdata->local;
+ struct sta_info *sta;
+
+ lockdep_assert_held(&local->sta_mtx);
+
+ list_for_each_entry_rcu(sta, &local->sta_list, list) {
+ if (sdata != sta->sdata &&
+ (!sta->sdata->bss || sta->sdata->bss != sdata->bss))
+ continue;
+ ieee80211_check_fast_rx(sta);
+ }
+}
+
+void ieee80211_check_fast_rx_iface(struct ieee80211_sub_if_data *sdata)
+{
+ struct ieee80211_local *local = sdata->local;
+
+ mutex_lock(&local->sta_mtx);
+ __ieee80211_check_fast_rx_iface(sdata);
+ mutex_unlock(&local->sta_mtx);
+}
+
+static bool ieee80211_invoke_fast_rx(struct ieee80211_rx_data *rx,
+ struct ieee80211_fast_rx *fast_rx)
+{
+ struct sk_buff *skb = rx->skb;
+ struct ieee80211_hdr *hdr = (void *)skb->data;
+ struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
+ struct sta_info *sta = rx->sta;
+ int orig_len = skb->len;
+ int snap_offs = ieee80211_hdrlen(hdr->frame_control);
+ struct {
+ u8 snap[sizeof(rfc1042_header)];
+ __be16 proto;
+ } *payload __aligned(2);
+ struct {
+ u8 da[ETH_ALEN];
+ u8 sa[ETH_ALEN];
+ } addrs __aligned(2);
+ struct ieee80211_sta_rx_stats *stats = &sta->rx_stats;
+
+ if (fast_rx->uses_rss)
+ stats = this_cpu_ptr(sta->pcpu_rx_stats);
+
+ /* for parallel-rx, we need to have DUP_VALIDATED, otherwise we write
+ * to a common data structure; drivers can implement that per queue
+ * but we don't have that information in mac80211
+ */
+ if (!(status->flag & RX_FLAG_DUP_VALIDATED))
+ return false;
+
+#define FAST_RX_CRYPT_FLAGS (RX_FLAG_PN_VALIDATED | RX_FLAG_DECRYPTED)
+
+ /* If using encryption, we also need to have:
+ * - PN_VALIDATED: similar, but the implementation is tricky
+ * - DECRYPTED: necessary for PN_VALIDATED
+ */
+ if (fast_rx->key &&
+ (status->flag & FAST_RX_CRYPT_FLAGS) != FAST_RX_CRYPT_FLAGS)
+ return false;
+
+ /* we don't deal with A-MSDU deaggregation here */
+ if (status->rx_flags & IEEE80211_RX_AMSDU)
+ return false;
+
+ if (unlikely(!ieee80211_is_data_present(hdr->frame_control)))
+ return false;
+
+ if (unlikely(ieee80211_is_frag(hdr)))
+ return false;
+
+ /* Since our interface address cannot be multicast, this
+ * implicitly also rejects multicast frames without the
+ * explicit check.
+ *
+ * We shouldn't get any *data* frames not addressed to us
+ * (AP mode will accept multicast *management* frames), but
+ * punting here will make it go through the full checks in
+ * ieee80211_accept_frame().
+ */
+ if (!ether_addr_equal(fast_rx->vif_addr, hdr->addr1))
+ return false;
+
+ if ((hdr->frame_control & cpu_to_le16(IEEE80211_FCTL_FROMDS |
+ IEEE80211_FCTL_TODS)) !=
+ fast_rx->expected_ds_bits)
+ goto drop;
+
+ /* assign the key to drop unencrypted frames (later)
+ * and strip the IV/MIC if necessary
+ */
+ if (fast_rx->key && !(status->flag & RX_FLAG_IV_STRIPPED)) {
+ /* GCMP header length is the same */
+ snap_offs += IEEE80211_CCMP_HDR_LEN;
+ }
+
+ if (!pskb_may_pull(skb, snap_offs + sizeof(*payload)))
+ goto drop;
+ payload = (void *)(skb->data + snap_offs);
+
+ if (!ether_addr_equal(payload->snap, fast_rx->rfc1042_hdr))
+ return false;
+
+ /* Don't handle these here since they require special code.
+ * Accept AARP and IPX even though they should come with a
+ * bridge-tunnel header - but if we get them this way then
+ * there's little point in discarding them.
+ */
+ if (unlikely(payload->proto == cpu_to_be16(ETH_P_TDLS) ||
+ payload->proto == fast_rx->control_port_protocol))
+ return false;
+
+ /* after this point, don't punt to the slowpath! */
+
+ if (rx->key && !(status->flag & RX_FLAG_MIC_STRIPPED) &&
+ pskb_trim(skb, skb->len - fast_rx->icv_len))
+ goto drop;
+
+ if (unlikely(fast_rx->sta_notify)) {
+ ieee80211_sta_rx_notify(rx->sdata, hdr);
+ fast_rx->sta_notify = false;
+ }
+
+ /* statistics part of ieee80211_rx_h_sta_process() */
+ stats->last_rx = jiffies;
+ stats->last_rate = sta_stats_encode_rate(status);
+
+ stats->fragments++;
+
+ if (!(status->flag & RX_FLAG_NO_SIGNAL_VAL)) {
+ stats->last_signal = status->signal;
+ if (!fast_rx->uses_rss)
+ ewma_signal_add(&sta->rx_stats_avg.signal,
+ -status->signal);
+ }
+
+ if (status->chains) {
+ int i;
+
+ stats->chains = status->chains;
+ for (i = 0; i < ARRAY_SIZE(status->chain_signal); i++) {
+ int signal = status->chain_signal[i];
+
+ if (!(status->chains & BIT(i)))
+ continue;
+
+ stats->chain_signal_last[i] = signal;
+ if (!fast_rx->uses_rss)
+ ewma_signal_add(&sta->rx_stats_avg.chain_signal[i],
+ -signal);
+ }
+ }
+ /* end of statistics */
+
+ if (rx->key && !ieee80211_has_protected(hdr->frame_control))
+ goto drop;
+
+ /* do the header conversion - first grab the addresses */
+ ether_addr_copy(addrs.da, skb->data + fast_rx->da_offs);
+ ether_addr_copy(addrs.sa, skb->data + fast_rx->sa_offs);
+ /* remove the SNAP but leave the ethertype */
+ skb_pull(skb, snap_offs + sizeof(rfc1042_header));
+ /* push the addresses in front */
+ memcpy(skb_push(skb, sizeof(addrs)), &addrs, sizeof(addrs));
+
+ skb->dev = fast_rx->dev;
+
+ ieee80211_rx_stats(fast_rx->dev, skb->len);
+
+ /* The seqno index has the same property as needed
+ * for the rx_msdu field, i.e. it is IEEE80211_NUM_TIDS
+ * for non-QoS-data frames. Here we know it's a data
+ * frame, so count MSDUs.
+ */
+ u64_stats_update_begin(&stats->syncp);
+ stats->msdu[rx->seqno_idx]++;
+ stats->bytes += orig_len;
+ u64_stats_update_end(&stats->syncp);
+
+ if (fast_rx->internal_forward) {
+ struct sta_info *dsta = sta_info_get(rx->sdata, skb->data);
+
+ if (dsta) {
+ /*
+ * Send to wireless media and increase priority by 256
+ * to keep the received priority instead of
+ * reclassifying the frame (see cfg80211_classify8021d).
+ */
+ skb->priority += 256;
+ skb->protocol = htons(ETH_P_802_3);
+ skb_reset_network_header(skb);
+ skb_reset_mac_header(skb);
+ dev_queue_xmit(skb);
+ return true;
+ }
+ }
+
+ /* deliver to local stack */
+ skb->protocol = eth_type_trans(skb, fast_rx->dev);
+ memset(skb->cb, 0, sizeof(skb->cb));
+ if (rx->napi)
+ napi_gro_receive(rx->napi, skb);
+ else
+ netif_receive_skb(skb);
+
+ return true;
+ drop:
+ dev_kfree_skb(skb);
+ stats->dropped++;
+ return true;
+}
+
/*
* This function returns whether or not the SKB
* was destined for RX processing or not, which,
@@ -3527,6 +3885,21 @@ static bool ieee80211_prepare_and_rx_handle(struct ieee80211_rx_data *rx,
rx->skb = skb;
+ /* See if we can do fast-rx; if we have to copy we already lost,
+ * so punt in that case. We should never have to deliver a data
+ * frame to multiple interfaces anyway.
+ *
+ * We skip the ieee80211_accept_frame() call and do the necessary
+ * checking inside ieee80211_invoke_fast_rx().
+ */
+ if (consume && rx->sta) {
+ struct ieee80211_fast_rx *fast_rx;
+
+ fast_rx = rcu_dereference(rx->sta->fast_rx);
+ if (fast_rx && ieee80211_invoke_fast_rx(rx, fast_rx))
+ return true;
+ }
+
if (!ieee80211_accept_frame(rx))
return false;
@@ -3552,6 +3925,7 @@ static bool ieee80211_prepare_and_rx_handle(struct ieee80211_rx_data *rx,
* be called with rcu_read_lock protection.
*/
static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw,
+ struct ieee80211_sta *pubsta,
struct sk_buff *skb,
struct napi_struct *napi)
{
@@ -3561,7 +3935,6 @@ static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw,
__le16 fc;
struct ieee80211_rx_data rx;
struct ieee80211_sub_if_data *prev;
- struct sta_info *sta, *prev_sta;
struct rhash_head *tmp;
int err = 0;
@@ -3597,7 +3970,14 @@ static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw,
ieee80211_is_beacon(hdr->frame_control)))
ieee80211_scan_rx(local, skb);
- if (ieee80211_is_data(fc)) {
+ if (pubsta) {
+ rx.sta = container_of(pubsta, struct sta_info, sta);
+ rx.sdata = rx.sta->sdata;
+ if (ieee80211_prepare_and_rx_handle(&rx, skb, true))
+ return;
+ goto out;
+ } else if (ieee80211_is_data(fc)) {
+ struct sta_info *sta, *prev_sta;
const struct bucket_table *tbl;
prev_sta = NULL;
@@ -3671,8 +4051,8 @@ static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw,
* This is the receive path handler. It is called by a low level driver when an
* 802.11 MPDU is received from the hardware.
*/
-void ieee80211_rx_napi(struct ieee80211_hw *hw, struct sk_buff *skb,
- struct napi_struct *napi)
+void ieee80211_rx_napi(struct ieee80211_hw *hw, struct ieee80211_sta *pubsta,
+ struct sk_buff *skb, struct napi_struct *napi)
{
struct ieee80211_local *local = hw_to_local(hw);
struct ieee80211_rate *rate = NULL;
@@ -3681,7 +4061,7 @@ void ieee80211_rx_napi(struct ieee80211_hw *hw, struct sk_buff *skb,
WARN_ON_ONCE(softirq_count() == 0);
- if (WARN_ON(status->band >= IEEE80211_NUM_BANDS))
+ if (WARN_ON(status->band >= NUM_NL80211_BANDS))
goto drop;
sband = local->hw.wiphy->bands[status->band];
@@ -3771,7 +4151,8 @@ void ieee80211_rx_napi(struct ieee80211_hw *hw, struct sk_buff *skb,
ieee80211_tpt_led_trig_rx(local,
((struct ieee80211_hdr *)skb->data)->frame_control,
skb->len);
- __ieee80211_rx_handle_packet(hw, skb, napi);
+
+ __ieee80211_rx_handle_packet(hw, pubsta, skb, napi);
rcu_read_unlock();
diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c
index ae980ce8d..f9648ef9e 100644
--- a/net/mac80211/scan.c
+++ b/net/mac80211/scan.c
@@ -66,7 +66,9 @@ ieee80211_bss_info_update(struct ieee80211_local *local,
struct cfg80211_bss *cbss;
struct ieee80211_bss *bss;
int clen, srlen;
- struct cfg80211_inform_bss bss_meta = {};
+ struct cfg80211_inform_bss bss_meta = {
+ .boottime_ns = rx_status->boottime_ns,
+ };
bool signal_valid;
if (ieee80211_hw_check(&local->hw, SIGNAL_DBM))
@@ -270,7 +272,7 @@ static bool ieee80211_prep_hw_scan(struct ieee80211_local *local)
n_chans = req->n_channels;
} else {
do {
- if (local->hw_scan_band == IEEE80211_NUM_BANDS)
+ if (local->hw_scan_band == NUM_NL80211_BANDS)
return false;
n_chans = 0;
@@ -303,6 +305,7 @@ static bool ieee80211_prep_hw_scan(struct ieee80211_local *local)
ether_addr_copy(local->hw_scan_req->req.mac_addr, req->mac_addr);
ether_addr_copy(local->hw_scan_req->req.mac_addr_mask,
req->mac_addr_mask);
+ ether_addr_copy(local->hw_scan_req->req.bssid, req->bssid);
return true;
}
@@ -482,7 +485,7 @@ static void ieee80211_scan_state_send_probe(struct ieee80211_local *local,
int i;
struct ieee80211_sub_if_data *sdata;
struct cfg80211_scan_request *scan_req;
- enum ieee80211_band band = local->hw.conf.chandef.chan->band;
+ enum nl80211_band band = local->hw.conf.chandef.chan->band;
u32 tx_flags;
scan_req = rcu_dereference_protected(local->scan_req,
@@ -497,7 +500,7 @@ static void ieee80211_scan_state_send_probe(struct ieee80211_local *local,
for (i = 0; i < scan_req->n_ssids; i++)
ieee80211_send_probe_req(
- sdata, local->scan_addr, NULL,
+ sdata, local->scan_addr, scan_req->bssid,
scan_req->ssids[i].ssid, scan_req->ssids[i].ssid_len,
scan_req->ie, scan_req->ie_len,
scan_req->rates[band], false,
@@ -562,6 +565,7 @@ static int __ieee80211_start_scan(struct ieee80211_sub_if_data *sdata,
req->n_channels * sizeof(req->channels[0]);
local->hw_scan_req->req.ie = ies;
local->hw_scan_req->req.flags = req->flags;
+ eth_broadcast_addr(local->hw_scan_req->req.bssid);
local->hw_scan_band = 0;
@@ -949,7 +953,7 @@ int ieee80211_request_ibss_scan(struct ieee80211_sub_if_data *sdata,
{
struct ieee80211_local *local = sdata->local;
int ret = -EBUSY, i, n_ch = 0;
- enum ieee80211_band band;
+ enum nl80211_band band;
mutex_lock(&local->mtx);
@@ -961,7 +965,7 @@ int ieee80211_request_ibss_scan(struct ieee80211_sub_if_data *sdata,
if (!channels) {
int max_n;
- for (band = 0; band < IEEE80211_NUM_BANDS; band++) {
+ for (band = 0; band < NUM_NL80211_BANDS; band++) {
if (!local->hw.wiphy->bands[band])
continue;
@@ -1081,7 +1085,7 @@ int __ieee80211_request_sched_scan_start(struct ieee80211_sub_if_data *sdata,
struct ieee80211_scan_ies sched_scan_ies = {};
struct cfg80211_chan_def chandef;
int ret, i, iebufsz, num_bands = 0;
- u32 rate_masks[IEEE80211_NUM_BANDS] = {};
+ u32 rate_masks[NUM_NL80211_BANDS] = {};
u8 bands_used = 0;
u8 *ie;
size_t len;
@@ -1093,7 +1097,7 @@ int __ieee80211_request_sched_scan_start(struct ieee80211_sub_if_data *sdata,
if (!local->ops->sched_scan_start)
return -ENOTSUPP;
- for (i = 0; i < IEEE80211_NUM_BANDS; i++) {
+ for (i = 0; i < NUM_NL80211_BANDS; i++) {
if (local->hw.wiphy->bands[i]) {
bands_used |= BIT(i);
rate_masks[i] = (u32) -1;
diff --git a/net/mac80211/spectmgmt.c b/net/mac80211/spectmgmt.c
index 06e6ac8cc..2ddc661f0 100644
--- a/net/mac80211/spectmgmt.c
+++ b/net/mac80211/spectmgmt.c
@@ -23,11 +23,11 @@
int ieee80211_parse_ch_switch_ie(struct ieee80211_sub_if_data *sdata,
struct ieee802_11_elems *elems,
- enum ieee80211_band current_band,
+ enum nl80211_band current_band,
u32 sta_flags, u8 *bssid,
struct ieee80211_csa_ie *csa_ie)
{
- enum ieee80211_band new_band;
+ enum nl80211_band new_band;
int new_freq;
u8 new_chan_no;
struct ieee80211_channel *new_chan;
diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c
index 861b93ffb..5ccfdbd40 100644
--- a/net/mac80211/sta_info.c
+++ b/net/mac80211/sta_info.c
@@ -2,7 +2,7 @@
* Copyright 2002-2005, Instant802 Networks, Inc.
* Copyright 2006-2007 Jiri Benc <jbenc@suse.cz>
* Copyright 2013-2014 Intel Mobile Communications GmbH
- * Copyright (C) 2015 Intel Deutschland GmbH
+ * Copyright (C) 2015 - 2016 Intel Deutschland GmbH
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -255,6 +255,7 @@ void sta_info_free(struct ieee80211_local *local, struct sta_info *sta)
#ifdef CONFIG_MAC80211_MESH
kfree(sta->mesh);
#endif
+ free_percpu(sta->pcpu_rx_stats);
kfree(sta);
}
@@ -312,6 +313,13 @@ struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata,
if (!sta)
return NULL;
+ if (ieee80211_hw_check(hw, USES_RSS)) {
+ sta->pcpu_rx_stats =
+ alloc_percpu(struct ieee80211_sta_rx_stats);
+ if (!sta->pcpu_rx_stats)
+ goto free;
+ }
+
spin_lock_init(&sta->lock);
spin_lock_init(&sta->ps_lock);
INIT_WORK(&sta->drv_deliver_wk, sta_deliver_ps_frames);
@@ -336,15 +344,17 @@ struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata,
sta->sdata = sdata;
sta->rx_stats.last_rx = jiffies;
+ u64_stats_init(&sta->rx_stats.syncp);
+
sta->sta_state = IEEE80211_STA_NONE;
/* Mark TID as unreserved */
sta->reserved_tid = IEEE80211_TID_UNRESERVED;
sta->last_connected = ktime_get_seconds();
- ewma_signal_init(&sta->rx_stats.avg_signal);
- for (i = 0; i < ARRAY_SIZE(sta->rx_stats.chain_signal_avg); i++)
- ewma_signal_init(&sta->rx_stats.chain_signal_avg[i]);
+ ewma_signal_init(&sta->rx_stats_avg.signal);
+ for (i = 0; i < ARRAY_SIZE(sta->rx_stats_avg.chain_signal); i++)
+ ewma_signal_init(&sta->rx_stats_avg.chain_signal[i]);
if (local->ops->wake_tx_queue) {
void *txq_data;
@@ -407,6 +417,8 @@ struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata,
}
}
+ sta->sta.max_rc_amsdu_len = IEEE80211_MAX_MPDU_LEN_HT_BA;
+
sta_dbg(sdata, "Allocated STA %pM\n", sta->sta.addr);
return sta;
@@ -879,6 +891,13 @@ static int __must_check __sta_info_destroy_part1(struct sta_info *sta)
set_sta_flag(sta, WLAN_STA_BLOCK_BA);
ieee80211_sta_tear_down_BA_sessions(sta, AGG_STOP_DESTROY_STA);
+ /*
+ * Before removing the station from the driver there might be pending
+ * rx frames on RSS queues sent prior to the disassociation - wait for
+ * all such frames to be processed.
+ */
+ drv_sync_rx_queues(local, sta);
+
ret = sta_info_hash_del(local, sta);
if (WARN_ON(ret))
return ret;
@@ -1091,10 +1110,12 @@ void ieee80211_sta_expire(struct ieee80211_sub_if_data *sdata,
mutex_lock(&local->sta_mtx);
list_for_each_entry_safe(sta, tmp, &local->sta_list, list) {
+ unsigned long last_active = ieee80211_sta_last_active(sta);
+
if (sdata != sta->sdata)
continue;
- if (time_after(jiffies, sta->rx_stats.last_rx + exp_time)) {
+ if (time_is_before_jiffies(last_active + exp_time)) {
sta_dbg(sta->sdata, "expiring inactive STA %pM\n",
sta->sta.addr);
@@ -1764,6 +1785,31 @@ void ieee80211_sta_set_buffered(struct ieee80211_sta *pubsta,
}
EXPORT_SYMBOL(ieee80211_sta_set_buffered);
+static void
+ieee80211_recalc_p2p_go_ps_allowed(struct ieee80211_sub_if_data *sdata)
+{
+ struct ieee80211_local *local = sdata->local;
+ bool allow_p2p_go_ps = sdata->vif.p2p;
+ struct sta_info *sta;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(sta, &local->sta_list, list) {
+ if (sdata != sta->sdata ||
+ !test_sta_flag(sta, WLAN_STA_ASSOC))
+ continue;
+ if (!sta->sta.support_p2p_ps) {
+ allow_p2p_go_ps = false;
+ break;
+ }
+ }
+ rcu_read_unlock();
+
+ if (allow_p2p_go_ps != sdata->vif.bss_conf.allow_p2p_go_ps) {
+ sdata->vif.bss_conf.allow_p2p_go_ps = allow_p2p_go_ps;
+ ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_P2P_PS);
+ }
+}
+
int sta_info_move_state(struct sta_info *sta,
enum ieee80211_sta_state new_state)
{
@@ -1825,12 +1871,16 @@ int sta_info_move_state(struct sta_info *sta,
} else if (sta->sta_state == IEEE80211_STA_ASSOC) {
clear_bit(WLAN_STA_ASSOC, &sta->_flags);
ieee80211_recalc_min_chandef(sta->sdata);
+ if (!sta->sta.support_p2p_ps)
+ ieee80211_recalc_p2p_go_ps_allowed(sta->sdata);
}
break;
case IEEE80211_STA_ASSOC:
if (sta->sta_state == IEEE80211_STA_AUTH) {
set_bit(WLAN_STA_ASSOC, &sta->_flags);
ieee80211_recalc_min_chandef(sta->sdata);
+ if (!sta->sta.support_p2p_ps)
+ ieee80211_recalc_p2p_go_ps_allowed(sta->sdata);
} else if (sta->sta_state == IEEE80211_STA_AUTHORIZED) {
if (sta->sdata->vif.type == NL80211_IFTYPE_AP ||
(sta->sdata->vif.type == NL80211_IFTYPE_AP_VLAN &&
@@ -1838,6 +1888,7 @@ int sta_info_move_state(struct sta_info *sta,
atomic_dec(&sta->sdata->bss->num_mcast_sta);
clear_bit(WLAN_STA_AUTHORIZED, &sta->_flags);
ieee80211_clear_fast_xmit(sta);
+ ieee80211_clear_fast_rx(sta);
}
break;
case IEEE80211_STA_AUTHORIZED:
@@ -1848,6 +1899,7 @@ int sta_info_move_state(struct sta_info *sta,
atomic_inc(&sta->sdata->bss->num_mcast_sta);
set_bit(WLAN_STA_AUTHORIZED, &sta->_flags);
ieee80211_check_fast_xmit(sta);
+ ieee80211_check_fast_rx(sta);
}
break;
default:
@@ -1894,43 +1946,117 @@ u8 sta_info_tx_streams(struct sta_info *sta)
>> IEEE80211_HT_MCS_TX_MAX_STREAMS_SHIFT) + 1;
}
-static void sta_set_rate_info_rx(struct sta_info *sta, struct rate_info *rinfo)
+static struct ieee80211_sta_rx_stats *
+sta_get_last_rx_stats(struct sta_info *sta)
{
- rinfo->flags = 0;
-
- if (sta->rx_stats.last_rate_flag & RX_FLAG_HT) {
- rinfo->flags |= RATE_INFO_FLAGS_MCS;
- rinfo->mcs = sta->rx_stats.last_rate_idx;
- } else if (sta->rx_stats.last_rate_flag & RX_FLAG_VHT) {
- rinfo->flags |= RATE_INFO_FLAGS_VHT_MCS;
- rinfo->nss = sta->rx_stats.last_rate_vht_nss;
- rinfo->mcs = sta->rx_stats.last_rate_idx;
- } else {
+ struct ieee80211_sta_rx_stats *stats = &sta->rx_stats;
+ struct ieee80211_local *local = sta->local;
+ int cpu;
+
+ if (!ieee80211_hw_check(&local->hw, USES_RSS))
+ return stats;
+
+ for_each_possible_cpu(cpu) {
+ struct ieee80211_sta_rx_stats *cpustats;
+
+ cpustats = per_cpu_ptr(sta->pcpu_rx_stats, cpu);
+
+ if (time_after(cpustats->last_rx, stats->last_rx))
+ stats = cpustats;
+ }
+
+ return stats;
+}
+
+static void sta_stats_decode_rate(struct ieee80211_local *local, u16 rate,
+ struct rate_info *rinfo)
+{
+ rinfo->bw = (rate & STA_STATS_RATE_BW_MASK) >>
+ STA_STATS_RATE_BW_SHIFT;
+
+ if (rate & STA_STATS_RATE_VHT) {
+ rinfo->flags = RATE_INFO_FLAGS_VHT_MCS;
+ rinfo->mcs = rate & 0xf;
+ rinfo->nss = (rate & 0xf0) >> 4;
+ } else if (rate & STA_STATS_RATE_HT) {
+ rinfo->flags = RATE_INFO_FLAGS_MCS;
+ rinfo->mcs = rate & 0xff;
+ } else if (rate & STA_STATS_RATE_LEGACY) {
struct ieee80211_supported_band *sband;
- int shift = ieee80211_vif_get_shift(&sta->sdata->vif);
u16 brate;
-
- sband = sta->local->hw.wiphy->bands[
- ieee80211_get_sdata_band(sta->sdata)];
- brate = sband->bitrates[sta->rx_stats.last_rate_idx].bitrate;
+ unsigned int shift;
+
+ sband = local->hw.wiphy->bands[(rate >> 4) & 0xf];
+ brate = sband->bitrates[rate & 0xf].bitrate;
+ if (rinfo->bw == RATE_INFO_BW_5)
+ shift = 2;
+ else if (rinfo->bw == RATE_INFO_BW_10)
+ shift = 1;
+ else
+ shift = 0;
rinfo->legacy = DIV_ROUND_UP(brate, 1 << shift);
}
- if (sta->rx_stats.last_rate_flag & RX_FLAG_SHORT_GI)
+ if (rate & STA_STATS_RATE_SGI)
rinfo->flags |= RATE_INFO_FLAGS_SHORT_GI;
+}
- if (sta->rx_stats.last_rate_flag & RX_FLAG_5MHZ)
- rinfo->bw = RATE_INFO_BW_5;
- else if (sta->rx_stats.last_rate_flag & RX_FLAG_10MHZ)
- rinfo->bw = RATE_INFO_BW_10;
- else if (sta->rx_stats.last_rate_flag & RX_FLAG_40MHZ)
- rinfo->bw = RATE_INFO_BW_40;
- else if (sta->rx_stats.last_rate_vht_flag & RX_VHT_FLAG_80MHZ)
- rinfo->bw = RATE_INFO_BW_80;
- else if (sta->rx_stats.last_rate_vht_flag & RX_VHT_FLAG_160MHZ)
- rinfo->bw = RATE_INFO_BW_160;
+static void sta_set_rate_info_rx(struct sta_info *sta, struct rate_info *rinfo)
+{
+ u16 rate = ACCESS_ONCE(sta_get_last_rx_stats(sta)->last_rate);
+
+ if (rate == STA_STATS_RATE_INVALID)
+ rinfo->flags = 0;
else
- rinfo->bw = RATE_INFO_BW_20;
+ sta_stats_decode_rate(sta->local, rate, rinfo);
+}
+
+static void sta_set_tidstats(struct sta_info *sta,
+ struct cfg80211_tid_stats *tidstats,
+ int tid)
+{
+ struct ieee80211_local *local = sta->local;
+
+ if (!(tidstats->filled & BIT(NL80211_TID_STATS_RX_MSDU))) {
+ unsigned int start;
+
+ do {
+ start = u64_stats_fetch_begin(&sta->rx_stats.syncp);
+ tidstats->rx_msdu = sta->rx_stats.msdu[tid];
+ } while (u64_stats_fetch_retry(&sta->rx_stats.syncp, start));
+
+ tidstats->filled |= BIT(NL80211_TID_STATS_RX_MSDU);
+ }
+
+ if (!(tidstats->filled & BIT(NL80211_TID_STATS_TX_MSDU))) {
+ tidstats->filled |= BIT(NL80211_TID_STATS_TX_MSDU);
+ tidstats->tx_msdu = sta->tx_stats.msdu[tid];
+ }
+
+ if (!(tidstats->filled & BIT(NL80211_TID_STATS_TX_MSDU_RETRIES)) &&
+ ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) {
+ tidstats->filled |= BIT(NL80211_TID_STATS_TX_MSDU_RETRIES);
+ tidstats->tx_msdu_retries = sta->status_stats.msdu_retries[tid];
+ }
+
+ if (!(tidstats->filled & BIT(NL80211_TID_STATS_TX_MSDU_FAILED)) &&
+ ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) {
+ tidstats->filled |= BIT(NL80211_TID_STATS_TX_MSDU_FAILED);
+ tidstats->tx_msdu_failed = sta->status_stats.msdu_failed[tid];
+ }
+}
+
+static inline u64 sta_get_stats_bytes(struct ieee80211_sta_rx_stats *rxstats)
+{
+ unsigned int start;
+ u64 value;
+
+ do {
+ start = u64_stats_fetch_begin(&rxstats->syncp);
+ value = rxstats->bytes;
+ } while (u64_stats_fetch_retry(&rxstats->syncp, start));
+
+ return value;
}
void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo)
@@ -1939,7 +2065,10 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo)
struct ieee80211_local *local = sdata->local;
struct rate_control_ref *ref = NULL;
u32 thr = 0;
- int i, ac;
+ int i, ac, cpu;
+ struct ieee80211_sta_rx_stats *last_rxstats;
+
+ last_rxstats = sta_get_last_rx_stats(sta);
if (test_sta_flag(sta, WLAN_STA_RATE_CONTROL))
ref = local->rate_ctrl;
@@ -1968,7 +2097,7 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo)
sinfo->connected_time = ktime_get_seconds() - sta->last_connected;
sinfo->inactive_time =
- jiffies_to_msecs(jiffies - sta->rx_stats.last_rx);
+ jiffies_to_msecs(jiffies - ieee80211_sta_last_active(sta));
if (!(sinfo->filled & (BIT(NL80211_STA_INFO_TX_BYTES64) |
BIT(NL80211_STA_INFO_TX_BYTES)))) {
@@ -1987,12 +2116,30 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo)
if (!(sinfo->filled & (BIT(NL80211_STA_INFO_RX_BYTES64) |
BIT(NL80211_STA_INFO_RX_BYTES)))) {
- sinfo->rx_bytes = sta->rx_stats.bytes;
+ sinfo->rx_bytes += sta_get_stats_bytes(&sta->rx_stats);
+
+ if (sta->pcpu_rx_stats) {
+ for_each_possible_cpu(cpu) {
+ struct ieee80211_sta_rx_stats *cpurxs;
+
+ cpurxs = per_cpu_ptr(sta->pcpu_rx_stats, cpu);
+ sinfo->rx_bytes += sta_get_stats_bytes(cpurxs);
+ }
+ }
+
sinfo->filled |= BIT(NL80211_STA_INFO_RX_BYTES64);
}
if (!(sinfo->filled & BIT(NL80211_STA_INFO_RX_PACKETS))) {
sinfo->rx_packets = sta->rx_stats.packets;
+ if (sta->pcpu_rx_stats) {
+ for_each_possible_cpu(cpu) {
+ struct ieee80211_sta_rx_stats *cpurxs;
+
+ cpurxs = per_cpu_ptr(sta->pcpu_rx_stats, cpu);
+ sinfo->rx_packets += cpurxs->packets;
+ }
+ }
sinfo->filled |= BIT(NL80211_STA_INFO_RX_PACKETS);
}
@@ -2007,6 +2154,14 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo)
}
sinfo->rx_dropped_misc = sta->rx_stats.dropped;
+ if (sta->pcpu_rx_stats) {
+ for_each_possible_cpu(cpu) {
+ struct ieee80211_sta_rx_stats *cpurxs;
+
+ cpurxs = per_cpu_ptr(sta->pcpu_rx_stats, cpu);
+ sinfo->rx_packets += cpurxs->dropped;
+ }
+ }
if (sdata->vif.type == NL80211_IFTYPE_STATION &&
!(sdata->vif.driver_flags & IEEE80211_VIF_BEACON_FILTER)) {
@@ -2018,29 +2173,36 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo)
if (ieee80211_hw_check(&sta->local->hw, SIGNAL_DBM) ||
ieee80211_hw_check(&sta->local->hw, SIGNAL_UNSPEC)) {
if (!(sinfo->filled & BIT(NL80211_STA_INFO_SIGNAL))) {
- sinfo->signal = (s8)sta->rx_stats.last_signal;
+ sinfo->signal = (s8)last_rxstats->last_signal;
sinfo->filled |= BIT(NL80211_STA_INFO_SIGNAL);
}
- if (!(sinfo->filled & BIT(NL80211_STA_INFO_SIGNAL_AVG))) {
+ if (!sta->pcpu_rx_stats &&
+ !(sinfo->filled & BIT(NL80211_STA_INFO_SIGNAL_AVG))) {
sinfo->signal_avg =
- -ewma_signal_read(&sta->rx_stats.avg_signal);
+ -ewma_signal_read(&sta->rx_stats_avg.signal);
sinfo->filled |= BIT(NL80211_STA_INFO_SIGNAL_AVG);
}
}
- if (sta->rx_stats.chains &&
+ /* for the average - if pcpu_rx_stats isn't set - rxstats must point to
+ * the sta->rx_stats struct, so the check here is fine with and without
+ * pcpu statistics
+ */
+ if (last_rxstats->chains &&
!(sinfo->filled & (BIT(NL80211_STA_INFO_CHAIN_SIGNAL) |
BIT(NL80211_STA_INFO_CHAIN_SIGNAL_AVG)))) {
- sinfo->filled |= BIT(NL80211_STA_INFO_CHAIN_SIGNAL) |
- BIT(NL80211_STA_INFO_CHAIN_SIGNAL_AVG);
+ sinfo->filled |= BIT(NL80211_STA_INFO_CHAIN_SIGNAL);
+ if (!sta->pcpu_rx_stats)
+ sinfo->filled |= BIT(NL80211_STA_INFO_CHAIN_SIGNAL_AVG);
+
+ sinfo->chains = last_rxstats->chains;
- sinfo->chains = sta->rx_stats.chains;
for (i = 0; i < ARRAY_SIZE(sinfo->chain_signal); i++) {
sinfo->chain_signal[i] =
- sta->rx_stats.chain_signal_last[i];
+ last_rxstats->chain_signal_last[i];
sinfo->chain_signal_avg[i] =
- -ewma_signal_read(&sta->rx_stats.chain_signal_avg[i]);
+ -ewma_signal_read(&sta->rx_stats_avg.chain_signal[i]);
}
}
@@ -2059,33 +2221,7 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo)
for (i = 0; i < IEEE80211_NUM_TIDS + 1; i++) {
struct cfg80211_tid_stats *tidstats = &sinfo->pertid[i];
- if (!(tidstats->filled & BIT(NL80211_TID_STATS_RX_MSDU))) {
- tidstats->filled |= BIT(NL80211_TID_STATS_RX_MSDU);
- tidstats->rx_msdu = sta->rx_stats.msdu[i];
- }
-
- if (!(tidstats->filled & BIT(NL80211_TID_STATS_TX_MSDU))) {
- tidstats->filled |= BIT(NL80211_TID_STATS_TX_MSDU);
- tidstats->tx_msdu = sta->tx_stats.msdu[i];
- }
-
- if (!(tidstats->filled &
- BIT(NL80211_TID_STATS_TX_MSDU_RETRIES)) &&
- ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) {
- tidstats->filled |=
- BIT(NL80211_TID_STATS_TX_MSDU_RETRIES);
- tidstats->tx_msdu_retries =
- sta->status_stats.msdu_retries[i];
- }
-
- if (!(tidstats->filled &
- BIT(NL80211_TID_STATS_TX_MSDU_FAILED)) &&
- ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) {
- tidstats->filled |=
- BIT(NL80211_TID_STATS_TX_MSDU_FAILED);
- tidstats->tx_msdu_failed =
- sta->status_stats.msdu_failed[i];
- }
+ sta_set_tidstats(sta, tidstats, i);
}
if (ieee80211_vif_is_mesh(&sdata->vif)) {
@@ -2154,3 +2290,12 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo)
sinfo->expected_throughput = thr;
}
}
+
+unsigned long ieee80211_sta_last_active(struct sta_info *sta)
+{
+ struct ieee80211_sta_rx_stats *stats = sta_get_last_rx_stats(sta);
+
+ if (time_after(stats->last_rx, sta->status_stats.last_ack))
+ return stats->last_rx;
+ return sta->status_stats.last_ack;
+}
diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h
index ba7ce53ec..78b0ef32d 100644
--- a/net/mac80211/sta_info.h
+++ b/net/mac80211/sta_info.h
@@ -1,7 +1,7 @@
/*
* Copyright 2002-2005, Devicescape Software, Inc.
* Copyright 2013-2014 Intel Mobile Communications GmbH
- * Copyright(c) 2015 Intel Deutschland GmbH
+ * Copyright(c) 2015-2016 Intel Deutschland GmbH
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -18,6 +18,7 @@
#include <linux/average.h>
#include <linux/etherdevice.h>
#include <linux/rhashtable.h>
+#include <linux/u64_stats_sync.h>
#include "key.h"
/**
@@ -69,6 +70,8 @@
* @WLAN_STA_MPSP_RECIPIENT: local STA is recipient of a MPSP.
* @WLAN_STA_PS_DELIVER: station woke up, but we're still blocking TX
* until pending frames are delivered
+ *
+ * @NUM_WLAN_STA_FLAGS: number of defined flags
*/
enum ieee80211_sta_info_flags {
WLAN_STA_AUTH,
@@ -97,6 +100,8 @@ enum ieee80211_sta_info_flags {
WLAN_STA_MPSP_OWNER,
WLAN_STA_MPSP_RECIPIENT,
WLAN_STA_PS_DELIVER,
+
+ NUM_WLAN_STA_FLAGS,
};
#define ADDBA_RESP_INTERVAL HZ
@@ -281,6 +286,40 @@ struct ieee80211_fast_tx {
};
/**
+ * struct ieee80211_fast_rx - RX fastpath information
+ * @dev: netdevice for reporting the SKB
+ * @vif_type: (P2P-less) interface type of the original sdata (sdata->vif.type)
+ * @vif_addr: interface address
+ * @rfc1042_hdr: copy of the RFC 1042 SNAP header (to have in cache)
+ * @control_port_protocol: control port protocol copied from sdata
+ * @expected_ds_bits: from/to DS bits expected
+ * @icv_len: length of the MIC if present
+ * @key: bool indicating encryption is expected (key is set)
+ * @sta_notify: notify the MLME code (once)
+ * @internal_forward: forward froms internally on AP/VLAN type interfaces
+ * @uses_rss: copy of USES_RSS hw flag
+ * @da_offs: offset of the DA in the header (for header conversion)
+ * @sa_offs: offset of the SA in the header (for header conversion)
+ * @rcu_head: RCU head for freeing this structure
+ */
+struct ieee80211_fast_rx {
+ struct net_device *dev;
+ enum nl80211_iftype vif_type;
+ u8 vif_addr[ETH_ALEN] __aligned(2);
+ u8 rfc1042_hdr[6] __aligned(2);
+ __be16 control_port_protocol;
+ __le16 expected_ds_bits;
+ u8 icv_len;
+ u8 key:1,
+ sta_notify:1,
+ internal_forward:1,
+ uses_rss:1;
+ u8 da_offs, sa_offs;
+
+ struct rcu_head rcu_head;
+};
+
+/**
* struct mesh_sta - mesh STA information
* @plink_lock: serialize access to plink fields
* @llid: Local link ID
@@ -330,6 +369,21 @@ struct mesh_sta {
DECLARE_EWMA(signal, 1024, 8)
+struct ieee80211_sta_rx_stats {
+ unsigned long packets;
+ unsigned long last_rx;
+ unsigned long num_duplicates;
+ unsigned long fragments;
+ unsigned long dropped;
+ int last_signal;
+ u8 chains;
+ s8 chain_signal_last[IEEE80211_MAX_CHAINS];
+ u16 last_rate;
+ struct u64_stats_sync syncp;
+ u64 bytes;
+ u64 msdu[IEEE80211_NUM_TIDS + 1];
+};
+
/**
* struct sta_info - STA information
*
@@ -371,7 +425,7 @@ DECLARE_EWMA(signal, 1024, 8)
* @ampdu_mlme: A-MPDU state machine state
* @timer_to_tid: identity mapping to ID timers
* @mesh: mesh STA information
- * @debugfs: debug filesystem info
+ * @debugfs_dir: debug filesystem directory dentry
* @dead: set to true when sta is unlinked
* @removed: set to true when sta is being removed from sta_list
* @uploaded: set to true when sta is uploaded to the driver
@@ -385,10 +439,13 @@ DECLARE_EWMA(signal, 1024, 8)
* @cipher_scheme: optional cipher scheme for this station
* @reserved_tid: reserved TID (if any, otherwise IEEE80211_TID_UNRESERVED)
* @fast_tx: TX fastpath information
+ * @fast_rx: RX fastpath information
* @tdls_chandef: a TDLS peer can have a wider chandef that is compatible to
* the BSS one.
* @tx_stats: TX statistics
* @rx_stats: RX statistics
+ * @pcpu_rx_stats: per-CPU RX statistics, assigned only if the driver needs
+ * this (by advertising the USES_RSS hw flag)
* @status_stats: TX status statistics
*/
struct sta_info {
@@ -408,6 +465,8 @@ struct sta_info {
spinlock_t lock;
struct ieee80211_fast_tx __rcu *fast_tx;
+ struct ieee80211_fast_rx __rcu *fast_rx;
+ struct ieee80211_sta_rx_stats __percpu *pcpu_rx_stats;
#ifdef CONFIG_MAC80211_MESH
struct mesh_sta *mesh;
@@ -437,24 +496,11 @@ struct sta_info {
long last_connected;
/* Updated from RX path only, no locking requirements */
+ struct ieee80211_sta_rx_stats rx_stats;
struct {
- unsigned long packets;
- u64 bytes;
- unsigned long last_rx;
- unsigned long num_duplicates;
- unsigned long fragments;
- unsigned long dropped;
- int last_signal;
- struct ewma_signal avg_signal;
- u8 chains;
- s8 chain_signal_last[IEEE80211_MAX_CHAINS];
- struct ewma_signal chain_signal_avg[IEEE80211_MAX_CHAINS];
- int last_rate_idx;
- u32 last_rate_flag;
- u32 last_rate_vht_flag;
- u8 last_rate_vht_nss;
- u64 msdu[IEEE80211_NUM_TIDS + 1];
- } rx_stats;
+ struct ewma_signal signal;
+ struct ewma_signal chain_signal[IEEE80211_MAX_CHAINS];
+ } rx_stats_avg;
/* Plus 1 for non-QoS frames */
__le16 last_seq_ctrl[IEEE80211_NUM_TIDS + 1];
@@ -467,6 +513,7 @@ struct sta_info {
unsigned long last_tdls_pkt_time;
u64 msdu_retries[IEEE80211_NUM_TIDS + 1];
u64 msdu_failed[IEEE80211_NUM_TIDS + 1];
+ unsigned long last_ack;
} status_stats;
/* Updated from TX path only, no locking requirements */
@@ -485,10 +532,7 @@ struct sta_info {
u8 timer_to_tid[IEEE80211_NUM_TIDS];
#ifdef CONFIG_MAC80211_DEBUGFS
- struct sta_info_debugfsdentries {
- struct dentry *dir;
- bool add_has_run;
- } debugfs;
+ struct dentry *debugfs_dir;
#endif
enum ieee80211_sta_rx_bandwidth cur_max_bandwidth;
@@ -676,4 +720,44 @@ void ieee80211_sta_ps_deliver_wakeup(struct sta_info *sta);
void ieee80211_sta_ps_deliver_poll_response(struct sta_info *sta);
void ieee80211_sta_ps_deliver_uapsd(struct sta_info *sta);
+unsigned long ieee80211_sta_last_active(struct sta_info *sta);
+
+#define STA_STATS_RATE_INVALID 0
+#define STA_STATS_RATE_VHT 0x8000
+#define STA_STATS_RATE_HT 0x4000
+#define STA_STATS_RATE_LEGACY 0x2000
+#define STA_STATS_RATE_SGI 0x1000
+#define STA_STATS_RATE_BW_SHIFT 9
+#define STA_STATS_RATE_BW_MASK (0x7 << STA_STATS_RATE_BW_SHIFT)
+
+static inline u16 sta_stats_encode_rate(struct ieee80211_rx_status *s)
+{
+ u16 r = s->rate_idx;
+
+ if (s->vht_flag & RX_VHT_FLAG_80MHZ)
+ r |= RATE_INFO_BW_80 << STA_STATS_RATE_BW_SHIFT;
+ else if (s->vht_flag & RX_VHT_FLAG_160MHZ)
+ r |= RATE_INFO_BW_160 << STA_STATS_RATE_BW_SHIFT;
+ else if (s->flag & RX_FLAG_40MHZ)
+ r |= RATE_INFO_BW_40 << STA_STATS_RATE_BW_SHIFT;
+ else if (s->flag & RX_FLAG_10MHZ)
+ r |= RATE_INFO_BW_10 << STA_STATS_RATE_BW_SHIFT;
+ else if (s->flag & RX_FLAG_5MHZ)
+ r |= RATE_INFO_BW_5 << STA_STATS_RATE_BW_SHIFT;
+ else
+ r |= RATE_INFO_BW_20 << STA_STATS_RATE_BW_SHIFT;
+
+ if (s->flag & RX_FLAG_SHORT_GI)
+ r |= STA_STATS_RATE_SGI;
+
+ if (s->flag & RX_FLAG_VHT)
+ r |= STA_STATS_RATE_VHT | (s->vht_nss << 4);
+ else if (s->flag & RX_FLAG_HT)
+ r |= STA_STATS_RATE_HT;
+ else
+ r |= STA_STATS_RATE_LEGACY | (s->band << 4);
+
+ return r;
+}
+
#endif /* STA_INFO_H */
diff --git a/net/mac80211/status.c b/net/mac80211/status.c
index 8b1b2ea03..c6d5c724e 100644
--- a/net/mac80211/status.c
+++ b/net/mac80211/status.c
@@ -188,7 +188,7 @@ static void ieee80211_frame_acked(struct sta_info *sta, struct sk_buff *skb)
struct ieee80211_sub_if_data *sdata = sta->sdata;
if (ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS))
- sta->rx_stats.last_rx = jiffies;
+ sta->status_stats.last_ack = jiffies;
if (ieee80211_is_data_qos(mgmt->frame_control)) {
struct ieee80211_hdr *hdr = (void *) skb->data;
@@ -647,7 +647,7 @@ void ieee80211_tx_status_noskb(struct ieee80211_hw *hw,
sta->status_stats.retry_count += retry_count;
if (acked) {
- sta->rx_stats.last_rx = jiffies;
+ sta->status_stats.last_ack = jiffies;
if (sta->status_stats.lost_packets)
sta->status_stats.lost_packets = 0;
diff --git a/net/mac80211/tdls.c b/net/mac80211/tdls.c
index a29ea813b..1c7d45a6d 100644
--- a/net/mac80211/tdls.c
+++ b/net/mac80211/tdls.c
@@ -47,7 +47,7 @@ static void ieee80211_tdls_add_ext_capab(struct ieee80211_sub_if_data *sdata,
NL80211_FEATURE_TDLS_CHANNEL_SWITCH;
bool wider_band = ieee80211_hw_check(&local->hw, TDLS_WIDER_BW) &&
!ifmgd->tdls_wider_bw_prohibited;
- enum ieee80211_band band = ieee80211_get_sdata_band(sdata);
+ enum nl80211_band band = ieee80211_get_sdata_band(sdata);
struct ieee80211_supported_band *sband = local->hw.wiphy->bands[band];
bool vht = sband && sband->vht_cap.vht_supported;
u8 *pos = (void *)skb_put(skb, 10);
@@ -184,7 +184,7 @@ static u16 ieee80211_get_tdls_sta_capab(struct ieee80211_sub_if_data *sdata,
if (status_code != 0)
return 0;
- if (ieee80211_get_sdata_band(sdata) == IEEE80211_BAND_2GHZ) {
+ if (ieee80211_get_sdata_band(sdata) == NL80211_BAND_2GHZ) {
return WLAN_CAPABILITY_SHORT_SLOT_TIME |
WLAN_CAPABILITY_SHORT_PREAMBLE;
}
@@ -357,7 +357,7 @@ ieee80211_tdls_add_setup_start_ies(struct ieee80211_sub_if_data *sdata,
u8 action_code, bool initiator,
const u8 *extra_ies, size_t extra_ies_len)
{
- enum ieee80211_band band = ieee80211_get_sdata_band(sdata);
+ enum nl80211_band band = ieee80211_get_sdata_band(sdata);
struct ieee80211_local *local = sdata->local;
struct ieee80211_supported_band *sband;
struct ieee80211_sta_ht_cap ht_cap;
@@ -544,7 +544,7 @@ ieee80211_tdls_add_setup_cfm_ies(struct ieee80211_sub_if_data *sdata,
struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
size_t offset = 0, noffset;
struct sta_info *sta, *ap_sta;
- enum ieee80211_band band = ieee80211_get_sdata_band(sdata);
+ enum nl80211_band band = ieee80211_get_sdata_band(sdata);
u8 *pos;
mutex_lock(&local->sta_mtx);
@@ -611,7 +611,7 @@ ieee80211_tdls_add_setup_cfm_ies(struct ieee80211_sub_if_data *sdata,
ieee80211_tdls_add_link_ie(sdata, skb, peer, initiator);
/* only include VHT-operation if not on the 2.4GHz band */
- if (band != IEEE80211_BAND_2GHZ && sta->sta.vht_cap.vht_supported) {
+ if (band != NL80211_BAND_2GHZ && sta->sta.vht_cap.vht_supported) {
/*
* if both peers support WIDER_BW, we can expand the chandef to
* a wider compatible one, up to 80MHz
@@ -1773,7 +1773,7 @@ ieee80211_process_tdls_channel_switch_req(struct ieee80211_sub_if_data *sdata,
u8 target_channel, oper_class;
bool local_initiator;
struct sta_info *sta;
- enum ieee80211_band band;
+ enum nl80211_band band;
struct ieee80211_tdls_data *tf = (void *)skb->data;
struct ieee80211_rx_status *rx_status = IEEE80211_SKB_RXCB(skb);
int baselen = offsetof(typeof(*tf), u.chan_switch_req.variable);
@@ -1805,10 +1805,10 @@ ieee80211_process_tdls_channel_switch_req(struct ieee80211_sub_if_data *sdata,
if ((oper_class == 112 || oper_class == 2 || oper_class == 3 ||
oper_class == 4 || oper_class == 5 || oper_class == 6) &&
target_channel < 14)
- band = IEEE80211_BAND_5GHZ;
+ band = NL80211_BAND_5GHZ;
else
- band = target_channel < 14 ? IEEE80211_BAND_2GHZ :
- IEEE80211_BAND_5GHZ;
+ band = target_channel < 14 ? NL80211_BAND_2GHZ :
+ NL80211_BAND_5GHZ;
freq = ieee80211_channel_to_frequency(target_channel, band);
if (freq == 0) {
diff --git a/net/mac80211/trace.h b/net/mac80211/trace.h
index 2b0a17ee9..77e4c53ba 100644
--- a/net/mac80211/trace.h
+++ b/net/mac80211/trace.h
@@ -1,3 +1,8 @@
+/*
+* Portions of this file
+* Copyright(c) 2016 Intel Deutschland GmbH
+*/
+
#if !defined(__MAC80211_DRIVER_TRACE) || defined(TRACE_HEADER_MULTI_READ)
#define __MAC80211_DRIVER_TRACE
@@ -396,7 +401,7 @@ TRACE_EVENT(drv_bss_info_changed,
__field(u32, sync_device_ts)
__field(u8, sync_dtim_count)
__field(u32, basic_rates)
- __array(int, mcast_rate, IEEE80211_NUM_BANDS)
+ __array(int, mcast_rate, NUM_NL80211_BANDS)
__field(u16, ht_operation_mode)
__field(s32, cqm_rssi_thold);
__field(s32, cqm_rssi_hyst);
@@ -899,6 +904,13 @@ DEFINE_EVENT(sta_event, drv_sta_pre_rcu_remove,
TP_ARGS(local, sdata, sta)
);
+DEFINE_EVENT(sta_event, drv_sync_rx_queues,
+ TP_PROTO(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_sta *sta),
+ TP_ARGS(local, sdata, sta)
+);
+
DEFINE_EVENT(sta_event, drv_sta_rate_tbl_update,
TP_PROTO(struct ieee80211_local *local,
struct ieee80211_sub_if_data *sdata,
@@ -1253,8 +1265,8 @@ TRACE_EVENT(drv_set_bitrate_mask,
TP_fast_assign(
LOCAL_ASSIGN;
VIF_ASSIGN;
- __entry->legacy_2g = mask->control[IEEE80211_BAND_2GHZ].legacy;
- __entry->legacy_5g = mask->control[IEEE80211_BAND_5GHZ].legacy;
+ __entry->legacy_2g = mask->control[NL80211_BAND_2GHZ].legacy;
+ __entry->legacy_5g = mask->control[NL80211_BAND_5GHZ].legacy;
),
TP_printk(
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 21f660239..203044379 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -150,7 +150,7 @@ static __le16 ieee80211_duration(struct ieee80211_tx_data *tx,
rate = DIV_ROUND_UP(r->bitrate, 1 << shift);
switch (sband->band) {
- case IEEE80211_BAND_2GHZ: {
+ case NL80211_BAND_2GHZ: {
u32 flag;
if (tx->sdata->flags & IEEE80211_SDATA_OPERATING_GMODE)
flag = IEEE80211_RATE_MANDATORY_G;
@@ -160,13 +160,13 @@ static __le16 ieee80211_duration(struct ieee80211_tx_data *tx,
mrate = r->bitrate;
break;
}
- case IEEE80211_BAND_5GHZ:
+ case NL80211_BAND_5GHZ:
if (r->flags & IEEE80211_RATE_MANDATORY_A)
mrate = r->bitrate;
break;
- case IEEE80211_BAND_60GHZ:
+ case NL80211_BAND_60GHZ:
/* TODO, for now fall through */
- case IEEE80211_NUM_BANDS:
+ case NUM_NL80211_BANDS:
WARN_ON(1);
break;
}
@@ -1329,6 +1329,10 @@ struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw,
out:
spin_unlock_bh(&txqi->queue.lock);
+ if (skb && skb_has_frag_list(skb) &&
+ !ieee80211_hw_check(&local->hw, TX_FRAG_LIST))
+ skb_linearize(skb);
+
return skb;
}
EXPORT_SYMBOL(ieee80211_tx_dequeue);
@@ -1696,7 +1700,9 @@ static bool ieee80211_parse_tx_radiotap(struct ieee80211_local *local,
bool rate_found = false;
u8 rate_retries = 0;
u16 rate_flags = 0;
- u8 mcs_known, mcs_flags;
+ u8 mcs_known, mcs_flags, mcs_bw;
+ u16 vht_known;
+ u8 vht_mcs = 0, vht_nss = 0;
int i;
info->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT |
@@ -1772,11 +1778,38 @@ static bool ieee80211_parse_tx_radiotap(struct ieee80211_local *local,
mcs_flags & IEEE80211_RADIOTAP_MCS_SGI)
rate_flags |= IEEE80211_TX_RC_SHORT_GI;
+ mcs_bw = mcs_flags & IEEE80211_RADIOTAP_MCS_BW_MASK;
if (mcs_known & IEEE80211_RADIOTAP_MCS_HAVE_BW &&
- mcs_flags & IEEE80211_RADIOTAP_MCS_BW_40)
+ mcs_bw == IEEE80211_RADIOTAP_MCS_BW_40)
rate_flags |= IEEE80211_TX_RC_40_MHZ_WIDTH;
break;
+ case IEEE80211_RADIOTAP_VHT:
+ vht_known = get_unaligned_le16(iterator.this_arg);
+ rate_found = true;
+
+ rate_flags = IEEE80211_TX_RC_VHT_MCS;
+ if ((vht_known & IEEE80211_RADIOTAP_VHT_KNOWN_GI) &&
+ (iterator.this_arg[2] &
+ IEEE80211_RADIOTAP_VHT_FLAG_SGI))
+ rate_flags |= IEEE80211_TX_RC_SHORT_GI;
+ if (vht_known &
+ IEEE80211_RADIOTAP_VHT_KNOWN_BANDWIDTH) {
+ if (iterator.this_arg[3] == 1)
+ rate_flags |=
+ IEEE80211_TX_RC_40_MHZ_WIDTH;
+ else if (iterator.this_arg[3] == 4)
+ rate_flags |=
+ IEEE80211_TX_RC_80_MHZ_WIDTH;
+ else if (iterator.this_arg[3] == 11)
+ rate_flags |=
+ IEEE80211_TX_RC_160_MHZ_WIDTH;
+ }
+
+ vht_mcs = iterator.this_arg[4] >> 4;
+ vht_nss = iterator.this_arg[4] & 0xF;
+ break;
+
/*
* Please update the file
* Documentation/networking/mac80211-injection.txt
@@ -1802,6 +1835,9 @@ static bool ieee80211_parse_tx_radiotap(struct ieee80211_local *local,
if (rate_flags & IEEE80211_TX_RC_MCS) {
info->control.rates[0].idx = rate;
+ } else if (rate_flags & IEEE80211_TX_RC_VHT_MCS) {
+ ieee80211_rate_set_vht(info->control.rates, vht_mcs,
+ vht_nss);
} else {
for (i = 0; i < sband->n_bitrates; i++) {
if (rate * 5 != sband->bitrates[i].bitrate)
@@ -1812,6 +1848,9 @@ static bool ieee80211_parse_tx_radiotap(struct ieee80211_local *local,
}
}
+ if (info->control.rates[0].idx < 0)
+ info->control.flags &= ~IEEE80211_TX_CTRL_RATE_INJECT;
+
info->control.rates[0].flags = rate_flags;
info->control.rates[0].count = min_t(u8, rate_retries + 1,
local->hw.max_rate_tries);
@@ -2099,7 +2138,7 @@ static struct sk_buff *ieee80211_build_hdr(struct ieee80211_sub_if_data *sdata,
u16 info_id = 0;
struct ieee80211_chanctx_conf *chanctx_conf;
struct ieee80211_sub_if_data *ap_sdata;
- enum ieee80211_band band;
+ enum nl80211_band band;
int ret;
if (IS_ERR(sta))
@@ -2186,7 +2225,7 @@ static struct sk_buff *ieee80211_build_hdr(struct ieee80211_sub_if_data *sdata,
}
if (mppath && mpath)
- mesh_path_del(mpath->sdata, mpath->dst);
+ mesh_path_del(sdata, mpath->dst);
}
/*
@@ -2772,6 +2811,154 @@ void ieee80211_clear_fast_xmit(struct sta_info *sta)
kfree_rcu(fast_tx, rcu_head);
}
+static bool ieee80211_amsdu_realloc_pad(struct ieee80211_local *local,
+ struct sk_buff *skb, int headroom,
+ int *subframe_len)
+{
+ int amsdu_len = *subframe_len + sizeof(struct ethhdr);
+ int padding = (4 - amsdu_len) & 3;
+
+ if (skb_headroom(skb) < headroom || skb_tailroom(skb) < padding) {
+ I802_DEBUG_INC(local->tx_expand_skb_head);
+
+ if (pskb_expand_head(skb, headroom, padding, GFP_ATOMIC)) {
+ wiphy_debug(local->hw.wiphy,
+ "failed to reallocate TX buffer\n");
+ return false;
+ }
+ }
+
+ if (padding) {
+ *subframe_len += padding;
+ memset(skb_put(skb, padding), 0, padding);
+ }
+
+ return true;
+}
+
+static bool ieee80211_amsdu_prepare_head(struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_fast_tx *fast_tx,
+ struct sk_buff *skb)
+{
+ struct ieee80211_local *local = sdata->local;
+ struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+ struct ieee80211_hdr *hdr;
+ struct ethhdr amsdu_hdr;
+ int hdr_len = fast_tx->hdr_len - sizeof(rfc1042_header);
+ int subframe_len = skb->len - hdr_len;
+ void *data;
+ u8 *qc;
+
+ if (info->flags & IEEE80211_TX_CTL_RATE_CTRL_PROBE)
+ return false;
+
+ if (info->control.flags & IEEE80211_TX_CTRL_AMSDU)
+ return true;
+
+ if (!ieee80211_amsdu_realloc_pad(local, skb, sizeof(amsdu_hdr),
+ &subframe_len))
+ return false;
+
+ amsdu_hdr.h_proto = cpu_to_be16(subframe_len);
+ memcpy(amsdu_hdr.h_source, skb->data + fast_tx->sa_offs, ETH_ALEN);
+ memcpy(amsdu_hdr.h_dest, skb->data + fast_tx->da_offs, ETH_ALEN);
+
+ data = skb_push(skb, sizeof(amsdu_hdr));
+ memmove(data, data + sizeof(amsdu_hdr), hdr_len);
+ memcpy(data + hdr_len, &amsdu_hdr, sizeof(amsdu_hdr));
+
+ hdr = data;
+ qc = ieee80211_get_qos_ctl(hdr);
+ *qc |= IEEE80211_QOS_CTL_A_MSDU_PRESENT;
+
+ info->control.flags |= IEEE80211_TX_CTRL_AMSDU;
+
+ return true;
+}
+
+static bool ieee80211_amsdu_aggregate(struct ieee80211_sub_if_data *sdata,
+ struct sta_info *sta,
+ struct ieee80211_fast_tx *fast_tx,
+ struct sk_buff *skb)
+{
+ struct ieee80211_local *local = sdata->local;
+ u8 tid = skb->priority & IEEE80211_QOS_CTL_TAG1D_MASK;
+ struct ieee80211_txq *txq = sta->sta.txq[tid];
+ struct txq_info *txqi;
+ struct sk_buff **frag_tail, *head;
+ int subframe_len = skb->len - ETH_ALEN;
+ u8 max_subframes = sta->sta.max_amsdu_subframes;
+ int max_frags = local->hw.max_tx_fragments;
+ int max_amsdu_len = sta->sta.max_amsdu_len;
+ __be16 len;
+ void *data;
+ bool ret = false;
+ int n = 1, nfrags;
+
+ if (!ieee80211_hw_check(&local->hw, TX_AMSDU))
+ return false;
+
+ if (!txq)
+ return false;
+
+ txqi = to_txq_info(txq);
+ if (test_bit(IEEE80211_TXQ_NO_AMSDU, &txqi->flags))
+ return false;
+
+ if (sta->sta.max_rc_amsdu_len)
+ max_amsdu_len = min_t(int, max_amsdu_len,
+ sta->sta.max_rc_amsdu_len);
+
+ spin_lock_bh(&txqi->queue.lock);
+
+ head = skb_peek_tail(&txqi->queue);
+ if (!head)
+ goto out;
+
+ if (skb->len + head->len > max_amsdu_len)
+ goto out;
+
+ if (!ieee80211_amsdu_prepare_head(sdata, fast_tx, head))
+ goto out;
+
+ nfrags = 1 + skb_shinfo(skb)->nr_frags;
+ nfrags += 1 + skb_shinfo(head)->nr_frags;
+ frag_tail = &skb_shinfo(head)->frag_list;
+ while (*frag_tail) {
+ nfrags += 1 + skb_shinfo(*frag_tail)->nr_frags;
+ frag_tail = &(*frag_tail)->next;
+ n++;
+ }
+
+ if (max_subframes && n > max_subframes)
+ goto out;
+
+ if (max_frags && nfrags > max_frags)
+ goto out;
+
+ if (!ieee80211_amsdu_realloc_pad(local, skb, sizeof(rfc1042_header) + 2,
+ &subframe_len))
+ goto out;
+
+ ret = true;
+ data = skb_push(skb, ETH_ALEN + 2);
+ memmove(data, data + ETH_ALEN + 2, 2 * ETH_ALEN);
+
+ data += 2 * ETH_ALEN;
+ len = cpu_to_be16(subframe_len);
+ memcpy(data, &len, 2);
+ memcpy(data + 2, rfc1042_header, sizeof(rfc1042_header));
+
+ head->len += skb->len;
+ head->data_len += skb->len;
+ *frag_tail = skb;
+
+out:
+ spin_unlock_bh(&txqi->queue.lock);
+
+ return ret;
+}
+
static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata,
struct net_device *dev, struct sta_info *sta,
struct ieee80211_fast_tx *fast_tx,
@@ -2826,6 +3013,10 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata,
ieee80211_tx_stats(dev, skb->len + extra_head);
+ if ((hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_QOS_DATA)) &&
+ ieee80211_amsdu_aggregate(sdata, sta, fast_tx, skb))
+ return true;
+
/* will not be crypto-handled beyond what we do here, so use false
* as the may-encrypt argument for the resize to not account for
* more room than we already have in 'extra_head'
@@ -3406,7 +3597,7 @@ __ieee80211_beacon_get(struct ieee80211_hw *hw,
struct sk_buff *skb = NULL;
struct ieee80211_tx_info *info;
struct ieee80211_sub_if_data *sdata = NULL;
- enum ieee80211_band band;
+ enum nl80211_band band;
struct ieee80211_tx_rate_control txrc;
struct ieee80211_chanctx_conf *chanctx_conf;
int csa_off_base = 0;
@@ -3974,7 +4165,7 @@ EXPORT_SYMBOL(ieee80211_unreserve_tid);
void __ieee80211_tx_skb_tid_band(struct ieee80211_sub_if_data *sdata,
struct sk_buff *skb, int tid,
- enum ieee80211_band band)
+ enum nl80211_band band)
{
int ac = ieee802_1d_to_ac[tid & 7];
diff --git a/net/mac80211/util.c b/net/mac80211/util.c
index 7390de494..905003f75 100644
--- a/net/mac80211/util.c
+++ b/net/mac80211/util.c
@@ -59,7 +59,7 @@ void ieee80211_tx_set_protected(struct ieee80211_tx_data *tx)
}
}
-int ieee80211_frame_duration(enum ieee80211_band band, size_t len,
+int ieee80211_frame_duration(enum nl80211_band band, size_t len,
int rate, int erp, int short_preamble,
int shift)
{
@@ -77,7 +77,7 @@ int ieee80211_frame_duration(enum ieee80211_band band, size_t len,
* is assumed to be 0 otherwise.
*/
- if (band == IEEE80211_BAND_5GHZ || erp) {
+ if (band == NL80211_BAND_5GHZ || erp) {
/*
* OFDM:
*
@@ -129,7 +129,7 @@ int ieee80211_frame_duration(enum ieee80211_band band, size_t len,
/* Exported duration function for driver use */
__le16 ieee80211_generic_frame_duration(struct ieee80211_hw *hw,
struct ieee80211_vif *vif,
- enum ieee80211_band band,
+ enum nl80211_band band,
size_t frame_len,
struct ieee80211_rate *rate)
{
@@ -1129,7 +1129,7 @@ void ieee80211_set_wmm_default(struct ieee80211_sub_if_data *sdata,
rcu_read_lock();
chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf);
use_11b = (chanctx_conf &&
- chanctx_conf->def.chan->band == IEEE80211_BAND_2GHZ) &&
+ chanctx_conf->def.chan->band == NL80211_BAND_2GHZ) &&
!(sdata->flags & IEEE80211_SDATA_OPERATING_GMODE);
rcu_read_unlock();
@@ -1301,7 +1301,7 @@ void ieee80211_send_deauth_disassoc(struct ieee80211_sub_if_data *sdata,
static int ieee80211_build_preq_ies_band(struct ieee80211_local *local,
u8 *buffer, size_t buffer_len,
const u8 *ie, size_t ie_len,
- enum ieee80211_band band,
+ enum nl80211_band band,
u32 rate_mask,
struct cfg80211_chan_def *chandef,
size_t *offset)
@@ -1375,7 +1375,7 @@ static int ieee80211_build_preq_ies_band(struct ieee80211_local *local,
pos += ext_rates_len;
}
- if (chandef->chan && sband->band == IEEE80211_BAND_2GHZ) {
+ if (chandef->chan && sband->band == NL80211_BAND_2GHZ) {
if (end - pos < 3)
goto out_err;
*pos++ = WLAN_EID_DS_PARAMS;
@@ -1479,7 +1479,7 @@ int ieee80211_build_preq_ies(struct ieee80211_local *local, u8 *buffer,
memset(ie_desc, 0, sizeof(*ie_desc));
- for (i = 0; i < IEEE80211_NUM_BANDS; i++) {
+ for (i = 0; i < NUM_NL80211_BANDS; i++) {
if (bands_used & BIT(i)) {
pos += ieee80211_build_preq_ies_band(local,
buffer + pos,
@@ -1522,7 +1522,7 @@ struct sk_buff *ieee80211_build_probe_req(struct ieee80211_sub_if_data *sdata,
struct sk_buff *skb;
struct ieee80211_mgmt *mgmt;
int ies_len;
- u32 rate_masks[IEEE80211_NUM_BANDS] = {};
+ u32 rate_masks[NUM_NL80211_BANDS] = {};
struct ieee80211_scan_ies dummy_ie_desc;
/*
@@ -1582,7 +1582,7 @@ void ieee80211_send_probe_req(struct ieee80211_sub_if_data *sdata,
u32 ieee80211_sta_get_rates(struct ieee80211_sub_if_data *sdata,
struct ieee802_11_elems *elems,
- enum ieee80211_band band, u32 *basic_rates)
+ enum nl80211_band band, u32 *basic_rates)
{
struct ieee80211_supported_band *sband;
size_t num_rates;
@@ -2520,7 +2520,7 @@ int ieee80211_parse_bitrates(struct cfg80211_chan_def *chandef,
int ieee80211_add_srates_ie(struct ieee80211_sub_if_data *sdata,
struct sk_buff *skb, bool need_basic,
- enum ieee80211_band band)
+ enum nl80211_band band)
{
struct ieee80211_local *local = sdata->local;
struct ieee80211_supported_band *sband;
@@ -2565,7 +2565,7 @@ int ieee80211_add_srates_ie(struct ieee80211_sub_if_data *sdata,
int ieee80211_add_ext_srates_ie(struct ieee80211_sub_if_data *sdata,
struct sk_buff *skb, bool need_basic,
- enum ieee80211_band band)
+ enum nl80211_band band)
{
struct ieee80211_local *local = sdata->local;
struct ieee80211_supported_band *sband;
@@ -2711,7 +2711,7 @@ u64 ieee80211_calculate_rx_timestamp(struct ieee80211_local *local,
if (status->flag & RX_FLAG_MACTIME_PLCP_START) {
/* TODO: handle HT/VHT preambles */
- if (status->band == IEEE80211_BAND_5GHZ) {
+ if (status->band == NL80211_BAND_5GHZ) {
ts += 20 << shift;
mpdu_offset += 2;
} else if (status->flag & RX_FLAG_SHORTPRE) {
@@ -2724,8 +2724,9 @@ u64 ieee80211_calculate_rx_timestamp(struct ieee80211_local *local,
rate = cfg80211_calculate_bitrate(&ri);
if (WARN_ONCE(!rate,
- "Invalid bitrate: flags=0x%x, idx=%d, vht_nss=%d\n",
- status->flag, status->rate_idx, status->vht_nss))
+ "Invalid bitrate: flags=0x%llx, idx=%d, vht_nss=%d\n",
+ (unsigned long long)status->flag, status->rate_idx,
+ status->vht_nss))
return 0;
/* rewind from end of MPDU */
diff --git a/net/mac80211/vht.c b/net/mac80211/vht.c
index e590e2ef9..ee715764a 100644
--- a/net/mac80211/vht.c
+++ b/net/mac80211/vht.c
@@ -418,7 +418,7 @@ void ieee80211_sta_set_rx_nss(struct sta_info *sta)
u32 __ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata,
struct sta_info *sta, u8 opmode,
- enum ieee80211_band band)
+ enum nl80211_band band)
{
struct ieee80211_local *local = sdata->local;
struct ieee80211_supported_band *sband;
@@ -504,7 +504,7 @@ EXPORT_SYMBOL_GPL(ieee80211_update_mu_groups);
void ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata,
struct sta_info *sta, u8 opmode,
- enum ieee80211_band band)
+ enum nl80211_band band)
{
struct ieee80211_local *local = sdata->local;
struct ieee80211_supported_band *sband = local->hw.wiphy->bands[band];
diff --git a/net/mac80211/wpa.c b/net/mac80211/wpa.c
index 18848258a..b48c1e13e 100644
--- a/net/mac80211/wpa.c
+++ b/net/mac80211/wpa.c
@@ -504,25 +504,31 @@ ieee80211_crypto_ccmp_decrypt(struct ieee80211_rx_data *rx,
!ieee80211_is_robust_mgmt_frame(skb))
return RX_CONTINUE;
- data_len = skb->len - hdrlen - IEEE80211_CCMP_HDR_LEN - mic_len;
- if (!rx->sta || data_len < 0)
- return RX_DROP_UNUSABLE;
-
if (status->flag & RX_FLAG_DECRYPTED) {
if (!pskb_may_pull(rx->skb, hdrlen + IEEE80211_CCMP_HDR_LEN))
return RX_DROP_UNUSABLE;
+ if (status->flag & RX_FLAG_MIC_STRIPPED)
+ mic_len = 0;
} else {
if (skb_linearize(rx->skb))
return RX_DROP_UNUSABLE;
}
+ data_len = skb->len - hdrlen - IEEE80211_CCMP_HDR_LEN - mic_len;
+ if (!rx->sta || data_len < 0)
+ return RX_DROP_UNUSABLE;
+
if (!(status->flag & RX_FLAG_PN_VALIDATED)) {
+ int res;
+
ccmp_hdr2pn(pn, skb->data + hdrlen);
queue = rx->security_idx;
- if (memcmp(pn, key->u.ccmp.rx_pn[queue],
- IEEE80211_CCMP_PN_LEN) <= 0) {
+ res = memcmp(pn, key->u.ccmp.rx_pn[queue],
+ IEEE80211_CCMP_PN_LEN);
+ if (res < 0 ||
+ (!res && !(status->flag & RX_FLAG_ALLOW_SAME_PN))) {
key->u.ccmp.replays++;
return RX_DROP_UNUSABLE;
}
@@ -720,8 +726,7 @@ ieee80211_crypto_gcmp_decrypt(struct ieee80211_rx_data *rx)
struct sk_buff *skb = rx->skb;
struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
u8 pn[IEEE80211_GCMP_PN_LEN];
- int data_len;
- int queue;
+ int data_len, queue, mic_len = IEEE80211_GCMP_MIC_LEN;
hdrlen = ieee80211_hdrlen(hdr->frame_control);
@@ -729,26 +734,31 @@ ieee80211_crypto_gcmp_decrypt(struct ieee80211_rx_data *rx)
!ieee80211_is_robust_mgmt_frame(skb))
return RX_CONTINUE;
- data_len = skb->len - hdrlen - IEEE80211_GCMP_HDR_LEN -
- IEEE80211_GCMP_MIC_LEN;
- if (!rx->sta || data_len < 0)
- return RX_DROP_UNUSABLE;
-
if (status->flag & RX_FLAG_DECRYPTED) {
if (!pskb_may_pull(rx->skb, hdrlen + IEEE80211_GCMP_HDR_LEN))
return RX_DROP_UNUSABLE;
+ if (status->flag & RX_FLAG_MIC_STRIPPED)
+ mic_len = 0;
} else {
if (skb_linearize(rx->skb))
return RX_DROP_UNUSABLE;
}
+ data_len = skb->len - hdrlen - IEEE80211_GCMP_HDR_LEN - mic_len;
+ if (!rx->sta || data_len < 0)
+ return RX_DROP_UNUSABLE;
+
if (!(status->flag & RX_FLAG_PN_VALIDATED)) {
+ int res;
+
gcmp_hdr2pn(pn, skb->data + hdrlen);
queue = rx->security_idx;
- if (memcmp(pn, key->u.gcmp.rx_pn[queue],
- IEEE80211_GCMP_PN_LEN) <= 0) {
+ res = memcmp(pn, key->u.gcmp.rx_pn[queue],
+ IEEE80211_GCMP_PN_LEN);
+ if (res < 0 ||
+ (!res && !(status->flag & RX_FLAG_ALLOW_SAME_PN))) {
key->u.gcmp.replays++;
return RX_DROP_UNUSABLE;
}
@@ -772,7 +782,7 @@ ieee80211_crypto_gcmp_decrypt(struct ieee80211_rx_data *rx)
}
/* Remove GCMP header and MIC */
- if (pskb_trim(skb, skb->len - IEEE80211_GCMP_MIC_LEN))
+ if (pskb_trim(skb, skb->len - mic_len))
return RX_DROP_UNUSABLE;
memmove(skb->data + IEEE80211_GCMP_HDR_LEN, skb->data, hdrlen);
skb_pull(skb, IEEE80211_GCMP_HDR_LEN);
diff --git a/net/mpls/mpls_gso.c b/net/mpls/mpls_gso.c
index 0183b32da..2055e57ed 100644
--- a/net/mpls/mpls_gso.c
+++ b/net/mpls/mpls_gso.c
@@ -26,14 +26,6 @@ static struct sk_buff *mpls_gso_segment(struct sk_buff *skb,
netdev_features_t mpls_features;
__be16 mpls_protocol;
- if (unlikely(skb_shinfo(skb)->gso_type &
- ~(SKB_GSO_TCPV4 |
- SKB_GSO_TCPV6 |
- SKB_GSO_UDP |
- SKB_GSO_DODGY |
- SKB_GSO_TCP_ECN)))
- goto out;
-
/* Setup inner SKB. */
mpls_protocol = skb->protocol;
skb->protocol = skb->inner_protocol;
@@ -56,7 +48,7 @@ static struct sk_buff *mpls_gso_segment(struct sk_buff *skb,
* skb_mac_gso_segment(), an indirect caller of this function.
*/
__skb_pull(skb, skb->data - skb_mac_header(skb));
-out:
+
return segs;
}
diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
index 85ca189bd..096a45103 100644
--- a/net/netfilter/ipvs/ip_vs_conn.c
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -104,6 +104,7 @@ static inline void ct_write_unlock_bh(unsigned int key)
spin_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
}
+static void ip_vs_conn_expire(unsigned long data);
/*
* Returns hash value for IPVS connection entry
@@ -453,10 +454,16 @@ ip_vs_conn_out_get_proto(struct netns_ipvs *ipvs, int af,
}
EXPORT_SYMBOL_GPL(ip_vs_conn_out_get_proto);
+static void __ip_vs_conn_put_notimer(struct ip_vs_conn *cp)
+{
+ __ip_vs_conn_put(cp);
+ ip_vs_conn_expire((unsigned long)cp);
+}
+
/*
* Put back the conn and restart its timer with its timeout
*/
-void ip_vs_conn_put(struct ip_vs_conn *cp)
+static void __ip_vs_conn_put_timer(struct ip_vs_conn *cp)
{
unsigned long t = (cp->flags & IP_VS_CONN_F_ONE_PACKET) ?
0 : cp->timeout;
@@ -465,6 +472,16 @@ void ip_vs_conn_put(struct ip_vs_conn *cp)
__ip_vs_conn_put(cp);
}
+void ip_vs_conn_put(struct ip_vs_conn *cp)
+{
+ if ((cp->flags & IP_VS_CONN_F_ONE_PACKET) &&
+ (atomic_read(&cp->refcnt) == 1) &&
+ !timer_pending(&cp->timer))
+ /* expire connection immediately */
+ __ip_vs_conn_put_notimer(cp);
+ else
+ __ip_vs_conn_put_timer(cp);
+}
/*
* Fill a no_client_port connection with a client port number
@@ -745,7 +762,7 @@ static int expire_quiescent_template(struct netns_ipvs *ipvs,
* If available, return 1, otherwise invalidate this connection
* template and return 0.
*/
-int ip_vs_check_template(struct ip_vs_conn *ct)
+int ip_vs_check_template(struct ip_vs_conn *ct, struct ip_vs_dest *cdest)
{
struct ip_vs_dest *dest = ct->dest;
struct netns_ipvs *ipvs = ct->ipvs;
@@ -755,7 +772,8 @@ int ip_vs_check_template(struct ip_vs_conn *ct)
*/
if ((dest == NULL) ||
!(dest->flags & IP_VS_DEST_F_AVAILABLE) ||
- expire_quiescent_template(ipvs, dest)) {
+ expire_quiescent_template(ipvs, dest) ||
+ (cdest && (dest != cdest))) {
IP_VS_DBG_BUF(9, "check_template: dest not available for "
"protocol %s s:%s:%d v:%s:%d "
"-> d:%s:%d\n",
@@ -819,7 +837,8 @@ static void ip_vs_conn_expire(unsigned long data)
if (cp->control)
ip_vs_control_del(cp);
- if (cp->flags & IP_VS_CONN_F_NFCT) {
+ if ((cp->flags & IP_VS_CONN_F_NFCT) &&
+ !(cp->flags & IP_VS_CONN_F_ONE_PACKET)) {
/* Do not access conntracks during subsys cleanup
* because nf_conntrack_find_get can not be used after
* conntrack cleanup for the net.
@@ -834,7 +853,10 @@ static void ip_vs_conn_expire(unsigned long data)
ip_vs_unbind_dest(cp);
if (cp->flags & IP_VS_CONN_F_NO_CPORT)
atomic_dec(&ip_vs_conn_no_cport_cnt);
- call_rcu(&cp->rcu_head, ip_vs_conn_rcu_free);
+ if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
+ ip_vs_conn_rcu_free(&cp->rcu_head);
+ else
+ call_rcu(&cp->rcu_head, ip_vs_conn_rcu_free);
atomic_dec(&ipvs->conn_count);
return;
}
@@ -850,7 +872,7 @@ static void ip_vs_conn_expire(unsigned long data)
if (ipvs->sync_state & IP_VS_STATE_MASTER)
ip_vs_sync_conn(ipvs, cp, sysctl_sync_threshold(ipvs));
- ip_vs_conn_put(cp);
+ __ip_vs_conn_put_timer(cp);
}
/* Modify timer, so that it expires as soon as possible.
@@ -1240,6 +1262,16 @@ static inline int todrop_entry(struct ip_vs_conn *cp)
return 1;
}
+static inline bool ip_vs_conn_ops_mode(struct ip_vs_conn *cp)
+{
+ struct ip_vs_service *svc;
+
+ if (!cp->dest)
+ return false;
+ svc = rcu_dereference(cp->dest->svc);
+ return svc && (svc->flags & IP_VS_SVC_F_ONEPACKET);
+}
+
/* Called from keventd and must protect itself from softirqs */
void ip_vs_random_dropentry(struct netns_ipvs *ipvs)
{
@@ -1254,11 +1286,16 @@ void ip_vs_random_dropentry(struct netns_ipvs *ipvs)
unsigned int hash = prandom_u32() & ip_vs_conn_tab_mask;
hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) {
- if (cp->flags & IP_VS_CONN_F_TEMPLATE)
- /* connection template */
- continue;
if (cp->ipvs != ipvs)
continue;
+ if (cp->flags & IP_VS_CONN_F_TEMPLATE) {
+ if (atomic_read(&cp->n_control) ||
+ !ip_vs_conn_ops_mode(cp))
+ continue;
+ else
+ /* connection template of OPS */
+ goto try_drop;
+ }
if (cp->protocol == IPPROTO_TCP) {
switch(cp->state) {
case IP_VS_TCP_S_SYN_RECV:
@@ -1286,6 +1323,7 @@ void ip_vs_random_dropentry(struct netns_ipvs *ipvs)
continue;
}
} else {
+try_drop:
if (!todrop_entry(cp))
continue;
}
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index b9a4082af..2c1b498a7 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -68,6 +68,7 @@ EXPORT_SYMBOL(ip_vs_conn_put);
#ifdef CONFIG_IP_VS_DEBUG
EXPORT_SYMBOL(ip_vs_get_debug_level);
#endif
+EXPORT_SYMBOL(ip_vs_new_conn_out);
static int ip_vs_net_id __read_mostly;
/* netns cnt used for uniqueness */
@@ -320,7 +321,7 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
/* Check if a template already exists */
ct = ip_vs_ct_in_get(&param);
- if (!ct || !ip_vs_check_template(ct)) {
+ if (!ct || !ip_vs_check_template(ct, NULL)) {
struct ip_vs_scheduler *sched;
/*
@@ -611,7 +612,10 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
ret = cp->packet_xmit(skb, cp, pd->pp, iph);
/* do not touch skb anymore */
- atomic_inc(&cp->in_pkts);
+ if ((cp->flags & IP_VS_CONN_F_ONE_PACKET) && cp->control)
+ atomic_inc(&cp->control->in_pkts);
+ else
+ atomic_inc(&cp->in_pkts);
ip_vs_conn_put(cp);
return ret;
}
@@ -1100,6 +1104,144 @@ static inline bool is_new_conn_expected(const struct ip_vs_conn *cp,
}
}
+/* Generic function to create new connections for outgoing RS packets
+ *
+ * Pre-requisites for successful connection creation:
+ * 1) Virtual Service is NOT fwmark based:
+ * In fwmark-VS actual vaddr and vport are unknown to IPVS
+ * 2) Real Server and Virtual Service were NOT configured without port:
+ * This is to allow match of different VS to the same RS ip-addr
+ */
+struct ip_vs_conn *ip_vs_new_conn_out(struct ip_vs_service *svc,
+ struct ip_vs_dest *dest,
+ struct sk_buff *skb,
+ const struct ip_vs_iphdr *iph,
+ __be16 dport,
+ __be16 cport)
+{
+ struct ip_vs_conn_param param;
+ struct ip_vs_conn *ct = NULL, *cp = NULL;
+ const union nf_inet_addr *vaddr, *daddr, *caddr;
+ union nf_inet_addr snet;
+ __be16 vport;
+ unsigned int flags;
+
+ EnterFunction(12);
+ vaddr = &svc->addr;
+ vport = svc->port;
+ daddr = &iph->saddr;
+ caddr = &iph->daddr;
+
+ /* check pre-requisites are satisfied */
+ if (svc->fwmark)
+ return NULL;
+ if (!vport || !dport)
+ return NULL;
+
+ /* for persistent service first create connection template */
+ if (svc->flags & IP_VS_SVC_F_PERSISTENT) {
+ /* apply netmask the same way ingress-side does */
+#ifdef CONFIG_IP_VS_IPV6
+ if (svc->af == AF_INET6)
+ ipv6_addr_prefix(&snet.in6, &caddr->in6,
+ (__force __u32)svc->netmask);
+ else
+#endif
+ snet.ip = caddr->ip & svc->netmask;
+ /* fill params and create template if not existent */
+ if (ip_vs_conn_fill_param_persist(svc, skb, iph->protocol,
+ &snet, 0, vaddr,
+ vport, &param) < 0)
+ return NULL;
+ ct = ip_vs_ct_in_get(&param);
+ /* check if template exists and points to the same dest */
+ if (!ct || !ip_vs_check_template(ct, dest)) {
+ ct = ip_vs_conn_new(&param, dest->af, daddr, dport,
+ IP_VS_CONN_F_TEMPLATE, dest, 0);
+ if (!ct) {
+ kfree(param.pe_data);
+ return NULL;
+ }
+ ct->timeout = svc->timeout;
+ } else {
+ kfree(param.pe_data);
+ }
+ }
+
+ /* connection flags */
+ flags = ((svc->flags & IP_VS_SVC_F_ONEPACKET) &&
+ iph->protocol == IPPROTO_UDP) ? IP_VS_CONN_F_ONE_PACKET : 0;
+ /* create connection */
+ ip_vs_conn_fill_param(svc->ipvs, svc->af, iph->protocol,
+ caddr, cport, vaddr, vport, &param);
+ cp = ip_vs_conn_new(&param, dest->af, daddr, dport, flags, dest, 0);
+ if (!cp) {
+ if (ct)
+ ip_vs_conn_put(ct);
+ return NULL;
+ }
+ if (ct) {
+ ip_vs_control_add(cp, ct);
+ ip_vs_conn_put(ct);
+ }
+ ip_vs_conn_stats(cp, svc);
+
+ /* return connection (will be used to handle outgoing packet) */
+ IP_VS_DBG_BUF(6, "New connection RS-initiated:%c c:%s:%u v:%s:%u "
+ "d:%s:%u conn->flags:%X conn->refcnt:%d\n",
+ ip_vs_fwd_tag(cp),
+ IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport),
+ IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport),
+ IP_VS_DBG_ADDR(cp->af, &cp->daddr), ntohs(cp->dport),
+ cp->flags, atomic_read(&cp->refcnt));
+ LeaveFunction(12);
+ return cp;
+}
+
+/* Handle outgoing packets which are considered requests initiated by
+ * real servers, so that subsequent responses from external client can be
+ * routed to the right real server.
+ * Used also for outgoing responses in OPS mode.
+ *
+ * Connection management is handled by persistent-engine specific callback.
+ */
+static struct ip_vs_conn *__ip_vs_rs_conn_out(unsigned int hooknum,
+ struct netns_ipvs *ipvs,
+ int af, struct sk_buff *skb,
+ const struct ip_vs_iphdr *iph)
+{
+ struct ip_vs_dest *dest;
+ struct ip_vs_conn *cp = NULL;
+ __be16 _ports[2], *pptr;
+
+ if (hooknum == NF_INET_LOCAL_IN)
+ return NULL;
+
+ pptr = frag_safe_skb_hp(skb, iph->len,
+ sizeof(_ports), _ports, iph);
+ if (!pptr)
+ return NULL;
+
+ rcu_read_lock();
+ dest = ip_vs_find_real_service(ipvs, af, iph->protocol,
+ &iph->saddr, pptr[0]);
+ if (dest) {
+ struct ip_vs_service *svc;
+ struct ip_vs_pe *pe;
+
+ svc = rcu_dereference(dest->svc);
+ if (svc) {
+ pe = rcu_dereference(svc->pe);
+ if (pe && pe->conn_out)
+ cp = pe->conn_out(svc, dest, skb, iph,
+ pptr[0], pptr[1]);
+ }
+ }
+ rcu_read_unlock();
+
+ return cp;
+}
+
/* Handle response packets: rewrite addresses and send away...
*/
static unsigned int
@@ -1245,6 +1387,22 @@ ip_vs_out(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, in
if (likely(cp))
return handle_response(af, skb, pd, cp, &iph, hooknum);
+
+ /* Check for real-server-started requests */
+ if (atomic_read(&ipvs->conn_out_counter)) {
+ /* Currently only for UDP:
+ * connection oriented protocols typically use
+ * ephemeral ports for outgoing connections, so
+ * related incoming responses would not match any VS
+ */
+ if (pp->protocol == IPPROTO_UDP) {
+ cp = __ip_vs_rs_conn_out(hooknum, ipvs, af, skb, &iph);
+ if (likely(cp))
+ return handle_response(af, skb, pd, cp, &iph,
+ hooknum);
+ }
+ }
+
if (sysctl_nat_icmp_send(ipvs) &&
(pp->protocol == IPPROTO_TCP ||
pp->protocol == IPPROTO_UDP ||
@@ -1837,6 +1995,9 @@ ip_vs_in(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, int
if (ipvs->sync_state & IP_VS_STATE_MASTER)
ip_vs_sync_conn(ipvs, cp, pkts);
+ else if ((cp->flags & IP_VS_CONN_F_ONE_PACKET) && cp->control)
+ /* increment is done inside ip_vs_sync_conn too */
+ atomic_inc(&cp->control->in_pkts);
ip_vs_conn_put(cp);
return ret;
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 404b2a4f4..c3c809b2e 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -567,6 +567,36 @@ bool ip_vs_has_real_service(struct netns_ipvs *ipvs, int af, __u16 protocol,
return false;
}
+/* Find real service record by <proto,addr,port>.
+ * In case of multiple records with the same <proto,addr,port>, only
+ * the first found record is returned.
+ *
+ * To be called under RCU lock.
+ */
+struct ip_vs_dest *ip_vs_find_real_service(struct netns_ipvs *ipvs, int af,
+ __u16 protocol,
+ const union nf_inet_addr *daddr,
+ __be16 dport)
+{
+ unsigned int hash;
+ struct ip_vs_dest *dest;
+
+ /* Check for "full" addressed entries */
+ hash = ip_vs_rs_hashkey(af, daddr, dport);
+
+ hlist_for_each_entry_rcu(dest, &ipvs->rs_table[hash], d_list) {
+ if (dest->port == dport &&
+ dest->af == af &&
+ ip_vs_addr_equal(af, &dest->addr, daddr) &&
+ (dest->protocol == protocol || dest->vfwmark)) {
+ /* HIT */
+ return dest;
+ }
+ }
+
+ return NULL;
+}
+
/* Lookup destination by {addr,port} in the given service
* Called under RCU lock.
*/
@@ -1253,6 +1283,8 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u,
atomic_inc(&ipvs->ftpsvc_counter);
else if (svc->port == 0)
atomic_inc(&ipvs->nullsvc_counter);
+ if (svc->pe && svc->pe->conn_out)
+ atomic_inc(&ipvs->conn_out_counter);
ip_vs_start_estimator(ipvs, &svc->stats);
@@ -1293,6 +1325,7 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u)
struct ip_vs_scheduler *sched = NULL, *old_sched;
struct ip_vs_pe *pe = NULL, *old_pe = NULL;
int ret = 0;
+ bool new_pe_conn_out, old_pe_conn_out;
/*
* Lookup the scheduler, by 'u->sched_name'
@@ -1355,8 +1388,16 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u)
svc->netmask = u->netmask;
old_pe = rcu_dereference_protected(svc->pe, 1);
- if (pe != old_pe)
+ if (pe != old_pe) {
rcu_assign_pointer(svc->pe, pe);
+ /* check for optional methods in new pe */
+ new_pe_conn_out = (pe && pe->conn_out) ? true : false;
+ old_pe_conn_out = (old_pe && old_pe->conn_out) ? true : false;
+ if (new_pe_conn_out && !old_pe_conn_out)
+ atomic_inc(&svc->ipvs->conn_out_counter);
+ if (old_pe_conn_out && !new_pe_conn_out)
+ atomic_dec(&svc->ipvs->conn_out_counter);
+ }
out:
ip_vs_scheduler_put(old_sched);
@@ -1389,6 +1430,8 @@ static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup)
/* Unbind persistence engine, keep svc->pe */
old_pe = rcu_dereference_protected(svc->pe, 1);
+ if (old_pe && old_pe->conn_out)
+ atomic_dec(&ipvs->conn_out_counter);
ip_vs_pe_put(old_pe);
/*
@@ -2875,8 +2918,10 @@ static int ip_vs_genl_fill_stats(struct sk_buff *skb, int container_type,
if (nla_put_u32(skb, IPVS_STATS_ATTR_CONNS, (u32)kstats->conns) ||
nla_put_u32(skb, IPVS_STATS_ATTR_INPKTS, (u32)kstats->inpkts) ||
nla_put_u32(skb, IPVS_STATS_ATTR_OUTPKTS, (u32)kstats->outpkts) ||
- nla_put_u64(skb, IPVS_STATS_ATTR_INBYTES, kstats->inbytes) ||
- nla_put_u64(skb, IPVS_STATS_ATTR_OUTBYTES, kstats->outbytes) ||
+ nla_put_u64_64bit(skb, IPVS_STATS_ATTR_INBYTES, kstats->inbytes,
+ IPVS_STATS_ATTR_PAD) ||
+ nla_put_u64_64bit(skb, IPVS_STATS_ATTR_OUTBYTES, kstats->outbytes,
+ IPVS_STATS_ATTR_PAD) ||
nla_put_u32(skb, IPVS_STATS_ATTR_CPS, (u32)kstats->cps) ||
nla_put_u32(skb, IPVS_STATS_ATTR_INPPS, (u32)kstats->inpps) ||
nla_put_u32(skb, IPVS_STATS_ATTR_OUTPPS, (u32)kstats->outpps) ||
@@ -2900,16 +2945,26 @@ static int ip_vs_genl_fill_stats64(struct sk_buff *skb, int container_type,
if (!nl_stats)
return -EMSGSIZE;
- if (nla_put_u64(skb, IPVS_STATS_ATTR_CONNS, kstats->conns) ||
- nla_put_u64(skb, IPVS_STATS_ATTR_INPKTS, kstats->inpkts) ||
- nla_put_u64(skb, IPVS_STATS_ATTR_OUTPKTS, kstats->outpkts) ||
- nla_put_u64(skb, IPVS_STATS_ATTR_INBYTES, kstats->inbytes) ||
- nla_put_u64(skb, IPVS_STATS_ATTR_OUTBYTES, kstats->outbytes) ||
- nla_put_u64(skb, IPVS_STATS_ATTR_CPS, kstats->cps) ||
- nla_put_u64(skb, IPVS_STATS_ATTR_INPPS, kstats->inpps) ||
- nla_put_u64(skb, IPVS_STATS_ATTR_OUTPPS, kstats->outpps) ||
- nla_put_u64(skb, IPVS_STATS_ATTR_INBPS, kstats->inbps) ||
- nla_put_u64(skb, IPVS_STATS_ATTR_OUTBPS, kstats->outbps))
+ if (nla_put_u64_64bit(skb, IPVS_STATS_ATTR_CONNS, kstats->conns,
+ IPVS_STATS_ATTR_PAD) ||
+ nla_put_u64_64bit(skb, IPVS_STATS_ATTR_INPKTS, kstats->inpkts,
+ IPVS_STATS_ATTR_PAD) ||
+ nla_put_u64_64bit(skb, IPVS_STATS_ATTR_OUTPKTS, kstats->outpkts,
+ IPVS_STATS_ATTR_PAD) ||
+ nla_put_u64_64bit(skb, IPVS_STATS_ATTR_INBYTES, kstats->inbytes,
+ IPVS_STATS_ATTR_PAD) ||
+ nla_put_u64_64bit(skb, IPVS_STATS_ATTR_OUTBYTES, kstats->outbytes,
+ IPVS_STATS_ATTR_PAD) ||
+ nla_put_u64_64bit(skb, IPVS_STATS_ATTR_CPS, kstats->cps,
+ IPVS_STATS_ATTR_PAD) ||
+ nla_put_u64_64bit(skb, IPVS_STATS_ATTR_INPPS, kstats->inpps,
+ IPVS_STATS_ATTR_PAD) ||
+ nla_put_u64_64bit(skb, IPVS_STATS_ATTR_OUTPPS, kstats->outpps,
+ IPVS_STATS_ATTR_PAD) ||
+ nla_put_u64_64bit(skb, IPVS_STATS_ATTR_INBPS, kstats->inbps,
+ IPVS_STATS_ATTR_PAD) ||
+ nla_put_u64_64bit(skb, IPVS_STATS_ATTR_OUTBPS, kstats->outbps,
+ IPVS_STATS_ATTR_PAD))
goto nla_put_failure;
nla_nest_end(skb, nl_stats);
@@ -3957,6 +4012,7 @@ int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs)
(unsigned long) ipvs);
atomic_set(&ipvs->ftpsvc_counter, 0);
atomic_set(&ipvs->nullsvc_counter, 0);
+ atomic_set(&ipvs->conn_out_counter, 0);
/* procfs stats */
ipvs->tot_stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats);
diff --git a/net/netfilter/ipvs/ip_vs_nfct.c b/net/netfilter/ipvs/ip_vs_nfct.c
index 30434fb13..f04fd8df2 100644
--- a/net/netfilter/ipvs/ip_vs_nfct.c
+++ b/net/netfilter/ipvs/ip_vs_nfct.c
@@ -93,6 +93,10 @@ ip_vs_update_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp, int outin)
if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
return;
+ /* Never alter conntrack for OPS conns (no reply is expected) */
+ if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
+ return;
+
/* Alter reply only in original direction */
if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
return;
diff --git a/net/netfilter/ipvs/ip_vs_pe_sip.c b/net/netfilter/ipvs/ip_vs_pe_sip.c
index 0a6eb5c0d..d07ef9e31 100644
--- a/net/netfilter/ipvs/ip_vs_pe_sip.c
+++ b/net/netfilter/ipvs/ip_vs_pe_sip.c
@@ -143,6 +143,20 @@ static int ip_vs_sip_show_pe_data(const struct ip_vs_conn *cp, char *buf)
return cp->pe_data_len;
}
+static struct ip_vs_conn *
+ip_vs_sip_conn_out(struct ip_vs_service *svc,
+ struct ip_vs_dest *dest,
+ struct sk_buff *skb,
+ const struct ip_vs_iphdr *iph,
+ __be16 dport,
+ __be16 cport)
+{
+ if (likely(iph->protocol == IPPROTO_UDP))
+ return ip_vs_new_conn_out(svc, dest, skb, iph, dport, cport);
+ /* currently no need to handle other than UDP */
+ return NULL;
+}
+
static struct ip_vs_pe ip_vs_sip_pe =
{
.name = "sip",
@@ -153,6 +167,7 @@ static struct ip_vs_pe ip_vs_sip_pe =
.ct_match = ip_vs_sip_ct_match,
.hashkey_raw = ip_vs_sip_hashkey_raw,
.show_pe_data = ip_vs_sip_show_pe_data,
+ .conn_out = ip_vs_sip_conn_out,
};
static int __init ip_vs_sip_init(void)
diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c
index 803001a45..1b07578be 100644
--- a/net/netfilter/ipvs/ip_vs_sync.c
+++ b/net/netfilter/ipvs/ip_vs_sync.c
@@ -1545,7 +1545,8 @@ error:
/*
* Set up receiving multicast socket over UDP
*/
-static struct socket *make_receive_sock(struct netns_ipvs *ipvs, int id)
+static struct socket *make_receive_sock(struct netns_ipvs *ipvs, int id,
+ int ifindex)
{
/* multicast addr */
union ipvs_sockaddr mcast_addr;
@@ -1566,6 +1567,7 @@ static struct socket *make_receive_sock(struct netns_ipvs *ipvs, int id)
set_sock_size(sock->sk, 0, result);
get_mcast_sockaddr(&mcast_addr, &salen, &ipvs->bcfg, id);
+ sock->sk->sk_bound_dev_if = ifindex;
result = sock->ops->bind(sock, (struct sockaddr *)&mcast_addr, salen);
if (result < 0) {
pr_err("Error binding to the multicast addr\n");
@@ -1868,7 +1870,7 @@ int start_sync_thread(struct netns_ipvs *ipvs, struct ipvs_sync_daemon_cfg *c,
if (state == IP_VS_STATE_MASTER)
sock = make_send_sock(ipvs, id);
else
- sock = make_receive_sock(ipvs, id);
+ sock = make_receive_sock(ipvs, id, dev->ifindex);
if (IS_ERR(sock)) {
result = PTR_ERR(sock);
goto outtinfo;
diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index dc196a0f5..01d3d894d 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -932,17 +932,14 @@ error:
static inline int __tun_gso_type_mask(int encaps_af, int orig_af)
{
- if (encaps_af == AF_INET) {
- if (orig_af == AF_INET)
- return SKB_GSO_IPIP;
-
- return SKB_GSO_SIT;
+ switch (encaps_af) {
+ case AF_INET:
+ return SKB_GSO_IPXIP4;
+ case AF_INET6:
+ return SKB_GSO_IPXIP6;
+ default:
+ return 0;
}
-
- /* GSO: we need to provide proper SKB_GSO_ value for IPv6:
- * SKB_GSO_SIT/IPV6
- */
- return 0;
}
/*
@@ -1013,8 +1010,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
if (IS_ERR(skb))
goto tx_error;
- skb = iptunnel_handle_offloads(skb, __tun_gso_type_mask(AF_INET, cp->af));
- if (IS_ERR(skb))
+ if (iptunnel_handle_offloads(skb, __tun_gso_type_mask(AF_INET, cp->af)))
goto tx_error;
skb->transport_header = skb->network_header;
@@ -1105,8 +1101,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
if (IS_ERR(skb))
goto tx_error;
- skb = iptunnel_handle_offloads(skb, __tun_gso_type_mask(AF_INET6, cp->af));
- if (IS_ERR(skb))
+ if (iptunnel_handle_offloads(skb, __tun_gso_type_mask(AF_INET6, cp->af)))
goto tx_error;
skb->transport_header = skb->network_header;
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index e27fd17c6..9f530adad 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -12,6 +12,8 @@
* published by the Free Software Foundation.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/types.h>
#include <linux/netfilter.h>
#include <linux/module.h>
@@ -52,6 +54,7 @@
#include <net/netfilter/nf_nat.h>
#include <net/netfilter/nf_nat_core.h>
#include <net/netfilter/nf_nat_helper.h>
+#include <net/netns/hash.h>
#define NF_CONNTRACK_VERSION "0.5.0"
@@ -66,6 +69,12 @@ EXPORT_SYMBOL_GPL(nf_conntrack_locks);
__cacheline_aligned_in_smp DEFINE_SPINLOCK(nf_conntrack_expect_lock);
EXPORT_SYMBOL_GPL(nf_conntrack_expect_lock);
+struct hlist_nulls_head *nf_conntrack_hash __read_mostly;
+EXPORT_SYMBOL_GPL(nf_conntrack_hash);
+
+static __read_mostly struct kmem_cache *nf_conntrack_cachep;
+static __read_mostly spinlock_t nf_conntrack_locks_all_lock;
+static __read_mostly seqcount_t nf_conntrack_generation;
static __read_mostly DEFINE_SPINLOCK(nf_conntrack_locks_all_lock);
static __read_mostly bool nf_conntrack_locks_all;
@@ -105,7 +114,7 @@ static bool nf_conntrack_double_lock(struct net *net, unsigned int h1,
spin_lock_nested(&nf_conntrack_locks[h1],
SINGLE_DEPTH_NESTING);
}
- if (read_seqcount_retry(&net->ct.generation, sequence)) {
+ if (read_seqcount_retry(&nf_conntrack_generation, sequence)) {
nf_conntrack_double_unlock(h1, h2);
return true;
}
@@ -139,43 +148,43 @@ EXPORT_SYMBOL_GPL(nf_conntrack_max);
DEFINE_PER_CPU(struct nf_conn, nf_conntrack_untracked);
EXPORT_PER_CPU_SYMBOL(nf_conntrack_untracked);
-unsigned int nf_conntrack_hash_rnd __read_mostly;
-EXPORT_SYMBOL_GPL(nf_conntrack_hash_rnd);
+static unsigned int nf_conntrack_hash_rnd __read_mostly;
-static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple)
+static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple,
+ const struct net *net)
{
unsigned int n;
+ u32 seed;
+
+ get_random_once(&nf_conntrack_hash_rnd, sizeof(nf_conntrack_hash_rnd));
/* The direction must be ignored, so we hash everything up to the
* destination ports (which is a multiple of 4) and treat the last
* three bytes manually.
*/
+ seed = nf_conntrack_hash_rnd ^ net_hash_mix(net);
n = (sizeof(tuple->src) + sizeof(tuple->dst.u3)) / sizeof(u32);
- return jhash2((u32 *)tuple, n, nf_conntrack_hash_rnd ^
+ return jhash2((u32 *)tuple, n, seed ^
(((__force __u16)tuple->dst.u.all << 16) |
tuple->dst.protonum));
}
-static u32 __hash_bucket(u32 hash, unsigned int size)
-{
- return reciprocal_scale(hash, size);
-}
-
-static u32 hash_bucket(u32 hash, const struct net *net)
+static u32 scale_hash(u32 hash)
{
- return __hash_bucket(hash, net->ct.htable_size);
+ return reciprocal_scale(hash, nf_conntrack_htable_size);
}
-static u_int32_t __hash_conntrack(const struct nf_conntrack_tuple *tuple,
- unsigned int size)
+static u32 __hash_conntrack(const struct net *net,
+ const struct nf_conntrack_tuple *tuple,
+ unsigned int size)
{
- return __hash_bucket(hash_conntrack_raw(tuple), size);
+ return reciprocal_scale(hash_conntrack_raw(tuple, net), size);
}
-static inline u_int32_t hash_conntrack(const struct net *net,
- const struct nf_conntrack_tuple *tuple)
+static u32 hash_conntrack(const struct net *net,
+ const struct nf_conntrack_tuple *tuple)
{
- return __hash_conntrack(tuple, net->ct.htable_size);
+ return scale_hash(hash_conntrack_raw(tuple, net));
}
bool
@@ -356,7 +365,7 @@ destroy_conntrack(struct nf_conntrack *nfct)
}
rcu_read_lock();
l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
- if (l4proto && l4proto->destroy)
+ if (l4proto->destroy)
l4proto->destroy(ct);
rcu_read_unlock();
@@ -391,7 +400,7 @@ static void nf_ct_delete_from_lists(struct nf_conn *ct)
local_bh_disable();
do {
- sequence = read_seqcount_begin(&net->ct.generation);
+ sequence = read_seqcount_begin(&nf_conntrack_generation);
hash = hash_conntrack(net,
&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
reply_hash = hash_conntrack(net,
@@ -443,7 +452,8 @@ static void death_by_timeout(unsigned long ul_conntrack)
static inline bool
nf_ct_key_equal(struct nf_conntrack_tuple_hash *h,
const struct nf_conntrack_tuple *tuple,
- const struct nf_conntrack_zone *zone)
+ const struct nf_conntrack_zone *zone,
+ const struct net *net)
{
struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
@@ -452,7 +462,8 @@ nf_ct_key_equal(struct nf_conntrack_tuple_hash *h,
*/
return nf_ct_tuple_equal(tuple, &h->tuple) &&
nf_ct_zone_equal(ct, zone, NF_CT_DIRECTION(h)) &&
- nf_ct_is_confirmed(ct);
+ nf_ct_is_confirmed(ct) &&
+ net_eq(net, nf_ct_net(ct));
}
/*
@@ -465,21 +476,23 @@ ____nf_conntrack_find(struct net *net, const struct nf_conntrack_zone *zone,
const struct nf_conntrack_tuple *tuple, u32 hash)
{
struct nf_conntrack_tuple_hash *h;
+ struct hlist_nulls_head *ct_hash;
struct hlist_nulls_node *n;
- unsigned int bucket = hash_bucket(hash, net);
+ unsigned int bucket, sequence;
- /* Disable BHs the entire time since we normally need to disable them
- * at least once for the stats anyway.
- */
- local_bh_disable();
begin:
- hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[bucket], hnnode) {
- if (nf_ct_key_equal(h, tuple, zone)) {
- NF_CT_STAT_INC(net, found);
- local_bh_enable();
+ do {
+ sequence = read_seqcount_begin(&nf_conntrack_generation);
+ bucket = scale_hash(hash);
+ ct_hash = nf_conntrack_hash;
+ } while (read_seqcount_retry(&nf_conntrack_generation, sequence));
+
+ hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[bucket], hnnode) {
+ if (nf_ct_key_equal(h, tuple, zone, net)) {
+ NF_CT_STAT_INC_ATOMIC(net, found);
return h;
}
- NF_CT_STAT_INC(net, searched);
+ NF_CT_STAT_INC_ATOMIC(net, searched);
}
/*
* if the nulls value we got at the end of this lookup is
@@ -487,10 +500,9 @@ begin:
* We probably met an item that was moved to another chain.
*/
if (get_nulls_value(n) != bucket) {
- NF_CT_STAT_INC(net, search_restart);
+ NF_CT_STAT_INC_ATOMIC(net, search_restart);
goto begin;
}
- local_bh_enable();
return NULL;
}
@@ -512,7 +524,7 @@ begin:
!atomic_inc_not_zero(&ct->ct_general.use)))
h = NULL;
else {
- if (unlikely(!nf_ct_key_equal(h, tuple, zone))) {
+ if (unlikely(!nf_ct_key_equal(h, tuple, zone, net))) {
nf_ct_put(ct);
goto begin;
}
@@ -528,7 +540,7 @@ nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone,
const struct nf_conntrack_tuple *tuple)
{
return __nf_conntrack_find_get(net, zone, tuple,
- hash_conntrack_raw(tuple));
+ hash_conntrack_raw(tuple, net));
}
EXPORT_SYMBOL_GPL(nf_conntrack_find_get);
@@ -536,12 +548,10 @@ static void __nf_conntrack_hash_insert(struct nf_conn *ct,
unsigned int hash,
unsigned int reply_hash)
{
- struct net *net = nf_ct_net(ct);
-
hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
- &net->ct.hash[hash]);
+ &nf_conntrack_hash[hash]);
hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode,
- &net->ct.hash[reply_hash]);
+ &nf_conntrack_hash[reply_hash]);
}
int
@@ -558,7 +568,7 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct)
local_bh_disable();
do {
- sequence = read_seqcount_begin(&net->ct.generation);
+ sequence = read_seqcount_begin(&nf_conntrack_generation);
hash = hash_conntrack(net,
&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
reply_hash = hash_conntrack(net,
@@ -566,17 +576,14 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct)
} while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));
/* See if there's one in the list already, including reverse */
- hlist_nulls_for_each_entry(h, n, &net->ct.hash[hash], hnnode)
- if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
- &h->tuple) &&
- nf_ct_zone_equal(nf_ct_tuplehash_to_ctrack(h), zone,
- NF_CT_DIRECTION(h)))
+ hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode)
+ if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
+ zone, net))
goto out;
- hlist_nulls_for_each_entry(h, n, &net->ct.hash[reply_hash], hnnode)
- if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple,
- &h->tuple) &&
- nf_ct_zone_equal(nf_ct_tuplehash_to_ctrack(h), zone,
- NF_CT_DIRECTION(h)))
+
+ hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode)
+ if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
+ zone, net))
goto out;
add_timer(&ct->timeout);
@@ -597,6 +604,63 @@ out:
}
EXPORT_SYMBOL_GPL(nf_conntrack_hash_check_insert);
+static inline void nf_ct_acct_update(struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ unsigned int len)
+{
+ struct nf_conn_acct *acct;
+
+ acct = nf_conn_acct_find(ct);
+ if (acct) {
+ struct nf_conn_counter *counter = acct->counter;
+
+ atomic64_inc(&counter[CTINFO2DIR(ctinfo)].packets);
+ atomic64_add(len, &counter[CTINFO2DIR(ctinfo)].bytes);
+ }
+}
+
+static void nf_ct_acct_merge(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
+ const struct nf_conn *loser_ct)
+{
+ struct nf_conn_acct *acct;
+
+ acct = nf_conn_acct_find(loser_ct);
+ if (acct) {
+ struct nf_conn_counter *counter = acct->counter;
+ unsigned int bytes;
+
+ /* u32 should be fine since we must have seen one packet. */
+ bytes = atomic64_read(&counter[CTINFO2DIR(ctinfo)].bytes);
+ nf_ct_acct_update(ct, ctinfo, bytes);
+ }
+}
+
+/* Resolve race on insertion if this protocol allows this. */
+static int nf_ct_resolve_clash(struct net *net, struct sk_buff *skb,
+ enum ip_conntrack_info ctinfo,
+ struct nf_conntrack_tuple_hash *h)
+{
+ /* This is the conntrack entry already in hashes that won race. */
+ struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
+ struct nf_conntrack_l4proto *l4proto;
+
+ l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
+ if (l4proto->allow_clash &&
+ !nfct_nat(ct) &&
+ !nf_ct_is_dying(ct) &&
+ atomic_inc_not_zero(&ct->ct_general.use)) {
+ nf_ct_acct_merge(ct, ctinfo, (struct nf_conn *)skb->nfct);
+ nf_conntrack_put(skb->nfct);
+ /* Assign conntrack already in hashes to this skbuff. Don't
+ * modify skb->nfctinfo to ensure consistent stateful filtering.
+ */
+ skb->nfct = &ct->ct_general;
+ return NF_ACCEPT;
+ }
+ NF_CT_STAT_INC(net, drop);
+ return NF_DROP;
+}
+
/* Confirm a connection given skb; places it in hash table */
int
__nf_conntrack_confirm(struct sk_buff *skb)
@@ -611,6 +675,7 @@ __nf_conntrack_confirm(struct sk_buff *skb)
enum ip_conntrack_info ctinfo;
struct net *net;
unsigned int sequence;
+ int ret = NF_DROP;
ct = nf_ct_get(skb, &ctinfo);
net = nf_ct_net(ct);
@@ -626,10 +691,10 @@ __nf_conntrack_confirm(struct sk_buff *skb)
local_bh_disable();
do {
- sequence = read_seqcount_begin(&net->ct.generation);
+ sequence = read_seqcount_begin(&nf_conntrack_generation);
/* reuse the hash saved before */
hash = *(unsigned long *)&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev;
- hash = hash_bucket(hash, net);
+ hash = scale_hash(hash);
reply_hash = hash_conntrack(net,
&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
@@ -653,23 +718,22 @@ __nf_conntrack_confirm(struct sk_buff *skb)
*/
nf_ct_del_from_dying_or_unconfirmed_list(ct);
- if (unlikely(nf_ct_is_dying(ct)))
- goto out;
+ if (unlikely(nf_ct_is_dying(ct))) {
+ nf_ct_add_to_dying_list(ct);
+ goto dying;
+ }
/* See if there's one in the list already, including reverse:
NAT could have grabbed it without realizing, since we're
not in the hash. If there is, we lost race. */
- hlist_nulls_for_each_entry(h, n, &net->ct.hash[hash], hnnode)
- if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
- &h->tuple) &&
- nf_ct_zone_equal(nf_ct_tuplehash_to_ctrack(h), zone,
- NF_CT_DIRECTION(h)))
+ hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode)
+ if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
+ zone, net))
goto out;
- hlist_nulls_for_each_entry(h, n, &net->ct.hash[reply_hash], hnnode)
- if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple,
- &h->tuple) &&
- nf_ct_zone_equal(nf_ct_tuplehash_to_ctrack(h), zone,
- NF_CT_DIRECTION(h)))
+
+ hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode)
+ if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
+ zone, net))
goto out;
/* Timer relative to confirmation time, not original
@@ -708,10 +772,12 @@ __nf_conntrack_confirm(struct sk_buff *skb)
out:
nf_ct_add_to_dying_list(ct);
+ ret = nf_ct_resolve_clash(net, skb, ctinfo, h);
+dying:
nf_conntrack_double_unlock(hash, reply_hash);
NF_CT_STAT_INC(net, insert_failed);
local_bh_enable();
- return NF_DROP;
+ return ret;
}
EXPORT_SYMBOL_GPL(__nf_conntrack_confirm);
@@ -724,29 +790,31 @@ nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple,
struct net *net = nf_ct_net(ignored_conntrack);
const struct nf_conntrack_zone *zone;
struct nf_conntrack_tuple_hash *h;
+ struct hlist_nulls_head *ct_hash;
+ unsigned int hash, sequence;
struct hlist_nulls_node *n;
struct nf_conn *ct;
- unsigned int hash;
zone = nf_ct_zone(ignored_conntrack);
- hash = hash_conntrack(net, tuple);
- /* Disable BHs the entire time since we need to disable them at
- * least once for the stats anyway.
- */
- rcu_read_lock_bh();
- hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[hash], hnnode) {
+ rcu_read_lock();
+ do {
+ sequence = read_seqcount_begin(&nf_conntrack_generation);
+ hash = hash_conntrack(net, tuple);
+ ct_hash = nf_conntrack_hash;
+ } while (read_seqcount_retry(&nf_conntrack_generation, sequence));
+
+ hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[hash], hnnode) {
ct = nf_ct_tuplehash_to_ctrack(h);
if (ct != ignored_conntrack &&
- nf_ct_tuple_equal(tuple, &h->tuple) &&
- nf_ct_zone_equal(ct, zone, NF_CT_DIRECTION(h))) {
- NF_CT_STAT_INC(net, found);
- rcu_read_unlock_bh();
+ nf_ct_key_equal(h, tuple, zone, net)) {
+ NF_CT_STAT_INC_ATOMIC(net, found);
+ rcu_read_unlock();
return 1;
}
- NF_CT_STAT_INC(net, searched);
+ NF_CT_STAT_INC_ATOMIC(net, searched);
}
- rcu_read_unlock_bh();
+ rcu_read_unlock();
return 0;
}
@@ -760,71 +828,63 @@ static noinline int early_drop(struct net *net, unsigned int _hash)
{
/* Use oldest entry, which is roughly LRU */
struct nf_conntrack_tuple_hash *h;
- struct nf_conn *ct = NULL, *tmp;
+ struct nf_conn *tmp;
struct hlist_nulls_node *n;
- unsigned int i = 0, cnt = 0;
- int dropped = 0;
- unsigned int hash, sequence;
+ unsigned int i, hash, sequence;
+ struct nf_conn *ct = NULL;
spinlock_t *lockp;
+ bool ret = false;
+
+ i = 0;
local_bh_disable();
restart:
- sequence = read_seqcount_begin(&net->ct.generation);
- hash = hash_bucket(_hash, net);
- for (; i < net->ct.htable_size; i++) {
+ sequence = read_seqcount_begin(&nf_conntrack_generation);
+ for (; i < NF_CT_EVICTION_RANGE; i++) {
+ hash = scale_hash(_hash++);
lockp = &nf_conntrack_locks[hash % CONNTRACK_LOCKS];
nf_conntrack_lock(lockp);
- if (read_seqcount_retry(&net->ct.generation, sequence)) {
+ if (read_seqcount_retry(&nf_conntrack_generation, sequence)) {
spin_unlock(lockp);
goto restart;
}
- hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[hash],
- hnnode) {
+ hlist_nulls_for_each_entry_rcu(h, n, &nf_conntrack_hash[hash],
+ hnnode) {
tmp = nf_ct_tuplehash_to_ctrack(h);
- if (!test_bit(IPS_ASSURED_BIT, &tmp->status) &&
- !nf_ct_is_dying(tmp) &&
- atomic_inc_not_zero(&tmp->ct_general.use)) {
+
+ if (test_bit(IPS_ASSURED_BIT, &tmp->status) ||
+ !net_eq(nf_ct_net(tmp), net) ||
+ nf_ct_is_dying(tmp))
+ continue;
+
+ if (atomic_inc_not_zero(&tmp->ct_general.use)) {
ct = tmp;
break;
}
- cnt++;
}
- hash = (hash + 1) % net->ct.htable_size;
spin_unlock(lockp);
-
- if (ct || cnt >= NF_CT_EVICTION_RANGE)
+ if (ct)
break;
-
}
+
local_bh_enable();
if (!ct)
- return dropped;
+ return false;
- if (del_timer(&ct->timeout)) {
+ /* kill only if in same netns -- might have moved due to
+ * SLAB_DESTROY_BY_RCU rules
+ */
+ if (net_eq(nf_ct_net(ct), net) && del_timer(&ct->timeout)) {
if (nf_ct_delete(ct, 0, 0)) {
- dropped = 1;
NF_CT_STAT_INC_ATOMIC(net, early_drop);
+ ret = true;
}
}
- nf_ct_put(ct);
- return dropped;
-}
-void init_nf_conntrack_hash_rnd(void)
-{
- unsigned int rand;
-
- /*
- * Why not initialize nf_conntrack_rnd in a "init()" function ?
- * Because there isn't enough entropy when system initializing,
- * and we initialize it as late as possible.
- */
- do {
- get_random_bytes(&rand, sizeof(rand));
- } while (!rand);
- cmpxchg(&nf_conntrack_hash_rnd, 0, rand);
+ nf_ct_put(ct);
+ return ret;
}
static struct nf_conn *
@@ -836,12 +896,6 @@ __nf_conntrack_alloc(struct net *net,
{
struct nf_conn *ct;
- if (unlikely(!nf_conntrack_hash_rnd)) {
- init_nf_conntrack_hash_rnd();
- /* recompute the hash as nf_conntrack_hash_rnd is initialized */
- hash = hash_conntrack_raw(orig);
- }
-
/* We don't want any race condition at early drop stage */
atomic_inc(&net->ct.count);
@@ -858,7 +912,7 @@ __nf_conntrack_alloc(struct net *net,
* Do not use kmem_cache_zalloc(), as this cache uses
* SLAB_DESTROY_BY_RCU.
*/
- ct = kmem_cache_alloc(net->ct.nf_conntrack_cachep, gfp);
+ ct = kmem_cache_alloc(nf_conntrack_cachep, gfp);
if (ct == NULL)
goto out;
@@ -885,7 +939,7 @@ __nf_conntrack_alloc(struct net *net,
atomic_set(&ct->ct_general.use, 0);
return ct;
out_free:
- kmem_cache_free(net->ct.nf_conntrack_cachep, ct);
+ kmem_cache_free(nf_conntrack_cachep, ct);
out:
atomic_dec(&net->ct.count);
return ERR_PTR(-ENOMEM);
@@ -912,7 +966,7 @@ void nf_conntrack_free(struct nf_conn *ct)
nf_ct_ext_destroy(ct);
nf_ct_ext_free(ct);
- kmem_cache_free(net->ct.nf_conntrack_cachep, ct);
+ kmem_cache_free(nf_conntrack_cachep, ct);
smp_mb__before_atomic();
atomic_dec(&net->ct.count);
}
@@ -966,7 +1020,7 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,
if (!l4proto->new(ct, skb, dataoff, timeouts)) {
nf_conntrack_free(ct);
- pr_debug("init conntrack: can't track with proto module\n");
+ pr_debug("can't track with proto module\n");
return NULL;
}
@@ -988,7 +1042,7 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,
spin_lock(&nf_conntrack_expect_lock);
exp = nf_ct_find_expectation(net, zone, tuple);
if (exp) {
- pr_debug("conntrack: expectation arrives ct=%p exp=%p\n",
+ pr_debug("expectation arrives ct=%p exp=%p\n",
ct, exp);
/* Welcome, Mr. Bond. We've been expecting you... */
__set_bit(IPS_EXPECTED_BIT, &ct->status);
@@ -1053,13 +1107,13 @@ resolve_normal_ct(struct net *net, struct nf_conn *tmpl,
if (!nf_ct_get_tuple(skb, skb_network_offset(skb),
dataoff, l3num, protonum, net, &tuple, l3proto,
l4proto)) {
- pr_debug("resolve_normal_ct: Can't get tuple\n");
+ pr_debug("Can't get tuple\n");
return NULL;
}
/* look for tuple match */
zone = nf_ct_zone_tmpl(tmpl, skb, &tmp);
- hash = hash_conntrack_raw(&tuple);
+ hash = hash_conntrack_raw(&tuple, net);
h = __nf_conntrack_find_get(net, zone, &tuple, hash);
if (!h) {
h = init_conntrack(net, tmpl, &tuple, l3proto, l4proto,
@@ -1079,14 +1133,13 @@ resolve_normal_ct(struct net *net, struct nf_conn *tmpl,
} else {
/* Once we've had two way comms, always ESTABLISHED. */
if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
- pr_debug("nf_conntrack_in: normal packet for %p\n", ct);
+ pr_debug("normal packet for %p\n", ct);
*ctinfo = IP_CT_ESTABLISHED;
} else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) {
- pr_debug("nf_conntrack_in: related packet for %p\n",
- ct);
+ pr_debug("related packet for %p\n", ct);
*ctinfo = IP_CT_RELATED;
} else {
- pr_debug("nf_conntrack_in: new packet for %p\n", ct);
+ pr_debug("new packet for %p\n", ct);
*ctinfo = IP_CT_NEW;
}
*set_reply = 0;
@@ -1269,17 +1322,8 @@ void __nf_ct_refresh_acct(struct nf_conn *ct,
}
acct:
- if (do_acct) {
- struct nf_conn_acct *acct;
-
- acct = nf_conn_acct_find(ct);
- if (acct) {
- struct nf_conn_counter *counter = acct->counter;
-
- atomic64_inc(&counter[CTINFO2DIR(ctinfo)].packets);
- atomic64_add(skb->len, &counter[CTINFO2DIR(ctinfo)].bytes);
- }
- }
+ if (do_acct)
+ nf_ct_acct_update(ct, ctinfo, skb->len);
}
EXPORT_SYMBOL_GPL(__nf_ct_refresh_acct);
@@ -1288,18 +1332,8 @@ bool __nf_ct_kill_acct(struct nf_conn *ct,
const struct sk_buff *skb,
int do_acct)
{
- if (do_acct) {
- struct nf_conn_acct *acct;
-
- acct = nf_conn_acct_find(ct);
- if (acct) {
- struct nf_conn_counter *counter = acct->counter;
-
- atomic64_inc(&counter[CTINFO2DIR(ctinfo)].packets);
- atomic64_add(skb->len - skb_network_offset(skb),
- &counter[CTINFO2DIR(ctinfo)].bytes);
- }
- }
+ if (do_acct)
+ nf_ct_acct_update(ct, ctinfo, skb->len);
if (del_timer(&ct->timeout)) {
ct->timeout.function((unsigned long)ct);
@@ -1395,16 +1429,17 @@ get_next_corpse(struct net *net, int (*iter)(struct nf_conn *i, void *data),
int cpu;
spinlock_t *lockp;
- for (; *bucket < net->ct.htable_size; (*bucket)++) {
+ for (; *bucket < nf_conntrack_htable_size; (*bucket)++) {
lockp = &nf_conntrack_locks[*bucket % CONNTRACK_LOCKS];
local_bh_disable();
nf_conntrack_lock(lockp);
- if (*bucket < net->ct.htable_size) {
- hlist_nulls_for_each_entry(h, n, &net->ct.hash[*bucket], hnnode) {
+ if (*bucket < nf_conntrack_htable_size) {
+ hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[*bucket], hnnode) {
if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL)
continue;
ct = nf_ct_tuplehash_to_ctrack(h);
- if (iter(ct, data))
+ if (net_eq(nf_ct_net(ct), net) &&
+ iter(ct, data))
goto found;
}
}
@@ -1442,6 +1477,9 @@ void nf_ct_iterate_cleanup(struct net *net,
might_sleep();
+ if (atomic_read(&net->ct.count) == 0)
+ return;
+
while ((ct = get_next_corpse(net, iter, data, &bucket)) != NULL) {
/* Time to push up daises... */
if (del_timer(&ct->timeout))
@@ -1493,6 +1531,8 @@ void nf_conntrack_cleanup_end(void)
while (untrack_refs() > 0)
schedule();
+ nf_ct_free_hashtable(nf_conntrack_hash, nf_conntrack_htable_size);
+
#ifdef CONFIG_NF_CONNTRACK_ZONES
nf_ct_extend_unregister(&nf_ct_zone_extend);
#endif
@@ -1505,6 +1545,8 @@ void nf_conntrack_cleanup_end(void)
nf_conntrack_tstamp_fini();
nf_conntrack_acct_fini();
nf_conntrack_expect_fini();
+
+ kmem_cache_destroy(nf_conntrack_cachep);
}
/*
@@ -1543,15 +1585,12 @@ i_see_dead_people:
}
list_for_each_entry(net, net_exit_list, exit_list) {
- nf_ct_free_hashtable(net->ct.hash, net->ct.htable_size);
nf_conntrack_proto_pernet_fini(net);
nf_conntrack_helper_pernet_fini(net);
nf_conntrack_ecache_pernet_fini(net);
nf_conntrack_tstamp_pernet_fini(net);
nf_conntrack_acct_pernet_fini(net);
nf_conntrack_expect_pernet_fini(net);
- kmem_cache_destroy(net->ct.nf_conntrack_cachep);
- kfree(net->ct.slabname);
free_percpu(net->ct.stat);
free_percpu(net->ct.pcpu_lists);
}
@@ -1563,8 +1602,15 @@ void *nf_ct_alloc_hashtable(unsigned int *sizep, int nulls)
unsigned int nr_slots, i;
size_t sz;
+ if (*sizep > (UINT_MAX / sizeof(struct hlist_nulls_head)))
+ return NULL;
+
BUILD_BUG_ON(sizeof(struct hlist_nulls_head) != sizeof(struct hlist_head));
nr_slots = *sizep = roundup(*sizep, PAGE_SIZE / sizeof(struct hlist_nulls_head));
+
+ if (nr_slots > (UINT_MAX / sizeof(struct hlist_nulls_head)))
+ return NULL;
+
sz = nr_slots * sizeof(struct hlist_nulls_head);
hash = (void *)__get_free_pages(GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO,
get_order(sz));
@@ -1606,7 +1652,7 @@ int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp)
local_bh_disable();
nf_conntrack_all_lock();
- write_seqcount_begin(&init_net.ct.generation);
+ write_seqcount_begin(&nf_conntrack_generation);
/* Lookups in the old hash might happen in parallel, which means we
* might get false negatives during connection lookup. New connections
@@ -1614,26 +1660,28 @@ int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp)
* though since that required taking the locks.
*/
- for (i = 0; i < init_net.ct.htable_size; i++) {
- while (!hlist_nulls_empty(&init_net.ct.hash[i])) {
- h = hlist_nulls_entry(init_net.ct.hash[i].first,
- struct nf_conntrack_tuple_hash, hnnode);
+ for (i = 0; i < nf_conntrack_htable_size; i++) {
+ while (!hlist_nulls_empty(&nf_conntrack_hash[i])) {
+ h = hlist_nulls_entry(nf_conntrack_hash[i].first,
+ struct nf_conntrack_tuple_hash, hnnode);
ct = nf_ct_tuplehash_to_ctrack(h);
hlist_nulls_del_rcu(&h->hnnode);
- bucket = __hash_conntrack(&h->tuple, hashsize);
+ bucket = __hash_conntrack(nf_ct_net(ct),
+ &h->tuple, hashsize);
hlist_nulls_add_head_rcu(&h->hnnode, &hash[bucket]);
}
}
- old_size = init_net.ct.htable_size;
- old_hash = init_net.ct.hash;
+ old_size = nf_conntrack_htable_size;
+ old_hash = nf_conntrack_hash;
- init_net.ct.htable_size = nf_conntrack_htable_size = hashsize;
- init_net.ct.hash = hash;
+ nf_conntrack_hash = hash;
+ nf_conntrack_htable_size = hashsize;
- write_seqcount_end(&init_net.ct.generation);
+ write_seqcount_end(&nf_conntrack_generation);
nf_conntrack_all_unlock();
local_bh_enable();
+ synchronize_net();
nf_ct_free_hashtable(old_hash, old_size);
return 0;
}
@@ -1654,7 +1702,10 @@ EXPORT_SYMBOL_GPL(nf_ct_untracked_status_or);
int nf_conntrack_init_start(void)
{
int max_factor = 8;
- int i, ret, cpu;
+ int ret = -ENOMEM;
+ int i, cpu;
+
+ seqcount_init(&nf_conntrack_generation);
for (i = 0; i < CONNTRACK_LOCKS; i++)
spin_lock_init(&nf_conntrack_locks[i]);
@@ -1681,8 +1732,19 @@ int nf_conntrack_init_start(void)
* entries. */
max_factor = 4;
}
+
+ nf_conntrack_hash = nf_ct_alloc_hashtable(&nf_conntrack_htable_size, 1);
+ if (!nf_conntrack_hash)
+ return -ENOMEM;
+
nf_conntrack_max = max_factor * nf_conntrack_htable_size;
+ nf_conntrack_cachep = kmem_cache_create("nf_conntrack",
+ sizeof(struct nf_conn), 0,
+ SLAB_DESTROY_BY_RCU, NULL);
+ if (!nf_conntrack_cachep)
+ goto err_cachep;
+
printk(KERN_INFO "nf_conntrack version %s (%u buckets, %d max)\n",
NF_CONNTRACK_VERSION, nf_conntrack_htable_size,
nf_conntrack_max);
@@ -1759,6 +1821,9 @@ err_tstamp:
err_acct:
nf_conntrack_expect_fini();
err_expect:
+ kmem_cache_destroy(nf_conntrack_cachep);
+err_cachep:
+ nf_ct_free_hashtable(nf_conntrack_hash, nf_conntrack_htable_size);
return ret;
}
@@ -1778,12 +1843,10 @@ void nf_conntrack_init_end(void)
int nf_conntrack_init_net(struct net *net)
{
- static atomic64_t unique_id;
int ret = -ENOMEM;
int cpu;
atomic_set(&net->ct.count, 0);
- seqcount_init(&net->ct.generation);
net->ct.pcpu_lists = alloc_percpu(struct ct_pcpu);
if (!net->ct.pcpu_lists)
@@ -1801,25 +1864,6 @@ int nf_conntrack_init_net(struct net *net)
if (!net->ct.stat)
goto err_pcpu_lists;
- net->ct.slabname = kasprintf(GFP_KERNEL, "nf_conntrack_%llu",
- (u64)atomic64_inc_return(&unique_id));
- if (!net->ct.slabname)
- goto err_slabname;
-
- net->ct.nf_conntrack_cachep = kmem_cache_create(net->ct.slabname,
- sizeof(struct nf_conn), 0,
- SLAB_DESTROY_BY_RCU, NULL);
- if (!net->ct.nf_conntrack_cachep) {
- printk(KERN_ERR "Unable to create nf_conn slab cache\n");
- goto err_cache;
- }
-
- net->ct.htable_size = nf_conntrack_htable_size;
- net->ct.hash = nf_ct_alloc_hashtable(&net->ct.htable_size, 1);
- if (!net->ct.hash) {
- printk(KERN_ERR "Unable to create nf_conntrack_hash\n");
- goto err_hash;
- }
ret = nf_conntrack_expect_pernet_init(net);
if (ret < 0)
goto err_expect;
@@ -1851,12 +1895,6 @@ err_tstamp:
err_acct:
nf_conntrack_expect_pernet_fini(net);
err_expect:
- nf_ct_free_hashtable(net->ct.hash, net->ct.htable_size);
-err_hash:
- kmem_cache_destroy(net->ct.nf_conntrack_cachep);
-err_cache:
- kfree(net->ct.slabname);
-err_slabname:
free_percpu(net->ct.stat);
err_pcpu_lists:
free_percpu(net->ct.pcpu_lists);
diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c
index 4e78c57b8..d28011b42 100644
--- a/net/netfilter/nf_conntrack_ecache.c
+++ b/net/netfilter/nf_conntrack_ecache.c
@@ -113,6 +113,60 @@ static void ecache_work(struct work_struct *work)
schedule_delayed_work(&ctnet->ecache_dwork, delay);
}
+int nf_conntrack_eventmask_report(unsigned int eventmask, struct nf_conn *ct,
+ u32 portid, int report)
+{
+ int ret = 0;
+ struct net *net = nf_ct_net(ct);
+ struct nf_ct_event_notifier *notify;
+ struct nf_conntrack_ecache *e;
+
+ rcu_read_lock();
+ notify = rcu_dereference(net->ct.nf_conntrack_event_cb);
+ if (!notify)
+ goto out_unlock;
+
+ e = nf_ct_ecache_find(ct);
+ if (!e)
+ goto out_unlock;
+
+ if (nf_ct_is_confirmed(ct) && !nf_ct_is_dying(ct)) {
+ struct nf_ct_event item = {
+ .ct = ct,
+ .portid = e->portid ? e->portid : portid,
+ .report = report
+ };
+ /* This is a resent of a destroy event? If so, skip missed */
+ unsigned long missed = e->portid ? 0 : e->missed;
+
+ if (!((eventmask | missed) & e->ctmask))
+ goto out_unlock;
+
+ ret = notify->fcn(eventmask | missed, &item);
+ if (unlikely(ret < 0 || missed)) {
+ spin_lock_bh(&ct->lock);
+ if (ret < 0) {
+ /* This is a destroy event that has been
+ * triggered by a process, we store the PORTID
+ * to include it in the retransmission.
+ */
+ if (eventmask & (1 << IPCT_DESTROY) &&
+ e->portid == 0 && portid != 0)
+ e->portid = portid;
+ else
+ e->missed |= eventmask;
+ } else {
+ e->missed &= ~missed;
+ }
+ spin_unlock_bh(&ct->lock);
+ }
+ }
+out_unlock:
+ rcu_read_unlock();
+ return ret;
+}
+EXPORT_SYMBOL_GPL(nf_conntrack_eventmask_report);
+
/* deliver cached events and clear cache entry - must be called with locally
* disabled softirqs */
void nf_ct_deliver_cached_events(struct nf_conn *ct)
@@ -167,6 +221,36 @@ out_unlock:
}
EXPORT_SYMBOL_GPL(nf_ct_deliver_cached_events);
+void nf_ct_expect_event_report(enum ip_conntrack_expect_events event,
+ struct nf_conntrack_expect *exp,
+ u32 portid, int report)
+
+{
+ struct net *net = nf_ct_exp_net(exp);
+ struct nf_exp_event_notifier *notify;
+ struct nf_conntrack_ecache *e;
+
+ rcu_read_lock();
+ notify = rcu_dereference(net->ct.nf_expect_event_cb);
+ if (!notify)
+ goto out_unlock;
+
+ e = nf_ct_ecache_find(exp->master);
+ if (!e)
+ goto out_unlock;
+
+ if (e->expmask & (1 << event)) {
+ struct nf_exp_event item = {
+ .exp = exp,
+ .portid = portid,
+ .report = report
+ };
+ notify->fcn(1 << event, &item);
+ }
+out_unlock:
+ rcu_read_unlock();
+}
+
int nf_conntrack_register_notifier(struct net *net,
struct nf_ct_event_notifier *new)
{
diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c
index 278927ab0..9e3693128 100644
--- a/net/netfilter/nf_conntrack_expect.c
+++ b/net/netfilter/nf_conntrack_expect.c
@@ -24,6 +24,7 @@
#include <linux/moduleparam.h>
#include <linux/export.h>
#include <net/net_namespace.h>
+#include <net/netns/hash.h>
#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_core.h>
@@ -35,9 +36,13 @@
unsigned int nf_ct_expect_hsize __read_mostly;
EXPORT_SYMBOL_GPL(nf_ct_expect_hsize);
+struct hlist_head *nf_ct_expect_hash __read_mostly;
+EXPORT_SYMBOL_GPL(nf_ct_expect_hash);
+
unsigned int nf_ct_expect_max __read_mostly;
static struct kmem_cache *nf_ct_expect_cachep __read_mostly;
+static unsigned int nf_ct_expect_hashrnd __read_mostly;
/* nf_conntrack_expect helper functions */
void nf_ct_unlink_expect_report(struct nf_conntrack_expect *exp,
@@ -72,21 +77,32 @@ static void nf_ct_expectation_timed_out(unsigned long ul_expect)
nf_ct_expect_put(exp);
}
-static unsigned int nf_ct_expect_dst_hash(const struct nf_conntrack_tuple *tuple)
+static unsigned int nf_ct_expect_dst_hash(const struct net *n, const struct nf_conntrack_tuple *tuple)
{
- unsigned int hash;
+ unsigned int hash, seed;
- if (unlikely(!nf_conntrack_hash_rnd)) {
- init_nf_conntrack_hash_rnd();
- }
+ get_random_once(&nf_ct_expect_hashrnd, sizeof(nf_ct_expect_hashrnd));
+
+ seed = nf_ct_expect_hashrnd ^ net_hash_mix(n);
hash = jhash2(tuple->dst.u3.all, ARRAY_SIZE(tuple->dst.u3.all),
(((tuple->dst.protonum ^ tuple->src.l3num) << 16) |
- (__force __u16)tuple->dst.u.all) ^ nf_conntrack_hash_rnd);
+ (__force __u16)tuple->dst.u.all) ^ seed);
return reciprocal_scale(hash, nf_ct_expect_hsize);
}
+static bool
+nf_ct_exp_equal(const struct nf_conntrack_tuple *tuple,
+ const struct nf_conntrack_expect *i,
+ const struct nf_conntrack_zone *zone,
+ const struct net *net)
+{
+ return nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask) &&
+ net_eq(net, nf_ct_net(i->master)) &&
+ nf_ct_zone_equal_any(i->master, zone);
+}
+
struct nf_conntrack_expect *
__nf_ct_expect_find(struct net *net,
const struct nf_conntrack_zone *zone,
@@ -98,10 +114,9 @@ __nf_ct_expect_find(struct net *net,
if (!net->ct.expect_count)
return NULL;
- h = nf_ct_expect_dst_hash(tuple);
- hlist_for_each_entry_rcu(i, &net->ct.expect_hash[h], hnode) {
- if (nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask) &&
- nf_ct_zone_equal_any(i->master, zone))
+ h = nf_ct_expect_dst_hash(net, tuple);
+ hlist_for_each_entry_rcu(i, &nf_ct_expect_hash[h], hnode) {
+ if (nf_ct_exp_equal(tuple, i, zone, net))
return i;
}
return NULL;
@@ -139,11 +154,10 @@ nf_ct_find_expectation(struct net *net,
if (!net->ct.expect_count)
return NULL;
- h = nf_ct_expect_dst_hash(tuple);
- hlist_for_each_entry(i, &net->ct.expect_hash[h], hnode) {
+ h = nf_ct_expect_dst_hash(net, tuple);
+ hlist_for_each_entry(i, &nf_ct_expect_hash[h], hnode) {
if (!(i->flags & NF_CT_EXPECT_INACTIVE) &&
- nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask) &&
- nf_ct_zone_equal_any(i->master, zone)) {
+ nf_ct_exp_equal(tuple, i, zone, net)) {
exp = i;
break;
}
@@ -223,6 +237,7 @@ static inline int expect_clash(const struct nf_conntrack_expect *a,
}
return nf_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask) &&
+ net_eq(nf_ct_net(a->master), nf_ct_net(b->master)) &&
nf_ct_zone_equal_any(a->master, nf_ct_zone(b->master));
}
@@ -232,6 +247,7 @@ static inline int expect_matches(const struct nf_conntrack_expect *a,
return a->master == b->master && a->class == b->class &&
nf_ct_tuple_equal(&a->tuple, &b->tuple) &&
nf_ct_tuple_mask_equal(&a->mask, &b->mask) &&
+ net_eq(nf_ct_net(a->master), nf_ct_net(b->master)) &&
nf_ct_zone_equal_any(a->master, nf_ct_zone(b->master));
}
@@ -342,7 +358,7 @@ static int nf_ct_expect_insert(struct nf_conntrack_expect *exp)
struct nf_conn_help *master_help = nfct_help(exp->master);
struct nf_conntrack_helper *helper;
struct net *net = nf_ct_exp_net(exp);
- unsigned int h = nf_ct_expect_dst_hash(&exp->tuple);
+ unsigned int h = nf_ct_expect_dst_hash(net, &exp->tuple);
/* two references : one for hash insert, one for the timer */
atomic_add(2, &exp->use);
@@ -350,7 +366,7 @@ static int nf_ct_expect_insert(struct nf_conntrack_expect *exp)
hlist_add_head(&exp->lnode, &master_help->expectations);
master_help->expecting[exp->class]++;
- hlist_add_head_rcu(&exp->hnode, &net->ct.expect_hash[h]);
+ hlist_add_head_rcu(&exp->hnode, &nf_ct_expect_hash[h]);
net->ct.expect_count++;
setup_timer(&exp->timeout, nf_ct_expectation_timed_out,
@@ -401,8 +417,8 @@ static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect)
ret = -ESHUTDOWN;
goto out;
}
- h = nf_ct_expect_dst_hash(&expect->tuple);
- hlist_for_each_entry_safe(i, next, &net->ct.expect_hash[h], hnode) {
+ h = nf_ct_expect_dst_hash(net, &expect->tuple);
+ hlist_for_each_entry_safe(i, next, &nf_ct_expect_hash[h], hnode) {
if (expect_matches(i, expect)) {
if (del_timer(&i->timeout)) {
nf_ct_unlink_expect(i);
@@ -468,12 +484,11 @@ struct ct_expect_iter_state {
static struct hlist_node *ct_expect_get_first(struct seq_file *seq)
{
- struct net *net = seq_file_net(seq);
struct ct_expect_iter_state *st = seq->private;
struct hlist_node *n;
for (st->bucket = 0; st->bucket < nf_ct_expect_hsize; st->bucket++) {
- n = rcu_dereference(hlist_first_rcu(&net->ct.expect_hash[st->bucket]));
+ n = rcu_dereference(hlist_first_rcu(&nf_ct_expect_hash[st->bucket]));
if (n)
return n;
}
@@ -483,14 +498,13 @@ static struct hlist_node *ct_expect_get_first(struct seq_file *seq)
static struct hlist_node *ct_expect_get_next(struct seq_file *seq,
struct hlist_node *head)
{
- struct net *net = seq_file_net(seq);
struct ct_expect_iter_state *st = seq->private;
head = rcu_dereference(hlist_next_rcu(head));
while (head == NULL) {
if (++st->bucket >= nf_ct_expect_hsize)
return NULL;
- head = rcu_dereference(hlist_first_rcu(&net->ct.expect_hash[st->bucket]));
+ head = rcu_dereference(hlist_first_rcu(&nf_ct_expect_hash[st->bucket]));
}
return head;
}
@@ -623,28 +637,13 @@ module_param_named(expect_hashsize, nf_ct_expect_hsize, uint, 0400);
int nf_conntrack_expect_pernet_init(struct net *net)
{
- int err = -ENOMEM;
-
net->ct.expect_count = 0;
- net->ct.expect_hash = nf_ct_alloc_hashtable(&nf_ct_expect_hsize, 0);
- if (net->ct.expect_hash == NULL)
- goto err1;
-
- err = exp_proc_init(net);
- if (err < 0)
- goto err2;
-
- return 0;
-err2:
- nf_ct_free_hashtable(net->ct.expect_hash, nf_ct_expect_hsize);
-err1:
- return err;
+ return exp_proc_init(net);
}
void nf_conntrack_expect_pernet_fini(struct net *net)
{
exp_proc_remove(net);
- nf_ct_free_hashtable(net->ct.expect_hash, nf_ct_expect_hsize);
}
int nf_conntrack_expect_init(void)
@@ -660,6 +659,13 @@ int nf_conntrack_expect_init(void)
0, 0, NULL);
if (!nf_ct_expect_cachep)
return -ENOMEM;
+
+ nf_ct_expect_hash = nf_ct_alloc_hashtable(&nf_ct_expect_hsize, 0);
+ if (!nf_ct_expect_hash) {
+ kmem_cache_destroy(nf_ct_expect_cachep);
+ return -ENOMEM;
+ }
+
return 0;
}
@@ -667,4 +673,5 @@ void nf_conntrack_expect_fini(void)
{
rcu_barrier(); /* Wait for call_rcu() before destroy */
kmem_cache_destroy(nf_ct_expect_cachep);
+ nf_ct_free_hashtable(nf_ct_expect_hash, nf_ct_expect_hsize);
}
diff --git a/net/netfilter/nf_conntrack_ftp.c b/net/netfilter/nf_conntrack_ftp.c
index 883c691ec..19efeba02 100644
--- a/net/netfilter/nf_conntrack_ftp.c
+++ b/net/netfilter/nf_conntrack_ftp.c
@@ -632,6 +632,7 @@ static int __init nf_conntrack_ftp_init(void)
if (ret) {
pr_err("failed to register helper for pf: %d port: %d\n",
ftp[i][j].tuple.src.l3num, ports[i]);
+ ports_c = i;
nf_conntrack_ftp_fini();
return ret;
}
diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c
index 3b40ec575..196cb3964 100644
--- a/net/netfilter/nf_conntrack_helper.c
+++ b/net/netfilter/nf_conntrack_helper.c
@@ -38,10 +38,10 @@ unsigned int nf_ct_helper_hsize __read_mostly;
EXPORT_SYMBOL_GPL(nf_ct_helper_hsize);
static unsigned int nf_ct_helper_count __read_mostly;
-static bool nf_ct_auto_assign_helper __read_mostly = true;
+static bool nf_ct_auto_assign_helper __read_mostly = false;
module_param_named(nf_conntrack_helper, nf_ct_auto_assign_helper, bool, 0644);
MODULE_PARM_DESC(nf_conntrack_helper,
- "Enable automatic conntrack helper assignment (default 1)");
+ "Enable automatic conntrack helper assignment (default 0)");
#ifdef CONFIG_SYSCTL
static struct ctl_table helper_sysctl_table[] = {
@@ -361,9 +361,10 @@ EXPORT_SYMBOL_GPL(nf_ct_helper_log);
int nf_conntrack_helper_register(struct nf_conntrack_helper *me)
{
- int ret = 0;
- struct nf_conntrack_helper *cur;
+ struct nf_conntrack_tuple_mask mask = { .src.u.all = htons(0xFFFF) };
unsigned int h = helper_hash(&me->tuple);
+ struct nf_conntrack_helper *cur;
+ int ret = 0;
BUG_ON(me->expect_policy == NULL);
BUG_ON(me->expect_class_max >= NF_CT_MAX_EXPECT_CLASSES);
@@ -371,9 +372,7 @@ int nf_conntrack_helper_register(struct nf_conntrack_helper *me)
mutex_lock(&nf_ct_helper_mutex);
hlist_for_each_entry(cur, &nf_ct_helper_hash[h], hnode) {
- if (strncmp(cur->name, me->name, NF_CT_HELPER_NAME_LEN) == 0 &&
- cur->tuple.src.l3num == me->tuple.src.l3num &&
- cur->tuple.dst.protonum == me->tuple.dst.protonum) {
+ if (nf_ct_tuple_src_mask_cmp(&cur->tuple, &me->tuple, &mask)) {
ret = -EEXIST;
goto out;
}
@@ -400,7 +399,7 @@ static void __nf_conntrack_helper_unregister(struct nf_conntrack_helper *me,
spin_lock_bh(&nf_conntrack_expect_lock);
for (i = 0; i < nf_ct_expect_hsize; i++) {
hlist_for_each_entry_safe(exp, next,
- &net->ct.expect_hash[i], hnode) {
+ &nf_ct_expect_hash[i], hnode) {
struct nf_conn_help *help = nfct_help(exp->master);
if ((rcu_dereference_protected(
help->helper,
@@ -424,10 +423,10 @@ static void __nf_conntrack_helper_unregister(struct nf_conntrack_helper *me,
spin_unlock_bh(&pcpu->lock);
}
local_bh_disable();
- for (i = 0; i < net->ct.htable_size; i++) {
+ for (i = 0; i < nf_conntrack_htable_size; i++) {
nf_conntrack_lock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]);
- if (i < net->ct.htable_size) {
- hlist_nulls_for_each_entry(h, nn, &net->ct.hash[i], hnnode)
+ if (i < nf_conntrack_htable_size) {
+ hlist_nulls_for_each_entry(h, nn, &nf_conntrack_hash[i], hnnode)
unhelp(h, me);
}
spin_unlock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]);
diff --git a/net/netfilter/nf_conntrack_irc.c b/net/netfilter/nf_conntrack_irc.c
index 8b6da2719..f97ac61d2 100644
--- a/net/netfilter/nf_conntrack_irc.c
+++ b/net/netfilter/nf_conntrack_irc.c
@@ -271,6 +271,7 @@ static int __init nf_conntrack_irc_init(void)
if (ret) {
pr_err("failed to register helper for pf: %u port: %u\n",
irc[i].tuple.src.l3num, ports[i]);
+ ports_c = i;
nf_conntrack_irc_fini();
return ret;
}
diff --git a/net/netfilter/nf_conntrack_labels.c b/net/netfilter/nf_conntrack_labels.c
index 3ce5c314e..252e6a7cd 100644
--- a/net/netfilter/nf_conntrack_labels.c
+++ b/net/netfilter/nf_conntrack_labels.c
@@ -16,28 +16,11 @@
static spinlock_t nf_connlabels_lock;
-static unsigned int label_bits(const struct nf_conn_labels *l)
-{
- unsigned int longs = l->words;
- return longs * BITS_PER_LONG;
-}
-
-bool nf_connlabel_match(const struct nf_conn *ct, u16 bit)
-{
- struct nf_conn_labels *labels = nf_ct_labels_find(ct);
-
- if (!labels)
- return false;
-
- return bit < label_bits(labels) && test_bit(bit, labels->bits);
-}
-EXPORT_SYMBOL_GPL(nf_connlabel_match);
-
int nf_connlabel_set(struct nf_conn *ct, u16 bit)
{
struct nf_conn_labels *labels = nf_ct_labels_find(ct);
- if (!labels || bit >= label_bits(labels))
+ if (!labels || BIT_WORD(bit) >= labels->words)
return -ENOSPC;
if (test_bit(bit, labels->bits))
@@ -50,14 +33,18 @@ int nf_connlabel_set(struct nf_conn *ct, u16 bit)
}
EXPORT_SYMBOL_GPL(nf_connlabel_set);
-static void replace_u32(u32 *address, u32 mask, u32 new)
+static int replace_u32(u32 *address, u32 mask, u32 new)
{
u32 old, tmp;
do {
old = *address;
tmp = (old & mask) ^ new;
+ if (old == tmp)
+ return 0;
} while (cmpxchg(address, old, tmp) != old);
+
+ return 1;
}
int nf_connlabels_replace(struct nf_conn *ct,
@@ -66,6 +53,7 @@ int nf_connlabels_replace(struct nf_conn *ct,
{
struct nf_conn_labels *labels;
unsigned int size, i;
+ int changed = 0;
u32 *dst;
labels = nf_ct_labels_find(ct);
@@ -77,29 +65,27 @@ int nf_connlabels_replace(struct nf_conn *ct,
words32 = size / sizeof(u32);
dst = (u32 *) labels->bits;
- if (words32) {
- for (i = 0; i < words32; i++)
- replace_u32(&dst[i], mask ? ~mask[i] : 0, data[i]);
- }
+ for (i = 0; i < words32; i++)
+ changed |= replace_u32(&dst[i], mask ? ~mask[i] : 0, data[i]);
size /= sizeof(u32);
for (i = words32; i < size; i++) /* pad */
replace_u32(&dst[i], 0, 0);
- nf_conntrack_event_cache(IPCT_LABEL, ct);
+ if (changed)
+ nf_conntrack_event_cache(IPCT_LABEL, ct);
return 0;
}
EXPORT_SYMBOL_GPL(nf_connlabels_replace);
-int nf_connlabels_get(struct net *net, unsigned int n_bits)
+int nf_connlabels_get(struct net *net, unsigned int bits)
{
size_t words;
- if (n_bits > (NF_CT_LABELS_MAX_SIZE * BITS_PER_BYTE))
+ words = BIT_WORD(bits) + 1;
+ if (words > NF_CT_LABELS_MAX_SIZE / sizeof(long))
return -ERANGE;
- words = BITS_TO_LONGS(n_bits);
-
spin_lock(&nf_connlabels_lock);
net->ct.labels_used++;
if (words > net->ct.label_words)
@@ -128,6 +114,8 @@ static struct nf_ct_ext_type labels_extend __read_mostly = {
int nf_conntrack_labels_init(void)
{
+ BUILD_BUG_ON(NF_CT_LABELS_MAX_SIZE / sizeof(long) >= U8_MAX);
+
spin_lock_init(&nf_connlabels_lock);
return nf_ct_extend_register(&labels_extend);
}
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 355e8552f..a18d1ceab 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -58,10 +58,9 @@ MODULE_LICENSE("GPL");
static char __initdata version[] = "0.93";
-static inline int
-ctnetlink_dump_tuples_proto(struct sk_buff *skb,
- const struct nf_conntrack_tuple *tuple,
- struct nf_conntrack_l4proto *l4proto)
+static int ctnetlink_dump_tuples_proto(struct sk_buff *skb,
+ const struct nf_conntrack_tuple *tuple,
+ struct nf_conntrack_l4proto *l4proto)
{
int ret = 0;
struct nlattr *nest_parms;
@@ -83,10 +82,9 @@ nla_put_failure:
return -1;
}
-static inline int
-ctnetlink_dump_tuples_ip(struct sk_buff *skb,
- const struct nf_conntrack_tuple *tuple,
- struct nf_conntrack_l3proto *l3proto)
+static int ctnetlink_dump_tuples_ip(struct sk_buff *skb,
+ const struct nf_conntrack_tuple *tuple,
+ struct nf_conntrack_l3proto *l3proto)
{
int ret = 0;
struct nlattr *nest_parms;
@@ -106,9 +104,8 @@ nla_put_failure:
return -1;
}
-static int
-ctnetlink_dump_tuples(struct sk_buff *skb,
- const struct nf_conntrack_tuple *tuple)
+static int ctnetlink_dump_tuples(struct sk_buff *skb,
+ const struct nf_conntrack_tuple *tuple)
{
int ret;
struct nf_conntrack_l3proto *l3proto;
@@ -127,9 +124,8 @@ ctnetlink_dump_tuples(struct sk_buff *skb,
return ret;
}
-static inline int
-ctnetlink_dump_zone_id(struct sk_buff *skb, int attrtype,
- const struct nf_conntrack_zone *zone, int dir)
+static int ctnetlink_dump_zone_id(struct sk_buff *skb, int attrtype,
+ const struct nf_conntrack_zone *zone, int dir)
{
if (zone->id == NF_CT_DEFAULT_ZONE_ID || zone->dir != dir)
return 0;
@@ -141,8 +137,7 @@ nla_put_failure:
return -1;
}
-static inline int
-ctnetlink_dump_status(struct sk_buff *skb, const struct nf_conn *ct)
+static int ctnetlink_dump_status(struct sk_buff *skb, const struct nf_conn *ct)
{
if (nla_put_be32(skb, CTA_STATUS, htonl(ct->status)))
goto nla_put_failure;
@@ -152,8 +147,7 @@ nla_put_failure:
return -1;
}
-static inline int
-ctnetlink_dump_timeout(struct sk_buff *skb, const struct nf_conn *ct)
+static int ctnetlink_dump_timeout(struct sk_buff *skb, const struct nf_conn *ct)
{
long timeout = ((long)ct->timeout.expires - (long)jiffies) / HZ;
@@ -168,8 +162,7 @@ nla_put_failure:
return -1;
}
-static inline int
-ctnetlink_dump_protoinfo(struct sk_buff *skb, struct nf_conn *ct)
+static int ctnetlink_dump_protoinfo(struct sk_buff *skb, struct nf_conn *ct)
{
struct nf_conntrack_l4proto *l4proto;
struct nlattr *nest_proto;
@@ -193,8 +186,8 @@ nla_put_failure:
return -1;
}
-static inline int
-ctnetlink_dump_helpinfo(struct sk_buff *skb, const struct nf_conn *ct)
+static int ctnetlink_dump_helpinfo(struct sk_buff *skb,
+ const struct nf_conn *ct)
{
struct nlattr *nest_helper;
const struct nf_conn_help *help = nfct_help(ct);
@@ -245,8 +238,10 @@ dump_counters(struct sk_buff *skb, struct nf_conn_acct *acct,
if (!nest_count)
goto nla_put_failure;
- if (nla_put_be64(skb, CTA_COUNTERS_PACKETS, cpu_to_be64(pkts)) ||
- nla_put_be64(skb, CTA_COUNTERS_BYTES, cpu_to_be64(bytes)))
+ if (nla_put_be64(skb, CTA_COUNTERS_PACKETS, cpu_to_be64(pkts),
+ CTA_COUNTERS_PAD) ||
+ nla_put_be64(skb, CTA_COUNTERS_BYTES, cpu_to_be64(bytes),
+ CTA_COUNTERS_PAD))
goto nla_put_failure;
nla_nest_end(skb, nest_count);
@@ -287,9 +282,11 @@ ctnetlink_dump_timestamp(struct sk_buff *skb, const struct nf_conn *ct)
if (!nest_count)
goto nla_put_failure;
- if (nla_put_be64(skb, CTA_TIMESTAMP_START, cpu_to_be64(tstamp->start)) ||
+ if (nla_put_be64(skb, CTA_TIMESTAMP_START, cpu_to_be64(tstamp->start),
+ CTA_TIMESTAMP_PAD) ||
(tstamp->stop != 0 && nla_put_be64(skb, CTA_TIMESTAMP_STOP,
- cpu_to_be64(tstamp->stop))))
+ cpu_to_be64(tstamp->stop),
+ CTA_TIMESTAMP_PAD)))
goto nla_put_failure;
nla_nest_end(skb, nest_count);
@@ -300,8 +297,7 @@ nla_put_failure:
}
#ifdef CONFIG_NF_CONNTRACK_MARK
-static inline int
-ctnetlink_dump_mark(struct sk_buff *skb, const struct nf_conn *ct)
+static int ctnetlink_dump_mark(struct sk_buff *skb, const struct nf_conn *ct)
{
if (nla_put_be32(skb, CTA_MARK, htonl(ct->mark)))
goto nla_put_failure;
@@ -315,8 +311,7 @@ nla_put_failure:
#endif
#ifdef CONFIG_NF_CONNTRACK_SECMARK
-static inline int
-ctnetlink_dump_secctx(struct sk_buff *skb, const struct nf_conn *ct)
+static int ctnetlink_dump_secctx(struct sk_buff *skb, const struct nf_conn *ct)
{
struct nlattr *nest_secctx;
int len, ret;
@@ -345,7 +340,7 @@ nla_put_failure:
#endif
#ifdef CONFIG_NF_CONNTRACK_LABELS
-static int ctnetlink_label_size(const struct nf_conn *ct)
+static inline int ctnetlink_label_size(const struct nf_conn *ct)
{
struct nf_conn_labels *labels = nf_ct_labels_find(ct);
@@ -380,8 +375,7 @@ ctnetlink_dump_labels(struct sk_buff *skb, const struct nf_conn *ct)
#define master_tuple(ct) &(ct->master->tuplehash[IP_CT_DIR_ORIGINAL].tuple)
-static inline int
-ctnetlink_dump_master(struct sk_buff *skb, const struct nf_conn *ct)
+static int ctnetlink_dump_master(struct sk_buff *skb, const struct nf_conn *ct)
{
struct nlattr *nest_parms;
@@ -426,8 +420,8 @@ nla_put_failure:
return -1;
}
-static inline int
-ctnetlink_dump_ct_seq_adj(struct sk_buff *skb, const struct nf_conn *ct)
+static int ctnetlink_dump_ct_seq_adj(struct sk_buff *skb,
+ const struct nf_conn *ct)
{
struct nf_conn_seqadj *seqadj = nfct_seqadj(ct);
struct nf_ct_seqadj *seq;
@@ -446,8 +440,7 @@ ctnetlink_dump_ct_seq_adj(struct sk_buff *skb, const struct nf_conn *ct)
return 0;
}
-static inline int
-ctnetlink_dump_id(struct sk_buff *skb, const struct nf_conn *ct)
+static int ctnetlink_dump_id(struct sk_buff *skb, const struct nf_conn *ct)
{
if (nla_put_be32(skb, CTA_ID, htonl((unsigned long)ct)))
goto nla_put_failure;
@@ -457,8 +450,7 @@ nla_put_failure:
return -1;
}
-static inline int
-ctnetlink_dump_use(struct sk_buff *skb, const struct nf_conn *ct)
+static int ctnetlink_dump_use(struct sk_buff *skb, const struct nf_conn *ct)
{
if (nla_put_be32(skb, CTA_USE, htonl(atomic_read(&ct->ct_general.use))))
goto nla_put_failure;
@@ -538,8 +530,7 @@ nla_put_failure:
return -1;
}
-static inline size_t
-ctnetlink_proto_size(const struct nf_conn *ct)
+static inline size_t ctnetlink_proto_size(const struct nf_conn *ct)
{
struct nf_conntrack_l3proto *l3proto;
struct nf_conntrack_l4proto *l4proto;
@@ -556,19 +547,17 @@ ctnetlink_proto_size(const struct nf_conn *ct)
return len;
}
-static inline size_t
-ctnetlink_acct_size(const struct nf_conn *ct)
+static inline size_t ctnetlink_acct_size(const struct nf_conn *ct)
{
if (!nf_ct_ext_exist(ct, NF_CT_EXT_ACCT))
return 0;
return 2 * nla_total_size(0) /* CTA_COUNTERS_ORIG|REPL */
- + 2 * nla_total_size(sizeof(uint64_t)) /* CTA_COUNTERS_PACKETS */
- + 2 * nla_total_size(sizeof(uint64_t)) /* CTA_COUNTERS_BYTES */
+ + 2 * nla_total_size_64bit(sizeof(uint64_t)) /* CTA_COUNTERS_PACKETS */
+ + 2 * nla_total_size_64bit(sizeof(uint64_t)) /* CTA_COUNTERS_BYTES */
;
}
-static inline int
-ctnetlink_secctx_size(const struct nf_conn *ct)
+static inline int ctnetlink_secctx_size(const struct nf_conn *ct)
{
#ifdef CONFIG_NF_CONNTRACK_SECMARK
int len, ret;
@@ -584,20 +573,19 @@ ctnetlink_secctx_size(const struct nf_conn *ct)
#endif
}
-static inline size_t
-ctnetlink_timestamp_size(const struct nf_conn *ct)
+static inline size_t ctnetlink_timestamp_size(const struct nf_conn *ct)
{
#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP
if (!nf_ct_ext_exist(ct, NF_CT_EXT_TSTAMP))
return 0;
- return nla_total_size(0) + 2 * nla_total_size(sizeof(uint64_t));
+ return nla_total_size(0) + 2 * nla_total_size_64bit(sizeof(uint64_t));
#else
return 0;
#endif
}
-static inline size_t
-ctnetlink_nlmsg_size(const struct nf_conn *ct)
+#ifdef CONFIG_NF_CONNTRACK_EVENTS
+static size_t ctnetlink_nlmsg_size(const struct nf_conn *ct)
{
return NLMSG_ALIGN(sizeof(struct nfgenmsg))
+ 3 * nla_total_size(0) /* CTA_TUPLE_ORIG|REPL|MASTER */
@@ -628,7 +616,6 @@ ctnetlink_nlmsg_size(const struct nf_conn *ct)
;
}
-#ifdef CONFIG_NF_CONNTRACK_EVENTS
static int
ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item)
{
@@ -837,19 +824,22 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
last = (struct nf_conn *)cb->args[1];
local_bh_disable();
- for (; cb->args[0] < net->ct.htable_size; cb->args[0]++) {
+ for (; cb->args[0] < nf_conntrack_htable_size; cb->args[0]++) {
restart:
lockp = &nf_conntrack_locks[cb->args[0] % CONNTRACK_LOCKS];
nf_conntrack_lock(lockp);
- if (cb->args[0] >= net->ct.htable_size) {
+ if (cb->args[0] >= nf_conntrack_htable_size) {
spin_unlock(lockp);
goto out;
}
- hlist_nulls_for_each_entry(h, n, &net->ct.hash[cb->args[0]],
- hnnode) {
+ hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[cb->args[0]],
+ hnnode) {
if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL)
continue;
ct = nf_ct_tuplehash_to_ctrack(h);
+ if (!net_eq(net, nf_ct_net(ct)))
+ continue;
+
/* Dump entries of a given L3 protocol number.
* If it is not specified, ie. l3proto == 0,
* then dump everything. */
@@ -891,8 +881,8 @@ out:
return skb->len;
}
-static inline int
-ctnetlink_parse_tuple_ip(struct nlattr *attr, struct nf_conntrack_tuple *tuple)
+static int ctnetlink_parse_tuple_ip(struct nlattr *attr,
+ struct nf_conntrack_tuple *tuple)
{
struct nlattr *tb[CTA_IP_MAX+1];
struct nf_conntrack_l3proto *l3proto;
@@ -921,9 +911,8 @@ static const struct nla_policy proto_nla_policy[CTA_PROTO_MAX+1] = {
[CTA_PROTO_NUM] = { .type = NLA_U8 },
};
-static inline int
-ctnetlink_parse_tuple_proto(struct nlattr *attr,
- struct nf_conntrack_tuple *tuple)
+static int ctnetlink_parse_tuple_proto(struct nlattr *attr,
+ struct nf_conntrack_tuple *tuple)
{
struct nlattr *tb[CTA_PROTO_MAX+1];
struct nf_conntrack_l4proto *l4proto;
@@ -1050,9 +1039,8 @@ static const struct nla_policy help_nla_policy[CTA_HELP_MAX+1] = {
.len = NF_CT_HELPER_NAME_LEN - 1 },
};
-static inline int
-ctnetlink_parse_help(const struct nlattr *attr, char **helper_name,
- struct nlattr **helpinfo)
+static int ctnetlink_parse_help(const struct nlattr *attr, char **helper_name,
+ struct nlattr **helpinfo)
{
int err;
struct nlattr *tb[CTA_HELP_MAX+1];
@@ -1463,8 +1451,8 @@ ctnetlink_setup_nat(struct nf_conn *ct, const struct nlattr * const cda[])
#endif
}
-static inline int
-ctnetlink_change_helper(struct nf_conn *ct, const struct nlattr * const cda[])
+static int ctnetlink_change_helper(struct nf_conn *ct,
+ const struct nlattr * const cda[])
{
struct nf_conntrack_helper *helper;
struct nf_conn_help *help = nfct_help(ct);
@@ -1524,8 +1512,8 @@ ctnetlink_change_helper(struct nf_conn *ct, const struct nlattr * const cda[])
return -EOPNOTSUPP;
}
-static inline int
-ctnetlink_change_timeout(struct nf_conn *ct, const struct nlattr * const cda[])
+static int ctnetlink_change_timeout(struct nf_conn *ct,
+ const struct nlattr * const cda[])
{
u_int32_t timeout = ntohl(nla_get_be32(cda[CTA_TIMEOUT]));
@@ -1544,8 +1532,8 @@ static const struct nla_policy protoinfo_policy[CTA_PROTOINFO_MAX+1] = {
[CTA_PROTOINFO_SCTP] = { .type = NLA_NESTED },
};
-static inline int
-ctnetlink_change_protoinfo(struct nf_conn *ct, const struct nlattr * const cda[])
+static int ctnetlink_change_protoinfo(struct nf_conn *ct,
+ const struct nlattr * const cda[])
{
const struct nlattr *attr = cda[CTA_PROTOINFO];
struct nlattr *tb[CTA_PROTOINFO_MAX+1];
@@ -1571,8 +1559,8 @@ static const struct nla_policy seqadj_policy[CTA_SEQADJ_MAX+1] = {
[CTA_SEQADJ_OFFSET_AFTER] = { .type = NLA_U32 },
};
-static inline int
-change_seq_adj(struct nf_ct_seqadj *seq, const struct nlattr * const attr)
+static int change_seq_adj(struct nf_ct_seqadj *seq,
+ const struct nlattr * const attr)
{
int err;
struct nlattr *cda[CTA_SEQADJ_MAX+1];
@@ -2405,10 +2393,9 @@ static struct nfnl_ct_hook ctnetlink_glue_hook = {
* EXPECT
***********************************************************************/
-static inline int
-ctnetlink_exp_dump_tuple(struct sk_buff *skb,
- const struct nf_conntrack_tuple *tuple,
- enum ctattr_expect type)
+static int ctnetlink_exp_dump_tuple(struct sk_buff *skb,
+ const struct nf_conntrack_tuple *tuple,
+ enum ctattr_expect type)
{
struct nlattr *nest_parms;
@@ -2425,10 +2412,9 @@ nla_put_failure:
return -1;
}
-static inline int
-ctnetlink_exp_dump_mask(struct sk_buff *skb,
- const struct nf_conntrack_tuple *tuple,
- const struct nf_conntrack_tuple_mask *mask)
+static int ctnetlink_exp_dump_mask(struct sk_buff *skb,
+ const struct nf_conntrack_tuple *tuple,
+ const struct nf_conntrack_tuple_mask *mask)
{
int ret;
struct nf_conntrack_l3proto *l3proto;
@@ -2646,10 +2632,14 @@ ctnetlink_exp_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
last = (struct nf_conntrack_expect *)cb->args[1];
for (; cb->args[0] < nf_ct_expect_hsize; cb->args[0]++) {
restart:
- hlist_for_each_entry(exp, &net->ct.expect_hash[cb->args[0]],
+ hlist_for_each_entry(exp, &nf_ct_expect_hash[cb->args[0]],
hnode) {
if (l3proto && exp->tuple.src.l3num != l3proto)
continue;
+
+ if (!net_eq(nf_ct_net(exp->master), net))
+ continue;
+
if (cb->args[1]) {
if (exp != last)
continue;
@@ -2900,8 +2890,12 @@ static int ctnetlink_del_expect(struct net *net, struct sock *ctnl,
spin_lock_bh(&nf_conntrack_expect_lock);
for (i = 0; i < nf_ct_expect_hsize; i++) {
hlist_for_each_entry_safe(exp, next,
- &net->ct.expect_hash[i],
+ &nf_ct_expect_hash[i],
hnode) {
+
+ if (!net_eq(nf_ct_exp_net(exp), net))
+ continue;
+
m_help = nfct_help(exp->master);
if (!strcmp(m_help->helper->name, name) &&
del_timer(&exp->timeout)) {
@@ -2918,8 +2912,12 @@ static int ctnetlink_del_expect(struct net *net, struct sock *ctnl,
spin_lock_bh(&nf_conntrack_expect_lock);
for (i = 0; i < nf_ct_expect_hsize; i++) {
hlist_for_each_entry_safe(exp, next,
- &net->ct.expect_hash[i],
+ &nf_ct_expect_hash[i],
hnode) {
+
+ if (!net_eq(nf_ct_exp_net(exp), net))
+ continue;
+
if (del_timer(&exp->timeout)) {
nf_ct_unlink_expect_report(exp,
NETLINK_CB(skb).portid,
diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c
index fce1b1cca..399a38fd6 100644
--- a/net/netfilter/nf_conntrack_proto_dccp.c
+++ b/net/netfilter/nf_conntrack_proto_dccp.c
@@ -645,7 +645,8 @@ static int dccp_to_nlattr(struct sk_buff *skb, struct nlattr *nla,
nla_put_u8(skb, CTA_PROTOINFO_DCCP_ROLE,
ct->proto.dccp.role[IP_CT_DIR_ORIGINAL]) ||
nla_put_be64(skb, CTA_PROTOINFO_DCCP_HANDSHAKE_SEQ,
- cpu_to_be64(ct->proto.dccp.handshake_seq)))
+ cpu_to_be64(ct->proto.dccp.handshake_seq),
+ CTA_PROTOINFO_DCCP_PAD))
goto nla_put_failure;
nla_nest_end(skb, nest_parms);
spin_unlock_bh(&ct->lock);
@@ -660,6 +661,7 @@ static const struct nla_policy dccp_nla_policy[CTA_PROTOINFO_DCCP_MAX + 1] = {
[CTA_PROTOINFO_DCCP_STATE] = { .type = NLA_U8 },
[CTA_PROTOINFO_DCCP_ROLE] = { .type = NLA_U8 },
[CTA_PROTOINFO_DCCP_HANDSHAKE_SEQ] = { .type = NLA_U64 },
+ [CTA_PROTOINFO_DCCP_PAD] = { .type = NLA_UNSPEC },
};
static int nlattr_to_dccp(struct nlattr *cda[], struct nf_conn *ct)
diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c
index 9578a7c37..1d7ab960a 100644
--- a/net/netfilter/nf_conntrack_proto_sctp.c
+++ b/net/netfilter/nf_conntrack_proto_sctp.c
@@ -191,13 +191,7 @@ static void sctp_print_tuple(struct seq_file *s,
/* Print out the private part of the conntrack. */
static void sctp_print_conntrack(struct seq_file *s, struct nf_conn *ct)
{
- enum sctp_conntrack state;
-
- spin_lock_bh(&ct->lock);
- state = ct->proto.sctp.state;
- spin_unlock_bh(&ct->lock);
-
- seq_printf(s, "%s ", sctp_conntrack_names[state]);
+ seq_printf(s, "%s ", sctp_conntrack_names[ct->proto.sctp.state]);
}
#define for_each_sctp_chunk(skb, sch, _sch, offset, dataoff, count) \
diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
index 7cc1d9c22..70c838164 100644
--- a/net/netfilter/nf_conntrack_proto_tcp.c
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -313,13 +313,7 @@ static void tcp_print_tuple(struct seq_file *s,
/* Print out the private part of the conntrack. */
static void tcp_print_conntrack(struct seq_file *s, struct nf_conn *ct)
{
- enum tcp_conntrack state;
-
- spin_lock_bh(&ct->lock);
- state = ct->proto.tcp.state;
- spin_unlock_bh(&ct->lock);
-
- seq_printf(s, "%s ", tcp_conntrack_names[state]);
+ seq_printf(s, "%s ", tcp_conntrack_names[ct->proto.tcp.state]);
}
static unsigned int get_conntrack_index(const struct tcphdr *tcph)
diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c
index 478f92f83..4fd040575 100644
--- a/net/netfilter/nf_conntrack_proto_udp.c
+++ b/net/netfilter/nf_conntrack_proto_udp.c
@@ -309,6 +309,7 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_udp4 __read_mostly =
.l3proto = PF_INET,
.l4proto = IPPROTO_UDP,
.name = "udp",
+ .allow_clash = true,
.pkt_to_tuple = udp_pkt_to_tuple,
.invert_tuple = udp_invert_tuple,
.print_tuple = udp_print_tuple,
@@ -341,6 +342,7 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_udp6 __read_mostly =
.l3proto = PF_INET6,
.l4proto = IPPROTO_UDP,
.name = "udp",
+ .allow_clash = true,
.pkt_to_tuple = udp_pkt_to_tuple,
.invert_tuple = udp_invert_tuple,
.print_tuple = udp_print_tuple,
diff --git a/net/netfilter/nf_conntrack_proto_udplite.c b/net/netfilter/nf_conntrack_proto_udplite.c
index 1ac8ee13a..9d692f5ad 100644
--- a/net/netfilter/nf_conntrack_proto_udplite.c
+++ b/net/netfilter/nf_conntrack_proto_udplite.c
@@ -274,6 +274,7 @@ static struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite4 __read_mostly =
.l3proto = PF_INET,
.l4proto = IPPROTO_UDPLITE,
.name = "udplite",
+ .allow_clash = true,
.pkt_to_tuple = udplite_pkt_to_tuple,
.invert_tuple = udplite_invert_tuple,
.print_tuple = udplite_print_tuple,
@@ -306,6 +307,7 @@ static struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite6 __read_mostly =
.l3proto = PF_INET6,
.l4proto = IPPROTO_UDPLITE,
.name = "udplite",
+ .allow_clash = true,
.pkt_to_tuple = udplite_pkt_to_tuple,
.invert_tuple = udplite_invert_tuple,
.print_tuple = udplite_print_tuple,
diff --git a/net/netfilter/nf_conntrack_sane.c b/net/netfilter/nf_conntrack_sane.c
index 7523a575f..3fcbaab83 100644
--- a/net/netfilter/nf_conntrack_sane.c
+++ b/net/netfilter/nf_conntrack_sane.c
@@ -223,6 +223,7 @@ static int __init nf_conntrack_sane_init(void)
if (ret) {
pr_err("failed to register helper for pf: %d port: %d\n",
sane[i][j].tuple.src.l3num, ports[i]);
+ ports_c = i;
nf_conntrack_sane_fini();
return ret;
}
diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c
index 3e0640273..f72ba5587 100644
--- a/net/netfilter/nf_conntrack_sip.c
+++ b/net/netfilter/nf_conntrack_sip.c
@@ -1669,6 +1669,7 @@ static int __init nf_conntrack_sip_init(void)
if (ret) {
pr_err("failed to register helper for pf: %u port: %u\n",
sip[i][j].tuple.src.l3num, ports[i]);
+ ports_c = i;
nf_conntrack_sip_fini();
return ret;
}
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index 0f1a45bca..c026c472e 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -54,14 +54,13 @@ struct ct_iter_state {
static struct hlist_nulls_node *ct_get_first(struct seq_file *seq)
{
- struct net *net = seq_file_net(seq);
struct ct_iter_state *st = seq->private;
struct hlist_nulls_node *n;
for (st->bucket = 0;
- st->bucket < net->ct.htable_size;
+ st->bucket < nf_conntrack_htable_size;
st->bucket++) {
- n = rcu_dereference(hlist_nulls_first_rcu(&net->ct.hash[st->bucket]));
+ n = rcu_dereference(hlist_nulls_first_rcu(&nf_conntrack_hash[st->bucket]));
if (!is_a_nulls(n))
return n;
}
@@ -71,18 +70,17 @@ static struct hlist_nulls_node *ct_get_first(struct seq_file *seq)
static struct hlist_nulls_node *ct_get_next(struct seq_file *seq,
struct hlist_nulls_node *head)
{
- struct net *net = seq_file_net(seq);
struct ct_iter_state *st = seq->private;
head = rcu_dereference(hlist_nulls_next_rcu(head));
while (is_a_nulls(head)) {
if (likely(get_nulls_value(head) == st->bucket)) {
- if (++st->bucket >= net->ct.htable_size)
+ if (++st->bucket >= nf_conntrack_htable_size)
return NULL;
}
head = rcu_dereference(
hlist_nulls_first_rcu(
- &net->ct.hash[st->bucket]));
+ &nf_conntrack_hash[st->bucket]));
}
return head;
}
@@ -458,7 +456,7 @@ static struct ctl_table nf_ct_sysctl_table[] = {
},
{
.procname = "nf_conntrack_buckets",
- .data = &init_net.ct.htable_size,
+ .data = &nf_conntrack_htable_size,
.maxlen = sizeof(unsigned int),
.mode = 0444,
.proc_handler = proc_dointvec,
@@ -489,8 +487,6 @@ static struct ctl_table nf_ct_sysctl_table[] = {
{ }
};
-#define NET_NF_CONNTRACK_MAX 2089
-
static struct ctl_table nf_ct_netfilter_table[] = {
{
.procname = "nf_conntrack_max",
@@ -512,7 +508,6 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net)
goto out_kmemdup;
table[1].data = &net->ct.count;
- table[2].data = &net->ct.htable_size;
table[3].data = &net->ct.sysctl_checksum;
table[4].data = &net->ct.sysctl_log_invalid;
diff --git a/net/netfilter/nf_conntrack_tftp.c b/net/netfilter/nf_conntrack_tftp.c
index 36f964066..2e65b5430 100644
--- a/net/netfilter/nf_conntrack_tftp.c
+++ b/net/netfilter/nf_conntrack_tftp.c
@@ -142,6 +142,7 @@ static int __init nf_conntrack_tftp_init(void)
if (ret) {
pr_err("failed to register helper for pf: %u port: %u\n",
tftp[i][j].tuple.src.l3num, ports[i]);
+ ports_c = i;
nf_conntrack_tftp_fini();
return ret;
}
diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c
index 06a9f4577..6877a396f 100644
--- a/net/netfilter/nf_nat_core.c
+++ b/net/netfilter/nf_nat_core.c
@@ -38,6 +38,9 @@ static const struct nf_nat_l3proto __rcu *nf_nat_l3protos[NFPROTO_NUMPROTO]
static const struct nf_nat_l4proto __rcu **nf_nat_l4protos[NFPROTO_NUMPROTO]
__read_mostly;
+static struct hlist_head *nf_nat_bysource __read_mostly;
+static unsigned int nf_nat_htable_size __read_mostly;
+static unsigned int nf_nat_hash_rnd __read_mostly;
inline const struct nf_nat_l3proto *
__nf_nat_l3proto_find(u8 family)
@@ -118,15 +121,17 @@ EXPORT_SYMBOL(nf_xfrm_me_harder);
/* We keep an extra hash for each conntrack, for fast searching. */
static inline unsigned int
-hash_by_src(const struct net *net, const struct nf_conntrack_tuple *tuple)
+hash_by_src(const struct net *n, const struct nf_conntrack_tuple *tuple)
{
unsigned int hash;
+ get_random_once(&nf_nat_hash_rnd, sizeof(nf_nat_hash_rnd));
+
/* Original src, to ensure we map it consistently if poss. */
hash = jhash2((u32 *)&tuple->src, sizeof(tuple->src) / sizeof(u32),
- tuple->dst.protonum ^ nf_conntrack_hash_rnd);
+ tuple->dst.protonum ^ nf_nat_hash_rnd ^ net_hash_mix(n));
- return reciprocal_scale(hash, net->ct.nat_htable_size);
+ return reciprocal_scale(hash, nf_nat_htable_size);
}
/* Is this tuple already taken? (not by us) */
@@ -196,9 +201,10 @@ find_appropriate_src(struct net *net,
const struct nf_conn_nat *nat;
const struct nf_conn *ct;
- hlist_for_each_entry_rcu(nat, &net->ct.nat_bysource[h], bysource) {
+ hlist_for_each_entry_rcu(nat, &nf_nat_bysource[h], bysource) {
ct = nat->ct;
if (same_src(ct, tuple) &&
+ net_eq(net, nf_ct_net(ct)) &&
nf_ct_zone_equal(ct, zone, IP_CT_DIR_ORIGINAL)) {
/* Copy source part from reply tuple. */
nf_ct_invert_tuplepr(result,
@@ -431,7 +437,7 @@ nf_nat_setup_info(struct nf_conn *ct,
nat = nfct_nat(ct);
nat->ct = ct;
hlist_add_head_rcu(&nat->bysource,
- &net->ct.nat_bysource[srchash]);
+ &nf_nat_bysource[srchash]);
spin_unlock_bh(&nf_nat_lock);
}
@@ -819,27 +825,14 @@ nfnetlink_parse_nat_setup(struct nf_conn *ct,
}
#endif
-static int __net_init nf_nat_net_init(struct net *net)
-{
- /* Leave them the same for the moment. */
- net->ct.nat_htable_size = net->ct.htable_size;
- net->ct.nat_bysource = nf_ct_alloc_hashtable(&net->ct.nat_htable_size, 0);
- if (!net->ct.nat_bysource)
- return -ENOMEM;
- return 0;
-}
-
static void __net_exit nf_nat_net_exit(struct net *net)
{
struct nf_nat_proto_clean clean = {};
nf_ct_iterate_cleanup(net, nf_nat_proto_clean, &clean, 0, 0);
- synchronize_rcu();
- nf_ct_free_hashtable(net->ct.nat_bysource, net->ct.nat_htable_size);
}
static struct pernet_operations nf_nat_net_ops = {
- .init = nf_nat_net_init,
.exit = nf_nat_net_exit,
};
@@ -852,8 +845,16 @@ static int __init nf_nat_init(void)
{
int ret;
+ /* Leave them the same for the moment. */
+ nf_nat_htable_size = nf_conntrack_htable_size;
+
+ nf_nat_bysource = nf_ct_alloc_hashtable(&nf_nat_htable_size, 0);
+ if (!nf_nat_bysource)
+ return -ENOMEM;
+
ret = nf_ct_extend_register(&nat_extend);
if (ret < 0) {
+ nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size);
printk(KERN_ERR "nf_nat_core: Unable to register extension\n");
return ret;
}
@@ -877,6 +878,7 @@ static int __init nf_nat_init(void)
return 0;
cleanup_extend:
+ nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size);
nf_ct_extend_unregister(&nat_extend);
return ret;
}
@@ -895,6 +897,7 @@ static void __exit nf_nat_cleanup(void)
for (i = 0; i < NFPROTO_NUMPROTO; i++)
kfree(nf_nat_l4protos[i]);
synchronize_net();
+ nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size);
}
MODULE_LICENSE("GPL");
diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c
index 5baa8e24e..b19ad20a7 100644
--- a/net/netfilter/nf_queue.c
+++ b/net/netfilter/nf_queue.c
@@ -26,23 +26,21 @@
* Once the queue is registered it must reinject all packets it
* receives, no matter what.
*/
-static const struct nf_queue_handler __rcu *queue_handler __read_mostly;
/* return EBUSY when somebody else is registered, return EEXIST if the
* same handler is registered, return 0 in case of success. */
-void nf_register_queue_handler(const struct nf_queue_handler *qh)
+void nf_register_queue_handler(struct net *net, const struct nf_queue_handler *qh)
{
/* should never happen, we only have one queueing backend in kernel */
- WARN_ON(rcu_access_pointer(queue_handler));
- rcu_assign_pointer(queue_handler, qh);
+ WARN_ON(rcu_access_pointer(net->nf.queue_handler));
+ rcu_assign_pointer(net->nf.queue_handler, qh);
}
EXPORT_SYMBOL(nf_register_queue_handler);
/* The caller must flush their queue before this */
-void nf_unregister_queue_handler(void)
+void nf_unregister_queue_handler(struct net *net)
{
- RCU_INIT_POINTER(queue_handler, NULL);
- synchronize_rcu();
+ RCU_INIT_POINTER(net->nf.queue_handler, NULL);
}
EXPORT_SYMBOL(nf_unregister_queue_handler);
@@ -103,7 +101,7 @@ void nf_queue_nf_hook_drop(struct net *net, struct nf_hook_ops *ops)
const struct nf_queue_handler *qh;
rcu_read_lock();
- qh = rcu_dereference(queue_handler);
+ qh = rcu_dereference(net->nf.queue_handler);
if (qh)
qh->nf_hook_drop(net, ops);
rcu_read_unlock();
@@ -122,9 +120,10 @@ int nf_queue(struct sk_buff *skb,
struct nf_queue_entry *entry = NULL;
const struct nf_afinfo *afinfo;
const struct nf_queue_handler *qh;
+ struct net *net = state->net;
/* QUEUE == DROP if no one is waiting, to be safe. */
- qh = rcu_dereference(queue_handler);
+ qh = rcu_dereference(net->nf.queue_handler);
if (!qh) {
status = -ESRCH;
goto err;
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 2011977cd..cf7c74599 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -944,8 +944,10 @@ static int nft_dump_stats(struct sk_buff *skb, struct nft_stats __percpu *stats)
if (nest == NULL)
goto nla_put_failure;
- if (nla_put_be64(skb, NFTA_COUNTER_PACKETS, cpu_to_be64(total.pkts)) ||
- nla_put_be64(skb, NFTA_COUNTER_BYTES, cpu_to_be64(total.bytes)))
+ if (nla_put_be64(skb, NFTA_COUNTER_PACKETS, cpu_to_be64(total.pkts),
+ NFTA_COUNTER_PAD) ||
+ nla_put_be64(skb, NFTA_COUNTER_BYTES, cpu_to_be64(total.bytes),
+ NFTA_COUNTER_PAD))
goto nla_put_failure;
nla_nest_end(skb, nest);
@@ -975,7 +977,8 @@ static int nf_tables_fill_chain_info(struct sk_buff *skb, struct net *net,
if (nla_put_string(skb, NFTA_CHAIN_TABLE, table->name))
goto nla_put_failure;
- if (nla_put_be64(skb, NFTA_CHAIN_HANDLE, cpu_to_be64(chain->handle)))
+ if (nla_put_be64(skb, NFTA_CHAIN_HANDLE, cpu_to_be64(chain->handle),
+ NFTA_CHAIN_PAD))
goto nla_put_failure;
if (nla_put_string(skb, NFTA_CHAIN_NAME, chain->name))
goto nla_put_failure;
@@ -1721,9 +1724,11 @@ struct nft_expr *nft_expr_init(const struct nft_ctx *ctx,
err = nf_tables_newexpr(ctx, &info, expr);
if (err < 0)
- goto err2;
+ goto err3;
return expr;
+err3:
+ kfree(expr);
err2:
module_put(info.ops->type->owner);
err1:
@@ -1803,13 +1808,15 @@ static int nf_tables_fill_rule_info(struct sk_buff *skb, struct net *net,
goto nla_put_failure;
if (nla_put_string(skb, NFTA_RULE_CHAIN, chain->name))
goto nla_put_failure;
- if (nla_put_be64(skb, NFTA_RULE_HANDLE, cpu_to_be64(rule->handle)))
+ if (nla_put_be64(skb, NFTA_RULE_HANDLE, cpu_to_be64(rule->handle),
+ NFTA_RULE_PAD))
goto nla_put_failure;
if ((event != NFT_MSG_DELRULE) && (rule->list.prev != &chain->rules)) {
prule = list_entry(rule->list.prev, struct nft_rule, list);
if (nla_put_be64(skb, NFTA_RULE_POSITION,
- cpu_to_be64(prule->handle)))
+ cpu_to_be64(prule->handle),
+ NFTA_RULE_PAD))
goto nla_put_failure;
}
@@ -2312,7 +2319,7 @@ nft_select_set_ops(const struct nlattr * const nla[],
static const struct nla_policy nft_set_policy[NFTA_SET_MAX + 1] = {
[NFTA_SET_TABLE] = { .type = NLA_STRING },
[NFTA_SET_NAME] = { .type = NLA_STRING,
- .len = IFNAMSIZ - 1 },
+ .len = NFT_SET_MAXNAMELEN - 1 },
[NFTA_SET_FLAGS] = { .type = NLA_U32 },
[NFTA_SET_KEY_TYPE] = { .type = NLA_U32 },
[NFTA_SET_KEY_LEN] = { .type = NLA_U32 },
@@ -2396,7 +2403,7 @@ static int nf_tables_set_alloc_name(struct nft_ctx *ctx, struct nft_set *set,
unsigned long *inuse;
unsigned int n = 0, min = 0;
- p = strnchr(name, IFNAMSIZ, '%');
+ p = strnchr(name, NFT_SET_MAXNAMELEN, '%');
if (p != NULL) {
if (p[1] != 'd' || strchr(p + 2, '%'))
return -EINVAL;
@@ -2473,7 +2480,8 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx,
}
if (set->timeout &&
- nla_put_be64(skb, NFTA_SET_TIMEOUT, cpu_to_be64(set->timeout)))
+ nla_put_be64(skb, NFTA_SET_TIMEOUT, cpu_to_be64(set->timeout),
+ NFTA_SET_PAD))
goto nla_put_failure;
if (set->gc_int &&
nla_put_be32(skb, NFTA_SET_GC_INTERVAL, htonl(set->gc_int)))
@@ -2641,6 +2649,8 @@ static int nf_tables_getset(struct net *net, struct sock *nlsk,
/* Only accept unspec with dump */
if (nfmsg->nfgen_family == NFPROTO_UNSPEC)
return -EAFNOSUPPORT;
+ if (!nla[NFTA_SET_TABLE])
+ return -EINVAL;
set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_NAME]);
if (IS_ERR(set))
@@ -2690,7 +2700,7 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk,
struct nft_table *table;
struct nft_set *set;
struct nft_ctx ctx;
- char name[IFNAMSIZ];
+ char name[NFT_SET_MAXNAMELEN];
unsigned int size;
bool create;
u64 timeout;
@@ -2938,24 +2948,20 @@ int nf_tables_bind_set(const struct nft_ctx *ctx, struct nft_set *set,
* jumps are already validated for that chain.
*/
list_for_each_entry(i, &set->bindings, list) {
- if (binding->flags & NFT_SET_MAP &&
+ if (i->flags & NFT_SET_MAP &&
i->chain == binding->chain)
goto bind;
}
+ iter.genmask = nft_genmask_next(ctx->net);
iter.skip = 0;
iter.count = 0;
iter.err = 0;
iter.fn = nf_tables_bind_check_setelem;
set->ops->walk(ctx, set, &iter);
- if (iter.err < 0) {
- /* Destroy anonymous sets if binding fails */
- if (set->flags & NFT_SET_ANONYMOUS)
- nf_tables_set_destroy(ctx, set);
-
+ if (iter.err < 0)
return iter.err;
- }
}
bind:
binding->chain = ctx->chain;
@@ -3076,7 +3082,8 @@ static int nf_tables_fill_setelem(struct sk_buff *skb,
if (nft_set_ext_exists(ext, NFT_SET_EXT_TIMEOUT) &&
nla_put_be64(skb, NFTA_SET_ELEM_TIMEOUT,
- cpu_to_be64(*nft_set_ext_timeout(ext))))
+ cpu_to_be64(*nft_set_ext_timeout(ext)),
+ NFTA_SET_ELEM_PAD))
goto nla_put_failure;
if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION)) {
@@ -3089,7 +3096,8 @@ static int nf_tables_fill_setelem(struct sk_buff *skb,
expires = 0;
if (nla_put_be64(skb, NFTA_SET_ELEM_EXPIRATION,
- cpu_to_be64(jiffies_to_msecs(expires))))
+ cpu_to_be64(jiffies_to_msecs(expires)),
+ NFTA_SET_ELEM_PAD))
goto nla_put_failure;
}
@@ -3182,12 +3190,13 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb)
if (nest == NULL)
goto nla_put_failure;
- args.cb = cb;
- args.skb = skb;
- args.iter.skip = cb->args[0];
- args.iter.count = 0;
- args.iter.err = 0;
- args.iter.fn = nf_tables_dump_setelem;
+ args.cb = cb;
+ args.skb = skb;
+ args.iter.genmask = nft_genmask_cur(ctx.net);
+ args.iter.skip = cb->args[0];
+ args.iter.count = 0;
+ args.iter.err = 0;
+ args.iter.fn = nf_tables_dump_setelem;
set->ops->walk(&ctx, set, &args.iter);
nla_nest_end(skb, nest);
@@ -3367,6 +3376,22 @@ void nft_set_elem_destroy(const struct nft_set *set, void *elem)
}
EXPORT_SYMBOL_GPL(nft_set_elem_destroy);
+static int nft_setelem_parse_flags(const struct nft_set *set,
+ const struct nlattr *attr, u32 *flags)
+{
+ if (attr == NULL)
+ return 0;
+
+ *flags = ntohl(nla_get_be32(attr));
+ if (*flags & ~NFT_SET_ELEM_INTERVAL_END)
+ return -EINVAL;
+ if (!(set->flags & NFT_SET_INTERVAL) &&
+ *flags & NFT_SET_ELEM_INTERVAL_END)
+ return -EINVAL;
+
+ return 0;
+}
+
static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
const struct nlattr *attr)
{
@@ -3380,8 +3405,8 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
struct nft_data data;
enum nft_registers dreg;
struct nft_trans *trans;
+ u32 flags = 0;
u64 timeout;
- u32 flags;
u8 ulen;
int err;
@@ -3395,17 +3420,11 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
nft_set_ext_prepare(&tmpl);
- flags = 0;
- if (nla[NFTA_SET_ELEM_FLAGS] != NULL) {
- flags = ntohl(nla_get_be32(nla[NFTA_SET_ELEM_FLAGS]));
- if (flags & ~NFT_SET_ELEM_INTERVAL_END)
- return -EINVAL;
- if (!(set->flags & NFT_SET_INTERVAL) &&
- flags & NFT_SET_ELEM_INTERVAL_END)
- return -EINVAL;
- if (flags != 0)
- nft_set_ext_add(&tmpl, NFT_SET_EXT_FLAGS);
- }
+ err = nft_setelem_parse_flags(set, nla[NFTA_SET_ELEM_FLAGS], &flags);
+ if (err < 0)
+ return err;
+ if (flags != 0)
+ nft_set_ext_add(&tmpl, NFT_SET_EXT_FLAGS);
if (set->flags & NFT_SET_MAP) {
if (nla[NFTA_SET_ELEM_DATA] == NULL &&
@@ -3574,9 +3593,13 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set,
const struct nlattr *attr)
{
struct nlattr *nla[NFTA_SET_ELEM_MAX + 1];
+ struct nft_set_ext_tmpl tmpl;
struct nft_data_desc desc;
struct nft_set_elem elem;
+ struct nft_set_ext *ext;
struct nft_trans *trans;
+ u32 flags = 0;
+ void *priv;
int err;
err = nla_parse_nested(nla, NFTA_SET_ELEM_MAX, attr,
@@ -3588,6 +3611,14 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set,
if (nla[NFTA_SET_ELEM_KEY] == NULL)
goto err1;
+ nft_set_ext_prepare(&tmpl);
+
+ err = nft_setelem_parse_flags(set, nla[NFTA_SET_ELEM_FLAGS], &flags);
+ if (err < 0)
+ return err;
+ if (flags != 0)
+ nft_set_ext_add(&tmpl, NFT_SET_EXT_FLAGS);
+
err = nft_data_init(ctx, &elem.key.val, sizeof(elem.key), &desc,
nla[NFTA_SET_ELEM_KEY]);
if (err < 0)
@@ -3597,24 +3628,40 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set,
if (desc.type != NFT_DATA_VALUE || desc.len != set->klen)
goto err2;
+ nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY, desc.len);
+
+ err = -ENOMEM;
+ elem.priv = nft_set_elem_init(set, &tmpl, elem.key.val.data, NULL, 0,
+ GFP_KERNEL);
+ if (elem.priv == NULL)
+ goto err2;
+
+ ext = nft_set_elem_ext(set, elem.priv);
+ if (flags)
+ *nft_set_ext_flags(ext) = flags;
+
trans = nft_trans_elem_alloc(ctx, NFT_MSG_DELSETELEM, set);
if (trans == NULL) {
err = -ENOMEM;
- goto err2;
+ goto err3;
}
- elem.priv = set->ops->deactivate(set, &elem);
- if (elem.priv == NULL) {
+ priv = set->ops->deactivate(set, &elem);
+ if (priv == NULL) {
err = -ENOENT;
- goto err3;
+ goto err4;
}
+ kfree(elem.priv);
+ elem.priv = priv;
nft_trans_elem(trans) = elem;
list_add_tail(&trans->list, &ctx->net->nft.commit_list);
return 0;
-err3:
+err4:
kfree(trans);
+err3:
+ kfree(elem.priv);
err2:
nft_data_uninit(&elem.key.val, desc.type);
err1:
@@ -4236,6 +4283,7 @@ static int nf_tables_check_loops(const struct nft_ctx *ctx,
binding->chain != chain)
continue;
+ iter.genmask = nft_genmask_next(ctx->net);
iter.skip = 0;
iter.count = 0;
iter.err = 0;
diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c
index e9f8dffcc..fb8b5892b 100644
--- a/net/netfilter/nf_tables_core.c
+++ b/net/netfilter/nf_tables_core.c
@@ -143,7 +143,7 @@ next_rule:
list_for_each_entry_continue_rcu(rule, &chain->rules, list) {
/* This rule is not active, skip. */
- if (unlikely(rule->genmask & (1 << gencursor)))
+ if (unlikely(rule->genmask & gencursor))
continue;
rulenum++;
diff --git a/net/netfilter/nf_tables_trace.c b/net/netfilter/nf_tables_trace.c
index e9e959f65..39eb1cc62 100644
--- a/net/netfilter/nf_tables_trace.c
+++ b/net/netfilter/nf_tables_trace.c
@@ -156,7 +156,8 @@ static int nf_trace_fill_rule_info(struct sk_buff *nlskb,
return 0;
return nla_put_be64(nlskb, NFTA_TRACE_RULE_HANDLE,
- cpu_to_be64(info->rule->handle));
+ cpu_to_be64(info->rule->handle),
+ NFTA_TRACE_PAD);
}
void nft_trace_notify(struct nft_traceinfo *info)
@@ -174,7 +175,7 @@ void nft_trace_notify(struct nft_traceinfo *info)
size = nlmsg_total_size(sizeof(struct nfgenmsg)) +
nla_total_size(NFT_TABLE_MAXNAMELEN) +
nla_total_size(NFT_CHAIN_MAXNAMELEN) +
- nla_total_size(sizeof(__be64)) + /* rule handle */
+ nla_total_size_64bit(sizeof(__be64)) + /* rule handle */
nla_total_size(sizeof(__be32)) + /* trace type */
nla_total_size(0) + /* VERDICT, nested */
nla_total_size(sizeof(u32)) + /* verdict code */
diff --git a/net/netfilter/nfnetlink_acct.c b/net/netfilter/nfnetlink_acct.c
index dbd0803b1..1b4de4bd6 100644
--- a/net/netfilter/nfnetlink_acct.c
+++ b/net/netfilter/nfnetlink_acct.c
@@ -162,15 +162,18 @@ nfnl_acct_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type,
pkts = atomic64_read(&acct->pkts);
bytes = atomic64_read(&acct->bytes);
}
- if (nla_put_be64(skb, NFACCT_PKTS, cpu_to_be64(pkts)) ||
- nla_put_be64(skb, NFACCT_BYTES, cpu_to_be64(bytes)) ||
+ if (nla_put_be64(skb, NFACCT_PKTS, cpu_to_be64(pkts),
+ NFACCT_PAD) ||
+ nla_put_be64(skb, NFACCT_BYTES, cpu_to_be64(bytes),
+ NFACCT_PAD) ||
nla_put_be32(skb, NFACCT_USE, htonl(atomic_read(&acct->refcnt))))
goto nla_put_failure;
if (acct->flags & NFACCT_F_QUOTA) {
u64 *quota = (u64 *)acct->data;
if (nla_put_be32(skb, NFACCT_FLAGS, htonl(old_flags)) ||
- nla_put_be64(skb, NFACCT_QUOTA, cpu_to_be64(*quota)))
+ nla_put_be64(skb, NFACCT_QUOTA, cpu_to_be64(*quota),
+ NFACCT_PAD))
goto nla_put_failure;
}
nlmsg_end(skb, nlh);
diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c
index 2671b9deb..3c84f1432 100644
--- a/net/netfilter/nfnetlink_cttimeout.c
+++ b/net/netfilter/nfnetlink_cttimeout.c
@@ -306,10 +306,10 @@ static void ctnl_untimeout(struct net *net, struct ctnl_timeout *timeout)
int i;
local_bh_disable();
- for (i = 0; i < net->ct.htable_size; i++) {
+ for (i = 0; i < nf_conntrack_htable_size; i++) {
nf_conntrack_lock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]);
- if (i < net->ct.htable_size) {
- hlist_nulls_for_each_entry(h, nn, &net->ct.hash[i], hnnode)
+ if (i < nf_conntrack_htable_size) {
+ hlist_nulls_for_each_entry(h, nn, &nf_conntrack_hash[i], hnnode)
untimeout(h, timeout);
}
spin_unlock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]);
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
index cb5b630a6..5d36a0926 100644
--- a/net/netfilter/nfnetlink_queue.c
+++ b/net/netfilter/nfnetlink_queue.c
@@ -295,6 +295,59 @@ static u32 nfqnl_get_sk_secctx(struct sk_buff *skb, char **secdata)
return seclen;
}
+static u32 nfqnl_get_bridge_size(struct nf_queue_entry *entry)
+{
+ struct sk_buff *entskb = entry->skb;
+ u32 nlalen = 0;
+
+ if (entry->state.pf != PF_BRIDGE || !skb_mac_header_was_set(entskb))
+ return 0;
+
+ if (skb_vlan_tag_present(entskb))
+ nlalen += nla_total_size(nla_total_size(sizeof(__be16)) +
+ nla_total_size(sizeof(__be16)));
+
+ if (entskb->network_header > entskb->mac_header)
+ nlalen += nla_total_size((entskb->network_header -
+ entskb->mac_header));
+
+ return nlalen;
+}
+
+static int nfqnl_put_bridge(struct nf_queue_entry *entry, struct sk_buff *skb)
+{
+ struct sk_buff *entskb = entry->skb;
+
+ if (entry->state.pf != PF_BRIDGE || !skb_mac_header_was_set(entskb))
+ return 0;
+
+ if (skb_vlan_tag_present(entskb)) {
+ struct nlattr *nest;
+
+ nest = nla_nest_start(skb, NFQA_VLAN | NLA_F_NESTED);
+ if (!nest)
+ goto nla_put_failure;
+
+ if (nla_put_be16(skb, NFQA_VLAN_TCI, htons(entskb->vlan_tci)) ||
+ nla_put_be16(skb, NFQA_VLAN_PROTO, entskb->vlan_proto))
+ goto nla_put_failure;
+
+ nla_nest_end(skb, nest);
+ }
+
+ if (entskb->mac_header < entskb->network_header) {
+ int len = (int)(entskb->network_header - entskb->mac_header);
+
+ if (nla_put(skb, NFQA_L2HDR, len, skb_mac_header(entskb)))
+ goto nla_put_failure;
+ }
+
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
static struct sk_buff *
nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
struct nf_queue_entry *entry,
@@ -334,6 +387,8 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
if (entskb->tstamp.tv64)
size += nla_total_size(sizeof(struct nfqnl_msg_packet_timestamp));
+ size += nfqnl_get_bridge_size(entry);
+
if (entry->state.hook <= NF_INET_FORWARD ||
(entry->state.hook == NF_INET_POST_ROUTING && entskb->sk == NULL))
csum_verify = !skb_csum_unnecessary(entskb);
@@ -497,9 +552,12 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
}
}
+ if (nfqnl_put_bridge(entry, skb) < 0)
+ goto nla_put_failure;
+
if (entskb->tstamp.tv64) {
struct nfqnl_msg_packet_timestamp ts;
- struct timespec64 kts = ktime_to_timespec64(skb->tstamp);
+ struct timespec64 kts = ktime_to_timespec64(entskb->tstamp);
ts.sec = cpu_to_be64(kts.tv_sec);
ts.usec = cpu_to_be64(kts.tv_nsec / NSEC_PER_USEC);
@@ -911,12 +969,18 @@ static struct notifier_block nfqnl_rtnl_notifier = {
.notifier_call = nfqnl_rcv_nl_event,
};
+static const struct nla_policy nfqa_vlan_policy[NFQA_VLAN_MAX + 1] = {
+ [NFQA_VLAN_TCI] = { .type = NLA_U16},
+ [NFQA_VLAN_PROTO] = { .type = NLA_U16},
+};
+
static const struct nla_policy nfqa_verdict_policy[NFQA_MAX+1] = {
[NFQA_VERDICT_HDR] = { .len = sizeof(struct nfqnl_msg_verdict_hdr) },
[NFQA_MARK] = { .type = NLA_U32 },
[NFQA_PAYLOAD] = { .type = NLA_UNSPEC },
[NFQA_CT] = { .type = NLA_UNSPEC },
[NFQA_EXP] = { .type = NLA_UNSPEC },
+ [NFQA_VLAN] = { .type = NLA_NESTED },
};
static const struct nla_policy nfqa_verdict_batch_policy[NFQA_MAX+1] = {
@@ -1030,6 +1094,40 @@ static struct nf_conn *nfqnl_ct_parse(struct nfnl_ct_hook *nfnl_ct,
return ct;
}
+static int nfqa_parse_bridge(struct nf_queue_entry *entry,
+ const struct nlattr * const nfqa[])
+{
+ if (nfqa[NFQA_VLAN]) {
+ struct nlattr *tb[NFQA_VLAN_MAX + 1];
+ int err;
+
+ err = nla_parse_nested(tb, NFQA_VLAN_MAX, nfqa[NFQA_VLAN],
+ nfqa_vlan_policy);
+ if (err < 0)
+ return err;
+
+ if (!tb[NFQA_VLAN_TCI] || !tb[NFQA_VLAN_PROTO])
+ return -EINVAL;
+
+ entry->skb->vlan_tci = ntohs(nla_get_be16(tb[NFQA_VLAN_TCI]));
+ entry->skb->vlan_proto = nla_get_be16(tb[NFQA_VLAN_PROTO]);
+ }
+
+ if (nfqa[NFQA_L2HDR]) {
+ int mac_header_len = entry->skb->network_header -
+ entry->skb->mac_header;
+
+ if (mac_header_len != nla_len(nfqa[NFQA_L2HDR]))
+ return -EINVAL;
+ else if (mac_header_len > 0)
+ memcpy(skb_mac_header(entry->skb),
+ nla_data(nfqa[NFQA_L2HDR]),
+ mac_header_len);
+ }
+
+ return 0;
+}
+
static int nfqnl_recv_verdict(struct net *net, struct sock *ctnl,
struct sk_buff *skb,
const struct nlmsghdr *nlh,
@@ -1045,6 +1143,7 @@ static int nfqnl_recv_verdict(struct net *net, struct sock *ctnl,
struct nfnl_ct_hook *nfnl_ct;
struct nf_conn *ct = NULL;
struct nfnl_queue_net *q = nfnl_queue_pernet(net);
+ int err;
queue = instance_lookup(q, queue_num);
if (!queue)
@@ -1071,6 +1170,12 @@ static int nfqnl_recv_verdict(struct net *net, struct sock *ctnl,
ct = nfqnl_ct_parse(nfnl_ct, nlh, nfqa, entry, &ctinfo);
}
+ if (entry->state.pf == PF_BRIDGE) {
+ err = nfqa_parse_bridge(entry, nfqa);
+ if (err < 0)
+ return err;
+ }
+
if (nfqa[NFQA_PAYLOAD]) {
u16 payload_len = nla_len(nfqa[NFQA_PAYLOAD]);
int diff = payload_len - entry->skb->len;
@@ -1377,21 +1482,29 @@ static int __net_init nfnl_queue_net_init(struct net *net)
net->nf.proc_netfilter, &nfqnl_file_ops))
return -ENOMEM;
#endif
+ nf_register_queue_handler(net, &nfqh);
return 0;
}
static void __net_exit nfnl_queue_net_exit(struct net *net)
{
+ nf_unregister_queue_handler(net);
#ifdef CONFIG_PROC_FS
remove_proc_entry("nfnetlink_queue", net->nf.proc_netfilter);
#endif
}
+static void nfnl_queue_net_exit_batch(struct list_head *net_exit_list)
+{
+ synchronize_rcu();
+}
+
static struct pernet_operations nfnl_queue_net_ops = {
- .init = nfnl_queue_net_init,
- .exit = nfnl_queue_net_exit,
- .id = &nfnl_queue_net_id,
- .size = sizeof(struct nfnl_queue_net),
+ .init = nfnl_queue_net_init,
+ .exit = nfnl_queue_net_exit,
+ .exit_batch = nfnl_queue_net_exit_batch,
+ .id = &nfnl_queue_net_id,
+ .size = sizeof(struct nfnl_queue_net),
};
static int __init nfnetlink_queue_init(void)
@@ -1412,7 +1525,6 @@ static int __init nfnetlink_queue_init(void)
}
register_netdevice_notifier(&nfqnl_dev_notifier);
- nf_register_queue_handler(&nfqh);
return status;
cleanup_netlink_notifier:
@@ -1424,7 +1536,6 @@ out:
static void __exit nfnetlink_queue_fini(void)
{
- nf_unregister_queue_handler();
unregister_netdevice_notifier(&nfqnl_dev_notifier);
nfnetlink_subsys_unregister(&nfqnl_subsys);
netlink_unregister_notifier(&nfqnl_rtnl_notifier);
diff --git a/net/netfilter/nft_counter.c b/net/netfilter/nft_counter.c
index c9743f78f..77db8358a 100644
--- a/net/netfilter/nft_counter.c
+++ b/net/netfilter/nft_counter.c
@@ -76,8 +76,10 @@ static int nft_counter_dump(struct sk_buff *skb, const struct nft_expr *expr)
nft_counter_fetch(priv->counter, &total);
- if (nla_put_be64(skb, NFTA_COUNTER_BYTES, cpu_to_be64(total.bytes)) ||
- nla_put_be64(skb, NFTA_COUNTER_PACKETS, cpu_to_be64(total.packets)))
+ if (nla_put_be64(skb, NFTA_COUNTER_BYTES, cpu_to_be64(total.bytes),
+ NFTA_COUNTER_PAD) ||
+ nla_put_be64(skb, NFTA_COUNTER_PACKETS, cpu_to_be64(total.packets),
+ NFTA_COUNTER_PAD))
goto nla_put_failure;
return 0;
diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c
index d4a4619fc..81fbb4507 100644
--- a/net/netfilter/nft_ct.c
+++ b/net/netfilter/nft_ct.c
@@ -54,7 +54,6 @@ static void nft_ct_get_eval(const struct nft_expr *expr,
const struct nf_conn_help *help;
const struct nf_conntrack_tuple *tuple;
const struct nf_conntrack_helper *helper;
- long diff;
unsigned int state;
ct = nf_ct_get(pkt->skb, &ctinfo);
@@ -94,10 +93,7 @@ static void nft_ct_get_eval(const struct nft_expr *expr,
return;
#endif
case NFT_CT_EXPIRATION:
- diff = (long)jiffies - (long)ct->timeout.expires;
- if (diff < 0)
- diff = 0;
- *dest = jiffies_to_msecs(diff);
+ *dest = jiffies_to_msecs(nf_ct_expires(ct));
return;
case NFT_CT_HELPER:
if (ct->master == NULL)
@@ -198,6 +194,14 @@ static void nft_ct_set_eval(const struct nft_expr *expr,
}
break;
#endif
+#ifdef CONFIG_NF_CONNTRACK_LABELS
+ case NFT_CT_LABELS:
+ nf_connlabels_replace(ct,
+ &regs->data[priv->sreg],
+ &regs->data[priv->sreg],
+ NF_CT_LABELS_MAX_SIZE / sizeof(u32));
+ break;
+#endif
default:
break;
}
@@ -365,6 +369,16 @@ static int nft_ct_set_init(const struct nft_ctx *ctx,
len = FIELD_SIZEOF(struct nf_conn, mark);
break;
#endif
+#ifdef CONFIG_NF_CONNTRACK_LABELS
+ case NFT_CT_LABELS:
+ if (tb[NFTA_CT_DIRECTION])
+ return -EINVAL;
+ len = NF_CT_LABELS_MAX_SIZE;
+ err = nf_connlabels_get(ctx->net, (len * BITS_PER_BYTE) - 1);
+ if (err)
+ return err;
+ break;
+#endif
default:
return -EOPNOTSUPP;
}
@@ -384,6 +398,18 @@ static int nft_ct_set_init(const struct nft_ctx *ctx,
static void nft_ct_destroy(const struct nft_ctx *ctx,
const struct nft_expr *expr)
{
+ struct nft_ct *priv = nft_expr_priv(expr);
+
+ switch (priv->key) {
+#ifdef CONFIG_NF_CONNTRACK_LABELS
+ case NFT_CT_LABELS:
+ nf_connlabels_put(ctx->net);
+ break;
+#endif
+ default:
+ break;
+ }
+
nft_ct_l3proto_module_put(ctx->afi->family);
}
@@ -484,6 +510,8 @@ static struct nft_expr_type nft_ct_type __read_mostly = {
static int __init nft_ct_module_init(void)
{
+ BUILD_BUG_ON(NF_CT_LABELS_MAX_SIZE > NFT_REG_SIZE);
+
return nft_register_expr(&nft_ct_type);
}
diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c
index 9dec3bd1b..78d4914fb 100644
--- a/net/netfilter/nft_dynset.c
+++ b/net/netfilter/nft_dynset.c
@@ -227,7 +227,8 @@ static int nft_dynset_dump(struct sk_buff *skb, const struct nft_expr *expr)
goto nla_put_failure;
if (nla_put_string(skb, NFTA_DYNSET_SET_NAME, priv->set->name))
goto nla_put_failure;
- if (nla_put_be64(skb, NFTA_DYNSET_TIMEOUT, cpu_to_be64(priv->timeout)))
+ if (nla_put_be64(skb, NFTA_DYNSET_TIMEOUT, cpu_to_be64(priv->timeout),
+ NFTA_DYNSET_PAD))
goto nla_put_failure;
if (priv->expr && nft_expr_dump(skb, NFTA_DYNSET_EXPR, priv->expr))
goto nla_put_failure;
diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c
index 3f9d45d3d..f39c53a15 100644
--- a/net/netfilter/nft_hash.c
+++ b/net/netfilter/nft_hash.c
@@ -189,10 +189,9 @@ static void nft_hash_walk(const struct nft_ctx *ctx, const struct nft_set *set,
struct nft_hash_elem *he;
struct rhashtable_iter hti;
struct nft_set_elem elem;
- u8 genmask = nft_genmask_cur(read_pnet(&set->pnet));
int err;
- err = rhashtable_walk_init(&priv->ht, &hti);
+ err = rhashtable_walk_init(&priv->ht, &hti, GFP_KERNEL);
iter->err = err;
if (err)
return;
@@ -218,7 +217,7 @@ static void nft_hash_walk(const struct nft_ctx *ctx, const struct nft_set *set,
goto cont;
if (nft_set_elem_expired(&he->ext))
goto cont;
- if (!nft_set_elem_active(&he->ext, genmask))
+ if (!nft_set_elem_active(&he->ext, iter->genmask))
goto cont;
elem.priv = he;
@@ -248,7 +247,7 @@ static void nft_hash_gc(struct work_struct *work)
priv = container_of(work, struct nft_hash, gc_work.work);
set = nft_set_container_of(priv);
- err = rhashtable_walk_init(&priv->ht, &hti);
+ err = rhashtable_walk_init(&priv->ht, &hti, GFP_KERNEL);
if (err)
goto schedule;
diff --git a/net/netfilter/nft_limit.c b/net/netfilter/nft_limit.c
index 99d18578a..070b98938 100644
--- a/net/netfilter/nft_limit.c
+++ b/net/netfilter/nft_limit.c
@@ -97,8 +97,10 @@ static int nft_limit_dump(struct sk_buff *skb, const struct nft_limit *limit,
u64 secs = div_u64(limit->nsecs, NSEC_PER_SEC);
u64 rate = limit->rate - limit->burst;
- if (nla_put_be64(skb, NFTA_LIMIT_RATE, cpu_to_be64(rate)) ||
- nla_put_be64(skb, NFTA_LIMIT_UNIT, cpu_to_be64(secs)) ||
+ if (nla_put_be64(skb, NFTA_LIMIT_RATE, cpu_to_be64(rate),
+ NFTA_LIMIT_PAD) ||
+ nla_put_be64(skb, NFTA_LIMIT_UNIT, cpu_to_be64(secs),
+ NFTA_LIMIT_PAD) ||
nla_put_be32(skb, NFTA_LIMIT_BURST, htonl(limit->burst)) ||
nla_put_be32(skb, NFTA_LIMIT_TYPE, htonl(type)) ||
nla_put_be32(skb, NFTA_LIMIT_FLAGS, htonl(flags)))
diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c
index 16c50b0dd..f4bad9dc1 100644
--- a/net/netfilter/nft_meta.c
+++ b/net/netfilter/nft_meta.c
@@ -227,7 +227,7 @@ void nft_meta_set_eval(const struct nft_expr *expr,
skb->pkt_type = value;
break;
case NFT_META_NFTRACE:
- skb->nf_trace = 1;
+ skb->nf_trace = !!value;
break;
default:
WARN_ON(1);
diff --git a/net/netfilter/nft_rbtree.c b/net/netfilter/nft_rbtree.c
index 1c30f41cf..7201d57b5 100644
--- a/net/netfilter/nft_rbtree.c
+++ b/net/netfilter/nft_rbtree.c
@@ -29,6 +29,17 @@ struct nft_rbtree_elem {
struct nft_set_ext ext;
};
+static bool nft_rbtree_interval_end(const struct nft_rbtree_elem *rbe)
+{
+ return nft_set_ext_exists(&rbe->ext, NFT_SET_EXT_FLAGS) &&
+ (*nft_set_ext_flags(&rbe->ext) & NFT_SET_ELEM_INTERVAL_END);
+}
+
+static bool nft_rbtree_equal(const struct nft_set *set, const void *this,
+ const struct nft_rbtree_elem *interval)
+{
+ return memcmp(this, nft_set_ext_key(&interval->ext), set->klen) == 0;
+}
static bool nft_rbtree_lookup(const struct nft_set *set, const u32 *key,
const struct nft_set_ext **ext)
@@ -37,6 +48,7 @@ static bool nft_rbtree_lookup(const struct nft_set *set, const u32 *key,
const struct nft_rbtree_elem *rbe, *interval = NULL;
const struct rb_node *parent;
u8 genmask = nft_genmask_cur(read_pnet(&set->pnet));
+ const void *this;
int d;
spin_lock_bh(&nft_rbtree_lock);
@@ -44,9 +56,16 @@ static bool nft_rbtree_lookup(const struct nft_set *set, const u32 *key,
while (parent != NULL) {
rbe = rb_entry(parent, struct nft_rbtree_elem, node);
- d = memcmp(nft_set_ext_key(&rbe->ext), key, set->klen);
+ this = nft_set_ext_key(&rbe->ext);
+ d = memcmp(this, key, set->klen);
if (d < 0) {
parent = parent->rb_left;
+ /* In case of adjacent ranges, we always see the high
+ * part of the range in first place, before the low one.
+ * So don't update interval if the keys are equal.
+ */
+ if (interval && nft_rbtree_equal(set, this, interval))
+ continue;
interval = rbe;
} else if (d > 0)
parent = parent->rb_right;
@@ -56,9 +75,7 @@ found:
parent = parent->rb_left;
continue;
}
- if (nft_set_ext_exists(&rbe->ext, NFT_SET_EXT_FLAGS) &&
- *nft_set_ext_flags(&rbe->ext) &
- NFT_SET_ELEM_INTERVAL_END)
+ if (nft_rbtree_interval_end(rbe))
goto out;
spin_unlock_bh(&nft_rbtree_lock);
@@ -98,9 +115,16 @@ static int __nft_rbtree_insert(const struct nft_set *set,
else if (d > 0)
p = &parent->rb_right;
else {
- if (nft_set_elem_active(&rbe->ext, genmask))
- return -EEXIST;
- p = &parent->rb_left;
+ if (nft_set_elem_active(&rbe->ext, genmask)) {
+ if (nft_rbtree_interval_end(rbe) &&
+ !nft_rbtree_interval_end(new))
+ p = &parent->rb_left;
+ else if (!nft_rbtree_interval_end(rbe) &&
+ nft_rbtree_interval_end(new))
+ p = &parent->rb_right;
+ else
+ return -EEXIST;
+ }
}
}
rb_link_node(&new->node, parent, p);
@@ -145,7 +169,7 @@ static void *nft_rbtree_deactivate(const struct nft_set *set,
{
const struct nft_rbtree *priv = nft_set_priv(set);
const struct rb_node *parent = priv->root.rb_node;
- struct nft_rbtree_elem *rbe;
+ struct nft_rbtree_elem *rbe, *this = elem->priv;
u8 genmask = nft_genmask_cur(read_pnet(&set->pnet));
int d;
@@ -163,6 +187,15 @@ static void *nft_rbtree_deactivate(const struct nft_set *set,
parent = parent->rb_left;
continue;
}
+ if (nft_rbtree_interval_end(rbe) &&
+ !nft_rbtree_interval_end(this)) {
+ parent = parent->rb_left;
+ continue;
+ } else if (!nft_rbtree_interval_end(rbe) &&
+ nft_rbtree_interval_end(this)) {
+ parent = parent->rb_right;
+ continue;
+ }
nft_set_elem_change_active(set, &rbe->ext);
return rbe;
}
@@ -178,7 +211,6 @@ static void nft_rbtree_walk(const struct nft_ctx *ctx,
struct nft_rbtree_elem *rbe;
struct nft_set_elem elem;
struct rb_node *node;
- u8 genmask = nft_genmask_cur(read_pnet(&set->pnet));
spin_lock_bh(&nft_rbtree_lock);
for (node = rb_first(&priv->root); node != NULL; node = rb_next(node)) {
@@ -186,7 +218,7 @@ static void nft_rbtree_walk(const struct nft_ctx *ctx,
if (iter->count < iter->skip)
goto cont;
- if (!nft_set_elem_active(&rbe->ext, genmask))
+ if (!nft_set_elem_active(&rbe->ext, iter->genmask))
goto cont;
elem.priv = rbe;
diff --git a/net/netfilter/xt_connlabel.c b/net/netfilter/xt_connlabel.c
index bb9cbeb18..a79af2555 100644
--- a/net/netfilter/xt_connlabel.c
+++ b/net/netfilter/xt_connlabel.c
@@ -18,6 +18,16 @@ MODULE_DESCRIPTION("Xtables: add/match connection trackling labels");
MODULE_ALIAS("ipt_connlabel");
MODULE_ALIAS("ip6t_connlabel");
+static bool connlabel_match(const struct nf_conn *ct, u16 bit)
+{
+ struct nf_conn_labels *labels = nf_ct_labels_find(ct);
+
+ if (!labels)
+ return false;
+
+ return BIT_WORD(bit) < labels->words && test_bit(bit, labels->bits);
+}
+
static bool
connlabel_mt(const struct sk_buff *skb, struct xt_action_param *par)
{
@@ -33,7 +43,7 @@ connlabel_mt(const struct sk_buff *skb, struct xt_action_param *par)
if (info->options & XT_CONNLABEL_OP_SET)
return (nf_connlabel_set(ct, info->bit) == 0) ^ invert;
- return nf_connlabel_match(ct, info->bit) ^ invert;
+ return connlabel_match(ct, info->bit) ^ invert;
}
static int connlabel_mt_check(const struct xt_mtchk_param *par)
@@ -55,7 +65,7 @@ static int connlabel_mt_check(const struct xt_mtchk_param *par)
return ret;
}
- ret = nf_connlabels_get(par->net, info->bit + 1);
+ ret = nf_connlabels_get(par->net, info->bit);
if (ret < 0)
nf_ct_l3proto_module_put(par->family);
return ret;
diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c
index 49d14ecad..b10ade272 100644
--- a/net/netfilter/xt_socket.c
+++ b/net/netfilter/xt_socket.c
@@ -120,9 +120,9 @@ xt_socket_get_sock_v4(struct net *net, struct sk_buff *skb, const int doff,
{
switch (protocol) {
case IPPROTO_TCP:
- return __inet_lookup(net, &tcp_hashinfo, skb, doff,
- saddr, sport, daddr, dport,
- in->ifindex);
+ return inet_lookup(net, &tcp_hashinfo, skb, doff,
+ saddr, sport, daddr, dport,
+ in->ifindex);
case IPPROTO_UDP:
return udp4_lib_lookup(net, saddr, sport, daddr, dport,
in->ifindex);
diff --git a/net/netlabel/netlabel_kapi.c b/net/netlabel/netlabel_kapi.c
index 28cddc85b..1325776da 100644
--- a/net/netlabel/netlabel_kapi.c
+++ b/net/netlabel/netlabel_kapi.c
@@ -677,7 +677,7 @@ int netlbl_catmap_setrng(struct netlbl_lsm_catmap **catmap,
u32 spot = start;
while (rc == 0 && spot <= end) {
- if (((spot & (BITS_PER_LONG - 1)) != 0) &&
+ if (((spot & (BITS_PER_LONG - 1)) == 0) &&
((end - spot) > BITS_PER_LONG)) {
rc = netlbl_catmap_setlong(catmap,
spot,
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index f48e3b3ae..627f898c0 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -2346,7 +2346,8 @@ static int netlink_walk_start(struct nl_seq_iter *iter)
{
int err;
- err = rhashtable_walk_init(&nl_table[iter->link].hash, &iter->hti);
+ err = rhashtable_walk_init(&nl_table[iter->link].hash, &iter->hti,
+ GFP_KERNEL);
if (err) {
iter->link = MAX_LINKS;
return err;
diff --git a/net/nfc/nci/core.c b/net/nfc/nci/core.c
index fbb7a2b57..61fff4224 100644
--- a/net/nfc/nci/core.c
+++ b/net/nfc/nci/core.c
@@ -64,18 +64,26 @@ struct nci_conn_info *nci_get_conn_info_by_conn_id(struct nci_dev *ndev,
return NULL;
}
-int nci_get_conn_info_by_id(struct nci_dev *ndev, u8 id)
+int nci_get_conn_info_by_dest_type_params(struct nci_dev *ndev, u8 dest_type,
+ struct dest_spec_params *params)
{
struct nci_conn_info *conn_info;
list_for_each_entry(conn_info, &ndev->conn_info_list, list) {
- if (conn_info->id == id)
- return conn_info->conn_id;
+ if (conn_info->dest_type == dest_type) {
+ if (!params)
+ return conn_info->conn_id;
+ if (conn_info) {
+ if (params->id == conn_info->dest_params->id &&
+ params->protocol == conn_info->dest_params->protocol)
+ return conn_info->conn_id;
+ }
+ }
}
return -EINVAL;
}
-EXPORT_SYMBOL(nci_get_conn_info_by_id);
+EXPORT_SYMBOL(nci_get_conn_info_by_dest_type_params);
/* ---- NCI requests ---- */
@@ -392,6 +400,83 @@ int nci_core_init(struct nci_dev *ndev)
}
EXPORT_SYMBOL(nci_core_init);
+struct nci_loopback_data {
+ u8 conn_id;
+ struct sk_buff *data;
+};
+
+static void nci_send_data_req(struct nci_dev *ndev, unsigned long opt)
+{
+ struct nci_loopback_data *data = (struct nci_loopback_data *)opt;
+
+ nci_send_data(ndev, data->conn_id, data->data);
+}
+
+static void nci_nfcc_loopback_cb(void *context, struct sk_buff *skb, int err)
+{
+ struct nci_dev *ndev = (struct nci_dev *)context;
+ struct nci_conn_info *conn_info;
+
+ conn_info = nci_get_conn_info_by_conn_id(ndev, ndev->cur_conn_id);
+ if (!conn_info) {
+ nci_req_complete(ndev, NCI_STATUS_REJECTED);
+ return;
+ }
+
+ conn_info->rx_skb = skb;
+
+ nci_req_complete(ndev, NCI_STATUS_OK);
+}
+
+int nci_nfcc_loopback(struct nci_dev *ndev, void *data, size_t data_len,
+ struct sk_buff **resp)
+{
+ int r;
+ struct nci_loopback_data loopback_data;
+ struct nci_conn_info *conn_info;
+ struct sk_buff *skb;
+ int conn_id = nci_get_conn_info_by_dest_type_params(ndev,
+ NCI_DESTINATION_NFCC_LOOPBACK, NULL);
+
+ if (conn_id < 0) {
+ r = nci_core_conn_create(ndev, NCI_DESTINATION_NFCC_LOOPBACK,
+ 0, 0, NULL);
+ if (r != NCI_STATUS_OK)
+ return r;
+
+ conn_id = nci_get_conn_info_by_dest_type_params(ndev,
+ NCI_DESTINATION_NFCC_LOOPBACK,
+ NULL);
+ }
+
+ conn_info = nci_get_conn_info_by_conn_id(ndev, conn_id);
+ if (!conn_info)
+ return -EPROTO;
+
+ /* store cb and context to be used on receiving data */
+ conn_info->data_exchange_cb = nci_nfcc_loopback_cb;
+ conn_info->data_exchange_cb_context = ndev;
+
+ skb = nci_skb_alloc(ndev, NCI_DATA_HDR_SIZE + data_len, GFP_KERNEL);
+ if (!skb)
+ return -ENOMEM;
+
+ skb_reserve(skb, NCI_DATA_HDR_SIZE);
+ memcpy(skb_put(skb, data_len), data, data_len);
+
+ loopback_data.conn_id = conn_id;
+ loopback_data.data = skb;
+
+ ndev->cur_conn_id = conn_id;
+ r = nci_request(ndev, nci_send_data_req, (unsigned long)&loopback_data,
+ msecs_to_jiffies(NCI_DATA_TIMEOUT));
+ if (r == NCI_STATUS_OK && resp)
+ *resp = conn_info->rx_skb;
+
+ return r;
+}
+EXPORT_SYMBOL(nci_nfcc_loopback);
+
static int nci_open_device(struct nci_dev *ndev)
{
int rc = 0;
@@ -610,9 +695,6 @@ int nci_core_conn_create(struct nci_dev *ndev, u8 destination_type,
struct nci_core_conn_create_cmd *cmd;
struct core_conn_create_data data;
- if (!number_destination_params)
- return -EINVAL;
-
data.length = params_len + sizeof(struct nci_core_conn_create_cmd);
cmd = kzalloc(data.length, GFP_KERNEL);
if (!cmd)
@@ -620,17 +702,23 @@ int nci_core_conn_create(struct nci_dev *ndev, u8 destination_type,
cmd->destination_type = destination_type;
cmd->number_destination_params = number_destination_params;
- memcpy(cmd->params, params, params_len);
data.cmd = cmd;
- if (params->length > 0)
- ndev->cur_id = params->value[DEST_SPEC_PARAMS_ID_INDEX];
- else
- ndev->cur_id = 0;
+ if (params) {
+ memcpy(cmd->params, params, params_len);
+ if (params->length > 0)
+ memcpy(&ndev->cur_params,
+ &params->value[DEST_SPEC_PARAMS_ID_INDEX],
+ sizeof(struct dest_spec_params));
+ else
+ ndev->cur_params.id = 0;
+ } else {
+ ndev->cur_params.id = 0;
+ }
+ ndev->cur_dest_type = destination_type;
- r = __nci_request(ndev, nci_core_conn_create_req,
- (unsigned long)&data,
+ r = __nci_request(ndev, nci_core_conn_create_req, (unsigned long)&data,
msecs_to_jiffies(NCI_CMD_TIMEOUT));
kfree(cmd);
return r;
@@ -646,6 +734,7 @@ static void nci_core_conn_close_req(struct nci_dev *ndev, unsigned long opt)
int nci_core_conn_close(struct nci_dev *ndev, u8 conn_id)
{
+ ndev->cur_conn_id = conn_id;
return __nci_request(ndev, nci_core_conn_close_req, conn_id,
msecs_to_jiffies(NCI_CMD_TIMEOUT));
}
diff --git a/net/nfc/nci/ntf.c b/net/nfc/nci/ntf.c
index 2ada2b39e..1e8c1a12a 100644
--- a/net/nfc/nci/ntf.c
+++ b/net/nfc/nci/ntf.c
@@ -734,7 +734,7 @@ static void nci_nfcee_discover_ntf_packet(struct nci_dev *ndev,
* “HCI Access”, even if the HCI Network contains multiple NFCEEs.
*/
ndev->hci_dev->nfcee_id = nfcee_ntf->nfcee_id;
- ndev->cur_id = nfcee_ntf->nfcee_id;
+ ndev->cur_params.id = nfcee_ntf->nfcee_id;
nci_req_complete(ndev, status);
}
diff --git a/net/nfc/nci/rsp.c b/net/nfc/nci/rsp.c
index 9b6eb913d..e3bbf1937 100644
--- a/net/nfc/nci/rsp.c
+++ b/net/nfc/nci/rsp.c
@@ -226,7 +226,7 @@ static void nci_core_conn_create_rsp_packet(struct nci_dev *ndev,
struct sk_buff *skb)
{
__u8 status = skb->data[0];
- struct nci_conn_info *conn_info;
+ struct nci_conn_info *conn_info = NULL;
struct nci_core_conn_create_rsp *rsp;
pr_debug("status 0x%x\n", status);
@@ -241,7 +241,17 @@ static void nci_core_conn_create_rsp_packet(struct nci_dev *ndev,
goto exit;
}
- conn_info->id = ndev->cur_id;
+ conn_info->dest_params = devm_kzalloc(&ndev->nfc_dev->dev,
+ sizeof(struct dest_spec_params),
+ GFP_KERNEL);
+ if (!conn_info->dest_params) {
+ status = NCI_STATUS_REJECTED;
+ goto free_conn_info;
+ }
+
+ conn_info->dest_type = ndev->cur_dest_type;
+ conn_info->dest_params->id = ndev->cur_params.id;
+ conn_info->dest_params->protocol = ndev->cur_params.protocol;
conn_info->conn_id = rsp->conn_id;
/* Note: data_exchange_cb and data_exchange_cb_context need to
@@ -251,7 +261,7 @@ static void nci_core_conn_create_rsp_packet(struct nci_dev *ndev,
INIT_LIST_HEAD(&conn_info->list);
list_add(&conn_info->list, &ndev->conn_info_list);
- if (ndev->cur_id == ndev->hci_dev->nfcee_id)
+ if (ndev->cur_params.id == ndev->hci_dev->nfcee_id)
ndev->hci_dev->conn_info = conn_info;
conn_info->conn_id = rsp->conn_id;
@@ -259,7 +269,11 @@ static void nci_core_conn_create_rsp_packet(struct nci_dev *ndev,
atomic_set(&conn_info->credits_cnt, rsp->credits_cnt);
}
+free_conn_info:
+ if (status == NCI_STATUS_REJECTED)
+ devm_kfree(&ndev->nfc_dev->dev, conn_info);
exit:
+
nci_req_complete(ndev, status);
}
@@ -271,7 +285,8 @@ static void nci_core_conn_close_rsp_packet(struct nci_dev *ndev,
pr_debug("status 0x%x\n", status);
if (status == NCI_STATUS_OK) {
- conn_info = nci_get_conn_info_by_conn_id(ndev, ndev->cur_id);
+ conn_info = nci_get_conn_info_by_conn_id(ndev,
+ ndev->cur_conn_id);
if (conn_info) {
list_del(&conn_info->list);
devm_kfree(&ndev->nfc_dev->dev, conn_info);
diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index 879185fe1..9a3eb7a0e 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -137,11 +137,23 @@ static bool is_flow_key_valid(const struct sw_flow_key *key)
return !!key->eth.type;
}
+static void update_ethertype(struct sk_buff *skb, struct ethhdr *hdr,
+ __be16 ethertype)
+{
+ if (skb->ip_summed == CHECKSUM_COMPLETE) {
+ __be16 diff[] = { ~(hdr->h_proto), ethertype };
+
+ skb->csum = ~csum_partial((char *)diff, sizeof(diff),
+ ~skb->csum);
+ }
+
+ hdr->h_proto = ethertype;
+}
+
static int push_mpls(struct sk_buff *skb, struct sw_flow_key *key,
const struct ovs_action_push_mpls *mpls)
{
__be32 *new_mpls_lse;
- struct ethhdr *hdr;
/* Networking stack do not allow simultaneous Tunnel and MPLS GSO. */
if (skb->encapsulation)
@@ -160,9 +172,7 @@ static int push_mpls(struct sk_buff *skb, struct sw_flow_key *key,
skb_postpush_rcsum(skb, new_mpls_lse, MPLS_HLEN);
- hdr = eth_hdr(skb);
- hdr->h_proto = mpls->mpls_ethertype;
-
+ update_ethertype(skb, eth_hdr(skb), mpls->mpls_ethertype);
if (!skb->inner_protocol)
skb_set_inner_protocol(skb, skb->protocol);
skb->protocol = mpls->mpls_ethertype;
@@ -193,7 +203,7 @@ static int pop_mpls(struct sk_buff *skb, struct sw_flow_key *key,
* field correctly in the presence of VLAN tags.
*/
hdr = (struct ethhdr *)(skb_mpls_header(skb) - ETH_HLEN);
- hdr->h_proto = ethertype;
+ update_ethertype(skb, hdr, ethertype);
if (eth_p_mpls(skb->protocol))
skb->protocol = ethertype;
diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
index 10c84d882..d84312584 100644
--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -439,20 +439,12 @@ ovs_ct_find_existing(struct net *net, const struct nf_conntrack_zone *zone,
u8 protonum;
l3proto = __nf_ct_l3proto_find(l3num);
- if (!l3proto) {
- pr_debug("ovs_ct_find_existing: Can't get l3proto\n");
- return NULL;
- }
if (l3proto->get_l4proto(skb, skb_network_offset(skb), &dataoff,
&protonum) <= 0) {
pr_debug("ovs_ct_find_existing: Can't get protonum\n");
return NULL;
}
l4proto = __nf_ct_l4proto_find(l3num, protonum);
- if (!l4proto) {
- pr_debug("ovs_ct_find_existing: Can't get l4proto\n");
- return NULL;
- }
if (!nf_ct_get_tuple(skb, skb_network_offset(skb), dataoff, l3num,
protonum, net, &tuple, l3proto, l4proto)) {
pr_debug("ovs_ct_find_existing: Can't get tuple\n");
@@ -826,8 +818,18 @@ static int ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
*/
state = OVS_CS_F_TRACKED | OVS_CS_F_NEW | OVS_CS_F_RELATED;
__ovs_ct_update_key(key, state, &info->zone, exp->master);
- } else
- return __ovs_ct_lookup(net, key, info, skb);
+ } else {
+ struct nf_conn *ct;
+ int err;
+
+ err = __ovs_ct_lookup(net, key, info, skb);
+ if (err)
+ return err;
+
+ ct = (struct nf_conn *)skb->nfct;
+ if (ct)
+ nf_ct_deliver_cached_events(ct);
+ }
return 0;
}
@@ -1358,7 +1360,7 @@ void ovs_ct_init(struct net *net)
unsigned int n_bits = sizeof(struct ovs_key_ct_labels) * BITS_PER_BYTE;
struct ovs_net *ovs_net = net_generic(net, ovs_net_id);
- if (nf_connlabels_get(net, n_bits)) {
+ if (nf_connlabels_get(net, n_bits - 1)) {
ovs_net->xt_label = false;
OVS_NLERR(true, "Failed to set connlabel length");
} else {
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index 0cc66a4e4..856bd8dba 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -738,9 +738,9 @@ static size_t ovs_flow_cmd_msg_size(const struct sw_flow_actions *acts,
len += nla_total_size(acts->orig_len);
return len
- + nla_total_size(sizeof(struct ovs_flow_stats)) /* OVS_FLOW_ATTR_STATS */
+ + nla_total_size_64bit(sizeof(struct ovs_flow_stats)) /* OVS_FLOW_ATTR_STATS */
+ nla_total_size(1) /* OVS_FLOW_ATTR_TCP_FLAGS */
- + nla_total_size(8); /* OVS_FLOW_ATTR_USED */
+ + nla_total_size_64bit(8); /* OVS_FLOW_ATTR_USED */
}
/* Called with ovs_mutex or RCU read lock. */
@@ -754,11 +754,14 @@ static int ovs_flow_cmd_fill_stats(const struct sw_flow *flow,
ovs_flow_stats_get(flow, &stats, &used, &tcp_flags);
if (used &&
- nla_put_u64(skb, OVS_FLOW_ATTR_USED, ovs_flow_used_time(used)))
+ nla_put_u64_64bit(skb, OVS_FLOW_ATTR_USED, ovs_flow_used_time(used),
+ OVS_FLOW_ATTR_PAD))
return -EMSGSIZE;
if (stats.n_packets &&
- nla_put(skb, OVS_FLOW_ATTR_STATS, sizeof(struct ovs_flow_stats), &stats))
+ nla_put_64bit(skb, OVS_FLOW_ATTR_STATS,
+ sizeof(struct ovs_flow_stats), &stats,
+ OVS_FLOW_ATTR_PAD))
return -EMSGSIZE;
if ((u8)ntohs(tcp_flags) &&
@@ -1434,8 +1437,8 @@ static size_t ovs_dp_cmd_msg_size(void)
size_t msgsize = NLMSG_ALIGN(sizeof(struct ovs_header));
msgsize += nla_total_size(IFNAMSIZ);
- msgsize += nla_total_size(sizeof(struct ovs_dp_stats));
- msgsize += nla_total_size(sizeof(struct ovs_dp_megaflow_stats));
+ msgsize += nla_total_size_64bit(sizeof(struct ovs_dp_stats));
+ msgsize += nla_total_size_64bit(sizeof(struct ovs_dp_megaflow_stats));
msgsize += nla_total_size(sizeof(u32)); /* OVS_DP_ATTR_USER_FEATURES */
return msgsize;
@@ -1462,13 +1465,13 @@ static int ovs_dp_cmd_fill_info(struct datapath *dp, struct sk_buff *skb,
goto nla_put_failure;
get_dp_stats(dp, &dp_stats, &dp_megaflow_stats);
- if (nla_put(skb, OVS_DP_ATTR_STATS, sizeof(struct ovs_dp_stats),
- &dp_stats))
+ if (nla_put_64bit(skb, OVS_DP_ATTR_STATS, sizeof(struct ovs_dp_stats),
+ &dp_stats, OVS_DP_ATTR_PAD))
goto nla_put_failure;
- if (nla_put(skb, OVS_DP_ATTR_MEGAFLOW_STATS,
- sizeof(struct ovs_dp_megaflow_stats),
- &dp_megaflow_stats))
+ if (nla_put_64bit(skb, OVS_DP_ATTR_MEGAFLOW_STATS,
+ sizeof(struct ovs_dp_megaflow_stats),
+ &dp_megaflow_stats, OVS_DP_ATTR_PAD))
goto nla_put_failure;
if (nla_put_u32(skb, OVS_DP_ATTR_USER_FEATURES, dp->user_features))
@@ -1837,8 +1840,9 @@ static int ovs_vport_cmd_fill_info(struct vport *vport, struct sk_buff *skb,
goto nla_put_failure;
ovs_vport_get_stats(vport, &vport_stats);
- if (nla_put(skb, OVS_VPORT_ATTR_STATS, sizeof(struct ovs_vport_stats),
- &vport_stats))
+ if (nla_put_64bit(skb, OVS_VPORT_ATTR_STATS,
+ sizeof(struct ovs_vport_stats), &vport_stats,
+ OVS_VPORT_ATTR_PAD))
goto nla_put_failure;
if (ovs_vport_get_upcall_portids(vport, skb))
diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
index 689c17264..0bb650f4f 100644
--- a/net/openvswitch/flow_netlink.c
+++ b/net/openvswitch/flow_netlink.c
@@ -261,7 +261,7 @@ size_t ovs_tun_key_attr_size(void)
/* Whenever adding new OVS_TUNNEL_KEY_ FIELDS, we should consider
* updating this function.
*/
- return nla_total_size(8) /* OVS_TUNNEL_KEY_ATTR_ID */
+ return nla_total_size_64bit(8) /* OVS_TUNNEL_KEY_ATTR_ID */
+ nla_total_size(16) /* OVS_TUNNEL_KEY_ATTR_IPV[46]_SRC */
+ nla_total_size(16) /* OVS_TUNNEL_KEY_ATTR_IPV[46]_DST */
+ nla_total_size(1) /* OVS_TUNNEL_KEY_ATTR_TOS */
@@ -720,7 +720,8 @@ static int __ip_tun_to_nlattr(struct sk_buff *skb,
unsigned short tun_proto)
{
if (output->tun_flags & TUNNEL_KEY &&
- nla_put_be64(skb, OVS_TUNNEL_KEY_ATTR_ID, output->tun_id))
+ nla_put_be64(skb, OVS_TUNNEL_KEY_ATTR_ID, output->tun_id,
+ OVS_TUNNEL_KEY_ATTR_PAD))
return -EMSGSIZE;
switch (tun_proto) {
case AF_INET:
diff --git a/net/openvswitch/vport-internal_dev.c b/net/openvswitch/vport-internal_dev.c
index 7c8b90bf0..2ee48e447 100644
--- a/net/openvswitch/vport-internal_dev.c
+++ b/net/openvswitch/vport-internal_dev.c
@@ -165,11 +165,10 @@ static void do_setup(struct net_device *netdev)
netdev->priv_flags &= ~IFF_TX_SKB_SHARING;
netdev->priv_flags |= IFF_LIVE_ADDR_CHANGE | IFF_OPENVSWITCH |
- IFF_PHONY_HEADROOM;
+ IFF_PHONY_HEADROOM | IFF_NO_QUEUE;
netdev->destructor = internal_dev_destructor;
netdev->ethtool_ops = &internal_dev_ethtool_ops;
netdev->rtnl_link_ops = &internal_dev_link_ops;
- netdev->tx_queue_len = 0;
netdev->features = NETIF_F_LLTX | NETIF_F_SG | NETIF_F_FRAGLIST |
NETIF_F_HIGHDMA | NETIF_F_HW_CSUM |
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 8012f67ca..b43c4015b 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -93,6 +93,7 @@
#include <net/inet_common.h>
#endif
#include <linux/bpf.h>
+#include <net/compat.h>
#include "internal.h"
@@ -1837,6 +1838,7 @@ static int packet_sendmsg_spkt(struct socket *sock, struct msghdr *msg,
DECLARE_SOCKADDR(struct sockaddr_pkt *, saddr, msg->msg_name);
struct sk_buff *skb = NULL;
struct net_device *dev;
+ struct sockcm_cookie sockc;
__be16 proto = 0;
int err;
int extra_len = 0;
@@ -1925,12 +1927,19 @@ retry:
goto out_unlock;
}
+ sockc.tsflags = sk->sk_tsflags;
+ if (msg->msg_controllen) {
+ err = sock_cmsg_send(sk, msg, &sockc);
+ if (unlikely(err))
+ goto out_unlock;
+ }
+
skb->protocol = proto;
skb->dev = dev;
skb->priority = sk->sk_priority;
skb->mark = sk->sk_mark;
- sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
+ sock_tx_timestamp(sk, sockc.tsflags, &skb_shinfo(skb)->tx_flags);
if (unlikely(extra_len == 4))
skb->no_fcs = 1;
@@ -2042,6 +2051,7 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
u8 *skb_head = skb->data;
int skb_len = skb->len;
unsigned int snaplen, res;
+ bool is_drop_n_account = false;
if (skb->pkt_type == PACKET_LOOPBACK)
goto drop;
@@ -2130,6 +2140,7 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
return 0;
drop_n_acct:
+ is_drop_n_account = true;
spin_lock(&sk->sk_receive_queue.lock);
po->stats.stats1.tp_drops++;
atomic_inc(&sk->sk_drops);
@@ -2141,7 +2152,10 @@ drop_n_restore:
skb->len = skb_len;
}
drop:
- consume_skb(skb);
+ if (!is_drop_n_account)
+ consume_skb(skb);
+ else
+ kfree_skb(skb);
return 0;
}
@@ -2160,6 +2174,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
struct sk_buff *copy_skb = NULL;
struct timespec ts;
__u32 ts_status;
+ bool is_drop_n_account = false;
/* struct tpacket{2,3}_hdr is aligned to a multiple of TPACKET_ALIGNMENT.
* We may add members to them until current aligned size without forcing
@@ -2367,10 +2382,14 @@ drop_n_restore:
skb->len = skb_len;
}
drop:
- kfree_skb(skb);
+ if (!is_drop_n_account)
+ consume_skb(skb);
+ else
+ kfree_skb(skb);
return 0;
drop_n_account:
+ is_drop_n_account = true;
po->stats.stats1.tp_drops++;
spin_unlock(&sk->sk_receive_queue.lock);
@@ -2486,7 +2505,8 @@ static int packet_snd_vnet_gso(struct sk_buff *skb,
static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
void *frame, struct net_device *dev, void *data, int tp_len,
- __be16 proto, unsigned char *addr, int hlen, int copylen)
+ __be16 proto, unsigned char *addr, int hlen, int copylen,
+ const struct sockcm_cookie *sockc)
{
union tpacket_uhdr ph;
int to_write, offset, len, nr_frags, len_max;
@@ -2500,7 +2520,7 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
skb->dev = dev;
skb->priority = po->sk.sk_priority;
skb->mark = po->sk.sk_mark;
- sock_tx_timestamp(&po->sk, &skb_shinfo(skb)->tx_flags);
+ sock_tx_timestamp(&po->sk, sockc->tsflags, &skb_shinfo(skb)->tx_flags);
skb_shinfo(skb)->destructor_arg = ph.raw;
skb_reserve(skb, hlen);
@@ -2624,6 +2644,7 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
struct sk_buff *skb;
struct net_device *dev;
struct virtio_net_hdr *vnet_hdr = NULL;
+ struct sockcm_cookie sockc;
__be16 proto;
int err, reserve = 0;
void *ph;
@@ -2655,6 +2676,13 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
dev = dev_get_by_index(sock_net(&po->sk), saddr->sll_ifindex);
}
+ sockc.tsflags = po->sk.sk_tsflags;
+ if (msg->msg_controllen) {
+ err = sock_cmsg_send(&po->sk, msg, &sockc);
+ if (unlikely(err))
+ goto out;
+ }
+
err = -ENXIO;
if (unlikely(dev == NULL))
goto out;
@@ -2712,7 +2740,7 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
goto out_status;
}
tp_len = tpacket_fill_skb(po, skb, ph, dev, data, tp_len, proto,
- addr, hlen, copylen);
+ addr, hlen, copylen, &sockc);
if (likely(tp_len >= 0) &&
tp_len > dev->mtu + reserve &&
!po->has_vnet_hdr &&
@@ -2851,6 +2879,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
if (unlikely(!(dev->flags & IFF_UP)))
goto out_unlock;
+ sockc.tsflags = sk->sk_tsflags;
sockc.mark = sk->sk_mark;
if (msg->msg_controllen) {
err = sock_cmsg_send(sk, msg, &sockc);
@@ -2908,7 +2937,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
goto out_free;
}
- sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
+ sock_tx_timestamp(sk, sockc.tsflags, &skb_shinfo(skb)->tx_flags);
if (!vnet_hdr.gso_type && (len > dev->mtu + reserve + extra_len) &&
!packet_extra_vlan_len_allowed(dev, skb)) {
@@ -3910,6 +3939,27 @@ static int packet_getsockopt(struct socket *sock, int level, int optname,
}
+#ifdef CONFIG_COMPAT
+static int compat_packet_setsockopt(struct socket *sock, int level, int optname,
+ char __user *optval, unsigned int optlen)
+{
+ struct packet_sock *po = pkt_sk(sock->sk);
+
+ if (level != SOL_PACKET)
+ return -ENOPROTOOPT;
+
+ if (optname == PACKET_FANOUT_DATA &&
+ po->fanout && po->fanout->type == PACKET_FANOUT_CBPF) {
+ optval = (char __user *)get_compat_bpf_fprog(optval);
+ if (!optval)
+ return -EFAULT;
+ optlen = sizeof(struct sock_fprog);
+ }
+
+ return packet_setsockopt(sock, level, optname, optval, optlen);
+}
+#endif
+
static int packet_notifier(struct notifier_block *this,
unsigned long msg, void *ptr)
{
@@ -4386,6 +4436,9 @@ static const struct proto_ops packet_ops = {
.shutdown = sock_no_shutdown,
.setsockopt = packet_setsockopt,
.getsockopt = packet_getsockopt,
+#ifdef CONFIG_COMPAT
+ .compat_setsockopt = compat_packet_setsockopt,
+#endif
.sendmsg = packet_sendmsg,
.recvmsg = packet_recvmsg,
.mmap = packet_mmap,
diff --git a/net/qrtr/Kconfig b/net/qrtr/Kconfig
new file mode 100644
index 000000000..b83c6807a
--- /dev/null
+++ b/net/qrtr/Kconfig
@@ -0,0 +1,24 @@
+# Qualcomm IPC Router configuration
+#
+
+config QRTR
+ tristate "Qualcomm IPC Router support"
+ depends on ARCH_QCOM || COMPILE_TEST
+ ---help---
+ Say Y if you intend to use Qualcomm IPC router protocol. The
+ protocol is used to communicate with services provided by other
+ hardware blocks in the system.
+
+ In order to do service lookups, a userspace daemon is required to
+ maintain a service listing.
+
+if QRTR
+
+config QRTR_SMD
+ tristate "SMD IPC Router channels"
+ depends on QCOM_SMD || (COMPILE_TEST && QCOM_SMD=n)
+ ---help---
+ Say Y here to support SMD based ipcrouter channels. SMD is the
+ most common transport for IPC Router.
+
+endif # QRTR
diff --git a/net/qrtr/Makefile b/net/qrtr/Makefile
new file mode 100644
index 000000000..ab09e40f7
--- /dev/null
+++ b/net/qrtr/Makefile
@@ -0,0 +1,4 @@
+obj-$(CONFIG_QRTR) := qrtr.o
+
+obj-$(CONFIG_QRTR_SMD) += qrtr-smd.o
+qrtr-smd-y := smd.o
diff --git a/net/qrtr/qrtr.c b/net/qrtr/qrtr.c
new file mode 100644
index 000000000..c985ecbe9
--- /dev/null
+++ b/net/qrtr/qrtr.c
@@ -0,0 +1,1007 @@
+/*
+ * Copyright (c) 2015, Sony Mobile Communications Inc.
+ * Copyright (c) 2013, The Linux Foundation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/qrtr.h>
+#include <linux/termios.h> /* For TIOCINQ/OUTQ */
+
+#include <net/sock.h>
+
+#include "qrtr.h"
+
+#define QRTR_PROTO_VER 1
+
+/* auto-bind range */
+#define QRTR_MIN_EPH_SOCKET 0x4000
+#define QRTR_MAX_EPH_SOCKET 0x7fff
+
+enum qrtr_pkt_type {
+ QRTR_TYPE_DATA = 1,
+ QRTR_TYPE_HELLO = 2,
+ QRTR_TYPE_BYE = 3,
+ QRTR_TYPE_NEW_SERVER = 4,
+ QRTR_TYPE_DEL_SERVER = 5,
+ QRTR_TYPE_DEL_CLIENT = 6,
+ QRTR_TYPE_RESUME_TX = 7,
+ QRTR_TYPE_EXIT = 8,
+ QRTR_TYPE_PING = 9,
+};
+
+/**
+ * struct qrtr_hdr - (I|R)PCrouter packet header
+ * @version: protocol version
+ * @type: packet type; one of QRTR_TYPE_*
+ * @src_node_id: source node
+ * @src_port_id: source port
+ * @confirm_rx: boolean; whether a resume-tx packet should be send in reply
+ * @size: length of packet, excluding this header
+ * @dst_node_id: destination node
+ * @dst_port_id: destination port
+ */
+struct qrtr_hdr {
+ __le32 version;
+ __le32 type;
+ __le32 src_node_id;
+ __le32 src_port_id;
+ __le32 confirm_rx;
+ __le32 size;
+ __le32 dst_node_id;
+ __le32 dst_port_id;
+} __packed;
+
+#define QRTR_HDR_SIZE sizeof(struct qrtr_hdr)
+#define QRTR_NODE_BCAST ((unsigned int)-1)
+#define QRTR_PORT_CTRL ((unsigned int)-2)
+
+struct qrtr_sock {
+ /* WARNING: sk must be the first member */
+ struct sock sk;
+ struct sockaddr_qrtr us;
+ struct sockaddr_qrtr peer;
+};
+
+static inline struct qrtr_sock *qrtr_sk(struct sock *sk)
+{
+ BUILD_BUG_ON(offsetof(struct qrtr_sock, sk) != 0);
+ return container_of(sk, struct qrtr_sock, sk);
+}
+
+static unsigned int qrtr_local_nid = -1;
+
+/* for node ids */
+static RADIX_TREE(qrtr_nodes, GFP_KERNEL);
+/* broadcast list */
+static LIST_HEAD(qrtr_all_nodes);
+/* lock for qrtr_nodes, qrtr_all_nodes and node reference */
+static DEFINE_MUTEX(qrtr_node_lock);
+
+/* local port allocation management */
+static DEFINE_IDR(qrtr_ports);
+static DEFINE_MUTEX(qrtr_port_lock);
+
+/**
+ * struct qrtr_node - endpoint node
+ * @ep_lock: lock for endpoint management and callbacks
+ * @ep: endpoint
+ * @ref: reference count for node
+ * @nid: node id
+ * @rx_queue: receive queue
+ * @work: scheduled work struct for recv work
+ * @item: list item for broadcast list
+ */
+struct qrtr_node {
+ struct mutex ep_lock;
+ struct qrtr_endpoint *ep;
+ struct kref ref;
+ unsigned int nid;
+
+ struct sk_buff_head rx_queue;
+ struct work_struct work;
+ struct list_head item;
+};
+
+/* Release node resources and free the node.
+ *
+ * Do not call directly, use qrtr_node_release. To be used with
+ * kref_put_mutex. As such, the node mutex is expected to be locked on call.
+ */
+static void __qrtr_node_release(struct kref *kref)
+{
+ struct qrtr_node *node = container_of(kref, struct qrtr_node, ref);
+
+ if (node->nid != QRTR_EP_NID_AUTO)
+ radix_tree_delete(&qrtr_nodes, node->nid);
+
+ list_del(&node->item);
+ mutex_unlock(&qrtr_node_lock);
+
+ skb_queue_purge(&node->rx_queue);
+ kfree(node);
+}
+
+/* Increment reference to node. */
+static struct qrtr_node *qrtr_node_acquire(struct qrtr_node *node)
+{
+ if (node)
+ kref_get(&node->ref);
+ return node;
+}
+
+/* Decrement reference to node and release as necessary. */
+static void qrtr_node_release(struct qrtr_node *node)
+{
+ if (!node)
+ return;
+ kref_put_mutex(&node->ref, __qrtr_node_release, &qrtr_node_lock);
+}
+
+/* Pass an outgoing packet socket buffer to the endpoint driver. */
+static int qrtr_node_enqueue(struct qrtr_node *node, struct sk_buff *skb)
+{
+ int rc = -ENODEV;
+
+ mutex_lock(&node->ep_lock);
+ if (node->ep)
+ rc = node->ep->xmit(node->ep, skb);
+ else
+ kfree_skb(skb);
+ mutex_unlock(&node->ep_lock);
+
+ return rc;
+}
+
+/* Lookup node by id.
+ *
+ * callers must release with qrtr_node_release()
+ */
+static struct qrtr_node *qrtr_node_lookup(unsigned int nid)
+{
+ struct qrtr_node *node;
+
+ mutex_lock(&qrtr_node_lock);
+ node = radix_tree_lookup(&qrtr_nodes, nid);
+ node = qrtr_node_acquire(node);
+ mutex_unlock(&qrtr_node_lock);
+
+ return node;
+}
+
+/* Assign node id to node.
+ *
+ * This is mostly useful for automatic node id assignment, based on
+ * the source id in the incoming packet.
+ */
+static void qrtr_node_assign(struct qrtr_node *node, unsigned int nid)
+{
+ if (node->nid != QRTR_EP_NID_AUTO || nid == QRTR_EP_NID_AUTO)
+ return;
+
+ mutex_lock(&qrtr_node_lock);
+ radix_tree_insert(&qrtr_nodes, nid, node);
+ node->nid = nid;
+ mutex_unlock(&qrtr_node_lock);
+}
+
+/**
+ * qrtr_endpoint_post() - post incoming data
+ * @ep: endpoint handle
+ * @data: data pointer
+ * @len: size of data in bytes
+ *
+ * Return: 0 on success; negative error code on failure
+ */
+int qrtr_endpoint_post(struct qrtr_endpoint *ep, const void *data, size_t len)
+{
+ struct qrtr_node *node = ep->node;
+ const struct qrtr_hdr *phdr = data;
+ struct sk_buff *skb;
+ unsigned int psize;
+ unsigned int size;
+ unsigned int type;
+ unsigned int ver;
+ unsigned int dst;
+
+ if (len < QRTR_HDR_SIZE || len & 3)
+ return -EINVAL;
+
+ ver = le32_to_cpu(phdr->version);
+ size = le32_to_cpu(phdr->size);
+ type = le32_to_cpu(phdr->type);
+ dst = le32_to_cpu(phdr->dst_port_id);
+
+ psize = (size + 3) & ~3;
+
+ if (ver != QRTR_PROTO_VER)
+ return -EINVAL;
+
+ if (len != psize + QRTR_HDR_SIZE)
+ return -EINVAL;
+
+ if (dst != QRTR_PORT_CTRL && type != QRTR_TYPE_DATA)
+ return -EINVAL;
+
+ skb = netdev_alloc_skb(NULL, len);
+ if (!skb)
+ return -ENOMEM;
+
+ skb_reset_transport_header(skb);
+ memcpy(skb_put(skb, len), data, len);
+
+ skb_queue_tail(&node->rx_queue, skb);
+ schedule_work(&node->work);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(qrtr_endpoint_post);
+
+/* Allocate and construct a resume-tx packet. */
+static struct sk_buff *qrtr_alloc_resume_tx(u32 src_node,
+ u32 dst_node, u32 port)
+{
+ const int pkt_len = 20;
+ struct qrtr_hdr *hdr;
+ struct sk_buff *skb;
+ u32 *buf;
+
+ skb = alloc_skb(QRTR_HDR_SIZE + pkt_len, GFP_KERNEL);
+ if (!skb)
+ return NULL;
+ skb_reset_transport_header(skb);
+
+ hdr = (struct qrtr_hdr *)skb_put(skb, QRTR_HDR_SIZE);
+ hdr->version = cpu_to_le32(QRTR_PROTO_VER);
+ hdr->type = cpu_to_le32(QRTR_TYPE_RESUME_TX);
+ hdr->src_node_id = cpu_to_le32(src_node);
+ hdr->src_port_id = cpu_to_le32(QRTR_PORT_CTRL);
+ hdr->confirm_rx = cpu_to_le32(0);
+ hdr->size = cpu_to_le32(pkt_len);
+ hdr->dst_node_id = cpu_to_le32(dst_node);
+ hdr->dst_port_id = cpu_to_le32(QRTR_PORT_CTRL);
+
+ buf = (u32 *)skb_put(skb, pkt_len);
+ memset(buf, 0, pkt_len);
+ buf[0] = cpu_to_le32(QRTR_TYPE_RESUME_TX);
+ buf[1] = cpu_to_le32(src_node);
+ buf[2] = cpu_to_le32(port);
+
+ return skb;
+}
+
+static struct qrtr_sock *qrtr_port_lookup(int port);
+static void qrtr_port_put(struct qrtr_sock *ipc);
+
+/* Handle and route a received packet.
+ *
+ * This will auto-reply with resume-tx packet as necessary.
+ */
+static void qrtr_node_rx_work(struct work_struct *work)
+{
+ struct qrtr_node *node = container_of(work, struct qrtr_node, work);
+ struct sk_buff *skb;
+
+ while ((skb = skb_dequeue(&node->rx_queue)) != NULL) {
+ const struct qrtr_hdr *phdr;
+ u32 dst_node, dst_port;
+ struct qrtr_sock *ipc;
+ u32 src_node;
+ int confirm;
+
+ phdr = (const struct qrtr_hdr *)skb_transport_header(skb);
+ src_node = le32_to_cpu(phdr->src_node_id);
+ dst_node = le32_to_cpu(phdr->dst_node_id);
+ dst_port = le32_to_cpu(phdr->dst_port_id);
+ confirm = !!phdr->confirm_rx;
+
+ qrtr_node_assign(node, src_node);
+
+ ipc = qrtr_port_lookup(dst_port);
+ if (!ipc) {
+ kfree_skb(skb);
+ } else {
+ if (sock_queue_rcv_skb(&ipc->sk, skb))
+ kfree_skb(skb);
+
+ qrtr_port_put(ipc);
+ }
+
+ if (confirm) {
+ skb = qrtr_alloc_resume_tx(dst_node, node->nid, dst_port);
+ if (!skb)
+ break;
+ if (qrtr_node_enqueue(node, skb))
+ break;
+ }
+ }
+}
+
+/**
+ * qrtr_endpoint_register() - register a new endpoint
+ * @ep: endpoint to register
+ * @nid: desired node id; may be QRTR_EP_NID_AUTO for auto-assignment
+ * Return: 0 on success; negative error code on failure
+ *
+ * The specified endpoint must have the xmit function pointer set on call.
+ */
+int qrtr_endpoint_register(struct qrtr_endpoint *ep, unsigned int nid)
+{
+ struct qrtr_node *node;
+
+ if (!ep || !ep->xmit)
+ return -EINVAL;
+
+ node = kzalloc(sizeof(*node), GFP_KERNEL);
+ if (!node)
+ return -ENOMEM;
+
+ INIT_WORK(&node->work, qrtr_node_rx_work);
+ kref_init(&node->ref);
+ mutex_init(&node->ep_lock);
+ skb_queue_head_init(&node->rx_queue);
+ node->nid = QRTR_EP_NID_AUTO;
+ node->ep = ep;
+
+ qrtr_node_assign(node, nid);
+
+ mutex_lock(&qrtr_node_lock);
+ list_add(&node->item, &qrtr_all_nodes);
+ mutex_unlock(&qrtr_node_lock);
+ ep->node = node;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(qrtr_endpoint_register);
+
+/**
+ * qrtr_endpoint_unregister - unregister endpoint
+ * @ep: endpoint to unregister
+ */
+void qrtr_endpoint_unregister(struct qrtr_endpoint *ep)
+{
+ struct qrtr_node *node = ep->node;
+
+ mutex_lock(&node->ep_lock);
+ node->ep = NULL;
+ mutex_unlock(&node->ep_lock);
+
+ qrtr_node_release(node);
+ ep->node = NULL;
+}
+EXPORT_SYMBOL_GPL(qrtr_endpoint_unregister);
+
+/* Lookup socket by port.
+ *
+ * Callers must release with qrtr_port_put()
+ */
+static struct qrtr_sock *qrtr_port_lookup(int port)
+{
+ struct qrtr_sock *ipc;
+
+ if (port == QRTR_PORT_CTRL)
+ port = 0;
+
+ mutex_lock(&qrtr_port_lock);
+ ipc = idr_find(&qrtr_ports, port);
+ if (ipc)
+ sock_hold(&ipc->sk);
+ mutex_unlock(&qrtr_port_lock);
+
+ return ipc;
+}
+
+/* Release acquired socket. */
+static void qrtr_port_put(struct qrtr_sock *ipc)
+{
+ sock_put(&ipc->sk);
+}
+
+/* Remove port assignment. */
+static void qrtr_port_remove(struct qrtr_sock *ipc)
+{
+ int port = ipc->us.sq_port;
+
+ if (port == QRTR_PORT_CTRL)
+ port = 0;
+
+ __sock_put(&ipc->sk);
+
+ mutex_lock(&qrtr_port_lock);
+ idr_remove(&qrtr_ports, port);
+ mutex_unlock(&qrtr_port_lock);
+}
+
+/* Assign port number to socket.
+ *
+ * Specify port in the integer pointed to by port, and it will be adjusted
+ * on return as necesssary.
+ *
+ * Port may be:
+ * 0: Assign ephemeral port in [QRTR_MIN_EPH_SOCKET, QRTR_MAX_EPH_SOCKET]
+ * <QRTR_MIN_EPH_SOCKET: Specified; requires CAP_NET_ADMIN
+ * >QRTR_MIN_EPH_SOCKET: Specified; available to all
+ */
+static int qrtr_port_assign(struct qrtr_sock *ipc, int *port)
+{
+ int rc;
+
+ mutex_lock(&qrtr_port_lock);
+ if (!*port) {
+ rc = idr_alloc(&qrtr_ports, ipc,
+ QRTR_MIN_EPH_SOCKET, QRTR_MAX_EPH_SOCKET + 1,
+ GFP_ATOMIC);
+ if (rc >= 0)
+ *port = rc;
+ } else if (*port < QRTR_MIN_EPH_SOCKET && !capable(CAP_NET_ADMIN)) {
+ rc = -EACCES;
+ } else if (*port == QRTR_PORT_CTRL) {
+ rc = idr_alloc(&qrtr_ports, ipc, 0, 1, GFP_ATOMIC);
+ } else {
+ rc = idr_alloc(&qrtr_ports, ipc, *port, *port + 1, GFP_ATOMIC);
+ if (rc >= 0)
+ *port = rc;
+ }
+ mutex_unlock(&qrtr_port_lock);
+
+ if (rc == -ENOSPC)
+ return -EADDRINUSE;
+ else if (rc < 0)
+ return rc;
+
+ sock_hold(&ipc->sk);
+
+ return 0;
+}
+
+/* Bind socket to address.
+ *
+ * Socket should be locked upon call.
+ */
+static int __qrtr_bind(struct socket *sock,
+ const struct sockaddr_qrtr *addr, int zapped)
+{
+ struct qrtr_sock *ipc = qrtr_sk(sock->sk);
+ struct sock *sk = sock->sk;
+ int port;
+ int rc;
+
+ /* rebinding ok */
+ if (!zapped && addr->sq_port == ipc->us.sq_port)
+ return 0;
+
+ port = addr->sq_port;
+ rc = qrtr_port_assign(ipc, &port);
+ if (rc)
+ return rc;
+
+ /* unbind previous, if any */
+ if (!zapped)
+ qrtr_port_remove(ipc);
+ ipc->us.sq_port = port;
+
+ sock_reset_flag(sk, SOCK_ZAPPED);
+
+ return 0;
+}
+
+/* Auto bind to an ephemeral port. */
+static int qrtr_autobind(struct socket *sock)
+{
+ struct sock *sk = sock->sk;
+ struct sockaddr_qrtr addr;
+
+ if (!sock_flag(sk, SOCK_ZAPPED))
+ return 0;
+
+ addr.sq_family = AF_QIPCRTR;
+ addr.sq_node = qrtr_local_nid;
+ addr.sq_port = 0;
+
+ return __qrtr_bind(sock, &addr, 1);
+}
+
+/* Bind socket to specified sockaddr. */
+static int qrtr_bind(struct socket *sock, struct sockaddr *saddr, int len)
+{
+ DECLARE_SOCKADDR(struct sockaddr_qrtr *, addr, saddr);
+ struct qrtr_sock *ipc = qrtr_sk(sock->sk);
+ struct sock *sk = sock->sk;
+ int rc;
+
+ if (len < sizeof(*addr) || addr->sq_family != AF_QIPCRTR)
+ return -EINVAL;
+
+ if (addr->sq_node != ipc->us.sq_node)
+ return -EINVAL;
+
+ lock_sock(sk);
+ rc = __qrtr_bind(sock, addr, sock_flag(sk, SOCK_ZAPPED));
+ release_sock(sk);
+
+ return rc;
+}
+
+/* Queue packet to local peer socket. */
+static int qrtr_local_enqueue(struct qrtr_node *node, struct sk_buff *skb)
+{
+ const struct qrtr_hdr *phdr;
+ struct qrtr_sock *ipc;
+
+ phdr = (const struct qrtr_hdr *)skb_transport_header(skb);
+
+ ipc = qrtr_port_lookup(le32_to_cpu(phdr->dst_port_id));
+ if (!ipc || &ipc->sk == skb->sk) { /* do not send to self */
+ kfree_skb(skb);
+ return -ENODEV;
+ }
+
+ if (sock_queue_rcv_skb(&ipc->sk, skb)) {
+ qrtr_port_put(ipc);
+ kfree_skb(skb);
+ return -ENOSPC;
+ }
+
+ qrtr_port_put(ipc);
+
+ return 0;
+}
+
+/* Queue packet for broadcast. */
+static int qrtr_bcast_enqueue(struct qrtr_node *node, struct sk_buff *skb)
+{
+ struct sk_buff *skbn;
+
+ mutex_lock(&qrtr_node_lock);
+ list_for_each_entry(node, &qrtr_all_nodes, item) {
+ skbn = skb_clone(skb, GFP_KERNEL);
+ if (!skbn)
+ break;
+ skb_set_owner_w(skbn, skb->sk);
+ qrtr_node_enqueue(node, skbn);
+ }
+ mutex_unlock(&qrtr_node_lock);
+
+ qrtr_local_enqueue(node, skb);
+
+ return 0;
+}
+
+static int qrtr_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
+{
+ DECLARE_SOCKADDR(struct sockaddr_qrtr *, addr, msg->msg_name);
+ int (*enqueue_fn)(struct qrtr_node *, struct sk_buff *);
+ struct qrtr_sock *ipc = qrtr_sk(sock->sk);
+ struct sock *sk = sock->sk;
+ struct qrtr_node *node;
+ struct qrtr_hdr *hdr;
+ struct sk_buff *skb;
+ size_t plen;
+ int rc;
+
+ if (msg->msg_flags & ~(MSG_DONTWAIT))
+ return -EINVAL;
+
+ if (len > 65535)
+ return -EMSGSIZE;
+
+ lock_sock(sk);
+
+ if (addr) {
+ if (msg->msg_namelen < sizeof(*addr)) {
+ release_sock(sk);
+ return -EINVAL;
+ }
+
+ if (addr->sq_family != AF_QIPCRTR) {
+ release_sock(sk);
+ return -EINVAL;
+ }
+
+ rc = qrtr_autobind(sock);
+ if (rc) {
+ release_sock(sk);
+ return rc;
+ }
+ } else if (sk->sk_state == TCP_ESTABLISHED) {
+ addr = &ipc->peer;
+ } else {
+ release_sock(sk);
+ return -ENOTCONN;
+ }
+
+ node = NULL;
+ if (addr->sq_node == QRTR_NODE_BCAST) {
+ enqueue_fn = qrtr_bcast_enqueue;
+ } else if (addr->sq_node == ipc->us.sq_node) {
+ enqueue_fn = qrtr_local_enqueue;
+ } else {
+ enqueue_fn = qrtr_node_enqueue;
+ node = qrtr_node_lookup(addr->sq_node);
+ if (!node) {
+ release_sock(sk);
+ return -ECONNRESET;
+ }
+ }
+
+ plen = (len + 3) & ~3;
+ skb = sock_alloc_send_skb(sk, plen + QRTR_HDR_SIZE,
+ msg->msg_flags & MSG_DONTWAIT, &rc);
+ if (!skb)
+ goto out_node;
+
+ skb_reset_transport_header(skb);
+ skb_put(skb, len + QRTR_HDR_SIZE);
+
+ hdr = (struct qrtr_hdr *)skb_transport_header(skb);
+ hdr->version = cpu_to_le32(QRTR_PROTO_VER);
+ hdr->src_node_id = cpu_to_le32(ipc->us.sq_node);
+ hdr->src_port_id = cpu_to_le32(ipc->us.sq_port);
+ hdr->confirm_rx = cpu_to_le32(0);
+ hdr->size = cpu_to_le32(len);
+ hdr->dst_node_id = cpu_to_le32(addr->sq_node);
+ hdr->dst_port_id = cpu_to_le32(addr->sq_port);
+
+ rc = skb_copy_datagram_from_iter(skb, QRTR_HDR_SIZE,
+ &msg->msg_iter, len);
+ if (rc) {
+ kfree_skb(skb);
+ goto out_node;
+ }
+
+ if (plen != len) {
+ skb_pad(skb, plen - len);
+ skb_put(skb, plen - len);
+ }
+
+ if (ipc->us.sq_port == QRTR_PORT_CTRL) {
+ if (len < 4) {
+ rc = -EINVAL;
+ kfree_skb(skb);
+ goto out_node;
+ }
+
+ /* control messages already require the type as 'command' */
+ skb_copy_bits(skb, QRTR_HDR_SIZE, &hdr->type, 4);
+ } else {
+ hdr->type = cpu_to_le32(QRTR_TYPE_DATA);
+ }
+
+ rc = enqueue_fn(node, skb);
+ if (rc >= 0)
+ rc = len;
+
+out_node:
+ qrtr_node_release(node);
+ release_sock(sk);
+
+ return rc;
+}
+
+static int qrtr_recvmsg(struct socket *sock, struct msghdr *msg,
+ size_t size, int flags)
+{
+ DECLARE_SOCKADDR(struct sockaddr_qrtr *, addr, msg->msg_name);
+ const struct qrtr_hdr *phdr;
+ struct sock *sk = sock->sk;
+ struct sk_buff *skb;
+ int copied, rc;
+
+ lock_sock(sk);
+
+ if (sock_flag(sk, SOCK_ZAPPED)) {
+ release_sock(sk);
+ return -EADDRNOTAVAIL;
+ }
+
+ skb = skb_recv_datagram(sk, flags & ~MSG_DONTWAIT,
+ flags & MSG_DONTWAIT, &rc);
+ if (!skb) {
+ release_sock(sk);
+ return rc;
+ }
+
+ phdr = (const struct qrtr_hdr *)skb_transport_header(skb);
+ copied = le32_to_cpu(phdr->size);
+ if (copied > size) {
+ copied = size;
+ msg->msg_flags |= MSG_TRUNC;
+ }
+
+ rc = skb_copy_datagram_msg(skb, QRTR_HDR_SIZE, msg, copied);
+ if (rc < 0)
+ goto out;
+ rc = copied;
+
+ if (addr) {
+ addr->sq_family = AF_QIPCRTR;
+ addr->sq_node = le32_to_cpu(phdr->src_node_id);
+ addr->sq_port = le32_to_cpu(phdr->src_port_id);
+ msg->msg_namelen = sizeof(*addr);
+ }
+
+out:
+ skb_free_datagram(sk, skb);
+ release_sock(sk);
+
+ return rc;
+}
+
+static int qrtr_connect(struct socket *sock, struct sockaddr *saddr,
+ int len, int flags)
+{
+ DECLARE_SOCKADDR(struct sockaddr_qrtr *, addr, saddr);
+ struct qrtr_sock *ipc = qrtr_sk(sock->sk);
+ struct sock *sk = sock->sk;
+ int rc;
+
+ if (len < sizeof(*addr) || addr->sq_family != AF_QIPCRTR)
+ return -EINVAL;
+
+ lock_sock(sk);
+
+ sk->sk_state = TCP_CLOSE;
+ sock->state = SS_UNCONNECTED;
+
+ rc = qrtr_autobind(sock);
+ if (rc) {
+ release_sock(sk);
+ return rc;
+ }
+
+ ipc->peer = *addr;
+ sock->state = SS_CONNECTED;
+ sk->sk_state = TCP_ESTABLISHED;
+
+ release_sock(sk);
+
+ return 0;
+}
+
+static int qrtr_getname(struct socket *sock, struct sockaddr *saddr,
+ int *len, int peer)
+{
+ struct qrtr_sock *ipc = qrtr_sk(sock->sk);
+ struct sockaddr_qrtr qaddr;
+ struct sock *sk = sock->sk;
+
+ lock_sock(sk);
+ if (peer) {
+ if (sk->sk_state != TCP_ESTABLISHED) {
+ release_sock(sk);
+ return -ENOTCONN;
+ }
+
+ qaddr = ipc->peer;
+ } else {
+ qaddr = ipc->us;
+ }
+ release_sock(sk);
+
+ *len = sizeof(qaddr);
+ qaddr.sq_family = AF_QIPCRTR;
+
+ memcpy(saddr, &qaddr, sizeof(qaddr));
+
+ return 0;
+}
+
+static int qrtr_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+ void __user *argp = (void __user *)arg;
+ struct qrtr_sock *ipc = qrtr_sk(sock->sk);
+ struct sock *sk = sock->sk;
+ struct sockaddr_qrtr *sq;
+ struct sk_buff *skb;
+ struct ifreq ifr;
+ long len = 0;
+ int rc = 0;
+
+ lock_sock(sk);
+
+ switch (cmd) {
+ case TIOCOUTQ:
+ len = sk->sk_sndbuf - sk_wmem_alloc_get(sk);
+ if (len < 0)
+ len = 0;
+ rc = put_user(len, (int __user *)argp);
+ break;
+ case TIOCINQ:
+ skb = skb_peek(&sk->sk_receive_queue);
+ if (skb)
+ len = skb->len - QRTR_HDR_SIZE;
+ rc = put_user(len, (int __user *)argp);
+ break;
+ case SIOCGIFADDR:
+ if (copy_from_user(&ifr, argp, sizeof(ifr))) {
+ rc = -EFAULT;
+ break;
+ }
+
+ sq = (struct sockaddr_qrtr *)&ifr.ifr_addr;
+ *sq = ipc->us;
+ if (copy_to_user(argp, &ifr, sizeof(ifr))) {
+ rc = -EFAULT;
+ break;
+ }
+ break;
+ case SIOCGSTAMP:
+ rc = sock_get_timestamp(sk, argp);
+ break;
+ case SIOCADDRT:
+ case SIOCDELRT:
+ case SIOCSIFADDR:
+ case SIOCGIFDSTADDR:
+ case SIOCSIFDSTADDR:
+ case SIOCGIFBRDADDR:
+ case SIOCSIFBRDADDR:
+ case SIOCGIFNETMASK:
+ case SIOCSIFNETMASK:
+ rc = -EINVAL;
+ break;
+ default:
+ rc = -ENOIOCTLCMD;
+ break;
+ }
+
+ release_sock(sk);
+
+ return rc;
+}
+
+static int qrtr_release(struct socket *sock)
+{
+ struct sock *sk = sock->sk;
+ struct qrtr_sock *ipc;
+
+ if (!sk)
+ return 0;
+
+ lock_sock(sk);
+
+ ipc = qrtr_sk(sk);
+ sk->sk_shutdown = SHUTDOWN_MASK;
+ if (!sock_flag(sk, SOCK_DEAD))
+ sk->sk_state_change(sk);
+
+ sock_set_flag(sk, SOCK_DEAD);
+ sock->sk = NULL;
+
+ if (!sock_flag(sk, SOCK_ZAPPED))
+ qrtr_port_remove(ipc);
+
+ skb_queue_purge(&sk->sk_receive_queue);
+
+ release_sock(sk);
+ sock_put(sk);
+
+ return 0;
+}
+
+static const struct proto_ops qrtr_proto_ops = {
+ .owner = THIS_MODULE,
+ .family = AF_QIPCRTR,
+ .bind = qrtr_bind,
+ .connect = qrtr_connect,
+ .socketpair = sock_no_socketpair,
+ .accept = sock_no_accept,
+ .listen = sock_no_listen,
+ .sendmsg = qrtr_sendmsg,
+ .recvmsg = qrtr_recvmsg,
+ .getname = qrtr_getname,
+ .ioctl = qrtr_ioctl,
+ .poll = datagram_poll,
+ .shutdown = sock_no_shutdown,
+ .setsockopt = sock_no_setsockopt,
+ .getsockopt = sock_no_getsockopt,
+ .release = qrtr_release,
+ .mmap = sock_no_mmap,
+ .sendpage = sock_no_sendpage,
+};
+
+static struct proto qrtr_proto = {
+ .name = "QIPCRTR",
+ .owner = THIS_MODULE,
+ .obj_size = sizeof(struct qrtr_sock),
+};
+
+static int qrtr_create(struct net *net, struct socket *sock,
+ int protocol, int kern)
+{
+ struct qrtr_sock *ipc;
+ struct sock *sk;
+
+ if (sock->type != SOCK_DGRAM)
+ return -EPROTOTYPE;
+
+ sk = sk_alloc(net, AF_QIPCRTR, GFP_KERNEL, &qrtr_proto, kern);
+ if (!sk)
+ return -ENOMEM;
+
+ sock_set_flag(sk, SOCK_ZAPPED);
+
+ sock_init_data(sock, sk);
+ sock->ops = &qrtr_proto_ops;
+
+ ipc = qrtr_sk(sk);
+ ipc->us.sq_family = AF_QIPCRTR;
+ ipc->us.sq_node = qrtr_local_nid;
+ ipc->us.sq_port = 0;
+
+ return 0;
+}
+
+static const struct nla_policy qrtr_policy[IFA_MAX + 1] = {
+ [IFA_LOCAL] = { .type = NLA_U32 },
+};
+
+static int qrtr_addr_doit(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+ struct nlattr *tb[IFA_MAX + 1];
+ struct ifaddrmsg *ifm;
+ int rc;
+
+ if (!netlink_capable(skb, CAP_NET_ADMIN))
+ return -EPERM;
+
+ if (!netlink_capable(skb, CAP_SYS_ADMIN))
+ return -EPERM;
+
+ ASSERT_RTNL();
+
+ rc = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, qrtr_policy);
+ if (rc < 0)
+ return rc;
+
+ ifm = nlmsg_data(nlh);
+ if (!tb[IFA_LOCAL])
+ return -EINVAL;
+
+ qrtr_local_nid = nla_get_u32(tb[IFA_LOCAL]);
+ return 0;
+}
+
+static const struct net_proto_family qrtr_family = {
+ .owner = THIS_MODULE,
+ .family = AF_QIPCRTR,
+ .create = qrtr_create,
+};
+
+static int __init qrtr_proto_init(void)
+{
+ int rc;
+
+ rc = proto_register(&qrtr_proto, 1);
+ if (rc)
+ return rc;
+
+ rc = sock_register(&qrtr_family);
+ if (rc) {
+ proto_unregister(&qrtr_proto);
+ return rc;
+ }
+
+ rtnl_register(PF_QIPCRTR, RTM_NEWADDR, qrtr_addr_doit, NULL, NULL);
+
+ return 0;
+}
+module_init(qrtr_proto_init);
+
+static void __exit qrtr_proto_fini(void)
+{
+ rtnl_unregister(PF_QIPCRTR, RTM_NEWADDR);
+ sock_unregister(qrtr_family.family);
+ proto_unregister(&qrtr_proto);
+}
+module_exit(qrtr_proto_fini);
+
+MODULE_DESCRIPTION("Qualcomm IPC-router driver");
+MODULE_LICENSE("GPL v2");
diff --git a/net/qrtr/qrtr.h b/net/qrtr/qrtr.h
new file mode 100644
index 000000000..2b848718f
--- /dev/null
+++ b/net/qrtr/qrtr.h
@@ -0,0 +1,31 @@
+#ifndef __QRTR_H_
+#define __QRTR_H_
+
+#include <linux/types.h>
+
+struct sk_buff;
+
+/* endpoint node id auto assignment */
+#define QRTR_EP_NID_AUTO (-1)
+
+/**
+ * struct qrtr_endpoint - endpoint handle
+ * @xmit: Callback for outgoing packets
+ *
+ * The socket buffer passed to the xmit function becomes owned by the endpoint
+ * driver. As such, when the driver is done with the buffer, it should
+ * call kfree_skb() on failure, or consume_skb() on success.
+ */
+struct qrtr_endpoint {
+ int (*xmit)(struct qrtr_endpoint *ep, struct sk_buff *skb);
+ /* private: not for endpoint use */
+ struct qrtr_node *node;
+};
+
+int qrtr_endpoint_register(struct qrtr_endpoint *ep, unsigned int nid);
+
+void qrtr_endpoint_unregister(struct qrtr_endpoint *ep);
+
+int qrtr_endpoint_post(struct qrtr_endpoint *ep, const void *data, size_t len);
+
+#endif
diff --git a/net/qrtr/smd.c b/net/qrtr/smd.c
new file mode 100644
index 000000000..0d11132b3
--- /dev/null
+++ b/net/qrtr/smd.c
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2015, Sony Mobile Communications Inc.
+ * Copyright (c) 2013, The Linux Foundation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/soc/qcom/smd.h>
+
+#include "qrtr.h"
+
+struct qrtr_smd_dev {
+ struct qrtr_endpoint ep;
+ struct qcom_smd_channel *channel;
+ struct device *dev;
+};
+
+/* from smd to qrtr */
+static int qcom_smd_qrtr_callback(struct qcom_smd_channel *channel,
+ const void *data, size_t len)
+{
+ struct qrtr_smd_dev *qdev = qcom_smd_get_drvdata(channel);
+ int rc;
+
+ if (!qdev)
+ return -EAGAIN;
+
+ rc = qrtr_endpoint_post(&qdev->ep, data, len);
+ if (rc == -EINVAL) {
+ dev_err(qdev->dev, "invalid ipcrouter packet\n");
+ /* return 0 to let smd drop the packet */
+ rc = 0;
+ }
+
+ return rc;
+}
+
+/* from qrtr to smd */
+static int qcom_smd_qrtr_send(struct qrtr_endpoint *ep, struct sk_buff *skb)
+{
+ struct qrtr_smd_dev *qdev = container_of(ep, struct qrtr_smd_dev, ep);
+ int rc;
+
+ rc = skb_linearize(skb);
+ if (rc)
+ goto out;
+
+ rc = qcom_smd_send(qdev->channel, skb->data, skb->len);
+
+out:
+ if (rc)
+ kfree_skb(skb);
+ else
+ consume_skb(skb);
+ return rc;
+}
+
+static int qcom_smd_qrtr_probe(struct qcom_smd_device *sdev)
+{
+ struct qrtr_smd_dev *qdev;
+ int rc;
+
+ qdev = devm_kzalloc(&sdev->dev, sizeof(*qdev), GFP_KERNEL);
+ if (!qdev)
+ return -ENOMEM;
+
+ qdev->channel = sdev->channel;
+ qdev->dev = &sdev->dev;
+ qdev->ep.xmit = qcom_smd_qrtr_send;
+
+ rc = qrtr_endpoint_register(&qdev->ep, QRTR_EP_NID_AUTO);
+ if (rc)
+ return rc;
+
+ qcom_smd_set_drvdata(sdev->channel, qdev);
+ dev_set_drvdata(&sdev->dev, qdev);
+
+ dev_dbg(&sdev->dev, "Qualcomm SMD QRTR driver probed\n");
+
+ return 0;
+}
+
+static void qcom_smd_qrtr_remove(struct qcom_smd_device *sdev)
+{
+ struct qrtr_smd_dev *qdev = dev_get_drvdata(&sdev->dev);
+
+ qrtr_endpoint_unregister(&qdev->ep);
+
+ dev_set_drvdata(&sdev->dev, NULL);
+}
+
+static const struct qcom_smd_id qcom_smd_qrtr_smd_match[] = {
+ { "IPCRTR" },
+ {}
+};
+
+static struct qcom_smd_driver qcom_smd_qrtr_driver = {
+ .probe = qcom_smd_qrtr_probe,
+ .remove = qcom_smd_qrtr_remove,
+ .callback = qcom_smd_qrtr_callback,
+ .smd_match_table = qcom_smd_qrtr_smd_match,
+ .driver = {
+ .name = "qcom_smd_qrtr",
+ .owner = THIS_MODULE,
+ },
+};
+
+module_qcom_smd_driver(qcom_smd_qrtr_driver);
+
+MODULE_DESCRIPTION("Qualcomm IPC-Router SMD interface driver");
+MODULE_LICENSE("GPL v2");
diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c
index 310cabce2..7c2a65a6a 100644
--- a/net/rds/ib_cm.c
+++ b/net/rds/ib_cm.c
@@ -111,7 +111,7 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
}
}
- if (conn->c_version < RDS_PROTOCOL(3,1)) {
+ if (conn->c_version < RDS_PROTOCOL(3, 1)) {
printk(KERN_NOTICE "RDS/IB: Connection to %pI4 version %u.%u failed,"
" no longer supported\n",
&conn->c_faddr,
diff --git a/net/rds/ib_frmr.c b/net/rds/ib_frmr.c
index 93ff038ea..d921adc62 100644
--- a/net/rds/ib_frmr.c
+++ b/net/rds/ib_frmr.c
@@ -111,7 +111,7 @@ static int rds_ib_post_reg_frmr(struct rds_ib_mr *ibmr)
cpu_relax();
}
- ret = ib_map_mr_sg_zbva(frmr->mr, ibmr->sg, ibmr->sg_len, PAGE_SIZE);
+ ret = ib_map_mr_sg_zbva(frmr->mr, ibmr->sg, ibmr->sg_len, 0, PAGE_SIZE);
if (unlikely(ret != ibmr->sg_len))
return ret < 0 ? ret : -EINVAL;
diff --git a/net/rds/loop.c b/net/rds/loop.c
index 6b12b6854..814173b46 100644
--- a/net/rds/loop.c
+++ b/net/rds/loop.c
@@ -95,8 +95,9 @@ out:
*/
static void rds_loop_inc_free(struct rds_incoming *inc)
{
- struct rds_message *rm = container_of(inc, struct rds_message, m_inc);
- rds_message_put(rm);
+ struct rds_message *rm = container_of(inc, struct rds_message, m_inc);
+
+ rds_message_put(rm);
}
/* we need to at least give the thread something to succeed */
diff --git a/net/rds/rds.h b/net/rds/rds.h
index 80256b08e..387df5f32 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -74,6 +74,7 @@ enum {
RDS_CONN_CONNECTING,
RDS_CONN_DISCONNECTING,
RDS_CONN_UP,
+ RDS_CONN_RESETTING,
RDS_CONN_ERROR,
};
@@ -813,6 +814,7 @@ void rds_connect_worker(struct work_struct *);
void rds_shutdown_worker(struct work_struct *);
void rds_send_worker(struct work_struct *);
void rds_recv_worker(struct work_struct *);
+void rds_connect_path_complete(struct rds_connection *conn, int curr);
void rds_connect_complete(struct rds_connection *conn);
/* transport.c */
diff --git a/net/rds/recv.c b/net/rds/recv.c
index c0be1ecd1..8413f6c99 100644
--- a/net/rds/recv.c
+++ b/net/rds/recv.c
@@ -561,5 +561,7 @@ void rds_inc_info_copy(struct rds_incoming *inc,
minfo.fport = inc->i_hdr.h_dport;
}
+ minfo.flags = 0;
+
rds_info_copy(iter, &minfo, sizeof(minfo));
}
diff --git a/net/rds/send.c b/net/rds/send.c
index c9cdb358e..b1962f8e3 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -99,6 +99,7 @@ void rds_send_reset(struct rds_connection *conn)
list_splice_init(&conn->c_retrans, &conn->c_send_queue);
spin_unlock_irqrestore(&conn->c_lock, flags);
}
+EXPORT_SYMBOL_GPL(rds_send_reset);
static int acquire_in_xmit(struct rds_connection *conn)
{
diff --git a/net/rds/sysctl.c b/net/rds/sysctl.c
index c173f69e1..e381bbcd9 100644
--- a/net/rds/sysctl.c
+++ b/net/rds/sysctl.c
@@ -102,7 +102,8 @@ int rds_sysctl_init(void)
rds_sysctl_reconnect_min = msecs_to_jiffies(1);
rds_sysctl_reconnect_min_jiffies = rds_sysctl_reconnect_min;
- rds_sysctl_reg_table = register_net_sysctl(&init_net,"net/rds", rds_sysctl_rds_table);
+ rds_sysctl_reg_table =
+ register_net_sysctl(&init_net, "net/rds", rds_sysctl_rds_table);
if (!rds_sysctl_reg_table)
return -ENOMEM;
return 0;
diff --git a/net/rds/tcp.c b/net/rds/tcp.c
index 86187dad1..c8a7b4c90 100644
--- a/net/rds/tcp.c
+++ b/net/rds/tcp.c
@@ -126,9 +126,81 @@ void rds_tcp_restore_callbacks(struct socket *sock,
}
/*
- * This is the only path that sets tc->t_sock. Send and receive trust that
- * it is set. The RDS_CONN_UP bit protects those paths from being
- * called while it isn't set.
+ * rds_tcp_reset_callbacks() switches the to the new sock and
+ * returns the existing tc->t_sock.
+ *
+ * The only functions that set tc->t_sock are rds_tcp_set_callbacks
+ * and rds_tcp_reset_callbacks. Send and receive trust that
+ * it is set. The absence of RDS_CONN_UP bit protects those paths
+ * from being called while it isn't set.
+ */
+void rds_tcp_reset_callbacks(struct socket *sock,
+ struct rds_connection *conn)
+{
+ struct rds_tcp_connection *tc = conn->c_transport_data;
+ struct socket *osock = tc->t_sock;
+
+ if (!osock)
+ goto newsock;
+
+ /* Need to resolve a duelling SYN between peers.
+ * We have an outstanding SYN to this peer, which may
+ * potentially have transitioned to the RDS_CONN_UP state,
+ * so we must quiesce any send threads before resetting
+ * c_transport_data. We quiesce these threads by setting
+ * c_state to something other than RDS_CONN_UP, and then
+ * waiting for any existing threads in rds_send_xmit to
+ * complete release_in_xmit(). (Subsequent threads entering
+ * rds_send_xmit() will bail on !rds_conn_up().
+ *
+ * However an incoming syn-ack at this point would end up
+ * marking the conn as RDS_CONN_UP, and would again permit
+ * rds_send_xmi() threads through, so ideally we would
+ * synchronize on RDS_CONN_UP after lock_sock(), but cannot
+ * do that: waiting on !RDS_IN_XMIT after lock_sock() may
+ * end up deadlocking with tcp_sendmsg(), and the RDS_IN_XMIT
+ * would not get set. As a result, we set c_state to
+ * RDS_CONN_RESETTTING, to ensure that rds_tcp_state_change
+ * cannot mark rds_conn_path_up() in the window before lock_sock()
+ */
+ atomic_set(&conn->c_state, RDS_CONN_RESETTING);
+ wait_event(conn->c_waitq, !test_bit(RDS_IN_XMIT, &conn->c_flags));
+ lock_sock(osock->sk);
+ /* reset receive side state for rds_tcp_data_recv() for osock */
+ if (tc->t_tinc) {
+ rds_inc_put(&tc->t_tinc->ti_inc);
+ tc->t_tinc = NULL;
+ }
+ tc->t_tinc_hdr_rem = sizeof(struct rds_header);
+ tc->t_tinc_data_rem = 0;
+ tc->t_sock = NULL;
+
+ write_lock_bh(&osock->sk->sk_callback_lock);
+
+ osock->sk->sk_user_data = NULL;
+ osock->sk->sk_data_ready = tc->t_orig_data_ready;
+ osock->sk->sk_write_space = tc->t_orig_write_space;
+ osock->sk->sk_state_change = tc->t_orig_state_change;
+ write_unlock_bh(&osock->sk->sk_callback_lock);
+ release_sock(osock->sk);
+ sock_release(osock);
+newsock:
+ rds_send_reset(conn);
+ lock_sock(sock->sk);
+ write_lock_bh(&sock->sk->sk_callback_lock);
+ tc->t_sock = sock;
+ sock->sk->sk_user_data = conn;
+ sock->sk->sk_data_ready = rds_tcp_data_ready;
+ sock->sk->sk_write_space = rds_tcp_write_space;
+ sock->sk->sk_state_change = rds_tcp_state_change;
+
+ write_unlock_bh(&sock->sk->sk_callback_lock);
+ release_sock(sock->sk);
+}
+
+/* Add tc to rds_tcp_tc_list and set tc->t_sock. See comments
+ * above rds_tcp_reset_callbacks for notes about synchronization
+ * with data path
*/
void rds_tcp_set_callbacks(struct socket *sock, struct rds_connection *conn)
{
@@ -544,7 +616,7 @@ static int rds_tcp_init(void)
ret = rds_tcp_recv_init();
if (ret)
- goto out_slab;
+ goto out_pernet;
ret = rds_trans_register(&rds_tcp_transport);
if (ret)
@@ -556,8 +628,9 @@ static int rds_tcp_init(void)
out_recv:
rds_tcp_recv_exit();
-out_slab:
+out_pernet:
unregister_pernet_subsys(&rds_tcp_net_ops);
+out_slab:
kmem_cache_destroy(rds_tcp_conn_slab);
out:
return ret;
diff --git a/net/rds/tcp.h b/net/rds/tcp.h
index 41c228300..7940babf6 100644
--- a/net/rds/tcp.h
+++ b/net/rds/tcp.h
@@ -50,6 +50,7 @@ struct rds_tcp_statistics {
void rds_tcp_tune(struct socket *sock);
void rds_tcp_nonagle(struct socket *sock);
void rds_tcp_set_callbacks(struct socket *sock, struct rds_connection *conn);
+void rds_tcp_reset_callbacks(struct socket *sock, struct rds_connection *conn);
void rds_tcp_restore_callbacks(struct socket *sock,
struct rds_tcp_connection *tc);
u32 rds_tcp_snd_nxt(struct rds_tcp_connection *tc);
@@ -82,7 +83,7 @@ int rds_tcp_inc_copy_to_user(struct rds_incoming *inc, struct iov_iter *to);
void rds_tcp_xmit_prepare(struct rds_connection *conn);
void rds_tcp_xmit_complete(struct rds_connection *conn);
int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm,
- unsigned int hdr_off, unsigned int sg, unsigned int off);
+ unsigned int hdr_off, unsigned int sg, unsigned int off);
void rds_tcp_write_space(struct sock *sk);
/* tcp_stats.c */
diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c
index 49a3fcfed..f6e95d60d 100644
--- a/net/rds/tcp_connect.c
+++ b/net/rds/tcp_connect.c
@@ -43,7 +43,7 @@ void rds_tcp_state_change(struct sock *sk)
struct rds_connection *conn;
struct rds_tcp_connection *tc;
- read_lock(&sk->sk_callback_lock);
+ read_lock_bh(&sk->sk_callback_lock);
conn = sk->sk_user_data;
if (!conn) {
state_change = sk->sk_state_change;
@@ -54,22 +54,22 @@ void rds_tcp_state_change(struct sock *sk)
rdsdebug("sock %p state_change to %d\n", tc->t_sock, sk->sk_state);
- switch(sk->sk_state) {
- /* ignore connecting sockets as they make progress */
- case TCP_SYN_SENT:
- case TCP_SYN_RECV:
- break;
- case TCP_ESTABLISHED:
- rds_connect_complete(conn);
- break;
- case TCP_CLOSE_WAIT:
- case TCP_CLOSE:
- rds_conn_drop(conn);
- default:
- break;
+ switch (sk->sk_state) {
+ /* ignore connecting sockets as they make progress */
+ case TCP_SYN_SENT:
+ case TCP_SYN_RECV:
+ break;
+ case TCP_ESTABLISHED:
+ rds_connect_path_complete(conn, RDS_CONN_CONNECTING);
+ break;
+ case TCP_CLOSE_WAIT:
+ case TCP_CLOSE:
+ rds_conn_drop(conn);
+ default:
+ break;
}
out:
- read_unlock(&sk->sk_callback_lock);
+ read_unlock_bh(&sk->sk_callback_lock);
state_change(sk);
}
diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c
index be263cdf2..245542ca4 100644
--- a/net/rds/tcp_listen.c
+++ b/net/rds/tcp_listen.c
@@ -78,7 +78,9 @@ int rds_tcp_accept_one(struct socket *sock)
struct inet_sock *inet;
struct rds_tcp_connection *rs_tcp = NULL;
int conn_state;
- struct sock *nsk;
+
+ if (!sock) /* module unload or netns delete in progress */
+ return -ENETUNREACH;
ret = sock_create_kern(sock_net(sock->sk), sock->sk->sk_family,
sock->sk->sk_type, sock->sk->sk_protocol,
@@ -129,28 +131,25 @@ int rds_tcp_accept_one(struct socket *sock)
* so we must quiesce any send threads before resetting
* c_transport_data.
*/
- wait_event(conn->c_waitq,
- !test_bit(RDS_IN_XMIT, &conn->c_flags));
- if (ntohl(inet->inet_saddr) < ntohl(inet->inet_daddr)) {
+ if (ntohl(inet->inet_saddr) < ntohl(inet->inet_daddr) ||
+ !conn->c_outgoing) {
goto rst_nsk;
- } else if (rs_tcp->t_sock) {
- rds_tcp_restore_callbacks(rs_tcp->t_sock, rs_tcp);
+ } else {
+ rds_tcp_reset_callbacks(new_sock, conn);
conn->c_outgoing = 0;
+ /* rds_connect_path_complete() marks RDS_CONN_UP */
+ rds_connect_path_complete(conn, RDS_CONN_RESETTING);
}
+ } else {
+ rds_tcp_set_callbacks(new_sock, conn);
+ rds_connect_path_complete(conn, RDS_CONN_CONNECTING);
}
- rds_tcp_set_callbacks(new_sock, conn);
- rds_connect_complete(conn); /* marks RDS_CONN_UP */
new_sock = NULL;
ret = 0;
goto out;
rst_nsk:
/* reset the newly returned accept sock and bail */
- nsk = new_sock->sk;
- rds_tcp_stats_inc(s_tcp_listen_closed_stale);
- nsk->sk_user_data = NULL;
- nsk->sk_prot->disconnect(nsk, 0);
- tcp_done(nsk);
- new_sock = NULL;
+ kernel_sock_shutdown(new_sock, SHUT_RDWR);
ret = 0;
out:
if (rs_tcp)
@@ -166,7 +165,7 @@ void rds_tcp_listen_data_ready(struct sock *sk)
rdsdebug("listen data ready sk %p\n", sk);
- read_lock(&sk->sk_callback_lock);
+ read_lock_bh(&sk->sk_callback_lock);
ready = sk->sk_user_data;
if (!ready) { /* check for teardown race */
ready = sk->sk_data_ready;
@@ -183,7 +182,7 @@ void rds_tcp_listen_data_ready(struct sock *sk)
rds_tcp_accept_work(sk);
out:
- read_unlock(&sk->sk_callback_lock);
+ read_unlock_bh(&sk->sk_callback_lock);
ready(sk);
}
diff --git a/net/rds/tcp_recv.c b/net/rds/tcp_recv.c
index 27a992154..6e6a7111a 100644
--- a/net/rds/tcp_recv.c
+++ b/net/rds/tcp_recv.c
@@ -171,7 +171,7 @@ static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb,
while (left) {
if (!tinc) {
tinc = kmem_cache_alloc(rds_tcp_incoming_slab,
- arg->gfp);
+ arg->gfp);
if (!tinc) {
desc->error = -ENOMEM;
goto out;
@@ -207,22 +207,14 @@ static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb,
}
if (left && tc->t_tinc_data_rem) {
- clone = skb_clone(skb, arg->gfp);
+ to_copy = min(tc->t_tinc_data_rem, left);
+
+ clone = pskb_extract(skb, offset, to_copy, arg->gfp);
if (!clone) {
desc->error = -ENOMEM;
goto out;
}
- to_copy = min(tc->t_tinc_data_rem, left);
- if (!pskb_pull(clone, offset) ||
- pskb_trim(clone, to_copy)) {
- pr_warn("rds_tcp_data_recv: pull/trim failed "
- "left %zu data_rem %zu skb_len %d\n",
- left, tc->t_tinc_data_rem, skb->len);
- kfree_skb(clone);
- desc->error = -ENOMEM;
- goto out;
- }
skb_queue_tail(&tinc->ti_skb_list, clone);
rdsdebug("skb %p data %p len %d off %u to_copy %zu -> "
@@ -309,7 +301,7 @@ void rds_tcp_data_ready(struct sock *sk)
rdsdebug("data ready sk %p\n", sk);
- read_lock(&sk->sk_callback_lock);
+ read_lock_bh(&sk->sk_callback_lock);
conn = sk->sk_user_data;
if (!conn) { /* check for teardown race */
ready = sk->sk_data_ready;
@@ -323,7 +315,7 @@ void rds_tcp_data_ready(struct sock *sk)
if (rds_tcp_read_sock(conn, GFP_ATOMIC) == -ENOMEM)
queue_delayed_work(rds_wq, &conn->c_recv_w, 0);
out:
- read_unlock(&sk->sk_callback_lock);
+ read_unlock_bh(&sk->sk_callback_lock);
ready(sk);
}
diff --git a/net/rds/tcp_send.c b/net/rds/tcp_send.c
index 2894e6095..618be69c9 100644
--- a/net/rds/tcp_send.c
+++ b/net/rds/tcp_send.c
@@ -66,19 +66,19 @@ void rds_tcp_xmit_complete(struct rds_connection *conn)
static int rds_tcp_sendmsg(struct socket *sock, void *data, unsigned int len)
{
struct kvec vec = {
- .iov_base = data,
- .iov_len = len,
+ .iov_base = data,
+ .iov_len = len,
+ };
+ struct msghdr msg = {
+ .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL,
};
- struct msghdr msg = {
- .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL,
- };
return kernel_sendmsg(sock, &msg, &vec, 1, vec.iov_len);
}
/* the core send_sem serializes this with other xmit and shutdown */
int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm,
- unsigned int hdr_off, unsigned int sg, unsigned int off)
+ unsigned int hdr_off, unsigned int sg, unsigned int off)
{
struct rds_tcp_connection *tc = conn->c_transport_data;
int done = 0;
@@ -180,7 +180,7 @@ void rds_tcp_write_space(struct sock *sk)
struct rds_connection *conn;
struct rds_tcp_connection *tc;
- read_lock(&sk->sk_callback_lock);
+ read_lock_bh(&sk->sk_callback_lock);
conn = sk->sk_user_data;
if (!conn) {
write_space = sk->sk_write_space;
@@ -196,11 +196,11 @@ void rds_tcp_write_space(struct sock *sk)
tc->t_last_seen_una = rds_tcp_snd_una(tc);
rds_send_drop_acked(conn, rds_tcp_snd_una(tc), rds_tcp_is_acked);
- if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf)
+ if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf)
queue_delayed_work(rds_wq, &conn->c_send_w, 0);
out:
- read_unlock(&sk->sk_callback_lock);
+ read_unlock_bh(&sk->sk_callback_lock);
/*
* write_space is only called when data leaves tcp's send queue if
diff --git a/net/rds/threads.c b/net/rds/threads.c
index 454aa6d23..4a3230457 100644
--- a/net/rds/threads.c
+++ b/net/rds/threads.c
@@ -71,9 +71,9 @@
struct workqueue_struct *rds_wq;
EXPORT_SYMBOL_GPL(rds_wq);
-void rds_connect_complete(struct rds_connection *conn)
+void rds_connect_path_complete(struct rds_connection *conn, int curr)
{
- if (!rds_conn_transition(conn, RDS_CONN_CONNECTING, RDS_CONN_UP)) {
+ if (!rds_conn_transition(conn, curr, RDS_CONN_UP)) {
printk(KERN_WARNING "%s: Cannot transition to state UP, "
"current state is %d\n",
__func__,
@@ -90,6 +90,12 @@ void rds_connect_complete(struct rds_connection *conn)
queue_delayed_work(rds_wq, &conn->c_send_w, 0);
queue_delayed_work(rds_wq, &conn->c_recv_w, 0);
}
+EXPORT_SYMBOL_GPL(rds_connect_path_complete);
+
+void rds_connect_complete(struct rds_connection *conn)
+{
+ rds_connect_path_complete(conn, RDS_CONN_CONNECTING);
+}
EXPORT_SYMBOL_GPL(rds_connect_complete);
/*
diff --git a/net/rds/transport.c b/net/rds/transport.c
index f3afd1d60..2ffd3e30c 100644
--- a/net/rds/transport.c
+++ b/net/rds/transport.c
@@ -140,8 +140,7 @@ unsigned int rds_trans_stats_info_copy(struct rds_info_iterator *iter,
rds_info_iter_unmap(iter);
down_read(&rds_trans_sem);
- for (i = 0; i < RDS_TRANS_COUNT; i++)
- {
+ for (i = 0; i < RDS_TRANS_COUNT; i++) {
trans = transports[i];
if (!trans || !trans->stats_info_copy)
continue;
diff --git a/net/rfkill/core.c b/net/rfkill/core.c
index 03f26e3a6..884027f62 100644
--- a/net/rfkill/core.c
+++ b/net/rfkill/core.c
@@ -1141,6 +1141,7 @@ static ssize_t rfkill_fop_write(struct file *file, const char __user *buf,
{
struct rfkill *rfkill;
struct rfkill_event ev;
+ int ret;
/* we don't need the 'hard' variable but accept it */
if (count < RFKILL_EVENT_SIZE_V1 - 1)
@@ -1155,29 +1156,36 @@ static ssize_t rfkill_fop_write(struct file *file, const char __user *buf,
if (copy_from_user(&ev, buf, count))
return -EFAULT;
- if (ev.op != RFKILL_OP_CHANGE && ev.op != RFKILL_OP_CHANGE_ALL)
- return -EINVAL;
-
if (ev.type >= NUM_RFKILL_TYPES)
return -EINVAL;
mutex_lock(&rfkill_global_mutex);
- if (ev.op == RFKILL_OP_CHANGE_ALL)
+ switch (ev.op) {
+ case RFKILL_OP_CHANGE_ALL:
rfkill_update_global_state(ev.type, ev.soft);
-
- list_for_each_entry(rfkill, &rfkill_list, node) {
- if (rfkill->idx != ev.idx && ev.op != RFKILL_OP_CHANGE_ALL)
- continue;
-
- if (rfkill->type != ev.type && ev.type != RFKILL_TYPE_ALL)
- continue;
-
- rfkill_set_block(rfkill, ev.soft);
+ list_for_each_entry(rfkill, &rfkill_list, node)
+ if (rfkill->type == ev.type ||
+ ev.type == RFKILL_TYPE_ALL)
+ rfkill_set_block(rfkill, ev.soft);
+ ret = 0;
+ break;
+ case RFKILL_OP_CHANGE:
+ list_for_each_entry(rfkill, &rfkill_list, node)
+ if (rfkill->idx == ev.idx &&
+ (rfkill->type == ev.type ||
+ ev.type == RFKILL_TYPE_ALL))
+ rfkill_set_block(rfkill, ev.soft);
+ ret = 0;
+ break;
+ default:
+ ret = -EINVAL;
+ break;
}
+
mutex_unlock(&rfkill_global_mutex);
- return count;
+ return ret ?: count;
}
static int rfkill_fop_release(struct inode *inode, struct file *file)
diff --git a/net/rose/rose_in.c b/net/rose/rose_in.c
index 79c4abcfa..0a6394754 100644
--- a/net/rose/rose_in.c
+++ b/net/rose/rose_in.c
@@ -164,7 +164,8 @@ static int rose_state3_machine(struct sock *sk, struct sk_buff *skb, int framety
rose_frames_acked(sk, nr);
if (ns == rose->vr) {
rose_start_idletimer(sk);
- if (sock_queue_rcv_skb(sk, skb) == 0) {
+ if (sk_filter_trim_cap(sk, skb, ROSE_MIN_LEN) == 0 &&
+ __sock_queue_rcv_skb(sk, skb) == 0) {
rose->vr = (rose->vr + 1) % ROSE_MODULUS;
queued = 1;
} else {
diff --git a/net/rxrpc/Kconfig b/net/rxrpc/Kconfig
index 23dcef12b..784c53163 100644
--- a/net/rxrpc/Kconfig
+++ b/net/rxrpc/Kconfig
@@ -30,7 +30,7 @@ config AF_RXRPC_DEBUG
config RXKAD
- tristate "RxRPC Kerberos security"
+ bool "RxRPC Kerberos security"
depends on AF_RXRPC
select CRYPTO
select CRYPTO_MANAGER
diff --git a/net/rxrpc/Makefile b/net/rxrpc/Makefile
index ec126f912..e05a06ef2 100644
--- a/net/rxrpc/Makefile
+++ b/net/rxrpc/Makefile
@@ -18,11 +18,12 @@ af-rxrpc-y := \
ar-recvmsg.o \
ar-security.o \
ar-skbuff.o \
- ar-transport.o
+ ar-transport.o \
+ insecure.o \
+ misc.o
af-rxrpc-$(CONFIG_PROC_FS) += ar-proc.o
+af-rxrpc-$(CONFIG_RXKAD) += rxkad.o
af-rxrpc-$(CONFIG_SYSCTL) += sysctl.o
obj-$(CONFIG_AF_RXRPC) += af-rxrpc.o
-
-obj-$(CONFIG_RXKAD) += rxkad.o
diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c
index 9d935fa5a..e45e94ca0 100644
--- a/net/rxrpc/af_rxrpc.c
+++ b/net/rxrpc/af_rxrpc.c
@@ -806,6 +806,12 @@ static int __init af_rxrpc_init(void)
goto error_work_queue;
}
+ ret = rxrpc_init_security();
+ if (ret < 0) {
+ printk(KERN_CRIT "RxRPC: Cannot initialise security\n");
+ goto error_security;
+ }
+
ret = proto_register(&rxrpc_proto, 1);
if (ret < 0) {
printk(KERN_CRIT "RxRPC: Cannot register protocol\n");
@@ -853,6 +859,8 @@ error_sock:
proto_unregister(&rxrpc_proto);
error_proto:
destroy_workqueue(rxrpc_workqueue);
+error_security:
+ rxrpc_exit_security();
error_work_queue:
kmem_cache_destroy(rxrpc_call_jar);
error_call_jar:
@@ -883,6 +891,7 @@ static void __exit af_rxrpc_exit(void)
remove_proc_entry("rxrpc_conns", init_net.proc_net);
remove_proc_entry("rxrpc_calls", init_net.proc_net);
destroy_workqueue(rxrpc_workqueue);
+ rxrpc_exit_security();
kmem_cache_destroy(rxrpc_call_jar);
_leave("");
}
diff --git a/net/rxrpc/ar-accept.c b/net/rxrpc/ar-accept.c
index 277731a5e..e7a7f05f1 100644
--- a/net/rxrpc/ar-accept.c
+++ b/net/rxrpc/ar-accept.c
@@ -108,7 +108,7 @@ static int rxrpc_accept_incoming_call(struct rxrpc_local *local,
goto error;
}
- conn = rxrpc_incoming_connection(trans, &sp->hdr, GFP_NOIO);
+ conn = rxrpc_incoming_connection(trans, &sp->hdr);
rxrpc_put_transport(trans);
if (IS_ERR(conn)) {
_debug("no conn");
@@ -116,7 +116,7 @@ static int rxrpc_accept_incoming_call(struct rxrpc_local *local,
goto error;
}
- call = rxrpc_incoming_call(rx, conn, &sp->hdr, GFP_NOIO);
+ call = rxrpc_incoming_call(rx, conn, &sp->hdr);
rxrpc_put_connection(conn);
if (IS_ERR(call)) {
_debug("no call");
diff --git a/net/rxrpc/ar-ack.c b/net/rxrpc/ar-ack.c
index 16d967075..374478e00 100644
--- a/net/rxrpc/ar-ack.c
+++ b/net/rxrpc/ar-ack.c
@@ -20,74 +20,6 @@
#include "ar-internal.h"
/*
- * How long to wait before scheduling ACK generation after seeing a
- * packet with RXRPC_REQUEST_ACK set (in jiffies).
- */
-unsigned int rxrpc_requested_ack_delay = 1;
-
-/*
- * How long to wait before scheduling an ACK with subtype DELAY (in jiffies).
- *
- * We use this when we've received new data packets. If those packets aren't
- * all consumed within this time we will send a DELAY ACK if an ACK was not
- * requested to let the sender know it doesn't need to resend.
- */
-unsigned int rxrpc_soft_ack_delay = 1 * HZ;
-
-/*
- * How long to wait before scheduling an ACK with subtype IDLE (in jiffies).
- *
- * We use this when we've consumed some previously soft-ACK'd packets when
- * further packets aren't immediately received to decide when to send an IDLE
- * ACK let the other end know that it can free up its Tx buffer space.
- */
-unsigned int rxrpc_idle_ack_delay = 0.5 * HZ;
-
-/*
- * Receive window size in packets. This indicates the maximum number of
- * unconsumed received packets we're willing to retain in memory. Once this
- * limit is hit, we should generate an EXCEEDS_WINDOW ACK and discard further
- * packets.
- */
-unsigned int rxrpc_rx_window_size = 32;
-
-/*
- * Maximum Rx MTU size. This indicates to the sender the size of jumbo packet
- * made by gluing normal packets together that we're willing to handle.
- */
-unsigned int rxrpc_rx_mtu = 5692;
-
-/*
- * The maximum number of fragments in a received jumbo packet that we tell the
- * sender that we're willing to handle.
- */
-unsigned int rxrpc_rx_jumbo_max = 4;
-
-static const char *rxrpc_acks(u8 reason)
-{
- static const char *const str[] = {
- "---", "REQ", "DUP", "OOS", "WIN", "MEM", "PNG", "PNR", "DLY",
- "IDL", "-?-"
- };
-
- if (reason >= ARRAY_SIZE(str))
- reason = ARRAY_SIZE(str) - 1;
- return str[reason];
-}
-
-static const s8 rxrpc_ack_priority[] = {
- [0] = 0,
- [RXRPC_ACK_DELAY] = 1,
- [RXRPC_ACK_REQUESTED] = 2,
- [RXRPC_ACK_IDLE] = 3,
- [RXRPC_ACK_PING_RESPONSE] = 4,
- [RXRPC_ACK_DUPLICATE] = 5,
- [RXRPC_ACK_OUT_OF_SEQUENCE] = 6,
- [RXRPC_ACK_EXCEEDS_WINDOW] = 7,
- [RXRPC_ACK_NOSPACE] = 8,
-};
-
-/*
* propose an ACK be sent
*/
void __rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason,
@@ -426,7 +358,7 @@ static void rxrpc_rotate_tx_window(struct rxrpc_call *call, u32 hard)
int tail = call->acks_tail, old_tail;
int win = CIRC_CNT(call->acks_head, tail, call->acks_winsz);
- kenter("{%u,%u},%u", call->acks_hard, win, hard);
+ _enter("{%u,%u},%u", call->acks_hard, win, hard);
ASSERTCMP(hard - call->acks_hard, <=, win);
@@ -656,7 +588,8 @@ process_further:
_proto("OOSQ DATA %%%u { #%u }", sp->hdr.serial, sp->hdr.seq);
/* secured packets must be verified and possibly decrypted */
- if (rxrpc_verify_packet(call, skb, _abort_code) < 0)
+ if (call->conn->security->verify_packet(call, skb,
+ _abort_code) < 0)
goto protocol_error;
rxrpc_insert_oos_packet(call, skb);
@@ -901,8 +834,8 @@ void rxrpc_process_call(struct work_struct *work)
/* there's a good chance we're going to have to send a message, so set
* one up in advance */
- msg.msg_name = &call->conn->trans->peer->srx.transport.sin;
- msg.msg_namelen = sizeof(call->conn->trans->peer->srx.transport.sin);
+ msg.msg_name = &call->conn->trans->peer->srx.transport;
+ msg.msg_namelen = call->conn->trans->peer->srx.transport_len;
msg.msg_control = NULL;
msg.msg_controllen = 0;
msg.msg_flags = 0;
@@ -973,7 +906,7 @@ void rxrpc_process_call(struct work_struct *work)
ECONNABORTED, true) < 0)
goto no_mem;
whdr.type = RXRPC_PACKET_TYPE_ABORT;
- data = htonl(call->abort_code);
+ data = htonl(call->local_abort);
iov[1].iov_base = &data;
iov[1].iov_len = sizeof(data);
genbit = RXRPC_CALL_EV_ABORT;
@@ -1036,7 +969,7 @@ void rxrpc_process_call(struct work_struct *work)
write_lock_bh(&call->state_lock);
if (call->state <= RXRPC_CALL_COMPLETE) {
call->state = RXRPC_CALL_LOCALLY_ABORTED;
- call->abort_code = RX_CALL_TIMEOUT;
+ call->local_abort = RX_CALL_TIMEOUT;
set_bit(RXRPC_CALL_EV_ABORT, &call->events);
}
write_unlock_bh(&call->state_lock);
diff --git a/net/rxrpc/ar-call.c b/net/rxrpc/ar-call.c
index 7c8d300ad..571a41fd5 100644
--- a/net/rxrpc/ar-call.c
+++ b/net/rxrpc/ar-call.c
@@ -411,18 +411,17 @@ found_extant_second:
*/
struct rxrpc_call *rxrpc_incoming_call(struct rxrpc_sock *rx,
struct rxrpc_connection *conn,
- struct rxrpc_host_header *hdr,
- gfp_t gfp)
+ struct rxrpc_host_header *hdr)
{
struct rxrpc_call *call, *candidate;
struct rb_node **p, *parent;
u32 call_id;
- _enter(",%d,,%x", conn->debug_id, gfp);
+ _enter(",%d", conn->debug_id);
ASSERT(rx != NULL);
- candidate = rxrpc_alloc_call(gfp);
+ candidate = rxrpc_alloc_call(GFP_NOIO);
if (!candidate)
return ERR_PTR(-EBUSY);
@@ -682,7 +681,7 @@ void rxrpc_release_call(struct rxrpc_call *call)
call->state != RXRPC_CALL_CLIENT_FINAL_ACK) {
_debug("+++ ABORTING STATE %d +++\n", call->state);
call->state = RXRPC_CALL_LOCALLY_ABORTED;
- call->abort_code = RX_CALL_DEAD;
+ call->local_abort = RX_CALL_DEAD;
set_bit(RXRPC_CALL_EV_ABORT, &call->events);
rxrpc_queue_call(call);
}
@@ -758,7 +757,7 @@ static void rxrpc_mark_call_released(struct rxrpc_call *call)
if (call->state < RXRPC_CALL_COMPLETE) {
_debug("abort call %p", call);
call->state = RXRPC_CALL_LOCALLY_ABORTED;
- call->abort_code = RX_CALL_DEAD;
+ call->local_abort = RX_CALL_DEAD;
if (!test_and_set_bit(RXRPC_CALL_EV_ABORT, &call->events))
sched = true;
}
diff --git a/net/rxrpc/ar-connection.c b/net/rxrpc/ar-connection.c
index 9942da1ed..97f4fae74 100644
--- a/net/rxrpc/ar-connection.c
+++ b/net/rxrpc/ar-connection.c
@@ -207,6 +207,7 @@ static struct rxrpc_connection *rxrpc_alloc_connection(gfp_t gfp)
INIT_LIST_HEAD(&conn->bundle_link);
conn->calls = RB_ROOT;
skb_queue_head_init(&conn->rx_queue);
+ conn->security = &rxrpc_no_security;
rwlock_init(&conn->lock);
spin_lock_init(&conn->state_lock);
atomic_set(&conn->usage, 1);
@@ -564,8 +565,7 @@ int rxrpc_connect_call(struct rxrpc_sock *rx,
candidate->debug_id, candidate->trans->debug_id);
rxrpc_assign_connection_id(candidate);
- if (candidate->security)
- candidate->security->prime_packet_security(candidate);
+ candidate->security->prime_packet_security(candidate);
/* leave the candidate lurking in zombie mode attached to the
* bundle until we're ready for it */
@@ -619,8 +619,7 @@ interrupted:
*/
struct rxrpc_connection *
rxrpc_incoming_connection(struct rxrpc_transport *trans,
- struct rxrpc_host_header *hdr,
- gfp_t gfp)
+ struct rxrpc_host_header *hdr)
{
struct rxrpc_connection *conn, *candidate = NULL;
struct rb_node *p, **pp;
@@ -659,7 +658,7 @@ rxrpc_incoming_connection(struct rxrpc_transport *trans,
/* not yet present - create a candidate for a new record and then
* redo the search */
- candidate = rxrpc_alloc_connection(gfp);
+ candidate = rxrpc_alloc_connection(GFP_NOIO);
if (!candidate) {
_leave(" = -ENOMEM");
return ERR_PTR(-ENOMEM);
@@ -831,7 +830,10 @@ static void rxrpc_destroy_connection(struct rxrpc_connection *conn)
ASSERT(RB_EMPTY_ROOT(&conn->calls));
rxrpc_purge_queue(&conn->rx_queue);
- rxrpc_clear_conn_security(conn);
+ conn->security->clear(conn);
+ key_put(conn->key);
+ key_put(conn->server_key);
+
rxrpc_put_transport(conn->trans);
kfree(conn);
_leave("");
diff --git a/net/rxrpc/ar-connevent.c b/net/rxrpc/ar-connevent.c
index 1bdaaed8c..5f9563968 100644
--- a/net/rxrpc/ar-connevent.c
+++ b/net/rxrpc/ar-connevent.c
@@ -40,11 +40,13 @@ static void rxrpc_abort_calls(struct rxrpc_connection *conn, int state,
write_lock(&call->state_lock);
if (call->state <= RXRPC_CALL_COMPLETE) {
call->state = state;
- call->abort_code = abort_code;
- if (state == RXRPC_CALL_LOCALLY_ABORTED)
+ if (state == RXRPC_CALL_LOCALLY_ABORTED) {
+ call->local_abort = conn->local_abort;
set_bit(RXRPC_CALL_EV_CONN_ABORT, &call->events);
- else
+ } else {
+ call->remote_abort = conn->remote_abort;
set_bit(RXRPC_CALL_EV_RCVD_ABORT, &call->events);
+ }
rxrpc_queue_call(call);
}
write_unlock(&call->state_lock);
@@ -84,8 +86,8 @@ static int rxrpc_abort_connection(struct rxrpc_connection *conn,
rxrpc_abort_calls(conn, RXRPC_CALL_LOCALLY_ABORTED, abort_code);
- msg.msg_name = &conn->trans->peer->srx.transport.sin;
- msg.msg_namelen = sizeof(conn->trans->peer->srx.transport.sin);
+ msg.msg_name = &conn->trans->peer->srx.transport;
+ msg.msg_namelen = conn->trans->peer->srx.transport_len;
msg.msg_control = NULL;
msg.msg_controllen = 0;
msg.msg_flags = 0;
@@ -101,7 +103,7 @@ static int rxrpc_abort_connection(struct rxrpc_connection *conn,
whdr._rsvd = 0;
whdr.serviceId = htons(conn->service_id);
- word = htonl(abort_code);
+ word = htonl(conn->local_abort);
iov[0].iov_base = &whdr;
iov[0].iov_len = sizeof(whdr);
@@ -112,7 +114,7 @@ static int rxrpc_abort_connection(struct rxrpc_connection *conn,
serial = atomic_inc_return(&conn->serial);
whdr.serial = htonl(serial);
- _proto("Tx CONN ABORT %%%u { %d }", serial, abort_code);
+ _proto("Tx CONN ABORT %%%u { %d }", serial, conn->local_abort);
ret = kernel_sendmsg(conn->trans->local->socket, &msg, iov, 2, len);
if (ret < 0) {
@@ -172,15 +174,10 @@ static int rxrpc_process_event(struct rxrpc_connection *conn,
return -ECONNABORTED;
case RXRPC_PACKET_TYPE_CHALLENGE:
- if (conn->security)
- return conn->security->respond_to_challenge(
- conn, skb, _abort_code);
- return -EPROTO;
+ return conn->security->respond_to_challenge(conn, skb,
+ _abort_code);
case RXRPC_PACKET_TYPE_RESPONSE:
- if (!conn->security)
- return -EPROTO;
-
ret = conn->security->verify_response(conn, skb, _abort_code);
if (ret < 0)
return ret;
@@ -236,8 +233,6 @@ static void rxrpc_secure_connection(struct rxrpc_connection *conn)
}
}
- ASSERT(conn->security != NULL);
-
if (conn->security->issue_challenge(conn) < 0) {
abort_code = RX_CALL_DEAD;
ret = -ENOMEM;
diff --git a/net/rxrpc/ar-input.c b/net/rxrpc/ar-input.c
index 63ed75c40..6ff97412a 100644
--- a/net/rxrpc/ar-input.c
+++ b/net/rxrpc/ar-input.c
@@ -25,12 +25,6 @@
#include <net/net_namespace.h>
#include "ar-internal.h"
-const char *rxrpc_pkts[] = {
- "?00",
- "DATA", "ACK", "BUSY", "ABORT", "ACKALL", "CHALL", "RESP", "DEBUG",
- "?09", "?10", "?11", "?12", "VERSION", "?14", "?15"
-};
-
/*
* queue a packet for recvmsg to pass to userspace
* - the caller must hold a lock on call->lock
@@ -199,7 +193,7 @@ static int rxrpc_fast_process_data(struct rxrpc_call *call,
/* if the packet need security things doing to it, then it goes down
* the slow path */
- if (call->conn->security)
+ if (call->conn->security_ix)
goto enqueue_packet;
sp->call = call;
@@ -355,7 +349,7 @@ void rxrpc_fast_process_packet(struct rxrpc_call *call, struct sk_buff *skb)
write_lock_bh(&call->state_lock);
if (call->state < RXRPC_CALL_COMPLETE) {
call->state = RXRPC_CALL_REMOTELY_ABORTED;
- call->abort_code = abort_code;
+ call->remote_abort = abort_code;
set_bit(RXRPC_CALL_EV_RCVD_ABORT, &call->events);
rxrpc_queue_call(call);
}
@@ -428,7 +422,7 @@ protocol_error:
protocol_error_locked:
if (call->state <= RXRPC_CALL_COMPLETE) {
call->state = RXRPC_CALL_LOCALLY_ABORTED;
- call->abort_code = RX_PROTOCOL_ERROR;
+ call->local_abort = RX_PROTOCOL_ERROR;
set_bit(RXRPC_CALL_EV_ABORT, &call->events);
rxrpc_queue_call(call);
}
@@ -500,7 +494,7 @@ protocol_error:
write_lock_bh(&call->state_lock);
if (call->state <= RXRPC_CALL_COMPLETE) {
call->state = RXRPC_CALL_LOCALLY_ABORTED;
- call->abort_code = RX_PROTOCOL_ERROR;
+ call->local_abort = RX_PROTOCOL_ERROR;
set_bit(RXRPC_CALL_EV_ABORT, &call->events);
rxrpc_queue_call(call);
}
@@ -612,9 +606,9 @@ int rxrpc_extract_header(struct rxrpc_skb_priv *sp, struct sk_buff *skb)
struct rxrpc_wire_header whdr;
/* dig out the RxRPC connection details */
- if (skb_copy_bits(skb, sizeof(struct udphdr), &whdr, sizeof(whdr)) < 0)
+ if (skb_copy_bits(skb, 0, &whdr, sizeof(whdr)) < 0)
return -EBADMSG;
- if (!pskb_pull(skb, sizeof(struct udphdr) + sizeof(whdr)))
+ if (!pskb_pull(skb, sizeof(whdr)))
BUG();
memset(sp, 0, sizeof(*sp));
@@ -704,12 +698,12 @@ void rxrpc_data_ready(struct sock *sk)
if (skb_checksum_complete(skb)) {
rxrpc_free_skb(skb);
rxrpc_put_local(local);
- UDP_INC_STATS_BH(&init_net, UDP_MIB_INERRORS, 0);
+ __UDP_INC_STATS(&init_net, UDP_MIB_INERRORS, 0);
_leave(" [CSUM failed]");
return;
}
- UDP_INC_STATS_BH(&init_net, UDP_MIB_INDATAGRAMS, 0);
+ __UDP_INC_STATS(&init_net, UDP_MIB_INDATAGRAMS, 0);
/* The socket buffer we have is owned by UDP, with UDP's data all over
* it, but we really want our own data there.
diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h
index cd6cdbe87..f0b807a16 100644
--- a/net/rxrpc/ar-internal.h
+++ b/net/rxrpc/ar-internal.h
@@ -9,6 +9,7 @@
* 2 of the License, or (at your option) any later version.
*/
+#include <net/sock.h>
#include <rxrpc/packet.h>
#if 0
@@ -124,11 +125,15 @@ enum rxrpc_command {
* RxRPC security module interface
*/
struct rxrpc_security {
- struct module *owner; /* providing module */
- struct list_head link; /* link in master list */
const char *name; /* name of this service */
u8 security_index; /* security type provided */
+ /* Initialise a security service */
+ int (*init)(void);
+
+ /* Clean up a security service */
+ void (*exit)(void);
+
/* initialise a connection's security */
int (*init_connection_security)(struct rxrpc_connection *);
@@ -268,7 +273,7 @@ struct rxrpc_connection {
struct rb_root calls; /* calls on this connection */
struct sk_buff_head rx_queue; /* received conn-level packets */
struct rxrpc_call *channels[RXRPC_MAXCALLS]; /* channels (active calls) */
- struct rxrpc_security *security; /* applied security module */
+ const struct rxrpc_security *security; /* applied security module */
struct key *key; /* security for this connection (client) */
struct key *server_key; /* security for this service */
struct crypto_skcipher *cipher; /* encryption handle */
@@ -289,7 +294,9 @@ struct rxrpc_connection {
RXRPC_CONN_LOCALLY_ABORTED, /* - conn aborted locally */
RXRPC_CONN_NETWORK_ERROR, /* - conn terminated by network error */
} state;
- int error; /* error code for local abort */
+ u32 local_abort; /* local abort code */
+ u32 remote_abort; /* remote abort code */
+ int error; /* local error incurred */
int debug_id; /* debug ID for printks */
unsigned int call_counter; /* call ID counter */
atomic_t serial; /* packet serial number counter */
@@ -399,7 +406,9 @@ struct rxrpc_call {
rwlock_t state_lock; /* lock for state transition */
atomic_t usage;
atomic_t sequence; /* Tx data packet sequence counter */
- u32 abort_code; /* local/remote abort code */
+ u32 local_abort; /* local abort code */
+ u32 remote_abort; /* remote abort code */
+ int error; /* local error incurred */
enum rxrpc_call_state state : 8; /* current state of call */
int debug_id; /* debug ID for printks */
u8 channel; /* connection channel occupied by this call */
@@ -453,7 +462,7 @@ static inline void rxrpc_abort_call(struct rxrpc_call *call, u32 abort_code)
{
write_lock_bh(&call->state_lock);
if (call->state < RXRPC_CALL_COMPLETE) {
- call->abort_code = abort_code;
+ call->local_abort = abort_code;
call->state = RXRPC_CALL_LOCALLY_ABORTED;
set_bit(RXRPC_CALL_EV_ABORT, &call->events);
}
@@ -478,13 +487,6 @@ int rxrpc_reject_call(struct rxrpc_sock *);
/*
* ar-ack.c
*/
-extern unsigned int rxrpc_requested_ack_delay;
-extern unsigned int rxrpc_soft_ack_delay;
-extern unsigned int rxrpc_idle_ack_delay;
-extern unsigned int rxrpc_rx_window_size;
-extern unsigned int rxrpc_rx_mtu;
-extern unsigned int rxrpc_rx_jumbo_max;
-
void __rxrpc_propose_ACK(struct rxrpc_call *, u8, u32, bool);
void rxrpc_propose_ACK(struct rxrpc_call *, u8, u32, bool);
void rxrpc_process_call(struct work_struct *);
@@ -506,7 +508,7 @@ struct rxrpc_call *rxrpc_get_client_call(struct rxrpc_sock *,
unsigned long, int, gfp_t);
struct rxrpc_call *rxrpc_incoming_call(struct rxrpc_sock *,
struct rxrpc_connection *,
- struct rxrpc_host_header *, gfp_t);
+ struct rxrpc_host_header *);
struct rxrpc_call *rxrpc_find_server_call(struct rxrpc_sock *, unsigned long);
void rxrpc_release_call(struct rxrpc_call *);
void rxrpc_release_calls_on_socket(struct rxrpc_sock *);
@@ -531,8 +533,7 @@ void __exit rxrpc_destroy_all_connections(void);
struct rxrpc_connection *rxrpc_find_connection(struct rxrpc_transport *,
struct rxrpc_host_header *);
extern struct rxrpc_connection *
-rxrpc_incoming_connection(struct rxrpc_transport *, struct rxrpc_host_header *,
- gfp_t);
+rxrpc_incoming_connection(struct rxrpc_transport *, struct rxrpc_host_header *);
/*
* ar-connevent.c
@@ -550,8 +551,6 @@ void rxrpc_UDP_error_handler(struct work_struct *);
/*
* ar-input.c
*/
-extern const char *rxrpc_pkts[];
-
void rxrpc_data_ready(struct sock *);
int rxrpc_queue_rcv_skb(struct rxrpc_call *, struct sk_buff *, bool, bool);
void rxrpc_fast_process_packet(struct rxrpc_call *, struct sk_buff *);
@@ -610,14 +609,10 @@ int rxrpc_recvmsg(struct socket *, struct msghdr *, size_t, int);
/*
* ar-security.c
*/
-int rxrpc_register_security(struct rxrpc_security *);
-void rxrpc_unregister_security(struct rxrpc_security *);
+int __init rxrpc_init_security(void);
+void rxrpc_exit_security(void);
int rxrpc_init_client_conn_security(struct rxrpc_connection *);
int rxrpc_init_server_conn_security(struct rxrpc_connection *);
-int rxrpc_secure_packet(const struct rxrpc_call *, struct sk_buff *, size_t,
- void *);
-int rxrpc_verify_packet(const struct rxrpc_call *, struct sk_buff *, u32 *);
-void rxrpc_clear_conn_security(struct rxrpc_connection *);
/*
* ar-skbuff.c
@@ -637,6 +632,33 @@ struct rxrpc_transport *rxrpc_find_transport(struct rxrpc_local *,
struct rxrpc_peer *);
/*
+ * insecure.c
+ */
+extern const struct rxrpc_security rxrpc_no_security;
+
+/*
+ * misc.c
+ */
+extern unsigned int rxrpc_requested_ack_delay;
+extern unsigned int rxrpc_soft_ack_delay;
+extern unsigned int rxrpc_idle_ack_delay;
+extern unsigned int rxrpc_rx_window_size;
+extern unsigned int rxrpc_rx_mtu;
+extern unsigned int rxrpc_rx_jumbo_max;
+
+extern const char *const rxrpc_pkts[];
+extern const s8 rxrpc_ack_priority[];
+
+extern const char *rxrpc_acks(u8 reason);
+
+/*
+ * rxkad.c
+ */
+#ifdef CONFIG_RXKAD
+extern const struct rxrpc_security rxkad;
+#endif
+
+/*
* sysctl.c
*/
#ifdef CONFIG_SYSCTL
diff --git a/net/rxrpc/ar-key.c b/net/rxrpc/ar-key.c
index 3fb492eed..1021b4c0b 100644
--- a/net/rxrpc/ar-key.c
+++ b/net/rxrpc/ar-key.c
@@ -965,7 +965,7 @@ int rxrpc_get_server_data_key(struct rxrpc_connection *conn,
key = key_alloc(&key_type_rxrpc, "x",
GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred, 0,
- KEY_ALLOC_NOT_IN_QUOTA);
+ KEY_ALLOC_NOT_IN_QUOTA, NULL);
if (IS_ERR(key)) {
_leave(" = -ENOMEM [alloc %ld]", PTR_ERR(key));
return -ENOMEM;
@@ -1012,7 +1012,7 @@ struct key *rxrpc_get_null_key(const char *keyname)
key = key_alloc(&key_type_rxrpc, keyname,
GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred,
- KEY_POS_SEARCH, KEY_ALLOC_NOT_IN_QUOTA);
+ KEY_POS_SEARCH, KEY_ALLOC_NOT_IN_QUOTA, NULL);
if (IS_ERR(key))
return key;
diff --git a/net/rxrpc/ar-output.c b/net/rxrpc/ar-output.c
index d36fb6e1a..51cb10062 100644
--- a/net/rxrpc/ar-output.c
+++ b/net/rxrpc/ar-output.c
@@ -110,7 +110,7 @@ static void rxrpc_send_abort(struct rxrpc_call *call, u32 abort_code)
if (call->state <= RXRPC_CALL_COMPLETE) {
call->state = RXRPC_CALL_LOCALLY_ABORTED;
- call->abort_code = abort_code;
+ call->local_abort = abort_code;
set_bit(RXRPC_CALL_EV_ABORT, &call->events);
del_timer_sync(&call->resend_timer);
del_timer_sync(&call->ack_timer);
@@ -663,7 +663,7 @@ static int rxrpc_send_data(struct rxrpc_sock *rx,
size_t pad;
/* pad out if we're using security */
- if (conn->security) {
+ if (conn->security_ix) {
pad = conn->security_size + skb->mark;
pad = conn->size_align - pad;
pad &= conn->size_align - 1;
@@ -695,7 +695,7 @@ static int rxrpc_send_data(struct rxrpc_sock *rx,
if (more && seq & 1)
sp->hdr.flags |= RXRPC_REQUEST_ACK;
- ret = rxrpc_secure_packet(
+ ret = conn->security->secure_packet(
call, skb, skb->mark,
skb->head + sizeof(struct rxrpc_wire_header));
if (ret < 0)
diff --git a/net/rxrpc/ar-proc.c b/net/rxrpc/ar-proc.c
index 525b2ba5a..225163bc6 100644
--- a/net/rxrpc/ar-proc.c
+++ b/net/rxrpc/ar-proc.c
@@ -80,7 +80,7 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v)
call->conn->in_clientflag ? "Svc" : "Clt",
atomic_read(&call->usage),
rxrpc_call_states[call->state],
- call->abort_code,
+ call->remote_abort ?: call->local_abort,
call->user_call_ID);
return 0;
diff --git a/net/rxrpc/ar-recvmsg.c b/net/rxrpc/ar-recvmsg.c
index 64facba24..160f0927a 100644
--- a/net/rxrpc/ar-recvmsg.c
+++ b/net/rxrpc/ar-recvmsg.c
@@ -288,7 +288,11 @@ receive_non_data_message:
ret = put_cmsg(msg, SOL_RXRPC, RXRPC_BUSY, 0, &abort_code);
break;
case RXRPC_SKB_MARK_REMOTE_ABORT:
- abort_code = call->abort_code;
+ abort_code = call->remote_abort;
+ ret = put_cmsg(msg, SOL_RXRPC, RXRPC_ABORT, 4, &abort_code);
+ break;
+ case RXRPC_SKB_MARK_LOCAL_ABORT:
+ abort_code = call->local_abort;
ret = put_cmsg(msg, SOL_RXRPC, RXRPC_ABORT, 4, &abort_code);
break;
case RXRPC_SKB_MARK_NET_ERROR:
@@ -303,6 +307,7 @@ receive_non_data_message:
&abort_code);
break;
default:
+ pr_err("RxRPC: Unknown packet mark %u\n", skb->mark);
BUG();
break;
}
@@ -401,9 +406,14 @@ u32 rxrpc_kernel_get_abort_code(struct sk_buff *skb)
{
struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
- ASSERTCMP(skb->mark, ==, RXRPC_SKB_MARK_REMOTE_ABORT);
-
- return sp->call->abort_code;
+ switch (skb->mark) {
+ case RXRPC_SKB_MARK_REMOTE_ABORT:
+ return sp->call->remote_abort;
+ case RXRPC_SKB_MARK_LOCAL_ABORT:
+ return sp->call->local_abort;
+ default:
+ BUG();
+ }
}
EXPORT_SYMBOL(rxrpc_kernel_get_abort_code);
diff --git a/net/rxrpc/ar-security.c b/net/rxrpc/ar-security.c
index ceff6394a..d223253b2 100644
--- a/net/rxrpc/ar-security.c
+++ b/net/rxrpc/ar-security.c
@@ -22,109 +22,60 @@
static LIST_HEAD(rxrpc_security_methods);
static DECLARE_RWSEM(rxrpc_security_sem);
-/*
- * get an RxRPC security module
- */
-static struct rxrpc_security *rxrpc_security_get(struct rxrpc_security *sec)
-{
- return try_module_get(sec->owner) ? sec : NULL;
-}
-
-/*
- * release an RxRPC security module
- */
-static void rxrpc_security_put(struct rxrpc_security *sec)
+static const struct rxrpc_security *rxrpc_security_types[] = {
+ [RXRPC_SECURITY_NONE] = &rxrpc_no_security,
+#ifdef CONFIG_RXKAD
+ [RXRPC_SECURITY_RXKAD] = &rxkad,
+#endif
+};
+
+int __init rxrpc_init_security(void)
{
- module_put(sec->owner);
-}
-
-/*
- * look up an rxrpc security module
- */
-static struct rxrpc_security *rxrpc_security_lookup(u8 security_index)
-{
- struct rxrpc_security *sec = NULL;
-
- _enter("");
+ int i, ret;
- down_read(&rxrpc_security_sem);
-
- list_for_each_entry(sec, &rxrpc_security_methods, link) {
- if (sec->security_index == security_index) {
- if (unlikely(!rxrpc_security_get(sec)))
- break;
- goto out;
+ for (i = 0; i < ARRAY_SIZE(rxrpc_security_types); i++) {
+ if (rxrpc_security_types[i]) {
+ ret = rxrpc_security_types[i]->init();
+ if (ret < 0)
+ goto failed;
}
}
- sec = NULL;
-out:
- up_read(&rxrpc_security_sem);
- _leave(" = %p [%s]", sec, sec ? sec->name : "");
- return sec;
+ return 0;
+
+failed:
+ for (i--; i >= 0; i--)
+ if (rxrpc_security_types[i])
+ rxrpc_security_types[i]->exit();
+ return ret;
}
-/**
- * rxrpc_register_security - register an RxRPC security handler
- * @sec: security module
- *
- * register an RxRPC security handler for use by RxRPC
- */
-int rxrpc_register_security(struct rxrpc_security *sec)
+void rxrpc_exit_security(void)
{
- struct rxrpc_security *psec;
- int ret;
+ int i;
- _enter("");
- down_write(&rxrpc_security_sem);
-
- ret = -EEXIST;
- list_for_each_entry(psec, &rxrpc_security_methods, link) {
- if (psec->security_index == sec->security_index)
- goto out;
- }
-
- list_add(&sec->link, &rxrpc_security_methods);
-
- printk(KERN_NOTICE "RxRPC: Registered security type %d '%s'\n",
- sec->security_index, sec->name);
- ret = 0;
-
-out:
- up_write(&rxrpc_security_sem);
- _leave(" = %d", ret);
- return ret;
+ for (i = 0; i < ARRAY_SIZE(rxrpc_security_types); i++)
+ if (rxrpc_security_types[i])
+ rxrpc_security_types[i]->exit();
}
-EXPORT_SYMBOL_GPL(rxrpc_register_security);
-
-/**
- * rxrpc_unregister_security - unregister an RxRPC security handler
- * @sec: security module
- *
- * unregister an RxRPC security handler
+/*
+ * look up an rxrpc security module
*/
-void rxrpc_unregister_security(struct rxrpc_security *sec)
+static const struct rxrpc_security *rxrpc_security_lookup(u8 security_index)
{
-
- _enter("");
- down_write(&rxrpc_security_sem);
- list_del_init(&sec->link);
- up_write(&rxrpc_security_sem);
-
- printk(KERN_NOTICE "RxRPC: Unregistered security type %d '%s'\n",
- sec->security_index, sec->name);
+ if (security_index >= ARRAY_SIZE(rxrpc_security_types))
+ return NULL;
+ return rxrpc_security_types[security_index];
}
-EXPORT_SYMBOL_GPL(rxrpc_unregister_security);
-
/*
* initialise the security on a client connection
*/
int rxrpc_init_client_conn_security(struct rxrpc_connection *conn)
{
+ const struct rxrpc_security *sec;
struct rxrpc_key_token *token;
- struct rxrpc_security *sec;
struct key *key = conn->key;
int ret;
@@ -148,8 +99,7 @@ int rxrpc_init_client_conn_security(struct rxrpc_connection *conn)
ret = conn->security->init_connection_security(conn);
if (ret < 0) {
- rxrpc_security_put(conn->security);
- conn->security = NULL;
+ conn->security = &rxrpc_no_security;
return ret;
}
@@ -162,7 +112,7 @@ int rxrpc_init_client_conn_security(struct rxrpc_connection *conn)
*/
int rxrpc_init_server_conn_security(struct rxrpc_connection *conn)
{
- struct rxrpc_security *sec;
+ const struct rxrpc_security *sec;
struct rxrpc_local *local = conn->trans->local;
struct rxrpc_sock *rx;
struct key *key;
@@ -188,14 +138,12 @@ int rxrpc_init_server_conn_security(struct rxrpc_connection *conn)
/* the service appears to have died */
read_unlock_bh(&local->services_lock);
- rxrpc_security_put(sec);
_leave(" = -ENOENT");
return -ENOENT;
found_service:
if (!rx->securities) {
read_unlock_bh(&local->services_lock);
- rxrpc_security_put(sec);
_leave(" = -ENOKEY");
return -ENOKEY;
}
@@ -205,7 +153,6 @@ found_service:
&key_type_rxrpc_s, kdesc);
if (IS_ERR(kref)) {
read_unlock_bh(&local->services_lock);
- rxrpc_security_put(sec);
_leave(" = %ld [search]", PTR_ERR(kref));
return PTR_ERR(kref);
}
@@ -219,46 +166,3 @@ found_service:
_leave(" = 0");
return 0;
}
-
-/*
- * secure a packet prior to transmission
- */
-int rxrpc_secure_packet(const struct rxrpc_call *call,
- struct sk_buff *skb,
- size_t data_size,
- void *sechdr)
-{
- if (call->conn->security)
- return call->conn->security->secure_packet(
- call, skb, data_size, sechdr);
- return 0;
-}
-
-/*
- * secure a packet prior to transmission
- */
-int rxrpc_verify_packet(const struct rxrpc_call *call, struct sk_buff *skb,
- u32 *_abort_code)
-{
- if (call->conn->security)
- return call->conn->security->verify_packet(
- call, skb, _abort_code);
- return 0;
-}
-
-/*
- * clear connection security
- */
-void rxrpc_clear_conn_security(struct rxrpc_connection *conn)
-{
- _enter("{%d}", conn->debug_id);
-
- if (conn->security) {
- conn->security->clear(conn);
- rxrpc_security_put(conn->security);
- conn->security = NULL;
- }
-
- key_put(conn->key);
- key_put(conn->server_key);
-}
diff --git a/net/rxrpc/insecure.c b/net/rxrpc/insecure.c
new file mode 100644
index 000000000..e57140361
--- /dev/null
+++ b/net/rxrpc/insecure.c
@@ -0,0 +1,83 @@
+/* Null security operations.
+ *
+ * Copyright (C) 2016 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#include <net/af_rxrpc.h>
+#include "ar-internal.h"
+
+static int none_init_connection_security(struct rxrpc_connection *conn)
+{
+ return 0;
+}
+
+static void none_prime_packet_security(struct rxrpc_connection *conn)
+{
+}
+
+static int none_secure_packet(const struct rxrpc_call *call,
+ struct sk_buff *skb,
+ size_t data_size,
+ void *sechdr)
+{
+ return 0;
+}
+
+static int none_verify_packet(const struct rxrpc_call *call,
+ struct sk_buff *skb,
+ u32 *_abort_code)
+{
+ return 0;
+}
+
+static int none_respond_to_challenge(struct rxrpc_connection *conn,
+ struct sk_buff *skb,
+ u32 *_abort_code)
+{
+ *_abort_code = RX_PROTOCOL_ERROR;
+ return -EPROTO;
+}
+
+static int none_verify_response(struct rxrpc_connection *conn,
+ struct sk_buff *skb,
+ u32 *_abort_code)
+{
+ *_abort_code = RX_PROTOCOL_ERROR;
+ return -EPROTO;
+}
+
+static void none_clear(struct rxrpc_connection *conn)
+{
+}
+
+static int none_init(void)
+{
+ return 0;
+}
+
+static void none_exit(void)
+{
+}
+
+/*
+ * RxRPC Kerberos-based security
+ */
+const struct rxrpc_security rxrpc_no_security = {
+ .name = "none",
+ .security_index = RXRPC_SECURITY_NONE,
+ .init = none_init,
+ .exit = none_exit,
+ .init_connection_security = none_init_connection_security,
+ .prime_packet_security = none_prime_packet_security,
+ .secure_packet = none_secure_packet,
+ .verify_packet = none_verify_packet,
+ .respond_to_challenge = none_respond_to_challenge,
+ .verify_response = none_verify_response,
+ .clear = none_clear,
+};
diff --git a/net/rxrpc/misc.c b/net/rxrpc/misc.c
new file mode 100644
index 000000000..1afe9876e
--- /dev/null
+++ b/net/rxrpc/misc.c
@@ -0,0 +1,89 @@
+/* Miscellaneous bits
+ *
+ * Copyright (C) 2016 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <net/sock.h>
+#include <net/af_rxrpc.h>
+#include "ar-internal.h"
+
+/*
+ * How long to wait before scheduling ACK generation after seeing a
+ * packet with RXRPC_REQUEST_ACK set (in jiffies).
+ */
+unsigned int rxrpc_requested_ack_delay = 1;
+
+/*
+ * How long to wait before scheduling an ACK with subtype DELAY (in jiffies).
+ *
+ * We use this when we've received new data packets. If those packets aren't
+ * all consumed within this time we will send a DELAY ACK if an ACK was not
+ * requested to let the sender know it doesn't need to resend.
+ */
+unsigned int rxrpc_soft_ack_delay = 1 * HZ;
+
+/*
+ * How long to wait before scheduling an ACK with subtype IDLE (in jiffies).
+ *
+ * We use this when we've consumed some previously soft-ACK'd packets when
+ * further packets aren't immediately received to decide when to send an IDLE
+ * ACK let the other end know that it can free up its Tx buffer space.
+ */
+unsigned int rxrpc_idle_ack_delay = 0.5 * HZ;
+
+/*
+ * Receive window size in packets. This indicates the maximum number of
+ * unconsumed received packets we're willing to retain in memory. Once this
+ * limit is hit, we should generate an EXCEEDS_WINDOW ACK and discard further
+ * packets.
+ */
+unsigned int rxrpc_rx_window_size = 32;
+
+/*
+ * Maximum Rx MTU size. This indicates to the sender the size of jumbo packet
+ * made by gluing normal packets together that we're willing to handle.
+ */
+unsigned int rxrpc_rx_mtu = 5692;
+
+/*
+ * The maximum number of fragments in a received jumbo packet that we tell the
+ * sender that we're willing to handle.
+ */
+unsigned int rxrpc_rx_jumbo_max = 4;
+
+const char *const rxrpc_pkts[] = {
+ "?00",
+ "DATA", "ACK", "BUSY", "ABORT", "ACKALL", "CHALL", "RESP", "DEBUG",
+ "?09", "?10", "?11", "?12", "VERSION", "?14", "?15"
+};
+
+const s8 rxrpc_ack_priority[] = {
+ [0] = 0,
+ [RXRPC_ACK_DELAY] = 1,
+ [RXRPC_ACK_REQUESTED] = 2,
+ [RXRPC_ACK_IDLE] = 3,
+ [RXRPC_ACK_PING_RESPONSE] = 4,
+ [RXRPC_ACK_DUPLICATE] = 5,
+ [RXRPC_ACK_OUT_OF_SEQUENCE] = 6,
+ [RXRPC_ACK_EXCEEDS_WINDOW] = 7,
+ [RXRPC_ACK_NOSPACE] = 8,
+};
+
+const char *rxrpc_acks(u8 reason)
+{
+ static const char *const str[] = {
+ "---", "REQ", "DUP", "OOS", "WIN", "MEM", "PNG", "PNR", "DLY",
+ "IDL", "-?-"
+ };
+
+ if (reason >= ARRAY_SIZE(str))
+ reason = ARRAY_SIZE(str) - 1;
+ return str[reason];
+}
diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c
index f0aeb8163..bab56ed64 100644
--- a/net/rxrpc/rxkad.c
+++ b/net/rxrpc/rxkad.c
@@ -20,7 +20,6 @@
#include <net/sock.h>
#include <net/af_rxrpc.h>
#include <keys/rxrpc-type.h>
-#define rxrpc_debug rxkad_debug
#include "ar-internal.h"
#define RXKAD_VERSION 2
@@ -31,10 +30,6 @@
#define REALM_SZ 40 /* size of principal's auth domain */
#define SNAME_SZ 40 /* size of service name */
-unsigned int rxrpc_debug;
-module_param_named(debug, rxrpc_debug, uint, S_IWUSR | S_IRUGO);
-MODULE_PARM_DESC(debug, "rxkad debugging mask");
-
struct rxkad_level1_hdr {
__be32 data_size; /* true data size (excluding padding) */
};
@@ -44,10 +39,6 @@ struct rxkad_level2_hdr {
__be32 checksum; /* decrypted data checksum */
};
-MODULE_DESCRIPTION("RxRPC network protocol type-2 security (Kerberos 4)");
-MODULE_AUTHOR("Red Hat, Inc.");
-MODULE_LICENSE("GPL");
-
/*
* this holds a pinned cipher so that keventd doesn't get called by the cipher
* alloc routine, but since we have it to hand, we use it to decrypt RESPONSE
@@ -1164,12 +1155,33 @@ static void rxkad_clear(struct rxrpc_connection *conn)
}
/*
+ * Initialise the rxkad security service.
+ */
+static int rxkad_init(void)
+{
+ /* pin the cipher we need so that the crypto layer doesn't invoke
+ * keventd to go get it */
+ rxkad_ci = crypto_alloc_skcipher("pcbc(fcrypt)", 0, CRYPTO_ALG_ASYNC);
+ return PTR_ERR_OR_ZERO(rxkad_ci);
+}
+
+/*
+ * Clean up the rxkad security service.
+ */
+static void rxkad_exit(void)
+{
+ if (rxkad_ci)
+ crypto_free_skcipher(rxkad_ci);
+}
+
+/*
* RxRPC Kerberos-based security
*/
-static struct rxrpc_security rxkad = {
- .owner = THIS_MODULE,
+const struct rxrpc_security rxkad = {
.name = "rxkad",
.security_index = RXRPC_SECURITY_RXKAD,
+ .init = rxkad_init,
+ .exit = rxkad_exit,
.init_connection_security = rxkad_init_connection_security,
.prime_packet_security = rxkad_prime_packet_security,
.secure_packet = rxkad_secure_packet,
@@ -1179,28 +1191,3 @@ static struct rxrpc_security rxkad = {
.verify_response = rxkad_verify_response,
.clear = rxkad_clear,
};
-
-static __init int rxkad_init(void)
-{
- _enter("");
-
- /* pin the cipher we need so that the crypto layer doesn't invoke
- * keventd to go get it */
- rxkad_ci = crypto_alloc_skcipher("pcbc(fcrypt)", 0, CRYPTO_ALG_ASYNC);
- if (IS_ERR(rxkad_ci))
- return PTR_ERR(rxkad_ci);
-
- return rxrpc_register_security(&rxkad);
-}
-
-module_init(rxkad_init);
-
-static __exit void rxkad_exit(void)
-{
- _enter("");
-
- rxrpc_unregister_security(&rxkad);
- crypto_free_skcipher(rxkad_ci);
-}
-
-module_exit(rxkad_exit);
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 96066665e..c7a0b0d48 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -657,12 +657,15 @@ int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *a,
if (compat_mode) {
if (a->type == TCA_OLD_COMPAT)
err = gnet_stats_start_copy_compat(skb, 0,
- TCA_STATS, TCA_XSTATS, &p->tcfc_lock, &d);
+ TCA_STATS,
+ TCA_XSTATS,
+ &p->tcfc_lock, &d,
+ TCA_PAD);
else
return 0;
} else
err = gnet_stats_start_copy(skb, TCA_ACT_STATS,
- &p->tcfc_lock, &d);
+ &p->tcfc_lock, &d, TCA_ACT_PAD);
if (err < 0)
goto errout;
@@ -1115,7 +1118,7 @@ tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb)
nla_nest_end(skb, nest);
ret = skb->len;
} else
- nla_nest_cancel(skb, nest);
+ nlmsg_trim(skb, b);
nlh->nlmsg_len = skb_tail_pointer(skb) - b;
if (NETLINK_CB(cb->skb).portid && ret)
diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c
index 8c9f1f045..c7123e01c 100644
--- a/net/sched/act_bpf.c
+++ b/net/sched/act_bpf.c
@@ -53,9 +53,11 @@ static int tcf_bpf(struct sk_buff *skb, const struct tc_action *act,
filter = rcu_dereference(prog->filter);
if (at_ingress) {
__skb_push(skb, skb->mac_len);
+ bpf_compute_data_end(skb);
filter_res = BPF_PROG_RUN(filter, skb);
__skb_pull(skb, skb->mac_len);
} else {
+ bpf_compute_data_end(skb);
filter_res = BPF_PROG_RUN(filter, skb);
}
rcu_read_unlock();
@@ -156,7 +158,8 @@ static int tcf_bpf_dump(struct sk_buff *skb, struct tc_action *act,
tm.lastuse = jiffies_to_clock_t(jiffies - prog->tcf_tm.lastuse);
tm.expires = jiffies_to_clock_t(prog->tcf_tm.expires);
- if (nla_put(skb, TCA_ACT_BPF_TM, sizeof(tm), &tm))
+ if (nla_put_64bit(skb, TCA_ACT_BPF_TM, sizeof(tm), &tm,
+ TCA_ACT_BPF_PAD))
goto nla_put_failure;
return skb->len;
diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c
index c0ed93ce2..2ba700c76 100644
--- a/net/sched/act_connmark.c
+++ b/net/sched/act_connmark.c
@@ -163,7 +163,8 @@ static inline int tcf_connmark_dump(struct sk_buff *skb, struct tc_action *a,
t.install = jiffies_to_clock_t(jiffies - ci->tcf_tm.install);
t.lastuse = jiffies_to_clock_t(jiffies - ci->tcf_tm.lastuse);
t.expires = jiffies_to_clock_t(ci->tcf_tm.expires);
- if (nla_put(skb, TCA_CONNMARK_TM, sizeof(t), &t))
+ if (nla_put_64bit(skb, TCA_CONNMARK_TM, sizeof(t), &t,
+ TCA_CONNMARK_PAD))
goto nla_put_failure;
return skb->len;
diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c
index d22426cde..28e934ed0 100644
--- a/net/sched/act_csum.c
+++ b/net/sched/act_csum.c
@@ -549,7 +549,7 @@ static int tcf_csum_dump(struct sk_buff *skb,
t.install = jiffies_to_clock_t(jiffies - p->tcf_tm.install);
t.lastuse = jiffies_to_clock_t(jiffies - p->tcf_tm.lastuse);
t.expires = jiffies_to_clock_t(p->tcf_tm.expires);
- if (nla_put(skb, TCA_CSUM_TM, sizeof(t), &t))
+ if (nla_put_64bit(skb, TCA_CSUM_TM, sizeof(t), &t, TCA_CSUM_PAD))
goto nla_put_failure;
return skb->len;
diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c
index 887fc1f20..ec5cc8435 100644
--- a/net/sched/act_gact.c
+++ b/net/sched/act_gact.c
@@ -148,6 +148,20 @@ static int tcf_gact(struct sk_buff *skb, const struct tc_action *a,
return action;
}
+static void tcf_gact_stats_update(struct tc_action *a, u64 bytes, u32 packets,
+ u64 lastuse)
+{
+ struct tcf_gact *gact = a->priv;
+ int action = READ_ONCE(gact->tcf_action);
+ struct tcf_t *tm = &gact->tcf_tm;
+
+ _bstats_cpu_update(this_cpu_ptr(gact->common.cpu_bstats), bytes, packets);
+ if (action == TC_ACT_SHOT)
+ this_cpu_ptr(gact->common.cpu_qstats)->drops += packets;
+
+ tm->lastuse = lastuse;
+}
+
static int tcf_gact_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
{
unsigned char *b = skb_tail_pointer(skb);
@@ -177,7 +191,7 @@ static int tcf_gact_dump(struct sk_buff *skb, struct tc_action *a, int bind, int
t.install = jiffies_to_clock_t(jiffies - gact->tcf_tm.install);
t.lastuse = jiffies_to_clock_t(jiffies - gact->tcf_tm.lastuse);
t.expires = jiffies_to_clock_t(gact->tcf_tm.expires);
- if (nla_put(skb, TCA_GACT_TM, sizeof(t), &t))
+ if (nla_put_64bit(skb, TCA_GACT_TM, sizeof(t), &t, TCA_GACT_PAD))
goto nla_put_failure;
return skb->len;
@@ -207,6 +221,7 @@ static struct tc_action_ops act_gact_ops = {
.type = TCA_ACT_GACT,
.owner = THIS_MODULE,
.act = tcf_gact,
+ .stats_update = tcf_gact_stats_update,
.dump = tcf_gact_dump,
.init = tcf_gact_init,
.walk = tcf_gact_walker,
diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c
index 343d011aa..ea4a2fef1 100644
--- a/net/sched/act_ife.c
+++ b/net/sched/act_ife.c
@@ -106,9 +106,9 @@ int ife_get_meta_u16(struct sk_buff *skb, struct tcf_meta_info *mi)
}
EXPORT_SYMBOL_GPL(ife_get_meta_u16);
-int ife_alloc_meta_u32(struct tcf_meta_info *mi, void *metaval)
+int ife_alloc_meta_u32(struct tcf_meta_info *mi, void *metaval, gfp_t gfp)
{
- mi->metaval = kmemdup(metaval, sizeof(u32), GFP_KERNEL);
+ mi->metaval = kmemdup(metaval, sizeof(u32), gfp);
if (!mi->metaval)
return -ENOMEM;
@@ -116,9 +116,9 @@ int ife_alloc_meta_u32(struct tcf_meta_info *mi, void *metaval)
}
EXPORT_SYMBOL_GPL(ife_alloc_meta_u32);
-int ife_alloc_meta_u16(struct tcf_meta_info *mi, void *metaval)
+int ife_alloc_meta_u16(struct tcf_meta_info *mi, void *metaval, gfp_t gfp)
{
- mi->metaval = kmemdup(metaval, sizeof(u16), GFP_KERNEL);
+ mi->metaval = kmemdup(metaval, sizeof(u16), gfp);
if (!mi->metaval)
return -ENOMEM;
@@ -240,10 +240,10 @@ static int ife_validate_metatype(struct tcf_meta_ops *ops, void *val, int len)
}
/* called when adding new meta information
- * under ife->tcf_lock
+ * under ife->tcf_lock for existing action
*/
static int load_metaops_and_vet(struct tcf_ife_info *ife, u32 metaid,
- void *val, int len)
+ void *val, int len, bool exists)
{
struct tcf_meta_ops *ops = find_ife_oplist(metaid);
int ret = 0;
@@ -251,11 +251,13 @@ static int load_metaops_and_vet(struct tcf_ife_info *ife, u32 metaid,
if (!ops) {
ret = -ENOENT;
#ifdef CONFIG_MODULES
- spin_unlock_bh(&ife->tcf_lock);
+ if (exists)
+ spin_unlock_bh(&ife->tcf_lock);
rtnl_unlock();
request_module("ifemeta%u", metaid);
rtnl_lock();
- spin_lock_bh(&ife->tcf_lock);
+ if (exists)
+ spin_lock_bh(&ife->tcf_lock);
ops = find_ife_oplist(metaid);
#endif
}
@@ -272,10 +274,10 @@ static int load_metaops_and_vet(struct tcf_ife_info *ife, u32 metaid,
}
/* called when adding new meta information
- * under ife->tcf_lock
+ * under ife->tcf_lock for existing action
*/
static int add_metainfo(struct tcf_ife_info *ife, u32 metaid, void *metaval,
- int len)
+ int len, bool atomic)
{
struct tcf_meta_info *mi = NULL;
struct tcf_meta_ops *ops = find_ife_oplist(metaid);
@@ -284,7 +286,7 @@ static int add_metainfo(struct tcf_ife_info *ife, u32 metaid, void *metaval,
if (!ops)
return -ENOENT;
- mi = kzalloc(sizeof(*mi), GFP_KERNEL);
+ mi = kzalloc(sizeof(*mi), atomic ? GFP_ATOMIC : GFP_KERNEL);
if (!mi) {
/*put back what find_ife_oplist took */
module_put(ops->owner);
@@ -294,7 +296,7 @@ static int add_metainfo(struct tcf_ife_info *ife, u32 metaid, void *metaval,
mi->metaid = metaid;
mi->ops = ops;
if (len > 0) {
- ret = ops->alloc(mi, metaval);
+ ret = ops->alloc(mi, metaval, atomic ? GFP_ATOMIC : GFP_KERNEL);
if (ret != 0) {
kfree(mi);
module_put(ops->owner);
@@ -313,11 +315,13 @@ static int use_all_metadata(struct tcf_ife_info *ife)
int rc = 0;
int installed = 0;
+ read_lock(&ife_mod_lock);
list_for_each_entry(o, &ifeoplist, list) {
- rc = add_metainfo(ife, o->metaid, NULL, 0);
+ rc = add_metainfo(ife, o->metaid, NULL, 0, true);
if (rc == 0)
installed += 1;
}
+ read_unlock(&ife_mod_lock);
if (installed)
return 0;
@@ -385,8 +389,9 @@ static void tcf_ife_cleanup(struct tc_action *a, int bind)
spin_unlock_bh(&ife->tcf_lock);
}
-/* under ife->tcf_lock */
-static int populate_metalist(struct tcf_ife_info *ife, struct nlattr **tb)
+/* under ife->tcf_lock for existing action */
+static int populate_metalist(struct tcf_ife_info *ife, struct nlattr **tb,
+ bool exists)
{
int len = 0;
int rc = 0;
@@ -398,11 +403,11 @@ static int populate_metalist(struct tcf_ife_info *ife, struct nlattr **tb)
val = nla_data(tb[i]);
len = nla_len(tb[i]);
- rc = load_metaops_and_vet(ife, i, val, len);
+ rc = load_metaops_and_vet(ife, i, val, len, exists);
if (rc != 0)
return rc;
- rc = add_metainfo(ife, i, val, len);
+ rc = add_metainfo(ife, i, val, len, exists);
if (rc)
return rc;
}
@@ -474,7 +479,8 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla,
saddr = nla_data(tb[TCA_IFE_SMAC]);
}
- spin_lock_bh(&ife->tcf_lock);
+ if (exists)
+ spin_lock_bh(&ife->tcf_lock);
ife->tcf_action = parm->action;
if (parm->flags & IFE_ENCODE) {
@@ -504,11 +510,12 @@ metadata_parse_err:
if (ret == ACT_P_CREATED)
_tcf_ife_cleanup(a, bind);
- spin_unlock_bh(&ife->tcf_lock);
+ if (exists)
+ spin_unlock_bh(&ife->tcf_lock);
return err;
}
- err = populate_metalist(ife, tb2);
+ err = populate_metalist(ife, tb2, exists);
if (err)
goto metadata_parse_err;
@@ -523,12 +530,14 @@ metadata_parse_err:
if (ret == ACT_P_CREATED)
_tcf_ife_cleanup(a, bind);
- spin_unlock_bh(&ife->tcf_lock);
+ if (exists)
+ spin_unlock_bh(&ife->tcf_lock);
return err;
}
}
- spin_unlock_bh(&ife->tcf_lock);
+ if (exists)
+ spin_unlock_bh(&ife->tcf_lock);
if (ret == ACT_P_CREATED)
tcf_hash_insert(tn, a);
@@ -556,7 +565,7 @@ static int tcf_ife_dump(struct sk_buff *skb, struct tc_action *a, int bind,
t.install = jiffies_to_clock_t(jiffies - ife->tcf_tm.install);
t.lastuse = jiffies_to_clock_t(jiffies - ife->tcf_tm.lastuse);
t.expires = jiffies_to_clock_t(ife->tcf_tm.expires);
- if (nla_put(skb, TCA_IFE_TM, sizeof(t), &t))
+ if (nla_put_64bit(skb, TCA_IFE_TM, sizeof(t), &t, TCA_IFE_PAD))
goto nla_put_failure;
if (!is_zero_ether_addr(ife->eth_dst)) {
diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c
index 606323339..d4bd19ee5 100644
--- a/net/sched/act_ipt.c
+++ b/net/sched/act_ipt.c
@@ -283,7 +283,7 @@ static int tcf_ipt_dump(struct sk_buff *skb, struct tc_action *a, int bind, int
tm.install = jiffies_to_clock_t(jiffies - ipt->tcf_tm.install);
tm.lastuse = jiffies_to_clock_t(jiffies - ipt->tcf_tm.lastuse);
tm.expires = jiffies_to_clock_t(ipt->tcf_tm.expires);
- if (nla_put(skb, TCA_IPT_TM, sizeof (tm), &tm))
+ if (nla_put_64bit(skb, TCA_IPT_TM, sizeof(tm), &tm, TCA_IPT_PAD))
goto nla_put_failure;
kfree(t);
return skb->len;
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index 934336e12..1f5bd6ccb 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -36,14 +36,15 @@ static DEFINE_SPINLOCK(mirred_list_lock);
static void tcf_mirred_release(struct tc_action *a, int bind)
{
struct tcf_mirred *m = to_mirred(a);
- struct net_device *dev = rcu_dereference_protected(m->tcfm_dev, 1);
+ struct net_device *dev;
/* We could be called either in a RCU callback or with RTNL lock held. */
spin_lock_bh(&mirred_list_lock);
list_del(&m->tcfm_list);
- spin_unlock_bh(&mirred_list_lock);
+ dev = rcu_dereference_protected(m->tcfm_dev, 1);
if (dev)
dev_put(dev);
+ spin_unlock_bh(&mirred_list_lock);
}
static const struct nla_policy mirred_policy[TCA_MIRRED_MAX + 1] = {
@@ -221,7 +222,7 @@ static int tcf_mirred_dump(struct sk_buff *skb, struct tc_action *a, int bind, i
t.install = jiffies_to_clock_t(jiffies - m->tcf_tm.install);
t.lastuse = jiffies_to_clock_t(jiffies - m->tcf_tm.lastuse);
t.expires = jiffies_to_clock_t(m->tcf_tm.expires);
- if (nla_put(skb, TCA_MIRRED_TM, sizeof(t), &t))
+ if (nla_put_64bit(skb, TCA_MIRRED_TM, sizeof(t), &t, TCA_MIRRED_PAD))
goto nla_put_failure;
return skb->len;
diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c
index 0f65cdfbf..c0a879f94 100644
--- a/net/sched/act_nat.c
+++ b/net/sched/act_nat.c
@@ -267,7 +267,7 @@ static int tcf_nat_dump(struct sk_buff *skb, struct tc_action *a,
t.install = jiffies_to_clock_t(jiffies - p->tcf_tm.install);
t.lastuse = jiffies_to_clock_t(jiffies - p->tcf_tm.lastuse);
t.expires = jiffies_to_clock_t(p->tcf_tm.expires);
- if (nla_put(skb, TCA_NAT_TM, sizeof(t), &t))
+ if (nla_put_64bit(skb, TCA_NAT_TM, sizeof(t), &t, TCA_NAT_PAD))
goto nla_put_failure;
return skb->len;
diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c
index 429c3ab65..c6e18f230 100644
--- a/net/sched/act_pedit.c
+++ b/net/sched/act_pedit.c
@@ -203,7 +203,7 @@ static int tcf_pedit_dump(struct sk_buff *skb, struct tc_action *a,
t.install = jiffies_to_clock_t(jiffies - p->tcf_tm.install);
t.lastuse = jiffies_to_clock_t(jiffies - p->tcf_tm.lastuse);
t.expires = jiffies_to_clock_t(p->tcf_tm.expires);
- if (nla_put(skb, TCA_PEDIT_TM, sizeof(t), &t))
+ if (nla_put_64bit(skb, TCA_PEDIT_TM, sizeof(t), &t, TCA_PEDIT_PAD))
goto nla_put_failure;
kfree(opt);
return skb->len;
diff --git a/net/sched/act_police.c b/net/sched/act_police.c
index 330f14e30..c55778976 100644
--- a/net/sched/act_police.c
+++ b/net/sched/act_police.c
@@ -38,7 +38,7 @@ struct tcf_police {
bool peak_present;
};
#define to_police(pc) \
- container_of(pc, struct tcf_police, common)
+ container_of(pc->priv, struct tcf_police, common)
#define POL_TAB_MASK 15
@@ -119,14 +119,12 @@ static int tcf_act_police_locate(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action *a,
int ovr, int bind)
{
- unsigned int h;
int ret = 0, err;
struct nlattr *tb[TCA_POLICE_MAX + 1];
struct tc_police *parm;
struct tcf_police *police;
struct qdisc_rate_table *R_tab = NULL, *P_tab = NULL;
struct tc_action_net *tn = net_generic(net, police_net_id);
- struct tcf_hashinfo *hinfo = tn->hinfo;
int size;
if (nla == NULL)
@@ -145,7 +143,7 @@ static int tcf_act_police_locate(struct net *net, struct nlattr *nla,
if (parm->index) {
if (tcf_hash_search(tn, a, parm->index)) {
- police = to_police(a->priv);
+ police = to_police(a);
if (bind) {
police->tcf_bindcnt += 1;
police->tcf_refcnt += 1;
@@ -156,16 +154,15 @@ static int tcf_act_police_locate(struct net *net, struct nlattr *nla,
/* not replacing */
return -EEXIST;
}
+ } else {
+ ret = tcf_hash_create(tn, parm->index, NULL, a,
+ sizeof(*police), bind, false);
+ if (ret)
+ return ret;
+ ret = ACT_P_CREATED;
}
- police = kzalloc(sizeof(*police), GFP_KERNEL);
- if (police == NULL)
- return -ENOMEM;
- ret = ACT_P_CREATED;
- police->tcf_refcnt = 1;
- spin_lock_init(&police->tcf_lock);
- if (bind)
- police->tcf_bindcnt = 1;
+ police = to_police(a);
override:
if (parm->rate.rate) {
err = -ENOMEM;
@@ -237,14 +234,8 @@ override:
return ret;
police->tcfp_t_c = ktime_get_ns();
- police->tcf_index = parm->index ? parm->index :
- tcf_hash_new_index(tn);
- h = tcf_hash(police->tcf_index, POL_TAB_MASK);
- spin_lock_bh(&hinfo->lock);
- hlist_add_head(&police->tcf_head, &hinfo->htab[h]);
- spin_unlock_bh(&hinfo->lock);
+ tcf_hash_insert(tn, a);
- a->priv = police;
return ret;
failure_unlock:
@@ -253,7 +244,7 @@ failure:
qdisc_put_rtab(P_tab);
qdisc_put_rtab(R_tab);
if (ret == ACT_P_CREATED)
- kfree(police);
+ tcf_hash_cleanup(a, est);
return err;
}
@@ -268,6 +259,7 @@ static int tcf_act_police(struct sk_buff *skb, const struct tc_action *a,
spin_lock(&police->tcf_lock);
bstats_update(&police->tcf_bstats, skb);
+ tcf_lastuse_update(&police->tcf_tm);
if (police->tcfp_ewma_rate &&
police->tcf_rate_est.bps >= police->tcfp_ewma_rate) {
@@ -327,6 +319,7 @@ tcf_act_police_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
.refcnt = police->tcf_refcnt - ref,
.bindcnt = police->tcf_bindcnt - bind,
};
+ struct tcf_t t;
if (police->rate_present)
psched_ratecfg_getrate(&opt.rate, &police->rate);
@@ -340,6 +333,13 @@ tcf_act_police_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
if (police->tcfp_ewma_rate &&
nla_put_u32(skb, TCA_POLICE_AVRATE, police->tcfp_ewma_rate))
goto nla_put_failure;
+
+ t.install = jiffies_to_clock_t(jiffies - police->tcf_tm.install);
+ t.lastuse = jiffies_to_clock_t(jiffies - police->tcf_tm.lastuse);
+ t.expires = jiffies_to_clock_t(police->tcf_tm.expires);
+ if (nla_put_64bit(skb, TCA_POLICE_TM, sizeof(t), &t, TCA_POLICE_PAD))
+ goto nla_put_failure;
+
return skb->len;
nla_put_failure:
diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c
index 3a33fb648..e42f8daca 100644
--- a/net/sched/act_simple.c
+++ b/net/sched/act_simple.c
@@ -161,7 +161,7 @@ static int tcf_simp_dump(struct sk_buff *skb, struct tc_action *a,
t.install = jiffies_to_clock_t(jiffies - d->tcf_tm.install);
t.lastuse = jiffies_to_clock_t(jiffies - d->tcf_tm.lastuse);
t.expires = jiffies_to_clock_t(d->tcf_tm.expires);
- if (nla_put(skb, TCA_DEF_TM, sizeof(t), &t))
+ if (nla_put_64bit(skb, TCA_DEF_TM, sizeof(t), &t, TCA_DEF_PAD))
goto nla_put_failure;
return skb->len;
diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c
index 69da5a8f0..e92880296 100644
--- a/net/sched/act_skbedit.c
+++ b/net/sched/act_skbedit.c
@@ -171,7 +171,7 @@ static int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a,
t.install = jiffies_to_clock_t(jiffies - d->tcf_tm.install);
t.lastuse = jiffies_to_clock_t(jiffies - d->tcf_tm.lastuse);
t.expires = jiffies_to_clock_t(d->tcf_tm.expires);
- if (nla_put(skb, TCA_SKBEDIT_TM, sizeof(t), &t))
+ if (nla_put_64bit(skb, TCA_SKBEDIT_TM, sizeof(t), &t, TCA_SKBEDIT_PAD))
goto nla_put_failure;
return skb->len;
diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c
index c45f926da..ac4adc812 100644
--- a/net/sched/act_vlan.c
+++ b/net/sched/act_vlan.c
@@ -185,7 +185,7 @@ static int tcf_vlan_dump(struct sk_buff *skb, struct tc_action *a,
t.install = jiffies_to_clock_t(jiffies - v->tcf_tm.install);
t.lastuse = jiffies_to_clock_t(jiffies - v->tcf_tm.lastuse);
t.expires = jiffies_to_clock_t(v->tcf_tm.expires);
- if (nla_put(skb, TCA_VLAN_TM, sizeof(t), &t))
+ if (nla_put_64bit(skb, TCA_VLAN_TM, sizeof(t), &t, TCA_VLAN_PAD))
goto nla_put_failure;
return skb->len;
diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c
index 425fe6a0e..7b342c779 100644
--- a/net/sched/cls_bpf.c
+++ b/net/sched/cls_bpf.c
@@ -96,9 +96,11 @@ static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp,
if (at_ingress) {
/* It is safe to push/pull even if skb_shared() */
__skb_push(skb, skb->mac_len);
+ bpf_compute_data_end(skb);
filter_res = BPF_PROG_RUN(prog->filter, skb);
__skb_pull(skb, skb->mac_len);
} else {
+ bpf_compute_data_end(skb);
filter_res = BPF_PROG_RUN(prog->filter, skb);
}
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 2181ffc76..b3b7978f4 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -171,7 +171,7 @@ static void fl_hw_destroy_filter(struct tcf_proto *tp, unsigned long cookie)
struct tc_cls_flower_offload offload = {0};
struct tc_to_netdev tc;
- if (!tc_should_offload(dev, 0))
+ if (!tc_should_offload(dev, tp, 0))
return;
offload.command = TC_CLSFLOWER_DESTROY;
@@ -194,7 +194,7 @@ static void fl_hw_replace_filter(struct tcf_proto *tp,
struct tc_cls_flower_offload offload = {0};
struct tc_to_netdev tc;
- if (!tc_should_offload(dev, flags))
+ if (!tc_should_offload(dev, tp, flags))
return;
offload.command = TC_CLSFLOWER_REPLACE;
@@ -210,6 +210,25 @@ static void fl_hw_replace_filter(struct tcf_proto *tp,
dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol, &tc);
}
+static void fl_hw_update_stats(struct tcf_proto *tp, struct cls_fl_filter *f)
+{
+ struct net_device *dev = tp->q->dev_queue->dev;
+ struct tc_cls_flower_offload offload = {0};
+ struct tc_to_netdev tc;
+
+ if (!tc_should_offload(dev, tp, 0))
+ return;
+
+ offload.command = TC_CLSFLOWER_STATS;
+ offload.cookie = (unsigned long)f;
+ offload.exts = &f->exts;
+
+ tc.type = TC_SETUP_CLSFLOWER;
+ tc.cls_flower = &offload;
+
+ dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol, &tc);
+}
+
static bool fl_destroy(struct tcf_proto *tp, bool force)
{
struct cls_fl_head *head = rtnl_dereference(tp->root);
@@ -662,6 +681,8 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
goto nla_put_failure;
}
+ fl_hw_update_stats(tp, f);
+
if (fl_dump_key_val(skb, key->eth.dst, TCA_FLOWER_KEY_ETH_DST,
mask->eth.dst, TCA_FLOWER_KEY_ETH_DST_MASK,
sizeof(key->eth.dst)) ||
diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
index 563cdad76..ffe593efe 100644
--- a/net/sched/cls_u32.c
+++ b/net/sched/cls_u32.c
@@ -134,6 +134,11 @@ next_knode:
j = 0;
#endif
+ if (tc_skip_sw(n->flags)) {
+ n = rcu_dereference_bh(n->next);
+ goto next_knode;
+ }
+
#ifdef CONFIG_CLS_U32_MARK
if ((skb->mark & n->mask) != n->val) {
n = rcu_dereference_bh(n->next);
@@ -435,7 +440,7 @@ static void u32_remove_hw_knode(struct tcf_proto *tp, u32 handle)
offload.type = TC_SETUP_CLSU32;
offload.cls_u32 = &u32_offload;
- if (tc_should_offload(dev, 0)) {
+ if (tc_should_offload(dev, tp, 0)) {
offload.cls_u32->command = TC_CLSU32_DELETE_KNODE;
offload.cls_u32->knode.handle = handle;
dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle,
@@ -443,26 +448,32 @@ static void u32_remove_hw_knode(struct tcf_proto *tp, u32 handle)
}
}
-static void u32_replace_hw_hnode(struct tcf_proto *tp,
+static int u32_replace_hw_hnode(struct tcf_proto *tp,
struct tc_u_hnode *h,
u32 flags)
{
struct net_device *dev = tp->q->dev_queue->dev;
struct tc_cls_u32_offload u32_offload = {0};
struct tc_to_netdev offload;
+ int err;
+
+ if (!tc_should_offload(dev, tp, flags))
+ return tc_skip_sw(flags) ? -EINVAL : 0;
offload.type = TC_SETUP_CLSU32;
offload.cls_u32 = &u32_offload;
- if (tc_should_offload(dev, flags)) {
- offload.cls_u32->command = TC_CLSU32_NEW_HNODE;
- offload.cls_u32->hnode.divisor = h->divisor;
- offload.cls_u32->hnode.handle = h->handle;
- offload.cls_u32->hnode.prio = h->prio;
+ offload.cls_u32->command = TC_CLSU32_NEW_HNODE;
+ offload.cls_u32->hnode.divisor = h->divisor;
+ offload.cls_u32->hnode.handle = h->handle;
+ offload.cls_u32->hnode.prio = h->prio;
- dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle,
- tp->protocol, &offload);
- }
+ err = dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle,
+ tp->protocol, &offload);
+ if (tc_skip_sw(flags))
+ return err;
+
+ return 0;
}
static void u32_clear_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h)
@@ -474,7 +485,7 @@ static void u32_clear_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h)
offload.type = TC_SETUP_CLSU32;
offload.cls_u32 = &u32_offload;
- if (tc_should_offload(dev, 0)) {
+ if (tc_should_offload(dev, tp, 0)) {
offload.cls_u32->command = TC_CLSU32_DELETE_HNODE;
offload.cls_u32->hnode.divisor = h->divisor;
offload.cls_u32->hnode.handle = h->handle;
@@ -485,36 +496,42 @@ static void u32_clear_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h)
}
}
-static void u32_replace_hw_knode(struct tcf_proto *tp,
+static int u32_replace_hw_knode(struct tcf_proto *tp,
struct tc_u_knode *n,
u32 flags)
{
struct net_device *dev = tp->q->dev_queue->dev;
struct tc_cls_u32_offload u32_offload = {0};
struct tc_to_netdev offload;
+ int err;
offload.type = TC_SETUP_CLSU32;
offload.cls_u32 = &u32_offload;
- if (tc_should_offload(dev, flags)) {
- offload.cls_u32->command = TC_CLSU32_REPLACE_KNODE;
- offload.cls_u32->knode.handle = n->handle;
- offload.cls_u32->knode.fshift = n->fshift;
+ if (!tc_should_offload(dev, tp, flags))
+ return tc_skip_sw(flags) ? -EINVAL : 0;
+
+ offload.cls_u32->command = TC_CLSU32_REPLACE_KNODE;
+ offload.cls_u32->knode.handle = n->handle;
+ offload.cls_u32->knode.fshift = n->fshift;
#ifdef CONFIG_CLS_U32_MARK
- offload.cls_u32->knode.val = n->val;
- offload.cls_u32->knode.mask = n->mask;
+ offload.cls_u32->knode.val = n->val;
+ offload.cls_u32->knode.mask = n->mask;
#else
- offload.cls_u32->knode.val = 0;
- offload.cls_u32->knode.mask = 0;
+ offload.cls_u32->knode.val = 0;
+ offload.cls_u32->knode.mask = 0;
#endif
- offload.cls_u32->knode.sel = &n->sel;
- offload.cls_u32->knode.exts = &n->exts;
- if (n->ht_down)
- offload.cls_u32->knode.link_handle = n->ht_down->handle;
+ offload.cls_u32->knode.sel = &n->sel;
+ offload.cls_u32->knode.exts = &n->exts;
+ if (n->ht_down)
+ offload.cls_u32->knode.link_handle = n->ht_down->handle;
- dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle,
- tp->protocol, &offload);
- }
+ err = dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle,
+ tp->protocol, &offload);
+ if (tc_skip_sw(flags))
+ return err;
+
+ return 0;
}
static void u32_clear_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht)
@@ -845,8 +862,11 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
if (err < 0)
return err;
- if (tb[TCA_U32_FLAGS])
+ if (tb[TCA_U32_FLAGS]) {
flags = nla_get_u32(tb[TCA_U32_FLAGS]);
+ if (!tc_flags_valid(flags))
+ return -EINVAL;
+ }
n = (struct tc_u_knode *)*arg;
if (n) {
@@ -871,10 +891,15 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
return err;
}
+ err = u32_replace_hw_knode(tp, new, flags);
+ if (err) {
+ u32_destroy_key(tp, new, false);
+ return err;
+ }
+
u32_replace_knode(tp, tp_c, new);
tcf_unbind_filter(tp, &n->res);
call_rcu(&n->rcu, u32_delete_key_rcu);
- u32_replace_hw_knode(tp, new, flags);
return 0;
}
@@ -898,11 +923,17 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
ht->divisor = divisor;
ht->handle = handle;
ht->prio = tp->prio;
+
+ err = u32_replace_hw_hnode(tp, ht, flags);
+ if (err) {
+ kfree(ht);
+ return err;
+ }
+
RCU_INIT_POINTER(ht->next, tp_c->hlist);
rcu_assign_pointer(tp_c->hlist, ht);
*arg = (unsigned long)ht;
- u32_replace_hw_hnode(tp, ht, flags);
return 0;
}
@@ -978,6 +1009,10 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
struct tc_u_knode __rcu **ins;
struct tc_u_knode *pins;
+ err = u32_replace_hw_knode(tp, n, flags);
+ if (err)
+ goto errhw;
+
ins = &ht->ht[TC_U32_HASH(handle)];
for (pins = rtnl_dereference(*ins); pins;
ins = &pins->next, pins = rtnl_dereference(*ins))
@@ -986,11 +1021,11 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
RCU_INIT_POINTER(n->next, pins);
rcu_assign_pointer(*ins, n);
- u32_replace_hw_knode(tp, n, flags);
*arg = (unsigned long)n;
return 0;
}
+errhw:
#ifdef CONFIG_CLS_U32_MARK
free_percpu(n->pcpu_success);
errout:
@@ -1140,9 +1175,10 @@ static int u32_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
gpf->kcnts[i] += pf->kcnts[i];
}
- if (nla_put(skb, TCA_U32_PCNT,
- sizeof(struct tc_u32_pcnt) + n->sel.nkeys*sizeof(u64),
- gpf)) {
+ if (nla_put_64bit(skb, TCA_U32_PCNT,
+ sizeof(struct tc_u32_pcnt) +
+ n->sel.nkeys * sizeof(u64),
+ gpf, TCA_U32_PAD)) {
kfree(gpf);
goto nla_put_failure;
}
diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c
index f2aabc008..a309a07cc 100644
--- a/net/sched/em_meta.c
+++ b/net/sched/em_meta.c
@@ -796,7 +796,7 @@ struct meta_type_ops {
int (*dump)(struct sk_buff *, struct meta_value *, int);
};
-static struct meta_type_ops __meta_type_ops[TCF_META_TYPE_MAX + 1] = {
+static const struct meta_type_ops __meta_type_ops[TCF_META_TYPE_MAX + 1] = {
[TCF_META_TYPE_VAR] = {
.destroy = meta_var_destroy,
.compare = meta_var_compare,
@@ -812,7 +812,7 @@ static struct meta_type_ops __meta_type_ops[TCF_META_TYPE_MAX + 1] = {
}
};
-static inline struct meta_type_ops *meta_type_ops(struct meta_value *v)
+static inline const struct meta_type_ops *meta_type_ops(struct meta_value *v)
{
return &__meta_type_ops[meta_type(v)];
}
@@ -870,7 +870,7 @@ static int em_meta_match(struct sk_buff *skb, struct tcf_ematch *m,
static void meta_delete(struct meta_match *meta)
{
if (meta) {
- struct meta_type_ops *ops = meta_type_ops(&meta->lvalue);
+ const struct meta_type_ops *ops = meta_type_ops(&meta->lvalue);
if (ops && ops->destroy) {
ops->destroy(&meta->lvalue);
@@ -964,7 +964,7 @@ static int em_meta_dump(struct sk_buff *skb, struct tcf_ematch *em)
{
struct meta_match *meta = (struct meta_match *) em->data;
struct tcf_meta_hdr hdr;
- struct meta_type_ops *ops;
+ const struct meta_type_ops *ops;
memset(&hdr, 0, sizeof(hdr));
memcpy(&hdr.left, &meta->lvalue.hdr, sizeof(hdr.left));
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index 3b180ff72..ddf047df5 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -607,6 +607,10 @@ void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires, bool thr
if (throttle)
qdisc_throttled(wd->qdisc);
+ if (wd->last_expires == expires)
+ return;
+
+ wd->last_expires = expires;
hrtimer_start(&wd->timer,
ns_to_ktime(expires),
HRTIMER_MODE_ABS_PINNED);
@@ -1365,7 +1369,8 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
goto nla_put_failure;
if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
- qdisc_root_sleeping_lock(q), &d) < 0)
+ qdisc_root_sleeping_lock(q), &d,
+ TCA_PAD) < 0)
goto nla_put_failure;
if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
@@ -1679,7 +1684,8 @@ static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
goto nla_put_failure;
if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
- qdisc_root_sleeping_lock(q), &d) < 0)
+ qdisc_root_sleeping_lock(q), &d,
+ TCA_PAD) < 0)
goto nla_put_failure;
if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
diff --git a/net/sched/sch_codel.c b/net/sched/sch_codel.c
index 9b7e2980e..dddf3bb65 100644
--- a/net/sched/sch_codel.c
+++ b/net/sched/sch_codel.c
@@ -49,6 +49,8 @@
#include <linux/prefetch.h>
#include <net/pkt_sched.h>
#include <net/codel.h>
+#include <net/codel_impl.h>
+#include <net/codel_qdisc.h>
#define DEFAULT_CODEL_LIMIT 1000
@@ -64,20 +66,33 @@ struct codel_sched_data {
* to dequeue a packet from queue. Note: backlog is handled in
* codel, we dont need to reduce it here.
*/
-static struct sk_buff *dequeue(struct codel_vars *vars, struct Qdisc *sch)
+static struct sk_buff *dequeue_func(struct codel_vars *vars, void *ctx)
{
+ struct Qdisc *sch = ctx;
struct sk_buff *skb = __skb_dequeue(&sch->q);
+ if (skb)
+ sch->qstats.backlog -= qdisc_pkt_len(skb);
+
prefetch(&skb->end); /* we'll need skb_shinfo() */
return skb;
}
+static void drop_func(struct sk_buff *skb, void *ctx)
+{
+ struct Qdisc *sch = ctx;
+
+ qdisc_drop(skb, sch);
+}
+
static struct sk_buff *codel_qdisc_dequeue(struct Qdisc *sch)
{
struct codel_sched_data *q = qdisc_priv(sch);
struct sk_buff *skb;
- skb = codel_dequeue(sch, &q->params, &q->vars, &q->stats, dequeue);
+ skb = codel_dequeue(sch, &sch->qstats.backlog, &q->params, &q->vars,
+ &q->stats, qdisc_pkt_len, codel_get_enqueue_time,
+ drop_func, dequeue_func);
/* We cant call qdisc_tree_reduce_backlog() if our qlen is 0,
* or HTB crashes. Defer it for next round.
@@ -173,9 +188,10 @@ static int codel_init(struct Qdisc *sch, struct nlattr *opt)
sch->limit = DEFAULT_CODEL_LIMIT;
- codel_params_init(&q->params, sch);
+ codel_params_init(&q->params);
codel_vars_init(&q->vars);
codel_stats_init(&q->stats);
+ q->params.mtu = psched_mtu(qdisc_dev(sch));
if (opt) {
int err = codel_change(sch, opt);
diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c
index a63e879e8..bf8af2c43 100644
--- a/net/sched/sch_drr.c
+++ b/net/sched/sch_drr.c
@@ -375,6 +375,7 @@ static int drr_enqueue(struct sk_buff *skb, struct Qdisc *sch)
cl->deficit = cl->quantum;
}
+ qdisc_qstats_backlog_inc(sch, skb);
sch->q.qlen++;
return err;
}
@@ -407,6 +408,7 @@ static struct sk_buff *drr_dequeue(struct Qdisc *sch)
bstats_update(&cl->bstats, skb);
qdisc_bstats_update(sch, skb);
+ qdisc_qstats_backlog_dec(sch, skb);
sch->q.qlen--;
return skb;
}
@@ -428,6 +430,7 @@ static unsigned int drr_drop(struct Qdisc *sch)
if (cl->qdisc->ops->drop) {
len = cl->qdisc->ops->drop(cl->qdisc);
if (len > 0) {
+ sch->qstats.backlog -= len;
sch->q.qlen--;
if (cl->qdisc->q.qlen == 0)
list_del(&cl->alist);
@@ -463,6 +466,7 @@ static void drr_reset_qdisc(struct Qdisc *sch)
qdisc_reset(cl->qdisc);
}
}
+ sch->qstats.backlog = 0;
sch->q.qlen = 0;
}
diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c
index d3fc8f9dd..da250b2e0 100644
--- a/net/sched/sch_fq_codel.c
+++ b/net/sched/sch_fq_codel.c
@@ -24,6 +24,8 @@
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <net/codel.h>
+#include <net/codel_impl.h>
+#include <net/codel_qdisc.h>
/* Fair Queue CoDel.
*
@@ -57,8 +59,12 @@ struct fq_codel_sched_data {
u32 flows_cnt; /* number of flows */
u32 perturbation; /* hash perturbation */
u32 quantum; /* psched_mtu(qdisc_dev(sch)); */
+ u32 drop_batch_size;
+ u32 memory_limit;
struct codel_params cparams;
struct codel_stats cstats;
+ u32 memory_usage;
+ u32 drop_overmemory;
u32 drop_overlimit;
u32 new_flow_count;
@@ -133,17 +139,21 @@ static inline void flow_queue_add(struct fq_codel_flow *flow,
skb->next = NULL;
}
-static unsigned int fq_codel_drop(struct Qdisc *sch)
+static unsigned int fq_codel_drop(struct Qdisc *sch, unsigned int max_packets)
{
struct fq_codel_sched_data *q = qdisc_priv(sch);
struct sk_buff *skb;
unsigned int maxbacklog = 0, idx = 0, i, len;
struct fq_codel_flow *flow;
+ unsigned int threshold;
+ unsigned int mem = 0;
- /* Queue is full! Find the fat flow and drop packet from it.
+ /* Queue is full! Find the fat flow and drop packet(s) from it.
* This might sound expensive, but with 1024 flows, we scan
* 4KB of memory, and we dont need to handle a complex tree
* in fast path (packet queue/enqueue) with many cache misses.
+ * In stress mode, we'll try to drop 64 packets from the flow,
+ * amortizing this linear lookup to one cache line per drop.
*/
for (i = 0; i < q->flows_cnt; i++) {
if (q->backlogs[i] > maxbacklog) {
@@ -151,15 +161,26 @@ static unsigned int fq_codel_drop(struct Qdisc *sch)
idx = i;
}
}
+
+ /* Our goal is to drop half of this fat flow backlog */
+ threshold = maxbacklog >> 1;
+
flow = &q->flows[idx];
- skb = dequeue_head(flow);
- len = qdisc_pkt_len(skb);
+ len = 0;
+ i = 0;
+ do {
+ skb = dequeue_head(flow);
+ len += qdisc_pkt_len(skb);
+ mem += skb->truesize;
+ kfree_skb(skb);
+ } while (++i < max_packets && len < threshold);
+
+ flow->dropped += i;
q->backlogs[idx] -= len;
- sch->q.qlen--;
- qdisc_qstats_drop(sch);
- qdisc_qstats_backlog_dec(sch, skb);
- kfree_skb(skb);
- flow->dropped++;
+ q->memory_usage -= mem;
+ sch->qstats.drops += i;
+ sch->qstats.backlog -= len;
+ sch->q.qlen -= i;
return idx;
}
@@ -168,16 +189,18 @@ static unsigned int fq_codel_qdisc_drop(struct Qdisc *sch)
unsigned int prev_backlog;
prev_backlog = sch->qstats.backlog;
- fq_codel_drop(sch);
+ fq_codel_drop(sch, 1U);
return prev_backlog - sch->qstats.backlog;
}
static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
struct fq_codel_sched_data *q = qdisc_priv(sch);
- unsigned int idx, prev_backlog;
+ unsigned int idx, prev_backlog, prev_qlen;
struct fq_codel_flow *flow;
int uninitialized_var(ret);
+ unsigned int pkt_len;
+ bool memory_limited;
idx = fq_codel_classify(skb, sch, &ret);
if (idx == 0) {
@@ -200,19 +223,39 @@ static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch)
flow->deficit = q->quantum;
flow->dropped = 0;
}
- if (++sch->q.qlen <= sch->limit)
+ q->memory_usage += skb->truesize;
+ memory_limited = q->memory_usage > q->memory_limit;
+ if (++sch->q.qlen <= sch->limit && !memory_limited)
return NET_XMIT_SUCCESS;
prev_backlog = sch->qstats.backlog;
- q->drop_overlimit++;
- /* Return Congestion Notification only if we dropped a packet
- * from this flow.
+ prev_qlen = sch->q.qlen;
+
+ /* save this packet length as it might be dropped by fq_codel_drop() */
+ pkt_len = qdisc_pkt_len(skb);
+ /* fq_codel_drop() is quite expensive, as it performs a linear search
+ * in q->backlogs[] to find a fat flow.
+ * So instead of dropping a single packet, drop half of its backlog
+ * with a 64 packets limit to not add a too big cpu spike here.
*/
- if (fq_codel_drop(sch) == idx)
- return NET_XMIT_CN;
+ ret = fq_codel_drop(sch, q->drop_batch_size);
- /* As we dropped a packet, better let upper stack know this */
- qdisc_tree_reduce_backlog(sch, 1, prev_backlog - sch->qstats.backlog);
+ prev_qlen -= sch->q.qlen;
+ prev_backlog -= sch->qstats.backlog;
+ q->drop_overlimit += prev_qlen;
+ if (memory_limited)
+ q->drop_overmemory += prev_qlen;
+
+ /* As we dropped packet(s), better let upper stack know this.
+ * If we dropped a packet for this flow, return NET_XMIT_CN,
+ * but in this case, our parents wont increase their backlogs.
+ */
+ if (ret == idx) {
+ qdisc_tree_reduce_backlog(sch, prev_qlen - 1,
+ prev_backlog - pkt_len);
+ return NET_XMIT_CN;
+ }
+ qdisc_tree_reduce_backlog(sch, prev_qlen, prev_backlog);
return NET_XMIT_SUCCESS;
}
@@ -220,8 +263,9 @@ static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch)
* to dequeue a packet from queue. Note: backlog is handled in
* codel, we dont need to reduce it here.
*/
-static struct sk_buff *dequeue(struct codel_vars *vars, struct Qdisc *sch)
+static struct sk_buff *dequeue_func(struct codel_vars *vars, void *ctx)
{
+ struct Qdisc *sch = ctx;
struct fq_codel_sched_data *q = qdisc_priv(sch);
struct fq_codel_flow *flow;
struct sk_buff *skb = NULL;
@@ -230,11 +274,20 @@ static struct sk_buff *dequeue(struct codel_vars *vars, struct Qdisc *sch)
if (flow->head) {
skb = dequeue_head(flow);
q->backlogs[flow - q->flows] -= qdisc_pkt_len(skb);
+ q->memory_usage -= skb->truesize;
sch->q.qlen--;
+ sch->qstats.backlog -= qdisc_pkt_len(skb);
}
return skb;
}
+static void drop_func(struct sk_buff *skb, void *ctx)
+{
+ struct Qdisc *sch = ctx;
+
+ qdisc_drop(skb, sch);
+}
+
static struct sk_buff *fq_codel_dequeue(struct Qdisc *sch)
{
struct fq_codel_sched_data *q = qdisc_priv(sch);
@@ -263,8 +316,9 @@ begin:
prev_ecn_mark = q->cstats.ecn_mark;
prev_backlog = sch->qstats.backlog;
- skb = codel_dequeue(sch, &q->cparams, &flow->cvars, &q->cstats,
- dequeue);
+ skb = codel_dequeue(sch, &sch->qstats.backlog, &q->cparams,
+ &flow->cvars, &q->cstats, qdisc_pkt_len,
+ codel_get_enqueue_time, drop_func, dequeue_func);
flow->dropped += q->cstats.drop_count - prev_drop_count;
flow->dropped += q->cstats.ecn_mark - prev_ecn_mark;
@@ -313,6 +367,7 @@ static void fq_codel_reset(struct Qdisc *sch)
}
memset(q->backlogs, 0, q->flows_cnt * sizeof(u32));
sch->q.qlen = 0;
+ q->memory_usage = 0;
}
static const struct nla_policy fq_codel_policy[TCA_FQ_CODEL_MAX + 1] = {
@@ -323,6 +378,8 @@ static const struct nla_policy fq_codel_policy[TCA_FQ_CODEL_MAX + 1] = {
[TCA_FQ_CODEL_FLOWS] = { .type = NLA_U32 },
[TCA_FQ_CODEL_QUANTUM] = { .type = NLA_U32 },
[TCA_FQ_CODEL_CE_THRESHOLD] = { .type = NLA_U32 },
+ [TCA_FQ_CODEL_DROP_BATCH_SIZE] = { .type = NLA_U32 },
+ [TCA_FQ_CODEL_MEMORY_LIMIT] = { .type = NLA_U32 },
};
static int fq_codel_change(struct Qdisc *sch, struct nlattr *opt)
@@ -374,7 +431,14 @@ static int fq_codel_change(struct Qdisc *sch, struct nlattr *opt)
if (tb[TCA_FQ_CODEL_QUANTUM])
q->quantum = max(256U, nla_get_u32(tb[TCA_FQ_CODEL_QUANTUM]));
- while (sch->q.qlen > sch->limit) {
+ if (tb[TCA_FQ_CODEL_DROP_BATCH_SIZE])
+ q->drop_batch_size = min(1U, nla_get_u32(tb[TCA_FQ_CODEL_DROP_BATCH_SIZE]));
+
+ if (tb[TCA_FQ_CODEL_MEMORY_LIMIT])
+ q->memory_limit = min(1U << 31, nla_get_u32(tb[TCA_FQ_CODEL_MEMORY_LIMIT]));
+
+ while (sch->q.qlen > sch->limit ||
+ q->memory_usage > q->memory_limit) {
struct sk_buff *skb = fq_codel_dequeue(sch);
q->cstats.drop_len += qdisc_pkt_len(skb);
@@ -419,13 +483,16 @@ static int fq_codel_init(struct Qdisc *sch, struct nlattr *opt)
sch->limit = 10*1024;
q->flows_cnt = 1024;
+ q->memory_limit = 32 << 20; /* 32 MBytes */
+ q->drop_batch_size = 64;
q->quantum = psched_mtu(qdisc_dev(sch));
q->perturbation = prandom_u32();
INIT_LIST_HEAD(&q->new_flows);
INIT_LIST_HEAD(&q->old_flows);
- codel_params_init(&q->cparams, sch);
+ codel_params_init(&q->cparams);
codel_stats_init(&q->cstats);
q->cparams.ecn = true;
+ q->cparams.mtu = psched_mtu(qdisc_dev(sch));
if (opt) {
int err = fq_codel_change(sch, opt);
@@ -476,6 +543,10 @@ static int fq_codel_dump(struct Qdisc *sch, struct sk_buff *skb)
q->cparams.ecn) ||
nla_put_u32(skb, TCA_FQ_CODEL_QUANTUM,
q->quantum) ||
+ nla_put_u32(skb, TCA_FQ_CODEL_DROP_BATCH_SIZE,
+ q->drop_batch_size) ||
+ nla_put_u32(skb, TCA_FQ_CODEL_MEMORY_LIMIT,
+ q->memory_limit) ||
nla_put_u32(skb, TCA_FQ_CODEL_FLOWS,
q->flows_cnt))
goto nla_put_failure;
@@ -504,6 +575,8 @@ static int fq_codel_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
st.qdisc_stats.ecn_mark = q->cstats.ecn_mark;
st.qdisc_stats.new_flow_count = q->new_flow_count;
st.qdisc_stats.ce_mark = q->cstats.ce_mark;
+ st.qdisc_stats.memory_usage = q->memory_usage;
+ st.qdisc_stats.drop_overmemory = q->drop_overmemory;
list_for_each(pos, &q->new_flows)
st.qdisc_stats.new_flows_len++;
@@ -588,7 +661,7 @@ static int fq_codel_dump_class_stats(struct Qdisc *sch, unsigned long cl,
qs.backlog = q->backlogs[idx];
qs.drops = flow->dropped;
}
- if (gnet_stats_copy_queue(d, NULL, &qs, 0) < 0)
+ if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0)
return -1;
if (idx < q->flows_cnt)
return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 80742edea..f9e0e9c03 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -49,6 +49,7 @@ static inline int dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q)
{
q->gso_skb = skb;
q->qstats.requeues++;
+ qdisc_qstats_backlog_inc(q, skb);
q->q.qlen++; /* it's still part of the queue */
__netif_schedule(q);
@@ -92,6 +93,7 @@ static struct sk_buff *dequeue_skb(struct Qdisc *q, bool *validate,
txq = skb_get_tx_queue(txq->dev, skb);
if (!netif_xmit_frozen_or_stopped(txq)) {
q->gso_skb = NULL;
+ qdisc_qstats_backlog_dec(q, skb);
q->q.qlen--;
} else
skb = NULL;
@@ -108,35 +110,6 @@ static struct sk_buff *dequeue_skb(struct Qdisc *q, bool *validate,
return skb;
}
-static inline int handle_dev_cpu_collision(struct sk_buff *skb,
- struct netdev_queue *dev_queue,
- struct Qdisc *q)
-{
- int ret;
-
- if (unlikely(dev_queue->xmit_lock_owner == smp_processor_id())) {
- /*
- * Same CPU holding the lock. It may be a transient
- * configuration error, when hard_start_xmit() recurses. We
- * detect it by checking xmit owner and drop the packet when
- * deadloop is detected. Return OK to try the next skb.
- */
- kfree_skb_list(skb);
- net_warn_ratelimited("Dead loop on netdevice %s, fix it urgently!\n",
- dev_queue->dev->name);
- ret = qdisc_qlen(q);
- } else {
- /*
- * Another cpu is holding lock, requeue & delay xmits for
- * some time.
- */
- __this_cpu_inc(softnet_data.cpu_collision);
- ret = dev_requeue_skb(skb, q);
- }
-
- return ret;
-}
-
/*
* Transmit possibly several skbs, and handle the return status as
* required. Holding the __QDISC___STATE_RUNNING bit guarantees that
@@ -174,9 +147,6 @@ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
if (dev_xmit_complete(ret)) {
/* Driver sent out skb successfully or skb was consumed */
ret = qdisc_qlen(q);
- } else if (ret == NETDEV_TX_LOCKED) {
- /* Driver try lock failed */
- ret = handle_dev_cpu_collision(skb, txq, q);
} else {
/* Driver returned NETDEV_TX_BUSY - requeue skb */
if (unlikely(ret != NETDEV_TX_BUSY))
@@ -259,13 +229,12 @@ unsigned long dev_trans_start(struct net_device *dev)
if (is_vlan_dev(dev))
dev = vlan_dev_real_dev(dev);
- res = dev->trans_start;
- for (i = 0; i < dev->num_tx_queues; i++) {
+ res = netdev_get_tx_queue(dev, 0)->trans_start;
+ for (i = 1; i < dev->num_tx_queues; i++) {
val = netdev_get_tx_queue(dev, i)->trans_start;
if (val && time_after(val, res))
res = val;
}
- dev->trans_start = res;
return res;
}
@@ -288,10 +257,7 @@ static void dev_watchdog(unsigned long arg)
struct netdev_queue *txq;
txq = netdev_get_tx_queue(dev, i);
- /*
- * old device drivers set dev->trans_start
- */
- trans_start = txq->trans_start ? : dev->trans_start;
+ trans_start = txq->trans_start;
if (netif_xmit_stopped(txq) &&
time_after(jiffies, (trans_start +
dev->watchdog_timeo))) {
@@ -807,7 +773,7 @@ void dev_activate(struct net_device *dev)
transition_one_qdisc(dev, dev_ingress_queue(dev), NULL);
if (need_watchdog) {
- dev->trans_start = jiffies;
+ netif_trans_update(dev);
dev_watchdog_up(dev);
}
}
diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c
index d783d7cc3..1ac9f9f03 100644
--- a/net/sched/sch_hfsc.c
+++ b/net/sched/sch_hfsc.c
@@ -1529,6 +1529,7 @@ hfsc_reset_qdisc(struct Qdisc *sch)
q->eligible = RB_ROOT;
INIT_LIST_HEAD(&q->droplist);
qdisc_watchdog_cancel(&q->watchdog);
+ sch->qstats.backlog = 0;
sch->q.qlen = 0;
}
@@ -1559,14 +1560,6 @@ hfsc_dump_qdisc(struct Qdisc *sch, struct sk_buff *skb)
struct hfsc_sched *q = qdisc_priv(sch);
unsigned char *b = skb_tail_pointer(skb);
struct tc_hfsc_qopt qopt;
- struct hfsc_class *cl;
- unsigned int i;
-
- sch->qstats.backlog = 0;
- for (i = 0; i < q->clhash.hashsize; i++) {
- hlist_for_each_entry(cl, &q->clhash.hash[i], cl_common.hnode)
- sch->qstats.backlog += cl->qdisc->qstats.backlog;
- }
qopt.defcls = q->defcls;
if (nla_put(skb, TCA_OPTIONS, sizeof(qopt), &qopt))
@@ -1604,6 +1597,7 @@ hfsc_enqueue(struct sk_buff *skb, struct Qdisc *sch)
if (cl->qdisc->q.qlen == 1)
set_active(cl, qdisc_pkt_len(skb));
+ qdisc_qstats_backlog_inc(sch, skb);
sch->q.qlen++;
return NET_XMIT_SUCCESS;
@@ -1672,6 +1666,7 @@ hfsc_dequeue(struct Qdisc *sch)
qdisc_unthrottled(sch);
qdisc_bstats_update(sch, skb);
+ qdisc_qstats_backlog_dec(sch, skb);
sch->q.qlen--;
return skb;
@@ -1695,6 +1690,7 @@ hfsc_drop(struct Qdisc *sch)
}
cl->qstats.drops++;
qdisc_qstats_drop(sch);
+ sch->qstats.backlog -= len;
sch->q.qlen--;
return len;
}
diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index 87b02ed3d..052f84d6c 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -928,17 +928,10 @@ ok:
}
}
qdisc_qstats_overlimit(sch);
- if (likely(next_event > q->now)) {
- if (!test_bit(__QDISC_STATE_DEACTIVATED,
- &qdisc_root_sleeping(q->watchdog.qdisc)->state)) {
- ktime_t time = ns_to_ktime(next_event);
- qdisc_throttled(q->watchdog.qdisc);
- hrtimer_start(&q->watchdog.timer, time,
- HRTIMER_MODE_ABS_PINNED);
- }
- } else {
+ if (likely(next_event > q->now))
+ qdisc_watchdog_schedule_ns(&q->watchdog, next_event, true);
+ else
schedule_work(&q->work);
- }
fin:
return skb;
}
@@ -1014,7 +1007,9 @@ static void htb_work_func(struct work_struct *work)
struct htb_sched *q = container_of(work, struct htb_sched, work);
struct Qdisc *sch = q->watchdog.qdisc;
+ rcu_read_lock();
__netif_schedule(qdisc_root(sch));
+ rcu_read_unlock();
}
static int htb_init(struct Qdisc *sch, struct nlattr *opt)
@@ -1122,10 +1117,12 @@ static int htb_dump_class(struct Qdisc *sch, unsigned long arg,
if (nla_put(skb, TCA_HTB_PARMS, sizeof(opt), &opt))
goto nla_put_failure;
if ((cl->rate.rate_bytes_ps >= (1ULL << 32)) &&
- nla_put_u64(skb, TCA_HTB_RATE64, cl->rate.rate_bytes_ps))
+ nla_put_u64_64bit(skb, TCA_HTB_RATE64, cl->rate.rate_bytes_ps,
+ TCA_HTB_PAD))
goto nla_put_failure;
if ((cl->ceil.rate_bytes_ps >= (1ULL << 32)) &&
- nla_put_u64(skb, TCA_HTB_CEIL64, cl->ceil.rate_bytes_ps))
+ nla_put_u64_64bit(skb, TCA_HTB_CEIL64, cl->ceil.rate_bytes_ps,
+ TCA_HTB_PAD))
goto nla_put_failure;
return nla_nest_end(skb, nest);
@@ -1143,8 +1140,10 @@ htb_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d)
if (!cl->level && cl->un.leaf.q)
qlen = cl->un.leaf.q->q.qlen;
- cl->xstats.tokens = PSCHED_NS2TICKS(cl->tokens);
- cl->xstats.ctokens = PSCHED_NS2TICKS(cl->ctokens);
+ cl->xstats.tokens = clamp_t(s64, PSCHED_NS2TICKS(cl->tokens),
+ INT_MIN, INT_MAX);
+ cl->xstats.ctokens = clamp_t(s64, PSCHED_NS2TICKS(cl->ctokens),
+ INT_MIN, INT_MAX);
if (gnet_stats_copy_basic(d, NULL, &cl->bstats) < 0 ||
gnet_stats_copy_rate_est(d, NULL, &cl->rate_est) < 0 ||
diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c
index 10adbc617..8fe6999b6 100644
--- a/net/sched/sch_ingress.c
+++ b/net/sched/sch_ingress.c
@@ -27,6 +27,11 @@ static unsigned long ingress_get(struct Qdisc *sch, u32 classid)
return TC_H_MIN(classid) + 1;
}
+static bool ingress_cl_offload(u32 classid)
+{
+ return true;
+}
+
static unsigned long ingress_bind_filter(struct Qdisc *sch,
unsigned long parent, u32 classid)
{
@@ -86,6 +91,7 @@ static const struct Qdisc_class_ops ingress_class_ops = {
.put = ingress_put,
.walk = ingress_walk,
.tcf_chain = ingress_find_tcf,
+ .tcf_cl_offload = ingress_cl_offload,
.bind_tcf = ingress_bind_filter,
.unbind_tcf = ingress_put,
};
@@ -110,6 +116,11 @@ static unsigned long clsact_get(struct Qdisc *sch, u32 classid)
}
}
+static bool clsact_cl_offload(u32 classid)
+{
+ return TC_H_MIN(classid) == TC_H_MIN(TC_H_MIN_INGRESS);
+}
+
static unsigned long clsact_bind_filter(struct Qdisc *sch,
unsigned long parent, u32 classid)
{
@@ -158,6 +169,7 @@ static const struct Qdisc_class_ops clsact_class_ops = {
.put = ingress_put,
.walk = ingress_walk,
.tcf_chain = clsact_find_tcf,
+ .tcf_cl_offload = clsact_cl_offload,
.bind_tcf = clsact_bind_filter,
.unbind_tcf = ingress_put,
};
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index b7c29d5b6..178f1630a 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -1051,7 +1051,8 @@ static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
goto nla_put_failure;
if (q->rate >= (1ULL << 32)) {
- if (nla_put_u64(skb, TCA_NETEM_RATE64, q->rate))
+ if (nla_put_u64_64bit(skb, TCA_NETEM_RATE64, q->rate,
+ TCA_NETEM_PAD))
goto nla_put_failure;
rate.rate = ~0U;
} else {
diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c
index fee1b1550..a356450b7 100644
--- a/net/sched/sch_prio.c
+++ b/net/sched/sch_prio.c
@@ -85,6 +85,7 @@ prio_enqueue(struct sk_buff *skb, struct Qdisc *sch)
ret = qdisc_enqueue(skb, qdisc);
if (ret == NET_XMIT_SUCCESS) {
+ qdisc_qstats_backlog_inc(sch, skb);
sch->q.qlen++;
return NET_XMIT_SUCCESS;
}
@@ -117,6 +118,7 @@ static struct sk_buff *prio_dequeue(struct Qdisc *sch)
struct sk_buff *skb = qdisc_dequeue_peeked(qdisc);
if (skb) {
qdisc_bstats_update(sch, skb);
+ qdisc_qstats_backlog_dec(sch, skb);
sch->q.qlen--;
return skb;
}
@@ -135,6 +137,7 @@ static unsigned int prio_drop(struct Qdisc *sch)
for (prio = q->bands-1; prio >= 0; prio--) {
qdisc = q->queues[prio];
if (qdisc->ops->drop && (len = qdisc->ops->drop(qdisc)) != 0) {
+ sch->qstats.backlog -= len;
sch->q.qlen--;
return len;
}
@@ -151,6 +154,7 @@ prio_reset(struct Qdisc *sch)
for (prio = 0; prio < q->bands; prio++)
qdisc_reset(q->queues[prio]);
+ sch->qstats.backlog = 0;
sch->q.qlen = 0;
}
@@ -168,8 +172,9 @@ prio_destroy(struct Qdisc *sch)
static int prio_tune(struct Qdisc *sch, struct nlattr *opt)
{
struct prio_sched_data *q = qdisc_priv(sch);
+ struct Qdisc *queues[TCQ_PRIO_BANDS];
+ int oldbands = q->bands, i;
struct tc_prio_qopt *qopt;
- int i;
if (nla_len(opt) < sizeof(*qopt))
return -EINVAL;
@@ -183,62 +188,42 @@ static int prio_tune(struct Qdisc *sch, struct nlattr *opt)
return -EINVAL;
}
+ /* Before commit, make sure we can allocate all new qdiscs */
+ for (i = oldbands; i < qopt->bands; i++) {
+ queues[i] = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
+ TC_H_MAKE(sch->handle, i + 1));
+ if (!queues[i]) {
+ while (i > oldbands)
+ qdisc_destroy(queues[--i]);
+ return -ENOMEM;
+ }
+ }
+
sch_tree_lock(sch);
q->bands = qopt->bands;
memcpy(q->prio2band, qopt->priomap, TC_PRIO_MAX+1);
- for (i = q->bands; i < TCQ_PRIO_BANDS; i++) {
+ for (i = q->bands; i < oldbands; i++) {
struct Qdisc *child = q->queues[i];
- q->queues[i] = &noop_qdisc;
- if (child != &noop_qdisc) {
- qdisc_tree_reduce_backlog(child, child->q.qlen, child->qstats.backlog);
- qdisc_destroy(child);
- }
- }
- sch_tree_unlock(sch);
- for (i = 0; i < q->bands; i++) {
- if (q->queues[i] == &noop_qdisc) {
- struct Qdisc *child, *old;
-
- child = qdisc_create_dflt(sch->dev_queue,
- &pfifo_qdisc_ops,
- TC_H_MAKE(sch->handle, i + 1));
- if (child) {
- sch_tree_lock(sch);
- old = q->queues[i];
- q->queues[i] = child;
-
- if (old != &noop_qdisc) {
- qdisc_tree_reduce_backlog(old,
- old->q.qlen,
- old->qstats.backlog);
- qdisc_destroy(old);
- }
- sch_tree_unlock(sch);
- }
- }
+ qdisc_tree_reduce_backlog(child, child->q.qlen,
+ child->qstats.backlog);
+ qdisc_destroy(child);
}
+
+ for (i = oldbands; i < q->bands; i++)
+ q->queues[i] = queues[i];
+
+ sch_tree_unlock(sch);
return 0;
}
static int prio_init(struct Qdisc *sch, struct nlattr *opt)
{
- struct prio_sched_data *q = qdisc_priv(sch);
- int i;
-
- for (i = 0; i < TCQ_PRIO_BANDS; i++)
- q->queues[i] = &noop_qdisc;
-
- if (opt == NULL) {
+ if (!opt)
return -EINVAL;
- } else {
- int err;
- if ((err = prio_tune(sch, opt)) != 0)
- return err;
- }
- return 0;
+ return prio_tune(sch, opt);
}
static int prio_dump(struct Qdisc *sch, struct sk_buff *skb)
diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c
index 8d2d8d953..f18857feb 100644
--- a/net/sched/sch_qfq.c
+++ b/net/sched/sch_qfq.c
@@ -1235,8 +1235,10 @@ static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
cl->agg->lmax, qdisc_pkt_len(skb), cl->common.classid);
err = qfq_change_agg(sch, cl, cl->agg->class_weight,
qdisc_pkt_len(skb));
- if (err)
- return err;
+ if (err) {
+ cl->qstats.drops++;
+ return qdisc_drop(skb, sch);
+ }
}
err = qdisc_enqueue(skb, cl->qdisc);
diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c
index 8c0508c0e..91578bdd3 100644
--- a/net/sched/sch_red.c
+++ b/net/sched/sch_red.c
@@ -97,6 +97,7 @@ static int red_enqueue(struct sk_buff *skb, struct Qdisc *sch)
ret = qdisc_enqueue(skb, child);
if (likely(ret == NET_XMIT_SUCCESS)) {
+ qdisc_qstats_backlog_inc(sch, skb);
sch->q.qlen++;
} else if (net_xmit_drop_count(ret)) {
q->stats.pdrop++;
@@ -118,6 +119,7 @@ static struct sk_buff *red_dequeue(struct Qdisc *sch)
skb = child->dequeue(child);
if (skb) {
qdisc_bstats_update(sch, skb);
+ qdisc_qstats_backlog_dec(sch, skb);
sch->q.qlen--;
} else {
if (!red_is_idling(&q->vars))
@@ -143,6 +145,7 @@ static unsigned int red_drop(struct Qdisc *sch)
if (child->ops->drop && (len = child->ops->drop(child)) > 0) {
q->stats.other++;
qdisc_qstats_drop(sch);
+ sch->qstats.backlog -= len;
sch->q.qlen--;
return len;
}
@@ -158,6 +161,7 @@ static void red_reset(struct Qdisc *sch)
struct red_sched_data *q = qdisc_priv(sch);
qdisc_reset(q->qdisc);
+ sch->qstats.backlog = 0;
sch->q.qlen = 0;
red_restart(&q->vars);
}
diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c
index c2fbde742..3161e4919 100644
--- a/net/sched/sch_tbf.c
+++ b/net/sched/sch_tbf.c
@@ -207,6 +207,7 @@ static int tbf_enqueue(struct sk_buff *skb, struct Qdisc *sch)
return ret;
}
+ qdisc_qstats_backlog_inc(sch, skb);
sch->q.qlen++;
return NET_XMIT_SUCCESS;
}
@@ -217,6 +218,7 @@ static unsigned int tbf_drop(struct Qdisc *sch)
unsigned int len = 0;
if (q->qdisc->ops->drop && (len = q->qdisc->ops->drop(q->qdisc)) != 0) {
+ sch->qstats.backlog -= len;
sch->q.qlen--;
qdisc_qstats_drop(sch);
}
@@ -263,6 +265,7 @@ static struct sk_buff *tbf_dequeue(struct Qdisc *sch)
q->t_c = now;
q->tokens = toks;
q->ptokens = ptoks;
+ qdisc_qstats_backlog_dec(sch, skb);
sch->q.qlen--;
qdisc_unthrottled(sch);
qdisc_bstats_update(sch, skb);
@@ -294,6 +297,7 @@ static void tbf_reset(struct Qdisc *sch)
struct tbf_sched_data *q = qdisc_priv(sch);
qdisc_reset(q->qdisc);
+ sch->qstats.backlog = 0;
sch->q.qlen = 0;
q->t_c = ktime_get_ns();
q->tokens = q->buffer;
@@ -472,11 +476,13 @@ static int tbf_dump(struct Qdisc *sch, struct sk_buff *skb)
if (nla_put(skb, TCA_TBF_PARMS, sizeof(opt), &opt))
goto nla_put_failure;
if (q->rate.rate_bytes_ps >= (1ULL << 32) &&
- nla_put_u64(skb, TCA_TBF_RATE64, q->rate.rate_bytes_ps))
+ nla_put_u64_64bit(skb, TCA_TBF_RATE64, q->rate.rate_bytes_ps,
+ TCA_TBF_PAD))
goto nla_put_failure;
if (tbf_peak_present(q) &&
q->peak.rate_bytes_ps >= (1ULL << 32) &&
- nla_put_u64(skb, TCA_TBF_PRATE64, q->peak.rate_bytes_ps))
+ nla_put_u64_64bit(skb, TCA_TBF_PRATE64, q->peak.rate_bytes_ps,
+ TCA_TBF_PAD))
goto nla_put_failure;
return nla_nest_end(skb, nest);
diff --git a/net/sctp/Kconfig b/net/sctp/Kconfig
index 71c1a598d..d9c04dc1b 100644
--- a/net/sctp/Kconfig
+++ b/net/sctp/Kconfig
@@ -99,5 +99,9 @@ config SCTP_COOKIE_HMAC_SHA1
select CRYPTO_HMAC if SCTP_COOKIE_HMAC_SHA1
select CRYPTO_SHA1 if SCTP_COOKIE_HMAC_SHA1
+config INET_SCTP_DIAG
+ depends on INET_DIAG
+ def_tristate INET_DIAG
+
endif # IP_SCTP
diff --git a/net/sctp/Makefile b/net/sctp/Makefile
index 3b4ffb021..0fca5824a 100644
--- a/net/sctp/Makefile
+++ b/net/sctp/Makefile
@@ -4,6 +4,7 @@
obj-$(CONFIG_IP_SCTP) += sctp.o
obj-$(CONFIG_NET_SCTPPROBE) += sctp_probe.o
+obj-$(CONFIG_INET_SCTP_DIAG) += sctp_diag.o
sctp-y := sm_statetable.o sm_statefuns.o sm_sideeffect.o \
protocol.o endpointola.o associola.o \
diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c
index 958ef5f33..1eb94bf18 100644
--- a/net/sctp/chunk.c
+++ b/net/sctp/chunk.c
@@ -239,7 +239,7 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
offset = 0;
if ((whole > 1) || (whole && over))
- SCTP_INC_STATS_USER(sock_net(asoc->base.sk), SCTP_MIB_FRAGUSRMSGS);
+ SCTP_INC_STATS(sock_net(asoc->base.sk), SCTP_MIB_FRAGUSRMSGS);
/* Create chunks for all the full sized DATA chunks. */
for (i = 0, len = first_len; i < whole; i++) {
diff --git a/net/sctp/input.c b/net/sctp/input.c
index 00b844536..f09332256 100644
--- a/net/sctp/input.c
+++ b/net/sctp/input.c
@@ -84,7 +84,7 @@ static inline int sctp_rcv_checksum(struct net *net, struct sk_buff *skb)
if (val != cmp) {
/* CRC failure, dump it. */
- SCTP_INC_STATS_BH(net, SCTP_MIB_CHECKSUMERRORS);
+ __SCTP_INC_STATS(net, SCTP_MIB_CHECKSUMERRORS);
return -1;
}
return 0;
@@ -112,7 +112,6 @@ int sctp_rcv(struct sk_buff *skb)
struct sctp_ep_common *rcvr;
struct sctp_transport *transport = NULL;
struct sctp_chunk *chunk;
- struct sctphdr *sh;
union sctp_addr src;
union sctp_addr dest;
int family;
@@ -122,13 +121,11 @@ int sctp_rcv(struct sk_buff *skb)
if (skb->pkt_type != PACKET_HOST)
goto discard_it;
- SCTP_INC_STATS_BH(net, SCTP_MIB_INSCTPPACKS);
+ __SCTP_INC_STATS(net, SCTP_MIB_INSCTPPACKS);
if (skb_linearize(skb))
goto discard_it;
- sh = sctp_hdr(skb);
-
/* Pull up the IP and SCTP headers. */
__skb_pull(skb, skb_transport_offset(skb));
if (skb->len < sizeof(struct sctphdr))
@@ -208,7 +205,7 @@ int sctp_rcv(struct sk_buff *skb)
*/
if (!asoc) {
if (sctp_rcv_ootb(skb)) {
- SCTP_INC_STATS_BH(net, SCTP_MIB_OUTOFBLUES);
+ __SCTP_INC_STATS(net, SCTP_MIB_OUTOFBLUES);
goto discard_release;
}
}
@@ -230,7 +227,7 @@ int sctp_rcv(struct sk_buff *skb)
chunk->rcvr = rcvr;
/* Remember the SCTP header. */
- chunk->sctp_hdr = sh;
+ chunk->sctp_hdr = sctp_hdr(skb);
/* Set the source and destination addresses of the incoming chunk. */
sctp_init_addrs(chunk, &src, &dest);
@@ -264,9 +261,9 @@ int sctp_rcv(struct sk_buff *skb)
skb = NULL; /* sctp_chunk_free already freed the skb */
goto discard_release;
}
- SCTP_INC_STATS_BH(net, SCTP_MIB_IN_PKT_BACKLOG);
+ __SCTP_INC_STATS(net, SCTP_MIB_IN_PKT_BACKLOG);
} else {
- SCTP_INC_STATS_BH(net, SCTP_MIB_IN_PKT_SOFTIRQ);
+ __SCTP_INC_STATS(net, SCTP_MIB_IN_PKT_SOFTIRQ);
sctp_inq_push(&chunk->rcvr->inqueue, chunk);
}
@@ -281,7 +278,7 @@ int sctp_rcv(struct sk_buff *skb)
return 0;
discard_it:
- SCTP_INC_STATS_BH(net, SCTP_MIB_IN_PKT_DISCARDS);
+ __SCTP_INC_STATS(net, SCTP_MIB_IN_PKT_DISCARDS);
kfree_skb(skb);
return 0;
@@ -331,6 +328,7 @@ int sctp_backlog_rcv(struct sock *sk, struct sk_buff *skb)
*/
sk = rcvr->sk;
+ local_bh_disable();
bh_lock_sock(sk);
if (sock_owned_by_user(sk)) {
@@ -342,6 +340,7 @@ int sctp_backlog_rcv(struct sock *sk, struct sk_buff *skb)
sctp_inq_push(inqueue, chunk);
bh_unlock_sock(sk);
+ local_bh_enable();
/* If the chunk was backloged again, don't drop refs */
if (backloged)
@@ -532,7 +531,7 @@ struct sock *sctp_err_lookup(struct net *net, int family, struct sk_buff *skb,
* servers this needs to be solved differently.
*/
if (sock_owned_by_user(sk))
- NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
+ __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
*app = asoc;
*tpp = transport;
@@ -589,7 +588,7 @@ void sctp_v4_err(struct sk_buff *skb, __u32 info)
skb->network_header = saveip;
skb->transport_header = savesctp;
if (!sk) {
- ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
+ __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
return;
}
/* Warning: The sock lock is held. Remember to call
diff --git a/net/sctp/inqueue.c b/net/sctp/inqueue.c
index 7e8a16c77..b335ffcef 100644
--- a/net/sctp/inqueue.c
+++ b/net/sctp/inqueue.c
@@ -163,6 +163,9 @@ struct sctp_chunk *sctp_inq_pop(struct sctp_inq *queue)
chunk->singleton = 1;
ch = (sctp_chunkhdr_t *) chunk->skb->data;
chunk->data_accepted = 0;
+
+ if (chunk->asoc)
+ sock_rps_save_rxhash(chunk->asoc->base.sk, chunk->skb);
}
chunk->chunk_hdr = ch;
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index ce46f1c7f..0657d18a8 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -162,7 +162,7 @@ static void sctp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
skb->network_header = saveip;
skb->transport_header = savesctp;
if (!sk) {
- ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_INERRORS);
+ __ICMP6_INC_STATS(net, idev, ICMP6_MIB_INERRORS);
goto out;
}
diff --git a/net/sctp/proc.c b/net/sctp/proc.c
index 5cfac8d5d..4cb5aedfe 100644
--- a/net/sctp/proc.c
+++ b/net/sctp/proc.c
@@ -280,82 +280,38 @@ void sctp_eps_proc_exit(struct net *net)
struct sctp_ht_iter {
struct seq_net_private p;
struct rhashtable_iter hti;
+ int start_fail;
};
-static struct sctp_transport *sctp_transport_get_next(struct seq_file *seq)
+static void *sctp_transport_seq_start(struct seq_file *seq, loff_t *pos)
{
struct sctp_ht_iter *iter = seq->private;
- struct sctp_transport *t;
-
- t = rhashtable_walk_next(&iter->hti);
- for (; t; t = rhashtable_walk_next(&iter->hti)) {
- if (IS_ERR(t)) {
- if (PTR_ERR(t) == -EAGAIN)
- continue;
- break;
- }
+ int err = sctp_transport_walk_start(&iter->hti);
- if (net_eq(sock_net(t->asoc->base.sk), seq_file_net(seq)) &&
- t->asoc->peer.primary_path == t)
- break;
+ if (err) {
+ iter->start_fail = 1;
+ return ERR_PTR(err);
}
- return t;
+ return sctp_transport_get_idx(seq_file_net(seq), &iter->hti, *pos);
}
-static struct sctp_transport *sctp_transport_get_idx(struct seq_file *seq,
- loff_t pos)
-{
- void *obj = SEQ_START_TOKEN;
-
- while (pos && (obj = sctp_transport_get_next(seq)) && !IS_ERR(obj))
- pos--;
-
- return obj;
-}
-
-static int sctp_transport_walk_start(struct seq_file *seq)
+static void sctp_transport_seq_stop(struct seq_file *seq, void *v)
{
struct sctp_ht_iter *iter = seq->private;
- int err;
-
- err = rhashtable_walk_init(&sctp_transport_hashtable, &iter->hti);
- if (err)
- return err;
-
- err = rhashtable_walk_start(&iter->hti);
- return err == -EAGAIN ? 0 : err;
+ if (iter->start_fail)
+ return;
+ sctp_transport_walk_stop(&iter->hti);
}
-static void sctp_transport_walk_stop(struct seq_file *seq)
+static void *sctp_transport_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
struct sctp_ht_iter *iter = seq->private;
- rhashtable_walk_stop(&iter->hti);
- rhashtable_walk_exit(&iter->hti);
-}
-
-static void *sctp_assocs_seq_start(struct seq_file *seq, loff_t *pos)
-{
- int err = sctp_transport_walk_start(seq);
-
- if (err)
- return ERR_PTR(err);
-
- return sctp_transport_get_idx(seq, *pos);
-}
-
-static void sctp_assocs_seq_stop(struct seq_file *seq, void *v)
-{
- sctp_transport_walk_stop(seq);
-}
-
-static void *sctp_assocs_seq_next(struct seq_file *seq, void *v, loff_t *pos)
-{
++*pos;
- return sctp_transport_get_next(seq);
+ return sctp_transport_get_next(seq_file_net(seq), &iter->hti);
}
/* Display sctp associations (/proc/net/sctp/assocs). */
@@ -416,9 +372,9 @@ static int sctp_assocs_seq_show(struct seq_file *seq, void *v)
}
static const struct seq_operations sctp_assoc_ops = {
- .start = sctp_assocs_seq_start,
- .next = sctp_assocs_seq_next,
- .stop = sctp_assocs_seq_stop,
+ .start = sctp_transport_seq_start,
+ .next = sctp_transport_seq_next,
+ .stop = sctp_transport_seq_stop,
.show = sctp_assocs_seq_show,
};
@@ -455,28 +411,6 @@ void sctp_assocs_proc_exit(struct net *net)
remove_proc_entry("assocs", net->sctp.proc_net_sctp);
}
-static void *sctp_remaddr_seq_start(struct seq_file *seq, loff_t *pos)
-{
- int err = sctp_transport_walk_start(seq);
-
- if (err)
- return ERR_PTR(err);
-
- return sctp_transport_get_idx(seq, *pos);
-}
-
-static void *sctp_remaddr_seq_next(struct seq_file *seq, void *v, loff_t *pos)
-{
- ++*pos;
-
- return sctp_transport_get_next(seq);
-}
-
-static void sctp_remaddr_seq_stop(struct seq_file *seq, void *v)
-{
- sctp_transport_walk_stop(seq);
-}
-
static int sctp_remaddr_seq_show(struct seq_file *seq, void *v)
{
struct sctp_association *assoc;
@@ -550,9 +484,9 @@ static int sctp_remaddr_seq_show(struct seq_file *seq, void *v)
}
static const struct seq_operations sctp_remaddr_ops = {
- .start = sctp_remaddr_seq_start,
- .next = sctp_remaddr_seq_next,
- .stop = sctp_remaddr_seq_stop,
+ .start = sctp_transport_seq_start,
+ .next = sctp_transport_seq_next,
+ .stop = sctp_transport_seq_stop,
.show = sctp_remaddr_seq_show,
};
diff --git a/net/sctp/sctp_diag.c b/net/sctp/sctp_diag.c
new file mode 100644
index 000000000..f69edcf21
--- /dev/null
+++ b/net/sctp/sctp_diag.c
@@ -0,0 +1,497 @@
+#include <linux/module.h>
+#include <linux/inet_diag.h>
+#include <linux/sock_diag.h>
+#include <net/sctp/sctp.h>
+
+static void sctp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
+ void *info);
+
+/* define some functions to make asoc/ep fill look clean */
+static void inet_diag_msg_sctpasoc_fill(struct inet_diag_msg *r,
+ struct sock *sk,
+ struct sctp_association *asoc)
+{
+ union sctp_addr laddr, paddr;
+ struct dst_entry *dst;
+
+ laddr = list_entry(asoc->base.bind_addr.address_list.next,
+ struct sctp_sockaddr_entry, list)->a;
+ paddr = asoc->peer.primary_path->ipaddr;
+ dst = asoc->peer.primary_path->dst;
+
+ r->idiag_family = sk->sk_family;
+ r->id.idiag_sport = htons(asoc->base.bind_addr.port);
+ r->id.idiag_dport = htons(asoc->peer.port);
+ r->id.idiag_if = dst ? dst->dev->ifindex : 0;
+ sock_diag_save_cookie(sk, r->id.idiag_cookie);
+
+#if IS_ENABLED(CONFIG_IPV6)
+ if (sk->sk_family == AF_INET6) {
+ *(struct in6_addr *)r->id.idiag_src = laddr.v6.sin6_addr;
+ *(struct in6_addr *)r->id.idiag_dst = paddr.v6.sin6_addr;
+ } else
+#endif
+ {
+ memset(&r->id.idiag_src, 0, sizeof(r->id.idiag_src));
+ memset(&r->id.idiag_dst, 0, sizeof(r->id.idiag_dst));
+
+ r->id.idiag_src[0] = laddr.v4.sin_addr.s_addr;
+ r->id.idiag_dst[0] = paddr.v4.sin_addr.s_addr;
+ }
+
+ r->idiag_state = asoc->state;
+ r->idiag_timer = SCTP_EVENT_TIMEOUT_T3_RTX;
+ r->idiag_retrans = asoc->rtx_data_chunks;
+ r->idiag_expires = jiffies_to_msecs(
+ asoc->timeouts[SCTP_EVENT_TIMEOUT_T3_RTX] - jiffies);
+}
+
+static int inet_diag_msg_sctpladdrs_fill(struct sk_buff *skb,
+ struct list_head *address_list)
+{
+ struct sctp_sockaddr_entry *laddr;
+ int addrlen = sizeof(struct sockaddr_storage);
+ int addrcnt = 0;
+ struct nlattr *attr;
+ void *info = NULL;
+
+ list_for_each_entry_rcu(laddr, address_list, list)
+ addrcnt++;
+
+ attr = nla_reserve(skb, INET_DIAG_LOCALS, addrlen * addrcnt);
+ if (!attr)
+ return -EMSGSIZE;
+
+ info = nla_data(attr);
+ list_for_each_entry_rcu(laddr, address_list, list) {
+ memcpy(info, &laddr->a, addrlen);
+ info += addrlen;
+ }
+
+ return 0;
+}
+
+static int inet_diag_msg_sctpaddrs_fill(struct sk_buff *skb,
+ struct sctp_association *asoc)
+{
+ int addrlen = sizeof(struct sockaddr_storage);
+ struct sctp_transport *from;
+ struct nlattr *attr;
+ void *info = NULL;
+
+ attr = nla_reserve(skb, INET_DIAG_PEERS,
+ addrlen * asoc->peer.transport_count);
+ if (!attr)
+ return -EMSGSIZE;
+
+ info = nla_data(attr);
+ list_for_each_entry(from, &asoc->peer.transport_addr_list,
+ transports) {
+ memcpy(info, &from->ipaddr, addrlen);
+ info += addrlen;
+ }
+
+ return 0;
+}
+
+/* sctp asoc/ep fill*/
+static int inet_sctp_diag_fill(struct sock *sk, struct sctp_association *asoc,
+ struct sk_buff *skb,
+ const struct inet_diag_req_v2 *req,
+ struct user_namespace *user_ns,
+ int portid, u32 seq, u16 nlmsg_flags,
+ const struct nlmsghdr *unlh)
+{
+ struct sctp_endpoint *ep = sctp_sk(sk)->ep;
+ struct list_head *addr_list;
+ struct inet_diag_msg *r;
+ struct nlmsghdr *nlh;
+ int ext = req->idiag_ext;
+ struct sctp_infox infox;
+ void *info = NULL;
+
+ nlh = nlmsg_put(skb, portid, seq, unlh->nlmsg_type, sizeof(*r),
+ nlmsg_flags);
+ if (!nlh)
+ return -EMSGSIZE;
+
+ r = nlmsg_data(nlh);
+ BUG_ON(!sk_fullsock(sk));
+
+ if (asoc) {
+ inet_diag_msg_sctpasoc_fill(r, sk, asoc);
+ } else {
+ inet_diag_msg_common_fill(r, sk);
+ r->idiag_state = sk->sk_state;
+ r->idiag_timer = 0;
+ r->idiag_retrans = 0;
+ }
+
+ if (inet_diag_msg_attrs_fill(sk, skb, r, ext, user_ns))
+ goto errout;
+
+ if (ext & (1 << (INET_DIAG_SKMEMINFO - 1))) {
+ u32 mem[SK_MEMINFO_VARS];
+ int amt;
+
+ if (asoc && asoc->ep->sndbuf_policy)
+ amt = asoc->sndbuf_used;
+ else
+ amt = sk_wmem_alloc_get(sk);
+ mem[SK_MEMINFO_WMEM_ALLOC] = amt;
+ if (asoc && asoc->ep->rcvbuf_policy)
+ amt = atomic_read(&asoc->rmem_alloc);
+ else
+ amt = sk_rmem_alloc_get(sk);
+ mem[SK_MEMINFO_RMEM_ALLOC] = amt;
+ mem[SK_MEMINFO_RCVBUF] = sk->sk_rcvbuf;
+ mem[SK_MEMINFO_SNDBUF] = sk->sk_sndbuf;
+ mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
+ mem[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued;
+ mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
+ mem[SK_MEMINFO_BACKLOG] = sk->sk_backlog.len;
+ mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
+
+ if (nla_put(skb, INET_DIAG_SKMEMINFO, sizeof(mem), &mem) < 0)
+ goto errout;
+ }
+
+ if (ext & (1 << (INET_DIAG_INFO - 1))) {
+ struct nlattr *attr;
+
+ attr = nla_reserve_64bit(skb, INET_DIAG_INFO,
+ sizeof(struct sctp_info),
+ INET_DIAG_PAD);
+ if (!attr)
+ goto errout;
+
+ info = nla_data(attr);
+ }
+ infox.sctpinfo = (struct sctp_info *)info;
+ infox.asoc = asoc;
+ sctp_diag_get_info(sk, r, &infox);
+
+ addr_list = asoc ? &asoc->base.bind_addr.address_list
+ : &ep->base.bind_addr.address_list;
+ if (inet_diag_msg_sctpladdrs_fill(skb, addr_list))
+ goto errout;
+
+ if (asoc && (ext & (1 << (INET_DIAG_CONG - 1))))
+ if (nla_put_string(skb, INET_DIAG_CONG, "reno") < 0)
+ goto errout;
+
+ if (asoc && inet_diag_msg_sctpaddrs_fill(skb, asoc))
+ goto errout;
+
+ nlmsg_end(skb, nlh);
+ return 0;
+
+errout:
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+}
+
+/* callback and param */
+struct sctp_comm_param {
+ struct sk_buff *skb;
+ struct netlink_callback *cb;
+ const struct inet_diag_req_v2 *r;
+ const struct nlmsghdr *nlh;
+};
+
+static size_t inet_assoc_attr_size(struct sctp_association *asoc)
+{
+ int addrlen = sizeof(struct sockaddr_storage);
+ int addrcnt = 0;
+ struct sctp_sockaddr_entry *laddr;
+
+ list_for_each_entry_rcu(laddr, &asoc->base.bind_addr.address_list,
+ list)
+ addrcnt++;
+
+ return nla_total_size(sizeof(struct sctp_info))
+ + nla_total_size(1) /* INET_DIAG_SHUTDOWN */
+ + nla_total_size(1) /* INET_DIAG_TOS */
+ + nla_total_size(1) /* INET_DIAG_TCLASS */
+ + nla_total_size(addrlen * asoc->peer.transport_count)
+ + nla_total_size(addrlen * addrcnt)
+ + nla_total_size(sizeof(struct inet_diag_meminfo))
+ + nla_total_size(sizeof(struct inet_diag_msg))
+ + 64;
+}
+
+static int sctp_tsp_dump_one(struct sctp_transport *tsp, void *p)
+{
+ struct sctp_association *assoc = tsp->asoc;
+ struct sock *sk = tsp->asoc->base.sk;
+ struct sctp_comm_param *commp = p;
+ struct sk_buff *in_skb = commp->skb;
+ const struct inet_diag_req_v2 *req = commp->r;
+ const struct nlmsghdr *nlh = commp->nlh;
+ struct net *net = sock_net(in_skb->sk);
+ struct sk_buff *rep;
+ int err;
+
+ err = sock_diag_check_cookie(sk, req->id.idiag_cookie);
+ if (err)
+ goto out;
+
+ err = -ENOMEM;
+ rep = nlmsg_new(inet_assoc_attr_size(assoc), GFP_KERNEL);
+ if (!rep)
+ goto out;
+
+ lock_sock(sk);
+ if (sk != assoc->base.sk) {
+ release_sock(sk);
+ sk = assoc->base.sk;
+ lock_sock(sk);
+ }
+ err = inet_sctp_diag_fill(sk, assoc, rep, req,
+ sk_user_ns(NETLINK_CB(in_skb).sk),
+ NETLINK_CB(in_skb).portid,
+ nlh->nlmsg_seq, 0, nlh);
+ release_sock(sk);
+ if (err < 0) {
+ WARN_ON(err == -EMSGSIZE);
+ kfree_skb(rep);
+ goto out;
+ }
+
+ err = netlink_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid,
+ MSG_DONTWAIT);
+ if (err > 0)
+ err = 0;
+out:
+ return err;
+}
+
+static int sctp_tsp_dump(struct sctp_transport *tsp, void *p)
+{
+ struct sctp_endpoint *ep = tsp->asoc->ep;
+ struct sctp_comm_param *commp = p;
+ struct sock *sk = ep->base.sk;
+ struct sk_buff *skb = commp->skb;
+ struct netlink_callback *cb = commp->cb;
+ const struct inet_diag_req_v2 *r = commp->r;
+ struct sctp_association *assoc =
+ list_entry(ep->asocs.next, struct sctp_association, asocs);
+ int err = 0;
+
+ /* find the ep only once through the transports by this condition */
+ if (tsp->asoc != assoc)
+ goto out;
+
+ if (r->sdiag_family != AF_UNSPEC && sk->sk_family != r->sdiag_family)
+ goto out;
+
+ lock_sock(sk);
+ if (sk != assoc->base.sk)
+ goto release;
+ list_for_each_entry(assoc, &ep->asocs, asocs) {
+ if (cb->args[4] < cb->args[1])
+ goto next;
+
+ if (r->id.idiag_sport != htons(assoc->base.bind_addr.port) &&
+ r->id.idiag_sport)
+ goto next;
+ if (r->id.idiag_dport != htons(assoc->peer.port) &&
+ r->id.idiag_dport)
+ goto next;
+
+ if (!cb->args[3] &&
+ inet_sctp_diag_fill(sk, NULL, skb, r,
+ sk_user_ns(NETLINK_CB(cb->skb).sk),
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ NLM_F_MULTI, cb->nlh) < 0) {
+ cb->args[3] = 1;
+ err = 2;
+ goto release;
+ }
+ cb->args[3] = 1;
+
+ if (inet_sctp_diag_fill(sk, assoc, skb, r,
+ sk_user_ns(NETLINK_CB(cb->skb).sk),
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, 0, cb->nlh) < 0) {
+ err = 2;
+ goto release;
+ }
+next:
+ cb->args[4]++;
+ }
+ cb->args[1] = 0;
+ cb->args[2]++;
+ cb->args[3] = 0;
+ cb->args[4] = 0;
+release:
+ release_sock(sk);
+ return err;
+out:
+ cb->args[2]++;
+ return err;
+}
+
+static int sctp_ep_dump(struct sctp_endpoint *ep, void *p)
+{
+ struct sctp_comm_param *commp = p;
+ struct sock *sk = ep->base.sk;
+ struct sk_buff *skb = commp->skb;
+ struct netlink_callback *cb = commp->cb;
+ const struct inet_diag_req_v2 *r = commp->r;
+ struct net *net = sock_net(skb->sk);
+ struct inet_sock *inet = inet_sk(sk);
+ int err = 0;
+
+ if (!net_eq(sock_net(sk), net))
+ goto out;
+
+ if (cb->args[4] < cb->args[1])
+ goto next;
+
+ if ((r->idiag_states & ~TCPF_LISTEN) && !list_empty(&ep->asocs))
+ goto next;
+
+ if (r->sdiag_family != AF_UNSPEC &&
+ sk->sk_family != r->sdiag_family)
+ goto next;
+
+ if (r->id.idiag_sport != inet->inet_sport &&
+ r->id.idiag_sport)
+ goto next;
+
+ if (r->id.idiag_dport != inet->inet_dport &&
+ r->id.idiag_dport)
+ goto next;
+
+ if (inet_sctp_diag_fill(sk, NULL, skb, r,
+ sk_user_ns(NETLINK_CB(cb->skb).sk),
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, NLM_F_MULTI,
+ cb->nlh) < 0) {
+ err = 2;
+ goto out;
+ }
+next:
+ cb->args[4]++;
+out:
+ return err;
+}
+
+/* define the functions for sctp_diag_handler*/
+static void sctp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
+ void *info)
+{
+ struct sctp_infox *infox = (struct sctp_infox *)info;
+
+ if (infox->asoc) {
+ r->idiag_rqueue = atomic_read(&infox->asoc->rmem_alloc);
+ r->idiag_wqueue = infox->asoc->sndbuf_used;
+ } else {
+ r->idiag_rqueue = sk->sk_ack_backlog;
+ r->idiag_wqueue = sk->sk_max_ack_backlog;
+ }
+ if (infox->sctpinfo)
+ sctp_get_sctp_info(sk, infox->asoc, infox->sctpinfo);
+}
+
+static int sctp_diag_dump_one(struct sk_buff *in_skb,
+ const struct nlmsghdr *nlh,
+ const struct inet_diag_req_v2 *req)
+{
+ struct net *net = sock_net(in_skb->sk);
+ union sctp_addr laddr, paddr;
+ struct sctp_comm_param commp = {
+ .skb = in_skb,
+ .r = req,
+ .nlh = nlh,
+ };
+
+ if (req->sdiag_family == AF_INET) {
+ laddr.v4.sin_port = req->id.idiag_sport;
+ laddr.v4.sin_addr.s_addr = req->id.idiag_src[0];
+ laddr.v4.sin_family = AF_INET;
+
+ paddr.v4.sin_port = req->id.idiag_dport;
+ paddr.v4.sin_addr.s_addr = req->id.idiag_dst[0];
+ paddr.v4.sin_family = AF_INET;
+ } else {
+ laddr.v6.sin6_port = req->id.idiag_sport;
+ memcpy(&laddr.v6.sin6_addr, req->id.idiag_src, 64);
+ laddr.v6.sin6_family = AF_INET6;
+
+ paddr.v6.sin6_port = req->id.idiag_dport;
+ memcpy(&paddr.v6.sin6_addr, req->id.idiag_dst, 64);
+ paddr.v6.sin6_family = AF_INET6;
+ }
+
+ return sctp_transport_lookup_process(sctp_tsp_dump_one,
+ net, &laddr, &paddr, &commp);
+}
+
+static void sctp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
+ const struct inet_diag_req_v2 *r, struct nlattr *bc)
+{
+ u32 idiag_states = r->idiag_states;
+ struct net *net = sock_net(skb->sk);
+ struct sctp_comm_param commp = {
+ .skb = skb,
+ .cb = cb,
+ .r = r,
+ };
+
+ /* eps hashtable dumps
+ * args:
+ * 0 : if it will traversal listen sock
+ * 1 : to record the sock pos of this time's traversal
+ * 4 : to work as a temporary variable to traversal list
+ */
+ if (cb->args[0] == 0) {
+ if (!(idiag_states & TCPF_LISTEN))
+ goto skip;
+ if (sctp_for_each_endpoint(sctp_ep_dump, &commp))
+ goto done;
+skip:
+ cb->args[0] = 1;
+ cb->args[1] = 0;
+ cb->args[4] = 0;
+ }
+
+ /* asocs by transport hashtable dump
+ * args:
+ * 1 : to record the assoc pos of this time's traversal
+ * 2 : to record the transport pos of this time's traversal
+ * 3 : to mark if we have dumped the ep info of the current asoc
+ * 4 : to work as a temporary variable to traversal list
+ */
+ if (!(idiag_states & ~TCPF_LISTEN))
+ goto done;
+ sctp_for_each_transport(sctp_tsp_dump, net, cb->args[2], &commp);
+done:
+ cb->args[1] = cb->args[4];
+ cb->args[4] = 0;
+}
+
+static const struct inet_diag_handler sctp_diag_handler = {
+ .dump = sctp_diag_dump,
+ .dump_one = sctp_diag_dump_one,
+ .idiag_get_info = sctp_diag_get_info,
+ .idiag_type = IPPROTO_SCTP,
+ .idiag_info_size = sizeof(struct sctp_info),
+};
+
+static int __init sctp_diag_init(void)
+{
+ return inet_diag_register(&sctp_diag_handler);
+}
+
+static void __exit sctp_diag_exit(void)
+{
+ inet_diag_unregister(&sctp_diag_handler);
+}
+
+module_init(sctp_diag_init);
+module_exit(sctp_diag_exit);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-132);
diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
index 41b081a64..aa3712259 100644
--- a/net/sctp/sm_sideeffect.c
+++ b/net/sctp/sm_sideeffect.c
@@ -1218,6 +1218,8 @@ static int sctp_cmd_interpreter(sctp_event_t event_type,
sctp_cmd_seq_t *commands,
gfp_t gfp)
{
+ struct sock *sk = ep->base.sk;
+ struct sctp_sock *sp = sctp_sk(sk);
int error = 0;
int force;
sctp_cmd_t *cmd;
@@ -1738,6 +1740,10 @@ out:
error = sctp_outq_uncork(&asoc->outqueue, gfp);
} else if (local_cork)
error = sctp_outq_uncork(&asoc->outqueue, gfp);
+
+ if (sp->data_ready_signalled)
+ sp->data_ready_signalled = 0;
+
return error;
nomem:
error = -ENOMEM;
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 878d28eda..7f5689a93 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -4202,6 +4202,224 @@ static void sctp_shutdown(struct sock *sk, int how)
}
}
+int sctp_get_sctp_info(struct sock *sk, struct sctp_association *asoc,
+ struct sctp_info *info)
+{
+ struct sctp_transport *prim;
+ struct list_head *pos;
+ int mask;
+
+ memset(info, 0, sizeof(*info));
+ if (!asoc) {
+ struct sctp_sock *sp = sctp_sk(sk);
+
+ info->sctpi_s_autoclose = sp->autoclose;
+ info->sctpi_s_adaptation_ind = sp->adaptation_ind;
+ info->sctpi_s_pd_point = sp->pd_point;
+ info->sctpi_s_nodelay = sp->nodelay;
+ info->sctpi_s_disable_fragments = sp->disable_fragments;
+ info->sctpi_s_v4mapped = sp->v4mapped;
+ info->sctpi_s_frag_interleave = sp->frag_interleave;
+ info->sctpi_s_type = sp->type;
+
+ return 0;
+ }
+
+ info->sctpi_tag = asoc->c.my_vtag;
+ info->sctpi_state = asoc->state;
+ info->sctpi_rwnd = asoc->a_rwnd;
+ info->sctpi_unackdata = asoc->unack_data;
+ info->sctpi_penddata = sctp_tsnmap_pending(&asoc->peer.tsn_map);
+ info->sctpi_instrms = asoc->c.sinit_max_instreams;
+ info->sctpi_outstrms = asoc->c.sinit_num_ostreams;
+ list_for_each(pos, &asoc->base.inqueue.in_chunk_list)
+ info->sctpi_inqueue++;
+ list_for_each(pos, &asoc->outqueue.out_chunk_list)
+ info->sctpi_outqueue++;
+ info->sctpi_overall_error = asoc->overall_error_count;
+ info->sctpi_max_burst = asoc->max_burst;
+ info->sctpi_maxseg = asoc->frag_point;
+ info->sctpi_peer_rwnd = asoc->peer.rwnd;
+ info->sctpi_peer_tag = asoc->c.peer_vtag;
+
+ mask = asoc->peer.ecn_capable << 1;
+ mask = (mask | asoc->peer.ipv4_address) << 1;
+ mask = (mask | asoc->peer.ipv6_address) << 1;
+ mask = (mask | asoc->peer.hostname_address) << 1;
+ mask = (mask | asoc->peer.asconf_capable) << 1;
+ mask = (mask | asoc->peer.prsctp_capable) << 1;
+ mask = (mask | asoc->peer.auth_capable);
+ info->sctpi_peer_capable = mask;
+ mask = asoc->peer.sack_needed << 1;
+ mask = (mask | asoc->peer.sack_generation) << 1;
+ mask = (mask | asoc->peer.zero_window_announced);
+ info->sctpi_peer_sack = mask;
+
+ info->sctpi_isacks = asoc->stats.isacks;
+ info->sctpi_osacks = asoc->stats.osacks;
+ info->sctpi_opackets = asoc->stats.opackets;
+ info->sctpi_ipackets = asoc->stats.ipackets;
+ info->sctpi_rtxchunks = asoc->stats.rtxchunks;
+ info->sctpi_outofseqtsns = asoc->stats.outofseqtsns;
+ info->sctpi_idupchunks = asoc->stats.idupchunks;
+ info->sctpi_gapcnt = asoc->stats.gapcnt;
+ info->sctpi_ouodchunks = asoc->stats.ouodchunks;
+ info->sctpi_iuodchunks = asoc->stats.iuodchunks;
+ info->sctpi_oodchunks = asoc->stats.oodchunks;
+ info->sctpi_iodchunks = asoc->stats.iodchunks;
+ info->sctpi_octrlchunks = asoc->stats.octrlchunks;
+ info->sctpi_ictrlchunks = asoc->stats.ictrlchunks;
+
+ prim = asoc->peer.primary_path;
+ memcpy(&info->sctpi_p_address, &prim->ipaddr,
+ sizeof(struct sockaddr_storage));
+ info->sctpi_p_state = prim->state;
+ info->sctpi_p_cwnd = prim->cwnd;
+ info->sctpi_p_srtt = prim->srtt;
+ info->sctpi_p_rto = jiffies_to_msecs(prim->rto);
+ info->sctpi_p_hbinterval = prim->hbinterval;
+ info->sctpi_p_pathmaxrxt = prim->pathmaxrxt;
+ info->sctpi_p_sackdelay = jiffies_to_msecs(prim->sackdelay);
+ info->sctpi_p_ssthresh = prim->ssthresh;
+ info->sctpi_p_partial_bytes_acked = prim->partial_bytes_acked;
+ info->sctpi_p_flight_size = prim->flight_size;
+ info->sctpi_p_error = prim->error_count;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(sctp_get_sctp_info);
+
+/* use callback to avoid exporting the core structure */
+int sctp_transport_walk_start(struct rhashtable_iter *iter)
+{
+ int err;
+
+ err = rhashtable_walk_init(&sctp_transport_hashtable, iter,
+ GFP_KERNEL);
+ if (err)
+ return err;
+
+ err = rhashtable_walk_start(iter);
+ if (err && err != -EAGAIN) {
+ rhashtable_walk_stop(iter);
+ rhashtable_walk_exit(iter);
+ return err;
+ }
+
+ return 0;
+}
+
+void sctp_transport_walk_stop(struct rhashtable_iter *iter)
+{
+ rhashtable_walk_stop(iter);
+ rhashtable_walk_exit(iter);
+}
+
+struct sctp_transport *sctp_transport_get_next(struct net *net,
+ struct rhashtable_iter *iter)
+{
+ struct sctp_transport *t;
+
+ t = rhashtable_walk_next(iter);
+ for (; t; t = rhashtable_walk_next(iter)) {
+ if (IS_ERR(t)) {
+ if (PTR_ERR(t) == -EAGAIN)
+ continue;
+ break;
+ }
+
+ if (net_eq(sock_net(t->asoc->base.sk), net) &&
+ t->asoc->peer.primary_path == t)
+ break;
+ }
+
+ return t;
+}
+
+struct sctp_transport *sctp_transport_get_idx(struct net *net,
+ struct rhashtable_iter *iter,
+ int pos)
+{
+ void *obj = SEQ_START_TOKEN;
+
+ while (pos && (obj = sctp_transport_get_next(net, iter)) &&
+ !IS_ERR(obj))
+ pos--;
+
+ return obj;
+}
+
+int sctp_for_each_endpoint(int (*cb)(struct sctp_endpoint *, void *),
+ void *p) {
+ int err = 0;
+ int hash = 0;
+ struct sctp_ep_common *epb;
+ struct sctp_hashbucket *head;
+
+ for (head = sctp_ep_hashtable; hash < sctp_ep_hashsize;
+ hash++, head++) {
+ read_lock(&head->lock);
+ sctp_for_each_hentry(epb, &head->chain) {
+ err = cb(sctp_ep(epb), p);
+ if (err)
+ break;
+ }
+ read_unlock(&head->lock);
+ }
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(sctp_for_each_endpoint);
+
+int sctp_transport_lookup_process(int (*cb)(struct sctp_transport *, void *),
+ struct net *net,
+ const union sctp_addr *laddr,
+ const union sctp_addr *paddr, void *p)
+{
+ struct sctp_transport *transport;
+ int err = 0;
+
+ rcu_read_lock();
+ transport = sctp_addrs_lookup_transport(net, laddr, paddr);
+ if (!transport || !sctp_transport_hold(transport))
+ goto out;
+ err = cb(transport, p);
+ sctp_transport_put(transport);
+
+out:
+ rcu_read_unlock();
+ return err;
+}
+EXPORT_SYMBOL_GPL(sctp_transport_lookup_process);
+
+int sctp_for_each_transport(int (*cb)(struct sctp_transport *, void *),
+ struct net *net, int pos, void *p) {
+ struct rhashtable_iter hti;
+ void *obj;
+ int err;
+
+ err = sctp_transport_walk_start(&hti);
+ if (err)
+ return err;
+
+ sctp_transport_get_idx(net, &hti, pos);
+ obj = sctp_transport_get_next(net, &hti);
+ for (; obj && !IS_ERR(obj); obj = sctp_transport_get_next(net, &hti)) {
+ struct sctp_transport *transport = obj;
+
+ if (!sctp_transport_hold(transport))
+ continue;
+ err = cb(transport, p);
+ sctp_transport_put(transport);
+ if (err)
+ break;
+ }
+ sctp_transport_walk_stop(&hti);
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(sctp_for_each_transport);
+
/* 7.2.1 Association Status (SCTP_STATUS)
* Applications can retrieve current status information about an
@@ -6430,6 +6648,8 @@ unsigned int sctp_poll(struct file *file, struct socket *sock, poll_table *wait)
poll_wait(file, sk_sleep(sk), wait);
+ sock_rps_record_flow(sk);
+
/* A TCP-style listening socket becomes readable when the accept queue
* is not empty.
*/
@@ -6764,13 +6984,11 @@ struct sk_buff *sctp_skb_recv_datagram(struct sock *sk, int flags,
* However, this function was correct in any case. 8)
*/
if (flags & MSG_PEEK) {
- spin_lock_bh(&sk->sk_receive_queue.lock);
skb = skb_peek(&sk->sk_receive_queue);
if (skb)
atomic_inc(&skb->users);
- spin_unlock_bh(&sk->sk_receive_queue.lock);
} else {
- skb = skb_dequeue(&sk->sk_receive_queue);
+ skb = __skb_dequeue(&sk->sk_receive_queue);
}
if (skb)
@@ -7186,6 +7404,7 @@ void sctp_copy_sock(struct sock *newsk, struct sock *sk,
newsk->sk_lingertime = sk->sk_lingertime;
newsk->sk_rcvtimeo = sk->sk_rcvtimeo;
newsk->sk_sndtimeo = sk->sk_sndtimeo;
+ newsk->sk_rxhash = sk->sk_rxhash;
newinet = inet_sk(newsk);
diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c
index ce469d648..ec166d2bd 100644
--- a/net/sctp/ulpqueue.c
+++ b/net/sctp/ulpqueue.c
@@ -141,7 +141,8 @@ int sctp_clear_pd(struct sock *sk, struct sctp_association *asoc)
*/
if (!skb_queue_empty(&sp->pd_lobby)) {
struct list_head *list;
- sctp_skb_list_tail(&sp->pd_lobby, &sk->sk_receive_queue);
+ skb_queue_splice_tail_init(&sp->pd_lobby,
+ &sk->sk_receive_queue);
list = (struct list_head *)&sctp_sk(sk)->pd_lobby;
INIT_LIST_HEAD(list);
return 1;
@@ -193,6 +194,7 @@ static int sctp_ulpq_clear_pd(struct sctp_ulpq *ulpq)
int sctp_ulpq_tail_event(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event)
{
struct sock *sk = ulpq->asoc->base.sk;
+ struct sctp_sock *sp = sctp_sk(sk);
struct sk_buff_head *queue, *skb_list;
struct sk_buff *skb = sctp_event2skb(event);
int clear_pd = 0;
@@ -210,7 +212,7 @@ int sctp_ulpq_tail_event(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event)
sk_incoming_cpu_update(sk);
}
/* Check if the user wishes to receive this event. */
- if (!sctp_ulpevent_is_enabled(event, &sctp_sk(sk)->subscribe))
+ if (!sctp_ulpevent_is_enabled(event, &sp->subscribe))
goto out_free;
/* If we are in partial delivery mode, post to the lobby until
@@ -218,7 +220,7 @@ int sctp_ulpq_tail_event(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event)
* the association the cause of the partial delivery.
*/
- if (atomic_read(&sctp_sk(sk)->pd_mode) == 0) {
+ if (atomic_read(&sp->pd_mode) == 0) {
queue = &sk->sk_receive_queue;
} else {
if (ulpq->pd_mode) {
@@ -230,7 +232,7 @@ int sctp_ulpq_tail_event(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event)
if ((event->msg_flags & MSG_NOTIFICATION) ||
(SCTP_DATA_NOT_FRAG ==
(event->msg_flags & SCTP_DATA_FRAG_MASK)))
- queue = &sctp_sk(sk)->pd_lobby;
+ queue = &sp->pd_lobby;
else {
clear_pd = event->msg_flags & MSG_EOR;
queue = &sk->sk_receive_queue;
@@ -241,10 +243,10 @@ int sctp_ulpq_tail_event(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event)
* can queue this to the receive queue instead
* of the lobby.
*/
- if (sctp_sk(sk)->frag_interleave)
+ if (sp->frag_interleave)
queue = &sk->sk_receive_queue;
else
- queue = &sctp_sk(sk)->pd_lobby;
+ queue = &sp->pd_lobby;
}
}
@@ -252,7 +254,7 @@ int sctp_ulpq_tail_event(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event)
* collected on a list.
*/
if (skb_list)
- sctp_skb_list_tail(skb_list, queue);
+ skb_queue_splice_tail_init(skb_list, queue);
else
__skb_queue_tail(queue, skb);
@@ -263,8 +265,10 @@ int sctp_ulpq_tail_event(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event)
if (clear_pd)
sctp_ulpq_clear_pd(ulpq);
- if (queue == &sk->sk_receive_queue)
+ if (queue == &sk->sk_receive_queue && !sp->data_ready_signalled) {
+ sp->data_ready_signalled = 1;
sk->sk_data_ready(sk);
+ }
return 1;
out_free:
@@ -1125,11 +1129,13 @@ void sctp_ulpq_abort_pd(struct sctp_ulpq *ulpq, gfp_t gfp)
{
struct sctp_ulpevent *ev = NULL;
struct sock *sk;
+ struct sctp_sock *sp;
if (!ulpq->pd_mode)
return;
sk = ulpq->asoc->base.sk;
+ sp = sctp_sk(sk);
if (sctp_ulpevent_type_enabled(SCTP_PARTIAL_DELIVERY_EVENT,
&sctp_sk(sk)->subscribe))
ev = sctp_ulpevent_make_pdapi(ulpq->asoc,
@@ -1139,6 +1145,8 @@ void sctp_ulpq_abort_pd(struct sctp_ulpq *ulpq, gfp_t gfp)
__skb_queue_tail(&sk->sk_receive_queue, sctp_event2skb(ev));
/* If there is data waiting, send it up the socket now. */
- if (sctp_ulpq_clear_pd(ulpq) || ev)
+ if ((sctp_ulpq_clear_pd(ulpq) || ev) && !sp->data_ready_signalled) {
+ sp->data_ready_signalled = 1;
sk->sk_data_ready(sk);
+ }
}
diff --git a/net/socket.c b/net/socket.c
index 5f77a8e93..a1bd16106 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -466,7 +466,7 @@ static struct socket *sockfd_lookup_light(int fd, int *err, int *fput_needed)
#define XATTR_SOCKPROTONAME_SUFFIX "sockprotoname"
#define XATTR_NAME_SOCKPROTONAME (XATTR_SYSTEM_PREFIX XATTR_SOCKPROTONAME_SUFFIX)
#define XATTR_NAME_SOCKPROTONAME_LEN (sizeof(XATTR_NAME_SOCKPROTONAME)-1)
-static ssize_t sockfs_getxattr(struct dentry *dentry,
+static ssize_t sockfs_getxattr(struct dentry *dentry, struct inode *inode,
const char *name, void *value, size_t size)
{
const char *proto_name;
@@ -587,22 +587,19 @@ void sock_release(struct socket *sock)
}
EXPORT_SYMBOL(sock_release);
-void __sock_tx_timestamp(const struct sock *sk, __u8 *tx_flags)
+void __sock_tx_timestamp(__u16 tsflags, __u8 *tx_flags)
{
u8 flags = *tx_flags;
- if (sk->sk_tsflags & SOF_TIMESTAMPING_TX_HARDWARE)
+ if (tsflags & SOF_TIMESTAMPING_TX_HARDWARE)
flags |= SKBTX_HW_TSTAMP;
- if (sk->sk_tsflags & SOF_TIMESTAMPING_TX_SOFTWARE)
+ if (tsflags & SOF_TIMESTAMPING_TX_SOFTWARE)
flags |= SKBTX_SW_TSTAMP;
- if (sk->sk_tsflags & SOF_TIMESTAMPING_TX_SCHED)
+ if (tsflags & SOF_TIMESTAMPING_TX_SCHED)
flags |= SKBTX_SCHED_TSTAMP;
- if (sk->sk_tsflags & SOF_TIMESTAMPING_TX_ACK)
- flags |= SKBTX_ACK_TSTAMP;
-
*tx_flags = flags;
}
EXPORT_SYMBOL(__sock_tx_timestamp);
@@ -709,17 +706,16 @@ void __sock_recv_ts_and_drops(struct msghdr *msg, struct sock *sk,
EXPORT_SYMBOL_GPL(__sock_recv_ts_and_drops);
static inline int sock_recvmsg_nosec(struct socket *sock, struct msghdr *msg,
- size_t size, int flags)
+ int flags)
{
- return sock->ops->recvmsg(sock, msg, size, flags);
+ return sock->ops->recvmsg(sock, msg, msg_data_left(msg), flags);
}
-int sock_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
- int flags)
+int sock_recvmsg(struct socket *sock, struct msghdr *msg, int flags)
{
- int err = security_socket_recvmsg(sock, msg, size, flags);
+ int err = security_socket_recvmsg(sock, msg, msg_data_left(msg), flags);
- return err ?: sock_recvmsg_nosec(sock, msg, size, flags);
+ return err ?: sock_recvmsg_nosec(sock, msg, flags);
}
EXPORT_SYMBOL(sock_recvmsg);
@@ -746,7 +742,7 @@ int kernel_recvmsg(struct socket *sock, struct msghdr *msg,
iov_iter_kvec(&msg->msg_iter, READ | ITER_KVEC, vec, num, size);
set_fs(KERNEL_DS);
- result = sock_recvmsg(sock, msg, size, flags);
+ result = sock_recvmsg(sock, msg, flags);
set_fs(oldfs);
return result;
}
@@ -796,7 +792,7 @@ static ssize_t sock_read_iter(struct kiocb *iocb, struct iov_iter *to)
if (!iov_iter_count(to)) /* Match SYS5 behaviour */
return 0;
- res = sock_recvmsg(sock, &msg, iov_iter_count(to), msg.msg_flags);
+ res = sock_recvmsg(sock, &msg, msg.msg_flags);
*to = msg.msg_iter;
return res;
}
@@ -1046,7 +1042,7 @@ static int sock_fasync(int fd, struct file *filp, int on)
return -EINVAL;
lock_sock(sk);
- wq = rcu_dereference_protected(sock->wq, sock_owned_by_user(sk));
+ wq = rcu_dereference_protected(sock->wq, lockdep_sock_is_held(sk));
fasync_helper(fd, filp, on, &wq->fasync_list);
if (!wq->fasync_list)
@@ -1696,7 +1692,7 @@ SYSCALL_DEFINE6(recvfrom, int, fd, void __user *, ubuf, size_t, size,
msg.msg_iocb = NULL;
if (sock->file->f_flags & O_NONBLOCK)
flags |= MSG_DONTWAIT;
- err = sock_recvmsg(sock, &msg, iov_iter_count(&msg.msg_iter), flags);
+ err = sock_recvmsg(sock, &msg, flags);
if (err >= 0 && addr != NULL) {
err2 = move_addr_to_user(&address,
@@ -2073,7 +2069,7 @@ static int ___sys_recvmsg(struct socket *sock, struct user_msghdr __user *msg,
struct iovec iovstack[UIO_FASTIOV];
struct iovec *iov = iovstack;
unsigned long cmsg_ptr;
- int total_len, len;
+ int len;
ssize_t err;
/* kernel mode address */
@@ -2091,7 +2087,6 @@ static int ___sys_recvmsg(struct socket *sock, struct user_msghdr __user *msg,
err = copy_msghdr_from_user(msg_sys, msg, &uaddr, &iov);
if (err < 0)
return err;
- total_len = iov_iter_count(&msg_sys->msg_iter);
cmsg_ptr = (unsigned long)msg_sys->msg_control;
msg_sys->msg_flags = flags & (MSG_CMSG_CLOEXEC|MSG_CMSG_COMPAT);
@@ -2101,8 +2096,7 @@ static int ___sys_recvmsg(struct socket *sock, struct user_msghdr __user *msg,
if (sock->file->f_flags & O_NONBLOCK)
flags |= MSG_DONTWAIT;
- err = (nosec ? sock_recvmsg_nosec : sock_recvmsg)(sock, msg_sys,
- total_len, flags);
+ err = (nosec ? sock_recvmsg_nosec : sock_recvmsg)(sock, msg_sys, flags);
if (err < 0)
goto out_freeiov;
len = err;
@@ -2174,7 +2168,8 @@ int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen,
struct mmsghdr __user *entry;
struct compat_mmsghdr __user *compat_entry;
struct msghdr msg_sys;
- struct timespec end_time;
+ struct timespec64 end_time;
+ struct timespec64 timeout64;
if (timeout &&
poll_select_set_timeout(&end_time, timeout->tv_sec,
@@ -2226,8 +2221,9 @@ int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen,
flags |= MSG_DONTWAIT;
if (timeout) {
- ktime_get_ts(timeout);
- *timeout = timespec_sub(end_time, *timeout);
+ ktime_get_ts64(&timeout64);
+ *timeout = timespec64_to_timespec(
+ timespec64_sub(end_time, timeout64));
if (timeout->tv_sec < 0) {
timeout->tv_sec = timeout->tv_nsec = 0;
break;
diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c
index 02f53674d..040ff627c 100644
--- a/net/sunrpc/auth.c
+++ b/net/sunrpc/auth.c
@@ -543,7 +543,7 @@ rpcauth_cache_enforce_limit(void)
*/
struct rpc_cred *
rpcauth_lookup_credcache(struct rpc_auth *auth, struct auth_cred * acred,
- int flags)
+ int flags, gfp_t gfp)
{
LIST_HEAD(free);
struct rpc_cred_cache *cache = auth->au_credcache;
@@ -580,7 +580,7 @@ rpcauth_lookup_credcache(struct rpc_auth *auth, struct auth_cred * acred,
if (flags & RPCAUTH_LOOKUP_RCU)
return ERR_PTR(-ECHILD);
- new = auth->au_ops->crcreate(auth, acred, flags);
+ new = auth->au_ops->crcreate(auth, acred, flags, gfp);
if (IS_ERR(new)) {
cred = new;
goto out;
@@ -703,8 +703,7 @@ rpcauth_bindcred(struct rpc_task *task, struct rpc_cred *cred, int flags)
new = rpcauth_bind_new_cred(task, lookupflags);
if (IS_ERR(new))
return PTR_ERR(new);
- if (req->rq_cred != NULL)
- put_rpccred(req->rq_cred);
+ put_rpccred(req->rq_cred);
req->rq_cred = new;
return 0;
}
@@ -712,6 +711,8 @@ rpcauth_bindcred(struct rpc_task *task, struct rpc_cred *cred, int flags)
void
put_rpccred(struct rpc_cred *cred)
{
+ if (cred == NULL)
+ return;
/* Fast path for unhashed credentials */
if (test_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags) == 0) {
if (atomic_dec_and_test(&cred->cr_count))
diff --git a/net/sunrpc/auth_generic.c b/net/sunrpc/auth_generic.c
index 41248b182..54dd3fdea 100644
--- a/net/sunrpc/auth_generic.c
+++ b/net/sunrpc/auth_generic.c
@@ -38,6 +38,13 @@ struct rpc_cred *rpc_lookup_cred(void)
}
EXPORT_SYMBOL_GPL(rpc_lookup_cred);
+struct rpc_cred *
+rpc_lookup_generic_cred(struct auth_cred *acred, int flags, gfp_t gfp)
+{
+ return rpcauth_lookup_credcache(&generic_auth, acred, flags, gfp);
+}
+EXPORT_SYMBOL_GPL(rpc_lookup_generic_cred);
+
struct rpc_cred *rpc_lookup_cred_nonblock(void)
{
return rpcauth_lookupcred(&generic_auth, RPCAUTH_LOOKUP_RCU);
@@ -77,15 +84,15 @@ static struct rpc_cred *generic_bind_cred(struct rpc_task *task,
static struct rpc_cred *
generic_lookup_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags)
{
- return rpcauth_lookup_credcache(&generic_auth, acred, flags);
+ return rpcauth_lookup_credcache(&generic_auth, acred, flags, GFP_KERNEL);
}
static struct rpc_cred *
-generic_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags)
+generic_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags, gfp_t gfp)
{
struct generic_cred *gcred;
- gcred = kmalloc(sizeof(*gcred), GFP_KERNEL);
+ gcred = kmalloc(sizeof(*gcred), gfp);
if (gcred == NULL)
return ERR_PTR(-ENOMEM);
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index 15612ffa8..e64ae93d5 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -1299,11 +1299,11 @@ gss_destroy_cred(struct rpc_cred *cred)
static struct rpc_cred *
gss_lookup_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags)
{
- return rpcauth_lookup_credcache(auth, acred, flags);
+ return rpcauth_lookup_credcache(auth, acred, flags, GFP_NOFS);
}
static struct rpc_cred *
-gss_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags)
+gss_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags, gfp_t gfp)
{
struct gss_auth *gss_auth = container_of(auth, struct gss_auth, rpc_auth);
struct gss_cred *cred = NULL;
@@ -1313,7 +1313,7 @@ gss_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags)
__func__, from_kuid(&init_user_ns, acred->uid),
auth->au_flavor);
- if (!(cred = kzalloc(sizeof(*cred), GFP_NOFS)))
+ if (!(cred = kzalloc(sizeof(*cred), gfp)))
goto out_err;
rpcauth_init_cred(&cred->gc_base, acred, auth, &gss_credops);
diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c
index 4605dc73d..e085f5ae1 100644
--- a/net/sunrpc/auth_gss/svcauth_gss.c
+++ b/net/sunrpc/auth_gss/svcauth_gss.c
@@ -569,10 +569,9 @@ gss_svc_searchbyctx(struct cache_detail *cd, struct xdr_netobj *handle)
struct rsc *found;
memset(&rsci, 0, sizeof(rsci));
- if (dup_to_netobj(&rsci.handle, handle->data, handle->len))
- return NULL;
+ rsci.handle.data = handle->data;
+ rsci.handle.len = handle->len;
found = rsc_lookup(cd, &rsci);
- rsc_free(&rsci);
if (!found)
return NULL;
if (cache_check(cd, &found->h, NULL))
diff --git a/net/sunrpc/auth_unix.c b/net/sunrpc/auth_unix.c
index 0d3dd364c..9f65452b7 100644
--- a/net/sunrpc/auth_unix.c
+++ b/net/sunrpc/auth_unix.c
@@ -52,11 +52,11 @@ unx_destroy(struct rpc_auth *auth)
static struct rpc_cred *
unx_lookup_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags)
{
- return rpcauth_lookup_credcache(auth, acred, flags);
+ return rpcauth_lookup_credcache(auth, acred, flags, GFP_NOFS);
}
static struct rpc_cred *
-unx_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags)
+unx_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags, gfp_t gfp)
{
struct unx_cred *cred;
unsigned int groups = 0;
@@ -66,7 +66,7 @@ unx_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags)
from_kuid(&init_user_ns, acred->uid),
from_kgid(&init_user_ns, acred->gid));
- if (!(cred = kmalloc(sizeof(*cred), GFP_NOFS)))
+ if (!(cred = kmalloc(sizeof(*cred), gfp)))
return ERR_PTR(-ENOMEM);
rpcauth_init_cred(&cred->uc_base, acred, auth, &unix_credops);
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 837dd910a..2808d550d 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -1433,6 +1433,23 @@ size_t rpc_max_payload(struct rpc_clnt *clnt)
EXPORT_SYMBOL_GPL(rpc_max_payload);
/**
+ * rpc_max_bc_payload - Get maximum backchannel payload size, in bytes
+ * @clnt: RPC client to query
+ */
+size_t rpc_max_bc_payload(struct rpc_clnt *clnt)
+{
+ struct rpc_xprt *xprt;
+ size_t ret;
+
+ rcu_read_lock();
+ xprt = rcu_dereference(clnt->cl_xprt);
+ ret = xprt->ops->bc_maxpayload(xprt);
+ rcu_read_unlock();
+ return ret;
+}
+EXPORT_SYMBOL_GPL(rpc_max_bc_payload);
+
+/**
* rpc_get_timeout - Get timeout for transport in units of HZ
* @clnt: RPC client to query
*/
diff --git a/net/sunrpc/socklib.c b/net/sunrpc/socklib.c
index de70c7802..f217c348b 100644
--- a/net/sunrpc/socklib.c
+++ b/net/sunrpc/socklib.c
@@ -155,7 +155,7 @@ int csum_partial_copy_to_xdr(struct xdr_buf *xdr, struct sk_buff *skb)
struct xdr_skb_reader desc;
desc.skb = skb;
- desc.offset = sizeof(struct udphdr);
+ desc.offset = 0;
desc.count = skb->len - desc.offset;
if (skb_csum_unnecessary(skb))
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index 7231cb413..4f01f6310 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -246,13 +246,12 @@ void svc_add_new_perm_xprt(struct svc_serv *serv, struct svc_xprt *new)
svc_xprt_received(new);
}
-int svc_create_xprt(struct svc_serv *serv, const char *xprt_name,
+int _svc_create_xprt(struct svc_serv *serv, const char *xprt_name,
struct net *net, const int family,
const unsigned short port, int flags)
{
struct svc_xprt_class *xcl;
- dprintk("svc: creating transport %s[%d]\n", xprt_name, port);
spin_lock(&svc_xprt_class_lock);
list_for_each_entry(xcl, &svc_xprt_class_list, xcl_list) {
struct svc_xprt *newxprt;
@@ -276,12 +275,28 @@ int svc_create_xprt(struct svc_serv *serv, const char *xprt_name,
}
err:
spin_unlock(&svc_xprt_class_lock);
- dprintk("svc: transport %s not found\n", xprt_name);
-
/* This errno is exposed to user space. Provide a reasonable
* perror msg for a bad transport. */
return -EPROTONOSUPPORT;
}
+
+int svc_create_xprt(struct svc_serv *serv, const char *xprt_name,
+ struct net *net, const int family,
+ const unsigned short port, int flags)
+{
+ int err;
+
+ dprintk("svc: creating transport %s[%d]\n", xprt_name, port);
+ err = _svc_create_xprt(serv, xprt_name, net, family, port, flags);
+ if (err == -EPROTONOSUPPORT) {
+ request_module("svc%s", xprt_name);
+ err = _svc_create_xprt(serv, xprt_name, net, family, port, flags);
+ }
+ if (err)
+ dprintk("svc: transport %s not found, err %d\n",
+ xprt_name, err);
+ return err;
+}
EXPORT_SYMBOL_GPL(svc_create_xprt);
/*
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 1413cdcc1..dadfec66d 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -85,8 +85,7 @@ static void svc_reclassify_socket(struct socket *sock)
{
struct sock *sk = sock->sk;
- WARN_ON_ONCE(sock_owned_by_user(sk));
- if (sock_owned_by_user(sk))
+ if (WARN_ON_ONCE(!sock_allow_reclassification(sk)))
return;
switch (sk->sk_family) {
@@ -617,7 +616,7 @@ static int svc_udp_recvfrom(struct svc_rqst *rqstp)
svsk->sk_sk->sk_stamp = skb->tstamp;
set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); /* there may be more data... */
- len = skb->len - sizeof(struct udphdr);
+ len = skb->len;
rqstp->rq_arg.len = len;
rqstp->rq_prot = IPPROTO_UDP;
@@ -641,8 +640,7 @@ static int svc_udp_recvfrom(struct svc_rqst *rqstp)
skb_free_datagram_locked(svsk->sk_sk, skb);
} else {
/* we can use it in-place */
- rqstp->rq_arg.head[0].iov_base = skb->data +
- sizeof(struct udphdr);
+ rqstp->rq_arg.head[0].iov_base = skb->data;
rqstp->rq_arg.head[0].iov_len = len;
if (skb_checksum_complete(skb))
goto out_free;
diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index 6bdb38652..c4f3cc0c0 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -797,6 +797,8 @@ void xdr_init_decode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p)
xdr_set_iov(xdr, buf->head, buf->len);
else if (buf->page_len != 0)
xdr_set_page_base(xdr, 0, buf->len);
+ else
+ xdr_set_iov(xdr, buf->head, buf->len);
if (p != NULL && p > xdr->p && xdr->end >= p) {
xdr->nwords -= p - xdr->p;
xdr->p = p;
diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c
index 2dcd7640e..87762d976 100644
--- a/net/sunrpc/xprtrdma/backchannel.c
+++ b/net/sunrpc/xprtrdma/backchannel.c
@@ -192,6 +192,22 @@ int xprt_rdma_bc_up(struct svc_serv *serv, struct net *net)
}
/**
+ * xprt_rdma_bc_maxpayload - Return maximum backchannel message size
+ * @xprt: transport
+ *
+ * Returns maximum size, in bytes, of a backchannel message
+ */
+size_t xprt_rdma_bc_maxpayload(struct rpc_xprt *xprt)
+{
+ struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
+ struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
+ size_t maxmsg;
+
+ maxmsg = min_t(unsigned int, cdata->inline_rsize, cdata->inline_wsize);
+ return maxmsg - RPCRDMA_HDRLEN_MIN;
+}
+
+/**
* rpcrdma_bc_marshal_reply - Send backwards direction reply
* @rqst: buffer containing RPC reply data
*
diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c
index b289e1065..6326ebe8b 100644
--- a/net/sunrpc/xprtrdma/fmr_ops.c
+++ b/net/sunrpc/xprtrdma/fmr_ops.c
@@ -35,10 +35,71 @@
/* Maximum scatter/gather per FMR */
#define RPCRDMA_MAX_FMR_SGES (64)
+static struct workqueue_struct *fmr_recovery_wq;
+
+#define FMR_RECOVERY_WQ_FLAGS (WQ_UNBOUND)
+
+int
+fmr_alloc_recovery_wq(void)
+{
+ fmr_recovery_wq = alloc_workqueue("fmr_recovery", WQ_UNBOUND, 0);
+ return !fmr_recovery_wq ? -ENOMEM : 0;
+}
+
+void
+fmr_destroy_recovery_wq(void)
+{
+ struct workqueue_struct *wq;
+
+ if (!fmr_recovery_wq)
+ return;
+
+ wq = fmr_recovery_wq;
+ fmr_recovery_wq = NULL;
+ destroy_workqueue(wq);
+}
+
+static int
+__fmr_unmap(struct rpcrdma_mw *mw)
+{
+ LIST_HEAD(l);
+
+ list_add(&mw->fmr.fmr->list, &l);
+ return ib_unmap_fmr(&l);
+}
+
+/* Deferred reset of a single FMR. Generate a fresh rkey by
+ * replacing the MR. There's no recovery if this fails.
+ */
+static void
+__fmr_recovery_worker(struct work_struct *work)
+{
+ struct rpcrdma_mw *mw = container_of(work, struct rpcrdma_mw,
+ mw_work);
+ struct rpcrdma_xprt *r_xprt = mw->mw_xprt;
+
+ __fmr_unmap(mw);
+ rpcrdma_put_mw(r_xprt, mw);
+ return;
+}
+
+/* A broken MR was discovered in a context that can't sleep.
+ * Defer recovery to the recovery worker.
+ */
+static void
+__fmr_queue_recovery(struct rpcrdma_mw *mw)
+{
+ INIT_WORK(&mw->mw_work, __fmr_recovery_worker);
+ queue_work(fmr_recovery_wq, &mw->mw_work);
+}
+
static int
fmr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
struct rpcrdma_create_data_internal *cdata)
{
+ rpcrdma_set_max_header_sizes(ia, cdata, max_t(unsigned int, 1,
+ RPCRDMA_MAX_DATA_SEGS /
+ RPCRDMA_MAX_FMR_SGES));
return 0;
}
@@ -48,7 +109,7 @@ static size_t
fmr_op_maxpages(struct rpcrdma_xprt *r_xprt)
{
return min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS,
- rpcrdma_max_segments(r_xprt) * RPCRDMA_MAX_FMR_SGES);
+ RPCRDMA_MAX_HDR_SEGS * RPCRDMA_MAX_FMR_SGES);
}
static int
@@ -89,6 +150,7 @@ fmr_op_init(struct rpcrdma_xprt *r_xprt)
if (IS_ERR(r->fmr.fmr))
goto out_fmr_err;
+ r->mw_xprt = r_xprt;
list_add(&r->mw_list, &buf->rb_mws);
list_add(&r->mw_all, &buf->rb_all);
}
@@ -104,15 +166,6 @@ out:
return rc;
}
-static int
-__fmr_unmap(struct rpcrdma_mw *r)
-{
- LIST_HEAD(l);
-
- list_add(&r->fmr.fmr->list, &l);
- return ib_unmap_fmr(&l);
-}
-
/* Use the ib_map_phys_fmr() verb to register a memory region
* for remote access via RDMA READ or RDMA WRITE.
*/
@@ -183,15 +236,10 @@ static void
__fmr_dma_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg)
{
struct ib_device *device = r_xprt->rx_ia.ri_device;
- struct rpcrdma_mw *mw = seg->rl_mw;
int nsegs = seg->mr_nsegs;
- seg->rl_mw = NULL;
-
while (nsegs--)
rpcrdma_unmap_one(device, seg++);
-
- rpcrdma_put_mw(r_xprt, mw);
}
/* Invalidate all memory regions that were registered for "req".
@@ -234,42 +282,50 @@ fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
seg = &req->rl_segments[i];
__fmr_dma_unmap(r_xprt, seg);
+ rpcrdma_put_mw(r_xprt, seg->rl_mw);
i += seg->mr_nsegs;
seg->mr_nsegs = 0;
+ seg->rl_mw = NULL;
}
req->rl_nchunks = 0;
}
-/* Use the ib_unmap_fmr() verb to prevent further remote
- * access via RDMA READ or RDMA WRITE.
+/* Use a slow, safe mechanism to invalidate all memory regions
+ * that were registered for "req".
+ *
+ * In the asynchronous case, DMA unmapping occurs first here
+ * because the rpcrdma_mr_seg is released immediately after this
+ * call. It's contents won't be available in __fmr_dma_unmap later.
+ * FIXME.
*/
-static int
-fmr_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg)
+static void
+fmr_op_unmap_safe(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
+ bool sync)
{
- struct rpcrdma_ia *ia = &r_xprt->rx_ia;
- struct rpcrdma_mr_seg *seg1 = seg;
- struct rpcrdma_mw *mw = seg1->rl_mw;
- int rc, nsegs = seg->mr_nsegs;
+ struct rpcrdma_mr_seg *seg;
+ struct rpcrdma_mw *mw;
+ unsigned int i;
- dprintk("RPC: %s: FMR %p\n", __func__, mw);
+ for (i = 0; req->rl_nchunks; req->rl_nchunks--) {
+ seg = &req->rl_segments[i];
+ mw = seg->rl_mw;
- seg1->rl_mw = NULL;
- while (seg1->mr_nsegs--)
- rpcrdma_unmap_one(ia->ri_device, seg++);
- rc = __fmr_unmap(mw);
- if (rc)
- goto out_err;
- rpcrdma_put_mw(r_xprt, mw);
- return nsegs;
+ if (sync) {
+ /* ORDER */
+ __fmr_unmap(mw);
+ __fmr_dma_unmap(r_xprt, seg);
+ rpcrdma_put_mw(r_xprt, mw);
+ } else {
+ __fmr_dma_unmap(r_xprt, seg);
+ __fmr_queue_recovery(mw);
+ }
-out_err:
- /* The FMR is abandoned, but remains in rb_all. fmr_op_destroy
- * will attempt to release it when the transport is destroyed.
- */
- dprintk("RPC: %s: ib_unmap_fmr status %i\n", __func__, rc);
- return nsegs;
+ i += seg->mr_nsegs;
+ seg->mr_nsegs = 0;
+ seg->rl_mw = NULL;
+ }
}
static void
@@ -295,7 +351,7 @@ fmr_op_destroy(struct rpcrdma_buffer *buf)
const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops = {
.ro_map = fmr_op_map,
.ro_unmap_sync = fmr_op_unmap_sync,
- .ro_unmap = fmr_op_unmap,
+ .ro_unmap_safe = fmr_op_unmap_safe,
.ro_open = fmr_op_open,
.ro_maxpages = fmr_op_maxpages,
.ro_init = fmr_op_init,
diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index c250924a9..c0947544b 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -98,6 +98,47 @@ frwr_destroy_recovery_wq(void)
destroy_workqueue(wq);
}
+static int
+__frwr_reset_mr(struct rpcrdma_ia *ia, struct rpcrdma_mw *r)
+{
+ struct rpcrdma_frmr *f = &r->frmr;
+ int rc;
+
+ rc = ib_dereg_mr(f->fr_mr);
+ if (rc) {
+ pr_warn("rpcrdma: ib_dereg_mr status %d, frwr %p orphaned\n",
+ rc, r);
+ return rc;
+ }
+
+ f->fr_mr = ib_alloc_mr(ia->ri_pd, IB_MR_TYPE_MEM_REG,
+ ia->ri_max_frmr_depth);
+ if (IS_ERR(f->fr_mr)) {
+ pr_warn("rpcrdma: ib_alloc_mr status %ld, frwr %p orphaned\n",
+ PTR_ERR(f->fr_mr), r);
+ return PTR_ERR(f->fr_mr);
+ }
+
+ dprintk("RPC: %s: recovered FRMR %p\n", __func__, r);
+ f->fr_state = FRMR_IS_INVALID;
+ return 0;
+}
+
+static void
+__frwr_reset_and_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mw *mw)
+{
+ struct rpcrdma_ia *ia = &r_xprt->rx_ia;
+ struct rpcrdma_frmr *f = &mw->frmr;
+ int rc;
+
+ rc = __frwr_reset_mr(ia, mw);
+ ib_dma_unmap_sg(ia->ri_device, f->fr_sg, f->fr_nents, f->fr_dir);
+ if (rc)
+ return;
+
+ rpcrdma_put_mw(r_xprt, mw);
+}
+
/* Deferred reset of a single FRMR. Generate a fresh rkey by
* replacing the MR.
*
@@ -109,26 +150,10 @@ static void
__frwr_recovery_worker(struct work_struct *work)
{
struct rpcrdma_mw *r = container_of(work, struct rpcrdma_mw,
- frmr.fr_work);
- struct rpcrdma_xprt *r_xprt = r->frmr.fr_xprt;
- unsigned int depth = r_xprt->rx_ia.ri_max_frmr_depth;
- struct ib_pd *pd = r_xprt->rx_ia.ri_pd;
-
- if (ib_dereg_mr(r->frmr.fr_mr))
- goto out_fail;
+ mw_work);
- r->frmr.fr_mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, depth);
- if (IS_ERR(r->frmr.fr_mr))
- goto out_fail;
-
- dprintk("RPC: %s: recovered FRMR %p\n", __func__, r);
- r->frmr.fr_state = FRMR_IS_INVALID;
- rpcrdma_put_mw(r_xprt, r);
+ __frwr_reset_and_unmap(r->mw_xprt, r);
return;
-
-out_fail:
- pr_warn("RPC: %s: FRMR %p unrecovered\n",
- __func__, r);
}
/* A broken MR was discovered in a context that can't sleep.
@@ -137,8 +162,8 @@ out_fail:
static void
__frwr_queue_recovery(struct rpcrdma_mw *r)
{
- INIT_WORK(&r->frmr.fr_work, __frwr_recovery_worker);
- queue_work(frwr_recovery_wq, &r->frmr.fr_work);
+ INIT_WORK(&r->mw_work, __frwr_recovery_worker);
+ queue_work(frwr_recovery_wq, &r->mw_work);
}
static int
@@ -152,11 +177,11 @@ __frwr_init(struct rpcrdma_mw *r, struct ib_pd *pd, struct ib_device *device,
if (IS_ERR(f->fr_mr))
goto out_mr_err;
- f->sg = kcalloc(depth, sizeof(*f->sg), GFP_KERNEL);
- if (!f->sg)
+ f->fr_sg = kcalloc(depth, sizeof(*f->fr_sg), GFP_KERNEL);
+ if (!f->fr_sg)
goto out_list_err;
- sg_init_table(f->sg, depth);
+ sg_init_table(f->fr_sg, depth);
init_completion(&f->fr_linv_done);
@@ -185,7 +210,7 @@ __frwr_release(struct rpcrdma_mw *r)
if (rc)
dprintk("RPC: %s: ib_dereg_mr status %i\n",
__func__, rc);
- kfree(r->frmr.sg);
+ kfree(r->frmr.fr_sg);
}
static int
@@ -231,6 +256,9 @@ frwr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
depth;
}
+ rpcrdma_set_max_header_sizes(ia, cdata, max_t(unsigned int, 1,
+ RPCRDMA_MAX_DATA_SEGS /
+ ia->ri_max_frmr_depth));
return 0;
}
@@ -243,7 +271,7 @@ frwr_op_maxpages(struct rpcrdma_xprt *r_xprt)
struct rpcrdma_ia *ia = &r_xprt->rx_ia;
return min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS,
- rpcrdma_max_segments(r_xprt) * ia->ri_max_frmr_depth);
+ RPCRDMA_MAX_HDR_SEGS * ia->ri_max_frmr_depth);
}
static void
@@ -350,9 +378,9 @@ frwr_op_init(struct rpcrdma_xprt *r_xprt)
return rc;
}
+ r->mw_xprt = r_xprt;
list_add(&r->mw_list, &buf->rb_mws);
list_add(&r->mw_all, &buf->rb_all);
- r->frmr.fr_xprt = r_xprt;
}
return 0;
@@ -396,12 +424,12 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
for (i = 0; i < nsegs;) {
if (seg->mr_page)
- sg_set_page(&frmr->sg[i],
+ sg_set_page(&frmr->fr_sg[i],
seg->mr_page,
seg->mr_len,
offset_in_page(seg->mr_offset));
else
- sg_set_buf(&frmr->sg[i], seg->mr_offset,
+ sg_set_buf(&frmr->fr_sg[i], seg->mr_offset,
seg->mr_len);
++seg;
@@ -412,25 +440,26 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
break;
}
- frmr->sg_nents = i;
+ frmr->fr_nents = i;
+ frmr->fr_dir = direction;
- dma_nents = ib_dma_map_sg(device, frmr->sg, frmr->sg_nents, direction);
+ dma_nents = ib_dma_map_sg(device, frmr->fr_sg, frmr->fr_nents, direction);
if (!dma_nents) {
pr_err("RPC: %s: failed to dma map sg %p sg_nents %u\n",
- __func__, frmr->sg, frmr->sg_nents);
+ __func__, frmr->fr_sg, frmr->fr_nents);
return -ENOMEM;
}
- n = ib_map_mr_sg(mr, frmr->sg, frmr->sg_nents, PAGE_SIZE);
- if (unlikely(n != frmr->sg_nents)) {
+ n = ib_map_mr_sg(mr, frmr->fr_sg, frmr->fr_nents, NULL, PAGE_SIZE);
+ if (unlikely(n != frmr->fr_nents)) {
pr_err("RPC: %s: failed to map mr %p (%u/%u)\n",
- __func__, frmr->fr_mr, n, frmr->sg_nents);
+ __func__, frmr->fr_mr, n, frmr->fr_nents);
rc = n < 0 ? n : -EINVAL;
goto out_senderr;
}
dprintk("RPC: %s: Using frmr %p to map %u segments (%u bytes)\n",
- __func__, mw, frmr->sg_nents, mr->length);
+ __func__, mw, frmr->fr_nents, mr->length);
key = (u8)(mr->rkey & 0x000000FF);
ib_update_fast_reg_key(mr, ++key);
@@ -452,18 +481,16 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
if (rc)
goto out_senderr;
- seg1->mr_dir = direction;
seg1->rl_mw = mw;
seg1->mr_rkey = mr->rkey;
seg1->mr_base = mr->iova;
- seg1->mr_nsegs = frmr->sg_nents;
+ seg1->mr_nsegs = frmr->fr_nents;
seg1->mr_len = mr->length;
- return frmr->sg_nents;
+ return frmr->fr_nents;
out_senderr:
dprintk("RPC: %s: ib_post_send status %i\n", __func__, rc);
- ib_dma_unmap_sg(device, frmr->sg, dma_nents, direction);
__frwr_queue_recovery(mw);
return rc;
}
@@ -487,24 +514,6 @@ __frwr_prepare_linv_wr(struct rpcrdma_mr_seg *seg)
return invalidate_wr;
}
-static void
-__frwr_dma_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
- int rc)
-{
- struct ib_device *device = r_xprt->rx_ia.ri_device;
- struct rpcrdma_mw *mw = seg->rl_mw;
- struct rpcrdma_frmr *f = &mw->frmr;
-
- seg->rl_mw = NULL;
-
- ib_dma_unmap_sg(device, f->sg, f->sg_nents, seg->mr_dir);
-
- if (!rc)
- rpcrdma_put_mw(r_xprt, mw);
- else
- __frwr_queue_recovery(mw);
-}
-
/* Invalidate all memory regions that were registered for "req".
*
* Sleeps until it is safe for the host CPU to access the
@@ -518,6 +527,7 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
struct rpcrdma_mr_seg *seg;
unsigned int i, nchunks;
struct rpcrdma_frmr *f;
+ struct rpcrdma_mw *mw;
int rc;
dprintk("RPC: %s: req %p\n", __func__, req);
@@ -558,11 +568,8 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
* unless ri_id->qp is a valid pointer.
*/
rc = ib_post_send(ia->ri_id->qp, invalidate_wrs, &bad_wr);
- if (rc) {
- pr_warn("%s: ib_post_send failed %i\n", __func__, rc);
- rdma_disconnect(ia->ri_id);
- goto unmap;
- }
+ if (rc)
+ goto reset_mrs;
wait_for_completion(&f->fr_linv_done);
@@ -572,56 +579,65 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
unmap:
for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) {
seg = &req->rl_segments[i];
+ mw = seg->rl_mw;
+ seg->rl_mw = NULL;
- __frwr_dma_unmap(r_xprt, seg, rc);
+ ib_dma_unmap_sg(ia->ri_device, f->fr_sg, f->fr_nents,
+ f->fr_dir);
+ rpcrdma_put_mw(r_xprt, mw);
i += seg->mr_nsegs;
seg->mr_nsegs = 0;
}
req->rl_nchunks = 0;
-}
+ return;
-/* Post a LOCAL_INV Work Request to prevent further remote access
- * via RDMA READ or RDMA WRITE.
- */
-static int
-frwr_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg)
-{
- struct rpcrdma_mr_seg *seg1 = seg;
- struct rpcrdma_ia *ia = &r_xprt->rx_ia;
- struct rpcrdma_mw *mw = seg1->rl_mw;
- struct rpcrdma_frmr *frmr = &mw->frmr;
- struct ib_send_wr *invalidate_wr, *bad_wr;
- int rc, nsegs = seg->mr_nsegs;
+reset_mrs:
+ pr_warn("%s: ib_post_send failed %i\n", __func__, rc);
- dprintk("RPC: %s: FRMR %p\n", __func__, mw);
+ /* Find and reset the MRs in the LOCAL_INV WRs that did not
+ * get posted. This is synchronous, and slow.
+ */
+ for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) {
+ seg = &req->rl_segments[i];
+ mw = seg->rl_mw;
+ f = &mw->frmr;
- seg1->rl_mw = NULL;
- frmr->fr_state = FRMR_IS_INVALID;
- invalidate_wr = &mw->frmr.fr_invwr;
+ if (mw->frmr.fr_mr->rkey == bad_wr->ex.invalidate_rkey) {
+ __frwr_reset_mr(ia, mw);
+ bad_wr = bad_wr->next;
+ }
- memset(invalidate_wr, 0, sizeof(*invalidate_wr));
- frmr->fr_cqe.done = frwr_wc_localinv;
- invalidate_wr->wr_cqe = &frmr->fr_cqe;
- invalidate_wr->opcode = IB_WR_LOCAL_INV;
- invalidate_wr->ex.invalidate_rkey = frmr->fr_mr->rkey;
- DECR_CQCOUNT(&r_xprt->rx_ep);
+ i += seg->mr_nsegs;
+ }
+ goto unmap;
+}
- ib_dma_unmap_sg(ia->ri_device, frmr->sg, frmr->sg_nents, seg1->mr_dir);
- read_lock(&ia->ri_qplock);
- rc = ib_post_send(ia->ri_id->qp, invalidate_wr, &bad_wr);
- read_unlock(&ia->ri_qplock);
- if (rc)
- goto out_err;
+/* Use a slow, safe mechanism to invalidate all memory regions
+ * that were registered for "req".
+ */
+static void
+frwr_op_unmap_safe(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
+ bool sync)
+{
+ struct rpcrdma_mr_seg *seg;
+ struct rpcrdma_mw *mw;
+ unsigned int i;
- rpcrdma_put_mw(r_xprt, mw);
- return nsegs;
+ for (i = 0; req->rl_nchunks; req->rl_nchunks--) {
+ seg = &req->rl_segments[i];
+ mw = seg->rl_mw;
-out_err:
- dprintk("RPC: %s: ib_post_send status %i\n", __func__, rc);
- __frwr_queue_recovery(mw);
- return nsegs;
+ if (sync)
+ __frwr_reset_and_unmap(r_xprt, mw);
+ else
+ __frwr_queue_recovery(mw);
+
+ i += seg->mr_nsegs;
+ seg->mr_nsegs = 0;
+ seg->rl_mw = NULL;
+ }
}
static void
@@ -643,7 +659,7 @@ frwr_op_destroy(struct rpcrdma_buffer *buf)
const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops = {
.ro_map = frwr_op_map,
.ro_unmap_sync = frwr_op_unmap_sync,
- .ro_unmap = frwr_op_unmap,
+ .ro_unmap_safe = frwr_op_unmap_safe,
.ro_open = frwr_op_open,
.ro_maxpages = frwr_op_maxpages,
.ro_init = frwr_op_init,
diff --git a/net/sunrpc/xprtrdma/physical_ops.c b/net/sunrpc/xprtrdma/physical_ops.c
index 481b9b6f4..3750596cc 100644
--- a/net/sunrpc/xprtrdma/physical_ops.c
+++ b/net/sunrpc/xprtrdma/physical_ops.c
@@ -36,8 +36,11 @@ physical_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
__func__, PTR_ERR(mr));
return -ENOMEM;
}
-
ia->ri_dma_mr = mr;
+
+ rpcrdma_set_max_header_sizes(ia, cdata, min_t(unsigned int,
+ RPCRDMA_MAX_DATA_SEGS,
+ RPCRDMA_MAX_HDR_SEGS));
return 0;
}
@@ -47,7 +50,7 @@ static size_t
physical_op_maxpages(struct rpcrdma_xprt *r_xprt)
{
return min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS,
- rpcrdma_max_segments(r_xprt));
+ RPCRDMA_MAX_HDR_SEGS);
}
static int
@@ -71,17 +74,6 @@ physical_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
return 1;
}
-/* Unmap a memory region, but leave it registered.
- */
-static int
-physical_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg)
-{
- struct rpcrdma_ia *ia = &r_xprt->rx_ia;
-
- rpcrdma_unmap_one(ia->ri_device, seg);
- return 1;
-}
-
/* DMA unmap all memory regions that were mapped for "req".
*/
static void
@@ -94,6 +86,25 @@ physical_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
rpcrdma_unmap_one(device, &req->rl_segments[i++]);
}
+/* Use a slow, safe mechanism to invalidate all memory regions
+ * that were registered for "req".
+ *
+ * For physical memory registration, there is no good way to
+ * fence a single MR that has been advertised to the server. The
+ * client has already handed the server an R_key that cannot be
+ * invalidated and is shared by all MRs on this connection.
+ * Tearing down the PD might be the only safe choice, but it's
+ * not clear that a freshly acquired DMA R_key would be different
+ * than the one used by the PD that was just destroyed.
+ * FIXME.
+ */
+static void
+physical_op_unmap_safe(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
+ bool sync)
+{
+ physical_op_unmap_sync(r_xprt, req);
+}
+
static void
physical_op_destroy(struct rpcrdma_buffer *buf)
{
@@ -102,7 +113,7 @@ physical_op_destroy(struct rpcrdma_buffer *buf)
const struct rpcrdma_memreg_ops rpcrdma_physical_memreg_ops = {
.ro_map = physical_op_map,
.ro_unmap_sync = physical_op_unmap_sync,
- .ro_unmap = physical_op_unmap,
+ .ro_unmap_safe = physical_op_unmap_safe,
.ro_open = physical_op_open,
.ro_maxpages = physical_op_maxpages,
.ro_init = physical_op_init,
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index 888823bb6..35a81096e 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -61,26 +61,84 @@ enum rpcrdma_chunktype {
rpcrdma_replych
};
-#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
static const char transfertypes[][12] = {
- "pure inline", /* no chunks */
- " read chunk", /* some argument via rdma read */
- "*read chunk", /* entire request via rdma read */
- "write chunk", /* some result via rdma write */
+ "inline", /* no chunks */
+ "read list", /* some argument via rdma read */
+ "*read list", /* entire request via rdma read */
+ "write list", /* some result via rdma write */
"reply chunk" /* entire reply via rdma write */
};
-#endif
+
+/* Returns size of largest RPC-over-RDMA header in a Call message
+ *
+ * The largest Call header contains a full-size Read list and a
+ * minimal Reply chunk.
+ */
+static unsigned int rpcrdma_max_call_header_size(unsigned int maxsegs)
+{
+ unsigned int size;
+
+ /* Fixed header fields and list discriminators */
+ size = RPCRDMA_HDRLEN_MIN;
+
+ /* Maximum Read list size */
+ maxsegs += 2; /* segment for head and tail buffers */
+ size = maxsegs * sizeof(struct rpcrdma_read_chunk);
+
+ /* Minimal Read chunk size */
+ size += sizeof(__be32); /* segment count */
+ size += sizeof(struct rpcrdma_segment);
+ size += sizeof(__be32); /* list discriminator */
+
+ dprintk("RPC: %s: max call header size = %u\n",
+ __func__, size);
+ return size;
+}
+
+/* Returns size of largest RPC-over-RDMA header in a Reply message
+ *
+ * There is only one Write list or one Reply chunk per Reply
+ * message. The larger list is the Write list.
+ */
+static unsigned int rpcrdma_max_reply_header_size(unsigned int maxsegs)
+{
+ unsigned int size;
+
+ /* Fixed header fields and list discriminators */
+ size = RPCRDMA_HDRLEN_MIN;
+
+ /* Maximum Write list size */
+ maxsegs += 2; /* segment for head and tail buffers */
+ size = sizeof(__be32); /* segment count */
+ size += maxsegs * sizeof(struct rpcrdma_segment);
+ size += sizeof(__be32); /* list discriminator */
+
+ dprintk("RPC: %s: max reply header size = %u\n",
+ __func__, size);
+ return size;
+}
+
+void rpcrdma_set_max_header_sizes(struct rpcrdma_ia *ia,
+ struct rpcrdma_create_data_internal *cdata,
+ unsigned int maxsegs)
+{
+ ia->ri_max_inline_write = cdata->inline_wsize -
+ rpcrdma_max_call_header_size(maxsegs);
+ ia->ri_max_inline_read = cdata->inline_rsize -
+ rpcrdma_max_reply_header_size(maxsegs);
+}
/* The client can send a request inline as long as the RPCRDMA header
* plus the RPC call fit under the transport's inline limit. If the
* combined call message size exceeds that limit, the client must use
* the read chunk list for this operation.
*/
-static bool rpcrdma_args_inline(struct rpc_rqst *rqst)
+static bool rpcrdma_args_inline(struct rpcrdma_xprt *r_xprt,
+ struct rpc_rqst *rqst)
{
- unsigned int callsize = RPCRDMA_HDRLEN_MIN + rqst->rq_snd_buf.len;
+ struct rpcrdma_ia *ia = &r_xprt->rx_ia;
- return callsize <= RPCRDMA_INLINE_WRITE_THRESHOLD(rqst);
+ return rqst->rq_snd_buf.len <= ia->ri_max_inline_write;
}
/* The client can't know how large the actual reply will be. Thus it
@@ -89,11 +147,12 @@ static bool rpcrdma_args_inline(struct rpc_rqst *rqst)
* limit, the client must provide a write list or a reply chunk for
* this request.
*/
-static bool rpcrdma_results_inline(struct rpc_rqst *rqst)
+static bool rpcrdma_results_inline(struct rpcrdma_xprt *r_xprt,
+ struct rpc_rqst *rqst)
{
- unsigned int repsize = RPCRDMA_HDRLEN_MIN + rqst->rq_rcv_buf.buflen;
+ struct rpcrdma_ia *ia = &r_xprt->rx_ia;
- return repsize <= RPCRDMA_INLINE_READ_THRESHOLD(rqst);
+ return rqst->rq_rcv_buf.buflen <= ia->ri_max_inline_read;
}
static int
@@ -226,23 +285,16 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos,
return n;
}
-/*
- * Create read/write chunk lists, and reply chunks, for RDMA
- *
- * Assume check against THRESHOLD has been done, and chunks are required.
- * Assume only encoding one list entry for read|write chunks. The NFSv3
- * protocol is simple enough to allow this as it only has a single "bulk
- * result" in each procedure - complicated NFSv4 COMPOUNDs are not. (The
- * RDMA/Sessions NFSv4 proposal addresses this for future v4 revs.)
- *
- * When used for a single reply chunk (which is a special write
- * chunk used for the entire reply, rather than just the data), it
- * is used primarily for READDIR and READLINK which would otherwise
- * be severely size-limited by a small rdma inline read max. The server
- * response will come back as an RDMA Write, followed by a message
- * of type RDMA_NOMSG carrying the xid and length. As a result, reply
- * chunks do not provide data alignment, however they do not require
- * "fixup" (moving the response to the upper layer buffer) either.
+static inline __be32 *
+xdr_encode_rdma_segment(__be32 *iptr, struct rpcrdma_mr_seg *seg)
+{
+ *iptr++ = cpu_to_be32(seg->mr_rkey);
+ *iptr++ = cpu_to_be32(seg->mr_len);
+ return xdr_encode_hyper(iptr, seg->mr_base);
+}
+
+/* XDR-encode the Read list. Supports encoding a list of read
+ * segments that belong to a single read chunk.
*
* Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64):
*
@@ -250,131 +302,190 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos,
* N elements, position P (same P for all chunks of same arg!):
* 1 - PHLOO - 1 - PHLOO - ... - 1 - PHLOO - 0
*
+ * Returns a pointer to the XDR word in the RDMA header following
+ * the end of the Read list, or an error pointer.
+ */
+static __be32 *
+rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt,
+ struct rpcrdma_req *req, struct rpc_rqst *rqst,
+ __be32 *iptr, enum rpcrdma_chunktype rtype)
+{
+ struct rpcrdma_mr_seg *seg = req->rl_nextseg;
+ unsigned int pos;
+ int n, nsegs;
+
+ if (rtype == rpcrdma_noch) {
+ *iptr++ = xdr_zero; /* item not present */
+ return iptr;
+ }
+
+ pos = rqst->rq_snd_buf.head[0].iov_len;
+ if (rtype == rpcrdma_areadch)
+ pos = 0;
+ nsegs = rpcrdma_convert_iovs(&rqst->rq_snd_buf, pos, rtype, seg,
+ RPCRDMA_MAX_SEGS - req->rl_nchunks);
+ if (nsegs < 0)
+ return ERR_PTR(nsegs);
+
+ do {
+ n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, false);
+ if (n <= 0)
+ return ERR_PTR(n);
+
+ *iptr++ = xdr_one; /* item present */
+
+ /* All read segments in this chunk
+ * have the same "position".
+ */
+ *iptr++ = cpu_to_be32(pos);
+ iptr = xdr_encode_rdma_segment(iptr, seg);
+
+ dprintk("RPC: %5u %s: read segment pos %u "
+ "%d@0x%016llx:0x%08x (%s)\n",
+ rqst->rq_task->tk_pid, __func__, pos,
+ seg->mr_len, (unsigned long long)seg->mr_base,
+ seg->mr_rkey, n < nsegs ? "more" : "last");
+
+ r_xprt->rx_stats.read_chunk_count++;
+ req->rl_nchunks++;
+ seg += n;
+ nsegs -= n;
+ } while (nsegs);
+ req->rl_nextseg = seg;
+
+ /* Finish Read list */
+ *iptr++ = xdr_zero; /* Next item not present */
+ return iptr;
+}
+
+/* XDR-encode the Write list. Supports encoding a list containing
+ * one array of plain segments that belong to a single write chunk.
+ *
+ * Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64):
+ *
* Write chunklist (a list of (one) counted array):
* N elements:
* 1 - N - HLOO - HLOO - ... - HLOO - 0
*
+ * Returns a pointer to the XDR word in the RDMA header following
+ * the end of the Write list, or an error pointer.
+ */
+static __be32 *
+rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
+ struct rpc_rqst *rqst, __be32 *iptr,
+ enum rpcrdma_chunktype wtype)
+{
+ struct rpcrdma_mr_seg *seg = req->rl_nextseg;
+ int n, nsegs, nchunks;
+ __be32 *segcount;
+
+ if (wtype != rpcrdma_writech) {
+ *iptr++ = xdr_zero; /* no Write list present */
+ return iptr;
+ }
+
+ nsegs = rpcrdma_convert_iovs(&rqst->rq_rcv_buf,
+ rqst->rq_rcv_buf.head[0].iov_len,
+ wtype, seg,
+ RPCRDMA_MAX_SEGS - req->rl_nchunks);
+ if (nsegs < 0)
+ return ERR_PTR(nsegs);
+
+ *iptr++ = xdr_one; /* Write list present */
+ segcount = iptr++; /* save location of segment count */
+
+ nchunks = 0;
+ do {
+ n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, true);
+ if (n <= 0)
+ return ERR_PTR(n);
+
+ iptr = xdr_encode_rdma_segment(iptr, seg);
+
+ dprintk("RPC: %5u %s: write segment "
+ "%d@0x016%llx:0x%08x (%s)\n",
+ rqst->rq_task->tk_pid, __func__,
+ seg->mr_len, (unsigned long long)seg->mr_base,
+ seg->mr_rkey, n < nsegs ? "more" : "last");
+
+ r_xprt->rx_stats.write_chunk_count++;
+ r_xprt->rx_stats.total_rdma_request += seg->mr_len;
+ req->rl_nchunks++;
+ nchunks++;
+ seg += n;
+ nsegs -= n;
+ } while (nsegs);
+ req->rl_nextseg = seg;
+
+ /* Update count of segments in this Write chunk */
+ *segcount = cpu_to_be32(nchunks);
+
+ /* Finish Write list */
+ *iptr++ = xdr_zero; /* Next item not present */
+ return iptr;
+}
+
+/* XDR-encode the Reply chunk. Supports encoding an array of plain
+ * segments that belong to a single write (reply) chunk.
+ *
+ * Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64):
+ *
* Reply chunk (a counted array):
* N elements:
* 1 - N - HLOO - HLOO - ... - HLOO
*
- * Returns positive RPC/RDMA header size, or negative errno.
+ * Returns a pointer to the XDR word in the RDMA header following
+ * the end of the Reply chunk, or an error pointer.
*/
-
-static ssize_t
-rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target,
- struct rpcrdma_msg *headerp, enum rpcrdma_chunktype type)
+static __be32 *
+rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt,
+ struct rpcrdma_req *req, struct rpc_rqst *rqst,
+ __be32 *iptr, enum rpcrdma_chunktype wtype)
{
- struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
- struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt);
- int n, nsegs, nchunks = 0;
- unsigned int pos;
- struct rpcrdma_mr_seg *seg = req->rl_segments;
- struct rpcrdma_read_chunk *cur_rchunk = NULL;
- struct rpcrdma_write_array *warray = NULL;
- struct rpcrdma_write_chunk *cur_wchunk = NULL;
- __be32 *iptr = headerp->rm_body.rm_chunks;
- int (*map)(struct rpcrdma_xprt *, struct rpcrdma_mr_seg *, int, bool);
-
- if (type == rpcrdma_readch || type == rpcrdma_areadch) {
- /* a read chunk - server will RDMA Read our memory */
- cur_rchunk = (struct rpcrdma_read_chunk *) iptr;
- } else {
- /* a write or reply chunk - server will RDMA Write our memory */
- *iptr++ = xdr_zero; /* encode a NULL read chunk list */
- if (type == rpcrdma_replych)
- *iptr++ = xdr_zero; /* a NULL write chunk list */
- warray = (struct rpcrdma_write_array *) iptr;
- cur_wchunk = (struct rpcrdma_write_chunk *) (warray + 1);
- }
+ struct rpcrdma_mr_seg *seg = req->rl_nextseg;
+ int n, nsegs, nchunks;
+ __be32 *segcount;
- if (type == rpcrdma_replych || type == rpcrdma_areadch)
- pos = 0;
- else
- pos = target->head[0].iov_len;
+ if (wtype != rpcrdma_replych) {
+ *iptr++ = xdr_zero; /* no Reply chunk present */
+ return iptr;
+ }
- nsegs = rpcrdma_convert_iovs(target, pos, type, seg, RPCRDMA_MAX_SEGS);
+ nsegs = rpcrdma_convert_iovs(&rqst->rq_rcv_buf, 0, wtype, seg,
+ RPCRDMA_MAX_SEGS - req->rl_nchunks);
if (nsegs < 0)
- return nsegs;
+ return ERR_PTR(nsegs);
- map = r_xprt->rx_ia.ri_ops->ro_map;
+ *iptr++ = xdr_one; /* Reply chunk present */
+ segcount = iptr++; /* save location of segment count */
+
+ nchunks = 0;
do {
- n = map(r_xprt, seg, nsegs, cur_wchunk != NULL);
+ n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, true);
if (n <= 0)
- goto out;
- if (cur_rchunk) { /* read */
- cur_rchunk->rc_discrim = xdr_one;
- /* all read chunks have the same "position" */
- cur_rchunk->rc_position = cpu_to_be32(pos);
- cur_rchunk->rc_target.rs_handle =
- cpu_to_be32(seg->mr_rkey);
- cur_rchunk->rc_target.rs_length =
- cpu_to_be32(seg->mr_len);
- xdr_encode_hyper(
- (__be32 *)&cur_rchunk->rc_target.rs_offset,
- seg->mr_base);
- dprintk("RPC: %s: read chunk "
- "elem %d@0x%llx:0x%x pos %u (%s)\n", __func__,
- seg->mr_len, (unsigned long long)seg->mr_base,
- seg->mr_rkey, pos, n < nsegs ? "more" : "last");
- cur_rchunk++;
- r_xprt->rx_stats.read_chunk_count++;
- } else { /* write/reply */
- cur_wchunk->wc_target.rs_handle =
- cpu_to_be32(seg->mr_rkey);
- cur_wchunk->wc_target.rs_length =
- cpu_to_be32(seg->mr_len);
- xdr_encode_hyper(
- (__be32 *)&cur_wchunk->wc_target.rs_offset,
- seg->mr_base);
- dprintk("RPC: %s: %s chunk "
- "elem %d@0x%llx:0x%x (%s)\n", __func__,
- (type == rpcrdma_replych) ? "reply" : "write",
- seg->mr_len, (unsigned long long)seg->mr_base,
- seg->mr_rkey, n < nsegs ? "more" : "last");
- cur_wchunk++;
- if (type == rpcrdma_replych)
- r_xprt->rx_stats.reply_chunk_count++;
- else
- r_xprt->rx_stats.write_chunk_count++;
- r_xprt->rx_stats.total_rdma_request += seg->mr_len;
- }
+ return ERR_PTR(n);
+
+ iptr = xdr_encode_rdma_segment(iptr, seg);
+
+ dprintk("RPC: %5u %s: reply segment "
+ "%d@0x%016llx:0x%08x (%s)\n",
+ rqst->rq_task->tk_pid, __func__,
+ seg->mr_len, (unsigned long long)seg->mr_base,
+ seg->mr_rkey, n < nsegs ? "more" : "last");
+
+ r_xprt->rx_stats.reply_chunk_count++;
+ r_xprt->rx_stats.total_rdma_request += seg->mr_len;
+ req->rl_nchunks++;
nchunks++;
seg += n;
nsegs -= n;
} while (nsegs);
+ req->rl_nextseg = seg;
- /* success. all failures return above */
- req->rl_nchunks = nchunks;
-
- /*
- * finish off header. If write, marshal discrim and nchunks.
- */
- if (cur_rchunk) {
- iptr = (__be32 *) cur_rchunk;
- *iptr++ = xdr_zero; /* finish the read chunk list */
- *iptr++ = xdr_zero; /* encode a NULL write chunk list */
- *iptr++ = xdr_zero; /* encode a NULL reply chunk */
- } else {
- warray->wc_discrim = xdr_one;
- warray->wc_nchunks = cpu_to_be32(nchunks);
- iptr = (__be32 *) cur_wchunk;
- if (type == rpcrdma_writech) {
- *iptr++ = xdr_zero; /* finish the write chunk list */
- *iptr++ = xdr_zero; /* encode a NULL reply chunk */
- }
- }
-
- /*
- * Return header size.
- */
- return (unsigned char *)iptr - (unsigned char *)headerp;
+ /* Update count of segments in the Reply chunk */
+ *segcount = cpu_to_be32(nchunks);
-out:
- for (pos = 0; nchunks--;)
- pos += r_xprt->rx_ia.ri_ops->ro_unmap(r_xprt,
- &req->rl_segments[pos]);
- return n;
+ return iptr;
}
/*
@@ -440,13 +551,10 @@ static void rpcrdma_inline_pullup(struct rpc_rqst *rqst)
* Marshal a request: the primary job of this routine is to choose
* the transfer modes. See comments below.
*
- * Uses multiple RDMA IOVs for a request:
- * [0] -- RPC RDMA header, which uses memory from the *start* of the
- * preregistered buffer that already holds the RPC data in
- * its middle.
- * [1] -- the RPC header/data, marshaled by RPC and the NFS protocol.
- * [2] -- optional padding.
- * [3] -- if padded, header only in [1] and data here.
+ * Prepares up to two IOVs per Call message:
+ *
+ * [0] -- RPC RDMA header
+ * [1] -- the RPC header/data
*
* Returns zero on success, otherwise a negative errno.
*/
@@ -457,24 +565,17 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
struct rpc_xprt *xprt = rqst->rq_xprt;
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
- char *base;
- size_t rpclen;
- ssize_t hdrlen;
enum rpcrdma_chunktype rtype, wtype;
struct rpcrdma_msg *headerp;
+ ssize_t hdrlen;
+ size_t rpclen;
+ __be32 *iptr;
#if defined(CONFIG_SUNRPC_BACKCHANNEL)
if (test_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state))
return rpcrdma_bc_marshal_reply(rqst);
#endif
- /*
- * rpclen gets amount of data in first buffer, which is the
- * pre-registered buffer.
- */
- base = rqst->rq_svec[0].iov_base;
- rpclen = rqst->rq_svec[0].iov_len;
-
headerp = rdmab_to_msg(req->rl_rdmabuf);
/* don't byte-swap XID, it's already done in request */
headerp->rm_xid = rqst->rq_xid;
@@ -485,15 +586,16 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
/*
* Chunks needed for results?
*
- * o Read ops return data as write chunk(s), header as inline.
* o If the expected result is under the inline threshold, all ops
* return as inline.
+ * o Large read ops return data as write chunk(s), header as
+ * inline.
* o Large non-read ops return as a single reply chunk.
*/
- if (rqst->rq_rcv_buf.flags & XDRBUF_READ)
- wtype = rpcrdma_writech;
- else if (rpcrdma_results_inline(rqst))
+ if (rpcrdma_results_inline(r_xprt, rqst))
wtype = rpcrdma_noch;
+ else if (rqst->rq_rcv_buf.flags & XDRBUF_READ)
+ wtype = rpcrdma_writech;
else
wtype = rpcrdma_replych;
@@ -511,10 +613,14 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
* that both has a data payload, and whose non-data arguments
* by themselves are larger than the inline threshold.
*/
- if (rpcrdma_args_inline(rqst)) {
+ if (rpcrdma_args_inline(r_xprt, rqst)) {
rtype = rpcrdma_noch;
+ rpcrdma_inline_pullup(rqst);
+ rpclen = rqst->rq_svec[0].iov_len;
} else if (rqst->rq_snd_buf.flags & XDRBUF_WRITE) {
rtype = rpcrdma_readch;
+ rpclen = rqst->rq_svec[0].iov_len;
+ rpclen += rpcrdma_tail_pullup(&rqst->rq_snd_buf);
} else {
r_xprt->rx_stats.nomsg_call_count++;
headerp->rm_type = htonl(RDMA_NOMSG);
@@ -522,57 +628,50 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
rpclen = 0;
}
- /* The following simplification is not true forever */
- if (rtype != rpcrdma_noch && wtype == rpcrdma_replych)
- wtype = rpcrdma_noch;
- if (rtype != rpcrdma_noch && wtype != rpcrdma_noch) {
- dprintk("RPC: %s: cannot marshal multiple chunk lists\n",
- __func__);
- return -EIO;
- }
-
- hdrlen = RPCRDMA_HDRLEN_MIN;
-
- /*
- * Pull up any extra send data into the preregistered buffer.
- * When padding is in use and applies to the transfer, insert
- * it and change the message type.
+ /* This implementation supports the following combinations
+ * of chunk lists in one RPC-over-RDMA Call message:
+ *
+ * - Read list
+ * - Write list
+ * - Reply chunk
+ * - Read list + Reply chunk
+ *
+ * It might not yet support the following combinations:
+ *
+ * - Read list + Write list
+ *
+ * It does not support the following combinations:
+ *
+ * - Write list + Reply chunk
+ * - Read list + Write list + Reply chunk
+ *
+ * This implementation supports only a single chunk in each
+ * Read or Write list. Thus for example the client cannot
+ * send a Call message with a Position Zero Read chunk and a
+ * regular Read chunk at the same time.
*/
- if (rtype == rpcrdma_noch) {
-
- rpcrdma_inline_pullup(rqst);
-
- headerp->rm_body.rm_nochunks.rm_empty[0] = xdr_zero;
- headerp->rm_body.rm_nochunks.rm_empty[1] = xdr_zero;
- headerp->rm_body.rm_nochunks.rm_empty[2] = xdr_zero;
- /* new length after pullup */
- rpclen = rqst->rq_svec[0].iov_len;
- } else if (rtype == rpcrdma_readch)
- rpclen += rpcrdma_tail_pullup(&rqst->rq_snd_buf);
- if (rtype != rpcrdma_noch) {
- hdrlen = rpcrdma_create_chunks(rqst, &rqst->rq_snd_buf,
- headerp, rtype);
- wtype = rtype; /* simplify dprintk */
-
- } else if (wtype != rpcrdma_noch) {
- hdrlen = rpcrdma_create_chunks(rqst, &rqst->rq_rcv_buf,
- headerp, wtype);
- }
- if (hdrlen < 0)
- return hdrlen;
+ req->rl_nchunks = 0;
+ req->rl_nextseg = req->rl_segments;
+ iptr = headerp->rm_body.rm_chunks;
+ iptr = rpcrdma_encode_read_list(r_xprt, req, rqst, iptr, rtype);
+ if (IS_ERR(iptr))
+ goto out_unmap;
+ iptr = rpcrdma_encode_write_list(r_xprt, req, rqst, iptr, wtype);
+ if (IS_ERR(iptr))
+ goto out_unmap;
+ iptr = rpcrdma_encode_reply_chunk(r_xprt, req, rqst, iptr, wtype);
+ if (IS_ERR(iptr))
+ goto out_unmap;
+ hdrlen = (unsigned char *)iptr - (unsigned char *)headerp;
+
+ if (hdrlen + rpclen > RPCRDMA_INLINE_WRITE_THRESHOLD(rqst))
+ goto out_overflow;
+
+ dprintk("RPC: %5u %s: %s/%s: hdrlen %zd rpclen %zd\n",
+ rqst->rq_task->tk_pid, __func__,
+ transfertypes[rtype], transfertypes[wtype],
+ hdrlen, rpclen);
- dprintk("RPC: %s: %s: hdrlen %zd rpclen %zd"
- " headerp 0x%p base 0x%p lkey 0x%x\n",
- __func__, transfertypes[wtype], hdrlen, rpclen,
- headerp, base, rdmab_lkey(req->rl_rdmabuf));
-
- /*
- * initialize send_iov's - normally only two: rdma chunk header and
- * single preregistered RPC header buffer, but if padding is present,
- * then use a preregistered (and zeroed) pad buffer between the RPC
- * header and any write data. In all non-rdma cases, any following
- * data has been copied into the RPC header buffer.
- */
req->rl_send_iov[0].addr = rdmab_addr(req->rl_rdmabuf);
req->rl_send_iov[0].length = hdrlen;
req->rl_send_iov[0].lkey = rdmab_lkey(req->rl_rdmabuf);
@@ -587,6 +686,18 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
req->rl_niovs = 2;
return 0;
+
+out_overflow:
+ pr_err("rpcrdma: send overflow: hdrlen %zd rpclen %zu %s/%s\n",
+ hdrlen, rpclen, transfertypes[rtype], transfertypes[wtype]);
+ /* Terminate this RPC. Chunks registered above will be
+ * released by xprt_release -> xprt_rmda_free .
+ */
+ return -EIO;
+
+out_unmap:
+ r_xprt->rx_ia.ri_ops->ro_unmap_safe(r_xprt, req, false);
+ return PTR_ERR(iptr);
}
/*
diff --git a/net/sunrpc/xprtrdma/svc_rdma_marshal.c b/net/sunrpc/xprtrdma/svc_rdma_marshal.c
index 765bca47c..0ba9887f3 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_marshal.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_marshal.c
@@ -145,19 +145,32 @@ static __be32 *decode_reply_array(__be32 *va, __be32 *vaend)
return (__be32 *)&ary->wc_array[nchunks];
}
-int svc_rdma_xdr_decode_req(struct rpcrdma_msg *rmsgp, struct svc_rqst *rqstp)
+/**
+ * svc_rdma_xdr_decode_req - Parse incoming RPC-over-RDMA header
+ * @rq_arg: Receive buffer
+ *
+ * On entry, xdr->head[0].iov_base points to first byte in the
+ * RPC-over-RDMA header.
+ *
+ * On successful exit, head[0] points to first byte past the
+ * RPC-over-RDMA header. For RDMA_MSG, this is the RPC message.
+ * The length of the RPC-over-RDMA header is returned.
+ */
+int svc_rdma_xdr_decode_req(struct xdr_buf *rq_arg)
{
+ struct rpcrdma_msg *rmsgp;
__be32 *va, *vaend;
unsigned int len;
u32 hdr_len;
/* Verify that there's enough bytes for header + something */
- if (rqstp->rq_arg.len <= RPCRDMA_HDRLEN_ERR) {
+ if (rq_arg->len <= RPCRDMA_HDRLEN_ERR) {
dprintk("svcrdma: header too short = %d\n",
- rqstp->rq_arg.len);
+ rq_arg->len);
return -EINVAL;
}
+ rmsgp = (struct rpcrdma_msg *)rq_arg->head[0].iov_base;
if (rmsgp->rm_vers != rpcrdma_version) {
dprintk("%s: bad version %u\n", __func__,
be32_to_cpu(rmsgp->rm_vers));
@@ -189,10 +202,10 @@ int svc_rdma_xdr_decode_req(struct rpcrdma_msg *rmsgp, struct svc_rqst *rqstp)
be32_to_cpu(rmsgp->rm_body.rm_padded.rm_thresh);
va = &rmsgp->rm_body.rm_padded.rm_pempty[4];
- rqstp->rq_arg.head[0].iov_base = va;
+ rq_arg->head[0].iov_base = va;
len = (u32)((unsigned long)va - (unsigned long)rmsgp);
- rqstp->rq_arg.head[0].iov_len -= len;
- if (len > rqstp->rq_arg.len)
+ rq_arg->head[0].iov_len -= len;
+ if (len > rq_arg->len)
return -EINVAL;
return len;
default:
@@ -205,7 +218,7 @@ int svc_rdma_xdr_decode_req(struct rpcrdma_msg *rmsgp, struct svc_rqst *rqstp)
* chunk list and a reply chunk list.
*/
va = &rmsgp->rm_body.rm_chunks[0];
- vaend = (__be32 *)((unsigned long)rmsgp + rqstp->rq_arg.len);
+ vaend = (__be32 *)((unsigned long)rmsgp + rq_arg->len);
va = decode_read_list(va, vaend);
if (!va) {
dprintk("svcrdma: failed to decode read list\n");
@@ -222,10 +235,9 @@ int svc_rdma_xdr_decode_req(struct rpcrdma_msg *rmsgp, struct svc_rqst *rqstp)
return -EINVAL;
}
- rqstp->rq_arg.head[0].iov_base = va;
+ rq_arg->head[0].iov_base = va;
hdr_len = (unsigned long)va - (unsigned long)rmsgp;
- rqstp->rq_arg.head[0].iov_len -= hdr_len;
-
+ rq_arg->head[0].iov_len -= hdr_len;
return hdr_len;
}
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index 3b24a646e..2c25606f2 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -281,7 +281,7 @@ int rdma_read_chunk_frmr(struct svcxprt_rdma *xprt,
}
atomic_inc(&xprt->sc_dma_used);
- n = ib_map_mr_sg(frmr->mr, frmr->sg, frmr->sg_nents, PAGE_SIZE);
+ n = ib_map_mr_sg(frmr->mr, frmr->sg, frmr->sg_nents, NULL, PAGE_SIZE);
if (unlikely(n != frmr->sg_nents)) {
pr_err("svcrdma: failed to map mr %p (%d/%d elements)\n",
frmr->mr, n, frmr->sg_nents);
@@ -447,10 +447,8 @@ static int rdma_read_chunks(struct svcxprt_rdma *xprt,
head->arg.len = rqstp->rq_arg.len;
head->arg.buflen = rqstp->rq_arg.buflen;
- ch = (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0];
- position = be32_to_cpu(ch->rc_position);
-
/* RDMA_NOMSG: RDMA READ data should land just after RDMA RECV data */
+ position = be32_to_cpu(ch->rc_position);
if (position == 0) {
head->arg.pages = &head->pages[0];
page_offset = head->byte_len;
@@ -488,7 +486,7 @@ static int rdma_read_chunks(struct svcxprt_rdma *xprt,
if (page_offset & 3) {
u32 pad = 4 - (page_offset & 3);
- head->arg.page_len += pad;
+ head->arg.tail[0].iov_len += pad;
head->arg.len += pad;
head->arg.buflen += pad;
page_offset += pad;
@@ -510,11 +508,10 @@ static int rdma_read_chunks(struct svcxprt_rdma *xprt,
return ret;
}
-static int rdma_read_complete(struct svc_rqst *rqstp,
- struct svc_rdma_op_ctxt *head)
+static void rdma_read_complete(struct svc_rqst *rqstp,
+ struct svc_rdma_op_ctxt *head)
{
int page_no;
- int ret;
/* Copy RPC pages */
for (page_no = 0; page_no < head->count; page_no++) {
@@ -550,23 +547,6 @@ static int rdma_read_complete(struct svc_rqst *rqstp,
rqstp->rq_arg.tail[0] = head->arg.tail[0];
rqstp->rq_arg.len = head->arg.len;
rqstp->rq_arg.buflen = head->arg.buflen;
-
- /* Free the context */
- svc_rdma_put_context(head, 0);
-
- /* XXX: What should this be? */
- rqstp->rq_prot = IPPROTO_MAX;
- svc_xprt_copy_addrs(rqstp, rqstp->rq_xprt);
-
- ret = rqstp->rq_arg.head[0].iov_len
- + rqstp->rq_arg.page_len
- + rqstp->rq_arg.tail[0].iov_len;
- dprintk("svcrdma: deferred read ret=%d, rq_arg.len=%u, "
- "rq_arg.head[0].iov_base=%p, rq_arg.head[0].iov_len=%zu\n",
- ret, rqstp->rq_arg.len, rqstp->rq_arg.head[0].iov_base,
- rqstp->rq_arg.head[0].iov_len);
-
- return ret;
}
/* By convention, backchannel calls arrive via rdma_msg type
@@ -624,7 +604,8 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
dto_q);
list_del_init(&ctxt->dto_q);
spin_unlock_bh(&rdma_xprt->sc_rq_dto_lock);
- return rdma_read_complete(rqstp, ctxt);
+ rdma_read_complete(rqstp, ctxt);
+ goto complete;
} else if (!list_empty(&rdma_xprt->sc_rq_dto_q)) {
ctxt = list_entry(rdma_xprt->sc_rq_dto_q.next,
struct svc_rdma_op_ctxt,
@@ -655,7 +636,7 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
/* Decode the RDMA header. */
rmsgp = (struct rpcrdma_msg *)rqstp->rq_arg.head[0].iov_base;
- ret = svc_rdma_xdr_decode_req(rmsgp, rqstp);
+ ret = svc_rdma_xdr_decode_req(&rqstp->rq_arg);
if (ret < 0)
goto out_err;
if (ret == 0)
@@ -682,6 +663,7 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
return 0;
}
+complete:
ret = rqstp->rq_arg.head[0].iov_len
+ rqstp->rq_arg.page_len
+ rqstp->rq_arg.tail[0].iov_len;
diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
index 4f1b1c4f4..54d533300 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
@@ -463,25 +463,21 @@ static int send_reply(struct svcxprt_rdma *rdma,
struct svc_rqst *rqstp,
struct page *page,
struct rpcrdma_msg *rdma_resp,
- struct svc_rdma_op_ctxt *ctxt,
struct svc_rdma_req_map *vec,
int byte_count)
{
+ struct svc_rdma_op_ctxt *ctxt;
struct ib_send_wr send_wr;
u32 xdr_off;
int sge_no;
int sge_bytes;
int page_no;
int pages;
- int ret;
-
- ret = svc_rdma_repost_recv(rdma, GFP_KERNEL);
- if (ret) {
- svc_rdma_put_context(ctxt, 0);
- return -ENOTCONN;
- }
+ int ret = -EIO;
/* Prepare the context */
+ ctxt = svc_rdma_get_context(rdma);
+ ctxt->direction = DMA_TO_DEVICE;
ctxt->pages[0] = page;
ctxt->count = 1;
@@ -565,8 +561,7 @@ static int send_reply(struct svcxprt_rdma *rdma,
err:
svc_rdma_unmap_dma(ctxt);
svc_rdma_put_context(ctxt, 1);
- pr_err("svcrdma: failed to send reply, rc=%d\n", ret);
- return -EIO;
+ return ret;
}
void svc_rdma_prep_reply_hdr(struct svc_rqst *rqstp)
@@ -585,7 +580,6 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
int ret;
int inline_bytes;
struct page *res_page;
- struct svc_rdma_op_ctxt *ctxt;
struct svc_rdma_req_map *vec;
dprintk("svcrdma: sending response for rqstp=%p\n", rqstp);
@@ -598,8 +592,6 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
rp_ary = svc_rdma_get_reply_array(rdma_argp, wr_ary);
/* Build an req vec for the XDR */
- ctxt = svc_rdma_get_context(rdma);
- ctxt->direction = DMA_TO_DEVICE;
vec = svc_rdma_get_req_map(rdma);
ret = svc_rdma_map_xdr(rdma, &rqstp->rq_res, vec, wr_ary != NULL);
if (ret)
@@ -635,7 +627,12 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
inline_bytes -= ret;
}
- ret = send_reply(rdma, rqstp, res_page, rdma_resp, ctxt, vec,
+ /* Post a fresh Receive buffer _before_ sending the reply */
+ ret = svc_rdma_post_recv(rdma, GFP_KERNEL);
+ if (ret)
+ goto err1;
+
+ ret = send_reply(rdma, rqstp, res_page, rdma_resp, vec,
inline_bytes);
if (ret < 0)
goto err1;
@@ -648,7 +645,8 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
put_page(res_page);
err0:
svc_rdma_put_req_map(rdma, vec);
- svc_rdma_put_context(ctxt, 0);
+ pr_err("svcrdma: Could not send reply, err=%d. Closing transport.\n",
+ ret);
set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags);
return -ENOTCONN;
}
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index 90668969d..dd9440137 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -789,7 +789,7 @@ static struct svc_xprt *svc_rdma_create(struct svc_serv *serv,
int ret;
dprintk("svcrdma: Creating RDMA socket\n");
- if (sa->sa_family != AF_INET) {
+ if ((sa->sa_family != AF_INET) && (sa->sa_family != AF_INET6)) {
dprintk("svcrdma: Address family %d is not supported.\n", sa->sa_family);
return ERR_PTR(-EAFNOSUPPORT);
}
@@ -805,6 +805,16 @@ static struct svc_xprt *svc_rdma_create(struct svc_serv *serv,
goto err0;
}
+ /* Allow both IPv4 and IPv6 sockets to bind a single port
+ * at the same time.
+ */
+#if IS_ENABLED(CONFIG_IPV6)
+ ret = rdma_set_afonly(listen_id, 1);
+ if (ret) {
+ dprintk("svcrdma: rdma_set_afonly failed = %d\n", ret);
+ goto err1;
+ }
+#endif
ret = rdma_bind_addr(listen_id, sa);
if (ret) {
dprintk("svcrdma: rdma_bind_addr failed = %d\n", ret);
@@ -1073,7 +1083,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
newxprt->sc_dev_caps |= SVCRDMA_DEVCAP_READ_W_INV;
/* Post receive buffers */
- for (i = 0; i < newxprt->sc_rq_depth; i++) {
+ for (i = 0; i < newxprt->sc_max_requests; i++) {
ret = svc_rdma_post_recv(newxprt, GFP_KERNEL);
if (ret) {
dprintk("svcrdma: failure posting receive buffers\n");
@@ -1170,6 +1180,9 @@ static void __svc_rdma_free(struct work_struct *work)
dprintk("svcrdma: %s(%p)\n", __func__, rdma);
+ if (rdma->sc_qp && !IS_ERR(rdma->sc_qp))
+ ib_drain_qp(rdma->sc_qp);
+
/* We should only be called from kref_put */
if (atomic_read(&xprt->xpt_ref.refcount) != 0)
pr_err("svcrdma: sc_xprt still in use? (%d)\n",
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index b1b009f10..99d2e5b72 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -73,6 +73,8 @@ static unsigned int xprt_rdma_memreg_strategy = RPCRDMA_FRMR;
static unsigned int min_slot_table_size = RPCRDMA_MIN_SLOT_TABLE;
static unsigned int max_slot_table_size = RPCRDMA_MAX_SLOT_TABLE;
+static unsigned int min_inline_size = RPCRDMA_MIN_INLINE;
+static unsigned int max_inline_size = RPCRDMA_MAX_INLINE;
static unsigned int zero;
static unsigned int max_padding = PAGE_SIZE;
static unsigned int min_memreg = RPCRDMA_BOUNCEBUFFERS;
@@ -96,6 +98,8 @@ static struct ctl_table xr_tunables_table[] = {
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec,
+ .extra1 = &min_inline_size,
+ .extra2 = &max_inline_size,
},
{
.procname = "rdma_max_inline_write",
@@ -103,6 +107,8 @@ static struct ctl_table xr_tunables_table[] = {
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec,
+ .extra1 = &min_inline_size,
+ .extra2 = &max_inline_size,
},
{
.procname = "rdma_inline_write_padding",
@@ -508,6 +514,7 @@ xprt_rdma_allocate(struct rpc_task *task, size_t size)
out:
dprintk("RPC: %s: size %zd, request 0x%p\n", __func__, size, req);
req->rl_connect_cookie = 0; /* our reserved value */
+ req->rl_task = task;
return req->rl_sendbuf->rg_base;
out_rdmabuf:
@@ -564,7 +571,6 @@ xprt_rdma_free(void *buffer)
struct rpcrdma_req *req;
struct rpcrdma_xprt *r_xprt;
struct rpcrdma_regbuf *rb;
- int i;
if (buffer == NULL)
return;
@@ -578,11 +584,8 @@ xprt_rdma_free(void *buffer)
dprintk("RPC: %s: called on 0x%p\n", __func__, req->rl_reply);
- for (i = 0; req->rl_nchunks;) {
- --req->rl_nchunks;
- i += r_xprt->rx_ia.ri_ops->ro_unmap(r_xprt,
- &req->rl_segments[i]);
- }
+ r_xprt->rx_ia.ri_ops->ro_unmap_safe(r_xprt, req,
+ !RPC_IS_ASYNC(req->rl_task));
rpcrdma_buffer_put(req);
}
@@ -707,6 +710,7 @@ static struct rpc_xprt_ops xprt_rdma_procs = {
#if defined(CONFIG_SUNRPC_BACKCHANNEL)
.bc_setup = xprt_rdma_bc_setup,
.bc_up = xprt_rdma_bc_up,
+ .bc_maxpayload = xprt_rdma_bc_maxpayload,
.bc_free_rqst = xprt_rdma_bc_free_rqst,
.bc_destroy = xprt_rdma_bc_destroy,
#endif
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index f5ed9f982..b044d98a1 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -203,15 +203,6 @@ out_fail:
goto out_schedule;
}
-static void
-rpcrdma_flush_cqs(struct rpcrdma_ep *ep)
-{
- struct ib_wc wc;
-
- while (ib_poll_cq(ep->rep_attr.recv_cq, 1, &wc) > 0)
- rpcrdma_receive_wc(NULL, &wc);
-}
-
static int
rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
{
@@ -374,23 +365,6 @@ out:
}
/*
- * Drain any cq, prior to teardown.
- */
-static void
-rpcrdma_clean_cq(struct ib_cq *cq)
-{
- struct ib_wc wc;
- int count = 0;
-
- while (1 == ib_poll_cq(cq, 1, &wc))
- ++count;
-
- if (count)
- dprintk("RPC: %s: flushed %d events (last 0x%x)\n",
- __func__, count, wc.opcode);
-}
-
-/*
* Exported functions.
*/
@@ -459,7 +433,6 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
dprintk("RPC: %s: memory registration strategy is '%s'\n",
__func__, ia->ri_ops->ro_displayname);
- rwlock_init(&ia->ri_qplock);
return 0;
out3:
@@ -515,7 +488,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
__func__);
return -ENOMEM;
}
- max_qp_wr = ia->ri_device->attrs.max_qp_wr - RPCRDMA_BACKWARD_WRS;
+ max_qp_wr = ia->ri_device->attrs.max_qp_wr - RPCRDMA_BACKWARD_WRS - 1;
/* check provider's send/recv wr limits */
if (cdata->max_requests > max_qp_wr)
@@ -526,11 +499,13 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
ep->rep_attr.srq = NULL;
ep->rep_attr.cap.max_send_wr = cdata->max_requests;
ep->rep_attr.cap.max_send_wr += RPCRDMA_BACKWARD_WRS;
+ ep->rep_attr.cap.max_send_wr += 1; /* drain cqe */
rc = ia->ri_ops->ro_open(ia, ep, cdata);
if (rc)
return rc;
ep->rep_attr.cap.max_recv_wr = cdata->max_requests;
ep->rep_attr.cap.max_recv_wr += RPCRDMA_BACKWARD_WRS;
+ ep->rep_attr.cap.max_recv_wr += 1; /* drain cqe */
ep->rep_attr.cap.max_send_sge = RPCRDMA_MAX_IOVS;
ep->rep_attr.cap.max_recv_sge = 1;
ep->rep_attr.cap.max_inline_data = 0;
@@ -578,6 +553,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
ep->rep_attr.recv_cq = recvcq;
/* Initialize cma parameters */
+ memset(&ep->rep_remote_cma, 0, sizeof(ep->rep_remote_cma));
/* RPC/RDMA does not use private data */
ep->rep_remote_cma.private_data = NULL;
@@ -591,7 +567,16 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
ep->rep_remote_cma.responder_resources =
ia->ri_device->attrs.max_qp_rd_atom;
- ep->rep_remote_cma.retry_count = 7;
+ /* Limit transport retries so client can detect server
+ * GID changes quickly. RPC layer handles re-establishing
+ * transport connection and retransmission.
+ */
+ ep->rep_remote_cma.retry_count = 6;
+
+ /* RPC-over-RDMA handles its own flow control. In addition,
+ * make all RNR NAKs visible so we know that RPC-over-RDMA
+ * flow control is working correctly (no NAKs should be seen).
+ */
ep->rep_remote_cma.flow_control = 0;
ep->rep_remote_cma.rnr_retry_count = 0;
@@ -622,13 +607,8 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
cancel_delayed_work_sync(&ep->rep_connect_worker);
- if (ia->ri_id->qp)
- rpcrdma_ep_disconnect(ep, ia);
-
- rpcrdma_clean_cq(ep->rep_attr.recv_cq);
- rpcrdma_clean_cq(ep->rep_attr.send_cq);
-
if (ia->ri_id->qp) {
+ rpcrdma_ep_disconnect(ep, ia);
rdma_destroy_qp(ia->ri_id);
ia->ri_id->qp = NULL;
}
@@ -659,7 +639,6 @@ retry:
dprintk("RPC: %s: reconnecting...\n", __func__);
rpcrdma_ep_disconnect(ep, ia);
- rpcrdma_flush_cqs(ep);
xprt = container_of(ia, struct rpcrdma_xprt, rx_ia);
id = rpcrdma_create_id(xprt, ia,
@@ -692,10 +671,8 @@ retry:
goto out;
}
- write_lock(&ia->ri_qplock);
old = ia->ri_id;
ia->ri_id = id;
- write_unlock(&ia->ri_qplock);
rdma_destroy_qp(old);
rpcrdma_destroy_id(old);
@@ -785,7 +762,6 @@ rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
{
int rc;
- rpcrdma_flush_cqs(ep);
rc = rdma_disconnect(ia->ri_id);
if (!rc) {
/* returns without wait if not connected */
@@ -797,6 +773,8 @@ rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
dprintk("RPC: %s: rdma_disconnect %i\n", __func__, rc);
ep->rep_connected = rc;
}
+
+ ib_drain_qp(ia->ri_id->qp);
}
struct rpcrdma_req *
@@ -1271,25 +1249,3 @@ out_rc:
rpcrdma_recv_buffer_put(rep);
return rc;
}
-
-/* How many chunk list items fit within our inline buffers?
- */
-unsigned int
-rpcrdma_max_segments(struct rpcrdma_xprt *r_xprt)
-{
- struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
- int bytes, segments;
-
- bytes = min_t(unsigned int, cdata->inline_wsize, cdata->inline_rsize);
- bytes -= RPCRDMA_HDRLEN_MIN;
- if (bytes < sizeof(struct rpcrdma_segment) * 2) {
- pr_warn("RPC: %s: inline threshold too small\n",
- __func__);
- return 0;
- }
-
- segments = 1 << (fls(bytes / sizeof(struct rpcrdma_segment)) - 1);
- dprintk("RPC: %s: max chunk list size = %d segments\n",
- __func__, segments);
- return segments;
-}
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index 2ebc743cb..95cdc6622 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -65,7 +65,6 @@
*/
struct rpcrdma_ia {
const struct rpcrdma_memreg_ops *ri_ops;
- rwlock_t ri_qplock;
struct ib_device *ri_device;
struct rdma_cm_id *ri_id;
struct ib_pd *ri_pd;
@@ -73,6 +72,8 @@ struct rpcrdma_ia {
struct completion ri_done;
int ri_async_rc;
unsigned int ri_max_frmr_depth;
+ unsigned int ri_max_inline_write;
+ unsigned int ri_max_inline_read;
struct ib_qp_attr ri_qp_attr;
struct ib_qp_init_attr ri_qp_init_attr;
};
@@ -144,6 +145,26 @@ rdmab_to_msg(struct rpcrdma_regbuf *rb)
#define RPCRDMA_DEF_GFP (GFP_NOIO | __GFP_NOWARN)
+/* To ensure a transport can always make forward progress,
+ * the number of RDMA segments allowed in header chunk lists
+ * is capped at 8. This prevents less-capable devices and
+ * memory registrations from overrunning the Send buffer
+ * while building chunk lists.
+ *
+ * Elements of the Read list take up more room than the
+ * Write list or Reply chunk. 8 read segments means the Read
+ * list (or Write list or Reply chunk) cannot consume more
+ * than
+ *
+ * ((8 + 2) * read segment size) + 1 XDR words, or 244 bytes.
+ *
+ * And the fixed part of the header is another 24 bytes.
+ *
+ * The smallest inline threshold is 1024 bytes, ensuring that
+ * at least 750 bytes are available for RPC messages.
+ */
+#define RPCRDMA_MAX_HDR_SEGS (8)
+
/*
* struct rpcrdma_rep -- this structure encapsulates state required to recv
* and complete a reply, asychronously. It needs several pieces of
@@ -162,7 +183,9 @@ rdmab_to_msg(struct rpcrdma_regbuf *rb)
*/
#define RPCRDMA_MAX_DATA_SEGS ((1 * 1024 * 1024) / PAGE_SIZE)
-#define RPCRDMA_MAX_SEGS (RPCRDMA_MAX_DATA_SEGS + 2) /* head+tail = 2 */
+
+/* data segments + head/tail for Call + head/tail for Reply */
+#define RPCRDMA_MAX_SEGS (RPCRDMA_MAX_DATA_SEGS + 4)
struct rpcrdma_buffer;
@@ -198,14 +221,13 @@ enum rpcrdma_frmr_state {
};
struct rpcrdma_frmr {
- struct scatterlist *sg;
- int sg_nents;
+ struct scatterlist *fr_sg;
+ int fr_nents;
+ enum dma_data_direction fr_dir;
struct ib_mr *fr_mr;
struct ib_cqe fr_cqe;
enum rpcrdma_frmr_state fr_state;
struct completion fr_linv_done;
- struct work_struct fr_work;
- struct rpcrdma_xprt *fr_xprt;
union {
struct ib_reg_wr fr_regwr;
struct ib_send_wr fr_invwr;
@@ -222,6 +244,8 @@ struct rpcrdma_mw {
struct rpcrdma_fmr fmr;
struct rpcrdma_frmr frmr;
};
+ struct work_struct mw_work;
+ struct rpcrdma_xprt *mw_xprt;
struct list_head mw_list;
struct list_head mw_all;
};
@@ -270,12 +294,14 @@ struct rpcrdma_req {
unsigned int rl_niovs;
unsigned int rl_nchunks;
unsigned int rl_connect_cookie;
+ struct rpc_task *rl_task;
struct rpcrdma_buffer *rl_buffer;
struct rpcrdma_rep *rl_reply;/* holder for reply buffer */
struct ib_sge rl_send_iov[RPCRDMA_MAX_IOVS];
struct rpcrdma_regbuf *rl_rdmabuf;
struct rpcrdma_regbuf *rl_sendbuf;
struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS];
+ struct rpcrdma_mr_seg *rl_nextseg;
struct ib_cqe rl_cqe;
struct list_head rl_all;
@@ -372,8 +398,8 @@ struct rpcrdma_memreg_ops {
struct rpcrdma_mr_seg *, int, bool);
void (*ro_unmap_sync)(struct rpcrdma_xprt *,
struct rpcrdma_req *);
- int (*ro_unmap)(struct rpcrdma_xprt *,
- struct rpcrdma_mr_seg *);
+ void (*ro_unmap_safe)(struct rpcrdma_xprt *,
+ struct rpcrdma_req *, bool);
int (*ro_open)(struct rpcrdma_ia *,
struct rpcrdma_ep *,
struct rpcrdma_create_data_internal *);
@@ -456,7 +482,6 @@ struct rpcrdma_regbuf *rpcrdma_alloc_regbuf(struct rpcrdma_ia *,
void rpcrdma_free_regbuf(struct rpcrdma_ia *,
struct rpcrdma_regbuf *);
-unsigned int rpcrdma_max_segments(struct rpcrdma_xprt *);
int rpcrdma_ep_post_extra_recv(struct rpcrdma_xprt *, unsigned int);
int frwr_alloc_recovery_wq(void);
@@ -519,6 +544,9 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *);
* RPC/RDMA protocol calls - xprtrdma/rpc_rdma.c
*/
int rpcrdma_marshal_req(struct rpc_rqst *);
+void rpcrdma_set_max_header_sizes(struct rpcrdma_ia *,
+ struct rpcrdma_create_data_internal *,
+ unsigned int);
/* RPC/RDMA module init - xprtrdma/transport.c
*/
@@ -534,6 +562,7 @@ void xprt_rdma_cleanup(void);
#if defined(CONFIG_SUNRPC_BACKCHANNEL)
int xprt_rdma_bc_setup(struct rpc_xprt *, unsigned int);
int xprt_rdma_bc_up(struct svc_serv *, struct net *);
+size_t xprt_rdma_bc_maxpayload(struct rpc_xprt *);
int rpcrdma_bc_post_recv(struct rpcrdma_xprt *, unsigned int);
void rpcrdma_bc_receive_call(struct rpcrdma_xprt *, struct rpcrdma_rep *);
int rpcrdma_bc_marshal_reply(struct rpc_rqst *);
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index e9e5dd0dc..7e2b2fa18 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -995,15 +995,14 @@ static void xs_udp_data_read_skb(struct rpc_xprt *xprt,
u32 _xid;
__be32 *xp;
- repsize = skb->len - sizeof(struct udphdr);
+ repsize = skb->len;
if (repsize < 4) {
dprintk("RPC: impossible RPC reply size %d!\n", repsize);
return;
}
/* Copy the XID from the skb... */
- xp = skb_header_pointer(skb, sizeof(struct udphdr),
- sizeof(_xid), &_xid);
+ xp = skb_header_pointer(skb, 0, sizeof(_xid), &_xid);
if (xp == NULL)
return;
@@ -1019,11 +1018,11 @@ static void xs_udp_data_read_skb(struct rpc_xprt *xprt,
/* Suck it into the iovec, verify checksum if not done by hw. */
if (csum_partial_copy_to_xdr(&rovr->rq_private_buf, skb)) {
- UDPX_INC_STATS_BH(sk, UDP_MIB_INERRORS);
+ __UDPX_INC_STATS(sk, UDP_MIB_INERRORS);
goto out_unlock;
}
- UDPX_INC_STATS_BH(sk, UDP_MIB_INDATAGRAMS);
+ __UDPX_INC_STATS(sk, UDP_MIB_INDATAGRAMS);
xprt_adjust_cwnd(xprt, task, copied);
xprt_complete_rqst(task, copied);
@@ -1365,6 +1364,11 @@ static int xs_tcp_bc_up(struct svc_serv *serv, struct net *net)
return ret;
return 0;
}
+
+static size_t xs_tcp_bc_maxpayload(struct rpc_xprt *xprt)
+{
+ return PAGE_SIZE;
+}
#else
static inline int _xs_tcp_read_data(struct rpc_xprt *xprt,
struct xdr_skb_reader *desc)
@@ -1881,8 +1885,7 @@ static inline void xs_reclassify_socket6(struct socket *sock)
static inline void xs_reclassify_socket(int family, struct socket *sock)
{
- WARN_ON_ONCE(sock_owned_by_user(sock->sk));
- if (sock_owned_by_user(sock->sk))
+ if (WARN_ON_ONCE(!sock_allow_reclassification(sock->sk)))
return;
switch (family) {
@@ -1952,6 +1955,7 @@ static int xs_local_finish_connecting(struct rpc_xprt *xprt,
sk->sk_user_data = xprt;
sk->sk_data_ready = xs_data_ready;
sk->sk_write_space = xs_udp_write_space;
+ sock_set_flag(sk, SOCK_FASYNC);
sk->sk_error_report = xs_error_report;
sk->sk_allocation = GFP_NOIO;
@@ -2138,6 +2142,7 @@ static void xs_udp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
sk->sk_user_data = xprt;
sk->sk_data_ready = xs_data_ready;
sk->sk_write_space = xs_udp_write_space;
+ sock_set_flag(sk, SOCK_FASYNC);
sk->sk_allocation = GFP_NOIO;
xprt_set_connected(xprt);
@@ -2239,6 +2244,7 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
sk->sk_data_ready = xs_tcp_data_ready;
sk->sk_state_change = xs_tcp_state_change;
sk->sk_write_space = xs_tcp_write_space;
+ sock_set_flag(sk, SOCK_FASYNC);
sk->sk_error_report = xs_error_report;
sk->sk_allocation = GFP_NOIO;
@@ -2660,6 +2666,7 @@ static struct rpc_xprt_ops xs_tcp_ops = {
#ifdef CONFIG_SUNRPC_BACKCHANNEL
.bc_setup = xprt_setup_bc,
.bc_up = xs_tcp_bc_up,
+ .bc_maxpayload = xs_tcp_bc_maxpayload,
.bc_free_rqst = xprt_free_bc_rqst,
.bc_destroy = xprt_destroy_bc,
#endif
diff --git a/net/switchdev/Kconfig b/net/switchdev/Kconfig
index 86a47e17c..651fa201a 100644
--- a/net/switchdev/Kconfig
+++ b/net/switchdev/Kconfig
@@ -3,7 +3,7 @@
#
config NET_SWITCHDEV
- bool "Switch (and switch-ish) device support (EXPERIMENTAL)"
+ bool "Switch (and switch-ish) device support"
depends on INET
---help---
This module provides glue between core networking code and device
diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c
index 27a540621..a597708ae 100644
--- a/net/tipc/bearer.c
+++ b/net/tipc/bearer.c
@@ -205,6 +205,7 @@ static int tipc_enable_bearer(struct net *net, const char *name,
struct tipc_bearer *b;
struct tipc_media *m;
struct tipc_bearer_names b_names;
+ struct sk_buff *skb;
char addr_string[16];
u32 bearer_id;
u32 with_this_prio;
@@ -301,7 +302,7 @@ restart:
b->net_plane = bearer_id + 'A';
b->priority = priority;
- res = tipc_disc_create(net, b, &b->bcast_addr);
+ res = tipc_disc_create(net, b, &b->bcast_addr, &skb);
if (res) {
bearer_disable(net, b);
pr_warn("Bearer <%s> rejected, discovery object creation failed\n",
@@ -310,7 +311,8 @@ restart:
}
rcu_assign_pointer(tn->bearer_list[bearer_id], b);
-
+ if (skb)
+ tipc_bearer_xmit_skb(net, bearer_id, skb, &b->bcast_addr);
pr_info("Enabled bearer <%s>, discovery domain %s, priority %u\n",
name,
tipc_addr_string_fill(addr_string, disc_domain), priority);
@@ -328,6 +330,21 @@ static int tipc_reset_bearer(struct net *net, struct tipc_bearer *b)
return 0;
}
+/* tipc_bearer_reset_all - reset all links on all bearers
+ */
+void tipc_bearer_reset_all(struct net *net)
+{
+ struct tipc_net *tn = tipc_net(net);
+ struct tipc_bearer *b;
+ int i;
+
+ for (i = 0; i < MAX_BEARERS; i++) {
+ b = rcu_dereference_rtnl(tn->bearer_list[i]);
+ if (b)
+ tipc_reset_bearer(net, b);
+ }
+}
+
/**
* bearer_disable
*
@@ -335,23 +352,16 @@ static int tipc_reset_bearer(struct net *net, struct tipc_bearer *b)
*/
static void bearer_disable(struct net *net, struct tipc_bearer *b)
{
- struct tipc_net *tn = net_generic(net, tipc_net_id);
- u32 i;
+ struct tipc_net *tn = tipc_net(net);
+ int bearer_id = b->identity;
pr_info("Disabling bearer <%s>\n", b->name);
b->media->disable_media(b);
-
- tipc_node_delete_links(net, b->identity);
+ tipc_node_delete_links(net, bearer_id);
RCU_INIT_POINTER(b->media_ptr, NULL);
if (b->link_req)
tipc_disc_delete(b->link_req);
-
- for (i = 0; i < MAX_BEARERS; i++) {
- if (b == rtnl_dereference(tn->bearer_list[i])) {
- RCU_INIT_POINTER(tn->bearer_list[i], NULL);
- break;
- }
- }
+ RCU_INIT_POINTER(tn->bearer_list[bearer_id], NULL);
kfree_rcu(b, rcu);
}
@@ -394,7 +404,7 @@ void tipc_disable_l2_media(struct tipc_bearer *b)
/**
* tipc_l2_send_msg - send a TIPC packet out over an L2 interface
- * @buf: the packet to be sent
+ * @skb: the packet to be sent
* @b: the bearer through which the packet is to be sent
* @dest: peer destination address
*/
@@ -403,17 +413,21 @@ int tipc_l2_send_msg(struct net *net, struct sk_buff *skb,
{
struct net_device *dev;
int delta;
+ void *tipc_ptr;
dev = (struct net_device *)rcu_dereference_rtnl(b->media_ptr);
if (!dev)
return 0;
+ /* Send RESET message even if bearer is detached from device */
+ tipc_ptr = rcu_dereference_rtnl(dev->tipc_ptr);
+ if (unlikely(!tipc_ptr && !msg_is_reset(buf_msg(skb))))
+ goto drop;
+
delta = dev->hard_header_len - skb_headroom(skb);
if ((delta > 0) &&
- pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) {
- kfree_skb(skb);
- return 0;
- }
+ pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC))
+ goto drop;
skb_reset_network_header(skb);
skb->dev = dev;
@@ -422,6 +436,9 @@ int tipc_l2_send_msg(struct net *net, struct sk_buff *skb,
dev->dev_addr, skb->len);
dev_queue_xmit(skb);
return 0;
+drop:
+ kfree_skb(skb);
+ return 0;
}
int tipc_bearer_mtu(struct net *net, u32 bearer_id)
@@ -450,6 +467,8 @@ void tipc_bearer_xmit_skb(struct net *net, u32 bearer_id,
b = rcu_dereference_rtnl(tn->bearer_list[bearer_id]);
if (likely(b))
b->media->send_msg(net, skb, b, dest);
+ else
+ kfree_skb(skb);
rcu_read_unlock();
}
@@ -468,11 +487,11 @@ void tipc_bearer_xmit(struct net *net, u32 bearer_id,
rcu_read_lock();
b = rcu_dereference_rtnl(tn->bearer_list[bearer_id]);
- if (likely(b)) {
- skb_queue_walk_safe(xmitq, skb, tmp) {
- __skb_dequeue(xmitq);
- b->media->send_msg(net, skb, b, dst);
- }
+ if (unlikely(!b))
+ __skb_queue_purge(xmitq);
+ skb_queue_walk_safe(xmitq, skb, tmp) {
+ __skb_dequeue(xmitq);
+ b->media->send_msg(net, skb, b, dst);
}
rcu_read_unlock();
}
@@ -490,14 +509,14 @@ void tipc_bearer_bc_xmit(struct net *net, u32 bearer_id,
rcu_read_lock();
b = rcu_dereference_rtnl(tn->bearer_list[bearer_id]);
- if (likely(b)) {
- skb_queue_walk_safe(xmitq, skb, tmp) {
- hdr = buf_msg(skb);
- msg_set_non_seq(hdr, 1);
- msg_set_mc_netid(hdr, net_id);
- __skb_dequeue(xmitq);
- b->media->send_msg(net, skb, b, &b->bcast_addr);
- }
+ if (unlikely(!b))
+ __skb_queue_purge(xmitq);
+ skb_queue_walk_safe(xmitq, skb, tmp) {
+ hdr = buf_msg(skb);
+ msg_set_non_seq(hdr, 1);
+ msg_set_mc_netid(hdr, net_id);
+ __skb_dequeue(xmitq);
+ b->media->send_msg(net, skb, b, &b->bcast_addr);
}
rcu_read_unlock();
}
@@ -513,24 +532,21 @@ void tipc_bearer_bc_xmit(struct net *net, u32 bearer_id,
* ignores packets sent using interface multicast, and traffic sent to other
* nodes (which can happen if interface is running in promiscuous mode).
*/
-static int tipc_l2_rcv_msg(struct sk_buff *buf, struct net_device *dev,
+static int tipc_l2_rcv_msg(struct sk_buff *skb, struct net_device *dev,
struct packet_type *pt, struct net_device *orig_dev)
{
struct tipc_bearer *b;
rcu_read_lock();
b = rcu_dereference_rtnl(dev->tipc_ptr);
- if (likely(b)) {
- if (likely(buf->pkt_type <= PACKET_BROADCAST)) {
- buf->next = NULL;
- tipc_rcv(dev_net(dev), buf, b);
- rcu_read_unlock();
- return NET_RX_SUCCESS;
- }
+ if (likely(b && (skb->pkt_type <= PACKET_BROADCAST))) {
+ skb->next = NULL;
+ tipc_rcv(dev_net(dev), skb, b);
+ rcu_read_unlock();
+ return NET_RX_SUCCESS;
}
rcu_read_unlock();
-
- kfree_skb(buf);
+ kfree_skb(skb);
return NET_RX_DROP;
}
@@ -548,9 +564,18 @@ static int tipc_l2_device_event(struct notifier_block *nb, unsigned long evt,
{
struct net_device *dev = netdev_notifier_info_to_dev(ptr);
struct net *net = dev_net(dev);
+ struct tipc_net *tn = tipc_net(net);
struct tipc_bearer *b;
+ int i;
b = rtnl_dereference(dev->tipc_ptr);
+ if (!b) {
+ for (i = 0; i < MAX_BEARERS; b = NULL, i++) {
+ b = rtnl_dereference(tn->bearer_list[i]);
+ if (b && (b->media_ptr == dev))
+ break;
+ }
+ }
if (!b)
return NOTIFY_DONE;
@@ -560,13 +585,20 @@ static int tipc_l2_device_event(struct notifier_block *nb, unsigned long evt,
case NETDEV_CHANGE:
if (netif_carrier_ok(dev))
break;
+ case NETDEV_UP:
+ rcu_assign_pointer(dev->tipc_ptr, b);
+ break;
case NETDEV_GOING_DOWN:
+ RCU_INIT_POINTER(dev->tipc_ptr, NULL);
+ synchronize_net();
+ tipc_reset_bearer(net, b);
+ break;
case NETDEV_CHANGEMTU:
tipc_reset_bearer(net, b);
break;
case NETDEV_CHANGEADDR:
b->media->raw2addr(b, &b->addr,
- (char *)dev->dev_addr);
+ (char *)dev->dev_addr);
tipc_reset_bearer(net, b);
break;
case NETDEV_UNREGISTER:
diff --git a/net/tipc/bearer.h b/net/tipc/bearer.h
index e31820516..60e49c3be 100644
--- a/net/tipc/bearer.h
+++ b/net/tipc/bearer.h
@@ -42,8 +42,6 @@
#include <net/genetlink.h>
#define MAX_MEDIA 3
-#define MAX_NODES 4096
-#define WSIZE 32
/* Identifiers associated with TIPC message header media address info
* - address info field is 32 bytes long
@@ -62,16 +60,6 @@
#define TIPC_MEDIA_TYPE_UDP 3
/**
- * struct tipc_node_map - set of node identifiers
- * @count: # of nodes in set
- * @map: bitmap of node identifiers that are in the set
- */
-struct tipc_node_map {
- u32 count;
- u32 map[MAX_NODES / WSIZE];
-};
-
-/**
* struct tipc_media_addr - destination address used by TIPC bearers
* @value: address info (format defined by media)
* @media_id: TIPC media type identifier
@@ -142,7 +130,6 @@ struct tipc_media {
* @identity: array index of this bearer within TIPC bearer array
* @link_req: ptr to (optional) structure making periodic link setup requests
* @net_plane: network plane ('A' through 'H') currently associated with bearer
- * @nodes: indicates which nodes in cluster can be reached through bearer
*
* Note: media-specific code is responsible for initialization of the fields
* indicated below when a bearer is enabled; TIPC's generic bearer code takes
@@ -163,8 +150,6 @@ struct tipc_bearer {
u32 identity;
struct tipc_link_req *link_req;
char net_plane;
- int node_cnt;
- struct tipc_node_map nodes;
};
struct tipc_bearer_names {
@@ -213,6 +198,7 @@ void tipc_bearer_add_dest(struct net *net, u32 bearer_id, u32 dest);
void tipc_bearer_remove_dest(struct net *net, u32 bearer_id, u32 dest);
struct tipc_bearer *tipc_bearer_find(struct net *net, const char *name);
struct tipc_media *tipc_media_find(const char *name);
+void tipc_bearer_reset_all(struct net *net);
int tipc_bearer_setup(void);
void tipc_bearer_cleanup(void);
void tipc_bearer_stop(struct net *net);
diff --git a/net/tipc/core.c b/net/tipc/core.c
index e2bdb07a4..fe1b062c4 100644
--- a/net/tipc/core.c
+++ b/net/tipc/core.c
@@ -112,11 +112,9 @@ static int __init tipc_init(void)
pr_info("Activated (version " TIPC_MOD_VER ")\n");
- sysctl_tipc_rmem[0] = TIPC_CONN_OVERLOAD_LIMIT >> 4 <<
- TIPC_LOW_IMPORTANCE;
- sysctl_tipc_rmem[1] = TIPC_CONN_OVERLOAD_LIMIT >> 4 <<
- TIPC_CRITICAL_IMPORTANCE;
- sysctl_tipc_rmem[2] = TIPC_CONN_OVERLOAD_LIMIT;
+ sysctl_tipc_rmem[0] = RCVBUF_MIN;
+ sysctl_tipc_rmem[1] = RCVBUF_DEF;
+ sysctl_tipc_rmem[2] = RCVBUF_MAX;
err = tipc_netlink_start();
if (err)
diff --git a/net/tipc/discover.c b/net/tipc/discover.c
index f1e738e80..ad9d477cc 100644
--- a/net/tipc/discover.c
+++ b/net/tipc/discover.c
@@ -268,10 +268,9 @@ exit:
* Returns 0 if successful, otherwise -errno.
*/
int tipc_disc_create(struct net *net, struct tipc_bearer *b,
- struct tipc_media_addr *dest)
+ struct tipc_media_addr *dest, struct sk_buff **skb)
{
struct tipc_link_req *req;
- struct sk_buff *skb;
req = kmalloc(sizeof(*req), GFP_ATOMIC);
if (!req)
@@ -293,9 +292,7 @@ int tipc_disc_create(struct net *net, struct tipc_bearer *b,
setup_timer(&req->timer, disc_timeout, (unsigned long)req);
mod_timer(&req->timer, jiffies + req->timer_intv);
b->link_req = req;
- skb = skb_clone(req->buf, GFP_ATOMIC);
- if (skb)
- tipc_bearer_xmit_skb(net, req->bearer_id, skb, &req->dest);
+ *skb = skb_clone(req->buf, GFP_ATOMIC);
return 0;
}
diff --git a/net/tipc/discover.h b/net/tipc/discover.h
index c9b12770c..b80a33538 100644
--- a/net/tipc/discover.h
+++ b/net/tipc/discover.h
@@ -40,7 +40,7 @@
struct tipc_link_req;
int tipc_disc_create(struct net *net, struct tipc_bearer *b_ptr,
- struct tipc_media_addr *dest);
+ struct tipc_media_addr *dest, struct sk_buff **skb);
void tipc_disc_delete(struct tipc_link_req *req);
void tipc_disc_reset(struct net *net, struct tipc_bearer *b_ptr);
void tipc_disc_add_dest(struct tipc_link_req *req);
diff --git a/net/tipc/link.c b/net/tipc/link.c
index 7d2bb3e70..7d89f8713 100644
--- a/net/tipc/link.c
+++ b/net/tipc/link.c
@@ -140,6 +140,7 @@ struct tipc_link {
char if_name[TIPC_MAX_IF_NAME];
u32 priority;
char net_plane;
+ u16 rst_cnt;
/* Failover/synch */
u16 drop_point;
@@ -348,6 +349,8 @@ void tipc_link_remove_bc_peer(struct tipc_link *snd_l,
u16 ack = snd_l->snd_nxt - 1;
snd_l->ackers--;
+ rcv_l->bc_peer_is_up = true;
+ rcv_l->state = LINK_ESTABLISHED;
tipc_link_bc_ack_rcv(rcv_l, ack, xmitq);
tipc_link_reset(rcv_l);
rcv_l->state = LINK_RESET;
@@ -701,40 +704,35 @@ static void link_profile_stats(struct tipc_link *l)
/* tipc_link_timeout - perform periodic task as instructed from node timeout
*/
-/* tipc_link_timeout - perform periodic task as instructed from node timeout
- */
int tipc_link_timeout(struct tipc_link *l, struct sk_buff_head *xmitq)
{
+ int mtyp = 0;
int rc = 0;
- int mtyp = STATE_MSG;
- bool xmit = false;
- bool prb = false;
+ bool state = false;
+ bool probe = false;
+ bool setup = false;
u16 bc_snt = l->bc_sndlink->snd_nxt - 1;
u16 bc_acked = l->bc_rcvlink->acked;
- bool bc_up = link_is_up(l->bc_rcvlink);
link_profile_stats(l);
switch (l->state) {
case LINK_ESTABLISHED:
case LINK_SYNCHING:
- if (!l->silent_intv_cnt) {
- if (bc_up && (bc_acked != bc_snt))
- xmit = true;
- } else if (l->silent_intv_cnt <= l->abort_limit) {
- xmit = true;
- prb = true;
- } else {
- rc |= tipc_link_fsm_evt(l, LINK_FAILURE_EVT);
- }
+ if (l->silent_intv_cnt > l->abort_limit)
+ return tipc_link_fsm_evt(l, LINK_FAILURE_EVT);
+ mtyp = STATE_MSG;
+ state = bc_acked != bc_snt;
+ probe = l->silent_intv_cnt;
l->silent_intv_cnt++;
break;
case LINK_RESET:
- xmit = true;
+ setup = l->rst_cnt++ <= 4;
+ setup |= !(l->rst_cnt % 16);
mtyp = RESET_MSG;
break;
case LINK_ESTABLISHING:
- xmit = true;
+ setup = true;
mtyp = ACTIVATE_MSG;
break;
case LINK_PEER_RESET:
@@ -745,8 +743,8 @@ int tipc_link_timeout(struct tipc_link *l, struct sk_buff_head *xmitq)
break;
}
- if (xmit)
- tipc_link_build_proto_msg(l, mtyp, prb, 0, 0, 0, xmitq);
+ if (state || probe || setup)
+ tipc_link_build_proto_msg(l, mtyp, probe, 0, 0, 0, xmitq);
return rc;
}
@@ -833,6 +831,7 @@ void tipc_link_reset(struct tipc_link *l)
l->rcv_nxt = 1;
l->acked = 0;
l->silent_intv_cnt = 0;
+ l->rst_cnt = 0;
l->stats.recv_info = 0;
l->stale_count = 0;
l->bc_peer_is_up = false;
@@ -1110,12 +1109,12 @@ static bool tipc_link_release_pkts(struct tipc_link *l, u16 acked)
return released;
}
-/* tipc_link_build_ack_msg: prepare link acknowledge message for transmission
+/* tipc_link_build_state_msg: prepare link state message for transmission
*
* Note that sending of broadcast ack is coordinated among nodes, to reduce
* risk of ack storms towards the sender
*/
-int tipc_link_build_ack_msg(struct tipc_link *l, struct sk_buff_head *xmitq)
+int tipc_link_build_state_msg(struct tipc_link *l, struct sk_buff_head *xmitq)
{
if (!l)
return 0;
@@ -1140,11 +1139,17 @@ int tipc_link_build_ack_msg(struct tipc_link *l, struct sk_buff_head *xmitq)
void tipc_link_build_reset_msg(struct tipc_link *l, struct sk_buff_head *xmitq)
{
int mtyp = RESET_MSG;
+ struct sk_buff *skb;
if (l->state == LINK_ESTABLISHING)
mtyp = ACTIVATE_MSG;
tipc_link_build_proto_msg(l, mtyp, 0, 0, 0, 0, xmitq);
+
+ /* Inform peer that this endpoint is going down if applicable */
+ skb = skb_peek_tail(xmitq);
+ if (skb && (l->state == LINK_RESET))
+ msg_set_peer_stopping(buf_msg(skb), 1);
}
/* tipc_link_build_nack_msg: prepare link nack message for transmission
@@ -1219,7 +1224,7 @@ int tipc_link_rcv(struct tipc_link *l, struct sk_buff *skb,
if (!tipc_data_input(l, skb, l->inputq))
rc |= tipc_link_input(l, skb, l->inputq);
if (unlikely(++l->rcv_unacked >= TIPC_MIN_LINK_WIN))
- rc |= tipc_link_build_ack_msg(l, xmitq);
+ rc |= tipc_link_build_state_msg(l, xmitq);
if (unlikely(rc & ~TIPC_LINK_SND_BC_ACK))
break;
} while ((skb = __skb_dequeue(defq)));
@@ -1411,7 +1416,9 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb,
l->priority = peers_prio;
/* ACTIVATE_MSG serves as PEER_RESET if link is already down */
- if ((mtyp == RESET_MSG) || !link_is_up(l))
+ if (msg_peer_stopping(hdr))
+ rc = tipc_link_fsm_evt(l, LINK_FAILURE_EVT);
+ else if ((mtyp == RESET_MSG) || !link_is_up(l))
rc = tipc_link_fsm_evt(l, LINK_PEER_RESET_EVT);
/* ACTIVATE_MSG takes up link if it was already locally reset */
@@ -1554,7 +1561,12 @@ void tipc_link_bc_sync_rcv(struct tipc_link *l, struct tipc_msg *hdr,
if (!msg_peer_node_is_up(hdr))
return;
- l->bc_peer_is_up = true;
+ /* Open when peer ackowledges our bcast init msg (pkt #1) */
+ if (msg_ack(hdr))
+ l->bc_peer_is_up = true;
+
+ if (!l->bc_peer_is_up)
+ return;
/* Ignore if peers_snd_nxt goes beyond receive window */
if (more(peers_snd_nxt, l->rcv_nxt + l->window))
diff --git a/net/tipc/link.h b/net/tipc/link.h
index 6a94175ee..d7e9d42fc 100644
--- a/net/tipc/link.h
+++ b/net/tipc/link.h
@@ -123,7 +123,7 @@ int tipc_nl_parse_link_prop(struct nlattr *prop, struct nlattr *props[]);
int tipc_link_timeout(struct tipc_link *l, struct sk_buff_head *xmitq);
int tipc_link_rcv(struct tipc_link *l, struct sk_buff *skb,
struct sk_buff_head *xmitq);
-int tipc_link_build_ack_msg(struct tipc_link *l, struct sk_buff_head *xmitq);
+int tipc_link_build_state_msg(struct tipc_link *l, struct sk_buff_head *xmitq);
void tipc_link_add_bc_peer(struct tipc_link *snd_l,
struct tipc_link *uc_l,
struct sk_buff_head *xmitq);
diff --git a/net/tipc/msg.c b/net/tipc/msg.c
index 8740930f0..17201aa84 100644
--- a/net/tipc/msg.c
+++ b/net/tipc/msg.c
@@ -41,6 +41,8 @@
#include "name_table.h"
#define MAX_FORWARD_SIZE 1024
+#define BUF_HEADROOM (LL_MAX_HEADER + 48)
+#define BUF_TAILROOM 16
static unsigned int align(unsigned int i)
{
@@ -505,6 +507,10 @@ bool tipc_msg_reverse(u32 own_node, struct sk_buff **skb, int err)
msg_set_hdr_sz(hdr, BASIC_H_SIZE);
}
+ if (skb_cloned(_skb) &&
+ pskb_expand_head(_skb, BUF_HEADROOM, BUF_TAILROOM, GFP_KERNEL))
+ goto exit;
+
/* Now reverse the concerned fields */
msg_set_errcode(hdr, err);
msg_set_origport(hdr, msg_destport(&ohdr));
diff --git a/net/tipc/msg.h b/net/tipc/msg.h
index 55778a0ae..7cf52fb39 100644
--- a/net/tipc/msg.h
+++ b/net/tipc/msg.h
@@ -94,17 +94,6 @@ struct plist;
#define TIPC_MEDIA_INFO_OFFSET 5
-/**
- * TIPC message buffer code
- *
- * TIPC message buffer headroom reserves space for the worst-case
- * link-level device header (in case the message is sent off-node).
- *
- * Note: Headroom should be a multiple of 4 to ensure the TIPC header fields
- * are word aligned for quicker access
- */
-#define BUF_HEADROOM (LL_MAX_HEADER + 48)
-
struct tipc_skb_cb {
void *handle;
struct sk_buff *tail;
@@ -715,6 +704,16 @@ static inline void msg_set_redundant_link(struct tipc_msg *m, u32 r)
msg_set_bits(m, 5, 12, 0x1, r);
}
+static inline u32 msg_peer_stopping(struct tipc_msg *m)
+{
+ return msg_bits(m, 5, 13, 0x1);
+}
+
+static inline void msg_set_peer_stopping(struct tipc_msg *m, u32 s)
+{
+ msg_set_bits(m, 5, 13, 0x1, s);
+}
+
static inline char *msg_media_addr(struct tipc_msg *m)
{
return (char *)&m->hdr[TIPC_MEDIA_INFO_OFFSET];
@@ -733,16 +732,26 @@ static inline void msg_set_msgcnt(struct tipc_msg *m, u16 n)
msg_set_bits(m, 9, 16, 0xffff, n);
}
-static inline u32 msg_bcast_tag(struct tipc_msg *m)
+static inline u32 msg_conn_ack(struct tipc_msg *m)
{
return msg_bits(m, 9, 16, 0xffff);
}
-static inline void msg_set_bcast_tag(struct tipc_msg *m, u32 n)
+static inline void msg_set_conn_ack(struct tipc_msg *m, u32 n)
{
msg_set_bits(m, 9, 16, 0xffff, n);
}
+static inline u32 msg_adv_win(struct tipc_msg *m)
+{
+ return msg_bits(m, 9, 0, 0xffff);
+}
+
+static inline void msg_set_adv_win(struct tipc_msg *m, u32 n)
+{
+ msg_set_bits(m, 9, 0, 0xffff, n);
+}
+
static inline u32 msg_max_pkt(struct tipc_msg *m)
{
return msg_bits(m, 9, 16, 0xffff) * 4;
@@ -779,6 +788,11 @@ static inline bool msg_peer_node_is_up(struct tipc_msg *m)
return msg_redundant_link(m);
}
+static inline bool msg_is_reset(struct tipc_msg *hdr)
+{
+ return (msg_user(hdr) == LINK_PROTOCOL) && (msg_type(hdr) == RESET_MSG);
+}
+
struct sk_buff *tipc_buf_acquire(u32 size);
bool tipc_msg_validate(struct sk_buff *skb);
bool tipc_msg_reverse(u32 own_addr, struct sk_buff **skb, int err);
diff --git a/net/tipc/netlink_compat.c b/net/tipc/netlink_compat.c
index 4dfc5c14f..1fd464764 100644
--- a/net/tipc/netlink_compat.c
+++ b/net/tipc/netlink_compat.c
@@ -346,9 +346,15 @@ static int tipc_nl_compat_bearer_dump(struct tipc_nl_compat_msg *msg,
struct nlattr **attrs)
{
struct nlattr *bearer[TIPC_NLA_BEARER_MAX + 1];
+ int err;
+
+ if (!attrs[TIPC_NLA_BEARER])
+ return -EINVAL;
- nla_parse_nested(bearer, TIPC_NLA_BEARER_MAX, attrs[TIPC_NLA_BEARER],
- NULL);
+ err = nla_parse_nested(bearer, TIPC_NLA_BEARER_MAX,
+ attrs[TIPC_NLA_BEARER], NULL);
+ if (err)
+ return err;
return tipc_add_tlv(msg->rep, TIPC_TLV_BEARER_NAME,
nla_data(bearer[TIPC_NLA_BEARER_NAME]),
@@ -460,14 +466,31 @@ static int tipc_nl_compat_link_stat_dump(struct tipc_nl_compat_msg *msg,
struct nlattr *link[TIPC_NLA_LINK_MAX + 1];
struct nlattr *prop[TIPC_NLA_PROP_MAX + 1];
struct nlattr *stats[TIPC_NLA_STATS_MAX + 1];
+ int err;
- nla_parse_nested(link, TIPC_NLA_LINK_MAX, attrs[TIPC_NLA_LINK], NULL);
+ if (!attrs[TIPC_NLA_LINK])
+ return -EINVAL;
- nla_parse_nested(prop, TIPC_NLA_PROP_MAX, link[TIPC_NLA_LINK_PROP],
- NULL);
+ err = nla_parse_nested(link, TIPC_NLA_LINK_MAX, attrs[TIPC_NLA_LINK],
+ NULL);
+ if (err)
+ return err;
+
+ if (!link[TIPC_NLA_LINK_PROP])
+ return -EINVAL;
- nla_parse_nested(stats, TIPC_NLA_STATS_MAX, link[TIPC_NLA_LINK_STATS],
- NULL);
+ err = nla_parse_nested(prop, TIPC_NLA_PROP_MAX,
+ link[TIPC_NLA_LINK_PROP], NULL);
+ if (err)
+ return err;
+
+ if (!link[TIPC_NLA_LINK_STATS])
+ return -EINVAL;
+
+ err = nla_parse_nested(stats, TIPC_NLA_STATS_MAX,
+ link[TIPC_NLA_LINK_STATS], NULL);
+ if (err)
+ return err;
name = (char *)TLV_DATA(msg->req);
if (strcmp(name, nla_data(link[TIPC_NLA_LINK_NAME])) != 0)
@@ -569,12 +592,20 @@ static int tipc_nl_compat_link_dump(struct tipc_nl_compat_msg *msg,
{
struct nlattr *link[TIPC_NLA_LINK_MAX + 1];
struct tipc_link_info link_info;
+ int err;
- nla_parse_nested(link, TIPC_NLA_LINK_MAX, attrs[TIPC_NLA_LINK], NULL);
+ if (!attrs[TIPC_NLA_LINK])
+ return -EINVAL;
+
+ err = nla_parse_nested(link, TIPC_NLA_LINK_MAX, attrs[TIPC_NLA_LINK],
+ NULL);
+ if (err)
+ return err;
link_info.dest = nla_get_flag(link[TIPC_NLA_LINK_DEST]);
link_info.up = htonl(nla_get_flag(link[TIPC_NLA_LINK_UP]));
- strcpy(link_info.str, nla_data(link[TIPC_NLA_LINK_NAME]));
+ nla_strlcpy(link_info.str, link[TIPC_NLA_LINK_NAME],
+ TIPC_MAX_LINK_NAME);
return tipc_add_tlv(msg->rep, TIPC_TLV_LINK_INFO,
&link_info, sizeof(link_info));
@@ -758,12 +789,23 @@ static int tipc_nl_compat_name_table_dump(struct tipc_nl_compat_msg *msg,
u32 node, depth, type, lowbound, upbound;
static const char * const scope_str[] = {"", " zone", " cluster",
" node"};
+ int err;
- nla_parse_nested(nt, TIPC_NLA_NAME_TABLE_MAX,
- attrs[TIPC_NLA_NAME_TABLE], NULL);
+ if (!attrs[TIPC_NLA_NAME_TABLE])
+ return -EINVAL;
- nla_parse_nested(publ, TIPC_NLA_PUBL_MAX, nt[TIPC_NLA_NAME_TABLE_PUBL],
- NULL);
+ err = nla_parse_nested(nt, TIPC_NLA_NAME_TABLE_MAX,
+ attrs[TIPC_NLA_NAME_TABLE], NULL);
+ if (err)
+ return err;
+
+ if (!nt[TIPC_NLA_NAME_TABLE_PUBL])
+ return -EINVAL;
+
+ err = nla_parse_nested(publ, TIPC_NLA_PUBL_MAX,
+ nt[TIPC_NLA_NAME_TABLE_PUBL], NULL);
+ if (err)
+ return err;
ntq = (struct tipc_name_table_query *)TLV_DATA(msg->req);
@@ -815,8 +857,15 @@ static int __tipc_nl_compat_publ_dump(struct tipc_nl_compat_msg *msg,
{
u32 type, lower, upper;
struct nlattr *publ[TIPC_NLA_PUBL_MAX + 1];
+ int err;
- nla_parse_nested(publ, TIPC_NLA_PUBL_MAX, attrs[TIPC_NLA_PUBL], NULL);
+ if (!attrs[TIPC_NLA_PUBL])
+ return -EINVAL;
+
+ err = nla_parse_nested(publ, TIPC_NLA_PUBL_MAX, attrs[TIPC_NLA_PUBL],
+ NULL);
+ if (err)
+ return err;
type = nla_get_u32(publ[TIPC_NLA_PUBL_TYPE]);
lower = nla_get_u32(publ[TIPC_NLA_PUBL_LOWER]);
@@ -876,7 +925,13 @@ static int tipc_nl_compat_sk_dump(struct tipc_nl_compat_msg *msg,
u32 sock_ref;
struct nlattr *sock[TIPC_NLA_SOCK_MAX + 1];
- nla_parse_nested(sock, TIPC_NLA_SOCK_MAX, attrs[TIPC_NLA_SOCK], NULL);
+ if (!attrs[TIPC_NLA_SOCK])
+ return -EINVAL;
+
+ err = nla_parse_nested(sock, TIPC_NLA_SOCK_MAX, attrs[TIPC_NLA_SOCK],
+ NULL);
+ if (err)
+ return err;
sock_ref = nla_get_u32(sock[TIPC_NLA_SOCK_REF]);
tipc_tlv_sprintf(msg->rep, "%u:", sock_ref);
@@ -917,9 +972,15 @@ static int tipc_nl_compat_media_dump(struct tipc_nl_compat_msg *msg,
struct nlattr **attrs)
{
struct nlattr *media[TIPC_NLA_MEDIA_MAX + 1];
+ int err;
+
+ if (!attrs[TIPC_NLA_MEDIA])
+ return -EINVAL;
- nla_parse_nested(media, TIPC_NLA_MEDIA_MAX, attrs[TIPC_NLA_MEDIA],
- NULL);
+ err = nla_parse_nested(media, TIPC_NLA_MEDIA_MAX, attrs[TIPC_NLA_MEDIA],
+ NULL);
+ if (err)
+ return err;
return tipc_add_tlv(msg->rep, TIPC_TLV_MEDIA_NAME,
nla_data(media[TIPC_NLA_MEDIA_NAME]),
@@ -931,8 +992,15 @@ static int tipc_nl_compat_node_dump(struct tipc_nl_compat_msg *msg,
{
struct tipc_node_info node_info;
struct nlattr *node[TIPC_NLA_NODE_MAX + 1];
+ int err;
- nla_parse_nested(node, TIPC_NLA_NODE_MAX, attrs[TIPC_NLA_NODE], NULL);
+ if (!attrs[TIPC_NLA_NODE])
+ return -EINVAL;
+
+ err = nla_parse_nested(node, TIPC_NLA_NODE_MAX, attrs[TIPC_NLA_NODE],
+ NULL);
+ if (err)
+ return err;
node_info.addr = htonl(nla_get_u32(node[TIPC_NLA_NODE_ADDR]));
node_info.up = htonl(nla_get_flag(node[TIPC_NLA_NODE_UP]));
@@ -971,8 +1039,16 @@ static int tipc_nl_compat_net_dump(struct tipc_nl_compat_msg *msg,
{
__be32 id;
struct nlattr *net[TIPC_NLA_NET_MAX + 1];
+ int err;
+
+ if (!attrs[TIPC_NLA_NET])
+ return -EINVAL;
+
+ err = nla_parse_nested(net, TIPC_NLA_NET_MAX, attrs[TIPC_NLA_NET],
+ NULL);
+ if (err)
+ return err;
- nla_parse_nested(net, TIPC_NLA_NET_MAX, attrs[TIPC_NLA_NET], NULL);
id = htonl(nla_get_u32(net[TIPC_NLA_NET_ID]));
return tipc_add_tlv(msg->rep, TIPC_TLV_UNSIGNED, &id, sizeof(id));
diff --git a/net/tipc/node.c b/net/tipc/node.c
index 9aaa1bc56..23d476184 100644
--- a/net/tipc/node.c
+++ b/net/tipc/node.c
@@ -1,7 +1,7 @@
/*
* net/tipc/node.c: TIPC node management routines
*
- * Copyright (c) 2000-2006, 2012-2015, Ericsson AB
+ * Copyright (c) 2000-2006, 2012-2016, Ericsson AB
* Copyright (c) 2005-2006, 2010-2014, Wind River Systems
* All rights reserved.
*
@@ -191,6 +191,20 @@ int tipc_node_get_mtu(struct net *net, u32 addr, u32 sel)
tipc_node_put(n);
return mtu;
}
+
+u16 tipc_node_get_capabilities(struct net *net, u32 addr)
+{
+ struct tipc_node *n;
+ u16 caps;
+
+ n = tipc_node_find(net, addr);
+ if (unlikely(!n))
+ return TIPC_NODE_CAPABILITIES;
+ caps = n->capabilities;
+ tipc_node_put(n);
+ return caps;
+}
+
/*
* A trivial power-of-two bitmask technique is used for speed, since this
* operation is done for every incoming TIPC packet. The number of hash table
@@ -304,8 +318,11 @@ struct tipc_node *tipc_node_create(struct net *net, u32 addr, u16 capabilities)
spin_lock_bh(&tn->node_list_lock);
n = tipc_node_find(net, addr);
- if (n)
+ if (n) {
+ /* Same node may come back with new capabilities */
+ n->capabilities = capabilities;
goto exit;
+ }
n = kzalloc(sizeof(*n), GFP_ATOMIC);
if (!n) {
pr_warn("Node creation failed, no memory\n");
@@ -525,7 +542,7 @@ static void __tipc_node_link_up(struct tipc_node *n, int bearer_id,
struct tipc_link *ol = node_active_link(n, 0);
struct tipc_link *nl = n->links[bearer_id].link;
- if (!nl)
+ if (!nl || tipc_link_is_up(nl))
return;
tipc_link_fsm_evt(nl, LINK_ESTABLISH_EVT);
@@ -545,12 +562,16 @@ static void __tipc_node_link_up(struct tipc_node *n, int bearer_id,
pr_debug("Established link <%s> on network plane %c\n",
tipc_link_name(nl), tipc_link_plane(nl));
+ /* Ensure that a STATE message goes first */
+ tipc_link_build_state_msg(nl, xmitq);
+
/* First link? => give it both slots */
if (!ol) {
*slot0 = bearer_id;
*slot1 = bearer_id;
tipc_node_fsm_evt(n, SELF_ESTABL_CONTACT_EVT);
n->action_flags |= TIPC_NOTIFY_NODE_UP;
+ tipc_link_set_active(nl, true);
tipc_bcast_add_peer(n->net, nl, xmitq);
return;
}
@@ -581,8 +602,12 @@ static void __tipc_node_link_up(struct tipc_node *n, int bearer_id,
static void tipc_node_link_up(struct tipc_node *n, int bearer_id,
struct sk_buff_head *xmitq)
{
+ struct tipc_media_addr *maddr;
+
tipc_node_write_lock(n);
__tipc_node_link_up(n, bearer_id, xmitq);
+ maddr = &n->links[bearer_id].maddr;
+ tipc_bearer_xmit(n->net, bearer_id, xmitq, maddr);
tipc_node_write_unlock(n);
}
@@ -1272,14 +1297,10 @@ static void tipc_node_bc_rcv(struct net *net, struct sk_buff *skb, int bearer_id
rc = tipc_bcast_rcv(net, be->link, skb);
- /* Broadcast link reset may happen at reassembly failure */
- if (rc & TIPC_LINK_DOWN_EVT)
- tipc_node_reset_links(n);
-
/* Broadcast ACKs are sent on a unicast link */
if (rc & TIPC_LINK_SND_BC_ACK) {
tipc_node_read_lock(n);
- tipc_link_build_ack_msg(le->link, &xmitq);
+ tipc_link_build_state_msg(le->link, &xmitq);
tipc_node_read_unlock(n);
}
@@ -1295,6 +1316,17 @@ static void tipc_node_bc_rcv(struct net *net, struct sk_buff *skb, int bearer_id
spin_unlock_bh(&be->inputq2.lock);
tipc_sk_mcast_rcv(net, &be->arrvq, &be->inputq2);
}
+
+ if (rc & TIPC_LINK_DOWN_EVT) {
+ /* Reception reassembly failure => reset all links to peer */
+ if (!tipc_link_is_up(be->link))
+ tipc_node_reset_links(n);
+
+ /* Retransmission failure => reset all links to all peers */
+ if (!tipc_link_is_up(tipc_bc_sndlink(net)))
+ tipc_bearer_reset_all(net);
+ }
+
tipc_node_put(n);
}
diff --git a/net/tipc/node.h b/net/tipc/node.h
index f39d9d06e..8264b3d97 100644
--- a/net/tipc/node.h
+++ b/net/tipc/node.h
@@ -45,10 +45,11 @@
/* Optional capabilities supported by this code version
*/
enum {
- TIPC_BCAST_SYNCH = (1 << 1)
+ TIPC_BCAST_SYNCH = (1 << 1),
+ TIPC_BLOCK_FLOWCTL = (2 << 1)
};
-#define TIPC_NODE_CAPABILITIES TIPC_BCAST_SYNCH
+#define TIPC_NODE_CAPABILITIES (TIPC_BCAST_SYNCH | TIPC_BLOCK_FLOWCTL)
#define INVALID_BEARER_ID -1
void tipc_node_stop(struct net *net);
@@ -70,6 +71,7 @@ void tipc_node_broadcast(struct net *net, struct sk_buff *skb);
int tipc_node_add_conn(struct net *net, u32 dnode, u32 port, u32 peer_port);
void tipc_node_remove_conn(struct net *net, u32 dnode, u32 port);
int tipc_node_get_mtu(struct net *net, u32 addr, u32 sel);
+u16 tipc_node_get_capabilities(struct net *net, u32 addr);
int tipc_nl_node_dump(struct sk_buff *skb, struct netlink_callback *cb);
int tipc_nl_node_dump_link(struct sk_buff *skb, struct netlink_callback *cb);
int tipc_nl_node_reset_link_stats(struct sk_buff *skb, struct genl_info *info);
diff --git a/net/tipc/server.c b/net/tipc/server.c
index 2446bfbaa..272d20a79 100644
--- a/net/tipc/server.c
+++ b/net/tipc/server.c
@@ -86,6 +86,7 @@ struct outqueue_entry {
static void tipc_recv_work(struct work_struct *work);
static void tipc_send_work(struct work_struct *work);
static void tipc_clean_outqueues(struct tipc_conn *con);
+static void tipc_sock_release(struct tipc_conn *con);
static void tipc_conn_kref_release(struct kref *kref)
{
@@ -102,6 +103,7 @@ static void tipc_conn_kref_release(struct kref *kref)
}
saddr->scope = -TIPC_NODE_SCOPE;
kernel_bind(sock, (struct sockaddr *)saddr, sizeof(*saddr));
+ tipc_sock_release(con);
sock_release(sock);
con->sock = NULL;
}
@@ -136,28 +138,28 @@ static void sock_data_ready(struct sock *sk)
{
struct tipc_conn *con;
- read_lock(&sk->sk_callback_lock);
+ read_lock_bh(&sk->sk_callback_lock);
con = sock2con(sk);
if (con && test_bit(CF_CONNECTED, &con->flags)) {
conn_get(con);
if (!queue_work(con->server->rcv_wq, &con->rwork))
conn_put(con);
}
- read_unlock(&sk->sk_callback_lock);
+ read_unlock_bh(&sk->sk_callback_lock);
}
static void sock_write_space(struct sock *sk)
{
struct tipc_conn *con;
- read_lock(&sk->sk_callback_lock);
+ read_lock_bh(&sk->sk_callback_lock);
con = sock2con(sk);
if (con && test_bit(CF_CONNECTED, &con->flags)) {
conn_get(con);
if (!queue_work(con->server->send_wq, &con->swork))
conn_put(con);
}
- read_unlock(&sk->sk_callback_lock);
+ read_unlock_bh(&sk->sk_callback_lock);
}
static void tipc_register_callbacks(struct socket *sock, struct tipc_conn *con)
@@ -184,26 +186,31 @@ static void tipc_unregister_callbacks(struct tipc_conn *con)
write_unlock_bh(&sk->sk_callback_lock);
}
+static void tipc_sock_release(struct tipc_conn *con)
+{
+ struct tipc_server *s = con->server;
+
+ if (con->conid)
+ s->tipc_conn_release(con->conid, con->usr_data);
+
+ tipc_unregister_callbacks(con);
+}
+
static void tipc_close_conn(struct tipc_conn *con)
{
struct tipc_server *s = con->server;
if (test_and_clear_bit(CF_CONNECTED, &con->flags)) {
- if (con->conid)
- s->tipc_conn_shutdown(con->conid, con->usr_data);
spin_lock_bh(&s->idr_lock);
idr_remove(&s->conn_idr, con->conid);
s->idr_in_use--;
spin_unlock_bh(&s->idr_lock);
- tipc_unregister_callbacks(con);
-
/* We shouldn't flush pending works as we may be in the
* thread. In fact the races with pending rx/tx work structs
* are harmless for us here as we have already deleted this
- * connection from server connection list and set
- * sk->sk_user_data to 0 before releasing connection object.
+ * connection from server connection list.
*/
kernel_sock_shutdown(con->sock, SHUT_RDWR);
diff --git a/net/tipc/server.h b/net/tipc/server.h
index 9015faedb..34f8055af 100644
--- a/net/tipc/server.h
+++ b/net/tipc/server.h
@@ -53,7 +53,7 @@
* @send_wq: send workqueue
* @max_rcvbuf_size: maximum permitted receive message length
* @tipc_conn_new: callback will be called when new connection is incoming
- * @tipc_conn_shutdown: callback will be called when connection is shut down
+ * @tipc_conn_release: callback will be called before releasing the connection
* @tipc_conn_recvmsg: callback will be called when message arrives
* @saddr: TIPC server address
* @name: server name
@@ -70,7 +70,7 @@ struct tipc_server {
struct workqueue_struct *send_wq;
int max_rcvbuf_size;
void *(*tipc_conn_new)(int conid);
- void (*tipc_conn_shutdown)(int conid, void *usr_data);
+ void (*tipc_conn_release)(int conid, void *usr_data);
void (*tipc_conn_recvmsg)(struct net *net, int conid,
struct sockaddr_tipc *addr, void *usr_data,
void *buf, size_t len);
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index 5f80d3fa9..c49b8df43 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -96,8 +96,11 @@ struct tipc_sock {
uint conn_timeout;
atomic_t dupl_rcvcnt;
bool link_cong;
- uint sent_unacked;
- uint rcv_unacked;
+ u16 snt_unacked;
+ u16 snd_win;
+ u16 peer_caps;
+ u16 rcv_unacked;
+ u16 rcv_win;
struct sockaddr_tipc remote;
struct rhash_head node;
struct rcu_head rcu;
@@ -227,9 +230,29 @@ static struct tipc_sock *tipc_sk(const struct sock *sk)
return container_of(sk, struct tipc_sock, sk);
}
-static int tsk_conn_cong(struct tipc_sock *tsk)
+static bool tsk_conn_cong(struct tipc_sock *tsk)
{
- return tsk->sent_unacked >= TIPC_FLOWCTRL_WIN;
+ return tsk->snt_unacked >= tsk->snd_win;
+}
+
+/* tsk_blocks(): translate a buffer size in bytes to number of
+ * advertisable blocks, taking into account the ratio truesize(len)/len
+ * We can trust that this ratio is always < 4 for len >= FLOWCTL_BLK_SZ
+ */
+static u16 tsk_adv_blocks(int len)
+{
+ return len / FLOWCTL_BLK_SZ / 4;
+}
+
+/* tsk_inc(): increment counter for sent or received data
+ * - If block based flow control is not supported by peer we
+ * fall back to message based ditto, incrementing the counter
+ */
+static u16 tsk_inc(struct tipc_sock *tsk, int msglen)
+{
+ if (likely(tsk->peer_caps & TIPC_BLOCK_FLOWCTL))
+ return ((msglen / FLOWCTL_BLK_SZ) + 1);
+ return 1;
}
/**
@@ -366,7 +389,7 @@ static int tipc_sk_create(struct net *net, struct socket *sock,
sock->state = state;
sock_init_data(sock, sk);
if (tipc_sk_insert(tsk)) {
- pr_warn("Socket create failed; port numbrer exhausted\n");
+ pr_warn("Socket create failed; port number exhausted\n");
return -EINVAL;
}
msg_set_origport(msg, tsk->portid);
@@ -377,9 +400,12 @@ static int tipc_sk_create(struct net *net, struct socket *sock,
sk->sk_write_space = tipc_write_space;
sk->sk_destruct = tipc_sock_destruct;
tsk->conn_timeout = CONN_TIMEOUT_DEFAULT;
- tsk->sent_unacked = 0;
atomic_set(&tsk->dupl_rcvcnt, 0);
+ /* Start out with safe limits until we receive an advertised window */
+ tsk->snd_win = tsk_adv_blocks(RCVBUF_MIN);
+ tsk->rcv_win = tsk->snd_win;
+
if (sock->state == SS_READY) {
tsk_set_unreturnable(tsk, true);
if (sock->type == SOCK_DGRAM)
@@ -770,12 +796,14 @@ void tipc_sk_mcast_rcv(struct net *net, struct sk_buff_head *arrvq,
* @tsk: receiving socket
* @skb: pointer to message buffer.
*/
-static void tipc_sk_proto_rcv(struct tipc_sock *tsk, struct sk_buff *skb)
+static void tipc_sk_proto_rcv(struct tipc_sock *tsk, struct sk_buff *skb,
+ struct sk_buff_head *xmitq)
{
struct sock *sk = &tsk->sk;
+ u32 onode = tsk_own_node(tsk);
struct tipc_msg *hdr = buf_msg(skb);
int mtyp = msg_type(hdr);
- int conn_cong;
+ bool conn_cong;
/* Ignore if connection cannot be validated: */
if (!tsk_peer_msg(tsk, hdr))
@@ -785,11 +813,14 @@ static void tipc_sk_proto_rcv(struct tipc_sock *tsk, struct sk_buff *skb)
if (mtyp == CONN_PROBE) {
msg_set_type(hdr, CONN_PROBE_REPLY);
- tipc_sk_respond(sk, skb, TIPC_OK);
+ if (tipc_msg_reverse(onode, &skb, TIPC_OK))
+ __skb_queue_tail(xmitq, skb);
return;
} else if (mtyp == CONN_ACK) {
conn_cong = tsk_conn_cong(tsk);
- tsk->sent_unacked -= msg_msgcnt(hdr);
+ tsk->snt_unacked -= msg_conn_ack(hdr);
+ if (tsk->peer_caps & TIPC_BLOCK_FLOWCTL)
+ tsk->snd_win = msg_adv_win(hdr);
if (conn_cong)
sk->sk_write_space(sk);
} else if (mtyp != CONN_PROBE_REPLY) {
@@ -1020,12 +1051,14 @@ static int __tipc_send_stream(struct socket *sock, struct msghdr *m, size_t dsz)
u32 dnode;
uint mtu, send, sent = 0;
struct iov_iter save;
+ int hlen = MIN_H_SIZE;
/* Handle implied connection establishment */
if (unlikely(dest)) {
rc = __tipc_sendmsg(sock, m, dsz);
+ hlen = msg_hdr_sz(mhdr);
if (dsz && (dsz == rc))
- tsk->sent_unacked = 1;
+ tsk->snt_unacked = tsk_inc(tsk, dsz + hlen);
return rc;
}
if (dsz > (uint)INT_MAX)
@@ -1054,7 +1087,7 @@ next:
if (likely(!tsk_conn_cong(tsk))) {
rc = tipc_node_xmit(net, &pktchain, dnode, portid);
if (likely(!rc)) {
- tsk->sent_unacked++;
+ tsk->snt_unacked += tsk_inc(tsk, send + hlen);
sent += send;
if (sent == dsz)
return dsz;
@@ -1118,6 +1151,13 @@ static void tipc_sk_finish_conn(struct tipc_sock *tsk, u32 peer_port,
sk_reset_timer(sk, &sk->sk_timer, jiffies + tsk->probing_intv);
tipc_node_add_conn(net, peer_node, tsk->portid, peer_port);
tsk->max_pkt = tipc_node_get_mtu(net, peer_node, tsk->portid);
+ tsk->peer_caps = tipc_node_get_capabilities(net, peer_node);
+ if (tsk->peer_caps & TIPC_BLOCK_FLOWCTL)
+ return;
+
+ /* Fall back to message based flow control */
+ tsk->rcv_win = FLOWCTL_MSG_WIN;
+ tsk->snd_win = FLOWCTL_MSG_WIN;
}
/**
@@ -1214,7 +1254,7 @@ static int tipc_sk_anc_data_recv(struct msghdr *m, struct tipc_msg *msg,
return 0;
}
-static void tipc_sk_send_ack(struct tipc_sock *tsk, uint ack)
+static void tipc_sk_send_ack(struct tipc_sock *tsk)
{
struct net *net = sock_net(&tsk->sk);
struct sk_buff *skb = NULL;
@@ -1230,7 +1270,14 @@ static void tipc_sk_send_ack(struct tipc_sock *tsk, uint ack)
if (!skb)
return;
msg = buf_msg(skb);
- msg_set_msgcnt(msg, ack);
+ msg_set_conn_ack(msg, tsk->rcv_unacked);
+ tsk->rcv_unacked = 0;
+
+ /* Adjust to and advertize the correct window limit */
+ if (tsk->peer_caps & TIPC_BLOCK_FLOWCTL) {
+ tsk->rcv_win = tsk_adv_blocks(tsk->sk.sk_rcvbuf);
+ msg_set_adv_win(msg, tsk->rcv_win);
+ }
tipc_node_xmit_skb(net, skb, dnode, msg_link_selector(msg));
}
@@ -1288,7 +1335,7 @@ static int tipc_recvmsg(struct socket *sock, struct msghdr *m, size_t buf_len,
long timeo;
unsigned int sz;
u32 err;
- int res;
+ int res, hlen;
/* Catch invalid receive requests */
if (unlikely(!buf_len))
@@ -1313,6 +1360,7 @@ restart:
buf = skb_peek(&sk->sk_receive_queue);
msg = buf_msg(buf);
sz = msg_data_sz(msg);
+ hlen = msg_hdr_sz(msg);
err = msg_errcode(msg);
/* Discard an empty non-errored message & try again */
@@ -1335,7 +1383,7 @@ restart:
sz = buf_len;
m->msg_flags |= MSG_TRUNC;
}
- res = skb_copy_datagram_msg(buf, msg_hdr_sz(msg), m, sz);
+ res = skb_copy_datagram_msg(buf, hlen, m, sz);
if (res)
goto exit;
res = sz;
@@ -1347,15 +1395,15 @@ restart:
res = -ECONNRESET;
}
- /* Consume received message (optional) */
- if (likely(!(flags & MSG_PEEK))) {
- if ((sock->state != SS_READY) &&
- (++tsk->rcv_unacked >= TIPC_CONNACK_INTV)) {
- tipc_sk_send_ack(tsk, tsk->rcv_unacked);
- tsk->rcv_unacked = 0;
- }
- tsk_advance_rx_queue(sk);
+ if (unlikely(flags & MSG_PEEK))
+ goto exit;
+
+ if (likely(sock->state != SS_READY)) {
+ tsk->rcv_unacked += tsk_inc(tsk, hlen + sz);
+ if (unlikely(tsk->rcv_unacked >= (tsk->rcv_win / 4)))
+ tipc_sk_send_ack(tsk);
}
+ tsk_advance_rx_queue(sk);
exit:
release_sock(sk);
return res;
@@ -1384,7 +1432,7 @@ static int tipc_recv_stream(struct socket *sock, struct msghdr *m,
int sz_to_copy, target, needed;
int sz_copied = 0;
u32 err;
- int res = 0;
+ int res = 0, hlen;
/* Catch invalid receive attempts */
if (unlikely(!buf_len))
@@ -1410,6 +1458,7 @@ restart:
buf = skb_peek(&sk->sk_receive_queue);
msg = buf_msg(buf);
sz = msg_data_sz(msg);
+ hlen = msg_hdr_sz(msg);
err = msg_errcode(msg);
/* Discard an empty non-errored message & try again */
@@ -1434,8 +1483,7 @@ restart:
needed = (buf_len - sz_copied);
sz_to_copy = (sz <= needed) ? sz : needed;
- res = skb_copy_datagram_msg(buf, msg_hdr_sz(msg) + offset,
- m, sz_to_copy);
+ res = skb_copy_datagram_msg(buf, hlen + offset, m, sz_to_copy);
if (res)
goto exit;
@@ -1457,20 +1505,18 @@ restart:
res = -ECONNRESET;
}
- /* Consume received message (optional) */
- if (likely(!(flags & MSG_PEEK))) {
- if (unlikely(++tsk->rcv_unacked >= TIPC_CONNACK_INTV)) {
- tipc_sk_send_ack(tsk, tsk->rcv_unacked);
- tsk->rcv_unacked = 0;
- }
- tsk_advance_rx_queue(sk);
- }
+ if (unlikely(flags & MSG_PEEK))
+ goto exit;
+
+ tsk->rcv_unacked += tsk_inc(tsk, hlen + sz);
+ if (unlikely(tsk->rcv_unacked >= (tsk->rcv_win / 4)))
+ tipc_sk_send_ack(tsk);
+ tsk_advance_rx_queue(sk);
/* Loop around if more data is required */
if ((sz_copied < buf_len) && /* didn't get all requested data */
(!skb_queue_empty(&sk->sk_receive_queue) ||
(sz_copied < target)) && /* and more is ready or required */
- (!(flags & MSG_PEEK)) && /* and aren't just peeking at data */
(!err)) /* and haven't reached a FIN */
goto restart;
@@ -1602,30 +1648,33 @@ static bool filter_connect(struct tipc_sock *tsk, struct sk_buff *skb)
/**
* rcvbuf_limit - get proper overload limit of socket receive queue
* @sk: socket
- * @buf: message
+ * @skb: message
*
- * For all connection oriented messages, irrespective of importance,
- * the default overload value (i.e. 67MB) is set as limit.
+ * For connection oriented messages, irrespective of importance,
+ * default queue limit is 2 MB.
*
- * For all connectionless messages, by default new queue limits are
- * as belows:
+ * For connectionless messages, queue limits are based on message
+ * importance as follows:
*
- * TIPC_LOW_IMPORTANCE (4 MB)
- * TIPC_MEDIUM_IMPORTANCE (8 MB)
- * TIPC_HIGH_IMPORTANCE (16 MB)
- * TIPC_CRITICAL_IMPORTANCE (32 MB)
+ * TIPC_LOW_IMPORTANCE (2 MB)
+ * TIPC_MEDIUM_IMPORTANCE (4 MB)
+ * TIPC_HIGH_IMPORTANCE (8 MB)
+ * TIPC_CRITICAL_IMPORTANCE (16 MB)
*
* Returns overload limit according to corresponding message importance
*/
-static unsigned int rcvbuf_limit(struct sock *sk, struct sk_buff *buf)
+static unsigned int rcvbuf_limit(struct sock *sk, struct sk_buff *skb)
{
- struct tipc_msg *msg = buf_msg(buf);
+ struct tipc_sock *tsk = tipc_sk(sk);
+ struct tipc_msg *hdr = buf_msg(skb);
- if (msg_connected(msg))
- return sysctl_tipc_rmem[2];
+ if (unlikely(!msg_connected(hdr)))
+ return sk->sk_rcvbuf << msg_importance(hdr);
- return sk->sk_rcvbuf >> TIPC_CRITICAL_IMPORTANCE <<
- msg_importance(msg);
+ if (likely(tsk->peer_caps & TIPC_BLOCK_FLOWCTL))
+ return sk->sk_rcvbuf;
+
+ return FLOWCTL_MSG_LIM;
}
/**
@@ -1640,7 +1689,8 @@ static unsigned int rcvbuf_limit(struct sock *sk, struct sk_buff *buf)
*
* Returns true if message was added to socket receive queue, otherwise false
*/
-static bool filter_rcv(struct sock *sk, struct sk_buff *skb)
+static bool filter_rcv(struct sock *sk, struct sk_buff *skb,
+ struct sk_buff_head *xmitq)
{
struct socket *sock = sk->sk_socket;
struct tipc_sock *tsk = tipc_sk(sk);
@@ -1650,7 +1700,7 @@ static bool filter_rcv(struct sock *sk, struct sk_buff *skb)
int usr = msg_user(hdr);
if (unlikely(msg_user(hdr) == CONN_MANAGER)) {
- tipc_sk_proto_rcv(tsk, skb);
+ tipc_sk_proto_rcv(tsk, skb, xmitq);
return false;
}
@@ -1693,7 +1743,8 @@ static bool filter_rcv(struct sock *sk, struct sk_buff *skb)
return true;
reject:
- tipc_sk_respond(sk, skb, err);
+ if (tipc_msg_reverse(tsk_own_node(tsk), &skb, err))
+ __skb_queue_tail(xmitq, skb);
return false;
}
@@ -1709,9 +1760,24 @@ reject:
static int tipc_backlog_rcv(struct sock *sk, struct sk_buff *skb)
{
unsigned int truesize = skb->truesize;
+ struct sk_buff_head xmitq;
+ u32 dnode, selector;
+
+ __skb_queue_head_init(&xmitq);
- if (likely(filter_rcv(sk, skb)))
+ if (likely(filter_rcv(sk, skb, &xmitq))) {
atomic_add(truesize, &tipc_sk(sk)->dupl_rcvcnt);
+ return 0;
+ }
+
+ if (skb_queue_empty(&xmitq))
+ return 0;
+
+ /* Send response/rejected message */
+ skb = __skb_dequeue(&xmitq);
+ dnode = msg_destnode(buf_msg(skb));
+ selector = msg_origport(buf_msg(skb));
+ tipc_node_xmit_skb(sock_net(sk), skb, dnode, selector);
return 0;
}
@@ -1725,12 +1791,13 @@ static int tipc_backlog_rcv(struct sock *sk, struct sk_buff *skb)
* Caller must hold socket lock
*/
static void tipc_sk_enqueue(struct sk_buff_head *inputq, struct sock *sk,
- u32 dport)
+ u32 dport, struct sk_buff_head *xmitq)
{
+ unsigned long time_limit = jiffies + 2;
+ struct sk_buff *skb;
unsigned int lim;
atomic_t *dcnt;
- struct sk_buff *skb;
- unsigned long time_limit = jiffies + 2;
+ u32 onode;
while (skb_queue_len(inputq)) {
if (unlikely(time_after_eq(jiffies, time_limit)))
@@ -1742,20 +1809,22 @@ static void tipc_sk_enqueue(struct sk_buff_head *inputq, struct sock *sk,
/* Add message directly to receive queue if possible */
if (!sock_owned_by_user(sk)) {
- filter_rcv(sk, skb);
+ filter_rcv(sk, skb, xmitq);
continue;
}
/* Try backlog, compensating for double-counted bytes */
dcnt = &tipc_sk(sk)->dupl_rcvcnt;
- if (sk->sk_backlog.len)
+ if (!sk->sk_backlog.len)
atomic_set(dcnt, 0);
lim = rcvbuf_limit(sk, skb) + atomic_read(dcnt);
if (likely(!sk_add_backlog(sk, skb, lim)))
continue;
/* Overload => reject message back to sender */
- tipc_sk_respond(sk, skb, TIPC_ERR_OVERLOAD);
+ onode = tipc_own_addr(sock_net(sk));
+ if (tipc_msg_reverse(onode, &skb, TIPC_ERR_OVERLOAD))
+ __skb_queue_tail(xmitq, skb);
break;
}
}
@@ -1768,12 +1837,14 @@ static void tipc_sk_enqueue(struct sk_buff_head *inputq, struct sock *sk,
*/
void tipc_sk_rcv(struct net *net, struct sk_buff_head *inputq)
{
+ struct sk_buff_head xmitq;
u32 dnode, dport = 0;
int err;
struct tipc_sock *tsk;
struct sock *sk;
struct sk_buff *skb;
+ __skb_queue_head_init(&xmitq);
while (skb_queue_len(inputq)) {
dport = tipc_skb_peek_port(inputq, dport);
tsk = tipc_sk_lookup(net, dport);
@@ -1781,9 +1852,14 @@ void tipc_sk_rcv(struct net *net, struct sk_buff_head *inputq)
if (likely(tsk)) {
sk = &tsk->sk;
if (likely(spin_trylock_bh(&sk->sk_lock.slock))) {
- tipc_sk_enqueue(inputq, sk, dport);
+ tipc_sk_enqueue(inputq, sk, dport, &xmitq);
spin_unlock_bh(&sk->sk_lock.slock);
}
+ /* Send pending response/rejected messages, if any */
+ while ((skb = __skb_dequeue(&xmitq))) {
+ dnode = msg_destnode(buf_msg(skb));
+ tipc_node_xmit_skb(net, skb, dnode, dport);
+ }
sock_put(sk);
continue;
}
diff --git a/net/tipc/socket.h b/net/tipc/socket.h
index 4241f2206..06fb5944c 100644
--- a/net/tipc/socket.h
+++ b/net/tipc/socket.h
@@ -1,6 +1,6 @@
/* net/tipc/socket.h: Include file for TIPC socket code
*
- * Copyright (c) 2014-2015, Ericsson AB
+ * Copyright (c) 2014-2016, Ericsson AB
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -38,10 +38,17 @@
#include <net/sock.h>
#include <net/genetlink.h>
-#define TIPC_CONNACK_INTV 256
-#define TIPC_FLOWCTRL_WIN (TIPC_CONNACK_INTV * 2)
-#define TIPC_CONN_OVERLOAD_LIMIT ((TIPC_FLOWCTRL_WIN * 2 + 1) * \
- SKB_TRUESIZE(TIPC_MAX_USER_MSG_SIZE))
+/* Compatibility values for deprecated message based flow control */
+#define FLOWCTL_MSG_WIN 512
+#define FLOWCTL_MSG_LIM ((FLOWCTL_MSG_WIN * 2 + 1) * SKB_TRUESIZE(MAX_MSG_SIZE))
+
+#define FLOWCTL_BLK_SZ 1024
+
+/* Socket receive buffer sizes */
+#define RCVBUF_MIN (FLOWCTL_BLK_SZ * 512)
+#define RCVBUF_DEF (FLOWCTL_BLK_SZ * 1024 * 2)
+#define RCVBUF_MAX (FLOWCTL_BLK_SZ * 1024 * 16)
+
int tipc_socket_init(void);
void tipc_socket_stop(void);
void tipc_sk_rcv(struct net *net, struct sk_buff_head *inputq);
diff --git a/net/tipc/subscr.c b/net/tipc/subscr.c
index e6cb386fb..0dd02244e 100644
--- a/net/tipc/subscr.c
+++ b/net/tipc/subscr.c
@@ -302,7 +302,7 @@ static void tipc_subscrp_subscribe(struct net *net, struct tipc_subscr *s,
}
/* Handle one termination request for the subscriber */
-static void tipc_subscrb_shutdown_cb(int conid, void *usr_data)
+static void tipc_subscrb_release_cb(int conid, void *usr_data)
{
tipc_subscrb_delete((struct tipc_subscriber *)usr_data);
}
@@ -326,8 +326,7 @@ static void tipc_subscrb_rcv_cb(struct net *net, int conid,
return tipc_subscrp_cancel(s, subscriber);
}
- if (s)
- tipc_subscrp_subscribe(net, s, subscriber, swap);
+ tipc_subscrp_subscribe(net, s, subscriber, swap);
}
/* Handle one request to establish a new subscriber */
@@ -365,7 +364,7 @@ int tipc_topsrv_start(struct net *net)
topsrv->max_rcvbuf_size = sizeof(struct tipc_subscr);
topsrv->tipc_conn_recvmsg = tipc_subscrb_rcv_cb;
topsrv->tipc_conn_new = tipc_subscrb_connect_cb;
- topsrv->tipc_conn_shutdown = tipc_subscrb_shutdown_cb;
+ topsrv->tipc_conn_release = tipc_subscrb_release_cb;
strncpy(topsrv->name, name, strlen(name) + 1);
tn->topsrv = topsrv;
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 7748199b3..735362c26 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -953,7 +953,7 @@ fail:
return NULL;
}
-static int unix_mknod(struct dentry *dentry, struct path *path, umode_t mode,
+static int unix_mknod(struct dentry *dentry, const struct path *path, umode_t mode,
struct path *res)
{
int err;
diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index b5f1221f4..b96ac918e 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -61,6 +61,14 @@
* function will also cleanup rejected sockets, those that reach the connected
* state but leave it before they have been accepted.
*
+ * - Lock ordering for pending or accept queue sockets is:
+ *
+ * lock_sock(listener);
+ * lock_sock_nested(pending, SINGLE_DEPTH_NESTING);
+ *
+ * Using explicit nested locking keeps lockdep happy since normally only one
+ * lock of a given class may be taken at a time.
+ *
* - Sockets created by user action will be cleaned up when the user process
* calls close(2), causing our release implementation to be called. Our release
* implementation will perform some cleanup then drop the last reference so our
@@ -443,7 +451,7 @@ void vsock_pending_work(struct work_struct *work)
cleanup = true;
lock_sock(listener);
- lock_sock(sk);
+ lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
if (vsock_is_pending(sk)) {
vsock_remove_pending(listener, sk);
@@ -1292,7 +1300,7 @@ static int vsock_accept(struct socket *sock, struct socket *newsock, int flags)
if (connected) {
listener->sk_ack_backlog--;
- lock_sock(connected);
+ lock_sock_nested(connected, SINGLE_DEPTH_NESTING);
vconnected = vsock_sk(connected);
/* If the listener socket has received an error, then we should
diff --git a/net/vmw_vsock/vmci_transport.c b/net/vmw_vsock/vmci_transport.c
index 56214736f..4120b7a53 100644
--- a/net/vmw_vsock/vmci_transport.c
+++ b/net/vmw_vsock/vmci_transport.c
@@ -2051,7 +2051,7 @@ static u32 vmci_transport_get_local_cid(void)
return vmci_get_context_id();
}
-static struct vsock_transport vmci_transport = {
+static const struct vsock_transport vmci_transport = {
.init = vmci_transport_socket_init,
.destruct = vmci_transport_destruct,
.release = vmci_transport_release,
diff --git a/net/wireless/chan.c b/net/wireless/chan.c
index 59cabc9bc..da49c0b1f 100644
--- a/net/wireless/chan.c
+++ b/net/wireless/chan.c
@@ -739,7 +739,7 @@ static bool cfg80211_ir_permissive_chan(struct wiphy *wiphy,
* and thus fail the GO instantiation, consider only the interfaces of
* the current registered device.
*/
- list_for_each_entry(wdev, &rdev->wdev_list, list) {
+ list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) {
struct ieee80211_channel *other_chan = NULL;
int r1, r2;
@@ -768,7 +768,7 @@ static bool cfg80211_ir_permissive_chan(struct wiphy *wiphy,
if (chan == other_chan)
return true;
- if (chan->band != IEEE80211_BAND_5GHZ)
+ if (chan->band != NL80211_BAND_5GHZ)
continue;
r1 = cfg80211_get_unii(chan->center_freq);
diff --git a/net/wireless/core.c b/net/wireless/core.c
index c878045d1..ecca3896b 100644
--- a/net/wireless/core.c
+++ b/net/wireless/core.c
@@ -3,6 +3,7 @@
*
* Copyright 2006-2010 Johannes Berg <johannes@sipsolutions.net>
* Copyright 2013-2014 Intel Mobile Communications GmbH
+ * Copyright 2015 Intel Deutschland GmbH
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -157,7 +158,7 @@ int cfg80211_switch_netns(struct cfg80211_registered_device *rdev,
if (!(rdev->wiphy.flags & WIPHY_FLAG_NETNS_OK))
return -EOPNOTSUPP;
- list_for_each_entry(wdev, &rdev->wdev_list, list) {
+ list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) {
if (!wdev->netdev)
continue;
wdev->netdev->features &= ~NETIF_F_NETNS_LOCAL;
@@ -171,7 +172,8 @@ int cfg80211_switch_netns(struct cfg80211_registered_device *rdev,
/* failed -- clean up to old netns */
net = wiphy_net(&rdev->wiphy);
- list_for_each_entry_continue_reverse(wdev, &rdev->wdev_list,
+ list_for_each_entry_continue_reverse(wdev,
+ &rdev->wiphy.wdev_list,
list) {
if (!wdev->netdev)
continue;
@@ -230,7 +232,7 @@ void cfg80211_shutdown_all_interfaces(struct wiphy *wiphy)
ASSERT_RTNL();
- list_for_each_entry(wdev, &rdev->wdev_list, list) {
+ list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) {
if (wdev->netdev) {
dev_close(wdev->netdev);
continue;
@@ -298,7 +300,8 @@ void cfg80211_destroy_ifaces(struct cfg80211_registered_device *rdev)
kfree(item);
spin_unlock_irq(&rdev->destroy_list_lock);
- list_for_each_entry_safe(wdev, tmp, &rdev->wdev_list, list) {
+ list_for_each_entry_safe(wdev, tmp,
+ &rdev->wiphy.wdev_list, list) {
if (nlportid == wdev->owner_nlportid)
rdev_del_virtual_intf(rdev, wdev);
}
@@ -408,7 +411,7 @@ use_default_name:
dev_set_name(&rdev->wiphy.dev, PHY_NAME "%d", rdev->wiphy_idx);
}
- INIT_LIST_HEAD(&rdev->wdev_list);
+ INIT_LIST_HEAD(&rdev->wiphy.wdev_list);
INIT_LIST_HEAD(&rdev->beacon_registrations);
spin_lock_init(&rdev->beacon_registrations_lock);
spin_lock_init(&rdev->bss_lock);
@@ -555,7 +558,7 @@ int wiphy_register(struct wiphy *wiphy)
{
struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
int res;
- enum ieee80211_band band;
+ enum nl80211_band band;
struct ieee80211_supported_band *sband;
bool have_band = false;
int i;
@@ -624,6 +627,13 @@ int wiphy_register(struct wiphy *wiphy)
!rdev->ops->set_mac_acl)))
return -EINVAL;
+ /* assure only valid behaviours are flagged by driver
+ * hence subtract 2 as bit 0 is invalid.
+ */
+ if (WARN_ON(wiphy->bss_select_support &&
+ (wiphy->bss_select_support & ~(BIT(__NL80211_BSS_SELECT_ATTR_AFTER_LAST) - 2))))
+ return -EINVAL;
+
if (wiphy->addresses)
memcpy(wiphy->perm_addr, wiphy->addresses[0].addr, ETH_ALEN);
@@ -638,7 +648,7 @@ int wiphy_register(struct wiphy *wiphy)
return res;
/* sanity check supported bands/channels */
- for (band = 0; band < IEEE80211_NUM_BANDS; band++) {
+ for (band = 0; band < NUM_NL80211_BANDS; band++) {
sband = wiphy->bands[band];
if (!sband)
continue;
@@ -650,7 +660,7 @@ int wiphy_register(struct wiphy *wiphy)
* on 60GHz band, there are no legacy rates, so
* n_bitrates is 0
*/
- if (WARN_ON(band != IEEE80211_BAND_60GHZ &&
+ if (WARN_ON(band != NL80211_BAND_60GHZ &&
!sband->n_bitrates))
return -EINVAL;
@@ -660,7 +670,7 @@ int wiphy_register(struct wiphy *wiphy)
* global structure for that.
*/
if (cfg80211_disable_40mhz_24ghz &&
- band == IEEE80211_BAND_2GHZ &&
+ band == NL80211_BAND_2GHZ &&
sband->ht_cap.ht_supported) {
sband->ht_cap.cap &= ~IEEE80211_HT_CAP_SUP_WIDTH_20_40;
sband->ht_cap.cap &= ~IEEE80211_HT_CAP_SGI_40;
@@ -790,7 +800,7 @@ void wiphy_unregister(struct wiphy *wiphy)
nl80211_notify_wiphy(rdev, NL80211_CMD_DEL_WIPHY);
rdev->wiphy.registered = false;
- WARN_ON(!list_empty(&rdev->wdev_list));
+ WARN_ON(!list_empty(&rdev->wiphy.wdev_list));
/*
* First remove the hardware from everywhere, this makes
@@ -1012,7 +1022,7 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb,
spin_lock_init(&wdev->mgmt_registrations_lock);
wdev->identifier = ++rdev->wdev_id;
- list_add_rcu(&wdev->list, &rdev->wdev_list);
+ list_add_rcu(&wdev->list, &rdev->wiphy.wdev_list);
rdev->devlist_generation++;
/* can only change netns with wiphy */
dev->features |= NETIF_F_NETNS_LOCAL;
diff --git a/net/wireless/core.h b/net/wireless/core.h
index 022ccad06..025b7a5d5 100644
--- a/net/wireless/core.h
+++ b/net/wireless/core.h
@@ -50,10 +50,9 @@ struct cfg80211_registered_device {
/* wiphy index, internal only */
int wiphy_idx;
- /* associated wireless interfaces, protected by rtnl or RCU */
- struct list_head wdev_list;
+ /* protected by RTNL */
int devlist_generation, wdev_id;
- int opencount; /* also protected by devlist_mtx */
+ int opencount;
wait_queue_head_t dev_wait;
struct list_head beacon_registrations;
@@ -214,6 +213,7 @@ struct cfg80211_event {
const u8 *resp_ie;
size_t req_ie_len;
size_t resp_ie_len;
+ struct cfg80211_bss *bss;
u16 status;
} cr;
struct {
diff --git a/net/wireless/debugfs.c b/net/wireless/debugfs.c
index 454157717..5d453916a 100644
--- a/net/wireless/debugfs.c
+++ b/net/wireless/debugfs.c
@@ -69,7 +69,7 @@ static ssize_t ht40allow_map_read(struct file *file,
struct wiphy *wiphy = file->private_data;
char *buf;
unsigned int offset = 0, buf_size = PAGE_SIZE, i, r;
- enum ieee80211_band band;
+ enum nl80211_band band;
struct ieee80211_supported_band *sband;
buf = kzalloc(buf_size, GFP_KERNEL);
@@ -78,7 +78,7 @@ static ssize_t ht40allow_map_read(struct file *file,
rtnl_lock();
- for (band = 0; band < IEEE80211_NUM_BANDS; band++) {
+ for (band = 0; band < NUM_NL80211_BANDS; band++) {
sband = wiphy->bands[band];
if (!sband)
continue;
diff --git a/net/wireless/ibss.c b/net/wireless/ibss.c
index 4c55fab9b..4a4dda53b 100644
--- a/net/wireless/ibss.c
+++ b/net/wireless/ibss.c
@@ -104,7 +104,7 @@ static int __cfg80211_join_ibss(struct cfg80211_registered_device *rdev,
struct ieee80211_supported_band *sband =
rdev->wiphy.bands[params->chandef.chan->band];
int j;
- u32 flag = params->chandef.chan->band == IEEE80211_BAND_5GHZ ?
+ u32 flag = params->chandef.chan->band == NL80211_BAND_5GHZ ?
IEEE80211_RATE_MANDATORY_A :
IEEE80211_RATE_MANDATORY_B;
@@ -236,7 +236,7 @@ int cfg80211_ibss_wext_join(struct cfg80211_registered_device *rdev,
struct wireless_dev *wdev)
{
struct cfg80211_cached_keys *ck = NULL;
- enum ieee80211_band band;
+ enum nl80211_band band;
int i, err;
ASSERT_WDEV_LOCK(wdev);
@@ -248,7 +248,7 @@ int cfg80211_ibss_wext_join(struct cfg80211_registered_device *rdev,
if (!wdev->wext.ibss.chandef.chan) {
struct ieee80211_channel *new_chan = NULL;
- for (band = 0; band < IEEE80211_NUM_BANDS; band++) {
+ for (band = 0; band < NUM_NL80211_BANDS; band++) {
struct ieee80211_supported_band *sband;
struct ieee80211_channel *chan;
diff --git a/net/wireless/mesh.c b/net/wireless/mesh.c
index 092300b30..fa2066b56 100644
--- a/net/wireless/mesh.c
+++ b/net/wireless/mesh.c
@@ -128,9 +128,9 @@ int __cfg80211_join_mesh(struct cfg80211_registered_device *rdev,
if (!setup->chandef.chan) {
/* if we don't have that either, use the first usable channel */
- enum ieee80211_band band;
+ enum nl80211_band band;
- for (band = 0; band < IEEE80211_NUM_BANDS; band++) {
+ for (band = 0; band < NUM_NL80211_BANDS; band++) {
struct ieee80211_supported_band *sband;
struct ieee80211_channel *chan;
int i;
diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c
index ff328250b..c284d883c 100644
--- a/net/wireless/mlme.c
+++ b/net/wireless/mlme.c
@@ -726,7 +726,7 @@ void cfg80211_dfs_channels_update_work(struct work_struct *work)
wiphy = &rdev->wiphy;
rtnl_lock();
- for (bandid = 0; bandid < IEEE80211_NUM_BANDS; bandid++) {
+ for (bandid = 0; bandid < NUM_NL80211_BANDS; bandid++) {
sband = wiphy->bands[bandid];
if (!sband)
continue;
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 056a73078..7d7228390 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -103,7 +103,7 @@ __cfg80211_wdev_from_attrs(struct net *netns, struct nlattr **attrs)
if (have_wdev_id && rdev->wiphy_idx != wiphy_idx)
continue;
- list_for_each_entry(wdev, &rdev->wdev_list, list) {
+ list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) {
if (have_ifidx && wdev->netdev &&
wdev->netdev->ifindex == ifidx) {
result = wdev;
@@ -149,7 +149,7 @@ __cfg80211_rdev_from_attrs(struct net *netns, struct nlattr **attrs)
tmp = cfg80211_rdev_by_wiphy_idx(wdev_id >> 32);
if (tmp) {
/* make sure wdev exists */
- list_for_each_entry(wdev, &tmp->wdev_list, list) {
+ list_for_each_entry(wdev, &tmp->wiphy.wdev_list, list) {
if (wdev->identifier != (u32)wdev_id)
continue;
found = true;
@@ -402,6 +402,8 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
[NL80211_ATTR_SCHED_SCAN_DELAY] = { .type = NLA_U32 },
[NL80211_ATTR_REG_INDOOR] = { .type = NLA_FLAG },
[NL80211_ATTR_PBSS] = { .type = NLA_FLAG },
+ [NL80211_ATTR_BSS_SELECT] = { .type = NLA_NESTED },
+ [NL80211_ATTR_STA_SUPPORT_P2P_PS] = { .type = NLA_U8 },
};
/* policy for the key attributes */
@@ -486,6 +488,15 @@ nl80211_plan_policy[NL80211_SCHED_SCAN_PLAN_MAX + 1] = {
[NL80211_SCHED_SCAN_PLAN_ITERATIONS] = { .type = NLA_U32 },
};
+static const struct nla_policy
+nl80211_bss_select_policy[NL80211_BSS_SELECT_ATTR_MAX + 1] = {
+ [NL80211_BSS_SELECT_ATTR_RSSI] = { .type = NLA_FLAG },
+ [NL80211_BSS_SELECT_ATTR_BAND_PREF] = { .type = NLA_U32 },
+ [NL80211_BSS_SELECT_ATTR_RSSI_ADJUST] = {
+ .len = sizeof(struct nl80211_bss_select_rssi_adjust)
+ },
+};
+
static int nl80211_prepare_wdev_dump(struct sk_buff *skb,
struct netlink_callback *cb,
struct cfg80211_registered_device **rdev,
@@ -524,7 +535,7 @@ static int nl80211_prepare_wdev_dump(struct sk_buff *skb,
*rdev = wiphy_to_rdev(wiphy);
*wdev = NULL;
- list_for_each_entry(tmp, &(*rdev)->wdev_list, list) {
+ list_for_each_entry(tmp, &(*rdev)->wiphy.wdev_list, list) {
if (tmp->identifier == cb->args[1]) {
*wdev = tmp;
break;
@@ -1266,7 +1277,7 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev,
struct nlattr *nl_bands, *nl_band;
struct nlattr *nl_freqs, *nl_freq;
struct nlattr *nl_cmds;
- enum ieee80211_band band;
+ enum nl80211_band band;
struct ieee80211_channel *chan;
int i;
const struct ieee80211_txrx_stypes *mgmt_stypes =
@@ -1399,7 +1410,7 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev,
goto nla_put_failure;
for (band = state->band_start;
- band < IEEE80211_NUM_BANDS; band++) {
+ band < NUM_NL80211_BANDS; band++) {
struct ieee80211_supported_band *sband;
sband = rdev->wiphy.bands[band];
@@ -1461,7 +1472,7 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev,
}
nla_nest_end(msg, nl_bands);
- if (band < IEEE80211_NUM_BANDS)
+ if (band < NUM_NL80211_BANDS)
state->band_start = band + 1;
else
state->band_start = 0;
@@ -1731,6 +1742,25 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev,
rdev->wiphy.ext_features))
goto nla_put_failure;
+ if (rdev->wiphy.bss_select_support) {
+ struct nlattr *nested;
+ u32 bss_select_support = rdev->wiphy.bss_select_support;
+
+ nested = nla_nest_start(msg, NL80211_ATTR_BSS_SELECT);
+ if (!nested)
+ goto nla_put_failure;
+
+ i = 0;
+ while (bss_select_support) {
+ if ((bss_select_support & 1) &&
+ nla_put_flag(msg, i))
+ goto nla_put_failure;
+ i++;
+ bss_select_support >>= 1;
+ }
+ nla_nest_end(msg, nested);
+ }
+
/* done */
state->split_start = 0;
break;
@@ -2399,7 +2429,8 @@ static int nl80211_send_iface(struct sk_buff *msg, u32 portid, u32 seq, int flag
if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) ||
nla_put_u32(msg, NL80211_ATTR_IFTYPE, wdev->iftype) ||
- nla_put_u64(msg, NL80211_ATTR_WDEV, wdev_id(wdev)) ||
+ nla_put_u64_64bit(msg, NL80211_ATTR_WDEV, wdev_id(wdev),
+ NL80211_ATTR_PAD) ||
nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, wdev_address(wdev)) ||
nla_put_u32(msg, NL80211_ATTR_GENERATION,
rdev->devlist_generation ^
@@ -2459,7 +2490,7 @@ static int nl80211_dump_interface(struct sk_buff *skb, struct netlink_callback *
}
if_idx = 0;
- list_for_each_entry(wdev, &rdev->wdev_list, list) {
+ list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) {
if (if_idx < if_start) {
if_idx++;
continue;
@@ -2731,7 +2762,7 @@ static int nl80211_new_interface(struct sk_buff *skb, struct genl_info *info)
spin_lock_init(&wdev->mgmt_registrations_lock);
wdev->identifier = ++rdev->wdev_id;
- list_add_rcu(&wdev->list, &rdev->wdev_list);
+ list_add_rcu(&wdev->list, &rdev->wiphy.wdev_list);
rdev->devlist_generation++;
break;
default:
@@ -3267,7 +3298,7 @@ static bool nl80211_get_ap_channel(struct cfg80211_registered_device *rdev,
struct wireless_dev *wdev;
bool ret = false;
- list_for_each_entry(wdev, &rdev->wdev_list, list) {
+ list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) {
if (wdev->iftype != NL80211_IFTYPE_AP &&
wdev->iftype != NL80211_IFTYPE_P2P_GO)
continue;
@@ -3456,16 +3487,16 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info)
params.smps_mode = NL80211_SMPS_OFF;
}
+ params.pbss = nla_get_flag(info->attrs[NL80211_ATTR_PBSS]);
+ if (params.pbss && !rdev->wiphy.bands[NL80211_BAND_60GHZ])
+ return -EOPNOTSUPP;
+
if (info->attrs[NL80211_ATTR_ACL_POLICY]) {
params.acl = parse_acl_data(&rdev->wiphy, info);
if (IS_ERR(params.acl))
return PTR_ERR(params.acl);
}
- params.pbss = nla_get_flag(info->attrs[NL80211_ATTR_PBSS]);
- if (params.pbss && !rdev->wiphy.bands[IEEE80211_BAND_60GHZ])
- return -EOPNOTSUPP;
-
wdev_lock(wdev);
err = rdev_start_ap(rdev, dev, &params);
if (!err) {
@@ -3724,11 +3755,18 @@ static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid,
goto nla_put_failure;
#define PUT_SINFO(attr, memb, type) do { \
- if (sinfo->filled & BIT(NL80211_STA_INFO_ ## attr) && \
+ BUILD_BUG_ON(sizeof(type) == sizeof(u64)); \
+ if (sinfo->filled & (1ULL << NL80211_STA_INFO_ ## attr) && \
nla_put_ ## type(msg, NL80211_STA_INFO_ ## attr, \
sinfo->memb)) \
goto nla_put_failure; \
} while (0)
+#define PUT_SINFO_U64(attr, memb) do { \
+ if (sinfo->filled & (1ULL << NL80211_STA_INFO_ ## attr) && \
+ nla_put_u64_64bit(msg, NL80211_STA_INFO_ ## attr, \
+ sinfo->memb, NL80211_STA_INFO_PAD)) \
+ goto nla_put_failure; \
+ } while (0)
PUT_SINFO(CONNECTED_TIME, connected_time, u32);
PUT_SINFO(INACTIVE_TIME, inactive_time, u32);
@@ -3745,11 +3783,12 @@ static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid,
(u32)sinfo->tx_bytes))
goto nla_put_failure;
- PUT_SINFO(RX_BYTES64, rx_bytes, u64);
- PUT_SINFO(TX_BYTES64, tx_bytes, u64);
+ PUT_SINFO_U64(RX_BYTES64, rx_bytes);
+ PUT_SINFO_U64(TX_BYTES64, tx_bytes);
PUT_SINFO(LLID, llid, u16);
PUT_SINFO(PLID, plid, u16);
PUT_SINFO(PLINK_STATE, plink_state, u8);
+ PUT_SINFO_U64(RX_DURATION, rx_duration);
switch (rdev->wiphy.signal_type) {
case CFG80211_SIGNAL_TYPE_MBM:
@@ -3817,12 +3856,13 @@ static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid,
&sinfo->sta_flags))
goto nla_put_failure;
- PUT_SINFO(T_OFFSET, t_offset, u64);
- PUT_SINFO(RX_DROP_MISC, rx_dropped_misc, u64);
- PUT_SINFO(BEACON_RX, rx_beacon, u64);
+ PUT_SINFO_U64(T_OFFSET, t_offset);
+ PUT_SINFO_U64(RX_DROP_MISC, rx_dropped_misc);
+ PUT_SINFO_U64(BEACON_RX, rx_beacon);
PUT_SINFO(BEACON_SIGNAL_AVG, rx_beacon_signal_avg, u8);
#undef PUT_SINFO
+#undef PUT_SINFO_U64
if (sinfo->filled & BIT(NL80211_STA_INFO_TID_STATS)) {
struct nlattr *tidsattr;
@@ -3845,19 +3885,19 @@ static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid,
if (!tidattr)
goto nla_put_failure;
-#define PUT_TIDVAL(attr, memb, type) do { \
+#define PUT_TIDVAL_U64(attr, memb) do { \
if (tidstats->filled & BIT(NL80211_TID_STATS_ ## attr) && \
- nla_put_ ## type(msg, NL80211_TID_STATS_ ## attr, \
- tidstats->memb)) \
+ nla_put_u64_64bit(msg, NL80211_TID_STATS_ ## attr, \
+ tidstats->memb, NL80211_TID_STATS_PAD)) \
goto nla_put_failure; \
} while (0)
- PUT_TIDVAL(RX_MSDU, rx_msdu, u64);
- PUT_TIDVAL(TX_MSDU, tx_msdu, u64);
- PUT_TIDVAL(TX_MSDU_RETRIES, tx_msdu_retries, u64);
- PUT_TIDVAL(TX_MSDU_FAILED, tx_msdu_failed, u64);
+ PUT_TIDVAL_U64(RX_MSDU, rx_msdu);
+ PUT_TIDVAL_U64(TX_MSDU, tx_msdu);
+ PUT_TIDVAL_U64(TX_MSDU_RETRIES, tx_msdu_retries);
+ PUT_TIDVAL_U64(TX_MSDU_FAILED, tx_msdu_failed);
-#undef PUT_TIDVAL
+#undef PUT_TIDVAL_U64
nla_nest_end(msg, tidattr);
}
@@ -3977,6 +4017,10 @@ int cfg80211_check_station_change(struct wiphy *wiphy,
statype != CFG80211_STA_AP_CLIENT_UNASSOC)
return -EINVAL;
+ if (params->support_p2p_ps != -1 &&
+ statype != CFG80211_STA_AP_CLIENT_UNASSOC)
+ return -EINVAL;
+
if (params->aid &&
!(params->sta_flags_set & BIT(NL80211_STA_FLAG_TDLS_PEER)) &&
statype != CFG80211_STA_AP_CLIENT_UNASSOC)
@@ -4270,6 +4314,18 @@ static int nl80211_set_station(struct sk_buff *skb, struct genl_info *info)
else
params.listen_interval = -1;
+ if (info->attrs[NL80211_ATTR_STA_SUPPORT_P2P_PS]) {
+ u8 tmp;
+
+ tmp = nla_get_u8(info->attrs[NL80211_ATTR_STA_SUPPORT_P2P_PS]);
+ if (tmp >= NUM_NL80211_P2P_PS_STATUS)
+ return -EINVAL;
+
+ params.support_p2p_ps = tmp;
+ } else {
+ params.support_p2p_ps = -1;
+ }
+
if (!info->attrs[NL80211_ATTR_MAC])
return -EINVAL;
@@ -4393,6 +4449,23 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info)
params.listen_interval =
nla_get_u16(info->attrs[NL80211_ATTR_STA_LISTEN_INTERVAL]);
+ if (info->attrs[NL80211_ATTR_STA_SUPPORT_P2P_PS]) {
+ u8 tmp;
+
+ tmp = nla_get_u8(info->attrs[NL80211_ATTR_STA_SUPPORT_P2P_PS]);
+ if (tmp >= NUM_NL80211_P2P_PS_STATUS)
+ return -EINVAL;
+
+ params.support_p2p_ps = tmp;
+ } else {
+ /*
+ * if not specified, assume it's supported for P2P GO interface,
+ * and is NOT supported for AP interface
+ */
+ params.support_p2p_ps =
+ dev->ieee80211_ptr->iftype == NL80211_IFTYPE_P2P_GO;
+ }
+
if (info->attrs[NL80211_ATTR_PEER_AID])
params.aid = nla_get_u16(info->attrs[NL80211_ATTR_PEER_AID]);
else
@@ -5758,6 +5831,73 @@ static int validate_scan_freqs(struct nlattr *freqs)
return n_channels;
}
+static bool is_band_valid(struct wiphy *wiphy, enum nl80211_band b)
+{
+ return b < NUM_NL80211_BANDS && wiphy->bands[b];
+}
+
+static int parse_bss_select(struct nlattr *nla, struct wiphy *wiphy,
+ struct cfg80211_bss_selection *bss_select)
+{
+ struct nlattr *attr[NL80211_BSS_SELECT_ATTR_MAX + 1];
+ struct nlattr *nest;
+ int err;
+ bool found = false;
+ int i;
+
+ /* only process one nested attribute */
+ nest = nla_data(nla);
+ if (!nla_ok(nest, nla_len(nest)))
+ return -EINVAL;
+
+ err = nla_parse(attr, NL80211_BSS_SELECT_ATTR_MAX, nla_data(nest),
+ nla_len(nest), nl80211_bss_select_policy);
+ if (err)
+ return err;
+
+ /* only one attribute may be given */
+ for (i = 0; i <= NL80211_BSS_SELECT_ATTR_MAX; i++) {
+ if (attr[i]) {
+ if (found)
+ return -EINVAL;
+ found = true;
+ }
+ }
+
+ bss_select->behaviour = __NL80211_BSS_SELECT_ATTR_INVALID;
+
+ if (attr[NL80211_BSS_SELECT_ATTR_RSSI])
+ bss_select->behaviour = NL80211_BSS_SELECT_ATTR_RSSI;
+
+ if (attr[NL80211_BSS_SELECT_ATTR_BAND_PREF]) {
+ bss_select->behaviour = NL80211_BSS_SELECT_ATTR_BAND_PREF;
+ bss_select->param.band_pref =
+ nla_get_u32(attr[NL80211_BSS_SELECT_ATTR_BAND_PREF]);
+ if (!is_band_valid(wiphy, bss_select->param.band_pref))
+ return -EINVAL;
+ }
+
+ if (attr[NL80211_BSS_SELECT_ATTR_RSSI_ADJUST]) {
+ struct nl80211_bss_select_rssi_adjust *adj_param;
+
+ adj_param = nla_data(attr[NL80211_BSS_SELECT_ATTR_RSSI_ADJUST]);
+ bss_select->behaviour = NL80211_BSS_SELECT_ATTR_RSSI_ADJUST;
+ bss_select->param.adjust.band = adj_param->band;
+ bss_select->param.adjust.delta = adj_param->delta;
+ if (!is_band_valid(wiphy, bss_select->param.adjust.band))
+ return -EINVAL;
+ }
+
+ /* user-space did not provide behaviour attribute */
+ if (bss_select->behaviour == __NL80211_BSS_SELECT_ATTR_INVALID)
+ return -EINVAL;
+
+ if (!(wiphy->bss_select_support & BIT(bss_select->behaviour)))
+ return -EINVAL;
+
+ return 0;
+}
+
static int nl80211_parse_random_mac(struct nlattr **attrs,
u8 *mac_addr, u8 *mac_addr_mask)
{
@@ -5888,10 +6028,10 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info)
i++;
}
} else {
- enum ieee80211_band band;
+ enum nl80211_band band;
/* all channels */
- for (band = 0; band < IEEE80211_NUM_BANDS; band++) {
+ for (band = 0; band < NUM_NL80211_BANDS; band++) {
int j;
if (!wiphy->bands[band])
continue;
@@ -5936,7 +6076,7 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info)
request->ie_len);
}
- for (i = 0; i < IEEE80211_NUM_BANDS; i++)
+ for (i = 0; i < NUM_NL80211_BANDS; i++)
if (wiphy->bands[i])
request->rates[i] =
(1 << wiphy->bands[i]->n_bitrates) - 1;
@@ -5945,9 +6085,9 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info)
nla_for_each_nested(attr,
info->attrs[NL80211_ATTR_SCAN_SUPP_RATES],
tmp) {
- enum ieee80211_band band = nla_type(attr);
+ enum nl80211_band band = nla_type(attr);
- if (band < 0 || band >= IEEE80211_NUM_BANDS) {
+ if (band < 0 || band >= NUM_NL80211_BANDS) {
err = -EINVAL;
goto out_free;
}
@@ -5996,6 +6136,12 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info)
request->no_cck =
nla_get_flag(info->attrs[NL80211_ATTR_TX_NO_CCK_RATE]);
+ if (info->attrs[NL80211_ATTR_MAC])
+ memcpy(request->bssid, nla_data(info->attrs[NL80211_ATTR_MAC]),
+ ETH_ALEN);
+ else
+ eth_broadcast_addr(request->bssid);
+
request->wdev = wdev;
request->wiphy = &rdev->wiphy;
request->scan_start = jiffies;
@@ -6129,7 +6275,7 @@ nl80211_parse_sched_scan(struct wiphy *wiphy, struct wireless_dev *wdev,
struct cfg80211_sched_scan_request *request;
struct nlattr *attr;
int err, tmp, n_ssids = 0, n_match_sets = 0, n_channels, i, n_plans = 0;
- enum ieee80211_band band;
+ enum nl80211_band band;
size_t ie_len;
struct nlattr *tb[NL80211_SCHED_SCAN_MATCH_ATTR_MAX + 1];
s32 default_match_rssi = NL80211_SCAN_RSSI_THOLD_OFF;
@@ -6294,7 +6440,7 @@ nl80211_parse_sched_scan(struct wiphy *wiphy, struct wireless_dev *wdev,
}
} else {
/* all channels */
- for (band = 0; band < IEEE80211_NUM_BANDS; band++) {
+ for (band = 0; band < NUM_NL80211_BANDS; band++) {
int j;
if (!wiphy->bands[band])
continue;
@@ -6738,7 +6884,8 @@ static int nl80211_send_bss(struct sk_buff *msg, struct netlink_callback *cb,
if (wdev->netdev &&
nla_put_u32(msg, NL80211_ATTR_IFINDEX, wdev->netdev->ifindex))
goto nla_put_failure;
- if (nla_put_u64(msg, NL80211_ATTR_WDEV, wdev_id(wdev)))
+ if (nla_put_u64_64bit(msg, NL80211_ATTR_WDEV, wdev_id(wdev),
+ NL80211_ATTR_PAD))
goto nla_put_failure;
bss = nla_nest_start(msg, NL80211_ATTR_BSS);
@@ -6759,7 +6906,8 @@ static int nl80211_send_bss(struct sk_buff *msg, struct netlink_callback *cb,
*/
ies = rcu_dereference(res->ies);
if (ies) {
- if (nla_put_u64(msg, NL80211_BSS_TSF, ies->tsf))
+ if (nla_put_u64_64bit(msg, NL80211_BSS_TSF, ies->tsf,
+ NL80211_BSS_PAD))
goto fail_unlock_rcu;
if (ies->len && nla_put(msg, NL80211_BSS_INFORMATION_ELEMENTS,
ies->len, ies->data))
@@ -6769,7 +6917,8 @@ static int nl80211_send_bss(struct sk_buff *msg, struct netlink_callback *cb,
/* and this pointer is always (unless driver didn't know) beacon data */
ies = rcu_dereference(res->beacon_ies);
if (ies && ies->from_beacon) {
- if (nla_put_u64(msg, NL80211_BSS_BEACON_TSF, ies->tsf))
+ if (nla_put_u64_64bit(msg, NL80211_BSS_BEACON_TSF, ies->tsf,
+ NL80211_BSS_PAD))
goto fail_unlock_rcu;
if (ies->len && nla_put(msg, NL80211_BSS_BEACON_IES,
ies->len, ies->data))
@@ -6788,8 +6937,8 @@ static int nl80211_send_bss(struct sk_buff *msg, struct netlink_callback *cb,
goto nla_put_failure;
if (intbss->ts_boottime &&
- nla_put_u64(msg, NL80211_BSS_LAST_SEEN_BOOTTIME,
- intbss->ts_boottime))
+ nla_put_u64_64bit(msg, NL80211_BSS_LAST_SEEN_BOOTTIME,
+ intbss->ts_boottime, NL80211_BSS_PAD))
goto nla_put_failure;
switch (rdev->wiphy.signal_type) {
@@ -6909,28 +7058,28 @@ static int nl80211_send_survey(struct sk_buff *msg, u32 portid, u32 seq,
nla_put_flag(msg, NL80211_SURVEY_INFO_IN_USE))
goto nla_put_failure;
if ((survey->filled & SURVEY_INFO_TIME) &&
- nla_put_u64(msg, NL80211_SURVEY_INFO_TIME,
- survey->time))
+ nla_put_u64_64bit(msg, NL80211_SURVEY_INFO_TIME,
+ survey->time, NL80211_SURVEY_INFO_PAD))
goto nla_put_failure;
if ((survey->filled & SURVEY_INFO_TIME_BUSY) &&
- nla_put_u64(msg, NL80211_SURVEY_INFO_TIME_BUSY,
- survey->time_busy))
+ nla_put_u64_64bit(msg, NL80211_SURVEY_INFO_TIME_BUSY,
+ survey->time_busy, NL80211_SURVEY_INFO_PAD))
goto nla_put_failure;
if ((survey->filled & SURVEY_INFO_TIME_EXT_BUSY) &&
- nla_put_u64(msg, NL80211_SURVEY_INFO_TIME_EXT_BUSY,
- survey->time_ext_busy))
+ nla_put_u64_64bit(msg, NL80211_SURVEY_INFO_TIME_EXT_BUSY,
+ survey->time_ext_busy, NL80211_SURVEY_INFO_PAD))
goto nla_put_failure;
if ((survey->filled & SURVEY_INFO_TIME_RX) &&
- nla_put_u64(msg, NL80211_SURVEY_INFO_TIME_RX,
- survey->time_rx))
+ nla_put_u64_64bit(msg, NL80211_SURVEY_INFO_TIME_RX,
+ survey->time_rx, NL80211_SURVEY_INFO_PAD))
goto nla_put_failure;
if ((survey->filled & SURVEY_INFO_TIME_TX) &&
- nla_put_u64(msg, NL80211_SURVEY_INFO_TIME_TX,
- survey->time_tx))
+ nla_put_u64_64bit(msg, NL80211_SURVEY_INFO_TIME_TX,
+ survey->time_tx, NL80211_SURVEY_INFO_PAD))
goto nla_put_failure;
if ((survey->filled & SURVEY_INFO_TIME_SCAN) &&
- nla_put_u64(msg, NL80211_SURVEY_INFO_TIME_SCAN,
- survey->time_scan))
+ nla_put_u64_64bit(msg, NL80211_SURVEY_INFO_TIME_SCAN,
+ survey->time_scan, NL80211_SURVEY_INFO_PAD))
goto nla_put_failure;
nla_nest_end(msg, infoattr);
@@ -7402,14 +7551,14 @@ static int nl80211_disassociate(struct sk_buff *skb, struct genl_info *info)
static bool
nl80211_parse_mcast_rate(struct cfg80211_registered_device *rdev,
- int mcast_rate[IEEE80211_NUM_BANDS],
+ int mcast_rate[NUM_NL80211_BANDS],
int rateval)
{
struct wiphy *wiphy = &rdev->wiphy;
bool found = false;
int band, i;
- for (band = 0; band < IEEE80211_NUM_BANDS; band++) {
+ for (band = 0; band < NUM_NL80211_BANDS; band++) {
struct ieee80211_supported_band *sband;
sband = wiphy->bands[band];
@@ -7589,7 +7738,7 @@ static int nl80211_set_mcast_rate(struct sk_buff *skb, struct genl_info *info)
{
struct cfg80211_registered_device *rdev = info->user_ptr[0];
struct net_device *dev = info->user_ptr[1];
- int mcast_rate[IEEE80211_NUM_BANDS];
+ int mcast_rate[NUM_NL80211_BANDS];
u32 nla_rate;
int err;
@@ -7650,8 +7799,8 @@ __cfg80211_alloc_vendor_skb(struct cfg80211_registered_device *rdev,
}
if (wdev) {
- if (nla_put_u64(skb, NL80211_ATTR_WDEV,
- wdev_id(wdev)))
+ if (nla_put_u64_64bit(skb, NL80211_ATTR_WDEV,
+ wdev_id(wdev), NL80211_ATTR_PAD))
goto nla_put_failure;
if (wdev->netdev &&
nla_put_u32(skb, NL80211_ATTR_IFINDEX,
@@ -7922,6 +8071,10 @@ static int nl80211_connect(struct sk_buff *skb, struct genl_info *info)
connect.mfp = NL80211_MFP_NO;
}
+ if (info->attrs[NL80211_ATTR_PREV_BSSID])
+ connect.prev_bssid =
+ nla_data(info->attrs[NL80211_ATTR_PREV_BSSID]);
+
if (info->attrs[NL80211_ATTR_WIPHY_FREQ]) {
connect.channel = nl80211_get_valid_chan(
wiphy, info->attrs[NL80211_ATTR_WIPHY_FREQ]);
@@ -7990,13 +8143,29 @@ static int nl80211_connect(struct sk_buff *skb, struct genl_info *info)
}
connect.pbss = nla_get_flag(info->attrs[NL80211_ATTR_PBSS]);
- if (connect.pbss && !rdev->wiphy.bands[IEEE80211_BAND_60GHZ]) {
+ if (connect.pbss && !rdev->wiphy.bands[NL80211_BAND_60GHZ]) {
kzfree(connkeys);
return -EOPNOTSUPP;
}
+ if (info->attrs[NL80211_ATTR_BSS_SELECT]) {
+ /* bss selection makes no sense if bssid is set */
+ if (connect.bssid) {
+ kzfree(connkeys);
+ return -EINVAL;
+ }
+
+ err = parse_bss_select(info->attrs[NL80211_ATTR_BSS_SELECT],
+ wiphy, &connect.bss_select);
+ if (err) {
+ kzfree(connkeys);
+ return err;
+ }
+ }
+
wdev_lock(dev->ieee80211_ptr);
- err = cfg80211_connect(rdev, dev, &connect, connkeys, NULL);
+ err = cfg80211_connect(rdev, dev, &connect, connkeys,
+ connect.prev_bssid);
wdev_unlock(dev->ieee80211_ptr);
if (err)
kzfree(connkeys);
@@ -8224,7 +8393,8 @@ static int nl80211_remain_on_channel(struct sk_buff *skb,
if (err)
goto free_msg;
- if (nla_put_u64(msg, NL80211_ATTR_COOKIE, cookie))
+ if (nla_put_u64_64bit(msg, NL80211_ATTR_COOKIE, cookie,
+ NL80211_ATTR_PAD))
goto nla_put_failure;
genlmsg_end(msg, hdr);
@@ -8394,7 +8564,7 @@ static int nl80211_set_tx_bitrate_mask(struct sk_buff *skb,
memset(&mask, 0, sizeof(mask));
/* Default to all rates enabled */
- for (i = 0; i < IEEE80211_NUM_BANDS; i++) {
+ for (i = 0; i < NUM_NL80211_BANDS; i++) {
sband = rdev->wiphy.bands[i];
if (!sband)
@@ -8418,14 +8588,14 @@ static int nl80211_set_tx_bitrate_mask(struct sk_buff *skb,
/*
* The nested attribute uses enum nl80211_band as the index. This maps
- * directly to the enum ieee80211_band values used in cfg80211.
+ * directly to the enum nl80211_band values used in cfg80211.
*/
BUILD_BUG_ON(NL80211_MAX_SUPP_HT_RATES > IEEE80211_HT_MCS_MASK_LEN * 8);
nla_for_each_nested(tx_rates, info->attrs[NL80211_ATTR_TX_RATES], rem) {
- enum ieee80211_band band = nla_type(tx_rates);
+ enum nl80211_band band = nla_type(tx_rates);
int err;
- if (band < 0 || band >= IEEE80211_NUM_BANDS)
+ if (band < 0 || band >= NUM_NL80211_BANDS)
return -EINVAL;
sband = rdev->wiphy.bands[band];
if (sband == NULL)
@@ -8636,7 +8806,8 @@ static int nl80211_tx_mgmt(struct sk_buff *skb, struct genl_info *info)
goto free_msg;
if (msg) {
- if (nla_put_u64(msg, NL80211_ATTR_COOKIE, cookie))
+ if (nla_put_u64_64bit(msg, NL80211_ATTR_COOKIE, cookie,
+ NL80211_ATTR_PAD))
goto nla_put_failure;
genlmsg_end(msg, hdr);
@@ -9922,7 +10093,8 @@ static int nl80211_probe_client(struct sk_buff *skb,
if (err)
goto free_msg;
- if (nla_put_u64(msg, NL80211_ATTR_COOKIE, cookie))
+ if (nla_put_u64_64bit(msg, NL80211_ATTR_COOKIE, cookie,
+ NL80211_ATTR_PAD))
goto nla_put_failure;
genlmsg_end(msg, hdr);
@@ -10220,7 +10392,7 @@ static int nl80211_prepare_vendor_dump(struct sk_buff *skb,
*wdev = NULL;
if (cb->args[1]) {
- list_for_each_entry(tmp, &(*rdev)->wdev_list, list) {
+ list_for_each_entry(tmp, &wiphy->wdev_list, list) {
if (tmp->identifier == cb->args[1] - 1) {
*wdev = tmp;
break;
@@ -10347,8 +10519,9 @@ static int nl80211_vendor_cmd_dump(struct sk_buff *skb,
break;
if (nla_put_u32(skb, NL80211_ATTR_WIPHY, rdev->wiphy_idx) ||
- (wdev && nla_put_u64(skb, NL80211_ATTR_WDEV,
- wdev_id(wdev)))) {
+ (wdev && nla_put_u64_64bit(skb, NL80211_ATTR_WDEV,
+ wdev_id(wdev),
+ NL80211_ATTR_PAD))) {
genlmsg_cancel(skb, hdr);
break;
}
@@ -10590,7 +10763,7 @@ static int nl80211_tdls_channel_switch(struct sk_buff *skb,
* section 10.22.6.2.1. Disallow 5/10Mhz channels as well for now, the
* specification is not defined for them.
*/
- if (chandef.chan->band == IEEE80211_BAND_2GHZ &&
+ if (chandef.chan->band == NL80211_BAND_2GHZ &&
chandef.width != NL80211_CHAN_WIDTH_20_NOHT &&
chandef.width != NL80211_CHAN_WIDTH_20)
return -EINVAL;
@@ -11555,7 +11728,8 @@ static int nl80211_send_scan_msg(struct sk_buff *msg,
if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) ||
(wdev->netdev && nla_put_u32(msg, NL80211_ATTR_IFINDEX,
wdev->netdev->ifindex)) ||
- nla_put_u64(msg, NL80211_ATTR_WDEV, wdev_id(wdev)))
+ nla_put_u64_64bit(msg, NL80211_ATTR_WDEV, wdev_id(wdev),
+ NL80211_ATTR_PAD))
goto nla_put_failure;
/* ignore errors and send incomplete event anyway */
@@ -12222,11 +12396,13 @@ static void nl80211_send_remain_on_chan_event(
if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) ||
(wdev->netdev && nla_put_u32(msg, NL80211_ATTR_IFINDEX,
wdev->netdev->ifindex)) ||
- nla_put_u64(msg, NL80211_ATTR_WDEV, wdev_id(wdev)) ||
+ nla_put_u64_64bit(msg, NL80211_ATTR_WDEV, wdev_id(wdev),
+ NL80211_ATTR_PAD) ||
nla_put_u32(msg, NL80211_ATTR_WIPHY_FREQ, chan->center_freq) ||
nla_put_u32(msg, NL80211_ATTR_WIPHY_CHANNEL_TYPE,
NL80211_CHAN_NO_HT) ||
- nla_put_u64(msg, NL80211_ATTR_COOKIE, cookie))
+ nla_put_u64_64bit(msg, NL80211_ATTR_COOKIE, cookie,
+ NL80211_ATTR_PAD))
goto nla_put_failure;
if (cmd == NL80211_CMD_REMAIN_ON_CHANNEL &&
@@ -12460,7 +12636,8 @@ int nl80211_send_mgmt(struct cfg80211_registered_device *rdev,
if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) ||
(netdev && nla_put_u32(msg, NL80211_ATTR_IFINDEX,
netdev->ifindex)) ||
- nla_put_u64(msg, NL80211_ATTR_WDEV, wdev_id(wdev)) ||
+ nla_put_u64_64bit(msg, NL80211_ATTR_WDEV, wdev_id(wdev),
+ NL80211_ATTR_PAD) ||
nla_put_u32(msg, NL80211_ATTR_WIPHY_FREQ, freq) ||
(sig_dbm &&
nla_put_u32(msg, NL80211_ATTR_RX_SIGNAL_DBM, sig_dbm)) ||
@@ -12503,9 +12680,11 @@ void cfg80211_mgmt_tx_status(struct wireless_dev *wdev, u64 cookie,
if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) ||
(netdev && nla_put_u32(msg, NL80211_ATTR_IFINDEX,
netdev->ifindex)) ||
- nla_put_u64(msg, NL80211_ATTR_WDEV, wdev_id(wdev)) ||
+ nla_put_u64_64bit(msg, NL80211_ATTR_WDEV, wdev_id(wdev),
+ NL80211_ATTR_PAD) ||
nla_put(msg, NL80211_ATTR_FRAME, len, buf) ||
- nla_put_u64(msg, NL80211_ATTR_COOKIE, cookie) ||
+ nla_put_u64_64bit(msg, NL80211_ATTR_COOKIE, cookie,
+ NL80211_ATTR_PAD) ||
(ack && nla_put_flag(msg, NL80211_ATTR_ACK)))
goto nla_put_failure;
@@ -12885,7 +13064,8 @@ nl80211_radar_notify(struct cfg80211_registered_device *rdev,
struct wireless_dev *wdev = netdev->ieee80211_ptr;
if (nla_put_u32(msg, NL80211_ATTR_IFINDEX, netdev->ifindex) ||
- nla_put_u64(msg, NL80211_ATTR_WDEV, wdev_id(wdev)))
+ nla_put_u64_64bit(msg, NL80211_ATTR_WDEV, wdev_id(wdev),
+ NL80211_ATTR_PAD))
goto nla_put_failure;
}
@@ -12930,7 +13110,8 @@ void cfg80211_probe_status(struct net_device *dev, const u8 *addr,
if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) ||
nla_put_u32(msg, NL80211_ATTR_IFINDEX, dev->ifindex) ||
nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, addr) ||
- nla_put_u64(msg, NL80211_ATTR_COOKIE, cookie) ||
+ nla_put_u64_64bit(msg, NL80211_ATTR_COOKIE, cookie,
+ NL80211_ATTR_PAD) ||
(acked && nla_put_flag(msg, NL80211_ATTR_ACK)))
goto nla_put_failure;
@@ -13075,7 +13256,8 @@ void cfg80211_report_wowlan_wakeup(struct wireless_dev *wdev,
goto free_msg;
if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) ||
- nla_put_u64(msg, NL80211_ATTR_WDEV, wdev_id(wdev)))
+ nla_put_u64_64bit(msg, NL80211_ATTR_WDEV, wdev_id(wdev),
+ NL80211_ATTR_PAD))
goto free_msg;
if (wdev->netdev && nla_put_u32(msg, NL80211_ATTR_IFINDEX,
@@ -13231,7 +13413,7 @@ static int nl80211_netlink_notify(struct notifier_block * nb,
sched_scan_req->owner_nlportid == notify->portid)
schedule_scan_stop = true;
- list_for_each_entry_rcu(wdev, &rdev->wdev_list, list) {
+ list_for_each_entry_rcu(wdev, &rdev->wiphy.wdev_list, list) {
cfg80211_mlme_unregister_socket(wdev, notify->portid);
if (wdev->owner_nlportid == notify->portid)
@@ -13350,7 +13532,8 @@ void cfg80211_crit_proto_stopped(struct wireless_dev *wdev, gfp_t gfp)
goto nla_put_failure;
if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) ||
- nla_put_u64(msg, NL80211_ATTR_WDEV, wdev_id(wdev)))
+ nla_put_u64_64bit(msg, NL80211_ATTR_WDEV, wdev_id(wdev),
+ NL80211_ATTR_PAD))
goto nla_put_failure;
genlmsg_end(msg, hdr);
@@ -13383,7 +13566,8 @@ void nl80211_send_ap_stopped(struct wireless_dev *wdev)
if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) ||
nla_put_u32(msg, NL80211_ATTR_IFINDEX, wdev->netdev->ifindex) ||
- nla_put_u64(msg, NL80211_ATTR_WDEV, wdev_id(wdev)))
+ nla_put_u64_64bit(msg, NL80211_ATTR_WDEV, wdev_id(wdev),
+ NL80211_ATTR_PAD))
goto out;
genlmsg_end(msg, hdr);
diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h
index 8ae0c04f9..85ff30bee 100644
--- a/net/wireless/rdev-ops.h
+++ b/net/wireless/rdev-ops.h
@@ -1048,7 +1048,7 @@ rdev_start_radar_detection(struct cfg80211_registered_device *rdev,
static inline int
rdev_set_mcast_rate(struct cfg80211_registered_device *rdev,
struct net_device *dev,
- int mcast_rate[IEEE80211_NUM_BANDS])
+ int mcast_rate[NUM_NL80211_BANDS])
{
int ret = -ENOTSUPP;
diff --git a/net/wireless/reg.c b/net/wireless/reg.c
index c5fb317ee..5dbac3749 100644
--- a/net/wireless/reg.c
+++ b/net/wireless/reg.c
@@ -1546,12 +1546,12 @@ static void reg_process_ht_flags_band(struct wiphy *wiphy,
static void reg_process_ht_flags(struct wiphy *wiphy)
{
- enum ieee80211_band band;
+ enum nl80211_band band;
if (!wiphy)
return;
- for (band = 0; band < IEEE80211_NUM_BANDS; band++)
+ for (band = 0; band < NUM_NL80211_BANDS; band++)
reg_process_ht_flags_band(wiphy, wiphy->bands[band]);
}
@@ -1639,7 +1639,7 @@ static void reg_leave_invalid_chans(struct wiphy *wiphy)
ASSERT_RTNL();
- list_for_each_entry(wdev, &rdev->wdev_list, list)
+ list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list)
if (!reg_wdev_chan_valid(wiphy, wdev))
cfg80211_leave(rdev, wdev);
}
@@ -1673,7 +1673,7 @@ static void reg_check_channels(void)
static void wiphy_update_regulatory(struct wiphy *wiphy,
enum nl80211_reg_initiator initiator)
{
- enum ieee80211_band band;
+ enum nl80211_band band;
struct regulatory_request *lr = get_last_request();
if (ignore_reg_update(wiphy, initiator)) {
@@ -1690,7 +1690,7 @@ static void wiphy_update_regulatory(struct wiphy *wiphy,
lr->dfs_region = get_cfg80211_regdom()->dfs_region;
- for (band = 0; band < IEEE80211_NUM_BANDS; band++)
+ for (band = 0; band < NUM_NL80211_BANDS; band++)
handle_band(wiphy, initiator, wiphy->bands[band]);
reg_process_beacons(wiphy);
@@ -1786,14 +1786,14 @@ static void handle_band_custom(struct wiphy *wiphy,
void wiphy_apply_custom_regulatory(struct wiphy *wiphy,
const struct ieee80211_regdomain *regd)
{
- enum ieee80211_band band;
+ enum nl80211_band band;
unsigned int bands_set = 0;
WARN(!(wiphy->regulatory_flags & REGULATORY_CUSTOM_REG),
"wiphy should have REGULATORY_CUSTOM_REG\n");
wiphy->regulatory_flags |= REGULATORY_CUSTOM_REG;
- for (band = 0; band < IEEE80211_NUM_BANDS; band++) {
+ for (band = 0; band < NUM_NL80211_BANDS; band++) {
if (!wiphy->bands[band])
continue;
handle_band_custom(wiphy, wiphy->bands[band], regd);
@@ -2228,7 +2228,7 @@ static void reg_process_self_managed_hints(void)
struct wiphy *wiphy;
const struct ieee80211_regdomain *tmp;
const struct ieee80211_regdomain *regd;
- enum ieee80211_band band;
+ enum nl80211_band band;
struct regulatory_request request = {};
list_for_each_entry(rdev, &cfg80211_rdev_list, list) {
@@ -2246,7 +2246,7 @@ static void reg_process_self_managed_hints(void)
rcu_assign_pointer(wiphy->regd, regd);
rcu_free_regdom(tmp);
- for (band = 0; band < IEEE80211_NUM_BANDS; band++)
+ for (band = 0; band < NUM_NL80211_BANDS; band++)
handle_band_custom(wiphy, wiphy->bands[band], regd);
reg_process_ht_flags(wiphy);
@@ -2404,7 +2404,7 @@ int regulatory_hint(struct wiphy *wiphy, const char *alpha2)
}
EXPORT_SYMBOL(regulatory_hint);
-void regulatory_hint_country_ie(struct wiphy *wiphy, enum ieee80211_band band,
+void regulatory_hint_country_ie(struct wiphy *wiphy, enum nl80211_band band,
const u8 *country_ie, u8 country_ie_len)
{
char alpha2[2];
@@ -2504,11 +2504,11 @@ static void restore_alpha2(char *alpha2, bool reset_user)
static void restore_custom_reg_settings(struct wiphy *wiphy)
{
struct ieee80211_supported_band *sband;
- enum ieee80211_band band;
+ enum nl80211_band band;
struct ieee80211_channel *chan;
int i;
- for (band = 0; band < IEEE80211_NUM_BANDS; band++) {
+ for (band = 0; band < NUM_NL80211_BANDS; band++) {
sband = wiphy->bands[band];
if (!sband)
continue;
@@ -2623,9 +2623,9 @@ void regulatory_hint_disconnect(void)
static bool freq_is_chan_12_13_14(u16 freq)
{
- if (freq == ieee80211_channel_to_frequency(12, IEEE80211_BAND_2GHZ) ||
- freq == ieee80211_channel_to_frequency(13, IEEE80211_BAND_2GHZ) ||
- freq == ieee80211_channel_to_frequency(14, IEEE80211_BAND_2GHZ))
+ if (freq == ieee80211_channel_to_frequency(12, NL80211_BAND_2GHZ) ||
+ freq == ieee80211_channel_to_frequency(13, NL80211_BAND_2GHZ) ||
+ freq == ieee80211_channel_to_frequency(14, NL80211_BAND_2GHZ))
return true;
return false;
}
@@ -2650,7 +2650,7 @@ int regulatory_hint_found_beacon(struct wiphy *wiphy,
if (beacon_chan->beacon_found ||
beacon_chan->flags & IEEE80211_CHAN_RADAR ||
- (beacon_chan->band == IEEE80211_BAND_2GHZ &&
+ (beacon_chan->band == NL80211_BAND_2GHZ &&
!freq_is_chan_12_13_14(beacon_chan->center_freq)))
return 0;
diff --git a/net/wireless/reg.h b/net/wireless/reg.h
index 9f495d76e..f6ced316b 100644
--- a/net/wireless/reg.h
+++ b/net/wireless/reg.h
@@ -104,7 +104,7 @@ int regulatory_hint_found_beacon(struct wiphy *wiphy,
* information for a band the BSS is not present in it will be ignored.
*/
void regulatory_hint_country_ie(struct wiphy *wiphy,
- enum ieee80211_band band,
+ enum nl80211_band band,
const u8 *country_ie,
u8 country_ie_len);
diff --git a/net/wireless/scan.c b/net/wireless/scan.c
index 14d5369eb..ef2955c89 100644
--- a/net/wireless/scan.c
+++ b/net/wireless/scan.c
@@ -364,13 +364,16 @@ const u8 *cfg80211_find_ie(u8 eid, const u8 *ies, int len)
}
EXPORT_SYMBOL(cfg80211_find_ie);
-const u8 *cfg80211_find_vendor_ie(unsigned int oui, u8 oui_type,
+const u8 *cfg80211_find_vendor_ie(unsigned int oui, int oui_type,
const u8 *ies, int len)
{
struct ieee80211_vendor_ie *ie;
const u8 *pos = ies, *end = ies + len;
int ie_oui;
+ if (WARN_ON(oui_type > 0xff))
+ return NULL;
+
while (pos < end) {
pos = cfg80211_find_ie(WLAN_EID_VENDOR_SPECIFIC, pos,
end - pos);
@@ -386,7 +389,8 @@ const u8 *cfg80211_find_vendor_ie(unsigned int oui, u8 oui_type,
goto cont;
ie_oui = ie->oui[0] << 16 | ie->oui[1] << 8 | ie->oui[2];
- if (ie_oui == oui && ie->oui_type == oui_type)
+ if (ie_oui == oui &&
+ (oui_type < 0 || ie->oui_type == oui_type))
return pos;
cont:
pos += 2 + ie->len;
@@ -531,7 +535,7 @@ static int cmp_bss(struct cfg80211_bss *a,
}
static bool cfg80211_bss_type_match(u16 capability,
- enum ieee80211_band band,
+ enum nl80211_band band,
enum ieee80211_bss_type bss_type)
{
bool ret = true;
@@ -540,7 +544,7 @@ static bool cfg80211_bss_type_match(u16 capability,
if (bss_type == IEEE80211_BSS_TYPE_ANY)
return ret;
- if (band == IEEE80211_BAND_60GHZ) {
+ if (band == NL80211_BAND_60GHZ) {
mask = WLAN_CAPABILITY_DMG_TYPE_MASK;
switch (bss_type) {
case IEEE80211_BSS_TYPE_ESS:
@@ -1006,7 +1010,7 @@ cfg80211_inform_bss_data(struct wiphy *wiphy,
if (!res)
return NULL;
- if (channel->band == IEEE80211_BAND_60GHZ) {
+ if (channel->band == NL80211_BAND_60GHZ) {
bss_type = res->pub.capability & WLAN_CAPABILITY_DMG_TYPE_MASK;
if (bss_type == WLAN_CAPABILITY_DMG_TYPE_AP ||
bss_type == WLAN_CAPABILITY_DMG_TYPE_PBSS)
@@ -1089,7 +1093,7 @@ cfg80211_inform_bss_frame_data(struct wiphy *wiphy,
if (!res)
return NULL;
- if (channel->band == IEEE80211_BAND_60GHZ) {
+ if (channel->band == NL80211_BAND_60GHZ) {
bss_type = res->pub.capability & WLAN_CAPABILITY_DMG_TYPE_MASK;
if (bss_type == WLAN_CAPABILITY_DMG_TYPE_AP ||
bss_type == WLAN_CAPABILITY_DMG_TYPE_PBSS)
@@ -1185,7 +1189,7 @@ int cfg80211_wext_siwscan(struct net_device *dev,
struct iw_scan_req *wreq = NULL;
struct cfg80211_scan_request *creq = NULL;
int i, err, n_channels = 0;
- enum ieee80211_band band;
+ enum nl80211_band band;
if (!netif_running(dev))
return -ENETDOWN;
@@ -1229,7 +1233,7 @@ int cfg80211_wext_siwscan(struct net_device *dev,
/* translate "Scan on frequencies" request */
i = 0;
- for (band = 0; band < IEEE80211_NUM_BANDS; band++) {
+ for (band = 0; band < NUM_NL80211_BANDS; band++) {
int j;
if (!wiphy->bands[band])
@@ -1289,10 +1293,12 @@ int cfg80211_wext_siwscan(struct net_device *dev,
creq->n_ssids = 0;
}
- for (i = 0; i < IEEE80211_NUM_BANDS; i++)
+ for (i = 0; i < NUM_NL80211_BANDS; i++)
if (wiphy->bands[i])
creq->rates[i] = (1 << wiphy->bands[i]->n_bitrates) - 1;
+ eth_broadcast_addr(creq->bssid);
+
rdev->scan_req = creq;
err = rdev_scan(rdev, creq);
if (err) {
diff --git a/net/wireless/sme.c b/net/wireless/sme.c
index 544558171..584fdc347 100644
--- a/net/wireless/sme.c
+++ b/net/wireless/sme.c
@@ -81,7 +81,7 @@ static int cfg80211_conn_scan(struct wireless_dev *wdev)
return -ENOMEM;
if (wdev->conn->params.channel) {
- enum ieee80211_band band = wdev->conn->params.channel->band;
+ enum nl80211_band band = wdev->conn->params.channel->band;
struct ieee80211_supported_band *sband =
wdev->wiphy->bands[band];
@@ -93,11 +93,11 @@ static int cfg80211_conn_scan(struct wireless_dev *wdev)
request->rates[band] = (1 << sband->n_bitrates) - 1;
} else {
int i = 0, j;
- enum ieee80211_band band;
+ enum nl80211_band band;
struct ieee80211_supported_band *bands;
struct ieee80211_channel *channel;
- for (band = 0; band < IEEE80211_NUM_BANDS; band++) {
+ for (band = 0; band < NUM_NL80211_BANDS; band++) {
bands = wdev->wiphy->bands[band];
if (!bands)
continue;
@@ -119,6 +119,8 @@ static int cfg80211_conn_scan(struct wireless_dev *wdev)
wdev->conn->params.ssid_len);
request->ssids[0].ssid_len = wdev->conn->params.ssid_len;
+ eth_broadcast_addr(request->bssid);
+
request->wdev = wdev;
request->wiphy = &rdev->wiphy;
request->scan_start = jiffies;
@@ -221,7 +223,7 @@ void cfg80211_conn_work(struct work_struct *work)
rtnl_lock();
- list_for_each_entry(wdev, &rdev->wdev_list, list) {
+ list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) {
if (!wdev->netdev)
continue;
@@ -490,8 +492,18 @@ static int cfg80211_sme_connect(struct wireless_dev *wdev,
if (!rdev->ops->auth || !rdev->ops->assoc)
return -EOPNOTSUPP;
- if (wdev->current_bss)
- return -EALREADY;
+ if (wdev->current_bss) {
+ if (!prev_bssid)
+ return -EALREADY;
+ if (prev_bssid &&
+ !ether_addr_equal(prev_bssid, wdev->current_bss->pub.bssid))
+ return -ENOTCONN;
+ cfg80211_unhold_bss(wdev->current_bss);
+ cfg80211_put_bss(wdev->wiphy, &wdev->current_bss->pub);
+ wdev->current_bss = NULL;
+
+ cfg80211_sme_free(wdev);
+ }
if (WARN_ON(wdev->conn))
return -EINPROGRESS;
@@ -605,7 +617,7 @@ static bool cfg80211_is_all_idle(void)
* count as new regulatory hints.
*/
list_for_each_entry(rdev, &cfg80211_rdev_list, list) {
- list_for_each_entry(wdev, &rdev->wdev_list, list) {
+ list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) {
wdev_lock(wdev);
if (wdev->conn || wdev->current_bss)
is_all_idle = false;
@@ -741,19 +753,32 @@ void __cfg80211_connect_result(struct net_device *dev, const u8 *bssid,
kfree(country_ie);
}
-void cfg80211_connect_result(struct net_device *dev, const u8 *bssid,
- const u8 *req_ie, size_t req_ie_len,
- const u8 *resp_ie, size_t resp_ie_len,
- u16 status, gfp_t gfp)
+/* Consumes bss object one way or another */
+void cfg80211_connect_bss(struct net_device *dev, const u8 *bssid,
+ struct cfg80211_bss *bss, const u8 *req_ie,
+ size_t req_ie_len, const u8 *resp_ie,
+ size_t resp_ie_len, u16 status, gfp_t gfp)
{
struct wireless_dev *wdev = dev->ieee80211_ptr;
struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
struct cfg80211_event *ev;
unsigned long flags;
+ if (bss) {
+ /* Make sure the bss entry provided by the driver is valid. */
+ struct cfg80211_internal_bss *ibss = bss_from_pub(bss);
+
+ if (WARN_ON(list_empty(&ibss->list))) {
+ cfg80211_put_bss(wdev->wiphy, bss);
+ return;
+ }
+ }
+
ev = kzalloc(sizeof(*ev) + req_ie_len + resp_ie_len, gfp);
- if (!ev)
+ if (!ev) {
+ cfg80211_put_bss(wdev->wiphy, bss);
return;
+ }
ev->type = EVENT_CONNECT_RESULT;
if (bssid)
@@ -768,6 +793,9 @@ void cfg80211_connect_result(struct net_device *dev, const u8 *bssid,
ev->cr.resp_ie_len = resp_ie_len;
memcpy((void *)ev->cr.resp_ie, resp_ie, resp_ie_len);
}
+ if (bss)
+ cfg80211_hold_bss(bss_from_pub(bss));
+ ev->cr.bss = bss;
ev->cr.status = status;
spin_lock_irqsave(&wdev->event_lock, flags);
@@ -775,7 +803,7 @@ void cfg80211_connect_result(struct net_device *dev, const u8 *bssid,
spin_unlock_irqrestore(&wdev->event_lock, flags);
queue_work(cfg80211_wq, &rdev->event_work);
}
-EXPORT_SYMBOL(cfg80211_connect_result);
+EXPORT_SYMBOL(cfg80211_connect_bss);
/* Consumes bss object one way or another */
void __cfg80211_roamed(struct wireless_dev *wdev,
diff --git a/net/wireless/sysfs.c b/net/wireless/sysfs.c
index 9cee02206..e46469bc1 100644
--- a/net/wireless/sysfs.c
+++ b/net/wireless/sysfs.c
@@ -91,7 +91,7 @@ static void cfg80211_leave_all(struct cfg80211_registered_device *rdev)
{
struct wireless_dev *wdev;
- list_for_each_entry(wdev, &rdev->wdev_list, list)
+ list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list)
cfg80211_leave(rdev, wdev);
}
diff --git a/net/wireless/trace.h b/net/wireless/trace.h
index 09b242b09..3c1091ae6 100644
--- a/net/wireless/trace.h
+++ b/net/wireless/trace.h
@@ -110,7 +110,7 @@
conf->dot11MeshHWMPconfirmationInterval; \
} while (0)
-#define CHAN_ENTRY __field(enum ieee80211_band, band) \
+#define CHAN_ENTRY __field(enum nl80211_band, band) \
__field(u16, center_freq)
#define CHAN_ASSIGN(chan) \
do { \
@@ -125,7 +125,7 @@
#define CHAN_PR_FMT "band: %d, freq: %u"
#define CHAN_PR_ARG __entry->band, __entry->center_freq
-#define CHAN_DEF_ENTRY __field(enum ieee80211_band, band) \
+#define CHAN_DEF_ENTRY __field(enum nl80211_band, band) \
__field(u32, control_freq) \
__field(u32, width) \
__field(u32, center_freq1) \
@@ -1259,6 +1259,7 @@ TRACE_EVENT(rdev_connect,
__field(bool, privacy)
__field(u32, wpa_versions)
__field(u32, flags)
+ MAC_ENTRY(prev_bssid)
),
TP_fast_assign(
WIPHY_ASSIGN;
@@ -1270,13 +1271,14 @@ TRACE_EVENT(rdev_connect,
__entry->privacy = sme->privacy;
__entry->wpa_versions = sme->crypto.wpa_versions;
__entry->flags = sme->flags;
+ MAC_ASSIGN(prev_bssid, sme->prev_bssid);
),
TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", bssid: " MAC_PR_FMT
", ssid: %s, auth type: %d, privacy: %s, wpa versions: %u, "
- "flags: %u",
+ "flags: %u, previous bssid: " MAC_PR_FMT,
WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(bssid), __entry->ssid,
__entry->auth_type, BOOL_TO_STR(__entry->privacy),
- __entry->wpa_versions, __entry->flags)
+ __entry->wpa_versions, __entry->flags, MAC_PR_ARG(prev_bssid))
);
TRACE_EVENT(rdev_set_cqm_rssi_config,
@@ -2645,7 +2647,7 @@ TRACE_EVENT(cfg80211_scan_done,
TP_STRUCT__entry(
__field(u32, n_channels)
__dynamic_array(u8, ie, request ? request->ie_len : 0)
- __array(u32, rates, IEEE80211_NUM_BANDS)
+ __array(u32, rates, NUM_NL80211_BANDS)
__field(u32, wdev_id)
MAC_ENTRY(wiphy_mac)
__field(bool, no_cck)
@@ -2656,7 +2658,7 @@ TRACE_EVENT(cfg80211_scan_done,
memcpy(__get_dynamic_array(ie), request->ie,
request->ie_len);
memcpy(__entry->rates, request->rates,
- IEEE80211_NUM_BANDS);
+ NUM_NL80211_BANDS);
__entry->wdev_id = request->wdev ?
request->wdev->identifier : 0;
if (request->wiphy)
@@ -2881,25 +2883,25 @@ TRACE_EVENT(rdev_start_radar_detection,
TRACE_EVENT(rdev_set_mcast_rate,
TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
- int mcast_rate[IEEE80211_NUM_BANDS]),
+ int mcast_rate[NUM_NL80211_BANDS]),
TP_ARGS(wiphy, netdev, mcast_rate),
TP_STRUCT__entry(
WIPHY_ENTRY
NETDEV_ENTRY
- __array(int, mcast_rate, IEEE80211_NUM_BANDS)
+ __array(int, mcast_rate, NUM_NL80211_BANDS)
),
TP_fast_assign(
WIPHY_ASSIGN;
NETDEV_ASSIGN;
memcpy(__entry->mcast_rate, mcast_rate,
- sizeof(int) * IEEE80211_NUM_BANDS);
+ sizeof(int) * NUM_NL80211_BANDS);
),
TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", "
"mcast_rates [2.4GHz=0x%x, 5.2GHz=0x%x, 60GHz=0x%x]",
WIPHY_PR_ARG, NETDEV_PR_ARG,
- __entry->mcast_rate[IEEE80211_BAND_2GHZ],
- __entry->mcast_rate[IEEE80211_BAND_5GHZ],
- __entry->mcast_rate[IEEE80211_BAND_60GHZ])
+ __entry->mcast_rate[NL80211_BAND_2GHZ],
+ __entry->mcast_rate[NL80211_BAND_5GHZ],
+ __entry->mcast_rate[NL80211_BAND_60GHZ])
);
TRACE_EVENT(rdev_set_coalesce,
diff --git a/net/wireless/util.c b/net/wireless/util.c
index 47b917841..b7d1592bd 100644
--- a/net/wireless/util.c
+++ b/net/wireless/util.c
@@ -47,7 +47,7 @@ u32 ieee80211_mandatory_rates(struct ieee80211_supported_band *sband,
if (WARN_ON(!sband))
return 1;
- if (sband->band == IEEE80211_BAND_2GHZ) {
+ if (sband->band == NL80211_BAND_2GHZ) {
if (scan_width == NL80211_BSS_CHAN_WIDTH_5 ||
scan_width == NL80211_BSS_CHAN_WIDTH_10)
mandatory_flag = IEEE80211_RATE_MANDATORY_G;
@@ -65,26 +65,26 @@ u32 ieee80211_mandatory_rates(struct ieee80211_supported_band *sband,
}
EXPORT_SYMBOL(ieee80211_mandatory_rates);
-int ieee80211_channel_to_frequency(int chan, enum ieee80211_band band)
+int ieee80211_channel_to_frequency(int chan, enum nl80211_band band)
{
/* see 802.11 17.3.8.3.2 and Annex J
* there are overlapping channel numbers in 5GHz and 2GHz bands */
if (chan <= 0)
return 0; /* not supported */
switch (band) {
- case IEEE80211_BAND_2GHZ:
+ case NL80211_BAND_2GHZ:
if (chan == 14)
return 2484;
else if (chan < 14)
return 2407 + chan * 5;
break;
- case IEEE80211_BAND_5GHZ:
+ case NL80211_BAND_5GHZ:
if (chan >= 182 && chan <= 196)
return 4000 + chan * 5;
else
return 5000 + chan * 5;
break;
- case IEEE80211_BAND_60GHZ:
+ case NL80211_BAND_60GHZ:
if (chan < 5)
return 56160 + chan * 2160;
break;
@@ -116,11 +116,11 @@ EXPORT_SYMBOL(ieee80211_frequency_to_channel);
struct ieee80211_channel *__ieee80211_get_channel(struct wiphy *wiphy,
int freq)
{
- enum ieee80211_band band;
+ enum nl80211_band band;
struct ieee80211_supported_band *sband;
int i;
- for (band = 0; band < IEEE80211_NUM_BANDS; band++) {
+ for (band = 0; band < NUM_NL80211_BANDS; band++) {
sband = wiphy->bands[band];
if (!sband)
@@ -137,12 +137,12 @@ struct ieee80211_channel *__ieee80211_get_channel(struct wiphy *wiphy,
EXPORT_SYMBOL(__ieee80211_get_channel);
static void set_mandatory_flags_band(struct ieee80211_supported_band *sband,
- enum ieee80211_band band)
+ enum nl80211_band band)
{
int i, want;
switch (band) {
- case IEEE80211_BAND_5GHZ:
+ case NL80211_BAND_5GHZ:
want = 3;
for (i = 0; i < sband->n_bitrates; i++) {
if (sband->bitrates[i].bitrate == 60 ||
@@ -155,7 +155,7 @@ static void set_mandatory_flags_band(struct ieee80211_supported_band *sband,
}
WARN_ON(want);
break;
- case IEEE80211_BAND_2GHZ:
+ case NL80211_BAND_2GHZ:
want = 7;
for (i = 0; i < sband->n_bitrates; i++) {
if (sband->bitrates[i].bitrate == 10) {
@@ -185,12 +185,12 @@ static void set_mandatory_flags_band(struct ieee80211_supported_band *sband,
}
WARN_ON(want != 0 && want != 3 && want != 6);
break;
- case IEEE80211_BAND_60GHZ:
+ case NL80211_BAND_60GHZ:
/* check for mandatory HT MCS 1..4 */
WARN_ON(!sband->ht_cap.ht_supported);
WARN_ON((sband->ht_cap.mcs.rx_mask[0] & 0x1e) != 0x1e);
break;
- case IEEE80211_NUM_BANDS:
+ case NUM_NL80211_BANDS:
WARN_ON(1);
break;
}
@@ -198,9 +198,9 @@ static void set_mandatory_flags_band(struct ieee80211_supported_band *sband,
void ieee80211_set_bitrate_flags(struct wiphy *wiphy)
{
- enum ieee80211_band band;
+ enum nl80211_band band;
- for (band = 0; band < IEEE80211_NUM_BANDS; band++)
+ for (band = 0; band < NUM_NL80211_BANDS; band++)
if (wiphy->bands[band])
set_mandatory_flags_band(wiphy->bands[band], band);
}
@@ -651,7 +651,7 @@ __frame_add_frag(struct sk_buff *skb, struct page *page,
struct skb_shared_info *sh = skb_shinfo(skb);
int page_offset;
- atomic_inc(&page->_count);
+ page_ref_inc(page);
page_offset = ptr - page_address(page);
skb_add_rx_frag(skb, sh->nr_frags, page, page_offset, len, size);
}
@@ -721,6 +721,8 @@ __ieee80211_amsdu_copy(struct sk_buff *skb, unsigned int hlen,
* alignment since sizeof(struct ethhdr) is 14.
*/
frame = dev_alloc_skb(hlen + sizeof(struct ethhdr) + 2 + cur_len);
+ if (!frame)
+ return NULL;
skb_reserve(frame, hlen + sizeof(struct ethhdr) + 2);
skb_copy_bits(skb, offset, skb_put(frame, cur_len), cur_len);
@@ -950,7 +952,7 @@ void cfg80211_process_wdev_events(struct wireless_dev *wdev)
ev->cr.resp_ie, ev->cr.resp_ie_len,
ev->cr.status,
ev->cr.status == WLAN_STATUS_SUCCESS,
- NULL);
+ ev->cr.bss);
break;
case EVENT_ROAMED:
__cfg80211_roamed(wdev, ev->rm.bss, ev->rm.req_ie,
@@ -986,7 +988,7 @@ void cfg80211_process_rdev_events(struct cfg80211_registered_device *rdev)
ASSERT_RTNL();
- list_for_each_entry(wdev, &rdev->wdev_list, list)
+ list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list)
cfg80211_process_wdev_events(wdev);
}
@@ -1399,22 +1401,22 @@ size_t ieee80211_ie_split_ric(const u8 *ies, size_t ielen,
EXPORT_SYMBOL(ieee80211_ie_split_ric);
bool ieee80211_operating_class_to_band(u8 operating_class,
- enum ieee80211_band *band)
+ enum nl80211_band *band)
{
switch (operating_class) {
case 112:
case 115 ... 127:
case 128 ... 130:
- *band = IEEE80211_BAND_5GHZ;
+ *band = NL80211_BAND_5GHZ;
return true;
case 81:
case 82:
case 83:
case 84:
- *band = IEEE80211_BAND_2GHZ;
+ *band = NL80211_BAND_2GHZ;
return true;
case 180:
- *band = IEEE80211_BAND_60GHZ;
+ *band = NL80211_BAND_60GHZ;
return true;
}
@@ -1560,7 +1562,7 @@ int cfg80211_validate_beacon_int(struct cfg80211_registered_device *rdev,
if (!beacon_int)
return -EINVAL;
- list_for_each_entry(wdev, &rdev->wdev_list, list) {
+ list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) {
if (!wdev->beacon_interval)
continue;
if (wdev->beacon_interval != beacon_int) {
@@ -1726,10 +1728,10 @@ int ieee80211_get_ratemask(struct ieee80211_supported_band *sband,
unsigned int ieee80211_get_num_supported_channels(struct wiphy *wiphy)
{
- enum ieee80211_band band;
+ enum nl80211_band band;
unsigned int n_channels = 0;
- for (band = 0; band < IEEE80211_NUM_BANDS; band++)
+ for (band = 0; band < NUM_NL80211_BANDS; band++)
if (wiphy->bands[band])
n_channels += wiphy->bands[band]->n_channels;
diff --git a/net/wireless/wext-compat.c b/net/wireless/wext-compat.c
index fd682832a..9f27221c8 100644
--- a/net/wireless/wext-compat.c
+++ b/net/wireless/wext-compat.c
@@ -25,42 +25,7 @@ int cfg80211_wext_giwname(struct net_device *dev,
struct iw_request_info *info,
char *name, char *extra)
{
- struct wireless_dev *wdev = dev->ieee80211_ptr;
- struct ieee80211_supported_band *sband;
- bool is_ht = false, is_a = false, is_b = false, is_g = false;
-
- if (!wdev)
- return -EOPNOTSUPP;
-
- sband = wdev->wiphy->bands[IEEE80211_BAND_5GHZ];
- if (sband) {
- is_a = true;
- is_ht |= sband->ht_cap.ht_supported;
- }
-
- sband = wdev->wiphy->bands[IEEE80211_BAND_2GHZ];
- if (sband) {
- int i;
- /* Check for mandatory rates */
- for (i = 0; i < sband->n_bitrates; i++) {
- if (sband->bitrates[i].bitrate == 10)
- is_b = true;
- if (sband->bitrates[i].bitrate == 60)
- is_g = true;
- }
- is_ht |= sband->ht_cap.ht_supported;
- }
-
strcpy(name, "IEEE 802.11");
- if (is_a)
- strcat(name, "a");
- if (is_b)
- strcat(name, "b");
- if (is_g)
- strcat(name, "g");
- if (is_ht)
- strcat(name, "n");
-
return 0;
}
EXPORT_WEXT_HANDLER(cfg80211_wext_giwname);
@@ -143,7 +108,7 @@ int cfg80211_wext_giwrange(struct net_device *dev,
{
struct wireless_dev *wdev = dev->ieee80211_ptr;
struct iw_range *range = (struct iw_range *) extra;
- enum ieee80211_band band;
+ enum nl80211_band band;
int i, c = 0;
if (!wdev)
@@ -215,7 +180,7 @@ int cfg80211_wext_giwrange(struct net_device *dev,
}
}
- for (band = 0; band < IEEE80211_NUM_BANDS; band ++) {
+ for (band = 0; band < NUM_NL80211_BANDS; band ++) {
struct ieee80211_supported_band *sband;
sband = wdev->wiphy->bands[band];
@@ -265,11 +230,11 @@ int cfg80211_wext_freq(struct iw_freq *freq)
* -EINVAL for impossible things.
*/
if (freq->e == 0) {
- enum ieee80211_band band = IEEE80211_BAND_2GHZ;
+ enum nl80211_band band = NL80211_BAND_2GHZ;
if (freq->m < 0)
return 0;
if (freq->m > 14)
- band = IEEE80211_BAND_5GHZ;
+ band = NL80211_BAND_5GHZ;
return ieee80211_channel_to_frequency(freq->m, band);
} else {
int i, div = 1000000;
@@ -1245,7 +1210,7 @@ static int cfg80211_wext_siwrate(struct net_device *dev,
maxrate = rate->value / 100000;
}
- for (band = 0; band < IEEE80211_NUM_BANDS; band++) {
+ for (band = 0; band < NUM_NL80211_BANDS; band++) {
sband = wdev->wiphy->bands[band];
if (sband == NULL)
continue;
diff --git a/net/wireless/wext-core.c b/net/wireless/wext-core.c
index c753211cb..dbb2738e3 100644
--- a/net/wireless/wext-core.c
+++ b/net/wireless/wext-core.c
@@ -399,7 +399,10 @@ static int __init wireless_nlevent_init(void)
if (err)
return err;
- return register_netdevice_notifier(&wext_netdev_notifier);
+ err = register_netdevice_notifier(&wext_netdev_notifier);
+ if (err)
+ unregister_pernet_subsys(&wext_pernet_ops);
+ return err;
}
subsys_initcall(wireless_nlevent_init);
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index 2cc7af858..d516845e1 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -809,7 +809,8 @@ static int copy_to_user_state_extra(struct xfrm_state *x,
goto out;
}
if (x->lastused) {
- ret = nla_put_u64(skb, XFRMA_LASTUSED, x->lastused);
+ ret = nla_put_u64_64bit(skb, XFRMA_LASTUSED, x->lastused,
+ XFRMA_PAD);
if (ret)
goto out;
}
@@ -1813,7 +1814,7 @@ static inline size_t xfrm_aevent_msgsize(struct xfrm_state *x)
return NLMSG_ALIGN(sizeof(struct xfrm_aevent_id))
+ nla_total_size(replay_size)
- + nla_total_size(sizeof(struct xfrm_lifetime_cur))
+ + nla_total_size_64bit(sizeof(struct xfrm_lifetime_cur))
+ nla_total_size(sizeof(struct xfrm_mark))
+ nla_total_size(4) /* XFRM_AE_RTHR */
+ nla_total_size(4); /* XFRM_AE_ETHR */
@@ -1848,7 +1849,8 @@ static int build_aevent(struct sk_buff *skb, struct xfrm_state *x, const struct
}
if (err)
goto out_cancel;
- err = nla_put(skb, XFRMA_LTIME_VAL, sizeof(x->curlft), &x->curlft);
+ err = nla_put_64bit(skb, XFRMA_LTIME_VAL, sizeof(x->curlft), &x->curlft,
+ XFRMA_PAD);
if (err)
goto out_cancel;
@@ -2617,7 +2619,7 @@ static inline size_t xfrm_sa_len(struct xfrm_state *x)
l += nla_total_size(sizeof(x->props.extra_flags));
/* Must count x->lastused as it may become non-zero behind our back. */
- l += nla_total_size(sizeof(u64));
+ l += nla_total_size_64bit(sizeof(u64));
return l;
}